[ { "loss":1.2008, "grad_norm":0.3822754323, "learning_rate":0.0001407407, "entropy":1.0346003115, "num_tokens":322124.0, "mean_token_accuracy":0.7109046429, "epoch":0.0673400673, "step":20, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.9385, "grad_norm":0.2103841156, "learning_rate":0.0001999048, "entropy":0.9494877957, "num_tokens":642995.0, "mean_token_accuracy":0.7539383888, "epoch":0.1346801347, "step":40, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.915, "grad_norm":0.206428811, "learning_rate":0.0001993238, "entropy":0.9191693425, "num_tokens":966396.0, "mean_token_accuracy":0.7584572025, "epoch":0.202020202, "step":60, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.8441, "grad_norm":0.3027354181, "learning_rate":0.0001982178, "entropy":0.8431956261, "num_tokens":1287058.0, "mean_token_accuracy":0.7713396206, "epoch":0.2693602694, "step":80, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.8408, "grad_norm":0.2982031703, "learning_rate":0.0001965926, "entropy":0.8472392239, "num_tokens":1607723.0, "mean_token_accuracy":0.7738652974, "epoch":0.3367003367, "step":100, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":null, "grad_norm":null, "learning_rate":null, "entropy":null, "num_tokens":null, "mean_token_accuracy":null, "epoch":0.3367003367, "step":100, "eval_loss":0.8332510591, "eval_runtime":10.3382, "eval_samples_per_second":24.182, "eval_steps_per_second":3.095, "eval_entropy":0.8475092333, "eval_num_tokens":1607723.0, "eval_mean_token_accuracy":0.7752955835, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.8262, "grad_norm":0.2738818824, "learning_rate":0.0001944568, "entropy":0.8286631659, "num_tokens":1928620.0, "mean_token_accuracy":0.7755305201, "epoch":0.404040404, "step":120, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.8089, "grad_norm":0.2727711201, "learning_rate":0.0001918216, "entropy":0.8132541452, "num_tokens":2249401.0, "mean_token_accuracy":0.779610493, "epoch":0.4713804714, "step":140, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.7815, "grad_norm":0.253259182, "learning_rate":0.0001887011, "entropy":0.7838059939, "num_tokens":2571041.0, "mean_token_accuracy":0.785765557, "epoch":0.5387205387, "step":160, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.763, "grad_norm":0.2851669788, "learning_rate":0.0001851117, "entropy":0.7674662221, "num_tokens":2890814.0, "mean_token_accuracy":0.7893050611, "epoch":0.6060606061, "step":180, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.7434, "grad_norm":0.2782152891, "learning_rate":0.0001810723, "entropy":0.7478979569, "num_tokens":3212811.0, "mean_token_accuracy":0.7946783796, "epoch":0.6734006734, "step":200, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":null, "grad_norm":null, "learning_rate":null, "entropy":null, "num_tokens":null, "mean_token_accuracy":null, "epoch":0.6734006734, "step":200, "eval_loss":0.7540781498, "eval_runtime":10.3368, "eval_samples_per_second":24.185, "eval_steps_per_second":3.096, "eval_entropy":0.7548957299, "eval_num_tokens":3212811.0, "eval_mean_token_accuracy":0.7921991255, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.718, "grad_norm":0.2911323905, "learning_rate":0.0001766044, "entropy":0.7216884721, "num_tokens":3534962.0, "mean_token_accuracy":0.8007057041, "epoch":0.7407407407, "step":220, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.7015, "grad_norm":0.3469219804, "learning_rate":0.0001717316, "entropy":0.7073224507, "num_tokens":3855519.0, "mean_token_accuracy":0.8033309393, "epoch":0.8080808081, "step":240, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.7066, "grad_norm":0.3413038254, "learning_rate":0.0001664796, "entropy":0.7131307989, "num_tokens":4174694.0, "mean_token_accuracy":0.8030782551, "epoch":0.8754208754, "step":260, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.6725, "grad_norm":0.3970124125, "learning_rate":0.0001608761, "entropy":0.6751278345, "num_tokens":4495214.0, "mean_token_accuracy":0.8109409161, "epoch":0.9427609428, "step":280, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.6567, "grad_norm":0.4383921921, "learning_rate":0.0001549509, "entropy":0.6729893133, "num_tokens":4815033.0, "mean_token_accuracy":0.8148056932, "epoch":1.0101010101, "step":300, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":null, "grad_norm":null, "learning_rate":null, "entropy":null, "num_tokens":null, "mean_token_accuracy":null, "epoch":1.0101010101, "step":300, "eval_loss":0.6720606685, "eval_runtime":10.367, "eval_samples_per_second":24.115, "eval_steps_per_second":3.087, "eval_entropy":0.6333643645, "eval_num_tokens":4815033.0, "eval_mean_token_accuracy":0.8116748761, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.5649, "grad_norm":0.4388367832, "learning_rate":0.0001487352, "entropy":0.5757804383, "num_tokens":5135569.0, "mean_token_accuracy":0.8381757662, "epoch":1.0774410774, "step":320, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.564, "grad_norm":0.4527507126, "learning_rate":0.0001422618, "entropy":0.5801134199, "num_tokens":5456292.0, "mean_token_accuracy":0.8384027012, "epoch":1.1447811448, "step":340, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.5403, "grad_norm":0.6442076564, "learning_rate":0.0001355651, "entropy":0.5545659784, "num_tokens":5779926.0, "mean_token_accuracy":0.8451263145, "epoch":1.2121212121, "step":360, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.5554, "grad_norm":0.5305426717, "learning_rate":0.0001286803, "entropy":0.5719649505, "num_tokens":6100921.0, "mean_token_accuracy":0.8403378457, "epoch":1.2794612795, "step":380, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.5345, "grad_norm":0.5867527723, "learning_rate":0.000121644, "entropy":0.5514425825, "num_tokens":6423622.0, "mean_token_accuracy":0.8455459923, "epoch":1.3468013468, "step":400, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":null, "grad_norm":null, "learning_rate":null, "entropy":null, "num_tokens":null, "mean_token_accuracy":null, "epoch":1.3468013468, "step":400, "eval_loss":0.5978295803, "eval_runtime":10.3395, "eval_samples_per_second":24.179, "eval_steps_per_second":3.095, "eval_entropy":0.5523942402, "eval_num_tokens":6423622.0, "eval_mean_token_accuracy":0.8328636196, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.5122, "grad_norm":0.5380092859, "learning_rate":0.0001144932, "entropy":0.5361574471, "num_tokens":6744569.0, "mean_token_accuracy":0.8533119515, "epoch":1.4141414141, "step":420, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.4923, "grad_norm":0.5738714933, "learning_rate":0.0001072658, "entropy":0.5068911854, "num_tokens":7065251.0, "mean_token_accuracy":0.8583682023, "epoch":1.4814814815, "step":440, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.4808, "grad_norm":0.5104277134, "learning_rate":0.0001, "entropy":0.504258769, "num_tokens":7385952.0, "mean_token_accuracy":0.861315985, "epoch":1.5488215488, "step":460, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.4867, "grad_norm":0.5913535357, "learning_rate":0.0000927342, "entropy":0.5113813952, "num_tokens":7704982.0, "mean_token_accuracy":0.8592018247, "epoch":1.6161616162, "step":480, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.4591, "grad_norm":0.5065989494, "learning_rate":0.0000855068, "entropy":0.4817487616, "num_tokens":8026316.0, "mean_token_accuracy":0.8679369375, "epoch":1.6835016835, "step":500, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":null, "grad_norm":null, "learning_rate":null, "entropy":null, "num_tokens":null, "mean_token_accuracy":null, "epoch":1.6835016835, "step":500, "eval_loss":0.511384666, "eval_runtime":10.3657, "eval_samples_per_second":24.118, "eval_steps_per_second":3.087, "eval_entropy":0.514307227, "eval_num_tokens":8026316.0, "eval_mean_token_accuracy":0.8547733743, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.4296, "grad_norm":0.5839767456, "learning_rate":0.000078356, "entropy":0.4638194107, "num_tokens":8348213.0, "mean_token_accuracy":0.8757635169, "epoch":1.7508417508, "step":520, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.4351, "grad_norm":0.6890075207, "learning_rate":0.0000713197, "entropy":0.4628964256, "num_tokens":8671513.0, "mean_token_accuracy":0.8734818839, "epoch":1.8181818182, "step":540, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.4173, "grad_norm":0.5538685918, "learning_rate":0.0000644349, "entropy":0.4462708168, "num_tokens":8990602.0, "mean_token_accuracy":0.8780993037, "epoch":1.8855218855, "step":560, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.4249, "grad_norm":0.6900932789, "learning_rate":0.0000577382, "entropy":0.4559292875, "num_tokens":9310065.0, "mean_token_accuracy":0.8769208066, "epoch":1.9528619529, "step":580, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.3863, "grad_norm":0.6177843809, "learning_rate":0.0000512648, "entropy":0.4270478457, "num_tokens":9629281.0, "mean_token_accuracy":0.8871142037, "epoch":2.0202020202, "step":600, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":null, "grad_norm":null, "learning_rate":null, "entropy":null, "num_tokens":null, "mean_token_accuracy":null, "epoch":2.0202020202, "step":600, "eval_loss":0.4490914941, "eval_runtime":10.371, "eval_samples_per_second":24.106, "eval_steps_per_second":3.086, "eval_entropy":0.4390519308, "eval_num_tokens":9629281.0, "eval_mean_token_accuracy":0.8718043752, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.3554, "grad_norm":0.5899857879, "learning_rate":0.0000450491, "entropy":0.396466079, "num_tokens":9951673.0, "mean_token_accuracy":0.8959640451, "epoch":2.0875420875, "step":620, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.3401, "grad_norm":0.6384023428, "learning_rate":0.0000391239, "entropy":0.3796210378, "num_tokens":10273617.0, "mean_token_accuracy":0.8999493234, "epoch":2.1548821549, "step":640, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.3281, "grad_norm":0.6890760064, "learning_rate":0.0000335204, "entropy":0.3717882721, "num_tokens":10594830.0, "mean_token_accuracy":0.9037643224, "epoch":2.2222222222, "step":660, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.32, "grad_norm":0.6508978605, "learning_rate":0.0000282684, "entropy":0.3625029052, "num_tokens":10916597.0, "mean_token_accuracy":0.9063778028, "epoch":2.2895622896, "step":680, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.3189, "grad_norm":0.6131536961, "learning_rate":0.0000233956, "entropy":0.3583062481, "num_tokens":11235743.0, "mean_token_accuracy":0.9068948857, "epoch":2.3569023569, "step":700, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":null, "grad_norm":null, "learning_rate":null, "entropy":null, "num_tokens":null, "mean_token_accuracy":null, "epoch":2.3569023569, "step":700, "eval_loss":0.4149619639, "eval_runtime":10.3707, "eval_samples_per_second":24.106, "eval_steps_per_second":3.086, "eval_entropy":0.4037288642, "eval_num_tokens":11235743.0, "eval_mean_token_accuracy":0.8824688997, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.3248, "grad_norm":0.5035169125, "learning_rate":0.0000189277, "entropy":0.3696210571, "num_tokens":11556934.0, "mean_token_accuracy":0.9046057545, "epoch":2.4242424242, "step":720, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.3126, "grad_norm":0.5420159698, "learning_rate":0.0000148883, "entropy":0.3530109294, "num_tokens":11879049.0, "mean_token_accuracy":0.910102234, "epoch":2.4915824916, "step":740, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.3017, "grad_norm":0.4808464348, "learning_rate":0.0000112989, "entropy":0.3429520307, "num_tokens":12199559.0, "mean_token_accuracy":0.9116890863, "epoch":2.5589225589, "step":760, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.2944, "grad_norm":0.5233286023, "learning_rate":0.0000081784, "entropy":0.3373699239, "num_tokens":12518745.0, "mean_token_accuracy":0.9141617462, "epoch":2.6262626263, "step":780, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.3132, "grad_norm":0.540781498, "learning_rate":0.0000055432, "entropy":0.3541788673, "num_tokens":12839323.0, "mean_token_accuracy":0.9097139165, "epoch":2.6936026936, "step":800, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":null, "grad_norm":null, "learning_rate":null, "entropy":null, "num_tokens":null, "mean_token_accuracy":null, "epoch":2.6936026936, "step":800, "eval_loss":0.3966158926, "eval_runtime":10.3484, "eval_samples_per_second":24.158, "eval_steps_per_second":3.092, "eval_entropy":0.3902668599, "eval_num_tokens":12839323.0, "eval_mean_token_accuracy":0.8880477473, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.3082, "grad_norm":0.5258508921, "learning_rate":0.0000034074, "entropy":0.3506111713, "num_tokens":13160627.0, "mean_token_accuracy":0.9098422483, "epoch":2.7609427609, "step":820, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.3079, "grad_norm":0.4996784031, "learning_rate":0.0000017822, "entropy":0.3474921705, "num_tokens":13481114.0, "mean_token_accuracy":0.9094100349, "epoch":2.8282828283, "step":840, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.3092, "grad_norm":0.4853805304, "learning_rate":0.0000006762, "entropy":0.3521205258, "num_tokens":13803114.0, "mean_token_accuracy":0.9098572351, "epoch":2.8956228956, "step":860, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":0.3038, "grad_norm":0.5111385584, "learning_rate":0.0000000952, "entropy":0.3490505032, "num_tokens":14125289.0, "mean_token_accuracy":0.9108231679, "epoch":2.962962963, "step":880, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":null, "train_samples_per_second":null, "train_steps_per_second":null, "total_flos":null, "train_loss":null }, { "loss":null, "grad_norm":null, "learning_rate":null, "entropy":null, "num_tokens":null, "mean_token_accuracy":null, "epoch":3.0, "step":891, "eval_loss":null, "eval_runtime":null, "eval_samples_per_second":null, "eval_steps_per_second":null, "eval_entropy":null, "eval_num_tokens":null, "eval_mean_token_accuracy":null, "train_runtime":1912.4699, "train_samples_per_second":7.451, "train_steps_per_second":0.466, "total_flos":1.163395683e+17, "train_loss":0.5388089069 } ]