| [ | |
| { | |
| "loss":1.2008, | |
| "grad_norm":0.3822754323, | |
| "learning_rate":0.0001407407, | |
| "entropy":1.0346003115, | |
| "num_tokens":322124.0, | |
| "mean_token_accuracy":0.7109046429, | |
| "epoch":0.0673400673, | |
| "step":20, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.9385, | |
| "grad_norm":0.2103841156, | |
| "learning_rate":0.0001999048, | |
| "entropy":0.9494877957, | |
| "num_tokens":642995.0, | |
| "mean_token_accuracy":0.7539383888, | |
| "epoch":0.1346801347, | |
| "step":40, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.915, | |
| "grad_norm":0.206428811, | |
| "learning_rate":0.0001993238, | |
| "entropy":0.9191693425, | |
| "num_tokens":966396.0, | |
| "mean_token_accuracy":0.7584572025, | |
| "epoch":0.202020202, | |
| "step":60, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.8441, | |
| "grad_norm":0.3027354181, | |
| "learning_rate":0.0001982178, | |
| "entropy":0.8431956261, | |
| "num_tokens":1287058.0, | |
| "mean_token_accuracy":0.7713396206, | |
| "epoch":0.2693602694, | |
| "step":80, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.8408, | |
| "grad_norm":0.2982031703, | |
| "learning_rate":0.0001965926, | |
| "entropy":0.8472392239, | |
| "num_tokens":1607723.0, | |
| "mean_token_accuracy":0.7738652974, | |
| "epoch":0.3367003367, | |
| "step":100, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":null, | |
| "grad_norm":null, | |
| "learning_rate":null, | |
| "entropy":null, | |
| "num_tokens":null, | |
| "mean_token_accuracy":null, | |
| "epoch":0.3367003367, | |
| "step":100, | |
| "eval_loss":0.8332510591, | |
| "eval_runtime":10.3382, | |
| "eval_samples_per_second":24.182, | |
| "eval_steps_per_second":3.095, | |
| "eval_entropy":0.8475092333, | |
| "eval_num_tokens":1607723.0, | |
| "eval_mean_token_accuracy":0.7752955835, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.8262, | |
| "grad_norm":0.2738818824, | |
| "learning_rate":0.0001944568, | |
| "entropy":0.8286631659, | |
| "num_tokens":1928620.0, | |
| "mean_token_accuracy":0.7755305201, | |
| "epoch":0.404040404, | |
| "step":120, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.8089, | |
| "grad_norm":0.2727711201, | |
| "learning_rate":0.0001918216, | |
| "entropy":0.8132541452, | |
| "num_tokens":2249401.0, | |
| "mean_token_accuracy":0.779610493, | |
| "epoch":0.4713804714, | |
| "step":140, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.7815, | |
| "grad_norm":0.253259182, | |
| "learning_rate":0.0001887011, | |
| "entropy":0.7838059939, | |
| "num_tokens":2571041.0, | |
| "mean_token_accuracy":0.785765557, | |
| "epoch":0.5387205387, | |
| "step":160, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.763, | |
| "grad_norm":0.2851669788, | |
| "learning_rate":0.0001851117, | |
| "entropy":0.7674662221, | |
| "num_tokens":2890814.0, | |
| "mean_token_accuracy":0.7893050611, | |
| "epoch":0.6060606061, | |
| "step":180, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.7434, | |
| "grad_norm":0.2782152891, | |
| "learning_rate":0.0001810723, | |
| "entropy":0.7478979569, | |
| "num_tokens":3212811.0, | |
| "mean_token_accuracy":0.7946783796, | |
| "epoch":0.6734006734, | |
| "step":200, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":null, | |
| "grad_norm":null, | |
| "learning_rate":null, | |
| "entropy":null, | |
| "num_tokens":null, | |
| "mean_token_accuracy":null, | |
| "epoch":0.6734006734, | |
| "step":200, | |
| "eval_loss":0.7540781498, | |
| "eval_runtime":10.3368, | |
| "eval_samples_per_second":24.185, | |
| "eval_steps_per_second":3.096, | |
| "eval_entropy":0.7548957299, | |
| "eval_num_tokens":3212811.0, | |
| "eval_mean_token_accuracy":0.7921991255, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.718, | |
| "grad_norm":0.2911323905, | |
| "learning_rate":0.0001766044, | |
| "entropy":0.7216884721, | |
| "num_tokens":3534962.0, | |
| "mean_token_accuracy":0.8007057041, | |
| "epoch":0.7407407407, | |
| "step":220, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.7015, | |
| "grad_norm":0.3469219804, | |
| "learning_rate":0.0001717316, | |
| "entropy":0.7073224507, | |
| "num_tokens":3855519.0, | |
| "mean_token_accuracy":0.8033309393, | |
| "epoch":0.8080808081, | |
| "step":240, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.7066, | |
| "grad_norm":0.3413038254, | |
| "learning_rate":0.0001664796, | |
| "entropy":0.7131307989, | |
| "num_tokens":4174694.0, | |
| "mean_token_accuracy":0.8030782551, | |
| "epoch":0.8754208754, | |
| "step":260, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.6725, | |
| "grad_norm":0.3970124125, | |
| "learning_rate":0.0001608761, | |
| "entropy":0.6751278345, | |
| "num_tokens":4495214.0, | |
| "mean_token_accuracy":0.8109409161, | |
| "epoch":0.9427609428, | |
| "step":280, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.6567, | |
| "grad_norm":0.4383921921, | |
| "learning_rate":0.0001549509, | |
| "entropy":0.6729893133, | |
| "num_tokens":4815033.0, | |
| "mean_token_accuracy":0.8148056932, | |
| "epoch":1.0101010101, | |
| "step":300, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":null, | |
| "grad_norm":null, | |
| "learning_rate":null, | |
| "entropy":null, | |
| "num_tokens":null, | |
| "mean_token_accuracy":null, | |
| "epoch":1.0101010101, | |
| "step":300, | |
| "eval_loss":0.6720606685, | |
| "eval_runtime":10.367, | |
| "eval_samples_per_second":24.115, | |
| "eval_steps_per_second":3.087, | |
| "eval_entropy":0.6333643645, | |
| "eval_num_tokens":4815033.0, | |
| "eval_mean_token_accuracy":0.8116748761, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.5649, | |
| "grad_norm":0.4388367832, | |
| "learning_rate":0.0001487352, | |
| "entropy":0.5757804383, | |
| "num_tokens":5135569.0, | |
| "mean_token_accuracy":0.8381757662, | |
| "epoch":1.0774410774, | |
| "step":320, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.564, | |
| "grad_norm":0.4527507126, | |
| "learning_rate":0.0001422618, | |
| "entropy":0.5801134199, | |
| "num_tokens":5456292.0, | |
| "mean_token_accuracy":0.8384027012, | |
| "epoch":1.1447811448, | |
| "step":340, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.5403, | |
| "grad_norm":0.6442076564, | |
| "learning_rate":0.0001355651, | |
| "entropy":0.5545659784, | |
| "num_tokens":5779926.0, | |
| "mean_token_accuracy":0.8451263145, | |
| "epoch":1.2121212121, | |
| "step":360, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.5554, | |
| "grad_norm":0.5305426717, | |
| "learning_rate":0.0001286803, | |
| "entropy":0.5719649505, | |
| "num_tokens":6100921.0, | |
| "mean_token_accuracy":0.8403378457, | |
| "epoch":1.2794612795, | |
| "step":380, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.5345, | |
| "grad_norm":0.5867527723, | |
| "learning_rate":0.000121644, | |
| "entropy":0.5514425825, | |
| "num_tokens":6423622.0, | |
| "mean_token_accuracy":0.8455459923, | |
| "epoch":1.3468013468, | |
| "step":400, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":null, | |
| "grad_norm":null, | |
| "learning_rate":null, | |
| "entropy":null, | |
| "num_tokens":null, | |
| "mean_token_accuracy":null, | |
| "epoch":1.3468013468, | |
| "step":400, | |
| "eval_loss":0.5978295803, | |
| "eval_runtime":10.3395, | |
| "eval_samples_per_second":24.179, | |
| "eval_steps_per_second":3.095, | |
| "eval_entropy":0.5523942402, | |
| "eval_num_tokens":6423622.0, | |
| "eval_mean_token_accuracy":0.8328636196, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.5122, | |
| "grad_norm":0.5380092859, | |
| "learning_rate":0.0001144932, | |
| "entropy":0.5361574471, | |
| "num_tokens":6744569.0, | |
| "mean_token_accuracy":0.8533119515, | |
| "epoch":1.4141414141, | |
| "step":420, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.4923, | |
| "grad_norm":0.5738714933, | |
| "learning_rate":0.0001072658, | |
| "entropy":0.5068911854, | |
| "num_tokens":7065251.0, | |
| "mean_token_accuracy":0.8583682023, | |
| "epoch":1.4814814815, | |
| "step":440, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.4808, | |
| "grad_norm":0.5104277134, | |
| "learning_rate":0.0001, | |
| "entropy":0.504258769, | |
| "num_tokens":7385952.0, | |
| "mean_token_accuracy":0.861315985, | |
| "epoch":1.5488215488, | |
| "step":460, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.4867, | |
| "grad_norm":0.5913535357, | |
| "learning_rate":0.0000927342, | |
| "entropy":0.5113813952, | |
| "num_tokens":7704982.0, | |
| "mean_token_accuracy":0.8592018247, | |
| "epoch":1.6161616162, | |
| "step":480, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.4591, | |
| "grad_norm":0.5065989494, | |
| "learning_rate":0.0000855068, | |
| "entropy":0.4817487616, | |
| "num_tokens":8026316.0, | |
| "mean_token_accuracy":0.8679369375, | |
| "epoch":1.6835016835, | |
| "step":500, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":null, | |
| "grad_norm":null, | |
| "learning_rate":null, | |
| "entropy":null, | |
| "num_tokens":null, | |
| "mean_token_accuracy":null, | |
| "epoch":1.6835016835, | |
| "step":500, | |
| "eval_loss":0.511384666, | |
| "eval_runtime":10.3657, | |
| "eval_samples_per_second":24.118, | |
| "eval_steps_per_second":3.087, | |
| "eval_entropy":0.514307227, | |
| "eval_num_tokens":8026316.0, | |
| "eval_mean_token_accuracy":0.8547733743, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.4296, | |
| "grad_norm":0.5839767456, | |
| "learning_rate":0.000078356, | |
| "entropy":0.4638194107, | |
| "num_tokens":8348213.0, | |
| "mean_token_accuracy":0.8757635169, | |
| "epoch":1.7508417508, | |
| "step":520, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.4351, | |
| "grad_norm":0.6890075207, | |
| "learning_rate":0.0000713197, | |
| "entropy":0.4628964256, | |
| "num_tokens":8671513.0, | |
| "mean_token_accuracy":0.8734818839, | |
| "epoch":1.8181818182, | |
| "step":540, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.4173, | |
| "grad_norm":0.5538685918, | |
| "learning_rate":0.0000644349, | |
| "entropy":0.4462708168, | |
| "num_tokens":8990602.0, | |
| "mean_token_accuracy":0.8780993037, | |
| "epoch":1.8855218855, | |
| "step":560, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.4249, | |
| "grad_norm":0.6900932789, | |
| "learning_rate":0.0000577382, | |
| "entropy":0.4559292875, | |
| "num_tokens":9310065.0, | |
| "mean_token_accuracy":0.8769208066, | |
| "epoch":1.9528619529, | |
| "step":580, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.3863, | |
| "grad_norm":0.6177843809, | |
| "learning_rate":0.0000512648, | |
| "entropy":0.4270478457, | |
| "num_tokens":9629281.0, | |
| "mean_token_accuracy":0.8871142037, | |
| "epoch":2.0202020202, | |
| "step":600, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":null, | |
| "grad_norm":null, | |
| "learning_rate":null, | |
| "entropy":null, | |
| "num_tokens":null, | |
| "mean_token_accuracy":null, | |
| "epoch":2.0202020202, | |
| "step":600, | |
| "eval_loss":0.4490914941, | |
| "eval_runtime":10.371, | |
| "eval_samples_per_second":24.106, | |
| "eval_steps_per_second":3.086, | |
| "eval_entropy":0.4390519308, | |
| "eval_num_tokens":9629281.0, | |
| "eval_mean_token_accuracy":0.8718043752, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.3554, | |
| "grad_norm":0.5899857879, | |
| "learning_rate":0.0000450491, | |
| "entropy":0.396466079, | |
| "num_tokens":9951673.0, | |
| "mean_token_accuracy":0.8959640451, | |
| "epoch":2.0875420875, | |
| "step":620, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.3401, | |
| "grad_norm":0.6384023428, | |
| "learning_rate":0.0000391239, | |
| "entropy":0.3796210378, | |
| "num_tokens":10273617.0, | |
| "mean_token_accuracy":0.8999493234, | |
| "epoch":2.1548821549, | |
| "step":640, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.3281, | |
| "grad_norm":0.6890760064, | |
| "learning_rate":0.0000335204, | |
| "entropy":0.3717882721, | |
| "num_tokens":10594830.0, | |
| "mean_token_accuracy":0.9037643224, | |
| "epoch":2.2222222222, | |
| "step":660, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.32, | |
| "grad_norm":0.6508978605, | |
| "learning_rate":0.0000282684, | |
| "entropy":0.3625029052, | |
| "num_tokens":10916597.0, | |
| "mean_token_accuracy":0.9063778028, | |
| "epoch":2.2895622896, | |
| "step":680, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.3189, | |
| "grad_norm":0.6131536961, | |
| "learning_rate":0.0000233956, | |
| "entropy":0.3583062481, | |
| "num_tokens":11235743.0, | |
| "mean_token_accuracy":0.9068948857, | |
| "epoch":2.3569023569, | |
| "step":700, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":null, | |
| "grad_norm":null, | |
| "learning_rate":null, | |
| "entropy":null, | |
| "num_tokens":null, | |
| "mean_token_accuracy":null, | |
| "epoch":2.3569023569, | |
| "step":700, | |
| "eval_loss":0.4149619639, | |
| "eval_runtime":10.3707, | |
| "eval_samples_per_second":24.106, | |
| "eval_steps_per_second":3.086, | |
| "eval_entropy":0.4037288642, | |
| "eval_num_tokens":11235743.0, | |
| "eval_mean_token_accuracy":0.8824688997, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.3248, | |
| "grad_norm":0.5035169125, | |
| "learning_rate":0.0000189277, | |
| "entropy":0.3696210571, | |
| "num_tokens":11556934.0, | |
| "mean_token_accuracy":0.9046057545, | |
| "epoch":2.4242424242, | |
| "step":720, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.3126, | |
| "grad_norm":0.5420159698, | |
| "learning_rate":0.0000148883, | |
| "entropy":0.3530109294, | |
| "num_tokens":11879049.0, | |
| "mean_token_accuracy":0.910102234, | |
| "epoch":2.4915824916, | |
| "step":740, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.3017, | |
| "grad_norm":0.4808464348, | |
| "learning_rate":0.0000112989, | |
| "entropy":0.3429520307, | |
| "num_tokens":12199559.0, | |
| "mean_token_accuracy":0.9116890863, | |
| "epoch":2.5589225589, | |
| "step":760, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.2944, | |
| "grad_norm":0.5233286023, | |
| "learning_rate":0.0000081784, | |
| "entropy":0.3373699239, | |
| "num_tokens":12518745.0, | |
| "mean_token_accuracy":0.9141617462, | |
| "epoch":2.6262626263, | |
| "step":780, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.3132, | |
| "grad_norm":0.540781498, | |
| "learning_rate":0.0000055432, | |
| "entropy":0.3541788673, | |
| "num_tokens":12839323.0, | |
| "mean_token_accuracy":0.9097139165, | |
| "epoch":2.6936026936, | |
| "step":800, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":null, | |
| "grad_norm":null, | |
| "learning_rate":null, | |
| "entropy":null, | |
| "num_tokens":null, | |
| "mean_token_accuracy":null, | |
| "epoch":2.6936026936, | |
| "step":800, | |
| "eval_loss":0.3966158926, | |
| "eval_runtime":10.3484, | |
| "eval_samples_per_second":24.158, | |
| "eval_steps_per_second":3.092, | |
| "eval_entropy":0.3902668599, | |
| "eval_num_tokens":12839323.0, | |
| "eval_mean_token_accuracy":0.8880477473, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.3082, | |
| "grad_norm":0.5258508921, | |
| "learning_rate":0.0000034074, | |
| "entropy":0.3506111713, | |
| "num_tokens":13160627.0, | |
| "mean_token_accuracy":0.9098422483, | |
| "epoch":2.7609427609, | |
| "step":820, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.3079, | |
| "grad_norm":0.4996784031, | |
| "learning_rate":0.0000017822, | |
| "entropy":0.3474921705, | |
| "num_tokens":13481114.0, | |
| "mean_token_accuracy":0.9094100349, | |
| "epoch":2.8282828283, | |
| "step":840, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.3092, | |
| "grad_norm":0.4853805304, | |
| "learning_rate":0.0000006762, | |
| "entropy":0.3521205258, | |
| "num_tokens":13803114.0, | |
| "mean_token_accuracy":0.9098572351, | |
| "epoch":2.8956228956, | |
| "step":860, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":0.3038, | |
| "grad_norm":0.5111385584, | |
| "learning_rate":0.0000000952, | |
| "entropy":0.3490505032, | |
| "num_tokens":14125289.0, | |
| "mean_token_accuracy":0.9108231679, | |
| "epoch":2.962962963, | |
| "step":880, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":null, | |
| "train_samples_per_second":null, | |
| "train_steps_per_second":null, | |
| "total_flos":null, | |
| "train_loss":null | |
| }, | |
| { | |
| "loss":null, | |
| "grad_norm":null, | |
| "learning_rate":null, | |
| "entropy":null, | |
| "num_tokens":null, | |
| "mean_token_accuracy":null, | |
| "epoch":3.0, | |
| "step":891, | |
| "eval_loss":null, | |
| "eval_runtime":null, | |
| "eval_samples_per_second":null, | |
| "eval_steps_per_second":null, | |
| "eval_entropy":null, | |
| "eval_num_tokens":null, | |
| "eval_mean_token_accuracy":null, | |
| "train_runtime":1912.4699, | |
| "train_samples_per_second":7.451, | |
| "train_steps_per_second":0.466, | |
| "total_flos":1.163395683e+17, | |
| "train_loss":0.5388089069 | |
| } | |
| ] |