Qwen2.5-Coder-1.5B-LoRA-Deep-v2 / Qwen2.5-Coder-1.5B-LoRA-Deep_training_logs.json
sinem02's picture
Upload Qwen2.5-Coder-1.5B-LoRA-Deep_training_logs.json
23f39e5 verified
[
{
"loss":1.2008,
"grad_norm":0.3822754323,
"learning_rate":0.0001407407,
"entropy":1.0346003115,
"num_tokens":322124.0,
"mean_token_accuracy":0.7109046429,
"epoch":0.0673400673,
"step":20,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.9385,
"grad_norm":0.2103841156,
"learning_rate":0.0001999048,
"entropy":0.9494877957,
"num_tokens":642995.0,
"mean_token_accuracy":0.7539383888,
"epoch":0.1346801347,
"step":40,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.915,
"grad_norm":0.206428811,
"learning_rate":0.0001993238,
"entropy":0.9191693425,
"num_tokens":966396.0,
"mean_token_accuracy":0.7584572025,
"epoch":0.202020202,
"step":60,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.8441,
"grad_norm":0.3027354181,
"learning_rate":0.0001982178,
"entropy":0.8431956261,
"num_tokens":1287058.0,
"mean_token_accuracy":0.7713396206,
"epoch":0.2693602694,
"step":80,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.8408,
"grad_norm":0.2982031703,
"learning_rate":0.0001965926,
"entropy":0.8472392239,
"num_tokens":1607723.0,
"mean_token_accuracy":0.7738652974,
"epoch":0.3367003367,
"step":100,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":null,
"grad_norm":null,
"learning_rate":null,
"entropy":null,
"num_tokens":null,
"mean_token_accuracy":null,
"epoch":0.3367003367,
"step":100,
"eval_loss":0.8332510591,
"eval_runtime":10.3382,
"eval_samples_per_second":24.182,
"eval_steps_per_second":3.095,
"eval_entropy":0.8475092333,
"eval_num_tokens":1607723.0,
"eval_mean_token_accuracy":0.7752955835,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.8262,
"grad_norm":0.2738818824,
"learning_rate":0.0001944568,
"entropy":0.8286631659,
"num_tokens":1928620.0,
"mean_token_accuracy":0.7755305201,
"epoch":0.404040404,
"step":120,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.8089,
"grad_norm":0.2727711201,
"learning_rate":0.0001918216,
"entropy":0.8132541452,
"num_tokens":2249401.0,
"mean_token_accuracy":0.779610493,
"epoch":0.4713804714,
"step":140,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.7815,
"grad_norm":0.253259182,
"learning_rate":0.0001887011,
"entropy":0.7838059939,
"num_tokens":2571041.0,
"mean_token_accuracy":0.785765557,
"epoch":0.5387205387,
"step":160,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.763,
"grad_norm":0.2851669788,
"learning_rate":0.0001851117,
"entropy":0.7674662221,
"num_tokens":2890814.0,
"mean_token_accuracy":0.7893050611,
"epoch":0.6060606061,
"step":180,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.7434,
"grad_norm":0.2782152891,
"learning_rate":0.0001810723,
"entropy":0.7478979569,
"num_tokens":3212811.0,
"mean_token_accuracy":0.7946783796,
"epoch":0.6734006734,
"step":200,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":null,
"grad_norm":null,
"learning_rate":null,
"entropy":null,
"num_tokens":null,
"mean_token_accuracy":null,
"epoch":0.6734006734,
"step":200,
"eval_loss":0.7540781498,
"eval_runtime":10.3368,
"eval_samples_per_second":24.185,
"eval_steps_per_second":3.096,
"eval_entropy":0.7548957299,
"eval_num_tokens":3212811.0,
"eval_mean_token_accuracy":0.7921991255,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.718,
"grad_norm":0.2911323905,
"learning_rate":0.0001766044,
"entropy":0.7216884721,
"num_tokens":3534962.0,
"mean_token_accuracy":0.8007057041,
"epoch":0.7407407407,
"step":220,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.7015,
"grad_norm":0.3469219804,
"learning_rate":0.0001717316,
"entropy":0.7073224507,
"num_tokens":3855519.0,
"mean_token_accuracy":0.8033309393,
"epoch":0.8080808081,
"step":240,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.7066,
"grad_norm":0.3413038254,
"learning_rate":0.0001664796,
"entropy":0.7131307989,
"num_tokens":4174694.0,
"mean_token_accuracy":0.8030782551,
"epoch":0.8754208754,
"step":260,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.6725,
"grad_norm":0.3970124125,
"learning_rate":0.0001608761,
"entropy":0.6751278345,
"num_tokens":4495214.0,
"mean_token_accuracy":0.8109409161,
"epoch":0.9427609428,
"step":280,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.6567,
"grad_norm":0.4383921921,
"learning_rate":0.0001549509,
"entropy":0.6729893133,
"num_tokens":4815033.0,
"mean_token_accuracy":0.8148056932,
"epoch":1.0101010101,
"step":300,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":null,
"grad_norm":null,
"learning_rate":null,
"entropy":null,
"num_tokens":null,
"mean_token_accuracy":null,
"epoch":1.0101010101,
"step":300,
"eval_loss":0.6720606685,
"eval_runtime":10.367,
"eval_samples_per_second":24.115,
"eval_steps_per_second":3.087,
"eval_entropy":0.6333643645,
"eval_num_tokens":4815033.0,
"eval_mean_token_accuracy":0.8116748761,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.5649,
"grad_norm":0.4388367832,
"learning_rate":0.0001487352,
"entropy":0.5757804383,
"num_tokens":5135569.0,
"mean_token_accuracy":0.8381757662,
"epoch":1.0774410774,
"step":320,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.564,
"grad_norm":0.4527507126,
"learning_rate":0.0001422618,
"entropy":0.5801134199,
"num_tokens":5456292.0,
"mean_token_accuracy":0.8384027012,
"epoch":1.1447811448,
"step":340,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.5403,
"grad_norm":0.6442076564,
"learning_rate":0.0001355651,
"entropy":0.5545659784,
"num_tokens":5779926.0,
"mean_token_accuracy":0.8451263145,
"epoch":1.2121212121,
"step":360,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.5554,
"grad_norm":0.5305426717,
"learning_rate":0.0001286803,
"entropy":0.5719649505,
"num_tokens":6100921.0,
"mean_token_accuracy":0.8403378457,
"epoch":1.2794612795,
"step":380,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.5345,
"grad_norm":0.5867527723,
"learning_rate":0.000121644,
"entropy":0.5514425825,
"num_tokens":6423622.0,
"mean_token_accuracy":0.8455459923,
"epoch":1.3468013468,
"step":400,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":null,
"grad_norm":null,
"learning_rate":null,
"entropy":null,
"num_tokens":null,
"mean_token_accuracy":null,
"epoch":1.3468013468,
"step":400,
"eval_loss":0.5978295803,
"eval_runtime":10.3395,
"eval_samples_per_second":24.179,
"eval_steps_per_second":3.095,
"eval_entropy":0.5523942402,
"eval_num_tokens":6423622.0,
"eval_mean_token_accuracy":0.8328636196,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.5122,
"grad_norm":0.5380092859,
"learning_rate":0.0001144932,
"entropy":0.5361574471,
"num_tokens":6744569.0,
"mean_token_accuracy":0.8533119515,
"epoch":1.4141414141,
"step":420,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.4923,
"grad_norm":0.5738714933,
"learning_rate":0.0001072658,
"entropy":0.5068911854,
"num_tokens":7065251.0,
"mean_token_accuracy":0.8583682023,
"epoch":1.4814814815,
"step":440,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.4808,
"grad_norm":0.5104277134,
"learning_rate":0.0001,
"entropy":0.504258769,
"num_tokens":7385952.0,
"mean_token_accuracy":0.861315985,
"epoch":1.5488215488,
"step":460,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.4867,
"grad_norm":0.5913535357,
"learning_rate":0.0000927342,
"entropy":0.5113813952,
"num_tokens":7704982.0,
"mean_token_accuracy":0.8592018247,
"epoch":1.6161616162,
"step":480,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.4591,
"grad_norm":0.5065989494,
"learning_rate":0.0000855068,
"entropy":0.4817487616,
"num_tokens":8026316.0,
"mean_token_accuracy":0.8679369375,
"epoch":1.6835016835,
"step":500,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":null,
"grad_norm":null,
"learning_rate":null,
"entropy":null,
"num_tokens":null,
"mean_token_accuracy":null,
"epoch":1.6835016835,
"step":500,
"eval_loss":0.511384666,
"eval_runtime":10.3657,
"eval_samples_per_second":24.118,
"eval_steps_per_second":3.087,
"eval_entropy":0.514307227,
"eval_num_tokens":8026316.0,
"eval_mean_token_accuracy":0.8547733743,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.4296,
"grad_norm":0.5839767456,
"learning_rate":0.000078356,
"entropy":0.4638194107,
"num_tokens":8348213.0,
"mean_token_accuracy":0.8757635169,
"epoch":1.7508417508,
"step":520,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.4351,
"grad_norm":0.6890075207,
"learning_rate":0.0000713197,
"entropy":0.4628964256,
"num_tokens":8671513.0,
"mean_token_accuracy":0.8734818839,
"epoch":1.8181818182,
"step":540,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.4173,
"grad_norm":0.5538685918,
"learning_rate":0.0000644349,
"entropy":0.4462708168,
"num_tokens":8990602.0,
"mean_token_accuracy":0.8780993037,
"epoch":1.8855218855,
"step":560,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.4249,
"grad_norm":0.6900932789,
"learning_rate":0.0000577382,
"entropy":0.4559292875,
"num_tokens":9310065.0,
"mean_token_accuracy":0.8769208066,
"epoch":1.9528619529,
"step":580,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.3863,
"grad_norm":0.6177843809,
"learning_rate":0.0000512648,
"entropy":0.4270478457,
"num_tokens":9629281.0,
"mean_token_accuracy":0.8871142037,
"epoch":2.0202020202,
"step":600,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":null,
"grad_norm":null,
"learning_rate":null,
"entropy":null,
"num_tokens":null,
"mean_token_accuracy":null,
"epoch":2.0202020202,
"step":600,
"eval_loss":0.4490914941,
"eval_runtime":10.371,
"eval_samples_per_second":24.106,
"eval_steps_per_second":3.086,
"eval_entropy":0.4390519308,
"eval_num_tokens":9629281.0,
"eval_mean_token_accuracy":0.8718043752,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.3554,
"grad_norm":0.5899857879,
"learning_rate":0.0000450491,
"entropy":0.396466079,
"num_tokens":9951673.0,
"mean_token_accuracy":0.8959640451,
"epoch":2.0875420875,
"step":620,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.3401,
"grad_norm":0.6384023428,
"learning_rate":0.0000391239,
"entropy":0.3796210378,
"num_tokens":10273617.0,
"mean_token_accuracy":0.8999493234,
"epoch":2.1548821549,
"step":640,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.3281,
"grad_norm":0.6890760064,
"learning_rate":0.0000335204,
"entropy":0.3717882721,
"num_tokens":10594830.0,
"mean_token_accuracy":0.9037643224,
"epoch":2.2222222222,
"step":660,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.32,
"grad_norm":0.6508978605,
"learning_rate":0.0000282684,
"entropy":0.3625029052,
"num_tokens":10916597.0,
"mean_token_accuracy":0.9063778028,
"epoch":2.2895622896,
"step":680,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.3189,
"grad_norm":0.6131536961,
"learning_rate":0.0000233956,
"entropy":0.3583062481,
"num_tokens":11235743.0,
"mean_token_accuracy":0.9068948857,
"epoch":2.3569023569,
"step":700,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":null,
"grad_norm":null,
"learning_rate":null,
"entropy":null,
"num_tokens":null,
"mean_token_accuracy":null,
"epoch":2.3569023569,
"step":700,
"eval_loss":0.4149619639,
"eval_runtime":10.3707,
"eval_samples_per_second":24.106,
"eval_steps_per_second":3.086,
"eval_entropy":0.4037288642,
"eval_num_tokens":11235743.0,
"eval_mean_token_accuracy":0.8824688997,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.3248,
"grad_norm":0.5035169125,
"learning_rate":0.0000189277,
"entropy":0.3696210571,
"num_tokens":11556934.0,
"mean_token_accuracy":0.9046057545,
"epoch":2.4242424242,
"step":720,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.3126,
"grad_norm":0.5420159698,
"learning_rate":0.0000148883,
"entropy":0.3530109294,
"num_tokens":11879049.0,
"mean_token_accuracy":0.910102234,
"epoch":2.4915824916,
"step":740,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.3017,
"grad_norm":0.4808464348,
"learning_rate":0.0000112989,
"entropy":0.3429520307,
"num_tokens":12199559.0,
"mean_token_accuracy":0.9116890863,
"epoch":2.5589225589,
"step":760,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.2944,
"grad_norm":0.5233286023,
"learning_rate":0.0000081784,
"entropy":0.3373699239,
"num_tokens":12518745.0,
"mean_token_accuracy":0.9141617462,
"epoch":2.6262626263,
"step":780,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.3132,
"grad_norm":0.540781498,
"learning_rate":0.0000055432,
"entropy":0.3541788673,
"num_tokens":12839323.0,
"mean_token_accuracy":0.9097139165,
"epoch":2.6936026936,
"step":800,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":null,
"grad_norm":null,
"learning_rate":null,
"entropy":null,
"num_tokens":null,
"mean_token_accuracy":null,
"epoch":2.6936026936,
"step":800,
"eval_loss":0.3966158926,
"eval_runtime":10.3484,
"eval_samples_per_second":24.158,
"eval_steps_per_second":3.092,
"eval_entropy":0.3902668599,
"eval_num_tokens":12839323.0,
"eval_mean_token_accuracy":0.8880477473,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.3082,
"grad_norm":0.5258508921,
"learning_rate":0.0000034074,
"entropy":0.3506111713,
"num_tokens":13160627.0,
"mean_token_accuracy":0.9098422483,
"epoch":2.7609427609,
"step":820,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.3079,
"grad_norm":0.4996784031,
"learning_rate":0.0000017822,
"entropy":0.3474921705,
"num_tokens":13481114.0,
"mean_token_accuracy":0.9094100349,
"epoch":2.8282828283,
"step":840,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.3092,
"grad_norm":0.4853805304,
"learning_rate":0.0000006762,
"entropy":0.3521205258,
"num_tokens":13803114.0,
"mean_token_accuracy":0.9098572351,
"epoch":2.8956228956,
"step":860,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":0.3038,
"grad_norm":0.5111385584,
"learning_rate":0.0000000952,
"entropy":0.3490505032,
"num_tokens":14125289.0,
"mean_token_accuracy":0.9108231679,
"epoch":2.962962963,
"step":880,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":null,
"train_samples_per_second":null,
"train_steps_per_second":null,
"total_flos":null,
"train_loss":null
},
{
"loss":null,
"grad_norm":null,
"learning_rate":null,
"entropy":null,
"num_tokens":null,
"mean_token_accuracy":null,
"epoch":3.0,
"step":891,
"eval_loss":null,
"eval_runtime":null,
"eval_samples_per_second":null,
"eval_steps_per_second":null,
"eval_entropy":null,
"eval_num_tokens":null,
"eval_mean_token_accuracy":null,
"train_runtime":1912.4699,
"train_samples_per_second":7.451,
"train_steps_per_second":0.466,
"total_flos":1.163395683e+17,
"train_loss":0.5388089069
}
]