{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 100, "global_step": 681, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.208277304470539, "epoch": 0.044101433296582136, "grad_norm": 0.7034044861793518, "learning_rate": 3.6e-05, "loss": 1.8075, "mean_token_accuracy": 0.7044319450855255, "num_tokens": 162027.0, "step": 10 }, { "entropy": 1.2619145691394806, "epoch": 0.08820286659316427, "grad_norm": 0.56658935546875, "learning_rate": 7.6e-05, "loss": 1.7395, "mean_token_accuracy": 0.7111444145441055, "num_tokens": 322033.0, "step": 20 }, { "entropy": 1.3051191717386246, "epoch": 0.13230429988974643, "grad_norm": 0.9701246023178101, "learning_rate": 0.000116, "loss": 1.4759, "mean_token_accuracy": 0.7372552484273911, "num_tokens": 493134.0, "step": 30 }, { "entropy": 1.2549610823392867, "epoch": 0.17640573318632854, "grad_norm": 0.7130260467529297, "learning_rate": 0.00015600000000000002, "loss": 1.1858, "mean_token_accuracy": 0.7737368658185005, "num_tokens": 657636.0, "step": 40 }, { "entropy": 1.0649129822850227, "epoch": 0.2205071664829107, "grad_norm": 0.31629839539527893, "learning_rate": 0.000196, "loss": 1.0158, "mean_token_accuracy": 0.7959408611059189, "num_tokens": 809381.0, "step": 50 }, { "entropy": 0.9025423139333725, "epoch": 0.26460859977949286, "grad_norm": 0.2634122669696808, "learning_rate": 0.0001971473851030111, "loss": 0.8932, "mean_token_accuracy": 0.8144056230783463, "num_tokens": 970344.0, "step": 60 }, { "entropy": 0.8994027122855186, "epoch": 0.308710033076075, "grad_norm": 0.29325830936431885, "learning_rate": 0.00019397781299524563, "loss": 0.8828, "mean_token_accuracy": 0.8157614529132843, "num_tokens": 1129434.0, "step": 70 }, { "entropy": 0.8638058304786682, "epoch": 0.3528114663726571, "grad_norm": 0.22787100076675415, "learning_rate": 0.0001908082408874802, "loss": 0.8518, "mean_token_accuracy": 0.8206798136234283, "num_tokens": 1290818.0, "step": 80 }, { "entropy": 0.8639321938157082, "epoch": 0.39691289966923926, "grad_norm": 0.27325868606567383, "learning_rate": 0.00018763866877971475, "loss": 0.8633, "mean_token_accuracy": 0.81620042771101, "num_tokens": 1452401.0, "step": 90 }, { "entropy": 0.8432708650827407, "epoch": 0.4410143329658214, "grad_norm": 0.29134032130241394, "learning_rate": 0.0001844690966719493, "loss": 0.8225, "mean_token_accuracy": 0.8236450552940369, "num_tokens": 1616085.0, "step": 100 }, { "epoch": 0.4410143329658214, "eval_entropy": 0.7245843861952866, "eval_loss": 0.6365505456924438, "eval_mean_token_accuracy": 0.8594882246291283, "eval_num_tokens": 1616085.0, "eval_runtime": 8.1048, "eval_samples_per_second": 99.447, "eval_steps_per_second": 12.462, "step": 100 }, { "entropy": 0.7841475278139114, "epoch": 0.48511576626240355, "grad_norm": 0.25806924700737, "learning_rate": 0.00018129952456418384, "loss": 0.7675, "mean_token_accuracy": 0.8291461855173111, "num_tokens": 1780700.0, "step": 110 }, { "entropy": 0.8083247914910316, "epoch": 0.5292171995589857, "grad_norm": 0.2405901849269867, "learning_rate": 0.00017812995245641838, "loss": 0.7929, "mean_token_accuracy": 0.8249855980277061, "num_tokens": 1944758.0, "step": 120 }, { "entropy": 0.8017621666193009, "epoch": 0.5733186328555678, "grad_norm": 0.2787521183490753, "learning_rate": 0.00017496038034865293, "loss": 0.8007, "mean_token_accuracy": 0.8232445836067199, "num_tokens": 2108373.0, "step": 130 }, { "entropy": 0.783287800848484, "epoch": 0.61742006615215, "grad_norm": 0.24215690791606903, "learning_rate": 0.0001717908082408875, "loss": 0.7806, "mean_token_accuracy": 0.82769885212183, "num_tokens": 2279985.0, "step": 140 }, { "entropy": 0.7877024173736572, "epoch": 0.6615214994487321, "grad_norm": 0.32073867321014404, "learning_rate": 0.00016862123613312205, "loss": 0.7749, "mean_token_accuracy": 0.8289618909358978, "num_tokens": 2437290.0, "step": 150 }, { "entropy": 0.7758993163704873, "epoch": 0.7056229327453142, "grad_norm": 0.24493683874607086, "learning_rate": 0.0001654516640253566, "loss": 0.7613, "mean_token_accuracy": 0.8307989597320556, "num_tokens": 2599040.0, "step": 160 }, { "entropy": 0.7710816964507103, "epoch": 0.7497243660418964, "grad_norm": 0.26176023483276367, "learning_rate": 0.00016228209191759114, "loss": 0.7701, "mean_token_accuracy": 0.8283671870827675, "num_tokens": 2763931.0, "step": 170 }, { "entropy": 0.7687478274106979, "epoch": 0.7938257993384785, "grad_norm": 0.33372247219085693, "learning_rate": 0.00015911251980982568, "loss": 0.7599, "mean_token_accuracy": 0.8326913744211197, "num_tokens": 2921138.0, "step": 180 }, { "entropy": 0.7683898612856865, "epoch": 0.8379272326350606, "grad_norm": 0.30479806661605835, "learning_rate": 0.00015594294770206023, "loss": 0.7652, "mean_token_accuracy": 0.8292697682976723, "num_tokens": 3078075.0, "step": 190 }, { "entropy": 0.7426786199212074, "epoch": 0.8820286659316428, "grad_norm": 0.30385708808898926, "learning_rate": 0.0001527733755942948, "loss": 0.7484, "mean_token_accuracy": 0.834077812731266, "num_tokens": 3250532.0, "step": 200 }, { "epoch": 0.8820286659316428, "eval_entropy": 0.6650676030923824, "eval_loss": 0.5983571410179138, "eval_mean_token_accuracy": 0.8642630500368552, "eval_num_tokens": 3250532.0, "eval_runtime": 8.088, "eval_samples_per_second": 99.654, "eval_steps_per_second": 12.488, "step": 200 }, { "entropy": 0.7351636976003647, "epoch": 0.9261300992282249, "grad_norm": 0.3023395836353302, "learning_rate": 0.00014960380348652932, "loss": 0.7276, "mean_token_accuracy": 0.8379455998539924, "num_tokens": 3412136.0, "step": 210 }, { "entropy": 0.7627917662262916, "epoch": 0.9702315325248071, "grad_norm": 0.27449166774749756, "learning_rate": 0.00014643423137876386, "loss": 0.7657, "mean_token_accuracy": 0.832044218480587, "num_tokens": 3578511.0, "step": 220 }, { "entropy": 0.7629164243355776, "epoch": 1.0132304299889747, "grad_norm": 0.3300953209400177, "learning_rate": 0.0001432646592709984, "loss": 0.7491, "mean_token_accuracy": 0.8328371659303323, "num_tokens": 3722301.0, "step": 230 }, { "entropy": 0.746186052262783, "epoch": 1.0573318632855568, "grad_norm": 0.3027808666229248, "learning_rate": 0.00014009508716323295, "loss": 0.7462, "mean_token_accuracy": 0.8322931200265884, "num_tokens": 3896679.0, "step": 240 }, { "entropy": 0.7389517679810524, "epoch": 1.101433296582139, "grad_norm": 0.2887279689311981, "learning_rate": 0.00013692551505546752, "loss": 0.7406, "mean_token_accuracy": 0.8339706152677536, "num_tokens": 4060607.0, "step": 250 }, { "entropy": 0.7400932610034943, "epoch": 1.145534729878721, "grad_norm": 0.29185837507247925, "learning_rate": 0.00013375594294770207, "loss": 0.7335, "mean_token_accuracy": 0.8348478749394417, "num_tokens": 4235108.0, "step": 260 }, { "entropy": 0.7405846387147903, "epoch": 1.1896361631753032, "grad_norm": 0.3650504946708679, "learning_rate": 0.0001305863708399366, "loss": 0.7242, "mean_token_accuracy": 0.8392871618270874, "num_tokens": 4399960.0, "step": 270 }, { "entropy": 0.7342920154333115, "epoch": 1.2337375964718853, "grad_norm": 0.32232147455215454, "learning_rate": 0.00012741679873217116, "loss": 0.7329, "mean_token_accuracy": 0.8371502041816712, "num_tokens": 4562192.0, "step": 280 }, { "entropy": 0.7346035555005074, "epoch": 1.2778390297684674, "grad_norm": 0.35367703437805176, "learning_rate": 0.0001242472266244057, "loss": 0.7251, "mean_token_accuracy": 0.8377319499850273, "num_tokens": 4721881.0, "step": 290 }, { "entropy": 0.7108615353703499, "epoch": 1.3219404630650495, "grad_norm": 0.2766059637069702, "learning_rate": 0.00012107765451664026, "loss": 0.7134, "mean_token_accuracy": 0.8399718284606934, "num_tokens": 4885379.0, "step": 300 }, { "epoch": 1.3219404630650495, "eval_entropy": 0.6444886132042007, "eval_loss": 0.5822195410728455, "eval_mean_token_accuracy": 0.866522473864036, "eval_num_tokens": 4885379.0, "eval_runtime": 8.0528, "eval_samples_per_second": 100.089, "eval_steps_per_second": 12.542, "step": 300 }, { "entropy": 0.7376143395900726, "epoch": 1.3660418963616316, "grad_norm": 0.2916136384010315, "learning_rate": 0.0001179080824088748, "loss": 0.7378, "mean_token_accuracy": 0.8362751066684723, "num_tokens": 5049776.0, "step": 310 }, { "entropy": 0.7360161304473877, "epoch": 1.4101433296582138, "grad_norm": 0.3246314525604248, "learning_rate": 0.00011473851030110936, "loss": 0.7383, "mean_token_accuracy": 0.8336721956729889, "num_tokens": 5209191.0, "step": 320 }, { "entropy": 0.7085310086607933, "epoch": 1.454244762954796, "grad_norm": 0.2847578823566437, "learning_rate": 0.00011156893819334391, "loss": 0.7041, "mean_token_accuracy": 0.8415971100330353, "num_tokens": 5375904.0, "step": 330 }, { "entropy": 0.7118788257241249, "epoch": 1.4983461962513782, "grad_norm": 0.3348420262336731, "learning_rate": 0.00010839936608557845, "loss": 0.7121, "mean_token_accuracy": 0.839908429980278, "num_tokens": 5530092.0, "step": 340 }, { "entropy": 0.7398809120059013, "epoch": 1.5424476295479603, "grad_norm": 0.31565436720848083, "learning_rate": 0.00010522979397781301, "loss": 0.738, "mean_token_accuracy": 0.8339577659964561, "num_tokens": 5680997.0, "step": 350 }, { "entropy": 0.7200135216116905, "epoch": 1.5865490628445424, "grad_norm": 0.3452684283256531, "learning_rate": 0.00010206022187004756, "loss": 0.726, "mean_token_accuracy": 0.8387370139360428, "num_tokens": 5847675.0, "step": 360 }, { "entropy": 0.7074938386678695, "epoch": 1.6306504961411246, "grad_norm": 0.307235985994339, "learning_rate": 9.889064976228209e-05, "loss": 0.689, "mean_token_accuracy": 0.8433170482516289, "num_tokens": 6014120.0, "step": 370 }, { "entropy": 0.713779816031456, "epoch": 1.6747519294377067, "grad_norm": 0.33486098051071167, "learning_rate": 9.572107765451665e-05, "loss": 0.7166, "mean_token_accuracy": 0.8390020251274108, "num_tokens": 6175759.0, "step": 380 }, { "entropy": 0.7257090628147125, "epoch": 1.718853362734289, "grad_norm": 0.29937514662742615, "learning_rate": 9.255150554675119e-05, "loss": 0.7296, "mean_token_accuracy": 0.8377068281173706, "num_tokens": 6337886.0, "step": 390 }, { "entropy": 0.7071069806814194, "epoch": 1.7629547960308711, "grad_norm": 0.29501640796661377, "learning_rate": 8.938193343898574e-05, "loss": 0.7023, "mean_token_accuracy": 0.8423839300870896, "num_tokens": 6499459.0, "step": 400 }, { "epoch": 1.7629547960308711, "eval_entropy": 0.6217773565561464, "eval_loss": 0.5744128227233887, "eval_mean_token_accuracy": 0.8680370243469088, "eval_num_tokens": 6499459.0, "eval_runtime": 8.0487, "eval_samples_per_second": 100.14, "eval_steps_per_second": 12.549, "step": 400 }, { "entropy": 0.7010652974247933, "epoch": 1.8070562293274532, "grad_norm": 0.3465122580528259, "learning_rate": 8.62123613312203e-05, "loss": 0.7056, "mean_token_accuracy": 0.8412188425660133, "num_tokens": 6661490.0, "step": 410 }, { "entropy": 0.7035703644156456, "epoch": 1.8511576626240354, "grad_norm": 0.2653435170650482, "learning_rate": 8.304278922345484e-05, "loss": 0.7047, "mean_token_accuracy": 0.8421282634139061, "num_tokens": 6806303.0, "step": 420 }, { "entropy": 0.7134277895092964, "epoch": 1.8952590959206175, "grad_norm": 0.3735100328922272, "learning_rate": 7.987321711568939e-05, "loss": 0.7222, "mean_token_accuracy": 0.8374130159616471, "num_tokens": 6977619.0, "step": 430 }, { "entropy": 0.6837946087121963, "epoch": 1.9393605292171996, "grad_norm": 0.3172140121459961, "learning_rate": 7.670364500792393e-05, "loss": 0.6872, "mean_token_accuracy": 0.8447714239358902, "num_tokens": 7134964.0, "step": 440 }, { "entropy": 0.7037998244166375, "epoch": 1.9834619625137817, "grad_norm": 0.3320825397968292, "learning_rate": 7.353407290015848e-05, "loss": 0.7065, "mean_token_accuracy": 0.8428374692797661, "num_tokens": 7302144.0, "step": 450 }, { "entropy": 0.6984619360703689, "epoch": 2.0264608599779494, "grad_norm": 0.285248339176178, "learning_rate": 7.036450079239303e-05, "loss": 0.6904, "mean_token_accuracy": 0.8467852473258972, "num_tokens": 7443873.0, "step": 460 }, { "entropy": 0.7012363314628601, "epoch": 2.0705622932745316, "grad_norm": 0.32722488045692444, "learning_rate": 6.719492868462758e-05, "loss": 0.6951, "mean_token_accuracy": 0.8441829264163971, "num_tokens": 7602458.0, "step": 470 }, { "entropy": 0.7040385738015175, "epoch": 2.1146637265711137, "grad_norm": 0.3185954689979553, "learning_rate": 6.402535657686212e-05, "loss": 0.7082, "mean_token_accuracy": 0.8414013043045998, "num_tokens": 7763939.0, "step": 480 }, { "entropy": 0.6956586122512818, "epoch": 2.158765159867696, "grad_norm": 0.34575557708740234, "learning_rate": 6.0855784469096676e-05, "loss": 0.695, "mean_token_accuracy": 0.8428655683994293, "num_tokens": 7927670.0, "step": 490 }, { "entropy": 0.702117745578289, "epoch": 2.202866593164278, "grad_norm": 0.3255711793899536, "learning_rate": 5.768621236133123e-05, "loss": 0.7111, "mean_token_accuracy": 0.8387768477201462, "num_tokens": 8090934.0, "step": 500 }, { "epoch": 2.202866593164278, "eval_entropy": 0.6130540751584685, "eval_loss": 0.5708180665969849, "eval_mean_token_accuracy": 0.8685296850629373, "eval_num_tokens": 8090934.0, "eval_runtime": 8.0458, "eval_samples_per_second": 100.176, "eval_steps_per_second": 12.553, "step": 500 }, { "entropy": 0.6894026726484299, "epoch": 2.24696802646086, "grad_norm": 0.3195938467979431, "learning_rate": 5.451664025356578e-05, "loss": 0.6884, "mean_token_accuracy": 0.8427316024899483, "num_tokens": 8260273.0, "step": 510 }, { "entropy": 0.6812330767512321, "epoch": 2.291069459757442, "grad_norm": 0.4105236232280731, "learning_rate": 5.134706814580032e-05, "loss": 0.6849, "mean_token_accuracy": 0.8467972829937935, "num_tokens": 8418421.0, "step": 520 }, { "entropy": 0.6863151758909225, "epoch": 2.3351708930540243, "grad_norm": 0.3161937892436981, "learning_rate": 4.817749603803487e-05, "loss": 0.6813, "mean_token_accuracy": 0.8450520291924477, "num_tokens": 8584244.0, "step": 530 }, { "entropy": 0.6781483091413975, "epoch": 2.3792723263506064, "grad_norm": 0.37656792998313904, "learning_rate": 4.5007923930269414e-05, "loss": 0.6754, "mean_token_accuracy": 0.8465494722127914, "num_tokens": 8748714.0, "step": 540 }, { "entropy": 0.6889190331101418, "epoch": 2.4233737596471885, "grad_norm": 0.3911200761795044, "learning_rate": 4.1838351822503966e-05, "loss": 0.7013, "mean_token_accuracy": 0.8418177396059037, "num_tokens": 8908894.0, "step": 550 }, { "entropy": 0.6950134262442589, "epoch": 2.4674751929437706, "grad_norm": 0.32076194882392883, "learning_rate": 3.866877971473851e-05, "loss": 0.7057, "mean_token_accuracy": 0.8429578125476838, "num_tokens": 9068647.0, "step": 560 }, { "entropy": 0.6590564094483853, "epoch": 2.5115766262403527, "grad_norm": 0.33828845620155334, "learning_rate": 3.549920760697306e-05, "loss": 0.6653, "mean_token_accuracy": 0.8498437628149986, "num_tokens": 9241573.0, "step": 570 }, { "entropy": 0.6963606104254723, "epoch": 2.555678059536935, "grad_norm": 0.3341946303844452, "learning_rate": 3.2329635499207614e-05, "loss": 0.7008, "mean_token_accuracy": 0.8427884921431541, "num_tokens": 9402668.0, "step": 580 }, { "entropy": 0.7054937720298767, "epoch": 2.599779492833517, "grad_norm": 0.2775990962982178, "learning_rate": 2.9160063391442156e-05, "loss": 0.7011, "mean_token_accuracy": 0.841389861702919, "num_tokens": 9566515.0, "step": 590 }, { "entropy": 0.7010829344391822, "epoch": 2.643880926130099, "grad_norm": 0.35286372900009155, "learning_rate": 2.5990491283676704e-05, "loss": 0.6916, "mean_token_accuracy": 0.8444753900170326, "num_tokens": 9737713.0, "step": 600 }, { "epoch": 2.643880926130099, "eval_entropy": 0.6011341580069891, "eval_loss": 0.5674853324890137, "eval_mean_token_accuracy": 0.8689056859158053, "eval_num_tokens": 9737713.0, "eval_runtime": 8.0369, "eval_samples_per_second": 100.287, "eval_steps_per_second": 12.567, "step": 600 }, { "entropy": 0.6718267679214478, "epoch": 2.687982359426681, "grad_norm": 0.3520824909210205, "learning_rate": 2.2820919175911253e-05, "loss": 0.671, "mean_token_accuracy": 0.8478609830141067, "num_tokens": 9897471.0, "step": 610 }, { "entropy": 0.6985628321766854, "epoch": 2.7320837927232633, "grad_norm": 0.4039785861968994, "learning_rate": 1.96513470681458e-05, "loss": 0.7002, "mean_token_accuracy": 0.8422206774353981, "num_tokens": 10057790.0, "step": 620 }, { "entropy": 0.7058150038123131, "epoch": 2.7761852260198454, "grad_norm": 0.3753729462623596, "learning_rate": 1.648177496038035e-05, "loss": 0.704, "mean_token_accuracy": 0.8416995897889137, "num_tokens": 10210708.0, "step": 630 }, { "entropy": 0.6703542187809944, "epoch": 2.8202866593164275, "grad_norm": 0.2910383343696594, "learning_rate": 1.3312202852614896e-05, "loss": 0.6668, "mean_token_accuracy": 0.8489345908164978, "num_tokens": 10375803.0, "step": 640 }, { "entropy": 0.6847924128174782, "epoch": 2.86438809261301, "grad_norm": 0.29248589277267456, "learning_rate": 1.0142630744849446e-05, "loss": 0.6864, "mean_token_accuracy": 0.8445799365639687, "num_tokens": 10542148.0, "step": 650 }, { "entropy": 0.6939975455403328, "epoch": 2.908489525909592, "grad_norm": 0.34305769205093384, "learning_rate": 6.973058637083994e-06, "loss": 0.6961, "mean_token_accuracy": 0.8430912375450135, "num_tokens": 10704238.0, "step": 660 }, { "entropy": 0.7086115226149559, "epoch": 2.9525909592061743, "grad_norm": 0.3405883312225342, "learning_rate": 3.8034865293185422e-06, "loss": 0.7003, "mean_token_accuracy": 0.8403518676757813, "num_tokens": 10860678.0, "step": 670 }, { "entropy": 0.6850679434835911, "epoch": 2.9966923925027564, "grad_norm": 0.36006155610084534, "learning_rate": 6.339144215530904e-07, "loss": 0.6811, "mean_token_accuracy": 0.8457101911306382, "num_tokens": 11023491.0, "step": 680 } ], "logging_steps": 10, "max_steps": 681, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.1688805543051264e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }