{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 20958, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.047714476572192, "grad_norm": 0.023498278111219406, "learning_rate": 0.00019523809523809525, "loss": 0.4674, "mean_token_accuracy": 0.8906204112768173, "num_tokens": 360041.0, "step": 500 }, { "epoch": 0.095428953144384, "grad_norm": 3.823547601699829, "learning_rate": 0.00019046664758087605, "loss": 0.3194, "mean_token_accuracy": 0.9205255397558212, "num_tokens": 727772.0, "step": 1000 }, { "epoch": 0.143143429716576, "grad_norm": 0.0001839943724917248, "learning_rate": 0.00018569519992365686, "loss": 0.2674, "mean_token_accuracy": 0.9327976078987121, "num_tokens": 1097406.0, "step": 1500 }, { "epoch": 0.190857906288768, "grad_norm": 4.199777126312256, "learning_rate": 0.00018092375226643766, "loss": 0.326, "mean_token_accuracy": 0.920593403160572, "num_tokens": 1462220.0, "step": 2000 }, { "epoch": 0.23857238286096002, "grad_norm": 5.751025676727295, "learning_rate": 0.00017615230460921847, "loss": 0.2693, "mean_token_accuracy": 0.936355301618576, "num_tokens": 1828224.0, "step": 2500 }, { "epoch": 0.286286859433152, "grad_norm": 3.971045732498169, "learning_rate": 0.00017138085695199925, "loss": 0.2827, "mean_token_accuracy": 0.9261101527214051, "num_tokens": 2211342.0, "step": 3000 }, { "epoch": 0.33400133600534404, "grad_norm": 2.15079665184021, "learning_rate": 0.00016660940929478005, "loss": 0.2461, "mean_token_accuracy": 0.9350018633604049, "num_tokens": 2572498.0, "step": 3500 }, { "epoch": 0.381715812577536, "grad_norm": 0.5263189077377319, "learning_rate": 0.00016183796163756083, "loss": 0.2534, "mean_token_accuracy": 0.9374513441324234, "num_tokens": 2949312.0, "step": 4000 }, { "epoch": 0.42943028914972803, "grad_norm": 0.004941379185765982, "learning_rate": 0.00015706651398034164, "loss": 0.2395, "mean_token_accuracy": 0.9378743978738785, "num_tokens": 3314910.0, "step": 4500 }, { "epoch": 0.47714476572192005, "grad_norm": 0.007795912679284811, "learning_rate": 0.00015229506632312244, "loss": 0.2284, "mean_token_accuracy": 0.9409392136335373, "num_tokens": 3686624.0, "step": 5000 }, { "epoch": 0.5248592422941121, "grad_norm": 0.0033825524151325226, "learning_rate": 0.00014752361866590325, "loss": 0.2145, "mean_token_accuracy": 0.9464458491802216, "num_tokens": 4063459.0, "step": 5500 }, { "epoch": 0.572573718866304, "grad_norm": 3.8457958698272705, "learning_rate": 0.00014275217100868402, "loss": 0.2268, "mean_token_accuracy": 0.9410561621189117, "num_tokens": 4423993.0, "step": 6000 }, { "epoch": 0.620288195438496, "grad_norm": 0.020238121971488, "learning_rate": 0.00013798072335146483, "loss": 0.223, "mean_token_accuracy": 0.9393890690803528, "num_tokens": 4788203.0, "step": 6500 }, { "epoch": 0.6680026720106881, "grad_norm": 0.0006973391864448786, "learning_rate": 0.00013320927569424564, "loss": 0.2434, "mean_token_accuracy": 0.9404154337644577, "num_tokens": 5162496.0, "step": 7000 }, { "epoch": 0.71571714858288, "grad_norm": 0.0010198453674092889, "learning_rate": 0.00012843782803702644, "loss": 0.191, "mean_token_accuracy": 0.9535960764884949, "num_tokens": 5520427.0, "step": 7500 }, { "epoch": 0.763431625155072, "grad_norm": 0.001297333394177258, "learning_rate": 0.00012366638037980725, "loss": 0.2167, "mean_token_accuracy": 0.942195966720581, "num_tokens": 5901763.0, "step": 8000 }, { "epoch": 0.8111461017272641, "grad_norm": 6.195448398590088, "learning_rate": 0.00011889493272258805, "loss": 0.2305, "mean_token_accuracy": 0.9376264967918396, "num_tokens": 6272492.0, "step": 8500 }, { "epoch": 0.8588605782994561, "grad_norm": 0.0025545568205416203, "learning_rate": 0.00011412348506536883, "loss": 0.2303, "mean_token_accuracy": 0.9435879285335541, "num_tokens": 6657487.0, "step": 9000 }, { "epoch": 0.906575054871648, "grad_norm": 0.0006595577578991652, "learning_rate": 0.00010935203740814964, "loss": 0.179, "mean_token_accuracy": 0.9520990616083145, "num_tokens": 7022908.0, "step": 9500 }, { "epoch": 0.9542895314438401, "grad_norm": 3.1752443313598633, "learning_rate": 0.00010458058975093044, "loss": 0.1827, "mean_token_accuracy": 0.9488343714475632, "num_tokens": 7376243.0, "step": 10000 }, { "epoch": 1.002004008016032, "grad_norm": 0.001512572169303894, "learning_rate": 9.980914209371123e-05, "loss": 0.2101, "mean_token_accuracy": 0.9424606282711029, "num_tokens": 7749938.0, "step": 10500 }, { "epoch": 1.0497184845882241, "grad_norm": 3.992393732070923, "learning_rate": 9.503769443649203e-05, "loss": 0.1309, "mean_token_accuracy": 0.9618149808645249, "num_tokens": 8127846.0, "step": 11000 }, { "epoch": 1.097432961160416, "grad_norm": 0.00025509227998554707, "learning_rate": 9.026624677927283e-05, "loss": 0.1219, "mean_token_accuracy": 0.9635183781385421, "num_tokens": 8503956.0, "step": 11500 }, { "epoch": 1.145147437732608, "grad_norm": 0.0013997952919453382, "learning_rate": 8.549479912205364e-05, "loss": 0.1235, "mean_token_accuracy": 0.963010191321373, "num_tokens": 8862553.0, "step": 12000 }, { "epoch": 1.1928619143048, "grad_norm": 2.114091157913208, "learning_rate": 8.072335146483443e-05, "loss": 0.1271, "mean_token_accuracy": 0.9629592669010162, "num_tokens": 9233864.0, "step": 12500 }, { "epoch": 1.240576390876992, "grad_norm": 0.277444452047348, "learning_rate": 7.595190380761523e-05, "loss": 0.1187, "mean_token_accuracy": 0.9649668201208115, "num_tokens": 9596971.0, "step": 13000 }, { "epoch": 1.288290867449184, "grad_norm": 4.878781318664551, "learning_rate": 7.118045615039604e-05, "loss": 0.1226, "mean_token_accuracy": 0.9633177869319915, "num_tokens": 9966614.0, "step": 13500 }, { "epoch": 1.3360053440213762, "grad_norm": 5.269028186798096, "learning_rate": 6.640900849317683e-05, "loss": 0.1336, "mean_token_accuracy": 0.9619411797523498, "num_tokens": 10338731.0, "step": 14000 }, { "epoch": 1.3837198205935681, "grad_norm": 0.000423251127358526, "learning_rate": 6.163756083595764e-05, "loss": 0.1386, "mean_token_accuracy": 0.9611272529363633, "num_tokens": 10724381.0, "step": 14500 }, { "epoch": 1.43143429716576, "grad_norm": 0.754705548286438, "learning_rate": 5.6866113178738436e-05, "loss": 0.1182, "mean_token_accuracy": 0.9661157331466674, "num_tokens": 11094900.0, "step": 15000 }, { "epoch": 1.479148773737952, "grad_norm": 0.0017857268685474992, "learning_rate": 5.2094665521519235e-05, "loss": 0.1175, "mean_token_accuracy": 0.9679548003673554, "num_tokens": 11454717.0, "step": 15500 }, { "epoch": 1.5268632503101442, "grad_norm": 3.5908143520355225, "learning_rate": 4.732321786430003e-05, "loss": 0.11, "mean_token_accuracy": 0.9671754879951477, "num_tokens": 11832591.0, "step": 16000 }, { "epoch": 1.5745777268823362, "grad_norm": 0.010208655148744583, "learning_rate": 4.255177020708083e-05, "loss": 0.115, "mean_token_accuracy": 0.9662747744321823, "num_tokens": 12223203.0, "step": 16500 }, { "epoch": 1.6222922034545282, "grad_norm": 0.002450750907883048, "learning_rate": 3.778032254986163e-05, "loss": 0.1112, "mean_token_accuracy": 0.9668832242488861, "num_tokens": 12593144.0, "step": 17000 }, { "epoch": 1.6700066800267201, "grad_norm": 0.004513042513281107, "learning_rate": 3.300887489264243e-05, "loss": 0.1004, "mean_token_accuracy": 0.9708232057094573, "num_tokens": 12954505.0, "step": 17500 }, { "epoch": 1.7177211565989121, "grad_norm": 0.007037173956632614, "learning_rate": 2.8237427235423232e-05, "loss": 0.1114, "mean_token_accuracy": 0.9671108702421188, "num_tokens": 13307858.0, "step": 18000 }, { "epoch": 1.765435633171104, "grad_norm": 0.015288141556084156, "learning_rate": 2.3465979578204027e-05, "loss": 0.1034, "mean_token_accuracy": 0.9698293421268463, "num_tokens": 13673816.0, "step": 18500 }, { "epoch": 1.813150109743296, "grad_norm": 6.889008045196533, "learning_rate": 1.869453192098483e-05, "loss": 0.0977, "mean_token_accuracy": 0.9710470995903016, "num_tokens": 14038630.0, "step": 19000 }, { "epoch": 1.860864586315488, "grad_norm": 2.6273930072784424, "learning_rate": 1.3923084263765626e-05, "loss": 0.0936, "mean_token_accuracy": 0.9728796405792236, "num_tokens": 14401139.0, "step": 19500 }, { "epoch": 1.90857906288768, "grad_norm": 0.004859536420553923, "learning_rate": 9.151636606546427e-06, "loss": 0.0929, "mean_token_accuracy": 0.9708857105970383, "num_tokens": 14770982.0, "step": 20000 }, { "epoch": 1.9562935394598722, "grad_norm": 0.4923778176307678, "learning_rate": 4.380188949327226e-06, "loss": 0.0963, "mean_token_accuracy": 0.9713275592327117, "num_tokens": 15136348.0, "step": 20500 } ], "logging_steps": 500, "max_steps": 20958, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.399363404153889e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }