akseljoonas's picture
akseljoonas HF Staff
Model save
883065d verified
raw
history blame
6.64 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 7.0,
"eval_steps": 500,
"global_step": 112,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.32653061224489793,
"grad_norm": 3.1376605701360365,
"learning_rate": 1.3333333333333333e-05,
"loss": 1.4086,
"mean_token_accuracy": 0.6991279818117618,
"num_tokens": 3032788.0,
"step": 5
},
{
"epoch": 0.6530612244897959,
"grad_norm": 2.886401259073607,
"learning_rate": 3.0000000000000004e-05,
"loss": 0.7103,
"mean_token_accuracy": 0.8366268374025821,
"num_tokens": 6108050.0,
"step": 10
},
{
"epoch": 0.9795918367346939,
"grad_norm": 3.8085762872209985,
"learning_rate": 3.9960534568565436e-05,
"loss": 0.4951,
"mean_token_accuracy": 0.8857219524681568,
"num_tokens": 9153481.0,
"step": 15
},
{
"epoch": 1.2612244897959184,
"grad_norm": 0.7657669831409691,
"learning_rate": 3.951833523877495e-05,
"loss": 0.4221,
"mean_token_accuracy": 0.8977213052735813,
"num_tokens": 11636574.0,
"step": 20
},
{
"epoch": 1.5877551020408163,
"grad_norm": 0.2877956756473211,
"learning_rate": 3.859552971776503e-05,
"loss": 0.4363,
"mean_token_accuracy": 0.895506302267313,
"num_tokens": 14619147.0,
"step": 25
},
{
"epoch": 1.9142857142857141,
"grad_norm": 0.28656112123732563,
"learning_rate": 3.721484054007888e-05,
"loss": 0.3936,
"mean_token_accuracy": 0.9049416035413742,
"num_tokens": 17710131.0,
"step": 30
},
{
"epoch": 2.195918367346939,
"grad_norm": 0.2823103236835904,
"learning_rate": 3.541026485551579e-05,
"loss": 0.3158,
"mean_token_accuracy": 0.9200559068417203,
"num_tokens": 20112186.0,
"step": 35
},
{
"epoch": 2.522448979591837,
"grad_norm": 0.2655359257585452,
"learning_rate": 3.322623730647304e-05,
"loss": 0.3448,
"mean_token_accuracy": 0.9147586159408092,
"num_tokens": 23165264.0,
"step": 40
},
{
"epoch": 2.8489795918367347,
"grad_norm": 0.20795518587914283,
"learning_rate": 3.0716535899579936e-05,
"loss": 0.2919,
"mean_token_accuracy": 0.9269049897789955,
"num_tokens": 26259224.0,
"step": 45
},
{
"epoch": 3.130612244897959,
"grad_norm": 0.2884355032278423,
"learning_rate": 2.7942957812695613e-05,
"loss": 0.282,
"mean_token_accuracy": 0.9283535532329393,
"num_tokens": 28739676.0,
"step": 50
},
{
"epoch": 3.4571428571428573,
"grad_norm": 0.21897940738440588,
"learning_rate": 2.4973797743297103e-05,
"loss": 0.2199,
"mean_token_accuracy": 0.9427422039210797,
"num_tokens": 31756738.0,
"step": 55
},
{
"epoch": 3.783673469387755,
"grad_norm": 0.23860764801694623,
"learning_rate": 2.1882166266370292e-05,
"loss": 0.2215,
"mean_token_accuracy": 0.9424595959484577,
"num_tokens": 34883372.0,
"step": 60
},
{
"epoch": 4.0653061224489795,
"grad_norm": 0.46858487289456846,
"learning_rate": 1.8744189609413733e-05,
"loss": 0.1972,
"mean_token_accuracy": 0.9491587445355844,
"num_tokens": 37296561.0,
"step": 65
},
{
"epoch": 4.391836734693878,
"grad_norm": 0.2511271428576979,
"learning_rate": 1.5637135172069155e-05,
"loss": 0.1837,
"mean_token_accuracy": 0.9537528082728386,
"num_tokens": 40373787.0,
"step": 70
},
{
"epoch": 4.718367346938775,
"grad_norm": 0.2967198621367196,
"learning_rate": 1.2637508946306443e-05,
"loss": 0.165,
"mean_token_accuracy": 0.9580980874598026,
"num_tokens": 43363161.0,
"step": 75
},
{
"epoch": 5.0,
"grad_norm": 0.23363519084392373,
"learning_rate": 9.819171684992575e-06,
"loss": 0.1312,
"mean_token_accuracy": 0.9628054084985153,
"num_tokens": 45860256.0,
"step": 80
},
{
"epoch": 5.326530612244898,
"grad_norm": 0.23534750602445398,
"learning_rate": 7.251520205026206e-06,
"loss": 0.1269,
"mean_token_accuracy": 0.967383312433958,
"num_tokens": 48879333.0,
"step": 85
},
{
"epoch": 5.653061224489796,
"grad_norm": 0.2799375041862015,
"learning_rate": 4.997778607390809e-06,
"loss": 0.1221,
"mean_token_accuracy": 0.9683701202273369,
"num_tokens": 51946379.0,
"step": 90
},
{
"epoch": 5.979591836734694,
"grad_norm": 0.1875365707782748,
"learning_rate": 3.1134414899597033e-06,
"loss": 0.1279,
"mean_token_accuracy": 0.9681350864470005,
"num_tokens": 54995950.0,
"step": 95
},
{
"epoch": 6.261224489795918,
"grad_norm": 0.2033347485797425,
"learning_rate": 1.6449074863203773e-06,
"loss": 0.1222,
"mean_token_accuracy": 0.972901335660962,
"num_tokens": 57369628.0,
"step": 100
},
{
"epoch": 6.587755102040816,
"grad_norm": 0.1697720175472823,
"learning_rate": 6.283367774273785e-07,
"loss": 0.105,
"mean_token_accuracy": 0.9732269406318664,
"num_tokens": 60481208.0,
"step": 105
},
{
"epoch": 6.914285714285715,
"grad_norm": 0.18026706851340263,
"learning_rate": 8.876070793840008e-08,
"loss": 0.0986,
"mean_token_accuracy": 0.9752129040658474,
"num_tokens": 63509425.0,
"step": 110
},
{
"epoch": 7.0,
"mean_token_accuracy": 0.9745120831898281,
"num_tokens": 64131339.0,
"step": 112,
"total_flos": 130497026588672.0,
"train_loss": 0.3111057321407965,
"train_runtime": 2546.477,
"train_samples_per_second": 5.369,
"train_steps_per_second": 0.044
}
],
"logging_steps": 5,
"max_steps": 112,
"num_input_tokens_seen": 0,
"num_train_epochs": 7,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 130497026588672.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}