router-mmBERT-small-text-only-v3 / trainer_state.json
AmirMohseni's picture
Model save
b2c1b4d verified
{
"best_global_step": 160,
"best_metric": 0.7282191492717808,
"best_model_checkpoint": "runs/router-mmBERT-small-text-only-v3/checkpoint-160",
"epoch": 2.0,
"eval_steps": 20,
"global_step": 176,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0,
"eval_accuracy": 0.4431818181818182,
"eval_f1": 0.45373205741626793,
"eval_loss": 1.5113513469696045,
"eval_precision": 0.48037190082644626,
"eval_recall": 0.4431818181818182,
"eval_runtime": 4.7496,
"eval_samples_per_second": 37.056,
"eval_steps_per_second": 1.263,
"step": 0
},
{
"epoch": 0.056818181818181816,
"grad_norm": 12.686948776245117,
"learning_rate": 9.987260573051269e-05,
"loss": 3.2334,
"step": 5
},
{
"epoch": 0.11363636363636363,
"grad_norm": 104.65008544921875,
"learning_rate": 9.935617890443557e-05,
"loss": 2.5264,
"step": 10
},
{
"epoch": 0.17045454545454544,
"grad_norm": 77.57421112060547,
"learning_rate": 9.844686508907537e-05,
"loss": 0.9934,
"step": 15
},
{
"epoch": 0.22727272727272727,
"grad_norm": 25.431455612182617,
"learning_rate": 9.715190263989561e-05,
"loss": 0.7931,
"step": 20
},
{
"epoch": 0.22727272727272727,
"eval_accuracy": 0.6647727272727273,
"eval_f1": 0.5783962367355869,
"eval_loss": 0.9312057495117188,
"eval_precision": 0.6848484848484849,
"eval_recall": 0.6647727272727273,
"eval_runtime": 0.302,
"eval_samples_per_second": 582.771,
"eval_steps_per_second": 19.867,
"step": 20
},
{
"epoch": 0.2840909090909091,
"grad_norm": 59.058265686035156,
"learning_rate": 9.548159976772592e-05,
"loss": 0.9107,
"step": 25
},
{
"epoch": 0.3409090909090909,
"grad_norm": 47.96800231933594,
"learning_rate": 9.344925248293837e-05,
"loss": 0.708,
"step": 30
},
{
"epoch": 0.3977272727272727,
"grad_norm": 23.16752815246582,
"learning_rate": 9.107103875602459e-05,
"loss": 0.5048,
"step": 35
},
{
"epoch": 0.45454545454545453,
"grad_norm": 66.16763305664062,
"learning_rate": 8.836588973708129e-05,
"loss": 0.7752,
"step": 40
},
{
"epoch": 0.45454545454545453,
"eval_accuracy": 0.48863636363636365,
"eval_f1": 0.43204905618558603,
"eval_loss": 0.9223223328590393,
"eval_precision": 0.7377156177156178,
"eval_recall": 0.48863636363636365,
"eval_runtime": 0.321,
"eval_samples_per_second": 548.336,
"eval_steps_per_second": 18.693,
"step": 40
},
{
"epoch": 0.5113636363636364,
"grad_norm": 9.133078575134277,
"learning_rate": 8.535533905932738e-05,
"loss": 0.6457,
"step": 45
},
{
"epoch": 0.5681818181818182,
"grad_norm": 6.507447719573975,
"learning_rate": 8.206335142623305e-05,
"loss": 0.8555,
"step": 50
},
{
"epoch": 0.625,
"grad_norm": 7.759437561035156,
"learning_rate": 7.85161318467482e-05,
"loss": 0.5602,
"step": 55
},
{
"epoch": 0.6818181818181818,
"grad_norm": 3.4017348289489746,
"learning_rate": 7.474191703716339e-05,
"loss": 0.5251,
"step": 60
},
{
"epoch": 0.6818181818181818,
"eval_accuracy": 0.6988636363636364,
"eval_f1": 0.6658946623722376,
"eval_loss": 0.5345972180366516,
"eval_precision": 0.6939832136717565,
"eval_recall": 0.6988636363636364,
"eval_runtime": 0.4317,
"eval_samples_per_second": 407.665,
"eval_steps_per_second": 13.898,
"step": 60
},
{
"epoch": 0.7386363636363636,
"grad_norm": 38.210453033447266,
"learning_rate": 7.077075065009433e-05,
"loss": 0.7065,
"step": 65
},
{
"epoch": 0.7954545454545454,
"grad_norm": 10.191884994506836,
"learning_rate": 6.663424411982121e-05,
"loss": 0.6372,
"step": 70
},
{
"epoch": 0.8522727272727273,
"grad_norm": 20.093486785888672,
"learning_rate": 6.236532502771078e-05,
"loss": 0.5907,
"step": 75
},
{
"epoch": 0.9090909090909091,
"grad_norm": 8.867379188537598,
"learning_rate": 5.799797499079301e-05,
"loss": 0.5975,
"step": 80
},
{
"epoch": 0.9090909090909091,
"eval_accuracy": 0.6534090909090909,
"eval_f1": 0.6573467315982285,
"eval_loss": 0.5975217223167419,
"eval_precision": 0.7275870824215018,
"eval_recall": 0.6534090909090909,
"eval_runtime": 0.3987,
"eval_samples_per_second": 441.426,
"eval_steps_per_second": 15.049,
"step": 80
},
{
"epoch": 0.9659090909090909,
"grad_norm": 10.983565330505371,
"learning_rate": 5.3566959159961615e-05,
"loss": 0.6915,
"step": 85
},
{
"epoch": 1.0227272727272727,
"grad_norm": 5.20986795425415,
"learning_rate": 4.9107549481057696e-05,
"loss": 0.5815,
"step": 90
},
{
"epoch": 1.0795454545454546,
"grad_norm": 4.2207183837890625,
"learning_rate": 4.4655243921744374e-05,
"loss": 0.4689,
"step": 95
},
{
"epoch": 1.1363636363636362,
"grad_norm": 17.133651733398438,
"learning_rate": 4.0245483899193595e-05,
"loss": 0.551,
"step": 100
},
{
"epoch": 1.1363636363636362,
"eval_accuracy": 0.7272727272727273,
"eval_f1": 0.7090006767426121,
"eval_loss": 0.5679962635040283,
"eval_precision": 0.7224598930481284,
"eval_recall": 0.7272727272727273,
"eval_runtime": 0.3507,
"eval_samples_per_second": 501.913,
"eval_steps_per_second": 17.111,
"step": 100
},
{
"epoch": 1.1931818181818181,
"grad_norm": 6.083935737609863,
"learning_rate": 3.591337215792852e-05,
"loss": 0.5319,
"step": 105
},
{
"epoch": 1.25,
"grad_norm": 37.91518020629883,
"learning_rate": 3.1693393343581044e-05,
"loss": 0.571,
"step": 110
},
{
"epoch": 1.3068181818181819,
"grad_norm": 10.03105640411377,
"learning_rate": 2.7619139496864378e-05,
"loss": 0.5283,
"step": 115
},
{
"epoch": 1.3636363636363638,
"grad_norm": 3.3100969791412354,
"learning_rate": 2.3723042652894362e-05,
"loss": 0.5093,
"step": 120
},
{
"epoch": 1.3636363636363638,
"eval_accuracy": 0.7045454545454546,
"eval_f1": 0.6950933483441223,
"eval_loss": 0.5871580243110657,
"eval_precision": 0.6951515151515152,
"eval_recall": 0.7045454545454546,
"eval_runtime": 0.3979,
"eval_samples_per_second": 442.333,
"eval_steps_per_second": 15.08,
"step": 120
},
{
"epoch": 1.4204545454545454,
"grad_norm": 16.1977596282959,
"learning_rate": 2.0036116674432654e-05,
"loss": 0.5697,
"step": 125
},
{
"epoch": 1.4772727272727273,
"grad_norm": 4.914312362670898,
"learning_rate": 1.6587710374121203e-05,
"loss": 0.4587,
"step": 130
},
{
"epoch": 1.5340909090909092,
"grad_norm": 6.9654645919799805,
"learning_rate": 1.340527389091374e-05,
"loss": 0.7103,
"step": 135
},
{
"epoch": 1.5909090909090908,
"grad_norm": 8.979105949401855,
"learning_rate": 1.0514140180404204e-05,
"loss": 0.5315,
"step": 140
},
{
"epoch": 1.5909090909090908,
"eval_accuracy": 0.75,
"eval_f1": 0.72649847340511,
"eval_loss": 0.5397942066192627,
"eval_precision": 0.7593206296603149,
"eval_recall": 0.75,
"eval_runtime": 0.3718,
"eval_samples_per_second": 473.373,
"eval_steps_per_second": 16.138,
"step": 140
},
{
"epoch": 1.6477272727272727,
"grad_norm": 31.318988800048828,
"learning_rate": 7.937323358440935e-06,
"loss": 0.5135,
"step": 145
},
{
"epoch": 1.7045454545454546,
"grad_norm": 2.2285165786743164,
"learning_rate": 5.69533550325988e-06,
"loss": 0.481,
"step": 150
},
{
"epoch": 1.7613636363636362,
"grad_norm": 16.231529235839844,
"learning_rate": 3.8060233744356633e-06,
"loss": 0.5126,
"step": 155
},
{
"epoch": 1.8181818181818183,
"grad_norm": 31.868896484375,
"learning_rate": 2.2844263484068096e-06,
"loss": 0.5231,
"step": 160
},
{
"epoch": 1.8181818181818183,
"eval_accuracy": 0.7443181818181818,
"eval_f1": 0.7282191492717808,
"eval_loss": 0.5264365673065186,
"eval_precision": 0.7421696641208836,
"eval_recall": 0.7443181818181818,
"eval_runtime": 0.3604,
"eval_samples_per_second": 488.355,
"eval_steps_per_second": 16.648,
"step": 160
},
{
"epoch": 1.875,
"grad_norm": 26.505346298217773,
"learning_rate": 1.1426567014420297e-06,
"loss": 0.5943,
"step": 165
},
{
"epoch": 1.9318181818181817,
"grad_norm": 23.811979293823242,
"learning_rate": 3.8980319302407977e-07,
"loss": 0.5122,
"step": 170
},
{
"epoch": 1.9886363636363638,
"grad_norm": 7.987723350524902,
"learning_rate": 3.185871715041255e-08,
"loss": 0.5237,
"step": 175
},
{
"epoch": 2.0,
"step": 176,
"total_flos": 315928015988940.0,
"train_loss": 0.7388186820528724,
"train_runtime": 87.4213,
"train_samples_per_second": 32.189,
"train_steps_per_second": 2.013
}
],
"logging_steps": 5,
"max_steps": 176,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 315928015988940.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}