Christina Theodoris
commited on
Commit
·
ebe5ee8
1
Parent(s):
402ba9b
Update gene classification example to create directory after training arguments are defined
Browse files
examples/gene_classification.ipynb
CHANGED
|
@@ -36,6 +36,7 @@
|
|
| 36 |
"from sklearn import preprocessing\n",
|
| 37 |
"from sklearn.metrics import accuracy_score, auc, confusion_matrix, ConfusionMatrixDisplay, roc_curve\n",
|
| 38 |
"from sklearn.model_selection import StratifiedKFold\n",
|
|
|
|
| 39 |
"from transformers import BertForTokenClassification\n",
|
| 40 |
"from transformers import Trainer\n",
|
| 41 |
"from transformers.training_args import TrainingArguments\n",
|
|
@@ -424,26 +425,6 @@
|
|
| 424 |
"## Fine-Tune With Gene Classification Learning Objective and Quantify Predictive Performance"
|
| 425 |
]
|
| 426 |
},
|
| 427 |
-
{
|
| 428 |
-
"cell_type": "code",
|
| 429 |
-
"execution_count": null,
|
| 430 |
-
"metadata": {},
|
| 431 |
-
"outputs": [],
|
| 432 |
-
"source": [
|
| 433 |
-
"# define output directory path\n",
|
| 434 |
-
"current_date = datetime.datetime.now()\n",
|
| 435 |
-
"datestamp = f\"{str(current_date.year)[-2:]}{current_date.month:02d}{current_date.day:02d}\"\n",
|
| 436 |
-
"training_output_dir = f\"/path/to/models/{datestamp}_geneformer_GeneClassifier_dosageTF_L{max_sequence_length}_B{geneformer_batch_size}_LR{max_lr}_LS{lr_schedule_fn}_WU{warmup_steps}_E{epochs}_O{optimizer}_n{subsample_size}_F{freeze_layers}/\"\n",
|
| 437 |
-
"\n",
|
| 438 |
-
"# ensure not overwriting previously saved model\n",
|
| 439 |
-
"ksplit_model_test = os.path.join(training_output_dir, \"ksplit0/models/pytorch_model.bin\")\n",
|
| 440 |
-
"if os.path.isfile(ksplit_model_test) == True:\n",
|
| 441 |
-
" raise Exception(\"Model already saved to this directory.\")\n",
|
| 442 |
-
"\n",
|
| 443 |
-
"# make output directory\n",
|
| 444 |
-
"subprocess.call(f'mkdir {training_output_dir}', shell=True)"
|
| 445 |
-
]
|
| 446 |
-
},
|
| 447 |
{
|
| 448 |
"cell_type": "code",
|
| 449 |
"execution_count": null,
|
|
@@ -489,6 +470,7 @@
|
|
| 489 |
" \"learning_rate\": max_lr,\n",
|
| 490 |
" \"do_train\": True,\n",
|
| 491 |
" \"evaluation_strategy\": \"no\",\n",
|
|
|
|
| 492 |
" \"logging_steps\": 100,\n",
|
| 493 |
" \"group_by_length\": True,\n",
|
| 494 |
" \"length_column_name\": \"length\",\n",
|
|
@@ -499,10 +481,29 @@
|
|
| 499 |
" \"per_device_train_batch_size\": geneformer_batch_size,\n",
|
| 500 |
" \"per_device_eval_batch_size\": geneformer_batch_size,\n",
|
| 501 |
" \"num_train_epochs\": epochs,\n",
|
| 502 |
-
" \"load_best_model_at_end\": True,\n",
|
| 503 |
"}"
|
| 504 |
]
|
| 505 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 506 |
{
|
| 507 |
"cell_type": "code",
|
| 508 |
"execution_count": 23,
|
|
|
|
| 36 |
"from sklearn import preprocessing\n",
|
| 37 |
"from sklearn.metrics import accuracy_score, auc, confusion_matrix, ConfusionMatrixDisplay, roc_curve\n",
|
| 38 |
"from sklearn.model_selection import StratifiedKFold\n",
|
| 39 |
+
"import torch\n",
|
| 40 |
"from transformers import BertForTokenClassification\n",
|
| 41 |
"from transformers import Trainer\n",
|
| 42 |
"from transformers.training_args import TrainingArguments\n",
|
|
|
|
| 425 |
"## Fine-Tune With Gene Classification Learning Objective and Quantify Predictive Performance"
|
| 426 |
]
|
| 427 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 428 |
{
|
| 429 |
"cell_type": "code",
|
| 430 |
"execution_count": null,
|
|
|
|
| 470 |
" \"learning_rate\": max_lr,\n",
|
| 471 |
" \"do_train\": True,\n",
|
| 472 |
" \"evaluation_strategy\": \"no\",\n",
|
| 473 |
+
" \"save_strategy\": \"epoch\",\n",
|
| 474 |
" \"logging_steps\": 100,\n",
|
| 475 |
" \"group_by_length\": True,\n",
|
| 476 |
" \"length_column_name\": \"length\",\n",
|
|
|
|
| 481 |
" \"per_device_train_batch_size\": geneformer_batch_size,\n",
|
| 482 |
" \"per_device_eval_batch_size\": geneformer_batch_size,\n",
|
| 483 |
" \"num_train_epochs\": epochs,\n",
|
|
|
|
| 484 |
"}"
|
| 485 |
]
|
| 486 |
},
|
| 487 |
+
{
|
| 488 |
+
"cell_type": "code",
|
| 489 |
+
"execution_count": null,
|
| 490 |
+
"metadata": {},
|
| 491 |
+
"outputs": [],
|
| 492 |
+
"source": [
|
| 493 |
+
"# define output directory path\n",
|
| 494 |
+
"current_date = datetime.datetime.now()\n",
|
| 495 |
+
"datestamp = f\"{str(current_date.year)[-2:]}{current_date.month:02d}{current_date.day:02d}\"\n",
|
| 496 |
+
"training_output_dir = f\"/path/to/models/{datestamp}_geneformer_GeneClassifier_dosageTF_L{max_input_size}_B{geneformer_batch_size}_LR{max_lr}_LS{lr_schedule_fn}_WU{warmup_steps}_E{epochs}_O{optimizer}_n{subsample_size}_F{freeze_layers}/\"\n",
|
| 497 |
+
"\n",
|
| 498 |
+
"# ensure not overwriting previously saved model\n",
|
| 499 |
+
"ksplit_model_test = os.path.join(training_output_dir, \"ksplit0/models/pytorch_model.bin\")\n",
|
| 500 |
+
"if os.path.isfile(ksplit_model_test) == True:\n",
|
| 501 |
+
" raise Exception(\"Model already saved to this directory.\")\n",
|
| 502 |
+
"\n",
|
| 503 |
+
"# make output directory\n",
|
| 504 |
+
"subprocess.call(f'mkdir {training_output_dir}', shell=True)"
|
| 505 |
+
]
|
| 506 |
+
},
|
| 507 |
{
|
| 508 |
"cell_type": "code",
|
| 509 |
"execution_count": 23,
|