Update gene classification example to create directory after training arguments are defined

Files changed (1) hide show

examples/gene_classification.ipynb +22 -21

examples/gene_classification.ipynb CHANGED Viewed

@@ -36,6 +36,7 @@
     "from sklearn import preprocessing\n",
     "from sklearn.metrics import accuracy_score, auc, confusion_matrix, ConfusionMatrixDisplay, roc_curve\n",
     "from sklearn.model_selection import StratifiedKFold\n",
     "from transformers import BertForTokenClassification\n",
     "from transformers import Trainer\n",
     "from transformers.training_args import TrainingArguments\n",
@@ -424,26 +425,6 @@
     "## Fine-Tune With Gene Classification Learning Objective and Quantify Predictive Performance"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# define output directory path\n",
-    "current_date = datetime.datetime.now()\n",
-    "datestamp = f\"{str(current_date.year)[-2:]}{current_date.month:02d}{current_date.day:02d}\"\n",
-    "training_output_dir = f\"/path/to/models/{datestamp}_geneformer_GeneClassifier_dosageTF_L{max_sequence_length}_B{geneformer_batch_size}_LR{max_lr}_LS{lr_schedule_fn}_WU{warmup_steps}_E{epochs}_O{optimizer}_n{subsample_size}_F{freeze_layers}/\"\n",
-    "\n",
-    "# ensure not overwriting previously saved model\n",
-    "ksplit_model_test = os.path.join(training_output_dir, \"ksplit0/models/pytorch_model.bin\")\n",
-    "if os.path.isfile(ksplit_model_test) == True:\n",
-    "    raise Exception(\"Model already saved to this directory.\")\n",
-    "\n",
-    "# make output directory\n",
-    "subprocess.call(f'mkdir {training_output_dir}', shell=True)"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -489,6 +470,7 @@
     "    \"learning_rate\": max_lr,\n",
     "    \"do_train\": True,\n",
     "    \"evaluation_strategy\": \"no\",\n",
     "    \"logging_steps\": 100,\n",
     "    \"group_by_length\": True,\n",
     "    \"length_column_name\": \"length\",\n",
@@ -499,10 +481,29 @@
     "    \"per_device_train_batch_size\": geneformer_batch_size,\n",
     "    \"per_device_eval_batch_size\": geneformer_batch_size,\n",
     "    \"num_train_epochs\": epochs,\n",
-    "    \"load_best_model_at_end\": True,\n",
     "}"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 23,

     "from sklearn import preprocessing\n",
     "from sklearn.metrics import accuracy_score, auc, confusion_matrix, ConfusionMatrixDisplay, roc_curve\n",
     "from sklearn.model_selection import StratifiedKFold\n",
+    "import torch\n",
     "from transformers import BertForTokenClassification\n",
     "from transformers import Trainer\n",
     "from transformers.training_args import TrainingArguments\n",
     "## Fine-Tune With Gene Classification Learning Objective and Quantify Predictive Performance"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
     "    \"learning_rate\": max_lr,\n",
     "    \"do_train\": True,\n",
     "    \"evaluation_strategy\": \"no\",\n",
+    "    \"save_strategy\": \"epoch\",\n",
     "    \"logging_steps\": 100,\n",
     "    \"group_by_length\": True,\n",
     "    \"length_column_name\": \"length\",\n",
     "    \"per_device_train_batch_size\": geneformer_batch_size,\n",
     "    \"per_device_eval_batch_size\": geneformer_batch_size,\n",
     "    \"num_train_epochs\": epochs,\n",
     "}"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Build the output directory path: a date prefix (YYMMDD) followed by the\n",
+    "# hyperparameters of this run, so each configuration gets its own folder.\n",
+    "# NOTE: the /path/to/models/ prefix is a placeholder; edit it before running.\n",
+    "current_date = datetime.datetime.now()\n",
+    "datestamp = current_date.strftime(\"%y%m%d\")\n",
+    "training_output_dir = f\"/path/to/models/{datestamp}_geneformer_GeneClassifier_dosageTF_L{max_input_size}_B{geneformer_batch_size}_LR{max_lr}_LS{lr_schedule_fn}_WU{warmup_steps}_E{epochs}_O{optimizer}_n{subsample_size}_F{freeze_layers}/\"\n",
+    "\n",
+    "# Refuse to continue if a model file already exists under ksplit0 (presumably the\n",
+    "# first cross-validation split; confirm), so a previous run is not overwritten.\n",
+    "ksplit_model_test = os.path.join(training_output_dir, \"ksplit0/models/pytorch_model.bin\")\n",
+    "if os.path.isfile(ksplit_model_test):\n",
+    "    raise FileExistsError(\"Model already saved to this directory.\")\n",
+    "\n",
+    "# Create the output directory and any missing parents without invoking a shell.\n",
+    "# Re-running the cell is safe (exist_ok=True); real failures raise OSError.\n",
+    "os.makedirs(training_output_dir, exist_ok=True)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 23,