Upload 5 files
Browse files
- notebooks/Ensemble.ipynb +398 -0
- notebooks/Input.ipynb +152 -0
- notebooks/mlp_esm2.ipynb +565 -0
- notebooks/mlp_protbert.ipynb +825 -0
- notebooks/mlp_protbertbfd.ipynb +802 -0
notebooks/Ensemble.ipynb
ADDED
@@ -0,0 +1,398 @@
In [6]:
# %%
import numpy as np, joblib, math
from sklearn.metrics import precision_recall_curve, auc
from goatools.obo_parser import GODag

GO_FILE = "go.obo"
dag = GODag(GO_FILE)

# y_true + GO terms (ProtBERT reference)
test_pb = joblib.load("embeddings/test_protbert.pkl")
y_true = test_pb["labels"]             # (1724, 597) ground truth
go_ref = list(test_pb["go_terms"])     # exact column order

n_go = len(go_ref)                     # 597

# Load predictions
y_pb   = np.load("predictions/mf-protbert-pam1.npy")     # 1724 x 597
y_bfd  = np.load("predictions/mf-protbertbfd-pam1.npy")  # 1724 x 597
y_esm0 = np.load("predictions/mf-esm2.npy")              # 1724 x 602

# Remap the ESM-2 columns to the ProtBERT order
mlb_esm = joblib.load("data/mlb.pkl")                    # 602 GO terms
idx_map = [list(mlb_esm.classes_).index(t) for t in go_ref]
y_esm = y_esm0[:, idx_map]                               # 1724 x 597

# Make sure all shapes match
assert (y_true.shape == y_pb.shape == y_bfd.shape
        == y_esm.shape == (1724, n_go)), "Still misaligned!"

# Metrics
THR = np.linspace(0, 1, 101)

def fmax(y_t, y_p):
    best, thr = 0, 0
    for t in THR:
        y_b = (y_p >= t).astype(int)
        tp = (y_t * y_b).sum(1); fp = ((1 - y_t) * y_b).sum(1); fn = (y_t * (1 - y_b)).sum(1)
        f1 = 2 * tp / (2 * tp + fp + fn + 1e-8); m = f1.mean()
        if m > best: best, thr = m, t
    return best, thr

def auprc(y_t, y_p):
    p, r, _ = precision_recall_curve(y_t.ravel(), y_p.ravel())
    return auc(r, p)

def smin(y_t, y_p, thr, alpha=0.5):
    y_b = (y_p >= thr).astype(int)
    ic = -(np.log((y_t + y_b).sum(0) + 1e-8) - np.log((y_t + y_b).sum() + 1e-8))
    ru = np.logical_and(y_b, np.logical_not(y_t)) * ic
    mi = np.logical_and(y_t, np.logical_not(y_b)) * ic
    return np.sqrt((alpha * ru.sum(1))**2 + ((1 - alpha) * mi.sum(1))**2).mean()

def show(name, y_p):
    f, thr = fmax(y_true, y_p)
    print(f"{name:>13s} Fmax={f:.4f} Thr={thr:.2f} "
          f"AuPRC={auprc(y_true, y_p):.4f} Smin={smin(y_true, y_p, thr):.4f}")

show("ProtBERT",     y_pb)
show("ProtBERT-BFD", y_bfd)
show("ESM-2",        y_esm)
show("Ensemble",     (y_pb + y_bfd + y_esm) / 3)

Output (stdout):
go.obo: fmt(1.2) rel(2025-03-16) 43,544 Terms

Output (stderr):
C:\Users\Melvin\anaconda3\envs\protein_env\lib\site-packages\sklearn\base.py:380: InconsistentVersionWarning: Trying to unpickle estimator MultiLabelBinarizer from version 1.1.3 when using version 1.6.1. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
  warnings.warn(

Output (stdout):
     ProtBERT Fmax=0.6611 Thr=0.45 AuPRC=0.6951 Smin=13.4386
 ProtBERT-BFD Fmax=0.6588 Thr=0.46 AuPRC=0.6991 Smin=13.5461
        ESM-2 Fmax=0.6378 Thr=0.35 AuPRC=0.6850 Smin=14.4083
     Ensemble Fmax=0.6880 Thr=0.37 AuPRC=0.7334 Smin=12.7141
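In equation form, the fmax and smin helpers above compute the following, writing N for the number of proteins, ŷ(τ) for the predictions binarized at threshold τ, and using the notebook's α = 0.5 (the 1e-8 stabilizers are left out):

$$F_{\max}=\max_{\tau\in\mathrm{THR}}\;\frac{1}{N}\sum_{i=1}^{N}\frac{2\,\mathrm{TP}_i(\tau)}{2\,\mathrm{TP}_i(\tau)+\mathrm{FP}_i(\tau)+\mathrm{FN}_i(\tau)}$$

$$S(\tau^{*})=\frac{1}{N}\sum_{i=1}^{N}\sqrt{\big(\alpha\, ru_i\big)^{2}+\big((1-\alpha)\, mi_i\big)^{2}},\qquad ru_i=\sum_{j:\,\hat y_{ij}=1,\,y_{ij}=0}\mathrm{IC}_j,\quad mi_i=\sum_{j:\,y_{ij}=1,\,\hat y_{ij}=0}\mathrm{IC}_j$$

where τ* is the Fmax-optimal threshold and IC_j = -log of the relative frequency of GO term j, estimated from the column sums of y_true + ŷ. In other words, Smin is reported at the Fmax threshold and its information content is estimated from the prediction matrices rather than from the loaded GO DAG, so the values are comparable across the models in this notebook but may not line up exactly with CAFA-style Smin.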
In [9]:
# %%
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, auc
import numpy as np
import math

# Prepare the data for stacking
# (y_pb, y_bfd and y_esm already have shape (1724, 597))
X_stack = np.concatenate([y_pb, y_bfd, y_esm], axis=1)   # (1724, 597*3)
y_stack = y_true.copy()                                  # (1724, 597)

# Train/validation split
X_train, X_val, y_train, y_val = train_test_split(X_stack, y_stack, test_size=0.3, random_state=42)

# MLP model (runs on the GPU automatically if one is available)
model = Sequential([
    Dense(512, activation="relu", input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(256, activation="relu"),
    Dropout(0.3),
    Dense(y_stack.shape[1], activation="sigmoid")
])

model.compile(optimizer=Adam(1e-3), loss="binary_crossentropy")

model.fit(X_train, y_train, validation_data=(X_val, y_val),
          epochs=50, batch_size=64, verbose=1,
          callbacks=[EarlyStopping(patience=5, restore_best_weights=True)])

# Predict with the stacking model (on the full X_stack matrix)
y_pred_stack = model.predict(X_stack, batch_size=64)

# Metrics
THR = np.linspace(0, 1, 101)

def fmax(y_t, y_p):
    best, thr = 0, 0
    for t in THR:
        y_b = (y_p >= t).astype(int)
        tp = (y_t * y_b).sum(1); fp = ((1 - y_t) * y_b).sum(1); fn = (y_t * (1 - y_b)).sum(1)
        f1 = 2 * tp / (2 * tp + fp + fn + 1e-8); m = f1.mean()
        if m > best: best, thr = m, t
    return best, thr

def auprc(y_t, y_p):
    p, r, _ = precision_recall_curve(y_t.ravel(), y_p.ravel())
    return auc(r, p)

def smin(y_t, y_p, thr, alpha=0.5):
    y_b = (y_p >= thr).astype(int)
    ic = -(np.log((y_t + y_b).sum(0) + 1e-8) - np.log((y_t + y_b).sum() + 1e-8))
    ru = np.logical_and(y_b, np.logical_not(y_t)) * ic
    mi = np.logical_and(y_t, np.logical_not(y_b)) * ic
    return np.sqrt((alpha * ru.sum(1))**2 + ((1 - alpha) * mi.sum(1))**2).mean()

f, thr = fmax(y_stack, y_pred_stack)
print(f"\n STACKING (GPU-Keras MLP)")
print(f"Fmax = {f:.4f}")
print(f"Thr. = {thr:.2f}")
print(f"AuPRC = {auprc(y_stack, y_pred_stack):.4f}")
print(f"Smin = {smin(y_stack, y_pred_stack, thr):.4f}")

Output (stdout):
Epoch 1/50
19/19 [==============================] - 1s 12ms/step - loss: 0.3895 - val_loss: 0.0855
Epoch 2/50
19/19 [==============================] - 0s 9ms/step - loss: 0.0879 - val_loss: 0.0704
Epoch 3/50
19/19 [==============================] - 0s 9ms/step - loss: 0.0625 - val_loss: 0.0567
Epoch 4/50
19/19 [==============================] - 0s 9ms/step - loss: 0.0553 - val_loss: 0.0526
Epoch 5/50
19/19 [==============================] - 0s 9ms/step - loss: 0.0508 - val_loss: 0.0484
Epoch 6/50
19/19 [==============================] - 0s 9ms/step - loss: 0.0468 - val_loss: 0.0452
Epoch 7/50
19/19 [==============================] - 0s 9ms/step - loss: 0.0433 - val_loss: 0.0428
Epoch 8/50
19/19 [==============================] - 0s 9ms/step - loss: 0.0407 - val_loss: 0.0409
Epoch 9/50
19/19 [==============================] - 0s 9ms/step - loss: 0.0387 - val_loss: 0.0395
Epoch 10/50
19/19 [==============================] - 0s 10ms/step - loss: 0.0369 - val_loss: 0.0382
Epoch 11/50
19/19 [==============================] - 0s 9ms/step - loss: 0.0352 - val_loss: 0.0367
Epoch 12/50
19/19 [==============================] - 0s 9ms/step - loss: 0.0339 - val_loss: 0.0359
Epoch 13/50
19/19 [==============================] - 0s 9ms/step - loss: 0.0328 - val_loss: 0.0352
Epoch 14/50
19/19 [==============================] - 0s 9ms/step - loss: 0.0315 - val_loss: 0.0344
Epoch 15/50
19/19 [==============================] - 0s 9ms/step - loss: 0.0305 - val_loss: 0.0341
Epoch 16/50
19/19 [==============================] - 0s 9ms/step - loss: 0.0296 - val_loss: 0.0336
Epoch 17/50
19/19 [==============================] - 0s 9ms/step - loss: 0.0291 - val_loss: 0.0332
Epoch 18/50
19/19 [==============================] - 0s 8ms/step - loss: 0.0282 - val_loss: 0.0331
Epoch 19/50
19/19 [==============================] - 0s 9ms/step - loss: 0.0273 - val_loss: 0.0329
Epoch 20/50
19/19 [==============================] - 0s 9ms/step - loss: 0.0269 - val_loss: 0.0329
Epoch 21/50
19/19 [==============================] - 0s 9ms/step - loss: 0.0264 - val_loss: 0.0324
Epoch 22/50
19/19 [==============================] - 0s 9ms/step - loss: 0.0257 - val_loss: 0.0325
Epoch 23/50
19/19 [==============================] - 0s 9ms/step - loss: 0.0253 - val_loss: 0.0324
Epoch 24/50
19/19 [==============================] - 0s 9ms/step - loss: 0.0246 - val_loss: 0.0322
Epoch 25/50
19/19 [==============================] - 0s 9ms/step - loss: 0.0247 - val_loss: 0.0323
Epoch 26/50
19/19 [==============================] - 0s 9ms/step - loss: 0.0241 - val_loss: 0.0321
Epoch 27/50
19/19 [==============================] - 0s 9ms/step - loss: 0.0236 - val_loss: 0.0323
Epoch 28/50
19/19 [==============================] - 0s 9ms/step - loss: 0.0233 - val_loss: 0.0324
Epoch 29/50
19/19 [==============================] - 0s 9ms/step - loss: 0.0228 - val_loss: 0.0325
Epoch 30/50
19/19 [==============================] - 0s 9ms/step - loss: 0.0227 - val_loss: 0.0323
Epoch 31/50
19/19 [==============================] - 0s 9ms/step - loss: 0.0219 - val_loss: 0.0325
27/27 [==============================] - 0s 2ms/step

 STACKING (GPU-Keras MLP)
Fmax = 0.6956
Thr. = 0.37
AuPRC = 0.7591
Smin = 12.2272
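Note that y_pred_stack above is computed on the full X_stack matrix, which includes the 70% of rows the stacking MLP was fitted on. A minimal sketch of the corresponding held-out check, assuming it runs right after the cell above and reuses its X_val/y_val split and the fmax/auprc/smin helpers defined there:

# Sketch: score the stacking MLP on the 30% validation split only
# (assumes the variables from the previous cell are still in scope)
y_pred_val = model.predict(X_val, batch_size=64)

f_val, thr_val = fmax(y_val, y_pred_val)
print(f"Held-out Fmax  = {f_val:.4f} (thr = {thr_val:.2f})")
print(f"Held-out AuPRC = {auprc(y_val, y_pred_val):.4f}")
print(f"Held-out Smin  = {smin(y_val, y_pred_val, thr_val):.4f}")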
In [11]:
model.save("models/ensemble_stack.h5")
model.save("models/ensemble_stack.keras")
print("Model saved to models/")

Output (stdout):
Model saved to models/

Output (stderr):
C:\Users\Melvin\anaconda3\envs\protein_env\lib\site-packages\keras\src\engine\training.py:3000: UserWarning: You are saving your model as an HDF5 file via `model.save()`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')`.
  saving_api.save_model(
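The file saved above in the native Keras format can be reloaded later with load_model; a minimal sketch, reusing the same path as in the cell above:

from tensorflow.keras.models import load_model

# Reload the stacking MLP saved above (native Keras format)
stack_model = load_model("models/ensemble_stack.keras")

# The legacy HDF5 copy loads the same way:
# stack_model = load_model("models/ensemble_stack.h5")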
In [10]:

Output (stdout):

🔎 STACKING (MLP) — full evaluation
Fmax = 0.6956
Thr. = 0.37
AuPRC = 0.7591
Smin = 12.2272

Output (image/png):
[matplotlib figure attached to this cell]
wBAg8I9FgBwrSorpYyMv14DAAAXzlgAAAAA8BjBAgAAAIDHCBYAAAAAPEawAAAAAOAxggUAAAAAj91wT4UyM0lScXGxV7dTXl6uixcvqri4WEFBQV7dFhoP+qKR+/v/ul1cfF2eDEVPoDZ6AnWhL1Cbr3qi+pi5+hj6am64YFFSUiJJiomJ8XMlABq16Gh/VwAAgM+UlJSoVatWV53jsGuJH01IVVWVTpw4obCwMDkcDq9tp7i4WDExMTp+/LjCw8O9th00LvQFaqMnUBs9gbrQF6jNVz1hZiopKVF0dLSaNbv6XRQ33BmLZs2aqUOHDj7bXnh4OF8AuAx9gdroCdRGT6Au9AVq80VP/NOZimrcvA0AAADAYwQLAAAAAB4jWHhJcHCwUlNTFRwc7O9S0IDQF6iNnkBt9ATqQl+gtobYEzfczdsAAAAArj/OWAAAAADwGMECAAAAgMcIFgAAAAA8RrDwwJIlS3TbbbcpJCREAwYM0O7du686//PPP1fXrl0VEhKinj17KiMjw0eVwpfc6YulS5dq0KBBatOmjdq0aaOEhIR/7CM0Pu5+V1RLT0+Xw+HQqFGjvFsgfM7dnjh37pymTJmiqKgoBQcHq0uXLvwNaYLc7Ys333xTd955p5o3b66YmBhNmzZNly5d8lG18LZt27ZpxIgRio6OlsPh0JdffvmP62RmZqpPnz4KDg5W586dlZaW5vU6azDUS3p6ujmdTvvwww/thx9+sCeffNJat25tp06dqnN+VlaWBQQE2IIFC+zAgQP20ksvWVBQkO3fv9/HlcOb3O2LcePG2ZIlSywnJ8fy8vLs0UcftVatWtnPP//s48rhLe72RLWCggK75ZZbbNCgQfbggw/6plj4hLs98ccff1i/fv1s+PDhtn37disoKLDMzEzLzc31ceXwJnf7YsWKFRYcHGwrVqywgoIC27hxo0VFRdm0adN8XDm8JSMjw1JSUmz16tUmydasWXPV+UeOHLEWLVrY9OnT7cCBA7Z48WILCAiwDRs2+KZgMyNY1FP//v1typQprveVlZUWHR1tr7/+ep3zx4wZYw888ECNsQEDBthTTz3l1TrhW+72RW0VFRUWFhZmH330kbdKhI/VpycqKips4MCBtmzZMpswYQLBoolxtyfeeecd69Spk5WVlfmqRPiBu30xZcoUGzp0aI2x6dOnW3x8vFfrhH9cS7B4/vnnrUePHjXGxo4da8OGDfNiZTVxKVQ9lJWVKTs7WwkJCa6xZs2aKSEhQTt37qxznZ07d9aYL0nDhg274nw0PvXpi9ouXryo8vJy3XTTTd4qEz5U356YPXu22rVrpyeeeMIXZcKH6tMTa9euVVxcnKZMmaLIyEjdddddmjdvniorK31VNrysPn0xcOBAZWdnuy6XOnLkiDIyMjR8+HCf1IyGpyEcawb6bEtNyJkzZ1RZWanIyMga45GRkfrxxx/rXKeoqKjO+UVFRV6rE75Vn76obebMmYqOjr7siwGNU316Yvv27frggw+Um5vrgwrha/XpiSNHjuibb77R+PHjlZGRoUOHDmny5MkqLy9XamqqL8qGl9WnL8aNG6czZ87onnvukZmpoqJCkyZN0osvvuiLktEAXelYs7i4WKWlpWrevLnXa+CMBdBAzJ8/X+np6VqzZo1CQkL8XQ78oKSkRImJiVq6dKkiIiL8XQ4aiKqqKrVr107vv/+++vbtq7FjxyolJUXvvvuuv0uDH2VmZmrevHl6++239d1332n16tVat26d5syZ4+/ScAPjjEU9REREKCAgQKdOnaoxfurUKbVv377Oddq3b+/WfDQ+9emLagsXLtT8+fO1efNm9erVy5tlwofc7YnDhw+rsLBQI0aMcI1VVVVJkgIDA5Wfn6/Y2FjvFg2vqs/3RFRUlIKCghQQEOAa69atm4qKilRWVian0+nVmuF99emLl19+WYmJiUpKSpIk9ezZUxcuXNDEiROVkpKiZs34t+MbzZWONcPDw31ytkLijEW9OJ1O9e3bV1u2bHGNVVVVacuWLYqLi6tznbi4uBrzJenrr7++4nw0PvXpC0lasGCB5syZow0bNqhfv36+KBU+4m5PdO3aVfv371dubq5rGTlypIYMGaLc3FzFxMT4snx4QX2+J+Lj43Xo0CFXyJSkn376SVFRUYSKJqI+fXHx4sXLwkN1+DQz7xWLBqtBHGv67DbxJiY9Pd2Cg4MtLS3NDhw4YBMnTrTWrVtbUVGRmZklJiZacnKya35WVpYFBgbawoULLS8vz1JTU3ncbBPkbl/Mnz/fnE6nffHFF3by5EnXUlJS4q9dwHXmbk/UxlOhmh53e+LYsWMWFhZmU6dOtfz8fPvqq6+sXbt29tprr/lrF+AF7vZFamqqhYWF2aeffmpHjhyxTZs2WWxsrI0ZM8Zfu4DrrKSkxHJyciwnJ8ck2RtvvGE5OTl29OhRMzNLTk62xMRE1/zqx83OmDHD8vLybMmSJTxutjFZvHix3XrrreZ0Oq1///62a9cu188GDx5sEyZMqDH/s88+sy5dupjT6bQePXrYunXrfFwxfMGdvujYsaNJumxJTU31feHwGne/K/6OYNE0udsTO3bssAEDBlhwcLB16tTJ5s6daxUVFT6uGt7mTl+Ul5fbrFmzLDY21kJCQiwmJsYmT55sZ8+e9X3h8IqtW7fWeYxQ3QcTJkywwYMHX7ZO7969zel0WqdOnWz58uU+rdlhxvkyAAAAAJ7hHgsAAAAAHiNYAAAAAPAYwQIAAACAxwgWAAAAADxGsAAAAADgMYIFAAAAAI8RLAAAAAB4jGABAAAAwGMECwDAP8rMzJTD4dC5c+d8ut20tDS1bt3ao88oLCyUw+FQbm7uFef4a/8AoCkhWADADc7hcFx1mTVrlr9LBAA0AoH+LgAA4F8nT550vV65cqVeeeUV5efnu8ZCQ0O1d+9etz+3rKxMTqfzutQIAGj4OGMBADe49u3bu5ZWrVrJ4XDUGAsNDXXNzc7OVr9+/dSiRQsNHDiwRgCZNWuWevfurWXLlun2229XSEiIJOncuXNKSkpS27ZtFR4erqFDh2rfvn2u9fbt26chQ4YoLCxM4eHh6tu372VBZuPGjerWrZtCQ0N1//331whDVVVVmj17tjp06KDg4GD17t1bGzZsuOo+Z2RkqEuXLmrevLmGDBmiwsJCT36FAAARLAAAbkhJSdGiRYu0d+9eBQYG6vHHH6/x80OHDmnVqlVavXq1656Ghx9+WKdPn9b69euVnZ2tPn366L777tNvv/0mSRo/frw6dOigPXv2KDs7W8nJyQoKCnJ95sWLF7Vw4UJ9/PHH2rZtm44dO6bnnnvO9fO33npLixYt0sKFC/X9999r2LBhGjlypA4ePFjnPhw/flyjR4/WiBEjlJubq6SkJCUnJ1/n3xQA3IAMAID/t3z5cmvVqtVl41u3bjVJtnnzZtfYunXrTJKVlpaamVlqaqoFBQXZ6dOnXXO+/fZbCw8Pt0uXLtX4vNjYWHvvvffMzCwsLMzS0tKuWI8kO3TokGtsyZIl
FhkZ6XofHR1tc+fOrbHe3XffbZMnTzYzs4KCApNkOTk5Zmb2wgsvWPfu3WvMnzlzpkmys2fP1lkHAOCfccYCAHDNevXq5XodFRUlSTp9+rRrrGPHjmrbtq3r/b59+3T+/HndfPPNCg0NdS0FBQU6fPiwJGn69OlKSkpSQkKC5s+f7xqv1qJFC8XGxtbYbvU2i4uLdeLECcXHx9dYJz4+Xnl5eXXuQ15engYMGFBjLC4u7pp/BwCAunHzNgDgmv39EiWHwyHpz3scqrVs2bLG/PPnzysqKkqZmZmXfVb1Y2RnzZqlcePGad26dVq/fr1SU1OVnp6uhx566LJtVm/XzK7H7gAAriPOWAAAvKZPnz4qKipSYGCgOnfuXGOJiIhwzevSpYumTZumTZs2afTo0Vq+fPk1fX54eLiio6OVlZVVYzwrK0vdu3evc51u3bpp9+7dNcZ27drl5p4BAGojWAAAvCYhIUFxcXEaNWqUNm3apMLCQu3YsUMpKSnau3evSktLNXXqVGVmZuro0aPKysrSnj171K1bt2vexowZM/Tvf/9bK1euVH5+vpKTk5Wbm6tnnnmmzvmTJk3SwYMHNWPGDOXn5+uTTz5RWlraddpjALhxcSkUAMBrHA6HMjIylJKSoscee0y//PKL2rdvr3vvvVeRkZEKCAjQr7/+qkceeUSnTp1SRESERo8erVdfffWat/H000/r999/17PPPqvTp0+re/fuWrt2re6444465996661atWqVpk2bpsWLF6t///6aN2/eZU+4AgC4x2FcqAoAAADAQ1wKBQAAAMBjBAsAAAAAHiNYAAAAAPAYwQIAAACAxwgWAAAAADxGsAAAAADgMYIFAAAAAI8RLAAAAAB4jGABAAAAwGMECwAAAAAeI1gAAAAA8BjBAgAAAIDH/g89hibaBlA91AAAAABJRU5ErkJggg==",
|
| 297 |
+
"text/plain": [
|
| 298 |
+
"<Figure size 800x500 with 1 Axes>"
|
| 299 |
+
]
|
| 300 |
+
},
|
| 301 |
+
"metadata": {},
|
| 302 |
+
"output_type": "display_data"
|
| 303 |
+
}
|
| 304 |
+
],
|
| 305 |
+
"source": [
|
| 306 |
+
"# %%\n",
|
| 307 |
+
"import numpy as np\n",
|
| 308 |
+
"import matplotlib.pyplot as plt\n",
|
| 309 |
+
"from sklearn.metrics import precision_recall_curve, auc\n",
|
| 310 |
+
"\n",
|
| 311 |
+
"# Funções métricas\n",
|
| 312 |
+
"THR = np.linspace(0.01, 0.99, 99)\n",
|
| 313 |
+
"\n",
|
| 314 |
+
"def fmax(y_t, y_p):\n",
|
| 315 |
+
" best, thr = 0, 0\n",
|
| 316 |
+
" for t in THR:\n",
|
| 317 |
+
" y_b = (y_p >= t).astype(int)\n",
|
| 318 |
+
" tp = (y_t * y_b).sum(1)\n",
|
| 319 |
+
" fp = ((1 - y_t) * y_b).sum(1)\n",
|
| 320 |
+
" fn = (y_t * (1 - y_b)).sum(1)\n",
|
| 321 |
+
" f1 = 2 * tp / (2 * tp + fp + fn + 1e-8)\n",
|
| 322 |
+
" m = f1.mean()\n",
|
| 323 |
+
" if m > best:\n",
|
| 324 |
+
" best, thr = m, t\n",
|
| 325 |
+
" return best, thr\n",
|
| 326 |
+
"\n",
|
| 327 |
+
"def auprc(y_t, y_p):\n",
|
| 328 |
+
" p, r, _ = precision_recall_curve(y_t.ravel(), y_p.ravel())\n",
|
| 329 |
+
" return auc(r, p)\n",
|
| 330 |
+
"\n",
|
| 331 |
+
"def smin(y_t, y_p, thr, alpha=0.5):\n",
|
| 332 |
+
" y_b = (y_p >= thr).astype(int)\n",
|
| 333 |
+
" ic = -(np.log((y_t + y_b).sum(0) + 1e-8) - np.log((y_t + y_b).sum() + 1e-8))\n",
|
| 334 |
+
" ru = np.logical_and(y_b, np.logical_not(y_t)) * ic\n",
|
| 335 |
+
" mi = np.logical_and(y_t, np.logical_not(y_b)) * ic\n",
|
| 336 |
+
" return np.sqrt((alpha * ru.sum(1))**2 + ((1 - alpha) * mi.sum(1))**2).mean()\n",
|
| 337 |
+
"\n",
|
| 338 |
+
"# Avaliação\n",
|
| 339 |
+
"f, thr = fmax(y_stack, y_pred_stack)\n",
|
| 340 |
+
"print(f\"\\n🔎 STACKING (MLP) — Avaliação completa\")\n",
|
| 341 |
+
"print(f\"Fmax = {f:.4f}\")\n",
|
| 342 |
+
"print(f\"Thr. = {thr:.2f}\")\n",
|
| 343 |
+
"print(f\"AuPRC = {auprc(y_stack, y_pred_stack):.4f}\")\n",
|
| 344 |
+
"print(f\"Smin = {smin(y_stack, y_pred_stack, thr):.4f}\")\n",
|
| 345 |
+
"\n",
|
| 346 |
+
"# Gráfico Fmax vs Threshold\n",
|
| 347 |
+
"fmax_scores = []\n",
|
| 348 |
+
"for t in THR:\n",
|
| 349 |
+
" y_b = (y_pred_stack >= t).astype(int)\n",
|
| 350 |
+
" tp = (y_stack * y_b).sum(1)\n",
|
| 351 |
+
" fp = ((1 - y_stack) * y_b).sum(1)\n",
|
| 352 |
+
" fn = (y_stack * (1 - y_b)).sum(1)\n",
|
| 353 |
+
" f1 = 2 * tp / (2 * tp + fp + fn + 1e-8)\n",
|
| 354 |
+
" fmax_scores.append(f1.mean())\n",
|
| 355 |
+
"\n",
|
| 356 |
+
"plt.figure(figsize=(8, 5))\n",
|
| 357 |
+
"plt.plot(THR, fmax_scores, label=\"F1 médio (Fmax)\")\n",
|
| 358 |
+
"plt.axvline(thr, color=\"red\", linestyle=\"--\", label=f\"Threshold ótimo = {thr:.2f}\")\n",
|
| 359 |
+
"plt.xlabel(\"Threshold\")\n",
|
| 360 |
+
"plt.ylabel(\"F1-score médio\")\n",
|
| 361 |
+
"plt.title(\"Fmax vs Threshold (Stacking MLP)\")\n",
|
| 362 |
+
"plt.legend()\n",
|
| 363 |
+
"plt.grid(True)\n",
|
| 364 |
+
"plt.tight_layout()\n",
|
| 365 |
+
"plt.show()\n"
|
| 366 |
+
]
|
| 367 |
+
},
|
| 368 |
+
{
|
| 369 |
+
"cell_type": "code",
|
| 370 |
+
"execution_count": null,
|
| 371 |
+
"id": "ee9f9472-0b22-4ceb-ab09-7d925349e237",
|
| 372 |
+
"metadata": {},
|
| 373 |
+
"outputs": [],
|
| 374 |
+
"source": []
|
| 375 |
+
}
|
| 376 |
+
],
|
| 377 |
+
"metadata": {
|
| 378 |
+
"kernelspec": {
|
| 379 |
+
"display_name": "Python 3 (ipykernel)",
|
| 380 |
+
"language": "python",
|
| 381 |
+
"name": "python3"
|
| 382 |
+
},
|
| 383 |
+
"language_info": {
|
| 384 |
+
"codemirror_mode": {
|
| 385 |
+
"name": "ipython",
|
| 386 |
+
"version": 3
|
| 387 |
+
},
|
| 388 |
+
"file_extension": ".py",
|
| 389 |
+
"mimetype": "text/x-python",
|
| 390 |
+
"name": "python",
|
| 391 |
+
"nbconvert_exporter": "python",
|
| 392 |
+
"pygments_lexer": "ipython3",
|
| 393 |
+
"version": "3.10.16"
|
| 394 |
+
}
|
| 395 |
+
},
|
| 396 |
+
"nbformat": 4,
|
| 397 |
+
"nbformat_minor": 5
|
| 398 |
+
}
|
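The fmax helper in the evaluation cell above is protein-centric: for each candidate threshold it binarises the scores, computes an F1 per protein, and keeps the threshold with the best mean. A toy worked example may make the sweep concrete; the two small arrays below are illustrative only, not project data.

# Minimal sketch of the protein-centric Fmax sweep (toy data).
import numpy as np

y_true = np.array([[1, 0, 1],
                   [0, 1, 0]])               # 2 proteins x 3 GO terms
y_prob = np.array([[0.9, 0.2, 0.6],
                   [0.1, 0.8, 0.4]])

best, best_thr = 0.0, 0.0
for t in np.linspace(0.01, 0.99, 99):
    y_bin = (y_prob >= t).astype(int)
    tp = (y_true * y_bin).sum(1)
    fp = ((1 - y_true) * y_bin).sum(1)
    fn = (y_true * (1 - y_bin)).sum(1)
    f1 = 2 * tp / (2 * tp + fp + fn + 1e-8)  # per-protein F1
    if f1.mean() > best:
        best, best_thr = f1.mean(), t

print(f"Fmax = {best:.3f} at threshold {best_thr:.2f}")  # 1.000 at 0.41 for this toy case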
notebooks/Input.ipynb
ADDED
|
@@ -0,0 +1,152 @@
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 15,
|
| 6 |
+
"id": "78731790-cecc-4e7b-9599-c35a9fad1c11",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [
|
| 9 |
+
{
|
| 10 |
+
"name": "stdout",
|
| 11 |
+
"output_type": "stream",
|
| 12 |
+
"text": [
|
| 13 |
+
"A gerar embeddings …\n"
|
| 14 |
+
]
|
| 15 |
+
},
|
| 16 |
+
{
|
| 17 |
+
"name": "stderr",
|
| 18 |
+
"output_type": "stream",
|
| 19 |
+
"text": [
|
| 20 |
+
"Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t33_650M_UR50D and are newly initialized: ['esm.pooler.dense.bias', 'esm.pooler.dense.weight']\n",
|
| 21 |
+
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
|
| 22 |
+
]
|
| 23 |
+
},
|
| 24 |
+
{
|
| 25 |
+
"name": "stdout",
|
| 26 |
+
"output_type": "stream",
|
| 27 |
+
"text": [
|
| 28 |
+
"A fazer predições individuais …\n",
|
| 29 |
+
"1/1 [==============================] - 0s 47ms/step\n",
|
| 30 |
+
"1/1 [==============================] - 0s 33ms/step\n",
|
| 31 |
+
"1/1 [==============================] - 0s 30ms/step\n"
|
| 32 |
+
]
|
| 33 |
+
},
|
| 34 |
+
{
|
| 35 |
+
"ename": "ValueError",
|
| 36 |
+
"evalue": "in user code:\n\n File \"C:\\Users\\Melvin\\anaconda3\\envs\\protein_env\\lib\\site-packages\\keras\\src\\engine\\training.py\", line 2341, in predict_function *\n return step_function(self, iterator)\n File \"C:\\Users\\Melvin\\anaconda3\\envs\\protein_env\\lib\\site-packages\\keras\\src\\engine\\training.py\", line 2327, in step_function **\n outputs = model.distribute_strategy.run(run_step, args=(data,))\n File \"C:\\Users\\Melvin\\anaconda3\\envs\\protein_env\\lib\\site-packages\\keras\\src\\engine\\training.py\", line 2315, in run_step **\n outputs = model.predict_step(data)\n File \"C:\\Users\\Melvin\\anaconda3\\envs\\protein_env\\lib\\site-packages\\keras\\src\\engine\\training.py\", line 2283, in predict_step\n return self(x, training=False)\n File \"C:\\Users\\Melvin\\anaconda3\\envs\\protein_env\\lib\\site-packages\\keras\\src\\utils\\traceback_utils.py\", line 70, in error_handler\n raise e.with_traceback(filtered_tb) from None\n File \"C:\\Users\\Melvin\\anaconda3\\envs\\protein_env\\lib\\site-packages\\keras\\src\\engine\\input_spec.py\", line 298, in assert_input_compatibility\n raise ValueError(\n\n ValueError: Input 0 of layer \"sequential\" is incompatible with the layer: expected shape=(None, 1779), found shape=(None, 1791)\n",
|
| 37 |
+
"output_type": "error",
|
| 38 |
+
"traceback": [
|
| 39 |
+
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
| 40 |
+
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
|
| 41 |
+
"Cell \u001b[1;32mIn[15], line 47\u001b[0m\n\u001b[0;32m 45\u001b[0m \u001b[38;5;66;03m# --- 4. Ensemble (stacking) -----------------------------------------------\u001b[39;00m\n\u001b[0;32m 46\u001b[0m X_stack \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mconcatenate([y_pb, y_bfd, y_esm], axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)\n\u001b[1;32m---> 47\u001b[0m y_ens \u001b[38;5;241m=\u001b[39m \u001b[43mstacking\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpredict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_stack\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 49\u001b[0m \u001b[38;5;66;03m# --- 5. Carregar MultiLabelBinarizer ---------------------------------------\u001b[39;00m\n\u001b[0;32m 50\u001b[0m mlb \u001b[38;5;241m=\u001b[39m joblib\u001b[38;5;241m.\u001b[39mload(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdata/mlb_597.pkl\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
|
| 42 |
+
"File \u001b[1;32m~\\anaconda3\\envs\\protein_env\\lib\\site-packages\\keras\\src\\utils\\traceback_utils.py:70\u001b[0m, in \u001b[0;36mfilter_traceback.<locals>.error_handler\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 67\u001b[0m filtered_tb \u001b[38;5;241m=\u001b[39m _process_traceback_frames(e\u001b[38;5;241m.\u001b[39m__traceback__)\n\u001b[0;32m 68\u001b[0m \u001b[38;5;66;03m# To get the full stack trace, call:\u001b[39;00m\n\u001b[0;32m 69\u001b[0m \u001b[38;5;66;03m# `tf.debugging.disable_traceback_filtering()`\u001b[39;00m\n\u001b[1;32m---> 70\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\u001b[38;5;241m.\u001b[39mwith_traceback(filtered_tb) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m 71\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[0;32m 72\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m filtered_tb\n",
|
| 43 |
+
"File \u001b[1;32m~\\AppData\\Local\\Temp\\__autograph_generated_filen1meoyfq.py:15\u001b[0m, in \u001b[0;36mouter_factory.<locals>.inner_factory.<locals>.tf__predict_function\u001b[1;34m(iterator)\u001b[0m\n\u001b[0;32m 13\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 14\u001b[0m do_return \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m---> 15\u001b[0m retval_ \u001b[38;5;241m=\u001b[39m ag__\u001b[38;5;241m.\u001b[39mconverted_call(ag__\u001b[38;5;241m.\u001b[39mld(step_function), (ag__\u001b[38;5;241m.\u001b[39mld(\u001b[38;5;28mself\u001b[39m), ag__\u001b[38;5;241m.\u001b[39mld(iterator)), \u001b[38;5;28;01mNone\u001b[39;00m, fscope)\n\u001b[0;32m 16\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m:\n\u001b[0;32m 17\u001b[0m do_return \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n",
|
| 44 |
+
"\u001b[1;31mValueError\u001b[0m: in user code:\n\n File \"C:\\Users\\Melvin\\anaconda3\\envs\\protein_env\\lib\\site-packages\\keras\\src\\engine\\training.py\", line 2341, in predict_function *\n return step_function(self, iterator)\n File \"C:\\Users\\Melvin\\anaconda3\\envs\\protein_env\\lib\\site-packages\\keras\\src\\engine\\training.py\", line 2327, in step_function **\n outputs = model.distribute_strategy.run(run_step, args=(data,))\n File \"C:\\Users\\Melvin\\anaconda3\\envs\\protein_env\\lib\\site-packages\\keras\\src\\engine\\training.py\", line 2315, in run_step **\n outputs = model.predict_step(data)\n File \"C:\\Users\\Melvin\\anaconda3\\envs\\protein_env\\lib\\site-packages\\keras\\src\\engine\\training.py\", line 2283, in predict_step\n return self(x, training=False)\n File \"C:\\Users\\Melvin\\anaconda3\\envs\\protein_env\\lib\\site-packages\\keras\\src\\utils\\traceback_utils.py\", line 70, in error_handler\n raise e.with_traceback(filtered_tb) from None\n File \"C:\\Users\\Melvin\\anaconda3\\envs\\protein_env\\lib\\site-packages\\keras\\src\\engine\\input_spec.py\", line 298, in assert_input_compatibility\n raise ValueError(\n\n ValueError: Input 0 of layer \"sequential\" is incompatible with the layer: expected shape=(None, 1779), found shape=(None, 1791)\n"
|
| 45 |
+
]
|
| 46 |
+
}
|
| 47 |
+
],
|
| 48 |
+
"source": [
|
| 49 |
+
"# %%\n",
|
| 50 |
+
"import numpy as np\n",
|
| 51 |
+
"import torch\n",
|
| 52 |
+
"from transformers import AutoTokenizer, AutoModel\n",
|
| 53 |
+
"from tensorflow.keras.models import load_model\n",
|
| 54 |
+
"import joblib\n",
|
| 55 |
+
"\n",
|
| 56 |
+
"# Parâmetros\n",
|
| 57 |
+
"SEQ_FASTA = \"MFNVESVERVELCESLLTWIQTFNVDAPCQTAEDLTNGVVMSQVLQKIDPVYFDDNWLNRIKTEVGDNWRLKISNLKKILKGILDYNHEILGQQINDFTLPDVNLIGEHSDAAELGRMLQLILGCAVNCEQKQEYIQAIMMMEESVQHVVMTAIQELMSKESPVSAGHDAYVDLDRQLKKTTEELNEALSAKEEIAQRCHELDMQVAALQEEKSSLLAENQILMERLNQSDSIEDPNSPAGRRHLQLQTQLEQLQEETFRLEAAKDDYRIRCEELEKEISELRQQNDELTTLADEAQSLKDEIDVLRHSSDKVSKLEGQVESYKKKLEDLGDLRRQVKLLEEKNTMYMQNTVSLEEELRKANAARGQLETYKRQVVELQNRLSDESKKADKLDFEYKRLKEKVDGLQKEKDRLRTERDSLKETIEELRCVQAQEGQLTTQGLMPLGSQESSDSLAAEIVTPEIREKLIRLQHENKMLKLNQEDSDNEKIALLQSLLDDANLRKNELETENRLVNQRLLEVQSQVEELQKSLQDQGSKAEDSVLLKKKLEEHLEKLHEANNELQKKRAIIEDLEPRFNNSSLRIEELQEALRKKEEEMKQMEERYKKYLEKAKSVIRTLDPKQNQGAAPEIQALKNQLQERDRLFHSLEKEYEKTKSQRDMEEKYIVSAWYNMGMTLHKKAAEDRLASTGSGQSFLARQRQATSTRRSYPGHVQPATAR\" # (mantém a tua sequência completa)\n",
|
| 58 |
+
"TOP_N = 10\n",
|
| 59 |
+
"THRESH = 0.37 \n",
|
| 60 |
+
"\n",
|
| 61 |
+
"# Funções auxiliares\n",
|
| 62 |
+
"def get_embedding_mean(model_name, seq, chunk):\n",
|
| 63 |
+
" tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=False)\n",
|
| 64 |
+
" model = AutoModel.from_pretrained(model_name)\n",
|
| 65 |
+
" model.eval()\n",
|
| 66 |
+
"\n",
|
| 67 |
+
" chunks = [seq[i:i+chunk] for i in range(0, len(seq), chunk)]\n",
|
| 68 |
+
" reps = []\n",
|
| 69 |
+
" for c in chunks:\n",
|
| 70 |
+
" tokens = tokenizer(\" \".join(c), return_tensors=\"pt\", truncation=False, padding=False)\n",
|
| 71 |
+
" with torch.no_grad():\n",
|
| 72 |
+
" reps.append(model(**tokens).last_hidden_state[:, 0, :].squeeze().numpy())\n",
|
| 73 |
+
" return np.mean(reps, axis=0, keepdims=True) # shape (1, dim)\n",
|
| 74 |
+
"\n",
|
| 75 |
+
"# Embeddings\n",
|
| 76 |
+
"print(\"A gerar embeddings …\")\n",
|
| 77 |
+
"emb_pb = get_embedding_mean(\"Rostlab/prot_bert\", SEQ_FASTA, 512)\n",
|
| 78 |
+
"emb_bfd = get_embedding_mean(\"Rostlab/prot_bert_bfd\", SEQ_FASTA, 512)\n",
|
| 79 |
+
"emb_esm = get_embedding_mean(\"facebook/esm2_t33_650M_UR50D\", SEQ_FASTA, 1024)\n",
|
| 80 |
+
"\n",
|
| 81 |
+
"# Carregar modelos\n",
|
| 82 |
+
"mlp_pb = load_model(\"models/mlp_protbert.h5\")\n",
|
| 83 |
+
"mlp_bfd = load_model(\"models/mlp_protbertbfd.h5\")\n",
|
| 84 |
+
"mlp_esm = load_model(\"models/mlp_esm2.h5\")\n",
|
| 85 |
+
"stacking = load_model(\"models/ensemble_stack.h5\")\n",
|
| 86 |
+
"\n",
|
| 87 |
+
"# Predições dos MLPs base\n",
|
| 88 |
+
"print(\"A fazer predições individuais …\")\n",
|
| 89 |
+
"y_pb = mlp_pb.predict(emb_pb)[:, :597]\n",
|
| 90 |
+
"y_bfd = mlp_bfd.predict(emb_bfd)[:, :597]\n",
|
| 91 |
+
"y_esm = mlp_esm.predict(emb_esm)[:, :597]\n",
|
| 92 |
+
"\n",
|
| 93 |
+
"# --- 4. Ensemble (stacking)\n",
|
| 94 |
+
"X_stack = np.concatenate([y_pb, y_bfd, y_esm], axis=1)\n",
|
| 95 |
+
"y_ens = stacking.predict(X_stack)\n",
|
| 96 |
+
"\n",
|
| 97 |
+
"# --- 5. Carregar MultiLabelBinarizer\n",
|
| 98 |
+
"mlb = joblib.load(\"data/mlb_597.pkl\")\n",
|
| 99 |
+
"GO = mlb.classes_\n",
|
| 100 |
+
"\n",
|
| 101 |
+
"# --- 6. Função para mostrar resultados\n",
|
| 102 |
+
"def print_results(name, y_pred):\n",
|
| 103 |
+
" print(f\"\\n {name}\")\n",
|
| 104 |
+
" # GO terms acima do limiar\n",
|
| 105 |
+
" terms = mlb.inverse_transform((y_pred >= THRESH).astype(int))\n",
|
| 106 |
+
" print(f\" GO terms com prob ≥ {THRESH}:\")\n",
|
| 107 |
+
" print(\" \", terms[0] if terms[0] else \"Nenhum\")\n",
|
| 108 |
+
"\n",
|
| 109 |
+
" # Top-N\n",
|
| 110 |
+
" top_idx = np.argsort(-y_pred[0])[:TOP_N]\n",
|
| 111 |
+
" print(f\" Top {TOP_N} mais prováveis:\")\n",
|
| 112 |
+
" for i in top_idx:\n",
|
| 113 |
+
" print(f\" {GO[i]} : {y_pred[0][i]:.4f}\")\n",
|
| 114 |
+
"\n",
|
| 115 |
+
"# Imprimir tudo\n",
|
| 116 |
+
"print_results(\"ProtBERT (MLP)\", y_pb)\n",
|
| 117 |
+
"print_results(\"ProtBERT-BFD (MLP)\", y_bfd)\n",
|
| 118 |
+
"print_results(\"ESM-2 (MLP)\", y_esm)\n",
|
| 119 |
+
"print_results(\"Ensemble (Stacking)\", y_ens)\n"
|
| 120 |
+
]
|
| 121 |
+
},
|
| 122 |
+
{
|
| 123 |
+
"cell_type": "code",
|
| 124 |
+
"execution_count": null,
|
| 125 |
+
"id": "70a3035b-01cd-4c63-b34d-d520d2aa88bf",
|
| 126 |
+
"metadata": {},
|
| 127 |
+
"outputs": [],
|
| 128 |
+
"source": []
|
| 129 |
+
}
|
| 130 |
+
],
|
| 131 |
+
"metadata": {
|
| 132 |
+
"kernelspec": {
|
| 133 |
+
"display_name": "Python 3 (ipykernel)",
|
| 134 |
+
"language": "python",
|
| 135 |
+
"name": "python3"
|
| 136 |
+
},
|
| 137 |
+
"language_info": {
|
| 138 |
+
"codemirror_mode": {
|
| 139 |
+
"name": "ipython",
|
| 140 |
+
"version": 3
|
| 141 |
+
},
|
| 142 |
+
"file_extension": ".py",
|
| 143 |
+
"mimetype": "text/x-python",
|
| 144 |
+
"name": "python",
|
| 145 |
+
"nbconvert_exporter": "python",
|
| 146 |
+
"pygments_lexer": "ipython3",
|
| 147 |
+
"version": "3.10.16"
|
| 148 |
+
}
|
| 149 |
+
},
|
| 150 |
+
"nbformat": 4,
|
| 151 |
+
"nbformat_minor": 5
|
| 152 |
+
}
|
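The ValueError recorded above is a label-space mismatch: the saved stacking model expects 1779 input features (presumably 3 × 593 GO terms), while each base MLP here is sliced to 597 terms, so the concatenation has 1791 columns. A small diagnostic sketch, reusing the model paths from the cell above (the numbers in the comments come from the recorded traceback), can make the disagreement explicit before predicting:

# Compare the widths implied by the saved models before building X_stack.
from tensorflow.keras.models import load_model

stacking = load_model("models/ensemble_stack.h5")
mlp_pb = load_model("models/mlp_protbert.h5")

n_stack_in = stacking.input_shape[-1]   # 1779 in the recorded run
per_model = n_stack_in // 3             # width each base prediction must be cut to
n_base_out = mlp_pb.output_shape[-1]    # what the base MLP actually emits

print(f"stacking expects {n_stack_in} = 3 x {per_model}; base MLP emits {n_base_out}")
# If these disagree, either retrain the stacking model on the current base outputs,
# or slice each base prediction to per_model columns using the same GO-term ordering
# (the same MultiLabelBinarizer) that the stacking model was trained with.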
notebooks/mlp_esm2.ipynb
ADDED
|
@@ -0,0 +1,565 @@
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 9,
|
| 6 |
+
"id": "641053e3-7fec-4f9b-a75e-ddd957af03c4",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [
|
| 9 |
+
{
|
| 10 |
+
"name": "stdout",
|
| 11 |
+
"output_type": "stream",
|
| 12 |
+
"text": [
|
| 13 |
+
"go.obo: fmt(1.2) rel(2025-03-16) 43,544 Terms\n",
|
| 14 |
+
"✓ Dataset preparado:\n",
|
| 15 |
+
" - Training: (31142, 3)\n",
|
| 16 |
+
" - Validation: (1724, 3)\n",
|
| 17 |
+
" - Test: (1724, 3)\n",
|
| 18 |
+
" - GO terms: 602\n"
|
| 19 |
+
]
|
| 20 |
+
}
|
| 21 |
+
],
|
| 22 |
+
"source": [
|
| 23 |
+
"# %%\n",
|
| 24 |
+
"import pandas as pd\n",
|
| 25 |
+
"import numpy as np\n",
|
| 26 |
+
"from Bio import SeqIO\n",
|
| 27 |
+
"from goatools.obo_parser import GODag\n",
|
| 28 |
+
"from collections import Counter\n",
|
| 29 |
+
"from sklearn.preprocessing import MultiLabelBinarizer\n",
|
| 30 |
+
"from iterstrat.ml_stratifiers import MultilabelStratifiedKFold\n",
|
| 31 |
+
"import os, random\n",
|
| 32 |
+
"\n",
|
| 33 |
+
"# Carregar ficheiros principais\n",
|
| 34 |
+
"FASTA = \"uniprot_sprot_exp.fasta\"\n",
|
| 35 |
+
"ANNOT = \"uniprot_sprot_exp.txt\"\n",
|
| 36 |
+
"GO_OBO = \"go.obo\"\n",
|
| 37 |
+
"\n",
|
| 38 |
+
"# Ler sequências\n",
|
| 39 |
+
"seqs, ids = [], []\n",
|
| 40 |
+
"for record in SeqIO.parse(FASTA, \"fasta\"):\n",
|
| 41 |
+
" ids.append(record.id)\n",
|
| 42 |
+
" seqs.append(str(record.seq))\n",
|
| 43 |
+
"\n",
|
| 44 |
+
"df_seq = pd.DataFrame({\"protein_id\": ids, \"sequence\": seqs})\n",
|
| 45 |
+
"\n",
|
| 46 |
+
"# Ler anotações GO:MF\n",
|
| 47 |
+
"df_ann = pd.read_csv(ANNOT, sep=\"\\t\", names=[\"protein_id\", \"go_term\", \"category\"])\n",
|
| 48 |
+
"df_ann = df_ann[df_ann[\"category\"] == \"F\"]\n",
|
| 49 |
+
"\n",
|
| 50 |
+
"# Propagação hierárquica dos GO terms\n",
|
| 51 |
+
"go_dag = GODag(GO_OBO)\n",
|
| 52 |
+
"mf_terms = {t for t, o in go_dag.items() if o.namespace == \"molecular_function\"}\n",
|
| 53 |
+
"\n",
|
| 54 |
+
"def propagate_terms(terms):\n",
|
| 55 |
+
" expanded = set()\n",
|
| 56 |
+
" for t in terms:\n",
|
| 57 |
+
" if t in go_dag:\n",
|
| 58 |
+
" expanded |= go_dag[t].get_all_parents()\n",
|
| 59 |
+
" expanded.add(t)\n",
|
| 60 |
+
" return list(expanded & mf_terms)\n",
|
| 61 |
+
"\n",
|
| 62 |
+
"grouped = df_ann.groupby(\"protein_id\")[\"go_term\"].apply(list).reset_index()\n",
|
| 63 |
+
"grouped[\"go_term\"] = grouped[\"go_term\"].apply(propagate_terms)\n",
|
| 64 |
+
"\n",
|
| 65 |
+
"# Juntar com sequência\n",
|
| 66 |
+
"df = df_seq.merge(grouped, on=\"protein_id\")\n",
|
| 67 |
+
"df = df[df[\"go_term\"].str.len() > 0]\n",
|
| 68 |
+
"\n",
|
| 69 |
+
"# Filtrar GO terms com ≥50 proteínas\n",
|
| 70 |
+
"all_terms = [term for sublist in df[\"go_term\"] for term in sublist]\n",
|
| 71 |
+
"term_counts = Counter(all_terms)\n",
|
| 72 |
+
"valid_terms = {t for t, count in term_counts.items() if count >= 50}\n",
|
| 73 |
+
"\n",
|
| 74 |
+
"df[\"go_term\"] = df[\"go_term\"].apply(lambda ts: [t for t in ts if t in valid_terms])\n",
|
| 75 |
+
"df = df[df[\"go_term\"].str.len() > 0]\n",
|
| 76 |
+
"\n",
|
| 77 |
+
"# Preparar labels e dividir por proteína\n",
|
| 78 |
+
"df[\"go_terms\"] = df[\"go_term\"].apply(lambda x: ';'.join(sorted(set(x))))\n",
|
| 79 |
+
"df = df[[\"protein_id\", \"sequence\", \"go_terms\"]].drop_duplicates()\n",
|
| 80 |
+
"\n",
|
| 81 |
+
"mlb = MultiLabelBinarizer()\n",
|
| 82 |
+
"Y = mlb.fit_transform(df[\"go_terms\"].str.split(\";\"))\n",
|
| 83 |
+
"X = df[[\"protein_id\", \"sequence\"]].values\n",
|
| 84 |
+
"\n",
|
| 85 |
+
"mskf = MultilabelStratifiedKFold(n_splits=10, random_state=42, shuffle=True)\n",
|
| 86 |
+
"train_idx, temp_idx = next(mskf.split(X, Y))\n",
|
| 87 |
+
"val_idx, test_idx = np.array_split(temp_idx, 2)\n",
|
| 88 |
+
"\n",
|
| 89 |
+
"df_train = df.iloc[train_idx].copy()\n",
|
| 90 |
+
"df_val = df.iloc[val_idx].copy()\n",
|
| 91 |
+
"df_test = df.iloc[test_idx].copy()\n",
|
| 92 |
+
"\n",
|
| 93 |
+
"os.makedirs(\"data\", exist_ok=True)\n",
|
| 94 |
+
"df_train.to_csv(\"data/mf-training.csv\", index=False)\n",
|
| 95 |
+
"df_val.to_csv(\"data/mf-validation.csv\", index=False)\n",
|
| 96 |
+
"df_test.to_csv(\"data/mf-test.csv\", index=False)\n",
|
| 97 |
+
"\n",
|
| 98 |
+
"# Guardar o binarizador\n",
|
| 99 |
+
"import joblib\n",
|
| 100 |
+
"joblib.dump(mlb, \"data/mlb.pkl\")\n",
|
| 101 |
+
"\n",
|
| 102 |
+
"print(\"✓ Dataset preparado:\")\n",
|
| 103 |
+
"print(\" - Training:\", df_train.shape)\n",
|
| 104 |
+
"print(\" - Validation:\", df_val.shape)\n",
|
| 105 |
+
"print(\" - Test:\", df_test.shape)\n",
|
| 106 |
+
"print(\" - GO terms:\", len(mlb.classes_))\n"
|
| 107 |
+
]
|
| 108 |
+
},
|
| 109 |
+
{
|
| 110 |
+
"cell_type": "code",
|
| 111 |
+
"execution_count": 10,
|
| 112 |
+
"id": "40ba1798-daf8-4649-ae3f-bfe81df6437f",
|
| 113 |
+
"metadata": {},
|
| 114 |
+
"outputs": [],
|
| 115 |
+
"source": [
|
| 116 |
+
"# %%\n",
|
| 117 |
+
"import random\n",
|
| 118 |
+
"from collections import defaultdict\n",
|
| 119 |
+
"\n",
|
| 120 |
+
"# PAM1\n",
|
| 121 |
+
"# PAM matrix model of protein evolution\n",
|
| 122 |
+
"# DOI:10.1093/oxfordjournals.molbev.a040360\n",
|
| 123 |
+
"pam_data = {\n",
|
| 124 |
+
" 'A': [9948, 19, 27, 42, 31, 46, 50, 92, 17, 7, 40, 88, 42, 41, 122, 279, 255, 9, 72, 723],\n",
|
| 125 |
+
" 'R': [14, 9871, 24, 38, 37, 130, 38, 62, 49, 4, 58, 205, 26, 33, 47, 103, 104, 5, 36, 52],\n",
|
| 126 |
+
" 'N': [20, 22, 9860, 181, 29, 36, 41, 67, 31, 5, 22, 49, 23, 10, 33, 83, 66, 3, 43, 32],\n",
|
| 127 |
+
" 'D': [40, 34, 187, 9818, 11, 63, 98, 61, 23, 5, 25, 54, 43, 13, 27, 88, 55, 4, 29, 36],\n",
|
| 128 |
+
" 'C': [20, 16, 26, 9, 9987, 10, 17, 37, 12, 2, 16, 26, 10, 19, 27, 26, 25, 2, 6, 67],\n",
|
| 129 |
+
" 'Q': [29, 118, 29, 49, 8, 9816, 72, 55, 36, 4, 60, 158, 35, 22, 39, 86, 74, 3, 34, 28],\n",
|
| 130 |
+
" 'E': [35, 29, 41, 101, 12, 71, 9804, 56, 33, 5, 36, 107, 42, 20, 38, 87, 69, 4, 30, 42],\n",
|
| 131 |
+
" 'G': [96, 61, 77, 70, 38, 51, 58, 9868, 26, 6, 37, 53, 39, 28, 69, 134, 116, 5, 47, 60],\n",
|
| 132 |
+
" 'H': [17, 53, 33, 19, 15, 39, 34, 24, 9907, 3, 32, 57, 24, 15, 27, 47, 43, 2, 22, 19],\n",
|
| 133 |
+
" 'I': [6, 3, 6, 6, 3, 5, 6, 7, 3, 9973, 23, 13, 12, 41, 93, 84, 115, 3, 8, 102],\n",
|
| 134 |
+
" 'L': [26, 39, 17, 15, 7, 33, 22, 20, 19, 27, 9864, 49, 24, 78, 117, 148, 193, 5, 24, 70],\n",
|
| 135 |
+
" 'K': [60, 198, 43, 52, 12, 142, 96, 53, 42, 10, 63, 9710, 33, 26, 54, 109, 102, 5, 43, 42],\n",
|
| 136 |
+
" 'M': [21, 22, 15, 18, 6, 20, 18, 18, 17, 11, 27, 32, 9945, 26, 34, 61, 71, 3, 12, 31],\n",
|
| 137 |
+
" 'F': [18, 17, 8, 6, 8, 11, 10, 16, 10, 44, 92, 24, 29, 9899, 89, 88, 142, 7, 14, 68],\n",
|
| 138 |
+
" 'P': [97, 47, 35, 29, 23, 35, 38, 57, 21, 24, 47, 56, 28, 76, 9785, 115, 77, 4, 24, 35],\n",
|
| 139 |
+
" 'S': [241, 87, 76, 73, 17, 56, 60, 99, 32, 13, 69, 92, 42, 67, 100, 9605, 212, 8, 63, 70],\n",
|
| 140 |
+
" 'T': [186, 78, 54, 37, 14, 42, 42, 83, 28, 23, 84, 85, 53, 93, 66, 182, 9676, 8, 39, 90],\n",
|
| 141 |
+
" 'W': [2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 1, 5, 3, 4, 4, 9960, 3, 4],\n",
|
| 142 |
+
" 'Y': [29, 21, 17, 9, 4, 13, 9, 21, 10, 7, 20, 17, 11, 23, 19, 41, 31, 3, 9935, 23],\n",
|
| 143 |
+
" 'V': [368, 27, 18, 18, 50, 23, 34, 64, 15, 85, 72, 42, 33, 88, 42, 112, 137, 4, 20, 9514]\n",
|
| 144 |
+
"}\n",
|
| 145 |
+
"\n",
|
| 146 |
+
"pam_raw = pd.DataFrame(pam_data, index=pam_data.keys())\n",
|
| 147 |
+
"pam_matrix = pam_raw.div(pam_raw.sum(axis=1), axis=0)\n",
|
| 148 |
+
"pam_dict = {aa: pam_matrix.loc[aa].to_dict() for aa in pam_matrix.index}\n",
|
| 149 |
+
"\n",
|
| 150 |
+
"def pam1_substitution(aa):\n",
|
| 151 |
+
" if aa not in pam_dict:\n",
|
| 152 |
+
" return aa\n",
|
| 153 |
+
" subs = list(pam_dict[aa].keys())\n",
|
| 154 |
+
" probs = list(pam_dict[aa].values())\n",
|
| 155 |
+
" return np.random.choice(subs, p=probs)\n",
|
| 156 |
+
"\n",
|
| 157 |
+
"def augment_sequence(seq, sub_prob=0.05):\n",
|
| 158 |
+
" return ''.join([pam1_substitution(aa) if random.random() < sub_prob else aa for aa in seq])\n",
|
| 159 |
+
"\n",
|
| 160 |
+
"def slice_sequence(seq, win=1024):\n",
|
| 161 |
+
" if len(seq) <= win:\n",
|
| 162 |
+
" return [seq]\n",
|
| 163 |
+
" return [seq[i:i+win] for i in range(0, len(seq), win)]\n",
|
| 164 |
+
"\n",
|
| 165 |
+
"def format_seq(seq):\n",
|
| 166 |
+
" return \" \".join(seq)\n",
|
| 167 |
+
"\n",
|
| 168 |
+
"# Carregar labels e datasets\n",
|
| 169 |
+
"import joblib\n",
|
| 170 |
+
"mlb = joblib.load(\"data/mlb.pkl\")\n",
|
| 171 |
+
"df_train = pd.read_csv(\"data/mf-training.csv\")\n",
|
| 172 |
+
"df_val = pd.read_csv(\"data/mf-validation.csv\")\n",
|
| 173 |
+
"df_test = pd.read_csv(\"data/mf-test.csv\")\n",
|
| 174 |
+
"\n",
|
| 175 |
+
"# Slicing + augmentação no treino\n",
|
| 176 |
+
"X_train, y_train = [], []\n",
|
| 177 |
+
"\n",
|
| 178 |
+
"for _, row in df_train.iterrows():\n",
|
| 179 |
+
" seq_aug = augment_sequence(row[\"sequence\"], sub_prob=0.05)\n",
|
| 180 |
+
" slices = slice_sequence(seq_aug, win=1024)\n",
|
| 181 |
+
" label = mlb.transform([row[\"go_terms\"].split(\";\")])[0]\n",
|
| 182 |
+
" for sl in slices:\n",
|
| 183 |
+
" X_train.append(format_seq(sl))\n",
|
| 184 |
+
" y_train.append(label)\n",
|
| 185 |
+
"\n",
|
| 186 |
+
"# Sem slicing no val/test\n",
|
| 187 |
+
"X_val = [format_seq(seq) for seq in df_val[\"sequence\"]]\n",
|
| 188 |
+
"X_test = [format_seq(seq) for seq in df_test[\"sequence\"]]\n",
|
| 189 |
+
"\n",
|
| 190 |
+
"y_val = mlb.transform(df_val[\"go_terms\"].str.split(\";\"))\n",
|
| 191 |
+
"y_test = mlb.transform(df_test[\"go_terms\"].str.split(\";\"))\n",
|
| 192 |
+
"\n",
|
| 193 |
+
"np.save(\"embeddings/y_test.npy\", y_test)"
|
| 194 |
+
]
|
| 195 |
+
},
|
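Because pam1_substitution samples from a row of the normalised PAM1 matrix, in which the identity entry dominates, a residue actually changes only when random() < sub_prob and the sampled amino acid differs from the original, i.e. at roughly sub_prob * (1 - P(identity)) per position. A quick sanity check, reusing augment_sequence and pam_matrix from the cell above (the short repeated test sequence is made up for illustration):

# Empirical check of how mild the PAM1-based augmentation is.
test_seq = "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ" * 10   # hypothetical ~330-residue toy sequence
mutated = augment_sequence(test_seq, sub_prob=0.05)
n_changed = sum(a != b for a, b in zip(test_seq, mutated))
print(f"{n_changed}/{len(test_seq)} residues changed; "
      f"mean identity mass per row = {pam_matrix.values.diagonal().mean():.3f}")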
| 196 |
+
{
|
| 197 |
+
"cell_type": "code",
|
| 198 |
+
"execution_count": 11,
|
| 199 |
+
"id": "80d5c1fb-9c84-463d-8d8c-bfcc2982afc9",
|
| 200 |
+
"metadata": {},
|
| 201 |
+
"outputs": [
|
| 202 |
+
{
|
| 203 |
+
"name": "stderr",
|
| 204 |
+
"output_type": "stream",
|
| 205 |
+
"text": [
|
| 206 |
+
"C:\\Users\\Melvin\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\huggingface_hub\\file_download.py:797: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
|
| 207 |
+
" warnings.warn(\n",
|
| 208 |
+
"Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t33_650M_UR50D and are newly initialized: ['esm.pooler.dense.bias', 'esm.pooler.dense.weight']\n",
|
| 209 |
+
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
|
| 210 |
+
"100%|██████████| 2189/2189 [1:17:26<00:00, 2.12s/it]\n",
|
| 211 |
+
"100%|██████████| 108/108 [03:43<00:00, 2.07s/it]\n",
|
| 212 |
+
"100%|██████████| 108/108 [03:56<00:00, 2.19s/it]\n"
|
| 213 |
+
]
|
| 214 |
+
}
|
| 215 |
+
],
|
| 216 |
+
"source": [
|
| 217 |
+
"# %%\n",
|
| 218 |
+
"from transformers import AutoTokenizer, AutoModel\n",
|
| 219 |
+
"import torch\n",
|
| 220 |
+
"from tqdm import tqdm\n",
|
| 221 |
+
"import numpy as np\n",
|
| 222 |
+
"import os\n",
|
| 223 |
+
"\n",
|
| 224 |
+
"# Configurações\n",
|
| 225 |
+
"MODEL_NAME = \"facebook/esm2_t33_650M_UR50D\"\n",
|
| 226 |
+
"DEVICE = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
|
| 227 |
+
"CHUNK_SIZE = 16\n",
|
| 228 |
+
"\n",
|
| 229 |
+
"# Carregar modelo\n",
|
| 230 |
+
"tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, do_lower_case=False)\n",
|
| 231 |
+
"model = AutoModel.from_pretrained(MODEL_NAME)\n",
|
| 232 |
+
"model.to(DEVICE)\n",
|
| 233 |
+
"model.eval()\n",
|
| 234 |
+
"\n",
|
| 235 |
+
"def extract_embeddings(texts):\n",
|
| 236 |
+
" embeddings = []\n",
|
| 237 |
+
" for i in tqdm(range(0, len(texts), CHUNK_SIZE)):\n",
|
| 238 |
+
" batch = texts[i:i+CHUNK_SIZE]\n",
|
| 239 |
+
" with torch.no_grad():\n",
|
| 240 |
+
" inputs = tokenizer(batch, return_tensors=\"pt\", padding=True, truncation=True, max_length=1024)\n",
|
| 241 |
+
" inputs = {k: v.to(DEVICE) for k, v in inputs.items()}\n",
|
| 242 |
+
" outputs = model(**inputs).last_hidden_state\n",
|
| 243 |
+
" cls_tokens = outputs[:, 0, :] # token CLS\n",
|
| 244 |
+
" embeddings.append(cls_tokens.cpu().numpy())\n",
|
| 245 |
+
" return np.vstack(embeddings)\n",
|
| 246 |
+
"\n",
|
| 247 |
+
"# Extrair e guardar embeddings\n",
|
| 248 |
+
"os.makedirs(\"embeddings\", exist_ok=True)\n",
|
| 249 |
+
"\n",
|
| 250 |
+
"emb_train = extract_embeddings(X_train)\n",
|
| 251 |
+
"emb_val = extract_embeddings(X_val)\n",
|
| 252 |
+
"emb_test = extract_embeddings(X_test)\n",
|
| 253 |
+
"\n",
|
| 254 |
+
"np.save(\"embeddings/esm2_train.npy\", emb_train)\n",
|
| 255 |
+
"np.save(\"embeddings/esm2_val.npy\", emb_val)\n",
|
| 256 |
+
"np.save(\"embeddings/esm2_test.npy\", emb_test)\n",
|
| 257 |
+
"\n",
|
| 258 |
+
"np.save(\"embeddings/y_train.npy\", np.array(y_train))\n",
|
| 259 |
+
"np.save(\"embeddings/y_val.npy\", np.array(y_val))\n"
|
| 260 |
+
]
|
| 261 |
+
},
|
| 262 |
+
{
|
| 263 |
+
"cell_type": "code",
|
| 264 |
+
"execution_count": 2,
|
| 265 |
+
"id": "592e4f6c-b871-4f0b-b84c-f3918c698544",
|
| 266 |
+
"metadata": {},
|
| 267 |
+
"outputs": [
|
| 268 |
+
{
|
| 269 |
+
"name": "stdout",
|
| 270 |
+
"output_type": "stream",
|
| 271 |
+
"text": [
|
| 272 |
+
"Epoch 1/100\n",
|
| 273 |
+
"1095/1095 [==============================] - 14s 13ms/step - loss: 0.0557 - val_loss: 0.0448\n",
|
| 274 |
+
"Epoch 2/100\n",
|
| 275 |
+
"1095/1095 [==============================] - 13s 12ms/step - loss: 0.0444 - val_loss: 0.0413\n",
|
| 276 |
+
"Epoch 3/100\n",
|
| 277 |
+
"1095/1095 [==============================] - 14s 12ms/step - loss: 0.0418 - val_loss: 0.0393\n",
|
| 278 |
+
"Epoch 4/100\n",
|
| 279 |
+
"1095/1095 [==============================] - 14s 13ms/step - loss: 0.0404 - val_loss: 0.0385\n",
|
| 280 |
+
"Epoch 5/100\n",
|
| 281 |
+
"1095/1095 [==============================] - 14s 13ms/step - loss: 0.0392 - val_loss: 0.0373\n",
|
| 282 |
+
"Epoch 6/100\n",
|
| 283 |
+
"1095/1095 [==============================] - 13s 12ms/step - loss: 0.0382 - val_loss: 0.0372\n",
|
| 284 |
+
"Epoch 7/100\n",
|
| 285 |
+
"1095/1095 [==============================] - 14s 12ms/step - loss: 0.0374 - val_loss: 0.0355\n",
|
| 286 |
+
"Epoch 8/100\n",
|
| 287 |
+
"1095/1095 [==============================] - 14s 13ms/step - loss: 0.0368 - val_loss: 0.0350\n",
|
| 288 |
+
"Epoch 9/100\n",
|
| 289 |
+
"1095/1095 [==============================] - 13s 12ms/step - loss: 0.0362 - val_loss: 0.0349\n",
|
| 290 |
+
"Epoch 10/100\n",
|
| 291 |
+
"1095/1095 [==============================] - 13s 12ms/step - loss: 0.0357 - val_loss: 0.0342\n",
|
| 292 |
+
"Epoch 11/100\n",
|
| 293 |
+
"1095/1095 [==============================] - 16s 14ms/step - loss: 0.0353 - val_loss: 0.0339\n",
|
| 294 |
+
"Epoch 12/100\n",
|
| 295 |
+
"1095/1095 [==============================] - 16s 14ms/step - loss: 0.0348 - val_loss: 0.0336\n",
|
| 296 |
+
"Epoch 13/100\n",
|
| 297 |
+
"1095/1095 [==============================] - 15s 13ms/step - loss: 0.0344 - val_loss: 0.0335\n",
|
| 298 |
+
"Epoch 14/100\n",
|
| 299 |
+
"1095/1095 [==============================] - 14s 12ms/step - loss: 0.0341 - val_loss: 0.0337\n",
|
| 300 |
+
"Epoch 15/100\n",
|
| 301 |
+
"1095/1095 [==============================] - 14s 13ms/step - loss: 0.0338 - val_loss: 0.0331\n",
|
| 302 |
+
"Epoch 16/100\n",
|
| 303 |
+
"1095/1095 [==============================] - 14s 12ms/step - loss: 0.0335 - val_loss: 0.0327\n",
|
| 304 |
+
"Epoch 17/100\n",
|
| 305 |
+
"1095/1095 [==============================] - 14s 12ms/step - loss: 0.0332 - val_loss: 0.0328\n",
|
| 306 |
+
"Epoch 18/100\n",
|
| 307 |
+
"1095/1095 [==============================] - 14s 12ms/step - loss: 0.0330 - val_loss: 0.0326\n",
|
| 308 |
+
"Epoch 19/100\n",
|
| 309 |
+
"1095/1095 [==============================] - 14s 12ms/step - loss: 0.0326 - val_loss: 0.0326\n",
|
| 310 |
+
"Epoch 20/100\n",
|
| 311 |
+
"1095/1095 [==============================] - 14s 12ms/step - loss: 0.0324 - val_loss: 0.0319\n",
|
| 312 |
+
"Epoch 21/100\n",
|
| 313 |
+
"1095/1095 [==============================] - 13s 12ms/step - loss: 0.0321 - val_loss: 0.0319\n",
|
| 314 |
+
"Epoch 22/100\n",
|
| 315 |
+
"1095/1095 [==============================] - 14s 12ms/step - loss: 0.0320 - val_loss: 0.0321\n",
|
| 316 |
+
"Epoch 23/100\n",
|
| 317 |
+
"1095/1095 [==============================] - 14s 12ms/step - loss: 0.0319 - val_loss: 0.0314\n",
|
| 318 |
+
"Epoch 24/100\n",
|
| 319 |
+
"1095/1095 [==============================] - 14s 12ms/step - loss: 0.0316 - val_loss: 0.0315\n",
|
| 320 |
+
"Epoch 25/100\n",
|
| 321 |
+
"1095/1095 [==============================] - 14s 12ms/step - loss: 0.0315 - val_loss: 0.0314\n",
|
| 322 |
+
"Epoch 26/100\n",
|
| 323 |
+
"1095/1095 [==============================] - 14s 13ms/step - loss: 0.0314 - val_loss: 0.0316\n",
|
| 324 |
+
"Epoch 27/100\n",
|
| 325 |
+
"1095/1095 [==============================] - 14s 13ms/step - loss: 0.0310 - val_loss: 0.0315\n",
|
| 326 |
+
"Epoch 28/100\n",
|
| 327 |
+
"1095/1095 [==============================] - 14s 12ms/step - loss: 0.0311 - val_loss: 0.0312\n",
|
| 328 |
+
"Epoch 29/100\n",
|
| 329 |
+
"1095/1095 [==============================] - 14s 12ms/step - loss: 0.0307 - val_loss: 0.0312\n",
|
| 330 |
+
"Epoch 30/100\n",
|
| 331 |
+
"1095/1095 [==============================] - 14s 13ms/step - loss: 0.0307 - val_loss: 0.0309\n",
|
| 332 |
+
"Epoch 31/100\n",
|
| 333 |
+
"1095/1095 [==============================] - 14s 13ms/step - loss: 0.0305 - val_loss: 0.0310\n",
|
| 334 |
+
"Epoch 32/100\n",
|
| 335 |
+
"1095/1095 [==============================] - 14s 13ms/step - loss: 0.0305 - val_loss: 0.0311\n",
|
| 336 |
+
"Epoch 33/100\n",
|
| 337 |
+
"1095/1095 [==============================] - 14s 13ms/step - loss: 0.0303 - val_loss: 0.0307\n",
|
| 338 |
+
"Epoch 34/100\n",
|
| 339 |
+
"1095/1095 [==============================] - 13s 12ms/step - loss: 0.0301 - val_loss: 0.0309\n",
|
| 340 |
+
"Epoch 35/100\n",
|
| 341 |
+
"1095/1095 [==============================] - 13s 12ms/step - loss: 0.0300 - val_loss: 0.0310\n",
|
| 342 |
+
"Epoch 36/100\n",
|
| 343 |
+
"1095/1095 [==============================] - 14s 12ms/step - loss: 0.0299 - val_loss: 0.0311\n",
|
| 344 |
+
"Epoch 37/100\n",
|
| 345 |
+
"1095/1095 [==============================] - 14s 12ms/step - loss: 0.0298 - val_loss: 0.0305\n",
|
| 346 |
+
"Epoch 38/100\n",
|
| 347 |
+
"1095/1095 [==============================] - 14s 13ms/step - loss: 0.0296 - val_loss: 0.0308\n",
|
| 348 |
+
"Epoch 39/100\n",
|
| 349 |
+
"1095/1095 [==============================] - 14s 13ms/step - loss: 0.0296 - val_loss: 0.0310\n",
|
| 350 |
+
"Epoch 40/100\n",
|
| 351 |
+
"1095/1095 [==============================] - 14s 13ms/step - loss: 0.0295 - val_loss: 0.0313\n",
|
| 352 |
+
"Epoch 41/100\n",
|
| 353 |
+
"1095/1095 [==============================] - 14s 13ms/step - loss: 0.0293 - val_loss: 0.0306\n",
|
| 354 |
+
"Epoch 42/100\n",
|
| 355 |
+
"1095/1095 [==============================] - 14s 13ms/step - loss: 0.0292 - val_loss: 0.0306\n",
|
| 356 |
+
"Modelo guardado em models/\n",
|
| 357 |
+
"54/54 [==============================] - 0s 2ms/step\n",
|
| 358 |
+
" Predições do ESM-2 salvas com forma: (1724, 602)\n"
|
| 359 |
+
]
|
| 360 |
+
}
|
| 361 |
+
],
|
| 362 |
+
"source": [
|
| 363 |
+
"# %%\n",
|
| 364 |
+
"import numpy as np\n",
|
| 365 |
+
"import tensorflow as tf\n",
|
| 366 |
+
"from tensorflow.keras import Input\n",
|
| 367 |
+
"from tensorflow.keras.models import Sequential\n",
|
| 368 |
+
"from tensorflow.keras.layers import Dense, Dropout\n",
|
| 369 |
+
"from tensorflow.keras.callbacks import EarlyStopping\n",
|
| 370 |
+
"from sklearn.metrics import average_precision_score\n",
|
| 371 |
+
"\n",
|
| 372 |
+
"# Carregar os embeddings e labels\n",
|
| 373 |
+
"X_train = np.load(\"embeddings/esm2_train.npy\")\n",
|
| 374 |
+
"X_val = np.load(\"embeddings/esm2_val.npy\")\n",
|
| 375 |
+
"X_test = np.load(\"embeddings/esm2_test.npy\")\n",
|
| 376 |
+
"\n",
|
| 377 |
+
"y_train = np.load(\"embeddings/y_train.npy\")\n",
|
| 378 |
+
"y_val = np.load(\"embeddings/y_val.npy\")\n",
|
| 379 |
+
"y_test = np.load(\"embeddings/y_test.npy\")\n",
|
| 380 |
+
"\n",
|
| 381 |
+
"# Definir o modelo\n",
|
| 382 |
+
"model = Sequential([\n",
|
| 383 |
+
" Dense(1024, activation='relu', input_shape=(X_train.shape[1],)),\n",
|
| 384 |
+
" Dropout(0.3),\n",
|
| 385 |
+
" Dense(512, activation='relu'),\n",
|
| 386 |
+
" Dropout(0.3),\n",
|
| 387 |
+
" Dense(y_train.shape[1], activation='sigmoid')\n",
|
| 388 |
+
"])\n",
|
| 389 |
+
"\n",
|
| 390 |
+
"model.compile(optimizer='adam', loss='binary_crossentropy')\n",
|
| 391 |
+
"\n",
|
| 392 |
+
"# Treinar\n",
|
| 393 |
+
"early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)\n",
|
| 394 |
+
"\n",
|
| 395 |
+
"history = model.fit(\n",
|
| 396 |
+
" X_train, y_train,\n",
|
| 397 |
+
" validation_data=(X_val, y_val),\n",
|
| 398 |
+
" epochs=100,\n",
|
| 399 |
+
" batch_size=32,\n",
|
| 400 |
+
" callbacks=[early_stop],\n",
|
| 401 |
+
" verbose=1\n",
|
| 402 |
+
")\n",
|
| 403 |
+
"\n",
|
| 404 |
+
"# Salvar o modelo\n",
|
| 405 |
+
"model.save(\"models/mlp_esm2.h5\")\n",
|
| 406 |
+
"model.save(\"models/mlp_esm2.keras\")\n",
|
| 407 |
+
"print(\"Modelo guardado em models/\")\n",
|
| 408 |
+
"\n",
|
| 409 |
+
"# Fazer predições no conjunto de teste\n",
|
| 410 |
+
"y_prob = model.predict(X_test)\n",
|
| 411 |
+
"np.save(\"predictions/mf-esm2.npy\", y_prob)\n",
|
| 412 |
+
"\n",
|
| 413 |
+
"print(\" Predições do ESM-2 salvas com forma:\", y_prob.shape)\n"
|
| 414 |
+
]
|
| 415 |
+
},
|
| 416 |
+
{
|
| 417 |
+
"cell_type": "code",
|
| 418 |
+
"execution_count": 3,
|
| 419 |
+
"id": "3dddb0df-3ea5-4e32-8cf0-45e90be8ba66",
|
| 420 |
+
"metadata": {},
|
| 421 |
+
"outputs": [
|
| 422 |
+
{
|
| 423 |
+
"name": "stderr",
|
| 424 |
+
"output_type": "stream",
|
| 425 |
+
"text": [
|
| 426 |
+
"C:\\Users\\Melvin\\anaconda3\\envs\\protein_env\\lib\\site-packages\\sklearn\\base.py:380: InconsistentVersionWarning: Trying to unpickle estimator MultiLabelBinarizer from version 1.1.3 when using version 1.6.1. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:\n",
|
| 427 |
+
"https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations\n",
|
| 428 |
+
" warnings.warn(\n"
|
| 429 |
+
]
|
| 430 |
+
},
|
| 431 |
+
{
|
| 432 |
+
"name": "stdout",
|
| 433 |
+
"output_type": "stream",
|
| 434 |
+
"text": [
|
| 435 |
+
"go.obo: fmt(1.2) rel(2025-03-16) 43,544 Terms\n",
|
| 436 |
+
"✓ Dados carregados: (1724, 602) proteínas × 602 GO terms\n",
|
| 437 |
+
"\n",
|
| 438 |
+
" Resultados finais (ESM-2 + PAM1 + propagação):\n",
|
| 439 |
+
"Fmax = 0.6377\n",
|
| 440 |
+
"Thr. = 0.35\n",
|
| 441 |
+
"AuPRC = 0.6848\n",
|
| 442 |
+
"Smin = 14.4202\n"
|
| 443 |
+
]
|
| 444 |
+
}
|
| 445 |
+
],
|
| 446 |
+
"source": [
|
| 447 |
+
"# %%\n",
|
| 448 |
+
"import numpy as np\n",
|
| 449 |
+
"import joblib\n",
|
| 450 |
+
"import math\n",
|
| 451 |
+
"from goatools.obo_parser import GODag\n",
|
| 452 |
+
"from sklearn.metrics import precision_recall_curve, auc\n",
|
| 453 |
+
"\n",
|
| 454 |
+
"# Carregar dados e parâmetros\n",
|
| 455 |
+
"GO_FILE = \"go.obo\"\n",
|
| 456 |
+
"THRESHOLDS = np.arange(0.0, 1.01, 0.01)\n",
|
| 457 |
+
"ALPHA = 0.5\n",
|
| 458 |
+
"\n",
|
| 459 |
+
"mlb = joblib.load(\"data/mlb.pkl\")\n",
|
| 460 |
+
"y_true = np.load(\"embeddings/y_test.npy\")\n",
|
| 461 |
+
"y_prob = np.load(\"predictions/mf-esm2.npy\")\n",
|
| 462 |
+
"terms = mlb.classes_\n",
|
| 463 |
+
"go_dag = GODag(GO_FILE)\n",
|
| 464 |
+
"\n",
|
| 465 |
+
"print(f\"✓ Dados carregados: {y_true.shape} proteínas × {len(terms)} GO terms\")\n",
|
| 466 |
+
"\n",
|
| 467 |
+
"# Fmax\n",
|
| 468 |
+
"def compute_fmax(y_true, y_prob, thresholds):\n",
|
| 469 |
+
" fmax, best_thr = 0, 0\n",
|
| 470 |
+
" for t in thresholds:\n",
|
| 471 |
+
" y_pred = (y_prob >= t).astype(int)\n",
|
| 472 |
+
" tp = (y_true * y_pred).sum(axis=1)\n",
|
| 473 |
+
" fp = ((1 - y_true) * y_pred).sum(axis=1)\n",
|
| 474 |
+
" fn = (y_true * (1 - y_pred)).sum(axis=1)\n",
|
| 475 |
+
" precision = tp / (tp + fp + 1e-8)\n",
|
| 476 |
+
" recall = tp / (tp + fn + 1e-8)\n",
|
| 477 |
+
" f1 = 2 * precision * recall / (precision + recall + 1e-8)\n",
|
| 478 |
+
" avg_f1 = np.mean(f1)\n",
|
| 479 |
+
" if avg_f1 > fmax:\n",
|
| 480 |
+
" fmax, best_thr = avg_f1, t\n",
|
| 481 |
+
" return fmax, best_thr\n",
|
| 482 |
+
"\n",
|
| 483 |
+
"# AuPRC (micro)\n",
|
| 484 |
+
"def compute_auprc(y_true, y_prob):\n",
|
| 485 |
+
" precision, recall, _ = precision_recall_curve(y_true.ravel(), y_prob.ravel())\n",
|
| 486 |
+
" return auc(recall, precision)\n",
|
| 487 |
+
"\n",
|
| 488 |
+
"# Smin\n",
|
| 489 |
+
"def compute_smin(y_true, y_prob, terms, threshold, go_dag, alpha=ALPHA):\n",
|
| 490 |
+
" y_pred = (y_prob >= threshold).astype(int)\n",
|
| 491 |
+
"\n",
|
| 492 |
+
" # Informação semântica: IC (Information Content)\n",
|
| 493 |
+
" ic = {}\n",
|
| 494 |
+
" total = (y_true + y_pred).sum(axis=0).sum()\n",
|
| 495 |
+
" for i, term in enumerate(terms):\n",
|
| 496 |
+
" freq = (y_true[:, i] + y_pred[:, i]).sum()\n",
|
| 497 |
+
" ic[term] = -np.log((freq + 1e-8) / total)\n",
|
| 498 |
+
"\n",
|
| 499 |
+
" # Para cada proteína, calcular RU e MI\n",
|
| 500 |
+
" s_values = []\n",
|
| 501 |
+
" for true_vec, pred_vec in zip(y_true, y_pred):\n",
|
| 502 |
+
" true_terms = {terms[i] for i in np.where(true_vec)[0]}\n",
|
| 503 |
+
" pred_terms = {terms[i] for i in np.where(pred_vec)[0]}\n",
|
| 504 |
+
"\n",
|
| 505 |
+
" anc_true = set()\n",
|
| 506 |
+
" for t in true_terms:\n",
|
| 507 |
+
" if t in go_dag:\n",
|
| 508 |
+
" anc_true |= go_dag[t].get_all_parents()\n",
|
| 509 |
+
" anc_pred = set()\n",
|
| 510 |
+
" for t in pred_terms:\n",
|
| 511 |
+
" if t in go_dag:\n",
|
| 512 |
+
" anc_pred |= go_dag[t].get_all_parents()\n",
|
| 513 |
+
"\n",
|
| 514 |
+
" ru = pred_terms - true_terms\n",
|
| 515 |
+
" mi = true_terms - pred_terms\n",
|
| 516 |
+
" dist_ru = sum(ic.get(t, 0) for t in ru)\n",
|
| 517 |
+
" dist_mi = sum(ic.get(t, 0) for t in mi)\n",
|
| 518 |
+
" s = math.sqrt((alpha * dist_ru)**2 + ((1 - alpha) * dist_mi)**2)\n",
|
| 519 |
+
" s_values.append(s)\n",
|
| 520 |
+
"\n",
|
| 521 |
+
" return np.mean(s_values)\n",
|
| 522 |
+
"\n",
|
| 523 |
+
"# Avaliação\n",
|
| 524 |
+
"fmax, thr = compute_fmax(y_true, y_prob, THRESHOLDS)\n",
|
| 525 |
+
"auprc = compute_auprc(y_true, y_prob)\n",
|
| 526 |
+
"smin = compute_smin(y_true, y_prob, terms, thr, go_dag)\n",
|
| 527 |
+
"\n",
|
| 528 |
+
"print(f\"\\n Resultados finais (ESM-2 + PAM1 + propagação):\")\n",
|
| 529 |
+
"print(f\"Fmax = {fmax:.4f}\")\n",
|
| 530 |
+
"print(f\"Thr. = {thr:.2f}\")\n",
|
| 531 |
+
"print(f\"AuPRC = {auprc:.4f}\")\n",
|
| 532 |
+
"print(f\"Smin = {smin:.4f}\")\n"
|
| 533 |
+
]
|
| 534 |
+
},
|
| 535 |
+
{
|
| 536 |
+
"cell_type": "code",
|
| 537 |
+
"execution_count": null,
|
| 538 |
+
"id": "1a1ea084-01de-4dc4-88da-e7ffeb8c94c9",
|
| 539 |
+
"metadata": {},
|
| 540 |
+
"outputs": [],
|
| 541 |
+
"source": []
|
| 542 |
+
}
|
| 543 |
+
],
|
| 544 |
+
"metadata": {
|
| 545 |
+
"kernelspec": {
|
| 546 |
+
"display_name": "Python 3 (ipykernel)",
|
| 547 |
+
"language": "python",
|
| 548 |
+
"name": "python3"
|
| 549 |
+
},
|
| 550 |
+
"language_info": {
|
| 551 |
+
"codemirror_mode": {
|
| 552 |
+
"name": "ipython",
|
| 553 |
+
"version": 3
|
| 554 |
+
},
|
| 555 |
+
"file_extension": ".py",
|
| 556 |
+
"mimetype": "text/x-python",
|
| 557 |
+
"name": "python",
|
| 558 |
+
"nbconvert_exporter": "python",
|
| 559 |
+
"pygments_lexer": "ipython3",
|
| 560 |
+
"version": "3.10.16"
|
| 561 |
+
}
|
| 562 |
+
},
|
| 563 |
+
"nbformat": 4,
|
| 564 |
+
"nbformat_minor": 5
|
| 565 |
+
}
|
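One detail worth flagging in the evaluation cell above: compute_smin builds the ancestor sets anc_true and anc_pred from the GO DAG but never uses them, so RU and MI are computed over the raw predicted and true term sets only. If an ancestor-closed, CAFA-style computation was intended, the end of the loop body could be adjusted as sketched below. This is a sketch of one possible intent, assuming the variables of compute_smin are in scope; it is not a correction of the reported Smin values.

# Sketch: fold the already-computed ancestor closures into the RU/MI sets.
true_closed = (true_terms | anc_true) & set(terms)   # stay within the evaluated term set
pred_closed = (pred_terms | anc_pred) & set(terms)

ru = pred_closed - true_closed
mi = true_closed - pred_closed
dist_ru = sum(ic.get(t, 0) for t in ru)
dist_mi = sum(ic.get(t, 0) for t in mi)
s = math.sqrt((alpha * dist_ru) ** 2 + ((1 - alpha) * dist_mi) ** 2)
s_values.append(s)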
notebooks/mlp_protbert.ipynb
ADDED
|
@@ -0,0 +1,825 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 2,
|
| 6 |
+
"id": "c6dbc330-062a-48f0-8242-3f21cc1c9c2b",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [
|
| 9 |
+
{
|
| 10 |
+
"name": "stdout",
|
| 11 |
+
"output_type": "stream",
|
| 12 |
+
"text": [
|
| 13 |
+
"go.obo: fmt(1.2) rel(2025-03-16) 43,544 Terms\n",
|
| 14 |
+
"✓ Ficheiros criados:\n",
|
| 15 |
+
" - data/mf-training.csv : (31142, 3)\n",
|
| 16 |
+
" - data/mf-validation.csv: (1724, 3)\n",
|
| 17 |
+
" - data/mf-test.csv : (1724, 3)\n",
|
| 18 |
+
"GO terms únicos (após propagação e filtro): 602\n"
|
| 19 |
+
]
|
| 20 |
+
}
|
| 21 |
+
],
|
| 22 |
+
"source": [
|
| 23 |
+
"import pandas as pd\n",
|
| 24 |
+
"from Bio import SeqIO\n",
|
| 25 |
+
"from collections import Counter\n",
|
| 26 |
+
"from goatools.obo_parser import GODag\n",
|
| 27 |
+
"from sklearn.model_selection import train_test_split\n",
|
| 28 |
+
"from sklearn.preprocessing import MultiLabelBinarizer\n",
|
| 29 |
+
"from iterstrat.ml_stratifiers import MultilabelStratifiedKFold\n",
|
| 30 |
+
"import numpy as np\n",
|
| 31 |
+
"import os\n",
|
| 32 |
+
"\n",
|
| 33 |
+
"# Carregar GO anotações\n",
|
| 34 |
+
"annotations = pd.read_csv(\"uniprot_sprot_exp.txt\", sep=\"\\t\", names=[\"protein_id\", \"go_term\", \"go_category\"])\n",
|
| 35 |
+
"annotations_f = annotations[annotations[\"go_category\"] == \"F\"]\n",
|
| 36 |
+
"\n",
|
| 37 |
+
"# Carregar DAG e propagar GO terms\n",
|
| 38 |
+
"# propagação hierárquica\n",
|
| 39 |
+
"# https://geneontology.org/docs/download-ontology/\n",
|
| 40 |
+
"go_dag = GODag(\"go.obo\")\n",
|
| 41 |
+
"mf_terms = {t for t, o in go_dag.items() if o.namespace == \"molecular_function\"}\n",
|
| 42 |
+
"\n",
|
| 43 |
+
"def propagate_terms(term_list):\n",
|
| 44 |
+
" full = set()\n",
|
| 45 |
+
" for t in term_list:\n",
|
| 46 |
+
" if t not in go_dag:\n",
|
| 47 |
+
" continue\n",
|
| 48 |
+
" full.add(t)\n",
|
| 49 |
+
" full.update(go_dag[t].get_all_parents())\n",
|
| 50 |
+
" return list(full & mf_terms)\n",
|
| 51 |
+
"\n",
|
| 52 |
+
"# Carregar sequências\n",
|
| 53 |
+
"seqs, ids = [], []\n",
|
| 54 |
+
"for record in SeqIO.parse(\"uniprot_sprot_exp.fasta\", \"fasta\"):\n",
|
| 55 |
+
" ids.append(record.id)\n",
|
| 56 |
+
" seqs.append(str(record.seq))\n",
|
| 57 |
+
"\n",
|
| 58 |
+
"seq_df = pd.DataFrame({\"protein_id\": ids, \"sequence\": seqs})\n",
|
| 59 |
+
"\n",
|
| 60 |
+
"# Juntar com GO anotado e propagar\n",
|
| 61 |
+
"grouped = annotations_f.groupby(\"protein_id\")[\"go_term\"].apply(list).reset_index()\n",
|
| 62 |
+
"data = seq_df.merge(grouped, on=\"protein_id\")\n",
|
| 63 |
+
"data = data[data[\"go_term\"].apply(len) > 0]\n",
|
| 64 |
+
"data[\"go_term\"] = data[\"go_term\"].apply(propagate_terms)\n",
|
| 65 |
+
"data = data[data[\"go_term\"].apply(len) > 0]\n",
|
| 66 |
+
"\n",
|
| 67 |
+
"# Filtrar GO terms raros\n",
|
| 68 |
+
"# todos os terms com menos de 50 proteinas associadas\n",
|
| 69 |
+
"all_terms = [term for sublist in data[\"go_term\"] for term in sublist]\n",
|
| 70 |
+
"term_counts = Counter(all_terms)\n",
|
| 71 |
+
"valid_terms = {term for term, count in term_counts.items() if count >= 50}\n",
|
| 72 |
+
"data[\"go_term\"] = data[\"go_term\"].apply(lambda terms: [t for t in terms if t in valid_terms])\n",
|
| 73 |
+
"data = data[data[\"go_term\"].apply(len) > 0]\n",
|
| 74 |
+
"\n",
|
| 75 |
+
"# Preparar dataset final\n",
|
| 76 |
+
"data[\"go_terms\"] = data[\"go_term\"].apply(lambda x: ';'.join(sorted(set(x))))\n",
|
| 77 |
+
"data = data[[\"protein_id\", \"sequence\", \"go_terms\"]].drop_duplicates()\n",
|
| 78 |
+
"\n",
|
| 79 |
+
"# Binarizar labels e dividir\n",
|
| 80 |
+
"mlb = MultiLabelBinarizer()\n",
|
| 81 |
+
"Y = mlb.fit_transform(data[\"go_terms\"].str.split(\";\"))\n",
|
| 82 |
+
"X = data[[\"protein_id\", \"sequence\"]].values\n",
|
| 83 |
+
"\n",
|
| 84 |
+
"mskf = MultilabelStratifiedKFold(n_splits=10, random_state=42, shuffle=True)\n",
|
| 85 |
+
"train_idx, temp_idx = next(mskf.split(X, Y))\n",
|
| 86 |
+
"val_idx, test_idx = np.array_split(temp_idx, 2)\n",
|
| 87 |
+
"\n",
|
| 88 |
+
"df_train = data.iloc[train_idx].copy()\n",
|
| 89 |
+
"df_val = data.iloc[val_idx].copy()\n",
|
| 90 |
+
"df_test = data.iloc[test_idx].copy()\n",
|
| 91 |
+
"\n",
|
| 92 |
+
"# Guardar em CSV\n",
|
| 93 |
+
"os.makedirs(\"data\", exist_ok=True)\n",
|
| 94 |
+
"df_train.to_csv(\"data/mf-training.csv\", index=False)\n",
|
| 95 |
+
"df_val.to_csv(\"data/mf-validation.csv\", index=False)\n",
|
| 96 |
+
"df_test.to_csv(\"data/mf-test.csv\", index=False)\n",
|
| 97 |
+
"\n",
|
| 98 |
+
"# Confirmar\n",
|
| 99 |
+
"print(\"✓ Ficheiros criados:\")\n",
|
| 100 |
+
"print(\" - data/mf-training.csv :\", df_train.shape, df_train.columns.tolist())\n",
|
| 101 |
+
"print(\" - data/mf-validation.csv:\", df_val.shape, df_val.columns.tolist())\n",
|
| 102 |
+
"print(\" - data/mf-test.csv :\", df_test.shape, df_test.columns.tolist())\n",
|
| 103 |
+
"print(f\"GO terms únicos (após propagação e filtro): {len(mlb.classes_)}\")\n"
|
| 104 |
+
]
|
| 105 |
+
},
|
| 106 |
+
{
|
| 107 |
+
"cell_type": "code",
|
| 108 |
+
"execution_count": 2,
|
| 109 |
+
"id": "6cf7aaa6-4941-4951-8d73-1f4f1f4362f3",
|
| 110 |
+
"metadata": {},
|
| 111 |
+
"outputs": [
|
| 112 |
+
{
|
| 113 |
+
"name": "stderr",
|
| 114 |
+
"output_type": "stream",
|
| 115 |
+
"text": [
|
| 116 |
+
"C:\\Users\\Melvin\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
| 117 |
+
" from .autonotebook import tqdm as notebook_tqdm\n",
|
| 118 |
+
"C:\\Users\\Melvin\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\transformers\\utils\\generic.py:441: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.\n",
|
| 119 |
+
" _torch_pytree._register_pytree_node(\n",
|
| 120 |
+
"100%|██████████| 31142/31142 [00:24<00:00, 1262.18it/s]\n",
|
| 121 |
+
"100%|██████████| 1724/1724 [00:00<00:00, 2628.24it/s]\n",
|
| 122 |
+
"C:\\Users\\Melvin\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\ktrain\\text\\preprocessor.py:382: UserWarning: The class_names argument is replacing the classes argument. Please update your code.\n",
|
| 123 |
+
" warnings.warn(\n"
|
| 124 |
+
]
|
| 125 |
+
},
|
| 126 |
+
{
|
| 127 |
+
"name": "stdout",
|
| 128 |
+
"output_type": "stream",
|
| 129 |
+
"text": [
|
| 130 |
+
"preprocessing train...\n",
|
| 131 |
+
"language: de\n",
|
| 132 |
+
"train sequence lengths:\n",
|
| 133 |
+
"\tmean : 423\n",
|
| 134 |
+
"\t95percentile : 604\n",
|
| 135 |
+
"\t99percentile : 715\n"
|
| 136 |
+
]
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"data": {
|
| 140 |
+
"text/html": [
|
| 141 |
+
"\n",
|
| 142 |
+
"<style>\n",
|
| 143 |
+
" /* Turns off some styling */\n",
|
| 144 |
+
" progress {\n",
|
| 145 |
+
" /* gets rid of default border in Firefox and Opera. */\n",
|
| 146 |
+
" border: none;\n",
|
| 147 |
+
" /* Needs to be in here for Safari polyfill so background images work as expected. */\n",
|
| 148 |
+
" background-size: auto;\n",
|
| 149 |
+
" }\n",
|
| 150 |
+
" progress:not([value]), progress:not([value])::-webkit-progress-bar {\n",
|
| 151 |
+
" background: repeating-linear-gradient(45deg, #7e7e7e, #7e7e7e 10px, #5c5c5c 10px, #5c5c5c 20px);\n",
|
| 152 |
+
" }\n",
|
| 153 |
+
" .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar {\n",
|
| 154 |
+
" background: #F44336;\n",
|
| 155 |
+
" }\n",
|
| 156 |
+
"</style>\n"
|
| 157 |
+
],
|
| 158 |
+
"text/plain": [
|
| 159 |
+
"<IPython.core.display.HTML object>"
|
| 160 |
+
]
|
| 161 |
+
},
|
| 162 |
+
"metadata": {},
|
| 163 |
+
"output_type": "display_data"
|
| 164 |
+
},
|
| 165 |
+
{
|
| 166 |
+
"data": {
|
| 167 |
+
"text/html": [],
|
| 168 |
+
"text/plain": [
|
| 169 |
+
"<IPython.core.display.HTML object>"
|
| 170 |
+
]
|
| 171 |
+
},
|
| 172 |
+
"metadata": {},
|
| 173 |
+
"output_type": "display_data"
|
| 174 |
+
},
|
| 175 |
+
{
|
| 176 |
+
"name": "stdout",
|
| 177 |
+
"output_type": "stream",
|
| 178 |
+
"text": [
|
| 179 |
+
"Is Multi-Label? True\n",
|
| 180 |
+
"preprocessing test...\n",
|
| 181 |
+
"language: de\n",
|
| 182 |
+
"test sequence lengths:\n",
|
| 183 |
+
"\tmean : 408\n",
|
| 184 |
+
"\t95percentile : 603\n",
|
| 185 |
+
"\t99percentile : 714\n"
|
| 186 |
+
]
|
| 187 |
+
},
|
| 188 |
+
{
|
| 189 |
+
"data": {
|
| 190 |
+
"text/html": [
|
| 191 |
+
"\n",
|
| 192 |
+
"<style>\n",
|
| 193 |
+
" /* Turns off some styling */\n",
|
| 194 |
+
" progress {\n",
|
| 195 |
+
" /* gets rid of default border in Firefox and Opera. */\n",
|
| 196 |
+
" border: none;\n",
|
| 197 |
+
" /* Needs to be in here for Safari polyfill so background images work as expected. */\n",
|
| 198 |
+
" background-size: auto;\n",
|
| 199 |
+
" }\n",
|
| 200 |
+
" progress:not([value]), progress:not([value])::-webkit-progress-bar {\n",
|
| 201 |
+
" background: repeating-linear-gradient(45deg, #7e7e7e, #7e7e7e 10px, #5c5c5c 10px, #5c5c5c 20px);\n",
|
| 202 |
+
" }\n",
|
| 203 |
+
" .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar {\n",
|
| 204 |
+
" background: #F44336;\n",
|
| 205 |
+
" }\n",
|
| 206 |
+
"</style>\n"
|
| 207 |
+
],
|
| 208 |
+
"text/plain": [
|
| 209 |
+
"<IPython.core.display.HTML object>"
|
| 210 |
+
]
|
| 211 |
+
},
|
| 212 |
+
"metadata": {},
|
| 213 |
+
"output_type": "display_data"
|
| 214 |
+
},
|
| 215 |
+
{
|
| 216 |
+
"data": {
|
| 217 |
+
"text/html": [],
|
| 218 |
+
"text/plain": [
|
| 219 |
+
"<IPython.core.display.HTML object>"
|
| 220 |
+
]
|
| 221 |
+
},
|
| 222 |
+
"metadata": {},
|
| 223 |
+
"output_type": "display_data"
|
| 224 |
+
},
|
| 225 |
+
{
|
| 226 |
+
"name": "stderr",
|
| 227 |
+
"output_type": "stream",
|
| 228 |
+
"text": [
|
| 229 |
+
"C:\\Users\\Melvin\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\ktrain\\text\\preprocessor.py:1093: UserWarning: Could not load a Tensorflow version of model. (If this worked before, it might be an out-of-memory issue.) Attempting to download/load PyTorch version as TensorFlow model using from_pt=True. You will need PyTorch installed for this.\n",
|
| 230 |
+
" warnings.warn(\n"
|
| 231 |
+
]
|
| 232 |
+
},
|
| 233 |
+
{
|
| 234 |
+
"name": "stdout",
|
| 235 |
+
"output_type": "stream",
|
| 236 |
+
"text": [
|
| 237 |
+
"\n",
|
| 238 |
+
"\n",
|
| 239 |
+
"begin training using triangular learning rate policy with max lr of 1e-05...\n",
|
| 240 |
+
"Epoch 1/10\n",
|
| 241 |
+
"40995/40995 [==============================] - 13053s 318ms/step - loss: 0.0745 - binary_accuracy: 0.9866 - val_loss: 0.0582 - val_binary_accuracy: 0.9859\n",
|
| 242 |
+
"Epoch 2/10\n",
|
| 243 |
+
"40995/40995 [==============================] - 14484s 353ms/step - loss: 0.0504 - binary_accuracy: 0.9873 - val_loss: 0.0499 - val_binary_accuracy: 0.9867\n",
|
| 244 |
+
"Epoch 3/10\n",
|
| 245 |
+
"40995/40995 [==============================] - 14472s 353ms/step - loss: 0.0450 - binary_accuracy: 0.9879 - val_loss: 0.0449 - val_binary_accuracy: 0.9873\n",
|
| 246 |
+
"Epoch 4/10\n",
|
| 247 |
+
"40995/40995 [==============================] - 14445s 352ms/step - loss: 0.0407 - binary_accuracy: 0.9884 - val_loss: 0.0413 - val_binary_accuracy: 0.9878\n",
|
| 248 |
+
"Epoch 5/10\n",
|
| 249 |
+
"40995/40995 [==============================] - 12524s 305ms/step - loss: 0.0378 - binary_accuracy: 0.9888 - val_loss: 0.0394 - val_binary_accuracy: 0.9881\n",
|
| 250 |
+
"Epoch 6/10\n",
|
| 251 |
+
"40995/40995 [==============================] - 14737s 359ms/step - loss: 0.0359 - binary_accuracy: 0.9891 - val_loss: 0.0383 - val_binary_accuracy: 0.9883\n",
|
| 252 |
+
"Epoch 7/10\n",
|
| 253 |
+
"40995/40995 [==============================] - 20317s 495ms/step - loss: 0.0343 - binary_accuracy: 0.9894 - val_loss: 0.0371 - val_binary_accuracy: 0.9885\n",
|
| 254 |
+
"Epoch 8/10\n",
|
| 255 |
+
"40995/40995 [==============================] - 9073s 221ms/step - loss: 0.0331 - binary_accuracy: 0.9896 - val_loss: 0.0364 - val_binary_accuracy: 0.9887\n",
|
| 256 |
+
"Epoch 9/10\n",
|
| 257 |
+
"40995/40995 [==============================] - 9001s 219ms/step - loss: 0.0320 - binary_accuracy: 0.9898 - val_loss: 0.0360 - val_binary_accuracy: 0.9888\n",
|
| 258 |
+
"Epoch 10/10\n",
|
| 259 |
+
"40995/40995 [==============================] - 8980s 219ms/step - loss: 0.0311 - binary_accuracy: 0.9900 - val_loss: 0.0356 - val_binary_accuracy: 0.9890\n"
|
| 260 |
+
]
|
| 261 |
+
},
|
| 262 |
+
{
|
| 263 |
+
"ename": "RuntimeError",
|
| 264 |
+
"evalue": "Can't decrement id ref count (unable to extend file properly)",
|
| 265 |
+
"output_type": "error",
|
| 266 |
+
"traceback": [
|
| 267 |
+
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
| 268 |
+
"\u001b[1;31mOSError\u001b[0m Traceback (most recent call last)",
|
| 269 |
+
"File \u001b[1;32m~\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\keras\\engine\\training.py:2252\u001b[0m, in \u001b[0;36mModel.save_weights\u001b[1;34m(self, filepath, overwrite, save_format, options)\u001b[0m\n\u001b[0;32m 2251\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m h5py\u001b[38;5;241m.\u001b[39mFile(filepath, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mw\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[1;32m-> 2252\u001b[0m \u001b[43mhdf5_format\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msave_weights_to_hdf5_group\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlayers\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2253\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n",
|
| 270 |
+
"File \u001b[1;32m~\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\keras\\saving\\hdf5_format.py:646\u001b[0m, in \u001b[0;36msave_weights_to_hdf5_group\u001b[1;34m(f, layers)\u001b[0m\n\u001b[0;32m 645\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 646\u001b[0m param_dset[:] \u001b[38;5;241m=\u001b[39m val\n",
|
| 271 |
+
"File \u001b[1;32mh5py\\\\_objects.pyx:54\u001b[0m, in \u001b[0;36mh5py._objects.with_phil.wrapper\u001b[1;34m()\u001b[0m\n",
|
| 272 |
+
"File \u001b[1;32mh5py\\\\_objects.pyx:55\u001b[0m, in \u001b[0;36mh5py._objects.with_phil.wrapper\u001b[1;34m()\u001b[0m\n",
|
| 273 |
+
"File \u001b[1;32m~\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\h5py\\_hl\\dataset.py:999\u001b[0m, in \u001b[0;36mDataset.__setitem__\u001b[1;34m(self, args, val)\u001b[0m\n\u001b[0;32m 998\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m fspace \u001b[38;5;129;01min\u001b[39;00m selection\u001b[38;5;241m.\u001b[39mbroadcast(mshape):\n\u001b[1;32m--> 999\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mid\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwrite\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmspace\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfspace\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mval\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdxpl\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_dxpl\u001b[49m\u001b[43m)\u001b[49m\n",
|
| 274 |
+
"File \u001b[1;32mh5py\\\\_objects.pyx:54\u001b[0m, in \u001b[0;36mh5py._objects.with_phil.wrapper\u001b[1;34m()\u001b[0m\n",
|
| 275 |
+
"File \u001b[1;32mh5py\\\\_objects.pyx:55\u001b[0m, in \u001b[0;36mh5py._objects.with_phil.wrapper\u001b[1;34m()\u001b[0m\n",
|
| 276 |
+
"File \u001b[1;32mh5py\\\\h5d.pyx:282\u001b[0m, in \u001b[0;36mh5py.h5d.DatasetID.write\u001b[1;34m()\u001b[0m\n",
|
| 277 |
+
"File \u001b[1;32mh5py\\\\_proxy.pyx:115\u001b[0m, in \u001b[0;36mh5py._proxy.dset_rw\u001b[1;34m()\u001b[0m\n",
|
| 278 |
+
"\u001b[1;31mOSError\u001b[0m: [Errno 28] Can't write data (file write failed: time = Wed May 7 10:48:36 2025\n, filename = 'mf-fine-tuned-protbert\\weights-10-0.04.hdf5', file descriptor = 4, errno = 28, error message = 'No space left on device', buf = 000002CC552FF040, total write size = 4194304, bytes this sub-write = 4194304, bytes actually written = 18446744073709551615, offset = 1180551864)",
|
| 279 |
+
"\nDuring handling of the above exception, another exception occurred:\n",
|
| 280 |
+
"\u001b[1;31mRuntimeError\u001b[0m Traceback (most recent call last)",
|
| 281 |
+
"Cell \u001b[1;32mIn[2], line 119\u001b[0m\n\u001b[0;32m 113\u001b[0m model \u001b[38;5;241m=\u001b[39m t\u001b[38;5;241m.\u001b[39mget_classifier()\n\u001b[0;32m 114\u001b[0m learner \u001b[38;5;241m=\u001b[39m ktrain\u001b[38;5;241m.\u001b[39mget_learner(model,\n\u001b[0;32m 115\u001b[0m train_data\u001b[38;5;241m=\u001b[39mtrn,\n\u001b[0;32m 116\u001b[0m val_data\u001b[38;5;241m=\u001b[39mval,\n\u001b[0;32m 117\u001b[0m batch_size\u001b[38;5;241m=\u001b[39mBATCH_SIZE)\n\u001b[1;32m--> 119\u001b[0m \u001b[43mlearner\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mautofit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlr\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1e-5\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 120\u001b[0m \u001b[43m \u001b[49m\u001b[43mepochs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 121\u001b[0m \u001b[43m \u001b[49m\u001b[43mearly_stopping\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 122\u001b[0m \u001b[43m \u001b[49m\u001b[43mcheckpoint_folder\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmf-fine-tuned-protbert\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
|
| 282 |
+
"File \u001b[1;32m~\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\ktrain\\core.py:1239\u001b[0m, in \u001b[0;36mLearner.autofit\u001b[1;34m(self, lr, epochs, early_stopping, reduce_on_plateau, reduce_factor, cycle_momentum, max_momentum, min_momentum, monitor, checkpoint_folder, class_weight, callbacks, steps_per_epoch, verbose)\u001b[0m\n\u001b[0;32m 1234\u001b[0m policy \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtriangular learning rate\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 1235\u001b[0m U\u001b[38;5;241m.\u001b[39mvprint(\n\u001b[0;32m 1236\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbegin training using \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m policy with max lr of \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m...\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m%\u001b[39m (policy, lr),\n\u001b[0;32m 1237\u001b[0m verbose\u001b[38;5;241m=\u001b[39mverbose,\n\u001b[0;32m 1238\u001b[0m )\n\u001b[1;32m-> 1239\u001b[0m hist \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 1240\u001b[0m \u001b[43m \u001b[49m\u001b[43mlr\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1241\u001b[0m \u001b[43m \u001b[49m\u001b[43mepochs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1242\u001b[0m \u001b[43m \u001b[49m\u001b[43mearly_stopping\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mearly_stopping\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1243\u001b[0m \u001b[43m \u001b[49m\u001b[43mcheckpoint_folder\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcheckpoint_folder\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1244\u001b[0m \u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1245\u001b[0m \u001b[43m \u001b[49m\u001b[43mclass_weight\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mclass_weight\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1246\u001b[0m \u001b[43m \u001b[49m\u001b[43mcallbacks\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkcallbacks\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1247\u001b[0m \u001b[43m \u001b[49m\u001b[43msteps_per_epoch\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msteps_per_epoch\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1248\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1249\u001b[0m hist\u001b[38;5;241m.\u001b[39mhistory[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlr\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m clr\u001b[38;5;241m.\u001b[39mhistory[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlr\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[0;32m 1250\u001b[0m hist\u001b[38;5;241m.\u001b[39mhistory[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124miterations\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m clr\u001b[38;5;241m.\u001b[39mhistory[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124miterations\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n",
|
| 283 |
+
"File \u001b[1;32m~\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\ktrain\\core.py:1650\u001b[0m, in \u001b[0;36mGenLearner.fit\u001b[1;34m(self, lr, n_cycles, cycle_len, cycle_mult, lr_decay, checkpoint_folder, early_stopping, class_weight, callbacks, steps_per_epoch, verbose)\u001b[0m\n\u001b[0;32m 1648\u001b[0m warnings\u001b[38;5;241m.\u001b[39mfilterwarnings(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m, message\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.*Check your callbacks.*\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 1649\u001b[0m fit_fn \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel\u001b[38;5;241m.\u001b[39mfit\n\u001b[1;32m-> 1650\u001b[0m hist \u001b[38;5;241m=\u001b[39m \u001b[43mfit_fn\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 1651\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_prepare\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain_data\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1652\u001b[0m \u001b[43m \u001b[49m\u001b[43msteps_per_epoch\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msteps_per_epoch\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1653\u001b[0m \u001b[43m \u001b[49m\u001b[43mvalidation_steps\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mvalidation_steps\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1654\u001b[0m \u001b[43m \u001b[49m\u001b[43mepochs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mepochs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1655\u001b[0m \u001b[43m \u001b[49m\u001b[43mvalidation_data\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_prepare\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mval_data\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrain\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1656\u001b[0m \u001b[43m \u001b[49m\u001b[43mworkers\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mworkers\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1657\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_multiprocessing\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43muse_multiprocessing\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1658\u001b[0m \u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1659\u001b[0m \u001b[43m \u001b[49m\u001b[43mshuffle\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 1660\u001b[0m \u001b[43m \u001b[49m\u001b[43mclass_weight\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mclass_weight\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1661\u001b[0m \u001b[43m \u001b[49m\u001b[43mcallbacks\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkcallbacks\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1662\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1663\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m sgdr \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m 
\u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 1664\u001b[0m hist\u001b[38;5;241m.\u001b[39mhistory[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlr\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m sgdr\u001b[38;5;241m.\u001b[39mhistory[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlr\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n",
|
| 284 |
+
"File \u001b[1;32m~\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\keras\\engine\\training.py:1230\u001b[0m, in \u001b[0;36mModel.fit\u001b[1;34m(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)\u001b[0m\n\u001b[0;32m 1227\u001b[0m val_logs \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mval_\u001b[39m\u001b[38;5;124m'\u001b[39m \u001b[38;5;241m+\u001b[39m name: val \u001b[38;5;28;01mfor\u001b[39;00m name, val \u001b[38;5;129;01min\u001b[39;00m val_logs\u001b[38;5;241m.\u001b[39mitems()}\n\u001b[0;32m 1228\u001b[0m epoch_logs\u001b[38;5;241m.\u001b[39mupdate(val_logs)\n\u001b[1;32m-> 1230\u001b[0m \u001b[43mcallbacks\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mon_epoch_end\u001b[49m\u001b[43m(\u001b[49m\u001b[43mepoch\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mepoch_logs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1231\u001b[0m training_logs \u001b[38;5;241m=\u001b[39m epoch_logs\n\u001b[0;32m 1232\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstop_training:\n",
|
| 285 |
+
"File \u001b[1;32m~\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\keras\\callbacks.py:413\u001b[0m, in \u001b[0;36mCallbackList.on_epoch_end\u001b[1;34m(self, epoch, logs)\u001b[0m\n\u001b[0;32m 411\u001b[0m logs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_process_logs(logs)\n\u001b[0;32m 412\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m callback \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcallbacks:\n\u001b[1;32m--> 413\u001b[0m \u001b[43mcallback\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mon_epoch_end\u001b[49m\u001b[43m(\u001b[49m\u001b[43mepoch\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlogs\u001b[49m\u001b[43m)\u001b[49m\n",
|
| 286 |
+
"File \u001b[1;32m~\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\keras\\callbacks.py:1368\u001b[0m, in \u001b[0;36mModelCheckpoint.on_epoch_end\u001b[1;34m(self, epoch, logs)\u001b[0m\n\u001b[0;32m 1366\u001b[0m \u001b[38;5;66;03m# pylint: disable=protected-access\u001b[39;00m\n\u001b[0;32m 1367\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msave_freq \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mepoch\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[1;32m-> 1368\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_save_model\u001b[49m\u001b[43m(\u001b[49m\u001b[43mepoch\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mepoch\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbatch\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlogs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlogs\u001b[49m\u001b[43m)\u001b[49m\n",
|
| 287 |
+
"File \u001b[1;32m~\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\keras\\callbacks.py:1431\u001b[0m, in \u001b[0;36mModelCheckpoint._save_model\u001b[1;34m(self, epoch, batch, logs)\u001b[0m\n\u001b[0;32m 1429\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mEpoch \u001b[39m\u001b[38;5;132;01m%05d\u001b[39;00m\u001b[38;5;124m: saving model to \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m'\u001b[39m \u001b[38;5;241m%\u001b[39m (epoch \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1\u001b[39m, filepath))\n\u001b[0;32m 1430\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msave_weights_only:\n\u001b[1;32m-> 1431\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msave_weights\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 1432\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilepath\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moverwrite\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_options\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1433\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 1434\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel\u001b[38;5;241m.\u001b[39msave(filepath, overwrite\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, options\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_options)\n",
|
| 288 |
+
"File \u001b[1;32m~\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\keras\\engine\\training.py:2252\u001b[0m, in \u001b[0;36mModel.save_weights\u001b[1;34m(self, filepath, overwrite, save_format, options)\u001b[0m\n\u001b[0;32m 2250\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m save_format \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mh5\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[0;32m 2251\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m h5py\u001b[38;5;241m.\u001b[39mFile(filepath, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mw\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[1;32m-> 2252\u001b[0m hdf5_format\u001b[38;5;241m.\u001b[39msave_weights_to_hdf5_group(f, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlayers)\n\u001b[0;32m 2253\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 2254\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m tf\u001b[38;5;241m.\u001b[39mexecuting_eagerly():\n",
|
| 289 |
+
"File \u001b[1;32mh5py\\\\_objects.pyx:54\u001b[0m, in \u001b[0;36mh5py._objects.with_phil.wrapper\u001b[1;34m()\u001b[0m\n",
|
| 290 |
+
"File \u001b[1;32mh5py\\\\_objects.pyx:55\u001b[0m, in \u001b[0;36mh5py._objects.with_phil.wrapper\u001b[1;34m()\u001b[0m\n",
|
| 291 |
+
"File \u001b[1;32m~\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\h5py\\_hl\\files.py:599\u001b[0m, in \u001b[0;36mFile.__exit__\u001b[1;34m(self, *args)\u001b[0m\n\u001b[0;32m 596\u001b[0m \u001b[38;5;129m@with_phil\u001b[39m\n\u001b[0;32m 597\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__exit__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs):\n\u001b[0;32m 598\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mid:\n\u001b[1;32m--> 599\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mclose\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
|
| 292 |
+
"File \u001b[1;32m~\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\h5py\\_hl\\files.py:581\u001b[0m, in \u001b[0;36mFile.close\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 575\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mid\u001b[38;5;241m.\u001b[39mvalid:\n\u001b[0;32m 576\u001b[0m \u001b[38;5;66;03m# We have to explicitly murder all open objects related to the file\u001b[39;00m\n\u001b[0;32m 577\u001b[0m \n\u001b[0;32m 578\u001b[0m \u001b[38;5;66;03m# Close file-resident objects first, then the files.\u001b[39;00m\n\u001b[0;32m 579\u001b[0m \u001b[38;5;66;03m# Otherwise we get errors in MPI mode.\u001b[39;00m\n\u001b[0;32m 580\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mid\u001b[38;5;241m.\u001b[39m_close_open_objects(h5f\u001b[38;5;241m.\u001b[39mOBJ_LOCAL \u001b[38;5;241m|\u001b[39m \u001b[38;5;241m~\u001b[39mh5f\u001b[38;5;241m.\u001b[39mOBJ_FILE)\n\u001b[1;32m--> 581\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mid\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_close_open_objects\u001b[49m\u001b[43m(\u001b[49m\u001b[43mh5f\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mOBJ_LOCAL\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m|\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mh5f\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mOBJ_FILE\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 583\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mid\u001b[38;5;241m.\u001b[39mclose()\n\u001b[0;32m 584\u001b[0m _objects\u001b[38;5;241m.\u001b[39mnonlocal_close()\n",
|
| 293 |
+
"File \u001b[1;32mh5py\\\\_objects.pyx:54\u001b[0m, in \u001b[0;36mh5py._objects.with_phil.wrapper\u001b[1;34m()\u001b[0m\n",
|
| 294 |
+
"File \u001b[1;32mh5py\\\\_objects.pyx:55\u001b[0m, in \u001b[0;36mh5py._objects.with_phil.wrapper\u001b[1;34m()\u001b[0m\n",
|
| 295 |
+
"File \u001b[1;32mh5py\\\\h5f.pyx:355\u001b[0m, in \u001b[0;36mh5py.h5f.FileID._close_open_objects\u001b[1;34m()\u001b[0m\n",
|
| 296 |
+
"\u001b[1;31mRuntimeError\u001b[0m: Can't decrement id ref count (unable to extend file properly)"
|
| 297 |
+
]
|
| 298 |
+
}
|
| 299 |
+
],
|
| 300 |
+
"source": [
|
| 301 |
+
"import pandas as pd\n",
|
| 302 |
+
"import numpy as np\n",
|
| 303 |
+
"from tqdm import tqdm\n",
|
| 304 |
+
"import random\n",
|
| 305 |
+
"import os\n",
|
| 306 |
+
"import ktrain\n",
|
| 307 |
+
"from ktrain import text\n",
|
| 308 |
+
"from sklearn.preprocessing import MultiLabelBinarizer\n",
|
| 309 |
+
"\n",
|
| 310 |
+
"\n",
|
| 311 |
+
"# PAM1\n",
|
| 312 |
+
"# PAM matrix model of protein evolution\n",
|
| 313 |
+
"# DOI:10.1093/oxfordjournals.molbev.a040360\n",
|
| 314 |
+
"pam_data = {\n",
|
| 315 |
+
" 'A': [9948, 19, 27, 42, 31, 46, 50, 92, 17, 7, 40, 88, 42, 41, 122, 279, 255, 9, 72, 723],\n",
|
| 316 |
+
" 'R': [14, 9871, 24, 38, 37, 130, 38, 62, 49, 4, 58, 205, 26, 33, 47, 103, 104, 5, 36, 52],\n",
|
| 317 |
+
" 'N': [20, 22, 9860, 181, 29, 36, 41, 67, 31, 5, 22, 49, 23, 10, 33, 83, 66, 3, 43, 32],\n",
|
| 318 |
+
" 'D': [40, 34, 187, 9818, 11, 63, 98, 61, 23, 5, 25, 54, 43, 13, 27, 88, 55, 4, 29, 36],\n",
|
| 319 |
+
" 'C': [20, 16, 26, 9, 9987, 10, 17, 37, 12, 2, 16, 26, 10, 19, 27, 26, 25, 2, 6, 67],\n",
|
| 320 |
+
" 'Q': [29, 118, 29, 49, 8, 9816, 72, 55, 36, 4, 60, 158, 35, 22, 39, 86, 74, 3, 34, 28],\n",
|
| 321 |
+
" 'E': [35, 29, 41, 101, 12, 71, 9804, 56, 33, 5, 36, 107, 42, 20, 38, 87, 69, 4, 30, 42],\n",
|
| 322 |
+
" 'G': [96, 61, 77, 70, 38, 51, 58, 9868, 26, 6, 37, 53, 39, 28, 69, 134, 116, 5, 47, 60],\n",
|
| 323 |
+
" 'H': [17, 53, 33, 19, 15, 39, 34, 24, 9907, 3, 32, 57, 24, 15, 27, 47, 43, 2, 22, 19],\n",
|
| 324 |
+
" 'I': [6, 3, 6, 6, 3, 5, 6, 7, 3, 9973, 23, 13, 12, 41, 93, 84, 115, 3, 8, 102],\n",
|
| 325 |
+
" 'L': [26, 39, 17, 15, 7, 33, 22, 20, 19, 27, 9864, 49, 24, 78, 117, 148, 193, 5, 24, 70],\n",
|
| 326 |
+
" 'K': [60, 198, 43, 52, 12, 142, 96, 53, 42, 10, 63, 9710, 33, 26, 54, 109, 102, 5, 43, 42],\n",
|
| 327 |
+
" 'M': [21, 22, 15, 18, 6, 20, 18, 18, 17, 11, 27, 32, 9945, 26, 34, 61, 71, 3, 12, 31],\n",
|
| 328 |
+
" 'F': [18, 17, 8, 6, 8, 11, 10, 16, 10, 44, 92, 24, 29, 9899, 89, 88, 142, 7, 14, 68],\n",
|
| 329 |
+
" 'P': [97, 47, 35, 29, 23, 35, 38, 57, 21, 24, 47, 56, 28, 76, 9785, 115, 77, 4, 24, 35],\n",
|
| 330 |
+
" 'S': [241, 87, 76, 73, 17, 56, 60, 99, 32, 13, 69, 92, 42, 67, 100, 9605, 212, 8, 63, 70],\n",
|
| 331 |
+
" 'T': [186, 78, 54, 37, 14, 42, 42, 83, 28, 23, 84, 85, 53, 93, 66, 182, 9676, 8, 39, 90],\n",
|
| 332 |
+
" 'W': [2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 1, 5, 3, 4, 4, 9960, 3, 4],\n",
|
| 333 |
+
" 'Y': [29, 21, 17, 9, 4, 13, 9, 21, 10, 7, 20, 17, 11, 23, 19, 41, 31, 3, 9935, 23],\n",
|
| 334 |
+
" 'V': [368, 27, 18, 18, 50, 23, 34, 64, 15, 85, 72, 42, 33, 88, 42, 112, 137, 4, 20, 9514]\n",
|
| 335 |
+
"}\n",
|
| 336 |
+
"pam_raw = pd.DataFrame(pam_data, index=list(pam_data.keys()))\n",
|
| 337 |
+
"pam_matrix = pam_raw.div(pam_raw.sum(axis=1), axis=0)\n",
|
| 338 |
+
"list_amino = pam_raw.columns.tolist()\n",
|
| 339 |
+
"pam_dict = {\n",
|
| 340 |
+
" aa: {sub: pam_matrix.loc[aa, sub] for sub in list_amino}\n",
|
| 341 |
+
" for aa in list_amino\n",
|
| 342 |
+
"}\n",
|
| 343 |
+
"\n",
|
| 344 |
+
"def pam1_substitution(aa):\n",
|
| 345 |
+
" if aa not in pam_dict:\n",
|
| 346 |
+
" return aa\n",
|
| 347 |
+
" subs = list(pam_dict[aa].keys())\n",
|
| 348 |
+
" probs = list(pam_dict[aa].values())\n",
|
| 349 |
+
" return np.random.choice(subs, p=probs)\n",
|
| 350 |
+
"\n",
|
| 351 |
+
"def augment_sequence(seq, sub_prob=0.05):\n",
|
| 352 |
+
" return ''.join([pam1_substitution(aa) if random.random() < sub_prob else aa for aa in seq])\n",
|
| 353 |
+
"\n",
|
| 354 |
+
"def slice_sequence(seq, win=512):\n",
|
| 355 |
+
" return [seq[i:i+win] for i in range(0, len(seq), win)]\n",
|
| 356 |
+
"\n",
|
| 357 |
+
"def generate_data(df, augment=False):\n",
|
| 358 |
+
" X, y = [], []\n",
|
| 359 |
+
" label_cols = [col for col in df.columns if col.startswith(\"GO:\")]\n",
|
| 360 |
+
" for _, row in tqdm(df.iterrows(), total=len(df)):\n",
|
| 361 |
+
" seq = row[\"sequence\"]\n",
|
| 362 |
+
" if augment:\n",
|
| 363 |
+
" seq = augment_sequence(seq)\n",
|
| 364 |
+
" seq_slices = slice_sequence(seq)\n",
|
| 365 |
+
" X.extend(seq_slices)\n",
|
| 366 |
+
" lbl = row[label_cols].values.astype(int)\n",
|
| 367 |
+
" y.extend([lbl] * len(seq_slices))\n",
|
| 368 |
+
" return X, np.array(y), label_cols\n",
|
| 369 |
+
"\n",
|
| 370 |
+
"def format_sequence(seq): return \" \".join(list(seq))\n",
|
| 371 |
+
"\n",
|
| 372 |
+
"# Função para carregar e binarizar\n",
|
| 373 |
+
"def load_and_binarize(csv_path, mlb=None):\n",
|
| 374 |
+
" df = pd.read_csv(csv_path)\n",
|
| 375 |
+
" df[\"go_terms\"] = df[\"go_terms\"].str.split(\";\")\n",
|
| 376 |
+
" if mlb is None:\n",
|
| 377 |
+
" mlb = MultiLabelBinarizer()\n",
|
| 378 |
+
" labels = mlb.fit_transform(df[\"go_terms\"])\n",
|
| 379 |
+
" else:\n",
|
| 380 |
+
" labels = mlb.transform(df[\"go_terms\"])\n",
|
| 381 |
+
" labels_df = pd.DataFrame(labels, columns=mlb.classes_)\n",
|
| 382 |
+
" df = df.reset_index(drop=True).join(labels_df)\n",
|
| 383 |
+
" return df, mlb\n",
|
| 384 |
+
"\n",
|
| 385 |
+
"# Carregar os dados\n",
|
| 386 |
+
"df_train, mlb = load_and_binarize(\"data/mf-training.csv\")\n",
|
| 387 |
+
"df_val, _ = load_and_binarize(\"data/mf-validation.csv\", mlb=mlb)\n",
|
| 388 |
+
"\n",
|
| 389 |
+
"# Gerar com augmentation no treino\n",
|
| 390 |
+
"X_train, y_train, term_cols = generate_data(df_train, augment=True)\n",
|
| 391 |
+
"X_val, y_val, _ = generate_data(df_val, augment=False)\n",
|
| 392 |
+
"\n",
|
| 393 |
+
"# Preparar texto para tokenizer\n",
|
| 394 |
+
"X_train_fmt = list(map(format_sequence, X_train))\n",
|
| 395 |
+
"X_val_fmt = list(map(format_sequence, X_val))\n",
|
| 396 |
+
"\n",
|
| 397 |
+
"# Fine-tune ProtBERT\n",
|
| 398 |
+
"# https://huggingface.co/Rostlab/prot_bert\n",
|
| 399 |
+
"# https://doi.org/10.1093/bioinformatics/btac020\n",
|
| 400 |
+
"# dados de treino-> UniRef100 (216 milhões de sequências)\n",
|
| 401 |
+
"MODEL_NAME = \"Rostlab/prot_bert\"\n",
|
| 402 |
+
"MAX_LEN = 512\n",
|
| 403 |
+
"BATCH_SIZE = 1\n",
|
| 404 |
+
"\n",
|
| 405 |
+
"t = text.Transformer(MODEL_NAME, maxlen=MAX_LEN, classes=term_cols)\n",
|
| 406 |
+
"trn = t.preprocess_train(X_train_fmt, y_train)\n",
|
| 407 |
+
"val = t.preprocess_test(X_val_fmt, y_val)\n",
|
| 408 |
+
"\n",
|
| 409 |
+
"model = t.get_classifier()\n",
|
| 410 |
+
"learner = ktrain.get_learner(model,\n",
|
| 411 |
+
" train_data=trn,\n",
|
| 412 |
+
" val_data=val,\n",
|
| 413 |
+
" batch_size=BATCH_SIZE)\n",
|
| 414 |
+
"\n",
|
| 415 |
+
"learner.autofit(lr=1e-5,\n",
|
| 416 |
+
" epochs=10,\n",
|
| 417 |
+
" early_stopping=1,\n",
|
| 418 |
+
" checkpoint_folder=\"mf-fine-tuned-protbert\")\n"
|
| 419 |
+
]
|
| 420 |
+
},
|
| 421 |
+
{
|
| 422 |
+
"cell_type": "code",
|
| 423 |
+
"execution_count": 19,
|
| 424 |
+
"id": "9b39c439-5708-4787-bfee-d3a4d3aa190d",
|
| 425 |
+
"metadata": {},
|
| 426 |
+
"outputs": [
|
| 427 |
+
{
|
| 428 |
+
"name": "stderr",
|
| 429 |
+
"output_type": "stream",
|
| 430 |
+
"text": [
|
| 431 |
+
"C:\\Users\\Melvin\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
| 432 |
+
" from .autonotebook import tqdm as notebook_tqdm\n",
|
| 433 |
+
"C:\\Users\\Melvin\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\transformers\\utils\\generic.py:441: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.\n",
|
| 434 |
+
" _torch_pytree._register_pytree_node(\n",
|
| 435 |
+
"C:\\Users\\Melvin\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\huggingface_hub\\file_download.py:797: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
|
| 436 |
+
" warnings.warn(\n",
|
| 437 |
+
"C:\\Users\\Melvin\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\transformers\\utils\\generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.\n",
|
| 438 |
+
" _torch_pytree._register_pytree_node(\n",
|
| 439 |
+
"Some layers from the model checkpoint at weights/mf-fine-tuned-protbert-epoch10 were not used when initializing TFBertModel: ['classifier', 'dropout_183']\n",
|
| 440 |
+
"- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
|
| 441 |
+
"- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
|
| 442 |
+
"All the layers of TFBertModel were initialized from the model checkpoint at weights/mf-fine-tuned-protbert-epoch10.\n",
|
| 443 |
+
"If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.\n"
|
| 444 |
+
]
|
| 445 |
+
},
|
| 446 |
+
{
|
| 447 |
+
"name": "stdout",
|
| 448 |
+
"output_type": "stream",
|
| 449 |
+
"text": [
|
| 450 |
+
"✓ Tokenizer base e modelo fine-tuned carregados com sucesso\n"
|
| 451 |
+
]
|
| 452 |
+
},
|
| 453 |
+
{
|
| 454 |
+
"name": "stderr",
|
| 455 |
+
"output_type": "stream",
|
| 456 |
+
"text": [
|
| 457 |
+
"Processando data/mf-training.csv: 0%| | 25/31142 [00:06<2:23:28, 3.61it/s]\n"
|
| 458 |
+
]
|
| 459 |
+
},
|
| 460 |
+
{
|
| 461 |
+
"ename": "KeyboardInterrupt",
|
| 462 |
+
"evalue": "",
|
| 463 |
+
"output_type": "error",
|
| 464 |
+
"traceback": [
|
| 465 |
+
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
| 466 |
+
"\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
|
| 467 |
+
"Cell \u001b[1;32mIn[19], line 78\u001b[0m\n\u001b[0;32m 75\u001b[0m \u001b[38;5;66;03m# --- 4. Aplicar -----------------------------------------------------------\u001b[39;00m\n\u001b[0;32m 76\u001b[0m os\u001b[38;5;241m.\u001b[39mmakedirs(OUT_DIR, exist_ok\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m---> 78\u001b[0m \u001b[43mprocess_split\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdata/mf-training.csv\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpath\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mjoin\u001b[49m\u001b[43m(\u001b[49m\u001b[43mOUT_DIR\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtrain_protbert.pkl\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 79\u001b[0m process_split(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdata/mf-validation.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m, os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(OUT_DIR, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mval_protbert.pkl\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n\u001b[0;32m 80\u001b[0m process_split(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdata/mf-test.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m, os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(OUT_DIR, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtest_protbert.pkl\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n",
|
| 468 |
+
"Cell \u001b[1;32mIn[19], line 61\u001b[0m, in \u001b[0;36mprocess_split\u001b[1;34m(csv_path, out_path)\u001b[0m\n\u001b[0;32m 59\u001b[0m embeds\u001b[38;5;241m.\u001b[39mappend(prot_embed\u001b[38;5;241m.\u001b[39mastype(np\u001b[38;5;241m.\u001b[39mfloat32))\n\u001b[0;32m 60\u001b[0m labels\u001b[38;5;241m.\u001b[39mappend(row[label_cols]\u001b[38;5;241m.\u001b[39mvalues\u001b[38;5;241m.\u001b[39mastype(np\u001b[38;5;241m.\u001b[39mint8))\n\u001b[1;32m---> 61\u001b[0m gc\u001b[38;5;241m.\u001b[39mcollect()\n\u001b[0;32m 63\u001b[0m embeds \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mvstack(embeds)\n\u001b[0;32m 64\u001b[0m labels \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mvstack(labels)\n",
|
| 469 |
+
"\u001b[1;31mKeyboardInterrupt\u001b[0m: "
|
| 470 |
+
]
|
| 471 |
+
}
|
| 472 |
+
],
|
| 473 |
+
"source": [
|
| 474 |
+
"import os\n",
|
| 475 |
+
"import pandas as pd\n",
|
| 476 |
+
"import numpy as np\n",
|
| 477 |
+
"from tqdm import tqdm\n",
|
| 478 |
+
"import joblib\n",
|
| 479 |
+
"import gc\n",
|
| 480 |
+
"from transformers import AutoTokenizer, TFAutoModel\n",
|
| 481 |
+
"\n",
|
| 482 |
+
"# Parâmetros\n",
|
| 483 |
+
"MODEL_DIR = \"weights/mf-fine-tuned-protbert-epoch10\"\n",
|
| 484 |
+
"BASE_MODEL = \"Rostlab/prot_bert\"\n",
|
| 485 |
+
"OUT_DIR = \"embeddings\"\n",
|
| 486 |
+
"BATCH_TOK = 16\n",
|
| 487 |
+
"\n",
|
| 488 |
+
"tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, do_lower_case=False)\n",
|
| 489 |
+
"model = TFAutoModel.from_pretrained(MODEL_DIR, from_pt=False)\n",
|
| 490 |
+
"\n",
|
| 491 |
+
"print(\"✓ Tokenizer base e modelo fine-tuned carregados com sucesso\")\n",
|
| 492 |
+
"\n",
|
| 493 |
+
"# Funções auxiliares\n",
|
| 494 |
+
"\n",
|
| 495 |
+
"def get_embeddings(batch, tokenizer, model):\n",
|
| 496 |
+
" tokens = tokenizer(batch, return_tensors=\"tf\", padding=True, truncation=True, max_length=512)\n",
|
| 497 |
+
" output = model(**tokens)\n",
|
| 498 |
+
" return output.last_hidden_state[:, 0, :].numpy()\n",
|
| 499 |
+
"\n",
|
| 500 |
+
"def process_split(csv_path, out_path):\n",
|
| 501 |
+
" df = pd.read_csv(csv_path)\n",
|
| 502 |
+
" label_cols = [col for col in df.columns if col.startswith(\"GO:\")]\n",
|
| 503 |
+
" prot_ids, embeds, labels = [], [], []\n",
|
| 504 |
+
"\n",
|
| 505 |
+
" for _, row in tqdm(df.iterrows(), total=len(df), desc=f\"Processando {csv_path}\"):\n",
|
| 506 |
+
" slices = slice_sequence(row[\"sequence\"])\n",
|
| 507 |
+
" slices_fmt = list(map(format_sequence, slices))\n",
|
| 508 |
+
"\n",
|
| 509 |
+
" slice_embeds = []\n",
|
| 510 |
+
" for i in range(0, len(slices_fmt), BATCH_TOK):\n",
|
| 511 |
+
" batch = slices_fmt[i:i+BATCH_TOK]\n",
|
| 512 |
+
" slice_embeds.append(get_embeddings(batch, tokenizer, model))\n",
|
| 513 |
+
" slice_embeds = np.vstack(slice_embeds)\n",
|
| 514 |
+
"\n",
|
| 515 |
+
" prot_embed = slice_embeds.mean(axis=0)\n",
|
| 516 |
+
" prot_ids.append(row[\"protein_id\"])\n",
|
| 517 |
+
" embeds.append(prot_embed.astype(np.float32))\n",
|
| 518 |
+
" labels.append(row[label_cols].values.astype(np.int8))\n",
|
| 519 |
+
" gc.collect()\n",
|
| 520 |
+
"\n",
|
| 521 |
+
" embeds = np.vstack(embeds)\n",
|
| 522 |
+
" labels = np.vstack(labels)\n",
|
| 523 |
+
"\n",
|
| 524 |
+
" joblib.dump({\n",
|
| 525 |
+
" \"protein_ids\": prot_ids,\n",
|
| 526 |
+
" \"embeddings\": embeds,\n",
|
| 527 |
+
" \"labels\": labels,\n",
|
| 528 |
+
" \"go_terms\": label_cols\n",
|
| 529 |
+
" }, out_path, compress=3)\n",
|
| 530 |
+
"\n",
|
| 531 |
+
" print(f\"✓ Guardado {out_path} — {embeds.shape[0]} proteínas\")\n",
|
| 532 |
+
"\n",
|
| 533 |
+
"# Aplicar\n",
|
| 534 |
+
"os.makedirs(OUT_DIR, exist_ok=True)\n",
|
| 535 |
+
"\n",
|
| 536 |
+
"process_split(\"data/mf-training.csv\", os.path.join(OUT_DIR, \"train_protbert.pkl\"))\n",
|
| 537 |
+
"process_split(\"data/mf-validation.csv\", os.path.join(OUT_DIR, \"val_protbert.pkl\"))\n",
|
| 538 |
+
"process_split(\"data/mf-test.csv\", os.path.join(OUT_DIR, \"test_protbert.pkl\"))\n"
|
| 539 |
+
]
|
| 540 |
+
},
|
| 541 |
+
{
|
| 542 |
+
"cell_type": "code",
|
| 543 |
+
"execution_count": 27,
|
| 544 |
+
"id": "ad0c5421-e0a1-4a6a-8ace-2c69aeab0e0d",
|
| 545 |
+
"metadata": {},
|
| 546 |
+
"outputs": [
|
| 547 |
+
{
|
| 548 |
+
"name": "stdout",
|
| 549 |
+
"output_type": "stream",
|
| 550 |
+
"text": [
|
| 551 |
+
"✓ Corrigido: embeddings/train_protbert.pkl — 31142 exemplos, 597 GO terms\n",
|
| 552 |
+
"✓ Corrigido: embeddings/val_protbert.pkl — 1724 exemplos, 597 GO terms\n",
|
| 553 |
+
"✓ Corrigido: embeddings/test_protbert.pkl — 1724 exemplos, 597 GO terms\n"
|
| 554 |
+
]
|
| 555 |
+
}
|
| 556 |
+
],
|
| 557 |
+
"source": [
|
| 558 |
+
"import pandas as pd\n",
|
| 559 |
+
"import joblib\n",
|
| 560 |
+
"from sklearn.preprocessing import MultiLabelBinarizer\n",
|
| 561 |
+
"\n",
|
| 562 |
+
"# Obter GO terms do ficheiro de teste\n",
|
| 563 |
+
"df_test = pd.read_csv(\"data/mf-test.csv\")\n",
|
| 564 |
+
"test_terms = sorted(set(term for row in df_test[\"go_terms\"].str.split(\";\") for term in row))\n",
|
| 565 |
+
"\n",
|
| 566 |
+
"# Função para corrigir um .pkl com base nos GO terms do teste\n",
|
| 567 |
+
"def patch_to_common_terms(csv_path, pkl_path, common_terms):\n",
|
| 568 |
+
" df = pd.read_csv(csv_path)\n",
|
| 569 |
+
" terms_split = df[\"go_terms\"].str.split(\";\")\n",
|
| 570 |
+
" \n",
|
| 571 |
+
" # Apenas termos presentes nos common_terms\n",
|
| 572 |
+
" terms_filtered = terms_split.apply(lambda lst: [t for t in lst if t in common_terms])\n",
|
| 573 |
+
" \n",
|
| 574 |
+
" mlb = MultiLabelBinarizer(classes=common_terms)\n",
|
| 575 |
+
" Y = mlb.fit_transform(terms_filtered)\n",
|
| 576 |
+
"\n",
|
| 577 |
+
" data = joblib.load(pkl_path)\n",
|
| 578 |
+
" data[\"labels\"] = Y\n",
|
| 579 |
+
" data[\"go_terms\"] = mlb.classes_.tolist()\n",
|
| 580 |
+
" \n",
|
| 581 |
+
" joblib.dump(data, pkl_path, compress=3)\n",
|
| 582 |
+
" print(f\"✓ Corrigido: {pkl_path} — {Y.shape[0]} exemplos, {Y.shape[1]} GO terms\")\n",
|
| 583 |
+
"\n",
|
| 584 |
+
"# Aplicar às 3 partições\n",
|
| 585 |
+
"patch_to_common_terms(\"data/mf-training.csv\", \"embeddings/train_protbert.pkl\", test_terms)\n",
|
| 586 |
+
"patch_to_common_terms(\"data/mf-validation.csv\", \"embeddings/val_protbert.pkl\", test_terms)\n",
|
| 587 |
+
"patch_to_common_terms(\"data/mf-test.csv\", \"embeddings/test_protbert.pkl\", test_terms)\n",
|
| 588 |
+
"\n"
|
| 589 |
+
]
|
| 590 |
+
},
|
| 591 |
+
{
|
| 592 |
+
"cell_type": "code",
|
| 593 |
+
"execution_count": 2,
|
| 594 |
+
"id": "1785d8a9-23fc-4490-8d71-29cc91a4cb57",
|
| 595 |
+
"metadata": {},
|
| 596 |
+
"outputs": [
|
| 597 |
+
{
|
| 598 |
+
"name": "stdout",
|
| 599 |
+
"output_type": "stream",
|
| 600 |
+
"text": [
|
| 601 |
+
"✓ Embeddings carregados: (31142, 1024) → 597 GO terms\n",
|
| 602 |
+
"Epoch 1/100\n",
|
| 603 |
+
"974/974 [==============================] - 11s 11ms/step - loss: 0.0357 - binary_accuracy: 0.9894 - val_loss: 0.0334 - val_binary_accuracy: 0.9902\n",
|
| 604 |
+
"Epoch 2/100\n",
|
| 605 |
+
"974/974 [==============================] - 10s 11ms/step - loss: 0.0276 - binary_accuracy: 0.9914 - val_loss: 0.0328 - val_binary_accuracy: 0.9901\n",
|
| 606 |
+
"Epoch 3/100\n",
|
| 607 |
+
"974/974 [==============================] - 11s 11ms/step - loss: 0.0268 - binary_accuracy: 0.9916 - val_loss: 0.0326 - val_binary_accuracy: 0.9904\n",
|
| 608 |
+
"Epoch 4/100\n",
|
| 609 |
+
"974/974 [==============================] - 11s 11ms/step - loss: 0.0264 - binary_accuracy: 0.9917 - val_loss: 0.0321 - val_binary_accuracy: 0.9902\n",
|
| 610 |
+
"Epoch 5/100\n",
|
| 611 |
+
"974/974 [==============================] - 11s 12ms/step - loss: 0.0260 - binary_accuracy: 0.9918 - val_loss: 0.0318 - val_binary_accuracy: 0.9903\n",
|
| 612 |
+
"Epoch 6/100\n",
|
| 613 |
+
"974/974 [==============================] - 11s 11ms/step - loss: 0.0257 - binary_accuracy: 0.9918 - val_loss: 0.0326 - val_binary_accuracy: 0.9903\n",
|
| 614 |
+
"Epoch 7/100\n",
|
| 615 |
+
"974/974 [==============================] - 11s 12ms/step - loss: 0.0255 - binary_accuracy: 0.9919 - val_loss: 0.0321 - val_binary_accuracy: 0.9906\n",
|
| 616 |
+
"Epoch 8/100\n",
|
| 617 |
+
"974/974 [==============================] - 11s 11ms/step - loss: 0.0252 - binary_accuracy: 0.9919 - val_loss: 0.0329 - val_binary_accuracy: 0.9904\n",
|
| 618 |
+
"Epoch 9/100\n",
|
| 619 |
+
"974/974 [==============================] - 11s 11ms/step - loss: 0.0251 - binary_accuracy: 0.9919 - val_loss: 0.0320 - val_binary_accuracy: 0.9905\n",
|
| 620 |
+
"Epoch 10/100\n",
|
| 621 |
+
"974/974 [==============================] - 11s 11ms/step - loss: 0.0249 - binary_accuracy: 0.9920 - val_loss: 0.0318 - val_binary_accuracy: 0.9904\n",
|
| 622 |
+
"54/54 [==============================] - 0s 2ms/step\n",
|
| 623 |
+
"Previsões guardadas em mf-protbert-pam1.npy\n",
|
| 624 |
+
"Modelos guardado em models/\n"
|
| 625 |
+
]
|
| 626 |
+
}
|
| 627 |
+
],
|
| 628 |
+
"source": [
|
| 629 |
+
"import tensorflow as tf\n",
|
| 630 |
+
"import joblib\n",
|
| 631 |
+
"import numpy as np\n",
|
| 632 |
+
"from tensorflow.keras import Input\n",
|
| 633 |
+
"from tensorflow.keras.models import Sequential\n",
|
| 634 |
+
"from tensorflow.keras.layers import Dense, Dropout\n",
|
| 635 |
+
"from tensorflow.keras.callbacks import EarlyStopping\n",
|
| 636 |
+
"\n",
|
| 637 |
+
"# Carregar embeddings\n",
|
| 638 |
+
"train = joblib.load(\"embeddings/train_protbert.pkl\")\n",
|
| 639 |
+
"val = joblib.load(\"embeddings/val_protbert.pkl\")\n",
|
| 640 |
+
"test = joblib.load(\"embeddings/test_protbert.pkl\")\n",
|
| 641 |
+
"\n",
|
| 642 |
+
"X_train, y_train = train[\"embeddings\"], train[\"labels\"]\n",
|
| 643 |
+
"X_val, y_val = val[\"embeddings\"], val[\"labels\"]\n",
|
| 644 |
+
"X_test, y_test = test[\"embeddings\"], test[\"labels\"]\n",
|
| 645 |
+
"\n",
|
| 646 |
+
"print(f\"✓ Embeddings carregados: {X_train.shape} → {y_train.shape[1]} GO terms\")\n",
|
| 647 |
+
"\n",
|
| 648 |
+
"# Garantir consistência de classes\n",
|
| 649 |
+
"max_classes = y_train.shape[1] # 602 GO terms (do treino)\n",
|
| 650 |
+
"\n",
|
| 651 |
+
"def pad_labels(y, target_dim=max_classes):\n",
|
| 652 |
+
" if y.shape[1] < target_dim:\n",
|
| 653 |
+
" padding = np.zeros((y.shape[0], target_dim - y.shape[1]), dtype=np.int8)\n",
|
| 654 |
+
" return np.hstack([y, padding])\n",
|
| 655 |
+
" return y\n",
|
| 656 |
+
"\n",
|
| 657 |
+
"y_val = pad_labels(y_val)\n",
|
| 658 |
+
"y_test = pad_labels(y_test)\n",
|
| 659 |
+
"\n",
|
| 660 |
+
"# Modelo MLP\n",
|
| 661 |
+
"model = Sequential([\n",
|
| 662 |
+
" Dense(1024, activation=\"relu\", input_shape=(X_train.shape[1],)),\n",
|
| 663 |
+
" Dropout(0.3),\n",
|
| 664 |
+
" Dense(512, activation=\"relu\"),\n",
|
| 665 |
+
" Dropout(0.3),\n",
|
| 666 |
+
" Dense(max_classes, activation=\"sigmoid\")\n",
|
| 667 |
+
"])\n",
|
| 668 |
+
"\n",
|
| 669 |
+
"model.compile(loss=\"binary_crossentropy\",\n",
|
| 670 |
+
" optimizer=\"adam\",\n",
|
| 671 |
+
" metrics=[\"binary_accuracy\"])\n",
|
| 672 |
+
"\n",
|
| 673 |
+
"# Early stopping e treino\n",
|
| 674 |
+
"callbacks = [\n",
|
| 675 |
+
" EarlyStopping(monitor=\"val_loss\", patience=5, restore_best_weights=True)\n",
|
| 676 |
+
"]\n",
|
| 677 |
+
"\n",
|
| 678 |
+
"model.fit(X_train, y_train,\n",
|
| 679 |
+
" validation_data=(X_val, y_val),\n",
|
| 680 |
+
" epochs=100,\n",
|
| 681 |
+
" batch_size=32,\n",
|
| 682 |
+
" callbacks=callbacks,\n",
|
| 683 |
+
" verbose=1)\n",
|
| 684 |
+
"\n",
|
| 685 |
+
"# Previsões\n",
|
| 686 |
+
"y_prob = model.predict(X_test)\n",
|
| 687 |
+
"np.save(\"predictions/mf-protbert-pam1.npy\", y_prob)\n",
|
| 688 |
+
"print(\"Previsões guardadas em mf-protbert-pam1.npy\")\n",
|
| 689 |
+
"\n",
|
| 690 |
+
"# Modelo\n",
|
| 691 |
+
"model.save(\"models/mlp_protbert.h5\")\n",
|
| 692 |
+
"model.save(\"models/mlp_protbert.keras\")\n",
|
| 693 |
+
"print(\"Modelos guardado em models/\")"
|
| 694 |
+
]
|
| 695 |
+
},
|
| 696 |
+
{
|
| 697 |
+
"cell_type": "code",
|
| 698 |
+
"execution_count": 3,
|
| 699 |
+
"id": "fdb66630-76dc-43a0-bd56-45052175fdba",
|
| 700 |
+
"metadata": {},
|
| 701 |
+
"outputs": [
|
| 702 |
+
{
|
| 703 |
+
"name": "stdout",
|
| 704 |
+
"output_type": "stream",
|
| 705 |
+
"text": [
|
| 706 |
+
"go.obo: fmt(1.2) rel(2025-03-16) 43,544 Terms\n",
|
| 707 |
+
"✓ Embeddings: (1724, 597) labels × 597 GO terms\n",
|
| 708 |
+
"\n",
|
| 709 |
+
"📊 Resultados finais (ProtBERT + PAM1 + propagação):\n",
|
| 710 |
+
"Fmax = 0.6611\n",
|
| 711 |
+
"Thr. = 0.45\n",
|
| 712 |
+
"AuPRC = 0.6951\n",
|
| 713 |
+
"Smin = 13.4386\n"
|
| 714 |
+
]
|
| 715 |
+
}
|
| 716 |
+
],
|
| 717 |
+
"source": [
|
| 718 |
+
"import numpy as np\n",
|
| 719 |
+
"from sklearn.metrics import precision_recall_curve, auc\n",
|
| 720 |
+
"from goatools.obo_parser import GODag\n",
|
| 721 |
+
"import joblib\n",
|
| 722 |
+
"import math\n",
|
| 723 |
+
"\n",
|
| 724 |
+
"# Parâmetros\n",
|
| 725 |
+
"GO_FILE = \"go.obo\"\n",
|
| 726 |
+
"THRESHOLDS = np.arange(0.0, 1.01, 0.01)\n",
|
| 727 |
+
"ALPHA = 0.5\n",
|
| 728 |
+
"\n",
|
| 729 |
+
"# Carregar dados\n",
|
| 730 |
+
"test = joblib.load(\"embeddings/test_protbert.pkl\")\n",
|
| 731 |
+
"y_true = test[\"labels\"]\n",
|
| 732 |
+
"terms = test[\"go_terms\"]\n",
|
| 733 |
+
"y_prob = np.load(\"predictions/mf-protbert-pam1.npy\")\n",
|
| 734 |
+
"go_dag = GODag(GO_FILE)\n",
|
| 735 |
+
"\n",
|
| 736 |
+
"print(f\"✓ Embeddings: {y_true.shape} labels × {len(terms)} GO terms\")\n",
|
| 737 |
+
"\n",
|
| 738 |
+
"# Fmax\n",
|
| 739 |
+
"def compute_fmax(y_true, y_prob, thresholds):\n",
|
| 740 |
+
" fmax, best_thr = 0, 0\n",
|
| 741 |
+
" for t in thresholds:\n",
|
| 742 |
+
" y_pred = (y_prob >= t).astype(int)\n",
|
| 743 |
+
" tp = (y_true * y_pred).sum(axis=1)\n",
|
| 744 |
+
" fp = ((1 - y_true) * y_pred).sum(axis=1)\n",
|
| 745 |
+
" fn = (y_true * (1 - y_pred)).sum(axis=1)\n",
|
| 746 |
+
" precision = tp / (tp + fp + 1e-8)\n",
|
| 747 |
+
" recall = tp / (tp + fn + 1e-8)\n",
|
| 748 |
+
" f1 = 2 * precision * recall / (precision + recall + 1e-8)\n",
|
| 749 |
+
" avg_f1 = np.mean(f1)\n",
|
| 750 |
+
" if avg_f1 > fmax:\n",
|
| 751 |
+
" fmax, best_thr = avg_f1, t\n",
|
| 752 |
+
" return fmax, best_thr\n",
|
| 753 |
+
"\n",
|
| 754 |
+
"# AuPRC micro\n",
|
| 755 |
+
"def compute_auprc(y_true, y_prob):\n",
|
| 756 |
+
" precision, recall, _ = precision_recall_curve(y_true.ravel(), y_prob.ravel())\n",
|
| 757 |
+
" return auc(recall, precision)\n",
|
| 758 |
+
"\n",
|
| 759 |
+
"# Smin\n",
|
| 760 |
+
"def compute_smin(y_true, y_prob, terms, threshold, go_dag, alpha=ALPHA):\n",
|
| 761 |
+
" y_pred = (y_prob >= threshold).astype(int)\n",
|
| 762 |
+
" ic = {}\n",
|
| 763 |
+
" total = (y_true + y_pred).sum(axis=0).sum()\n",
|
| 764 |
+
" for i, term in enumerate(terms):\n",
|
| 765 |
+
" freq = (y_true[:, i] + y_pred[:, i]).sum()\n",
|
| 766 |
+
" ic[term] = -np.log((freq + 1e-8) / total)\n",
|
| 767 |
+
"\n",
|
| 768 |
+
" s_values = []\n",
|
| 769 |
+
" for true_vec, pred_vec in zip(y_true, y_pred):\n",
|
| 770 |
+
" true_terms = {terms[i] for i in np.where(true_vec)[0]}\n",
|
| 771 |
+
" pred_terms = {terms[i] for i in np.where(pred_vec)[0]}\n",
|
| 772 |
+
"\n",
|
| 773 |
+
" anc_true = set()\n",
|
| 774 |
+
" for t in true_terms:\n",
|
| 775 |
+
" if t in go_dag:\n",
|
| 776 |
+
" anc_true |= go_dag[t].get_all_parents()\n",
|
| 777 |
+
" anc_pred = set()\n",
|
| 778 |
+
" for t in pred_terms:\n",
|
| 779 |
+
" if t in go_dag:\n",
|
| 780 |
+
" anc_pred |= go_dag[t].get_all_parents()\n",
|
| 781 |
+
"\n",
|
| 782 |
+
" ru = pred_terms - true_terms\n",
|
| 783 |
+
" mi = true_terms - pred_terms\n",
|
| 784 |
+
" dist_ru = sum(ic.get(t, 0) for t in ru)\n",
|
| 785 |
+
" dist_mi = sum(ic.get(t, 0) for t in mi)\n",
|
| 786 |
+
" s = math.sqrt((alpha * dist_ru)**2 + ((1 - alpha) * dist_mi)**2)\n",
|
| 787 |
+
" s_values.append(s)\n",
|
| 788 |
+
"\n",
|
| 789 |
+
" return np.mean(s_values)\n",
|
| 790 |
+
"\n",
|
| 791 |
+
"# --- 6. Avaliar ----------------------------------------------------------\n",
|
| 792 |
+
"fmax, thr = compute_fmax(y_true, y_prob, THRESHOLDS)\n",
|
| 793 |
+
"auprc = compute_auprc(y_true, y_prob)\n",
|
| 794 |
+
"smin = compute_smin(y_true, y_prob, terms, thr, go_dag)\n",
|
| 795 |
+
"\n",
|
| 796 |
+
"print(f\"\\n📊 Resultados finais (ProtBERT + PAM1 + propagação):\")\n",
|
| 797 |
+
"print(f\"Fmax = {fmax:.4f}\")\n",
|
| 798 |
+
"print(f\"Thr. = {thr:.2f}\")\n",
|
| 799 |
+
"print(f\"AuPRC = {auprc:.4f}\")\n",
|
| 800 |
+
"print(f\"Smin = {smin:.4f}\")\n"
|
| 801 |
+
]
|
| 802 |
+
}
|
| 803 |
+
],
|
| 804 |
+
"metadata": {
|
| 805 |
+
"kernelspec": {
|
| 806 |
+
"display_name": "Python 3 (ipykernel)",
|
| 807 |
+
"language": "python",
|
| 808 |
+
"name": "python3"
|
| 809 |
+
},
|
| 810 |
+
"language_info": {
|
| 811 |
+
"codemirror_mode": {
|
| 812 |
+
"name": "ipython",
|
| 813 |
+
"version": 3
|
| 814 |
+
},
|
| 815 |
+
"file_extension": ".py",
|
| 816 |
+
"mimetype": "text/x-python",
|
| 817 |
+
"name": "python",
|
| 818 |
+
"nbconvert_exporter": "python",
|
| 819 |
+
"pygments_lexer": "ipython3",
|
| 820 |
+
"version": "3.10.16"
|
| 821 |
+
}
|
| 822 |
+
},
|
| 823 |
+
"nbformat": 4,
|
| 824 |
+
"nbformat_minor": 5
|
| 825 |
+
}
|
notebooks/mlp_protbertbfd.ipynb
ADDED
|
@@ -0,0 +1,802 @@
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"id": "c6dbc330-062a-48f0-8242-3f21cc1c9c2b",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [
|
| 9 |
+
{
|
| 10 |
+
"name": "stdout",
|
| 11 |
+
"output_type": "stream",
|
| 12 |
+
"text": [
|
| 13 |
+
"go.obo: fmt(1.2) rel(2025-03-16) 43,544 Terms\n",
|
| 14 |
+
"✓ Ficheiros criados:\n",
|
| 15 |
+
" - data/mf-training.csv : (31142, 3)\n",
|
| 16 |
+
" - data/mf-validation.csv: (1724, 3)\n",
|
| 17 |
+
" - data/mf-test.csv : (1724, 3)\n",
|
| 18 |
+
"GO terms únicos (após propagação e filtro): 602\n"
|
| 19 |
+
]
|
| 20 |
+
}
|
| 21 |
+
],
|
| 22 |
+
"source": [
|
| 23 |
+
"import pandas as pd\n",
|
| 24 |
+
"from Bio import SeqIO\n",
|
| 25 |
+
"from collections import Counter\n",
|
| 26 |
+
"from goatools.obo_parser import GODag\n",
|
| 27 |
+
"from sklearn.model_selection import train_test_split\n",
|
| 28 |
+
"from sklearn.preprocessing import MultiLabelBinarizer\n",
|
| 29 |
+
"from iterstrat.ml_stratifiers import MultilabelStratifiedKFold\n",
|
| 30 |
+
"import numpy as np\n",
|
| 31 |
+
"import os\n",
|
| 32 |
+
"\n",
|
| 33 |
+
"# Carregar GO anotações\n",
|
| 34 |
+
"annotations = pd.read_csv(\"uniprot_sprot_exp.txt\", sep=\"\\t\", names=[\"protein_id\", \"go_term\", \"go_category\"])\n",
|
| 35 |
+
"annotations_f = annotations[annotations[\"go_category\"] == \"F\"]\n",
|
| 36 |
+
"\n",
|
| 37 |
+
"# Carregar DAG e propagar GO terms\n",
|
| 38 |
+
"# propagação hierárquica\n",
|
| 39 |
+
"# https://geneontology.org/docs/download-ontology/\n",
|
| 40 |
+
"go_dag = GODag(\"go.obo\")\n",
|
| 41 |
+
"mf_terms = {t for t, o in go_dag.items() if o.namespace == \"molecular_function\"}\n",
|
| 42 |
+
"\n",
|
| 43 |
+
"def propagate_terms(term_list):\n",
|
| 44 |
+
" full = set()\n",
|
| 45 |
+
" for t in term_list:\n",
|
| 46 |
+
" if t not in go_dag:\n",
|
| 47 |
+
" continue\n",
|
| 48 |
+
" full.add(t)\n",
|
| 49 |
+
" full.update(go_dag[t].get_all_parents())\n",
|
| 50 |
+
" return list(full & mf_terms)\n",
|
| 51 |
+
"\n",
|
| 52 |
+
"# Carregar sequências\n",
|
| 53 |
+
"seqs, ids = [], []\n",
|
| 54 |
+
"for record in SeqIO.parse(\"uniprot_sprot_exp.fasta\", \"fasta\"):\n",
|
| 55 |
+
" ids.append(record.id)\n",
|
| 56 |
+
" seqs.append(str(record.seq))\n",
|
| 57 |
+
"\n",
|
| 58 |
+
"seq_df = pd.DataFrame({\"protein_id\": ids, \"sequence\": seqs})\n",
|
| 59 |
+
"\n",
|
| 60 |
+
"# Juntar com GO anotado e propagar\n",
|
| 61 |
+
"grouped = annotations_f.groupby(\"protein_id\")[\"go_term\"].apply(list).reset_index()\n",
|
| 62 |
+
"data = seq_df.merge(grouped, on=\"protein_id\")\n",
|
| 63 |
+
"data = data[data[\"go_term\"].apply(len) > 0]\n",
|
| 64 |
+
"data[\"go_term\"] = data[\"go_term\"].apply(propagate_terms)\n",
|
| 65 |
+
"data = data[data[\"go_term\"].apply(len) > 0]\n",
|
| 66 |
+
"\n",
|
| 67 |
+
"# Filtrar GO terms raros\n",
|
| 68 |
+
"# todos os terms com menos de 50 proteinas associadas\n",
|
| 69 |
+
"all_terms = [term for sublist in data[\"go_term\"] for term in sublist]\n",
|
| 70 |
+
"term_counts = Counter(all_terms)\n",
|
| 71 |
+
"valid_terms = {term for term, count in term_counts.items() if count >= 50}\n",
|
| 72 |
+
"data[\"go_term\"] = data[\"go_term\"].apply(lambda terms: [t for t in terms if t in valid_terms])\n",
|
| 73 |
+
"data = data[data[\"go_term\"].apply(len) > 0]\n",
|
| 74 |
+
"\n",
|
| 75 |
+
"# Preparar dataset final\n",
|
| 76 |
+
"data[\"go_terms\"] = data[\"go_term\"].apply(lambda x: ';'.join(sorted(set(x))))\n",
|
| 77 |
+
"data = data[[\"protein_id\", \"sequence\", \"go_terms\"]].drop_duplicates()\n",
|
| 78 |
+
"\n",
|
| 79 |
+
"# Binarizar labels e dividir\n",
|
| 80 |
+
"mlb = MultiLabelBinarizer()\n",
|
| 81 |
+
"Y = mlb.fit_transform(data[\"go_terms\"].str.split(\";\"))\n",
|
| 82 |
+
"X = data[[\"protein_id\", \"sequence\"]].values\n",
|
| 83 |
+
"\n",
|
| 84 |
+
"mskf = MultilabelStratifiedKFold(n_splits=10, random_state=42, shuffle=True)\n",
|
| 85 |
+
"train_idx, temp_idx = next(mskf.split(X, Y))\n",
|
| 86 |
+
"val_idx, test_idx = np.array_split(temp_idx, 2)\n",
|
| 87 |
+
"\n",
|
| 88 |
+
"df_train = data.iloc[train_idx].copy()\n",
|
| 89 |
+
"df_val = data.iloc[val_idx].copy()\n",
|
| 90 |
+
"df_test = data.iloc[test_idx].copy()\n",
|
| 91 |
+
"\n",
|
| 92 |
+
"# Guardar em CSV\n",
|
| 93 |
+
"os.makedirs(\"data\", exist_ok=True)\n",
|
| 94 |
+
"df_train.to_csv(\"data/mf-training.csv\", index=False)\n",
|
| 95 |
+
"df_val.to_csv(\"data/mf-validation.csv\", index=False)\n",
|
| 96 |
+
"df_test.to_csv(\"data/mf-test.csv\", index=False)\n",
|
| 97 |
+
"\n",
|
| 98 |
+
"# Confirmar\n",
|
| 99 |
+
"print(\"✓ Ficheiros criados:\")\n",
|
| 100 |
+
"print(\" - data/mf-training.csv :\", df_train.shape)\n",
|
| 101 |
+
"print(\" - data/mf-validation.csv:\", df_val.shape)\n",
|
| 102 |
+
"print(\" - data/mf-test.csv :\", df_test.shape)\n",
|
| 103 |
+
"print(f\"GO terms únicos (após propagação e filtro): {len(mlb.classes_)}\")\n"
|
| 104 |
+
]
|
| 105 |
+
},
|
| 106 |
+
{
|
| 107 |
+
"cell_type": "code",
|
| 108 |
+
"execution_count": 2,
|
| 109 |
+
"id": "6cf7aaa6-4941-4951-8d73-1f4f1f4362f3",
|
| 110 |
+
"metadata": {},
|
| 111 |
+
"outputs": [
|
| 112 |
+
{
|
| 113 |
+
"name": "stderr",
|
| 114 |
+
"output_type": "stream",
|
| 115 |
+
"text": [
|
| 116 |
+
"C:\\Users\\Melvin\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
| 117 |
+
" from .autonotebook import tqdm as notebook_tqdm\n",
|
| 118 |
+
"C:\\Users\\Melvin\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\transformers\\utils\\generic.py:441: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.\n",
|
| 119 |
+
" _torch_pytree._register_pytree_node(\n",
|
| 120 |
+
"100%|██████████| 31142/31142 [00:26<00:00, 1192.86it/s]\n",
|
| 121 |
+
"100%|██████████| 1724/1724 [00:00<00:00, 2570.68it/s]\n",
|
| 122 |
+
"C:\\Users\\Melvin\\anaconda3\\envs\\projeto_proteina2\\lib\\site-packages\\ktrain\\text\\preprocessor.py:382: UserWarning: The class_names argument is replacing the classes argument. Please update your code.\n",
|
| 123 |
+
" warnings.warn(\n"
|
| 124 |
+
]
|
| 125 |
+
},
|
| 126 |
+
{
|
| 127 |
+
"name": "stdout",
|
| 128 |
+
"output_type": "stream",
|
| 129 |
+
"text": [
|
| 130 |
+
"preprocessing train...\n",
|
| 131 |
+
"language: en\n",
|
| 132 |
+
"train sequence lengths:\n",
|
| 133 |
+
"\tmean : 423\n",
|
| 134 |
+
"\t95percentile : 604\n",
|
| 135 |
+
"\t99percentile : 715\n"
|
| 136 |
+
]
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"data": {
|
| 140 |
+
"text/html": [
|
| 141 |
+
"\n",
|
| 142 |
+
"<style>\n",
|
| 143 |
+
" /* Turns off some styling */\n",
|
| 144 |
+
" progress {\n",
|
| 145 |
+
" /* gets rid of default border in Firefox and Opera. */\n",
|
| 146 |
+
" border: none;\n",
|
| 147 |
+
" /* Needs to be in here for Safari polyfill so background images work as expected. */\n",
|
| 148 |
+
" background-size: auto;\n",
|
| 149 |
+
" }\n",
|
| 150 |
+
" progress:not([value]), progress:not([value])::-webkit-progress-bar {\n",
|
| 151 |
+
" background: repeating-linear-gradient(45deg, #7e7e7e, #7e7e7e 10px, #5c5c5c 10px, #5c5c5c 20px);\n",
|
| 152 |
+
" }\n",
|
| 153 |
+
" .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar {\n",
|
| 154 |
+
" background: #F44336;\n",
|
| 155 |
+
" }\n",
|
| 156 |
+
"</style>\n"
|
| 157 |
+
],
|
| 158 |
+
"text/plain": [
|
| 159 |
+
"<IPython.core.display.HTML object>"
|
| 160 |
+
]
|
| 161 |
+
},
|
| 162 |
+
"metadata": {},
|
| 163 |
+
"output_type": "display_data"
|
| 164 |
+
},
|
| 165 |
+
{
|
| 166 |
+
"data": {
|
| 167 |
+
"text/html": [],
|
| 168 |
+
"text/plain": [
|
| 169 |
+
"<IPython.core.display.HTML object>"
|
| 170 |
+
]
|
| 171 |
+
},
|
| 172 |
+
"metadata": {},
|
| 173 |
+
"output_type": "display_data"
|
| 174 |
+
},
|
| 175 |
+
{
|
| 176 |
+
"name": "stdout",
|
| 177 |
+
"output_type": "stream",
|
| 178 |
+
"text": [
|
| 179 |
+
"Is Multi-Label? True\n",
|
| 180 |
+
"preprocessing test...\n",
|
| 181 |
+
"language: en\n",
|
| 182 |
+
"test sequence lengths:\n",
|
| 183 |
+
"\tmean : 408\n",
|
| 184 |
+
"\t95percentile : 603\n",
|
| 185 |
+
"\t99percentile : 714\n"
|
| 186 |
+
]
|
| 187 |
+
},
|
| 188 |
+
{
|
| 189 |
+
"data": {
|
| 190 |
+
"text/html": [
|
| 191 |
+
"\n",
|
| 192 |
+
"<style>\n",
|
| 193 |
+
" /* Turns off some styling */\n",
|
| 194 |
+
" progress {\n",
|
| 195 |
+
" /* gets rid of default border in Firefox and Opera. */\n",
|
| 196 |
+
" border: none;\n",
|
| 197 |
+
" /* Needs to be in here for Safari polyfill so background images work as expected. */\n",
|
| 198 |
+
" background-size: auto;\n",
|
| 199 |
+
" }\n",
|
| 200 |
+
" progress:not([value]), progress:not([value])::-webkit-progress-bar {\n",
|
| 201 |
+
" background: repeating-linear-gradient(45deg, #7e7e7e, #7e7e7e 10px, #5c5c5c 10px, #5c5c5c 20px);\n",
|
| 202 |
+
" }\n",
|
| 203 |
+
" .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar {\n",
|
| 204 |
+
" background: #F44336;\n",
|
| 205 |
+
" }\n",
|
| 206 |
+
"</style>\n"
|
| 207 |
+
],
|
| 208 |
+
"text/plain": [
|
| 209 |
+
"<IPython.core.display.HTML object>"
|
| 210 |
+
]
|
| 211 |
+
},
|
| 212 |
+
"metadata": {},
|
| 213 |
+
"output_type": "display_data"
|
| 214 |
+
},
|
| 215 |
+
{
|
| 216 |
+
"data": {
|
| 217 |
+
"text/html": [],
|
| 218 |
+
"text/plain": [
|
| 219 |
+
"<IPython.core.display.HTML object>"
|
| 220 |
+
]
|
| 221 |
+
},
|
| 222 |
+
"metadata": {},
|
| 223 |
+
"output_type": "display_data"
|
| 224 |
+
},
|
| 225 |
+
{
|
| 226 |
+
"name": "stdout",
|
| 227 |
+
"output_type": "stream",
|
| 228 |
+
"text": [
|
| 229 |
+
"\n",
|
| 230 |
+
"\n",
|
| 231 |
+
"begin training using triangular learning rate policy with max lr of 1e-05...\n",
|
| 232 |
+
"Epoch 1/10\n",
|
| 233 |
+
"40995/40995 [==============================] - 9020s 219ms/step - loss: 0.0740 - binary_accuracy: 0.9869 - val_loss: 0.0526 - val_binary_accuracy: 0.9866\n",
|
| 234 |
+
"Epoch 2/10\n",
|
| 235 |
+
"40995/40995 [==============================] - 8939s 218ms/step - loss: 0.0464 - binary_accuracy: 0.9877 - val_loss: 0.0457 - val_binary_accuracy: 0.9871\n",
|
| 236 |
+
"Epoch 3/10\n",
|
| 237 |
+
"40995/40995 [==============================] - 8881s 217ms/step - loss: 0.0413 - binary_accuracy: 0.9883 - val_loss: 0.0418 - val_binary_accuracy: 0.9877\n",
|
| 238 |
+
"Epoch 4/10\n",
|
| 239 |
+
"40995/40995 [==============================] - 10277s 251ms/step - loss: 0.0380 - binary_accuracy: 0.9888 - val_loss: 0.0396 - val_binary_accuracy: 0.9881\n",
|
| 240 |
+
"Epoch 5/10\n",
|
| 241 |
+
"40995/40995 [==============================] - 10565s 258ms/step - loss: 0.0357 - binary_accuracy: 0.9892 - val_loss: 0.0380 - val_binary_accuracy: 0.9883\n",
|
| 242 |
+
"Epoch 6/10\n",
|
| 243 |
+
"40995/40995 [==============================] - 10693s 261ms/step - loss: 0.0338 - binary_accuracy: 0.9895 - val_loss: 0.0369 - val_binary_accuracy: 0.9885\n",
|
| 244 |
+
"Epoch 7/10\n",
|
| 245 |
+
"40995/40995 [==============================] - 12055s 294ms/step - loss: 0.0323 - binary_accuracy: 0.9898 - val_loss: 0.0360 - val_binary_accuracy: 0.9888\n",
|
| 246 |
+
"Epoch 8/10\n",
|
| 247 |
+
"40995/40995 [==============================] - 10225s 249ms/step - loss: 0.0309 - binary_accuracy: 0.9901 - val_loss: 0.0353 - val_binary_accuracy: 0.9890\n",
|
| 248 |
+
"Epoch 9/10\n",
|
| 249 |
+
"40995/40995 [==============================] - 10308s 251ms/step - loss: 0.0297 - binary_accuracy: 0.9904 - val_loss: 0.0347 - val_binary_accuracy: 0.9891\n",
|
| 250 |
+
"Epoch 10/10\n",
|
| 251 |
+
"40995/40995 [==============================] - 10275s 251ms/step - loss: 0.0286 - binary_accuracy: 0.9907 - val_loss: 0.0346 - val_binary_accuracy: 0.9893\n",
|
| 252 |
+
"Weights from best epoch have been loaded into model.\n"
|
| 253 |
+
]
|
| 254 |
+
},
|
| 255 |
+
{
|
| 256 |
+
"data": {
|
| 257 |
+
"text/plain": [
|
| 258 |
+
"<keras.callbacks.History at 0x2b644b84fd0>"
|
| 259 |
+
]
|
| 260 |
+
},
|
| 261 |
+
"execution_count": 2,
|
| 262 |
+
"metadata": {},
|
| 263 |
+
"output_type": "execute_result"
|
| 264 |
+
}
|
| 265 |
+
],
|
| 266 |
+
"source": [
|
| 267 |
+
"import pandas as pd\n",
|
| 268 |
+
"import numpy as np\n",
|
| 269 |
+
"from tqdm import tqdm\n",
|
| 270 |
+
"import random\n",
|
| 271 |
+
"import os\n",
|
| 272 |
+
"import ktrain\n",
|
| 273 |
+
"from ktrain import text\n",
|
| 274 |
+
"from sklearn.preprocessing import MultiLabelBinarizer\n",
|
| 275 |
+
"\n",
|
| 276 |
+
"\n",
|
| 277 |
+
"# PAM1\n",
|
| 278 |
+
"# PAM matrix model of protein evolution\n",
|
| 279 |
+
"# DOI:10.1093/oxfordjournals.molbev.a040360\n",
|
| 280 |
+
"pam_data = {\n",
|
| 281 |
+
" 'A': [9948, 19, 27, 42, 31, 46, 50, 92, 17, 7, 40, 88, 42, 41, 122, 279, 255, 9, 72, 723],\n",
|
| 282 |
+
" 'R': [14, 9871, 24, 38, 37, 130, 38, 62, 49, 4, 58, 205, 26, 33, 47, 103, 104, 5, 36, 52],\n",
|
| 283 |
+
" 'N': [20, 22, 9860, 181, 29, 36, 41, 67, 31, 5, 22, 49, 23, 10, 33, 83, 66, 3, 43, 32],\n",
|
| 284 |
+
" 'D': [40, 34, 187, 9818, 11, 63, 98, 61, 23, 5, 25, 54, 43, 13, 27, 88, 55, 4, 29, 36],\n",
|
| 285 |
+
" 'C': [20, 16, 26, 9, 9987, 10, 17, 37, 12, 2, 16, 26, 10, 19, 27, 26, 25, 2, 6, 67],\n",
|
| 286 |
+
" 'Q': [29, 118, 29, 49, 8, 9816, 72, 55, 36, 4, 60, 158, 35, 22, 39, 86, 74, 3, 34, 28],\n",
|
| 287 |
+
" 'E': [35, 29, 41, 101, 12, 71, 9804, 56, 33, 5, 36, 107, 42, 20, 38, 87, 69, 4, 30, 42],\n",
|
| 288 |
+
" 'G': [96, 61, 77, 70, 38, 51, 58, 9868, 26, 6, 37, 53, 39, 28, 69, 134, 116, 5, 47, 60],\n",
|
| 289 |
+
" 'H': [17, 53, 33, 19, 15, 39, 34, 24, 9907, 3, 32, 57, 24, 15, 27, 47, 43, 2, 22, 19],\n",
|
| 290 |
+
" 'I': [6, 3, 6, 6, 3, 5, 6, 7, 3, 9973, 23, 13, 12, 41, 93, 84, 115, 3, 8, 102],\n",
|
| 291 |
+
" 'L': [26, 39, 17, 15, 7, 33, 22, 20, 19, 27, 9864, 49, 24, 78, 117, 148, 193, 5, 24, 70],\n",
|
| 292 |
+
" 'K': [60, 198, 43, 52, 12, 142, 96, 53, 42, 10, 63, 9710, 33, 26, 54, 109, 102, 5, 43, 42],\n",
|
| 293 |
+
" 'M': [21, 22, 15, 18, 6, 20, 18, 18, 17, 11, 27, 32, 9945, 26, 34, 61, 71, 3, 12, 31],\n",
|
| 294 |
+
" 'F': [18, 17, 8, 6, 8, 11, 10, 16, 10, 44, 92, 24, 29, 9899, 89, 88, 142, 7, 14, 68],\n",
|
| 295 |
+
" 'P': [97, 47, 35, 29, 23, 35, 38, 57, 21, 24, 47, 56, 28, 76, 9785, 115, 77, 4, 24, 35],\n",
|
| 296 |
+
" 'S': [241, 87, 76, 73, 17, 56, 60, 99, 32, 13, 69, 92, 42, 67, 100, 9605, 212, 8, 63, 70],\n",
|
| 297 |
+
" 'T': [186, 78, 54, 37, 14, 42, 42, 83, 28, 23, 84, 85, 53, 93, 66, 182, 9676, 8, 39, 90],\n",
|
| 298 |
+
" 'W': [2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 1, 5, 3, 4, 4, 9960, 3, 4],\n",
|
| 299 |
+
" 'Y': [29, 21, 17, 9, 4, 13, 9, 21, 10, 7, 20, 17, 11, 23, 19, 41, 31, 3, 9935, 23],\n",
|
| 300 |
+
" 'V': [368, 27, 18, 18, 50, 23, 34, 64, 15, 85, 72, 42, 33, 88, 42, 112, 137, 4, 20, 9514]\n",
|
| 301 |
+
"}\n",
|
| 302 |
+
"pam_raw = pd.DataFrame(pam_data, index=list(pam_data.keys()))\n",
|
| 303 |
+
"pam_matrix = pam_raw.div(pam_raw.sum(axis=1), axis=0)\n",
|
| 304 |
+
"list_amino = pam_raw.columns.tolist()\n",
|
| 305 |
+
"pam_dict = {\n",
|
| 306 |
+
" aa: {sub: pam_matrix.loc[aa, sub] for sub in list_amino}\n",
|
| 307 |
+
" for aa in list_amino\n",
|
| 308 |
+
"}\n",
|
| 309 |
+
"\n",
|
| 310 |
+
"def pam1_substitution(aa):\n",
|
| 311 |
+
" if aa not in pam_dict:\n",
|
| 312 |
+
" return aa\n",
|
| 313 |
+
" subs = list(pam_dict[aa].keys())\n",
|
| 314 |
+
" probs = list(pam_dict[aa].values())\n",
|
| 315 |
+
" return np.random.choice(subs, p=probs)\n",
|
| 316 |
+
"\n",
|
| 317 |
+
"def augment_sequence(seq, sub_prob=0.05):\n",
|
| 318 |
+
" return ''.join([pam1_substitution(aa) if random.random() < sub_prob else aa for aa in seq])\n",
|
| 319 |
+
"\n",
|
| 320 |
+
"def slice_sequence(seq, win=512):\n",
|
| 321 |
+
" return [seq[i:i+win] for i in range(0, len(seq), win)]\n",
|
| 322 |
+
"\n",
|
| 323 |
+
"def generate_data(df, augment=False):\n",
|
| 324 |
+
" X, y = [], []\n",
|
| 325 |
+
" label_cols = [col for col in df.columns if col.startswith(\"GO:\")]\n",
|
| 326 |
+
" for _, row in tqdm(df.iterrows(), total=len(df)):\n",
|
| 327 |
+
" seq = row[\"sequence\"]\n",
|
| 328 |
+
" if augment:\n",
|
| 329 |
+
" seq = augment_sequence(seq)\n",
|
| 330 |
+
" seq_slices = slice_sequence(seq)\n",
|
| 331 |
+
" X.extend(seq_slices)\n",
|
| 332 |
+
" lbl = row[label_cols].values.astype(int)\n",
|
| 333 |
+
" y.extend([lbl] * len(seq_slices))\n",
|
| 334 |
+
" return X, np.array(y), label_cols\n",
|
| 335 |
+
"\n",
|
| 336 |
+
"def format_sequence(seq): return \" \".join(list(seq))\n",
|
| 337 |
+
"\n",
|
| 338 |
+
"# Função para carregar e binarizar\n",
|
| 339 |
+
"def load_and_binarize(csv_path, mlb=None):\n",
|
| 340 |
+
" df = pd.read_csv(csv_path)\n",
|
| 341 |
+
" df[\"go_terms\"] = df[\"go_terms\"].str.split(\";\")\n",
|
| 342 |
+
" if mlb is None:\n",
|
| 343 |
+
" mlb = MultiLabelBinarizer()\n",
|
| 344 |
+
" labels = mlb.fit_transform(df[\"go_terms\"])\n",
|
| 345 |
+
" else:\n",
|
| 346 |
+
" labels = mlb.transform(df[\"go_terms\"])\n",
|
| 347 |
+
" labels_df = pd.DataFrame(labels, columns=mlb.classes_)\n",
|
| 348 |
+
" df = df.reset_index(drop=True).join(labels_df)\n",
|
| 349 |
+
" return df, mlb\n",
|
| 350 |
+
"\n",
|
| 351 |
+
"# Carregar os dados\n",
|
| 352 |
+
"df_train, mlb = load_and_binarize(\"data/mf-training.csv\")\n",
|
| 353 |
+
"df_val, _ = load_and_binarize(\"data/mf-validation.csv\", mlb=mlb)\n",
|
| 354 |
+
"\n",
|
| 355 |
+
"# Gerar com augmentation no treino\n",
|
| 356 |
+
"X_train, y_train, term_cols = generate_data(df_train, augment=True)\n",
|
| 357 |
+
"X_val, y_val, _ = generate_data(df_val, augment=False)\n",
|
| 358 |
+
"\n",
|
| 359 |
+
"# Preparar texto para tokenizer\n",
|
| 360 |
+
"X_train_fmt = list(map(format_sequence, X_train))\n",
|
| 361 |
+
"X_val_fmt = list(map(format_sequence, X_val))\n",
|
| 362 |
+
"\n",
|
| 363 |
+
"# Fine-tune ProtBERT-BFD\n",
|
| 364 |
+
"# https://huggingface.co/Rostlab/prot_bert_bfd\n",
|
| 365 |
+
"# https://doi.org/10.1093/bioinformatics/btac020\n",
|
| 366 |
+
"# Dados de treino -> BFD (Big Fantastic Database) (2.1 bilhões de sequências)\n",
|
| 367 |
+
"MODEL_NAME = \"Rostlab/prot_bert_bfd\"\n",
|
| 368 |
+
"MAX_LEN = 512\n",
|
| 369 |
+
"BATCH_SIZE = 1\n",
|
| 370 |
+
"\n",
|
| 371 |
+
"t = text.Transformer(MODEL_NAME, maxlen=MAX_LEN, classes=term_cols)\n",
|
| 372 |
+
"trn = t.preprocess_train(X_train_fmt, y_train)\n",
|
| 373 |
+
"val = t.preprocess_test(X_val_fmt, y_val)\n",
|
| 374 |
+
"\n",
|
| 375 |
+
"model = t.get_classifier()\n",
|
| 376 |
+
"learner = ktrain.get_learner(model,\n",
|
| 377 |
+
" train_data=trn,\n",
|
| 378 |
+
" val_data=val,\n",
|
| 379 |
+
" batch_size=BATCH_SIZE)\n",
|
| 380 |
+
"\n",
|
| 381 |
+
"learner.autofit(lr=1e-5,\n",
|
| 382 |
+
" epochs=10,\n",
|
| 383 |
+
" early_stopping=1,\n",
|
| 384 |
+
" checkpoint_folder=\"mf-fine-tuned-protbertbfd\")\n"
|
| 385 |
+
]
|
| 386 |
+
},
|
| 387 |
+
{
|
| 388 |
+
"cell_type": "code",
|
| 389 |
+
"execution_count": 8,
|
| 390 |
+
"id": "9b39c439-5708-4787-bfee-d3a4d3aa190d",
|
| 391 |
+
"metadata": {},
|
| 392 |
+
"outputs": [
|
| 393 |
+
{
|
| 394 |
+
"name": "stdout",
|
| 395 |
+
"output_type": "stream",
|
| 396 |
+
"text": [
|
| 397 |
+
"✓ Tokenizer base e modelo fine-tuned carregados com sucesso\n"
|
| 398 |
+
]
|
| 399 |
+
},
|
| 400 |
+
{
|
| 401 |
+
"name": "stderr",
|
| 402 |
+
"output_type": "stream",
|
| 403 |
+
"text": [
|
| 404 |
+
"Processando data/mf-training.csv: 100%|██████████| 31142/31142 [5:17:56<00:00, 1.63it/s] \n"
|
| 405 |
+
]
|
| 406 |
+
},
|
| 407 |
+
{
|
| 408 |
+
"name": "stdout",
|
| 409 |
+
"output_type": "stream",
|
| 410 |
+
"text": [
|
| 411 |
+
"✓ Guardado embeddings\\train_protbertbfd.pkl — 31142 proteínas\n"
|
| 412 |
+
]
|
| 413 |
+
},
|
| 414 |
+
{
|
| 415 |
+
"name": "stderr",
|
| 416 |
+
"output_type": "stream",
|
| 417 |
+
"text": [
|
| 418 |
+
"Processando data/mf-validation.csv: 100%|██████████| 1724/1724 [19:15<00:00, 1.49it/s]\n"
|
| 419 |
+
]
|
| 420 |
+
},
|
| 421 |
+
{
|
| 422 |
+
"name": "stdout",
|
| 423 |
+
"output_type": "stream",
|
| 424 |
+
"text": [
|
| 425 |
+
"✓ Guardado embeddings\\val_protbertbfd.pkl — 1724 proteínas\n"
|
| 426 |
+
]
|
| 427 |
+
},
|
| 428 |
+
{
|
| 429 |
+
"name": "stderr",
|
| 430 |
+
"output_type": "stream",
|
| 431 |
+
"text": [
|
| 432 |
+
"Processando data/mf-test.csv: 100%|██████████| 1724/1724 [17:15<00:00, 1.66it/s]\n"
|
| 433 |
+
]
|
| 434 |
+
},
|
| 435 |
+
{
|
| 436 |
+
"name": "stdout",
|
| 437 |
+
"output_type": "stream",
|
| 438 |
+
"text": [
|
| 439 |
+
"✓ Guardado embeddings\\test_protbertbfd.pkl — 1724 proteínas\n"
|
| 440 |
+
]
|
| 441 |
+
}
|
| 442 |
+
],
|
| 443 |
+
"source": [
|
| 444 |
+
"import os\n",
|
| 445 |
+
"import pandas as pd\n",
|
| 446 |
+
"import numpy as np\n",
|
| 447 |
+
"from tqdm import tqdm\n",
|
| 448 |
+
"import joblib\n",
|
| 449 |
+
"import gc\n",
|
| 450 |
+
"from transformers import AutoTokenizer, TFAutoModel\n",
|
| 451 |
+
"\n",
|
| 452 |
+
"# Parâmetros\n",
|
| 453 |
+
"MODEL_DIR = \"weights/mf-fine-tuned-protbertbfd\"\n",
|
| 454 |
+
"MODEL_NAME = \"Rostlab/prot_bert_bfd\"\n",
|
| 455 |
+
"OUT_DIR = \"embeddings\"\n",
|
| 456 |
+
"BATCH_TOK = 16\n",
|
| 457 |
+
"\n",
|
| 458 |
+
"tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, do_lower_case=False)\n",
|
| 459 |
+
"model = TFAutoModel.from_pretrained(MODEL_DIR, from_pt=False)\n",
|
| 460 |
+
"\n",
|
| 461 |
+
"print(\"✓ Tokenizer base e modelo fine-tuned carregados com sucesso\")\n",
|
| 462 |
+
"\n",
|
| 463 |
+
"# Funções auxiliares\n",
|
| 464 |
+
"\n",
|
| 465 |
+
"def get_embeddings(batch, tokenizer, model):\n",
|
| 466 |
+
" tokens = tokenizer(batch, return_tensors=\"tf\", padding=True, truncation=True, max_length=512)\n",
|
| 467 |
+
" output = model(**tokens)\n",
|
| 468 |
+
" return output.last_hidden_state[:, 0, :].numpy()\n",
|
| 469 |
+
"\n",
|
| 470 |
+
"def process_split(csv_path, out_path):\n",
|
| 471 |
+
" df = pd.read_csv(csv_path)\n",
|
| 472 |
+
" label_cols = [col for col in df.columns if col.startswith(\"GO:\")]\n",
|
| 473 |
+
" prot_ids, embeds, labels = [], [], []\n",
|
| 474 |
+
"\n",
|
| 475 |
+
" for _, row in tqdm(df.iterrows(), total=len(df), desc=f\"Processando {csv_path}\"):\n",
|
| 476 |
+
" slices = slice_sequence(row[\"sequence\"])\n",
|
| 477 |
+
" slices_fmt = list(map(format_sequence, slices))\n",
|
| 478 |
+
"\n",
|
| 479 |
+
" slice_embeds = []\n",
|
| 480 |
+
" for i in range(0, len(slices_fmt), BATCH_TOK):\n",
|
| 481 |
+
" batch = slices_fmt[i:i+BATCH_TOK]\n",
|
| 482 |
+
" slice_embeds.append(get_embeddings(batch, tokenizer, model))\n",
|
| 483 |
+
" slice_embeds = np.vstack(slice_embeds)\n",
|
| 484 |
+
"\n",
|
| 485 |
+
" prot_embed = slice_embeds.mean(axis=0)\n",
|
| 486 |
+
" prot_ids.append(row[\"protein_id\"])\n",
|
| 487 |
+
" embeds.append(prot_embed.astype(np.float32))\n",
|
| 488 |
+
" labels.append(row[label_cols].values.astype(np.int8))\n",
|
| 489 |
+
" gc.collect()\n",
|
| 490 |
+
"\n",
|
| 491 |
+
" embeds = np.vstack(embeds)\n",
|
| 492 |
+
" labels = np.vstack(labels)\n",
|
| 493 |
+
"\n",
|
| 494 |
+
" joblib.dump({\n",
|
| 495 |
+
" \"protein_ids\": prot_ids,\n",
|
| 496 |
+
" \"embeddings\": embeds,\n",
|
| 497 |
+
" \"labels\": labels,\n",
|
| 498 |
+
" \"go_terms\": label_cols\n",
|
| 499 |
+
" }, out_path, compress=3)\n",
|
| 500 |
+
"\n",
|
| 501 |
+
" print(f\"✓ Guardado {out_path} — {embeds.shape[0]} proteínas\")\n",
|
| 502 |
+
"\n",
|
| 503 |
+
"# Aplicar\n",
|
| 504 |
+
"os.makedirs(OUT_DIR, exist_ok=True)\n",
|
| 505 |
+
"\n",
|
| 506 |
+
"process_split(\"data/mf-training.csv\", os.path.join(OUT_DIR, \"train_protbertbfd.pkl\"))\n",
|
| 507 |
+
"process_split(\"data/mf-validation.csv\", os.path.join(OUT_DIR, \"val_protbertbfd.pkl\"))\n",
|
| 508 |
+
"process_split(\"data/mf-test.csv\", os.path.join(OUT_DIR, \"test_protbertbfd.pkl\"))\n"
|
| 509 |
+
]
|
| 510 |
+
},
|
| 511 |
+
{
|
| 512 |
+
"cell_type": "code",
|
| 513 |
+
"execution_count": 9,
|
| 514 |
+
"id": "ad0c5421-e0a1-4a6a-8ace-2c69aeab0e0d",
|
| 515 |
+
"metadata": {},
|
| 516 |
+
"outputs": [
|
| 517 |
+
{
|
| 518 |
+
"name": "stdout",
|
| 519 |
+
"output_type": "stream",
|
| 520 |
+
"text": [
|
| 521 |
+
"✓ Corrigido: embeddings/train_protbertbfd.pkl — 31142 exemplos, 597 GO terms\n",
|
| 522 |
+
"✓ Corrigido: embeddings/val_protbertbfd.pkl — 1724 exemplos, 597 GO terms\n",
|
| 523 |
+
"✓ Corrigido: embeddings/test_protbertbfd.pkl — 1724 exemplos, 597 GO terms\n"
|
| 524 |
+
]
|
| 525 |
+
}
|
| 526 |
+
],
|
| 527 |
+
"source": [
|
| 528 |
+
"import pandas as pd\n",
|
| 529 |
+
"import joblib\n",
|
| 530 |
+
"from sklearn.preprocessing import MultiLabelBinarizer\n",
|
| 531 |
+
"\n",
|
| 532 |
+
"# Obter GO terms do ficheiro de teste\n",
|
| 533 |
+
"df_test = pd.read_csv(\"data/mf-test.csv\")\n",
|
| 534 |
+
"test_terms = sorted(set(term for row in df_test[\"go_terms\"].str.split(\";\") for term in row))\n",
|
| 535 |
+
"\n",
|
| 536 |
+
"# Função para corrigir um .pkl com base nos GO terms do teste\n",
|
| 537 |
+
"def patch_to_common_terms(csv_path, pkl_path, common_terms):\n",
|
| 538 |
+
" df = pd.read_csv(csv_path)\n",
|
| 539 |
+
" terms_split = df[\"go_terms\"].str.split(\";\")\n",
|
| 540 |
+
" \n",
|
| 541 |
+
" # Apenas termos presentes nos common_terms\n",
|
| 542 |
+
" terms_filtered = terms_split.apply(lambda lst: [t for t in lst if t in common_terms])\n",
|
| 543 |
+
" \n",
|
| 544 |
+
" mlb = MultiLabelBinarizer(classes=common_terms)\n",
|
| 545 |
+
" Y = mlb.fit_transform(terms_filtered)\n",
|
| 546 |
+
"\n",
|
| 547 |
+
" data = joblib.load(pkl_path)\n",
|
| 548 |
+
" data[\"labels\"] = Y\n",
|
| 549 |
+
" data[\"go_terms\"] = mlb.classes_.tolist()\n",
|
| 550 |
+
" \n",
|
| 551 |
+
" joblib.dump(data, pkl_path, compress=3)\n",
|
| 552 |
+
" print(f\"✓ Corrigido: {pkl_path} — {Y.shape[0]} exemplos, {Y.shape[1]} GO terms\")\n",
|
| 553 |
+
"\n",
|
| 554 |
+
"# Aplicar às 3 partições\n",
|
| 555 |
+
"patch_to_common_terms(\"data/mf-training.csv\", \"embeddings/train_protbertbfd.pkl\", test_terms)\n",
|
| 556 |
+
"patch_to_common_terms(\"data/mf-validation.csv\", \"embeddings/val_protbertbfd.pkl\", test_terms)\n",
|
| 557 |
+
"patch_to_common_terms(\"data/mf-test.csv\", \"embeddings/test_protbertbfd.pkl\", test_terms)\n"
|
| 558 |
+
]
|
| 559 |
+
},
|
| 560 |
+
{
|
| 561 |
+
"cell_type": "code",
|
| 562 |
+
"execution_count": 2,
|
| 563 |
+
"id": "1785d8a9-23fc-4490-8d71-29cc91a4cb57",
|
| 564 |
+
"metadata": {},
|
| 565 |
+
"outputs": [
|
| 566 |
+
{
|
| 567 |
+
"name": "stdout",
|
| 568 |
+
"output_type": "stream",
|
| 569 |
+
"text": [
|
| 570 |
+
"✓ Embeddings carregados: (31142, 1024) → 597 GO terms\n",
|
| 571 |
+
"Epoch 1/100\n",
|
| 572 |
+
"974/974 [==============================] - 12s 11ms/step - loss: 0.0339 - binary_accuracy: 0.9900 - val_loss: 0.0327 - val_binary_accuracy: 0.9905\n",
|
| 573 |
+
"Epoch 2/100\n",
|
| 574 |
+
"974/974 [==============================] - 11s 11ms/step - loss: 0.0253 - binary_accuracy: 0.9922 - val_loss: 0.0323 - val_binary_accuracy: 0.9906\n",
|
| 575 |
+
"Epoch 3/100\n",
|
| 576 |
+
"974/974 [==============================] - 11s 11ms/step - loss: 0.0244 - binary_accuracy: 0.9923 - val_loss: 0.0326 - val_binary_accuracy: 0.9906\n",
|
| 577 |
+
"Epoch 4/100\n",
|
| 578 |
+
"974/974 [==============================] - 11s 11ms/step - loss: 0.0239 - binary_accuracy: 0.9925 - val_loss: 0.0328 - val_binary_accuracy: 0.9906\n",
|
| 579 |
+
"Epoch 5/100\n",
|
| 580 |
+
"974/974 [==============================] - 11s 11ms/step - loss: 0.0236 - binary_accuracy: 0.9925 - val_loss: 0.0321 - val_binary_accuracy: 0.9906\n",
|
| 581 |
+
"Epoch 6/100\n",
|
| 582 |
+
"974/974 [==============================] - 11s 11ms/step - loss: 0.0233 - binary_accuracy: 0.9926 - val_loss: 0.0328 - val_binary_accuracy: 0.9907\n",
|
| 583 |
+
"Epoch 7/100\n",
|
| 584 |
+
"974/974 [==============================] - 11s 11ms/step - loss: 0.0232 - binary_accuracy: 0.9926 - val_loss: 0.0330 - val_binary_accuracy: 0.9908\n",
|
| 585 |
+
"Epoch 8/100\n",
|
| 586 |
+
"974/974 [==============================] - 11s 12ms/step - loss: 0.0229 - binary_accuracy: 0.9927 - val_loss: 0.0325 - val_binary_accuracy: 0.9907\n",
|
| 587 |
+
"Epoch 9/100\n",
|
| 588 |
+
"974/974 [==============================] - 12s 12ms/step - loss: 0.0226 - binary_accuracy: 0.9927 - val_loss: 0.0327 - val_binary_accuracy: 0.9906\n",
|
| 589 |
+
"Epoch 10/100\n",
|
| 590 |
+
"974/974 [==============================] - 12s 12ms/step - loss: 0.0226 - binary_accuracy: 0.9927 - val_loss: 0.0327 - val_binary_accuracy: 0.9907\n",
|
| 591 |
+
"54/54 [==============================] - 0s 2ms/step\n",
|
| 592 |
+
"Previsões guardadas em mf-protbertbfd-pam1.npy\n",
|
| 593 |
+
"Modelo guardado em models/\n"
|
| 594 |
+
]
|
| 595 |
+
}
|
| 596 |
+
],
|
| 597 |
+
"source": [
|
| 598 |
+
"import tensorflow as tf\n",
|
| 599 |
+
"import joblib\n",
|
| 600 |
+
"import numpy as np\n",
|
| 601 |
+
"from tensorflow.keras import Input\n",
|
| 602 |
+
"from tensorflow.keras.models import Sequential\n",
|
| 603 |
+
"from tensorflow.keras.layers import Dense, Dropout\n",
|
| 604 |
+
"from tensorflow.keras.callbacks import EarlyStopping\n",
|
| 605 |
+
"\n",
|
| 606 |
+
"# Carregar embeddings\n",
|
| 607 |
+
"train = joblib.load(\"embeddings/train_protbertbfd.pkl\")\n",
|
| 608 |
+
"val = joblib.load(\"embeddings/val_protbertbfd.pkl\")\n",
|
| 609 |
+
"test = joblib.load(\"embeddings/test_protbertbfd.pkl\")\n",
|
| 610 |
+
"\n",
|
| 611 |
+
"X_train, y_train = train[\"embeddings\"], train[\"labels\"]\n",
|
| 612 |
+
"X_val, y_val = val[\"embeddings\"], val[\"labels\"]\n",
|
| 613 |
+
"X_test, y_test = test[\"embeddings\"], test[\"labels\"]\n",
|
| 614 |
+
"\n",
|
| 615 |
+
"print(f\"✓ Embeddings carregados: {X_train.shape} → {y_train.shape[1]} GO terms\")\n",
|
| 616 |
+
"\n",
|
| 617 |
+
"# Garantir consistência de classes\n",
|
| 618 |
+
"max_classes = y_train.shape[1] # 602 GO terms (do treino)\n",
|
| 619 |
+
"\n",
|
| 620 |
+
"def pad_labels(y, target_dim=max_classes):\n",
|
| 621 |
+
" if y.shape[1] < target_dim:\n",
|
| 622 |
+
" padding = np.zeros((y.shape[0], target_dim - y.shape[1]), dtype=np.int8)\n",
|
| 623 |
+
" return np.hstack([y, padding])\n",
|
| 624 |
+
" return y\n",
|
| 625 |
+
"\n",
|
| 626 |
+
"y_val = pad_labels(y_val)\n",
|
| 627 |
+
"y_test = pad_labels(y_test)\n",
|
| 628 |
+
"\n",
|
| 629 |
+
"# Modelo MLP\n",
|
| 630 |
+
"model = Sequential([\n",
|
| 631 |
+
" Dense(1024, activation=\"relu\", input_shape=(X_train.shape[1],)),\n",
|
| 632 |
+
" Dropout(0.3),\n",
|
| 633 |
+
" Dense(512, activation=\"relu\"),\n",
|
| 634 |
+
" Dropout(0.3),\n",
|
| 635 |
+
" Dense(max_classes, activation=\"sigmoid\")\n",
|
| 636 |
+
"])\n",
|
| 637 |
+
"\n",
|
| 638 |
+
"model.compile(loss=\"binary_crossentropy\",\n",
|
| 639 |
+
" optimizer=\"adam\",\n",
|
| 640 |
+
" metrics=[\"binary_accuracy\"])\n",
|
| 641 |
+
"\n",
|
| 642 |
+
"# Early stopping e treino\n",
|
| 643 |
+
"callbacks = [\n",
|
| 644 |
+
" EarlyStopping(monitor=\"val_loss\", patience=5, restore_best_weights=True)\n",
|
| 645 |
+
"]\n",
|
| 646 |
+
"\n",
|
| 647 |
+
"model.fit(X_train, y_train,\n",
|
| 648 |
+
" validation_data=(X_val, y_val),\n",
|
| 649 |
+
" epochs=100,\n",
|
| 650 |
+
" batch_size=32,\n",
|
| 651 |
+
" callbacks=callbacks,\n",
|
| 652 |
+
" verbose=1)\n",
|
| 653 |
+
"\n",
|
| 654 |
+
"# Previsões\n",
|
| 655 |
+
"y_prob = model.predict(X_test)\n",
|
| 656 |
+
"np.save(\"predictions/mf-protbertbfd-pam1.npy\", y_prob)\n",
|
| 657 |
+
"print(\"Previsões guardadas em mf-protbertbfd-pam1.npy\")\n",
|
| 658 |
+
"\n",
|
| 659 |
+
"# Modelo\n",
|
| 660 |
+
"model.save(\"models/mlp_protbertbfd.h5\")\n",
|
| 661 |
+
"model.save(\"models/mlp_protbertbfd.keras\")\n",
|
| 662 |
+
"print(\"Modelo guardado em models/\")"
|
| 663 |
+
]
|
| 664 |
+
},
|
| 665 |
+
{
|
| 666 |
+
"cell_type": "code",
|
| 667 |
+
"execution_count": 3,
|
| 668 |
+
"id": "fdb66630-76dc-43a0-bd56-45052175fdba",
|
| 669 |
+
"metadata": {},
|
| 670 |
+
"outputs": [
|
| 671 |
+
{
|
| 672 |
+
"name": "stdout",
|
| 673 |
+
"output_type": "stream",
|
| 674 |
+
"text": [
|
| 675 |
+
"go.obo: fmt(1.2) rel(2025-03-16) 43,544 Terms\n",
|
| 676 |
+
"✓ Embeddings: (1724, 597) labels × 597 GO terms\n",
|
| 677 |
+
"\n",
|
| 678 |
+
"📊 Resultados finais (ProtBERTBFD + PAM1 + propagação):\n",
|
| 679 |
+
"Fmax = 0.6588\n",
|
| 680 |
+
"Thr. = 0.46\n",
|
| 681 |
+
"AuPRC = 0.6991\n",
|
| 682 |
+
"Smin = 13.5461\n"
|
| 683 |
+
]
|
| 684 |
+
}
|
| 685 |
+
],
|
| 686 |
+
"source": [
|
| 687 |
+
"import numpy as np\n",
|
| 688 |
+
"from sklearn.metrics import precision_recall_curve, auc\n",
|
| 689 |
+
"from goatools.obo_parser import GODag\n",
|
| 690 |
+
"import joblib\n",
|
| 691 |
+
"import math\n",
|
| 692 |
+
"\n",
|
| 693 |
+
"# Parâmetros\n",
|
| 694 |
+
"GO_FILE = \"go.obo\"\n",
|
| 695 |
+
"THRESHOLDS = np.arange(0.0, 1.01, 0.01)\n",
|
| 696 |
+
"ALPHA = 0.5\n",
|
| 697 |
+
"\n",
|
| 698 |
+
"# Carregar dados\n",
|
| 699 |
+
"test = joblib.load(\"embeddings/test_protbertbfd.pkl\")\n",
|
| 700 |
+
"y_true = test[\"labels\"]\n",
|
| 701 |
+
"terms = test[\"go_terms\"]\n",
|
| 702 |
+
"y_prob = np.load(\"predictions/mf-protbertbfd-pam1.npy\")\n",
|
| 703 |
+
"go_dag = GODag(GO_FILE)\n",
|
| 704 |
+
"\n",
|
| 705 |
+
"print(f\"✓ Embeddings: {y_true.shape} labels × {len(terms)} GO terms\")\n",
|
| 706 |
+
"\n",
|
| 707 |
+
"# Fmax\n",
|
| 708 |
+
"def compute_fmax(y_true, y_prob, thresholds):\n",
|
| 709 |
+
" fmax, best_thr = 0, 0\n",
|
| 710 |
+
" for t in thresholds:\n",
|
| 711 |
+
" y_pred = (y_prob >= t).astype(int)\n",
|
| 712 |
+
" tp = (y_true * y_pred).sum(axis=1)\n",
|
| 713 |
+
" fp = ((1 - y_true) * y_pred).sum(axis=1)\n",
|
| 714 |
+
" fn = (y_true * (1 - y_pred)).sum(axis=1)\n",
|
| 715 |
+
" precision = tp / (tp + fp + 1e-8)\n",
|
| 716 |
+
" recall = tp / (tp + fn + 1e-8)\n",
|
| 717 |
+
" f1 = 2 * precision * recall / (precision + recall + 1e-8)\n",
|
| 718 |
+
" avg_f1 = np.mean(f1)\n",
|
| 719 |
+
" if avg_f1 > fmax:\n",
|
| 720 |
+
" fmax, best_thr = avg_f1, t\n",
|
| 721 |
+
" return fmax, best_thr\n",
|
| 722 |
+
"\n",
|
| 723 |
+
"# AuPRC micro\n",
|
| 724 |
+
"def compute_auprc(y_true, y_prob):\n",
|
| 725 |
+
" precision, recall, _ = precision_recall_curve(y_true.ravel(), y_prob.ravel())\n",
|
| 726 |
+
" return auc(recall, precision)\n",
|
| 727 |
+
"\n",
|
| 728 |
+
"# Smin\n",
|
| 729 |
+
"def compute_smin(y_true, y_prob, terms, threshold, go_dag, alpha=ALPHA):\n",
|
| 730 |
+
" y_pred = (y_prob >= threshold).astype(int)\n",
|
| 731 |
+
" ic = {}\n",
|
| 732 |
+
" total = (y_true + y_pred).sum(axis=0).sum()\n",
|
| 733 |
+
" for i, term in enumerate(terms):\n",
|
| 734 |
+
" freq = (y_true[:, i] + y_pred[:, i]).sum()\n",
|
| 735 |
+
" ic[term] = -np.log((freq + 1e-8) / total)\n",
|
| 736 |
+
"\n",
|
| 737 |
+
" s_values = []\n",
|
| 738 |
+
" for true_vec, pred_vec in zip(y_true, y_pred):\n",
|
| 739 |
+
" true_terms = {terms[i] for i in np.where(true_vec)[0]}\n",
|
| 740 |
+
" pred_terms = {terms[i] for i in np.where(pred_vec)[0]}\n",
|
| 741 |
+
"\n",
|
| 742 |
+
" anc_true = set()\n",
|
| 743 |
+
" for t in true_terms:\n",
|
| 744 |
+
" if t in go_dag:\n",
|
| 745 |
+
" anc_true |= go_dag[t].get_all_parents()\n",
|
| 746 |
+
" anc_pred = set()\n",
|
| 747 |
+
" for t in pred_terms:\n",
|
| 748 |
+
" if t in go_dag:\n",
|
| 749 |
+
" anc_pred |= go_dag[t].get_all_parents()\n",
|
| 750 |
+
"\n",
|
| 751 |
+
" ru = pred_terms - true_terms\n",
|
| 752 |
+
" mi = true_terms - pred_terms\n",
|
| 753 |
+
" dist_ru = sum(ic.get(t, 0) for t in ru)\n",
|
| 754 |
+
" dist_mi = sum(ic.get(t, 0) for t in mi)\n",
|
| 755 |
+
" s = math.sqrt((alpha * dist_ru)**2 + ((1 - alpha) * dist_mi)**2)\n",
|
| 756 |
+
" s_values.append(s)\n",
|
| 757 |
+
"\n",
|
| 758 |
+
" return np.mean(s_values)\n",
|
| 759 |
+
"\n",
|
| 760 |
+
"# Avaliar\n",
|
| 761 |
+
"fmax, thr = compute_fmax(y_true, y_prob, THRESHOLDS)\n",
|
| 762 |
+
"auprc = compute_auprc(y_true, y_prob)\n",
|
| 763 |
+
"smin = compute_smin(y_true, y_prob, terms, thr, go_dag)\n",
|
| 764 |
+
"\n",
|
| 765 |
+
"print(f\"\\n📊 Resultados finais (ProtBERTBFD + PAM1 + propagação):\")\n",
|
| 766 |
+
"print(f\"Fmax = {fmax:.4f}\")\n",
|
| 767 |
+
"print(f\"Thr. = {thr:.2f}\")\n",
|
| 768 |
+
"print(f\"AuPRC = {auprc:.4f}\")\n",
|
| 769 |
+
"print(f\"Smin = {smin:.4f}\")\n"
|
| 770 |
+
]
|
| 771 |
+
},
|
| 772 |
+
{
|
| 773 |
+
"cell_type": "code",
|
| 774 |
+
"execution_count": null,
|
| 775 |
+
"id": "70d131ef-ef84-42ee-953b-0d3f1268694d",
|
| 776 |
+
"metadata": {},
|
| 777 |
+
"outputs": [],
|
| 778 |
+
"source": []
|
| 779 |
+
}
|
| 780 |
+
],
|
| 781 |
+
"metadata": {
|
| 782 |
+
"kernelspec": {
|
| 783 |
+
"display_name": "Python 3 (ipykernel)",
|
| 784 |
+
"language": "python",
|
| 785 |
+
"name": "python3"
|
| 786 |
+
},
|
| 787 |
+
"language_info": {
|
| 788 |
+
"codemirror_mode": {
|
| 789 |
+
"name": "ipython",
|
| 790 |
+
"version": 3
|
| 791 |
+
},
|
| 792 |
+
"file_extension": ".py",
|
| 793 |
+
"mimetype": "text/x-python",
|
| 794 |
+
"name": "python",
|
| 795 |
+
"nbconvert_exporter": "python",
|
| 796 |
+
"pygments_lexer": "ipython3",
|
| 797 |
+
"version": "3.10.16"
|
| 798 |
+
}
|
| 799 |
+
},
|
| 800 |
+
"nbformat": 4,
|
| 801 |
+
"nbformat_minor": 5
|
| 802 |
+
}
|