Update geneformer/tokenizer.py
Browse files
- geneformer/tokenizer.py +8 -2
geneformer/tokenizer.py
CHANGED
|
@@ -126,8 +126,11 @@ def sum_ensembl_ids(
|
|
| 126 |
gene_ids_collapsed = [
|
| 127 |
gene_mapping_dict.get(gene_id.upper()) for gene_id in data.ra.ensembl_id
|
| 128 |
]
|
|
|
|
|
|
|
|
|
|
| 129 |
|
| 130 |
-
if len(set(gene_ids_in_dict)) == len(set(
|
| 131 |
# Keep original Ensembl IDs as `ensembl_id_original`
|
| 132 |
rename_attr(data.ra, "ensembl_id", "ensembl_id_original")
|
| 133 |
data.ra["ensembl_id"] = gene_ids_collapsed
|
|
@@ -223,7 +226,10 @@ def sum_ensembl_ids(
|
|
| 223 |
gene_ids_collapsed = [
|
| 224 |
gene_mapping_dict.get(gene_id.upper()) for gene_id in data.var.ensembl_id
|
| 225 |
]
|
| 226 |
-
|
|
|
|
|
|
|
|
|
|
| 227 |
data.var.ensembl_id = data.var.ensembl_id.map(gene_mapping_dict)
|
| 228 |
return data
|
| 229 |
|
|
|
|
| 126 |
gene_ids_collapsed = [
|
| 127 |
gene_mapping_dict.get(gene_id.upper()) for gene_id in data.ra.ensembl_id
|
| 128 |
]
|
| 129 |
+
gene_ids_collapsed_in_dict = [
|
| 130 |
+
gene for gene in gene_ids_collapsed if gene in gene_token_dict.keys()
|
| 131 |
+
]
|
| 132 |
|
| 133 |
+
if len(set(gene_ids_in_dict)) == len(set(gene_ids_collapsed_in_dict)):
|
| 134 |
# Keep original Ensembl IDs as `ensembl_id_original`
|
| 135 |
rename_attr(data.ra, "ensembl_id", "ensembl_id_original")
|
| 136 |
data.ra["ensembl_id"] = gene_ids_collapsed
|
|
|
|
| 226 |
gene_ids_collapsed = [
|
| 227 |
gene_mapping_dict.get(gene_id.upper()) for gene_id in data.var.ensembl_id
|
| 228 |
]
|
| 229 |
+
gene_ids_collapsed_in_dict = [
|
| 230 |
+
gene for gene in gene_ids_collapsed if gene in gene_token_dict.keys()
|
| 231 |
+
]
|
| 232 |
+
if len(set(gene_ids_in_dict)) == len(set(gene_ids_collapsed_in_dict)):
|
| 233 |
data.var.ensembl_id = data.var.ensembl_id.map(gene_mapping_dict)
|
| 234 |
return data
|
| 235 |
|