Update geneformer/tokenizer.py
Browse files
- geneformer/tokenizer.py +13 -0
geneformer/tokenizer.py
CHANGED
|
@@ -100,6 +100,9 @@ def sum_ensembl_ids(
|
|
| 100 |
"ensembl_id" in data.ra.keys()
|
| 101 |
), "'ensembl_id' column missing from data.ra.keys()"
|
| 102 |
|
|
|
|
|
|
|
|
|
|
| 103 |
# Check for duplicate Ensembl IDs if collapse_gene_ids is False.
|
| 104 |
# Comparing to gene_token_dict here, would not perform any mapping steps
|
| 105 |
gene_ids_in_dict = [
|
|
@@ -197,6 +200,10 @@ def sum_ensembl_ids(
|
|
| 197 |
"ensembl_id" in data.var.columns
|
| 198 |
), "'ensembl_id' column missing from data.var"
|
| 199 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
# Check for duplicate Ensembl IDs if collapse_gene_ids is False.
|
| 201 |
# Comparing to gene_token_dict here, would not perform any mapping steps
|
| 202 |
gene_ids_in_dict = [
|
|
@@ -516,6 +523,7 @@ class TranscriptomeTokenizer:
|
|
| 516 |
file_cell_metadata = {
|
| 517 |
attr_key: [] for attr_key in self.custom_attr_name_dict.keys()
|
| 518 |
}
|
|
|
|
| 519 |
|
| 520 |
dedup_filename = loom_file_path.with_name(loom_file_path.stem + "__dedup.loom")
|
| 521 |
loom_file_path = sum_ensembl_ids(
|
|
@@ -591,6 +599,11 @@ class TranscriptomeTokenizer:
|
|
| 591 |
if str(dedup_filename) == str(loom_file_path):
|
| 592 |
os.remove(str(dedup_filename))
|
| 593 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 594 |
return tokenized_cells, file_cell_metadata
|
| 595 |
|
| 596 |
def create_dataset(
|
|
|
|
| 100 |
"ensembl_id" in data.ra.keys()
|
| 101 |
), "'ensembl_id' column missing from data.ra.keys()"
|
| 102 |
|
| 103 |
+
assert (
|
| 104 |
+
"ensembl_id_collapsed" not in data.ra.keys()
|
| 105 |
+
), "'ensembl_id_collapsed' column already exists in data.ra.keys()"
|
| 106 |
# Check for duplicate Ensembl IDs if collapse_gene_ids is False.
|
| 107 |
# Comparing to gene_token_dict here, would not perform any mapping steps
|
| 108 |
gene_ids_in_dict = [
|
|
|
|
| 200 |
"ensembl_id" in data.var.columns
|
| 201 |
), "'ensembl_id' column missing from data.var"
|
| 202 |
|
| 203 |
+
assert (
|
| 204 |
+
"ensembl_id_collapsed" not in data.var.columns
|
| 205 |
+
), "'ensembl_id_collapsed' column already exists in data.var"
|
| 206 |
+
|
| 207 |
# Check for duplicate Ensembl IDs if collapse_gene_ids is False.
|
| 208 |
# Comparing to gene_token_dict here, would not perform any mapping steps
|
| 209 |
gene_ids_in_dict = [
|
|
|
|
| 523 |
file_cell_metadata = {
|
| 524 |
attr_key: [] for attr_key in self.custom_attr_name_dict.keys()
|
| 525 |
}
|
| 526 |
+
loom_file_path_original = loom_file_path
|
| 527 |
|
| 528 |
dedup_filename = loom_file_path.with_name(loom_file_path.stem + "__dedup.loom")
|
| 529 |
loom_file_path = sum_ensembl_ids(
|
|
|
|
| 599 |
if str(dedup_filename) == str(loom_file_path):
|
| 600 |
os.remove(str(dedup_filename))
|
| 601 |
|
| 602 |
+
with lp.connect(str(loom_file_path_original)) as data:
|
| 603 |
+
if "ensembl_id_collapsed" in data.ra.keys():
|
| 604 |
+
del data.ra["ensembl_id_collapsed"]
|
| 605 |
+
|
| 606 |
+
|
| 607 |
return tokenized_cells, file_cell_metadata
|
| 608 |
|
| 609 |
def create_dataset(
|