Christina Theodoris
committed on
Commit
·
57b9778
1
Parent(s):
acd253c
Update tokenizer to allow tokenization without custom cell attributes
Browse files
- geneformer/tokenizer.py +21 -12
geneformer/tokenizer.py
CHANGED
|
@@ -42,7 +42,7 @@ def tokenize_cell(gene_vector, gene_tokens):
|
|
| 42 |
class TranscriptomeTokenizer:
|
| 43 |
def __init__(
|
| 44 |
self,
|
| 45 |
-
custom_attr_name_dict,
|
| 46 |
nproc=1,
|
| 47 |
gene_median_file=GENE_MEDIAN_FILE,
|
| 48 |
token_dictionary_file=TOKEN_DICTIONARY_FILE,
|
|
@@ -52,7 +52,7 @@ class TranscriptomeTokenizer:
|
|
| 52 |
|
| 53 |
Parameters
|
| 54 |
----------
|
| 55 |
-
custom_attr_name_dict : dict
|
| 56 |
Dictionary of custom attributes to be added to the dataset.
|
| 57 |
Keys are the names of the attributes in the loom file.
|
| 58 |
Values are the names of the attributes in the dataset.
|
|
@@ -106,8 +106,9 @@ class TranscriptomeTokenizer:
|
|
| 106 |
|
| 107 |
def tokenize_files(self, loom_data_directory):
|
| 108 |
tokenized_cells = []
|
| 109 |
-
|
| 110 |
-
|
|
|
|
| 111 |
|
| 112 |
# loops through directories to tokenize .loom files
|
| 113 |
for loom_file_path in loom_data_directory.glob("*.loom"):
|
|
@@ -116,15 +117,19 @@ class TranscriptomeTokenizer:
|
|
| 116 |
loom_file_path
|
| 117 |
)
|
| 118 |
tokenized_cells += file_tokenized_cells
|
| 119 |
-
|
| 120 |
-
|
|
|
|
|
|
|
|
|
|
| 121 |
|
| 122 |
return tokenized_cells, cell_metadata
|
| 123 |
|
| 124 |
def tokenize_file(self, loom_file_path):
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
|
|
|
| 128 |
|
| 129 |
with lp.connect(str(loom_file_path)) as data:
|
| 130 |
# define coordinates of detected protein-coding or miRNA genes and vector of their normalization factors
|
|
@@ -181,15 +186,19 @@ class TranscriptomeTokenizer:
|
|
| 181 |
]
|
| 182 |
|
| 183 |
# add custom attributes for subview to dict
|
| 184 |
-
|
| 185 |
-
|
|
|
|
|
|
|
|
|
|
| 186 |
|
| 187 |
return tokenized_cells, file_cell_metadata
|
| 188 |
|
| 189 |
def create_dataset(self, tokenized_cells, cell_metadata):
|
| 190 |
# create dict for dataset creation
|
| 191 |
dataset_dict = {"input_ids": tokenized_cells}
|
| 192 |
-
|
|
|
|
| 193 |
|
| 194 |
# create dataset
|
| 195 |
output_dataset = Dataset.from_dict(dataset_dict)
|
|
|
|
| 42 |
class TranscriptomeTokenizer:
|
| 43 |
def __init__(
|
| 44 |
self,
|
| 45 |
+
custom_attr_name_dict=None,
|
| 46 |
nproc=1,
|
| 47 |
gene_median_file=GENE_MEDIAN_FILE,
|
| 48 |
token_dictionary_file=TOKEN_DICTIONARY_FILE,
|
|
|
|
| 52 |
|
| 53 |
Parameters
|
| 54 |
----------
|
| 55 |
+
custom_attr_name_dict : None, dict
|
| 56 |
Dictionary of custom attributes to be added to the dataset.
|
| 57 |
Keys are the names of the attributes in the loom file.
|
| 58 |
Values are the names of the attributes in the dataset.
|
|
|
|
| 106 |
|
| 107 |
def tokenize_files(self, loom_data_directory):
|
| 108 |
tokenized_cells = []
|
| 109 |
+
if self.custom_attr_name_dict is not None:
|
| 110 |
+
loom_cell_attr = [attr_key for attr_key in self.custom_attr_name_dict.keys()]
|
| 111 |
+
cell_metadata = {attr_key: [] for attr_key in self.custom_attr_name_dict.values()}
|
| 112 |
|
| 113 |
# loops through directories to tokenize .loom files
|
| 114 |
for loom_file_path in loom_data_directory.glob("*.loom"):
|
|
|
|
| 117 |
loom_file_path
|
| 118 |
)
|
| 119 |
tokenized_cells += file_tokenized_cells
|
| 120 |
+
if self.custom_attr_name_dict is not None:
|
| 121 |
+
for k in loom_cell_attr:
|
| 122 |
+
cell_metadata[self.custom_attr_name_dict[k]] += file_cell_metadata[k]
|
| 123 |
+
else:
|
| 124 |
+
cell_metadata = None
|
| 125 |
|
| 126 |
return tokenized_cells, cell_metadata
|
| 127 |
|
| 128 |
def tokenize_file(self, loom_file_path):
|
| 129 |
+
if self.custom_attr_name_dict is not None:
|
| 130 |
+
file_cell_metadata = {
|
| 131 |
+
attr_key: [] for attr_key in self.custom_attr_name_dict.keys()
|
| 132 |
+
}
|
| 133 |
|
| 134 |
with lp.connect(str(loom_file_path)) as data:
|
| 135 |
# define coordinates of detected protein-coding or miRNA genes and vector of their normalization factors
|
|
|
|
| 186 |
]
|
| 187 |
|
| 188 |
# add custom attributes for subview to dict
|
| 189 |
+
if self.custom_attr_name_dict is not None:
|
| 190 |
+
for k in file_cell_metadata.keys():
|
| 191 |
+
file_cell_metadata[k] += subview.ca[k].tolist()
|
| 192 |
+
else:
|
| 193 |
+
file_cell_metadata = None
|
| 194 |
|
| 195 |
return tokenized_cells, file_cell_metadata
|
| 196 |
|
| 197 |
def create_dataset(self, tokenized_cells, cell_metadata):
|
| 198 |
# create dict for dataset creation
|
| 199 |
dataset_dict = {"input_ids": tokenized_cells}
|
| 200 |
+
if self.custom_attr_name_dict is not None:
|
| 201 |
+
dataset_dict.update(cell_metadata)
|
| 202 |
|
| 203 |
# create dataset
|
| 204 |
output_dataset = Dataset.from_dict(dataset_dict)
|