Refactor: Convert mask_token_id, pad_token_id, and all_special_ids to properties (#395)
Commit 02febb8c9cc71177ab74ab03ab83e597b7ec1df7
Co-authored-by: Madhavan Venkatesh <[email protected]>
geneformer/collator_for_classification.py
CHANGED
@@ -1,6 +1,5 @@
 """
 Geneformer collator for gene and cell classification.
-
 Huggingface data collator modified to accommodate single-cell transcriptomics data for gene and cell classification.
 """
 
@@ -85,13 +84,25 @@ class PrecollatorForGeneAndCellClassification(SpecialTokensMixin):
         self.token_dictionary = kwargs.get("token_dictionary")
         self.padding_side = "right"
         self.model_input_names = ["input_ids"]
-        self.mask_token_id = self.token_dictionary.get("<mask>")
-        self.pad_token_id = self.token_dictionary.get("<pad>")
-        self.all_special_ids = [
+        self._mask_token_id = self.token_dictionary.get("<mask>")
+        self._pad_token_id = self.token_dictionary.get("<pad>")
+        self._all_special_ids = [
             self.token_dictionary.get("<mask>"),
             self.token_dictionary.get("<pad>"),
         ]
 
+    @property
+    def all_special_ids(self):
+        return self._all_special_ids
+
+    @property
+    def mask_token_id(self):
+        return self._mask_token_id
+
+    @property
+    def pad_token_id(self):
+        return self._pad_token_id
+
     def _get_padding_truncation_strategies(
         self,
         padding=True,
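For context on what this hunk changes in practice, here is a minimal usage sketch (not from the repository): the ids are now read through properties backed by private attributes. The toy token_dictionary values and the keyword-only construction are assumptions for illustration only.

# Minimal sketch, assuming the precollator accepts a token_dictionary kwarg
# and that the toy id mapping below is representative of the real dictionary.
from geneformer.collator_for_classification import (
    PrecollatorForGeneAndCellClassification,
)

token_dictionary = {"<pad>": 0, "<mask>": 1}  # assumed toy mapping
precollator = PrecollatorForGeneAndCellClassification(token_dictionary=token_dictionary)

print(precollator.mask_token_id)    # 1
print(precollator.pad_token_id)     # 0
print(precollator.all_special_ids)  # [1, 0]

# The properties define no setters, so direct assignment raises AttributeError;
# readers of these ids are unaffected by the refactor.
try:
    precollator.pad_token_id = 99
except AttributeError:
    pass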
@@ -258,29 +269,23 @@ class PrecollatorForGeneAndCellClassification(SpecialTokensMixin):
         """
         Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
         in the batch.
-
         Padding side (left/right) padding token ids are defined at the tokenizer level (with ``self.padding_side``,
         ``self.pad_token_id`` and ``self.pad_token_type_id``)
-
         .. note::
-
             If the ``encoded_inputs`` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the
             result will use the same type unless you provide a different tensor type with ``return_tensors``. In the
             case of PyTorch tensors, you will lose the specific device of your tensors however.
-
         Args:
             encoded_inputs (:class:`~transformers.BatchEncoding`, list of :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[int]]`, :obj:`Dict[str, List[List[int]]` or :obj:`List[Dict[str, List[int]]]`):
                 Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or :obj:`Dict[str,
                 List[int]]`) or a batch of tokenized inputs (list of :class:`~transformers.BatchEncoding`, `Dict[str,
                 List[List[int]]]` or `List[Dict[str, List[int]]]`) so you can use this method during preprocessing as
                 well as in a PyTorch Dataloader collate function.
-
                 Instead of :obj:`List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
                 see the note above for the return type.
             padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
                 Select a strategy to pad the returned sequences (according to the model's padding side and padding
                 index) among:
-
                 * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
                   single sequence if provided).
                 * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
@@ -291,17 +296,14 @@ class PrecollatorForGeneAndCellClassification(SpecialTokensMixin):
                 Maximum length of the returned list and optionally padding length (see above).
             pad_to_multiple_of (:obj:`int`, `optional`):
                 If set will pad the sequence to a multiple of the provided value.
-
                 This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                 >= 7.5 (Volta).
             return_attention_mask (:obj:`bool`, `optional`):
                 Whether to return the attention mask. If left to the default, will return the attention mask according
                 to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
-
                 `What are attention masks? <../glossary.html#attention-mask>`__
             return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
                 If set, will return tensors instead of list of python integers. Acceptable values are:
-
                 * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
                 * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
                 * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
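The two docstring hunks above only remove blank lines, but since they document the padding interface, a short illustration may help. This is a hedged sketch that assumes the class keeps the transformers-style pad signature described in that docstring; the batch contents and argument values are invented.

# Hedged sketch: padding a toy batch via the interface documented above.
# Reuses the `precollator` from the earlier sketch; none of this is taken
# from the repository's tests.
batch = [
    {"input_ids": [5, 9, 14]},
    {"input_ids": [5, 9]},
]
padded = precollator.pad(
    batch,
    padding="longest",        # True/"longest", "max_length", or False
    pad_to_multiple_of=None,  # e.g. 8 to help Tensor Cores on Volta+ GPUs
    return_attention_mask=True,
    return_tensors="pt",      # "tf", "pt", or "np"
)
# Under these assumptions, padded["input_ids"] is a 2x3 tensor whose second row
# is right-padded with pad_token_id, and padded["attention_mask"] marks real
# tokens with 1 and padding with 0.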
@@ -418,18 +420,15 @@ class PrecollatorForGeneAndCellClassification(SpecialTokensMixin):
     ) -> dict:
         """
         Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
-
         Args:
             encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
             max_length: maximum length of the returned list and optionally padding length (see below).
                 Will truncate by taking into account the special tokens.
             padding_strategy: PaddingStrategy to use for padding.
-
                 - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                 - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                 - PaddingStrategy.DO_NOT_PAD: Do not pad
                 The tokenizer padding sides are defined in self.padding_side:
-
                     - 'left': pads on the left of the sequences
                     - 'right': pads on the right of the sequences
             pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
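One detail of pad_to_multiple_of described above that is easy to misread: the padded length is rounded up to the next multiple rather than truncated. A standalone arithmetic sketch of how such a target length is typically computed (generic Python, not code from this file):

# Standalone illustration of the rounding behind pad_to_multiple_of: the
# effective length is the smallest multiple of `multiple` >= the raw length.
from typing import Optional

def round_up_to_multiple(length: int, multiple: Optional[int]) -> int:
    if multiple is None or length % multiple == 0:
        return length
    return ((length // multiple) + 1) * multiple

assert round_up_to_multiple(2045, 8) == 2048  # padded up, useful for Tensor Cores
assert round_up_to_multiple(2048, 8) == 2048  # already aligned, left unchanged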