import collections
import os
import re
from typing import List, Optional

import torch
from transformers import PreTrainedTokenizer

from SmilesPE.tokenizer import SPE_Tokenizer


def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        tokens = reader.readlines()
    for index, token in enumerate(tokens):
        token = token.rstrip("\n")
        vocab[token] = index
    return vocab


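# Illustrative vocabulary-file layout (an assumption about the files used with the
# tokenizers below, not something this module ships): one token per line, with the
# line number becoming the id, so load_vocab would yield e.g.
#
#     [PAD] -> 0, [UNK] -> 1, [CLS] -> 2, [SEP] -> 3, [MASK] -> 4, "C" -> 5, ...

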
class Atomwise_Tokenizer(object):
    """Runs atom-level SMILES tokenization."""

    def __init__(self):
        """Constructs an atom-level tokenizer."""
        # Regex that splits a SMILES string into bracket atoms, bare atoms, bond
        # symbols, branch parentheses, ring-closure digits, and short "(...)"
        # groups of up to four characters, which are kept as single tokens.
        self.regex_pattern = r"(\([^\(\)]{0,4}\)|\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/\/?|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
        self.regex = re.compile(self.regex_pattern)

    def tokenize(self, text):
        """Basic tokenization of a SMILES string."""
        return self.regex.findall(text)


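# Example (illustrative), tokenizing aspirin's SMILES with the regex above:
#
#     Atomwise_Tokenizer().tokenize("CC(=O)Oc1ccccc1C(=O)O")
#     # ['C', 'C', '(=O)', 'O', 'c', '1', 'c', 'c', 'c', 'c', 'c', '1', 'C', '(=O)', 'O']

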
class SMILES_SPE_Tokenizer(PreTrainedTokenizer):
    r"""
    Constructs a SMILES tokenizer based on SMILES Pair Encoding (https://github.com/XinhaoLi74/SmilesPE).

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
    should refer to the superclass for more information regarding methods.

    Args:
        vocab_file (:obj:`string`):
            File containing the vocabulary.
        spe_file (:obj:`string`):
            File containing the trained SMILES Pair Encoding vocabulary.
        unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
            this token instead.
        sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
            for sequence classification or for a text and a question for question answering.
            It is also used as the last token of a sequence built with special tokens.
        pad_token (:obj:`string`, `optional`, defaults to "[PAD]"):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
            The classifier token which is used when doing sequence classification (classification of the whole
            sequence instead of per-token classification). It is the first token of the sequence when built with
            special tokens.
        mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
    """

    def __init__(self, vocab_file, spe_file,
                 unk_token="[UNK]",
                 sep_token="[SEP]",
                 pad_token="[PAD]",
                 cls_token="[CLS]",
                 mask_token="[MASK]",
                 **kwargs):
        if not os.path.isfile(vocab_file):
            raise ValueError("Can't find a vocabulary file at path '{}'.".format(vocab_file))
        if not os.path.isfile(spe_file):
            raise ValueError("Can't find a SPE vocabulary file at path '{}'.".format(spe_file))

        self.vocab = load_vocab(vocab_file)
        # SmilesPE's SPE_Tokenizer takes the merge-rule file object directly.
        self.spe_vocab = open(spe_file, "r", encoding="utf-8")
        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
        self.spe_tokenizer = SPE_Tokenizer(self.spe_vocab)

        super().__init__(
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs)

    @property
    def vocab_size(self):
        return len(self.vocab)

    def get_vocab(self):
        return dict(self.vocab, **self.added_tokens_encoder)

    def _tokenize(self, text):
        # SPE returns a single space-separated string of sub-SMILES tokens.
        return self.spe_tokenizer.tokenize(text).split(' ')

    def _convert_token_to_id(self, token):
        """Converts a token (str) into an id using the vocab."""
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def encode(self, token_array):
        """Converts a list of tokens into a single-example batch of input ids,
        wrapped with the [CLS]/[SEP] special tokens, plus a matching attention mask."""
        token_ids = [self.cls_token_id]
        for token in token_array:
            token_ids.append(self._convert_token_to_id(token))
        token_ids.append(self.sep_token_id)
        token_ids = torch.tensor([token_ids])
        attn_mask = torch.ones_like(token_ids)
        return {'input_ids': token_ids, 'attention_mask': attn_mask}

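    # Illustrative shape of ``encode``'s output for a three-token input
    # (placeholders in angle brackets, not real ids):
    #   {'input_ids':      tensor([[<cls id>, <id 1>, <id 2>, <id 3>, <sep id>]]),
    #    'attention_mask': tensor([[1, 1, 1, 1, 1]])}
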
    def decode(self, token_ids, skip_special_tokens=True):
        """Converts a sequence of ids back into a SMILES string, stopping at the
        first [SEP] token."""
        if isinstance(token_ids, torch.Tensor):
            token_ids = token_ids.squeeze(0).cpu().tolist()
        token_array = []
        for idx in token_ids:
            if idx == self.sep_token_id:
                break
            if skip_special_tokens and idx in self.all_special_ids:
                continue
            token_array.append(self._convert_id_to_token(idx))
        return "".join(token_array)

    def batch_decode(self, batch_token_ids, skip_special_tokens=True):
        """Decodes a batch of id sequences by applying ``decode`` row by row."""
        sequences = []
        for token_ids in batch_token_ids:
            sequences.append(self.decode(token_ids, skip_special_tokens=skip_special_tokens))
        return sequences

    def get_token_split(self, token_ids):
        """Converts a batch of id sequences into the corresponding token strings,
        keeping the per-token split (no joining, no special-token removal)."""
        if isinstance(token_ids, torch.Tensor):
            token_ids = token_ids.cpu().tolist()

        token_array = []
        for seq_ids in token_ids:
            seq_array = [self._convert_id_to_token(token_id) for token_id in seq_ids]
            token_array.append(seq_array)
        return token_array

    def _convert_id_to_token(self, index):
        """Converts an index (integer) into a token (str) using the vocab."""
        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) into a single string."""
        out_string = " ".join(tokens).replace(" ##", "").strip()
        return out_string

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks
        by concatenating and adding special tokens.

        A BERT sequence has the following format:

        - single sequence: ``[CLS] X [SEP]``
        - pair of sequences: ``[CLS] A [SEP] B [SEP]``

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` method.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of ids.
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Set to True if the token list is already formatted with special tokens for the model.

        Returns:
            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formatted with special tokens for the model."
                )
            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.

        A BERT sequence pair mask has the following format:

        ::

            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
            | first sequence    | second sequence |

        If token_ids_1 is None, only returns the first portion of the mask (0's).

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of ids.
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
            sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, vocab_path):
        """
        Saves the tokenizer's vocabulary (one token per line, in id order) to a file.

        Args:
            vocab_path (:obj:`str`):
                Path of the file the vocabulary will be written to.

        Returns:
            :obj:`Tuple(str)`: Paths to the files saved.
        """
        index = 0
        vocab_file = vocab_path
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    # Vocabulary indices are not consecutive; keep writing, but the
                    # saved file will not round-trip to identical ids.
                    index = token_index
                writer.write(token + "\n")
                index += 1
        return (vocab_file,)


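# Usage sketch (illustrative; "spe_vocab.txt" and "spe_codes.txt" are hypothetical
# placeholder paths, not files provided by this module):
#
#     tokenizer = SMILES_SPE_Tokenizer(vocab_file="spe_vocab.txt", spe_file="spe_codes.txt")
#     tokens = tokenizer._tokenize("CC(=O)Oc1ccccc1C(=O)O")
#     batch = tokenizer.encode(tokens)        # {'input_ids': ..., 'attention_mask': ...}
#     smiles = tokenizer.decode(batch["input_ids"])

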
class SMILES_Atomwise_Tokenizer(PreTrainedTokenizer):
    r"""
    Constructs an atom-level SMILES tokenizer backed by :class:`Atomwise_Tokenizer`.

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
    should refer to the superclass for more information regarding methods.

    Args:
        vocab_file (:obj:`string`):
            File containing the vocabulary.
        unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
            this token instead.
        sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
            for sequence classification or for a text and a question for question answering.
            It is also used as the last token of a sequence built with special tokens.
        pad_token (:obj:`string`, `optional`, defaults to "[PAD]"):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
            The classifier token which is used when doing sequence classification (classification of the whole
            sequence instead of per-token classification). It is the first token of the sequence when built with
            special tokens.
        mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
    """

    def __init__(
        self,
        vocab_file,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        **kwargs
    ):
        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'.".format(vocab_file)
            )
        # Load the vocabulary before calling the parent constructor so the base class
        # can resolve special tokens against it (mirrors SMILES_SPE_Tokenizer above).
        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
        self.tokenizer = Atomwise_Tokenizer()

        super().__init__(
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs,
        )

    @property
    def vocab_size(self):
        return len(self.vocab)

    def get_vocab(self):
        return dict(self.vocab, **self.added_tokens_encoder)

    def _tokenize(self, text):
        return self.tokenizer.tokenize(text)

    def _convert_token_to_id(self, token):
        """Converts a token (str) into an id using the vocab."""
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) into a token (str) using the vocab."""
        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) into a single string."""
        out_string = " ".join(tokens).replace(" ##", "").strip()
        return out_string

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks
        by concatenating and adding special tokens.

        A BERT sequence has the following format:

        - single sequence: ``[CLS] X [SEP]``
        - pair of sequences: ``[CLS] A [SEP] B [SEP]``

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` method.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of ids.
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Set to True if the token list is already formatted with special tokens for the model.

        Returns:
            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formatted with special tokens for the model."
                )
            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.

        A BERT sequence pair mask has the following format:

        ::

            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
            | first sequence    | second sequence |

        If token_ids_1 is None, only returns the first portion of the mask (0's).

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of ids.
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
            sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, vocab_path):
        """
        Saves the tokenizer's vocabulary (one token per line, in id order) to a file.

        Args:
            vocab_path (:obj:`str`):
                Path of the file the vocabulary will be written to.

        Returns:
            :obj:`Tuple(str)`: Paths to the files saved.
        """
        index = 0
        vocab_file = vocab_path
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    # Vocabulary indices are not consecutive; keep writing, but the
                    # saved file will not round-trip to identical ids.
                    index = token_index
                writer.write(token + "\n")
                index += 1
        return (vocab_file,)
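

if __name__ == "__main__":
    # Minimal smoke test (illustrative only). The plain Atomwise_Tokenizer needs no
    # vocabulary files, so it can be exercised directly; the PreTrainedTokenizer
    # wrappers above additionally require a vocab file (and, for SPE, a merges file),
    # neither of which ships with this module.
    demo_smiles = "CC(=O)Oc1ccccc1C(=O)O"  # aspirin
    print(Atomwise_Tokenizer().tokenize(demo_smiles))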