Commit 085ad06 by im
Parent(s): ab03e32

add BPE, Unigram, and WordPiece tokenisers

Files changed:
- .streamlit/config.toml +1 -1
- app.py +279 -82
.streamlit/config.toml CHANGED
@@ -1,3 +1,3 @@
[theme]
-base="
+base="light"
font="sans serif"
app.py CHANGED
@@ -39,13 +39,16 @@ advancements:

Understanding these foundational concepts is crucial to comprehending the overall structure and function of the
Transformer model. They are the building blocks from which the rest of the model is constructed, and their roles
-within the architecture are essential to the model's ability to process and generate language.

Given the importance and complexity of these concepts, I have chosen to dedicate the first article in this series
solely to Tokenisation and embeddings. The decision to separate the topics into individual articles is driven by a
desire to provide a thorough and in-depth understanding of each component of the Transformer model.

-
""")

with st.expander("Copernicus Museum in Warsaw"):
@@ -56,7 +59,7 @@ providing the opportunity to explore different concepts firsthand. **They even f
illustrates the neuron activation process during the recognition of handwritten digits!**

Taking inspiration from this approach, we'll embark on our journey into the world of Transformer models by first
-establishing a firm understanding of
needed to delve into the more complex aspects of these models later on.

I encourage you not to hesitate in modifying parameters or experimenting with different models in the provided
@@ -67,6 +70,7 @@ through this virtual, interactive museum of AI. Enjoy the exploration!

divider()

st.header("Tokenisers and Tokenisation")

st.write("""Tokenisation is the initial step in the data preprocessing pipeline for natural language processing (NLP)
@@ -78,40 +82,36 @@ the language of the text.

from transformers import AutoTokenizer

-sentence = st.text_input("
sentence_split = sentence.split()
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
sentence_tokenise_bert = tokenizer.tokenize(sentence)
sentence_encode_bert = tokenizer.encode(sentence)
sentence_encode_bert = list(zip(sentence_tokenise_bert, sentence_encode_bert))

-st.write(f"""
-
-""")
-st.code(f"""
-"{sentence}"
-""")
-
-st.write(f"""
-A basic word-level Tokenisation would produce tokens:
""")
st.code(f"""
{sentence_split}
""")


-st.write(f"""
-However,
-""")
st.code(f"""
{sentence_tokenise_bert}
""")

-with st.expander("click to look at the code:"):
st.code(f"""\
from transformers import AutoTokenizer

-sentence =
sentence_split = sentence.split()
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
sentence_tokenise_bert = tokenizer.tokenize(sentence)
@@ -144,38 +144,91 @@ Changing the Tokeniser is akin to altering the model's language on the fly—ima
then taking the exam in French or Spanish. You might get lucky, but it's a considerable risk.
""")

-
-
-
-
-
-
-
-
-
-
training_dataset = st.text_area("*Training Dataset - Vocabulary:*", value=training_dataset, height=200)
training_dataset = training_dataset.split('\n')
vocabulary_size = st.number_input("Vocabulary Size:", value=100000)


-# TODO: add more tokenisers
from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, trainers
tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
-# tokenizer = Tokenizer(models.Unigram())
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
tokenizer.decoder = decoders.ByteLevel()
trainer = trainers.BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"], vocab_size=vocabulary_size)
-
-# trainer = trainers.UnigramTrainer(
-# vocab_size=20000,
-# initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
-# special_tokens=["<PAD>", "<BOS>", "<EOS>"],
-# )
-
tokenizer.train_from_iterator(training_dataset, trainer=trainer)
-
-sentence = st.text_input("*Text to tokenise:*", value="[CLS] Tokenising text is a fundamental step for NLP models. [SEP] [PAD] [PAD] [PAD]")
output = tokenizer.encode(sentence)

st.write("*Tokens:*")
@@ -185,60 +238,204 @@ Readability counts.
attention_mast: {output.attention_mask}
""")



st.subheader("Try Yourself:")
-st.write(f"""
efficiency of the process. This approach helps to eliminate unknown tokens, thereby making the token sequence
-more understandable and containing less tokens*
""")

-st.
-
-
-
-
-[
-most models have a fixed-size vocabulary, it's not possible to have a unique token for every possible word. The [UNK]
-token is used as a catch-all for any words the model hasn't seen before. E.g. in our example we 'decided' that Large
-Language (LL) abbreviation is not part of the model's vocabulary.
-
-[CLS]: Stands for "classification". In models like BERT, this token is added at the beginning of every input
-sequence. The representation (embedding) of this token is used as the aggregate sequence representation for
-classification tasks. In other words, the model is trained to encode the meaning of the entire sequence into this token.
-
-[SEP]: Stands for "separator". This token is used to separate different sequences when the model needs to take more
-than one input sequence. For example, in question-answering tasks, the model takes two inputs: a question and a
-passage that contains the answer. The two inputs are separated by a [SEP] token.
-
-[MASK]: This token is specific to models like BERT, which are trained with a masked language modelling objective.
-During training, some percentage of the input tokens are replaced with the [MASK] token, and the model's goal is to
-predict the original value of the masked tokens.
-
-[PAD]: Stands for "padding". This token is used to fill in the extra spaces when batching sequences of different
-lengths together. Since models require input sequences to be the same length, shorter sequences are extended with [
-PAD] tokens. In our example, we extended the length of the input sequence to 16 tokens.

-
-
-
-
-tokenizer =
-
-
-
-
-tokenizer.train_from_iterator(training_dataset, trainer=trainer)
-output = tokenizer.encode(
-



with st.expander("References:"):
st.write("""\
-- https://huggingface.co/docs/transformers/tokenizer_summary
-- https://huggingface.co/docs/tokenizers/training_from_memory
-- https://en.wikipedia.org/wiki/Byte_pair_encoding
-
""")

divider()
@@ -39,13 +39,16 @@ advancements:

Understanding these foundational concepts is crucial to comprehending the overall structure and function of the
Transformer model. They are the building blocks from which the rest of the model is constructed, and their roles
+within the architecture are essential to the model's ability to process and generate language. In my view,
+a comprehensive and simple explanation may give a reader a significant advantage in using LLMs. Feynman once said,
+"*I think I can safely say that nobody understands quantum mechanics*", because he couldn't explain it to a freshman.

Given the importance and complexity of these concepts, I have chosen to dedicate the first article in this series
solely to Tokenisation and embeddings. The decision to separate the topics into individual articles is driven by a
desire to provide a thorough and in-depth understanding of each component of the Transformer model.

+Note: *HuggingFace provides an exceptional [tutorial on Transformer models](https://huggingface.co/docs/transformers/index).
+That tutorial is particularly beneficial for readers who want to dive into advanced topics.*
""")

with st.expander("Copernicus Museum in Warsaw"):
@@ -56,7 +59,7 @@ providing the opportunity to explore different concepts firsthand. **They even f
illustrates the neuron activation process during the recognition of handwritten digits!**

Taking inspiration from this approach, we'll embark on our journey into the world of Transformer models by first
+establishing a firm understanding of tokenisation and embeddings. This foundation will equip us with the knowledge
needed to delve into the more complex aspects of these models later on.

I encourage you not to hesitate in modifying parameters or experimenting with different models in the provided
@@ -67,6 +70,7 @@ through this virtual, interactive museum of AI. Enjoy the exploration!

divider()

+
st.header("Tokenisers and Tokenisation")

st.write("""Tokenisation is the initial step in the data preprocessing pipeline for natural language processing (NLP)
@@ -78,40 +82,36 @@ the language of the text.

from transformers import AutoTokenizer

+sentence = st.text_input("Consider the sentence (you can change it):", value="Tokenising text is a fundamental step for NLP models.")
sentence_split = sentence.split()
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
sentence_tokenise_bert = tokenizer.tokenize(sentence)
sentence_encode_bert = tokenizer.encode(sentence)
sentence_encode_bert = list(zip(sentence_tokenise_bert, sentence_encode_bert))

+st.write(f"""\
+A basic word-level tokenisation, which splits a text by spaces, would produce the following tokens:
""")
st.code(f"""
{sentence_split}
""")


+st.write(f"""\
+However, we notice that punctuation may stay attached to the words. Note, for instance, how the tokenization dealt with the word "Don't".
+"Don't" stands for "do not", so it would be better tokenized as ["Do", "n't"]. (Hint: try another sentence: "I mustn't tell lies. Don't do this.") This is where things start getting complicated,
+and it is part of the reason each model has its own tokenizer type. Depending on the rules we apply for tokenizing a text,
+a different tokenized output is generated for the same text.
+A more sophisticated algorithm, with several optimizations, might generate a different set of tokens: """)
st.code(f"""
{sentence_tokenise_bert}
""")

+with st.expander("click here to look at the Python code:"):
st.code(f"""\
from transformers import AutoTokenizer

+sentence = "{sentence}"
sentence_split = sentence.split()
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
sentence_tokenise_bert = tokenizer.tokenize(sentence)
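Editor's aside, not part of this commit: the word-level versus subword comparison shown above can be reproduced outside Streamlit with a few lines. The sketch below assumes the transformers package is installed and that the 'bert-base-uncased' checkpoint can be downloaded or is already cached; exact subword splits may vary slightly by tokenizer version.

from transformers import AutoTokenizer

sentence = "Tokenising text is a fundamental step for NLP models. Don't skip it."

# Naive word-level tokenisation: split on whitespace only, so punctuation
# stays glued to the words ("models.", "Don't").
print(sentence.split())

# Subword tokenisation with BERT's pretrained WordPiece tokeniser: it
# lowercases, separates punctuation, and may break words into '##' pieces.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
print(tokenizer.tokenize(sentence))

# encode() additionally maps tokens to vocabulary ids and wraps the
# sequence in the special tokens [CLS] ... [SEP].
print(tokenizer.encode(sentence))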
@@ -144,38 +144,91 @@ Changing the Tokeniser is akin to altering the model's language on the fly—ima
then taking the exam in French or Spanish. You might get lucky, but it's a considerable risk.
""")

+training_dataset = """\
+Beautiful is better than ugly.
+Explicit is better than implicit.
+Simple is better than complex.
+Complex is better than complicated.
+Flat is better than nested.
+Sparse is better than dense.
+Readability counts.
+"""
+
+tokeniser_name = st.selectbox(label="Choose your tokeniser", options=["BPE", 'Unigram', 'WordPiece'])
+if tokeniser_name == 'BPE':
+st.subheader("Byte-Pair Encoding (BPE)")
+st.write("""\
+Byte-Pair Encoding (BPE) was introduced in [Neural Machine Translation of Rare Words with Subword
+Units (Sennrich et al., 2015)](https://arxiv.org/abs/1508.07909). BPE relies on a pre-tokenizer that splits the
+training data into words. Pre-tokenization can be as simple as space tokenization, e.g. GPT-2, Roberta. More
+advanced pre-tokenization includes rule-based tokenization, e.g. XLM, FlauBERT which uses Moses for most
+languages, or GPT which uses Spacy and ftfy, to count the frequency of each word in the training corpus.
+
+After pre-tokenization, a set of unique words has been created and the frequency with which each word occurred in the
+training data has been determined. Next, BPE creates a base vocabulary consisting of all symbols that occur in the
+set of unique words and learns merge rules to form a new symbol from two symbols of the base vocabulary. It does so
+until the vocabulary has attained the desired vocabulary size. Note that the desired vocabulary size is a
+hyperparameter to define before training the tokenizer.
+
+As an example, let’s assume that after pre-tokenization, the following set of words including their frequency has
+been determined:
+""")
+st.code(""" ("hug", 10), ("pug", 5), ("pun", 12), ("bun", 4), ("hugs", 5) """)
+st.write("""\
+Consequently, the base vocabulary is ["b", "g", "h", "n", "p", "s", "u"]. Splitting all words into symbols of the base vocabulary, we obtain:
+""")
+st.code(""" ("h" "u" "g", 10), ("p" "u" "g", 5), ("p" "u" "n", 12), ("b" "u" "n", 4), ("h" "u" "g" "s", 5) """)
+st.write("""\
+BPE then counts the frequency of each possible symbol pair and picks the symbol pair that occurs
+most frequently. In the example above "h" followed by "u" is present 10 + 5 = 15 times (10 times in the 10
+occurrences of "hug", 5 times in the 5 occurrences of "hugs"). However, the most frequent symbol pair is "u"
+followed by "g", occurring 10 + 5 + 5 = 20 times in total. Thus, the first merge rule the tokenizer learns is to
+group all "u" symbols followed by a "g" symbol together. Next, "ug" is added to the vocabulary. The set of words
+then becomes
+""")
+st.code(""" ("h" "ug", 10), ("p" "ug", 5), ("p" "u" "n", 12), ("b" "u" "n", 4), ("h" "ug" "s", 5) """)
+st.write("""\
+BPE then identifies the next most common symbol pair. It’s "u" followed by "n", which occurs 16
+times. "u", "n" is merged to "un" and added to the vocabulary. The next most frequent symbol pair is "h" followed
+by "ug", occurring 15 times. Again the pair is merged and "hug" can be added to the vocabulary.
+
+At this stage, the vocabulary is ["b", "g", "h", "n", "p", "s", "u", "ug", "un", "hug"] and our set of unique words is represented as
+""")
+st.code(""" ("hug", 10), ("p" "ug", 5), ("p" "un", 12), ("b" "un", 4), ("hug" "s", 5) """)
+st.write("""\
+Assuming that the Byte-Pair Encoding training would stop at this point, the learned merge rules
+would then be applied to new words (as long as those new words do not include symbols that were not in the base
+vocabulary). For instance, the word "bug" would be tokenized to ["b", "ug"] but "mug" would be tokenized as [
+"[unk]", "ug"] since the symbol "m" is not in the base vocabulary. In general, single letters such as "m" are not
+replaced by the "[unk]" symbol because the training data usually includes at least one occurrence of each letter,
+but it is likely to happen for very special characters like emojis.
+
+As mentioned earlier, the vocabulary size, i.e. the base vocabulary size + the number of merges, is a hyperparameter
+to choose. For instance GPT has a vocabulary size of 40,478 since they have 478 base characters and chose to stop
+training after 40,000 merges.
+""")
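Editor's note: the merge counting described in the walkthrough above can be checked with a short, dependency-free script. This is an illustrative sketch of the BPE idea on the same toy corpus, not code from the commit.

from collections import Counter

# Words are kept as tuples of their current symbols, with corpus frequencies.
corpus = {("h", "u", "g"): 10, ("p", "u", "g"): 5,
          ("p", "u", "n"): 12, ("b", "u", "n"): 4,
          ("h", "u", "g", "s"): 5}

def count_pairs(corpus):
    """Count how often each adjacent symbol pair occurs, weighted by word frequency."""
    pairs = Counter()
    for word, freq in corpus.items():
        for a, b in zip(word, word[1:]):
            pairs[(a, b)] += freq
    return pairs

def merge(corpus, pair):
    """Apply one merge rule: replace every occurrence of `pair` by the fused symbol."""
    merged = {}
    for word, freq in corpus.items():
        out, i = [], 0
        while i < len(word):
            if i < len(word) - 1 and (word[i], word[i + 1]) == pair:
                out.append(word[i] + word[i + 1])
                i += 2
            else:
                out.append(word[i])
                i += 1
        merged[tuple(out)] = freq
    return merged

for step in range(3):
    pairs = count_pairs(corpus)
    best = max(pairs, key=pairs.get)
    print(f"merge {step + 1}: {best} occurs {pairs[best]} times")
    corpus = merge(corpus, best)

print(corpus)
# The three merges are ('u', 'g') with 20 occurrences, ('u', 'n') with 16 and
# ('h', 'ug') with 15, matching the vocabulary ["b","g","h","n","p","s","u","ug","un","hug"].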
+
|
| 211 |
+
|
| 212 |
+
st.subheader("Try Yourself:")
|
| 213 |
+
st.write(f"""\
|
| 214 |
+
*Using text area field below try to find or create a comprehensive vocabulary (training dataset) for Tokenisation, which can enhance the
|
| 215 |
+
efficiency of the process. This approach helps to eliminate unknown tokens, thereby making the token sequence
|
| 216 |
+
more understandable and containing less tokens (ids)*
|
| 217 |
+
""")
|
| 218 |
+
|
| 219 |
training_dataset = st.text_area("*Training Dataset - Vocabulary:*", value=training_dataset, height=200)
|
| 220 |
training_dataset = training_dataset.split('\n')
|
| 221 |
vocabulary_size = st.number_input("Vocabulary Size:", value=100000)
|
| 222 |
+
sentence = st.text_input(label="*Text to tokenise:*",
|
| 223 |
+
value="[CLS] Tokenising text is a fundamental step for NLP models. [SEP] [PAD] [PAD] [PAD]")
|
| 224 |
|
| 225 |
|
|
|
|
| 226 |
from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, trainers
|
| 227 |
tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
|
|
|
|
| 228 |
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
|
| 229 |
tokenizer.decoder = decoders.ByteLevel()
|
| 230 |
trainer = trainers.BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"], vocab_size=vocabulary_size)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
tokenizer.train_from_iterator(training_dataset, trainer=trainer)
|
|
|
|
|
|
|
| 232 |
output = tokenizer.encode(sentence)
|
| 233 |
|
| 234 |
st.write("*Tokens:*")
|
|
|
|
@@ -185,60 +238,204 @@ Readability counts.
attention_mast: {output.attention_mask}
""")

+st.write(""" *well done if you get ids like these: [1, 57, 49, 28, 10, 58, 55, 52, 31, 54, 5, 2, 3, 3, 3]!*""")
+
+with st.expander("Python code:"):
+st.code(f"""
+from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, trainers
+
+tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
+tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
+tokenizer.decoder = decoders.ByteLevel()
+trainer = trainers.BpeTrainer(
+special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
+vocab_size={vocabulary_size})
+training_dataset = {training_dataset}
+tokenizer.train_from_iterator(training_dataset, trainer=trainer)
+output = tokenizer.encode("{sentence}")
+""", language='python')
+elif tokeniser_name == 'Unigram':
+st.subheader("""Unigram""")
+st.write("""\
+Unigram is a subword tokenization algorithm introduced in [Subword Regularization: Improving Neural
+Network Translation Models with Multiple Subword Candidates (Kudo, 2018)](https://arxiv.org/pdf/1804.10959.pdf).
+In contrast to BPE or WordPiece, Unigram initializes its base vocabulary to a large number of symbols and
+progressively trims down each symbol to obtain a smaller vocabulary. The base vocabulary could for instance
+correspond to all pre-tokenized words and the most common substrings. Unigram is not used directly for any of the
+models in the transformers library, but it’s used in conjunction with SentencePiece.
+
+At each training step, the Unigram algorithm defines a loss (often defined as the log-likelihood) over the training
+data given the current vocabulary and a unigram language model. Then, for each symbol in the vocabulary,
+the algorithm computes how much the overall loss would increase if the symbol was to be removed from the vocabulary.
+Unigram then removes p (with p usually being 10% or 20%) percent of the symbols whose loss increase is the lowest,
+i.e. those symbols that least affect the overall loss over the training data. This process is repeated until the
+vocabulary has reached the desired size. The Unigram algorithm always keeps the base characters so that any word can
+be tokenized.
+
+Because Unigram is not based on merge rules (in contrast to BPE and WordPiece), the algorithm has several ways of
+tokenizing new text after training. As an example, if a trained Unigram tokenizer exhibits the vocabulary:
+""")
+st.code(""" ["b", "g", "h", "n", "p", "s", "u", "ug", "un", "hug"] """)
+st.write("""\
+"hugs" could be tokenized as ["hug", "s"], ["h", "ug", "s"] or ["h", "u", "g", "s"]. So which
+one to choose? Unigram saves the probability of each token in the training corpus on top of saving the vocabulary
+so that the probability of each possible tokenization can be computed after training. The algorithm simply picks
+the most likely tokenization in practice, but also offers the possibility to sample a possible tokenization
+according to their probabilities.
+""")

+st.subheader("Try Yourself:")
+st.write(f"""\
+*Using the text area field below, try to find or create a comprehensive vocabulary (training dataset) for Tokenisation, which can enhance the
+efficiency of the process. This approach helps to eliminate unknown tokens, thereby making the token sequence
+more understandable and containing fewer tokens (ids).*
+""")
+
+training_dataset = st.text_area("*Training Dataset - Vocabulary (change it and look at the resulting tokens):*", value=training_dataset, height=200)
+training_dataset = training_dataset.split('\n')
+vocabulary_size = st.number_input("Vocabulary Size:", value=100000)
+sentence = st.text_input(label="*Text to tokenise:*",
+value="[CLS] Tokenising text is a fundamental step for NLP models. [SEP] [PAD] [PAD] [PAD]")
+
+from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, trainers
+
+tokenizer = Tokenizer(models.Unigram())
+tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
+tokenizer.decoder = decoders.ByteLevel()
+trainer = trainers.UnigramTrainer(
+vocab_size=vocabulary_size,
+unk_token="[UNK]",
+# initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
+special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
+)
+tokenizer.train_from_iterator(training_dataset, trainer=trainer)
+output = tokenizer.encode(sentence)
+
+# TODO: make it more visible, container with a different color or something
+st.write("*Tokens:*")
+st.code(f"""{output.tokens}""")
+st.code(f"""\
+ids: {output.ids}
+attention_mask: {output.attention_mask}
+""")
+
+st.write(""" *well done if you get ids like these: [1, 57, 49, 28, 10, 58, 55, 52, 31, 54, 5, 2, 3, 3, 3]!*""")
+with st.expander("Python code:"):
+st.code(f"""\
+from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, trainers
+
+tokenizer = Tokenizer(models.Unigram())
+tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
+tokenizer.decoder = decoders.ByteLevel()
+trainer = trainers.UnigramTrainer(
+vocab_size={vocabulary_size},
+special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
+)
+training_dataset = {training_dataset}
+tokenizer.train_from_iterator(training_dataset, trainer=trainer)
+output = tokenizer.encode("{sentence}")
+""", language='python')
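Editor's note: the "which segmentation to choose" question described above can be made concrete with a tiny script. This sketch is not part of the commit, and the token probabilities below are made up for illustration; a real Unigram model estimates them from the training corpus.

import math

# Hypothetical unigram probabilities over the toy vocabulary from the example.
probs = {"b": 0.03, "g": 0.05, "h": 0.06, "n": 0.07, "p": 0.05,
         "s": 0.09, "u": 0.07, "ug": 0.20, "un": 0.22, "hug": 0.16}

def segmentations(text):
    """Yield every way of splitting `text` into tokens from the vocabulary."""
    if not text:
        yield []
        return
    for end in range(1, len(text) + 1):
        piece = text[:end]
        if piece in probs:
            for rest in segmentations(text[end:]):
                yield [piece] + rest

# Rank the competing segmentations of "hugs" by their unigram log-probability.
for seg in sorted(segmentations("hugs"),
                  key=lambda s: -sum(math.log(probs[t]) for t in s)):
    log_p = sum(math.log(probs[t]) for t in seg)
    print(seg, f"log-prob = {log_p:.2f}")

# With these made-up probabilities the best split is ['hug', 's']; a real
# Unigram tokenizer picks the most likely split, or samples among them for
# subword regularisation.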
+elif tokeniser_name == 'WordPiece':
+st.subheader("""WordPiece""")
+st.write("""\
+WordPiece is the subword tokenization algorithm used for BERT, DistilBERT, and Electra. The
+algorithm was outlined in [Japanese and Korean Voice Search (Schuster et al.,
+2012)](https://static.googleusercontent.com/media/research.google.com/ja//pubs/archive/37842.pdf) and is very
+similar to BPE. WordPiece first initializes the vocabulary to include every character present in the training
+data and progressively learns a given number of merge rules. In contrast to BPE, WordPiece does not choose the
+most frequent symbol pair, but the one that maximizes the likelihood of the training data once added to the
+vocabulary.
+
+So what does this mean exactly? Referring to the example from the BPE tokeniser, maximizing the likelihood of the training data is
+equivalent to finding the symbol pair whose probability divided by the probabilities of its first symbol followed by
+its second symbol is the greatest among all symbol pairs. E.g. "u", followed by "g" would have only been merged if
+the probability of "ug" divided by "u", "g" would have been greater than for any other symbol pair. Intuitively,
+WordPiece is slightly different to BPE in that it evaluates what it loses by merging two symbols to ensure it’s worth
+it.
+""")

st.subheader("Try Yourself:")
+st.write(f"""\
+*Using the text area field below, try to find or create a comprehensive vocabulary (training dataset) for Tokenisation, which can enhance the
efficiency of the process. This approach helps to eliminate unknown tokens, thereby making the token sequence
+more understandable and containing fewer tokens (ids).*
""")

+training_dataset = st.text_area("*Training Dataset - Vocabulary (change it and look at the resulting tokens):*",
+value=training_dataset, height=200)
+training_dataset = training_dataset.split('\n')
+vocabulary_size = st.number_input("Vocabulary Size:", value=100000)
+sentence = st.text_input(label="*Text to tokenise:*",
+value="[CLS] Tokenising text is a fundamental step for NLP models. [SEP] [PAD] [PAD] [PAD]")

+from tokenizers import Tokenizer, decoders, models, pre_tokenizers, trainers
+
+tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
+tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
+tokenizer.decoder = decoders.ByteLevel()
+trainer = trainers.WordPieceTrainer(
+vocab_size=vocabulary_size,
+special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
+)
+tokenizer.train_from_iterator(training_dataset, trainer=trainer)
+output = tokenizer.encode(sentence)
+
+# TODO: make it more visible, container with a different color or something
+st.write("*Tokens:*")
+st.code(f"""{output.tokens}""")
+st.code(f"""\
+ids: {output.ids}
+attention_mask: {output.attention_mask}
+""")
+
+st.write(""" *well done if you get ids like these: [1, 76, 72, 50, 10, 77, 71, 68, 66, 78, 5, 2, 3, 3, 3]!*""")
+with st.expander("Python code:"):
+st.code(f"""\
+from tokenizers import Tokenizer, decoders, models, pre_tokenizers, trainers
+
+tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
+trainer = trainers.WordPieceTrainer(
+vocab_size={vocabulary_size},
+special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
+)
+training_dataset = {training_dataset}
+tokenizer.train_from_iterator(training_dataset, trainer=trainer)
+output = tokenizer.encode("{sentence}")
+""", language='python')
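Editor's note: the WordPiece criterion described above (pair probability divided by the probabilities of its parts) can be contrasted with BPE's raw pair frequency on the same toy corpus. The scoring formula below follows the description in the text; the sketch is illustrative and not part of the commit.

from collections import Counter

corpus = {("h", "u", "g"): 10, ("p", "u", "g"): 5,
          ("p", "u", "n"): 12, ("b", "u", "n"): 4,
          ("h", "u", "g", "s"): 5}

# Frequency of every single symbol and of every adjacent symbol pair.
symbol_freq, pair_freq = Counter(), Counter()
for word, freq in corpus.items():
    for sym in word:
        symbol_freq[sym] += freq
    for a, b in zip(word, word[1:]):
        pair_freq[(a, b)] += freq

# WordPiece-style score: pair frequency divided by the product of the
# frequencies of the two parts.
def score(pair):
    a, b = pair
    return pair_freq[pair] / (symbol_freq[a] * symbol_freq[b])

for pair in sorted(pair_freq, key=score, reverse=True):
    print(f"{pair}: frequency={pair_freq[pair]}, score={score(pair):.4f}")

# On this toy corpus the top-scoring pair is ("g", "s") (5 / (20 * 5) = 0.05),
# even though BPE would merge ("u", "g") first, the most frequent pair at 20.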
+
+
+with st.expander("Special tokens meaning:"):
+st.write("""\
+\\#\\# prefix: It means that the preceding string is not whitespace; any token with this prefix should be
+merged with the previous token when you convert the tokens back to a string.
+
+[UNK]: Stands for "unknown". This token is used to represent any word that is not in the model's vocabulary. Since
+most models have a fixed-size vocabulary, it's not possible to have a unique token for every possible word. The [UNK]
+token is used as a catch-all for any words the model hasn't seen before. E.g. in our example we 'decided' that the Large
+Language (LL) abbreviation is not part of the model's vocabulary.
+
+[CLS]: Stands for "classification". In models like BERT, this token is added at the beginning of every input
+sequence. The representation (embedding) of this token is used as the aggregate sequence representation for
+classification tasks. In other words, the model is trained to encode the meaning of the entire sequence into this token.
+
+[SEP]: Stands for "separator". This token is used to separate different sequences when the model needs to take more
+than one input sequence. For example, in question-answering tasks, the model takes two inputs: a question and a
+passage that contains the answer. The two inputs are separated by a [SEP] token.
+
+[MASK]: This token is specific to models like BERT, which are trained with a masked language modelling objective.
+During training, some percentage of the input tokens are replaced with the [MASK] token, and the model's goal is to
+predict the original value of the masked tokens.
+
+[PAD]: Stands for "padding". This token is used to fill in the extra spaces when batching sequences of different
+lengths together. Since models require input sequences to be the same length, shorter sequences are extended with
+[PAD] tokens. In our example, we extended the length of the input sequence to 16 tokens.
+""")
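Editor's note: the special tokens described above are easiest to see by encoding a short, padded input with a pretrained BERT tokeniser. The sketch assumes the transformers package is installed and the 'bert-base-uncased' checkpoint is available; exact token splits may vary slightly by tokenizer version.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Pad a short sentence to a fixed length of 12 positions.
encoded = tokenizer("Tokenising text is fun.", padding="max_length", max_length=12)

# [CLS] at the start, [SEP] after the real tokens, [PAD] filling the rest.
print(tokenizer.convert_ids_to_tokens(encoded["input_ids"]))
print(encoded["attention_mask"])  # 1 for real tokens, 0 for [PAD] positions

# A character the vocabulary has likely never seen typically maps to [UNK].
print(tokenizer.tokenize("🜲"))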


with st.expander("References:"):
st.write("""\
+- https://huggingface.co/docs/transformers/tokenizer_summary
+- https://huggingface.co/docs/tokenizers/training_from_memory
""")

divider()