Commit d2e2dfe · Parent(s): 0c0efc4

refactor: improve tokenization for Arabic text

Files changed:
- app.py +121 -59
- arabic_tokenizers_leaderboard.jsonl +14 -0
    	
        app.py
    CHANGED
    
@@ -7,39 +7,74 @@ import random
 from pathlib import Path
 
 initial_list_of_models = [
+    "asafaya/bert-base-arabic",
     "Xenova/gpt-4o",
+    "FreedomIntelligence/AceGPT-v1.5-13B-Chat",
+    "FreedomIntelligence/AceGPT-13B",
+    "Qwen/Qwen1.5-7B-Chat",
+    "Qwen/Qwen1.5-110B-Chat",
+    "microsoft/Phi-3-mini-128k-instruct",
+    "unsloth/gemma-2b-bnb-4bit",
     "NousResearch/Meta-Llama-3-8B",
     "CohereForAI/c4ai-command-r-v01",
     "CohereForAI/c4ai-command-r-plus",
     "core42/jais-13b",
+    "core42/jais-30b-chat-v3",
 ]
 
-dataset = load_dataset("MohamedRashad/rasaif-translations", split="train")["arabic"]
 dataframe_path = Path(__file__).parent / "arabic_tokenizers_leaderboard.jsonl"
 if dataframe_path.exists():
     df = pd.read_json(dataframe_path, lines=True)
 else:
+    df = pd.DataFrame(
+        columns=[
+            "👳 Tokenize Tashkeel",
+            "📛 Models",
+            "🪺 Fertility Score",
+            "➕ Total Number of Tokens",
+            "📘 Vocab Size",
+            "Tokenizer Class",
+        ]
+    )
 
+# Datasets used for calculating the number of tokens
+arabic_dataset1 = load_dataset("MohamedRashad/rasaif-translations", split="train")["arabic"]
+arabic_dataset2 = load_dataset("HeshamHaroon/arabic-quotes", split="train")["quote"]
+arabic_dataset3 = load_dataset("SaiedAlshahrani/Moroccan_Arabic_Wikipedia_20230101_nobots", split="train")["text"]
+all_data = arabic_dataset1 + arabic_dataset2 + arabic_dataset3
+print(f"Total number of samples: {len(all_data)}")
+all_text = " ".join(all_data)
+all_words = all_text.split()
+
+def benchmark_tokenizer(model_name) -> float:
+    # Initialize the tokenizer
     tokenizer = AutoTokenizer.from_pretrained(
         model_name, use_fast=True, trust_remote_code=True
     )
     vocab_size = tokenizer.vocab_size
+    total_number_of_tokens = len(tokenizer.tokenize(all_text))
+
+    # Check if the tokenizer maintains the tashkeel
+    dummy_text = "السَّلَامُ عَلَيْكُمْ وَرَحْمَةُ اللَّهِ وَبَرَكَاتُهُ"
+    tokenized_text = tokenizer.decode(tokenizer.encode(dummy_text), skip_special_tokens=True)
+    tashkeel_maintainer = "✅" if tokenized_text == dummy_text else "❌"
+
+    return {
+        "👳 Tokenize Tashkeel": tashkeel_maintainer,
+        "📛 Models": model_name,
+        "🪺 Fertility Score": round(total_number_of_tokens / len(all_words), 3),
+        "📘 Vocab Size": vocab_size,
+        "➕ Total Number of Tokens": total_number_of_tokens,
+        "Tokenizer Class": tokenizer.__class__.__name__,
+    }
+
+
+for model_name in tqdm(initial_list_of_models):
+    if model_name in df["📛 Models"].values:
+        continue
+
+    benchmark_data = benchmark_tokenizer(model_name)
+    df = df._append(benchmark_data, ignore_index=True)
 
 # Sort the dataframe by the number of tokens
 df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)

@@ -47,59 +82,57 @@ df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)
 # Save the dataframe to a csv file
 df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)
 
+
 def submit(model_name):
     global df
     if model_name in df["📛 Models"].values:
+        return (
+            gr.Dataframe(df),
+            gr.BarPlot(df),
+            gr.Dropdown(choices=df["📛 Models"].tolist()),
+        )
-    df = df._append(
-        {
-            "📛 Models": model_name,
-            "➕ Total Number of Tokens": number_of_tokens,
-            "📘 Vocab Size": vocab_size,
-            "Tokenizer Class": tokenizer.__class__.__name__,
-        },
-        ignore_index=True,
-    )
+    benchmark_data = benchmark_tokenizer(model_name)
+    df = df._append(benchmark_data, ignore_index=True)
     df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)
     df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)
+    return (
+        gr.Dataframe(df),
+        gr.BarPlot(df),
+        gr.Dropdown(choices=df["📛 Models"].tolist()),
+    )
+
 
 def generate_distinct_colors(n):
     """Generate n visually distinct colors in hexadecimal format."""
     if n > 256**3:
         raise ValueError("Cannot generate more than 16,777,216 unique colors.")
+
     # To ensure colors are distinct, calculate an appropriate distance between colors
     # The cube root of 256**3 (total colors) divided by n gives a crude initial spacing estimate
-    spacing = int((256 * 256 * 256)**(1/3) / n**(1/3))
+    spacing = int((256 * 256 * 256) ** (1 / 3) / n ** (1 / 3))
     max_val = 256 - spacing
+
     # Set to keep track of used colors
     used_colors = set()
+
     # List to store the result colors
     result = []
+
     attempts = 0
     while len(result) < n:
         # Generate a color with a random start and controlled spacing
         r = random.randint(0, max_val)
         g = random.randint(0, max_val)
         b = random.randint(0, max_val)
+
         # Scale up by spacing to ensure minimum distance between colors
         r = min(255, r * spacing)
         g = min(255, g * spacing)
         b = min(255, b * spacing)
+
         # Format the color in hexadecimal
         color = f"#{r:02X}{g:02X}{b:02X}"
+
         # Ensure this color hasn't been used
         if color not in used_colors:
             used_colors.add(color)

@@ -111,29 +144,31 @@ def generate_distinct_colors(n):
             spacing = max(1, spacing - 1)
             max_val = 256 - spacing
             attempts = 0
+
     return result
 
+
 def decode_bpe_tokens(tokens):
     fixed_tokens = []
     for token in tokens:
         # Check if the token starts with the special BPE space character 'Ġ'
+        if token.startswith("Ġ"):
             # Process the rest of the token
             try:
                 # Decode the rest of the token from UTF-8 bytes understood as Latin-1 characters
+                fixed_token = " " + token[1:].encode("utf-8").decode("utf-8")
             except UnicodeDecodeError:
                 fixed_token = token  # Use the original token if decoding fails
         else:
             try:
                 # Directly encode and decode without misinterpretation steps
+                fixed_token = token.encode("utf-8").decode("utf-8")
             except UnicodeDecodeError:
                 fixed_token = token  # Use the original token if decoding fails
         fixed_tokens.append(fixed_token)
     return fixed_tokens
 
+
 def tokenize_text(text, chosen_model, better_tokenization=False):
     tokenizer = AutoTokenizer.from_pretrained(chosen_model)
     tokenized_text = decode_bpe_tokens(tokenizer.tokenize(text))

@@ -144,11 +179,13 @@ def tokenize_text(text, chosen_model, better_tokenization=False):
         for token in tokenized_text:
             correct_tokenized_text = ""
             for char in text:
-                correct_tokenized_text += char
+                correct_tokenized_text += char
+                current_token = decode_bpe_tokens(
+                    tokenizer.tokenize(correct_tokenized_text)
+                )
                 if current_token[0] == token:
                     final_tokenized_text.append(correct_tokenized_text)
-                    text = text[len(correct_tokenized_text):]
+                    text = text[len(correct_tokenized_text) :]
                     break
     else:
         final_tokenized_text = tokenized_text

@@ -158,19 +195,30 @@ def tokenize_text(text, chosen_model, better_tokenization=False):
     color_map = {}
     for idx, token in enumerate(final_tokenized_text):
         output.append((token, str(idx)))
-        color_map[str(idx+1)] = random_colors[idx % len(random_colors)]
+        color_map[str(idx + 1)] = random_colors[idx % len(random_colors)]
 
     return gr.HighlightedText(output, color_map)
 
+
 def refresh():
     global df
     df = pd.read_json(dataframe_path, lines=True)
+    return (
+        gr.Dataframe(df),
+        gr.BarPlot(df),
+        gr.Dropdown(choices=df["📛 Models"].tolist()),
+    )
 
-This dataset was chosen because it represents Arabic Fusha text in a small and consentrated manner.
+leaderboard_description = """The `Total Number of Tokens` in this leaderboard is based on the total number of tokens got from the Arabic section of [rasaif-translations](https://huggingface.co/datasets/MohamedRashad/rasaif-translations) dataset (This dataset was chosen because it represents Arabic Fusha text in a small and concentrated manner).
 
+**A tokenizer that scores high in this leaderboard will be efficient in parsing Arabic in its different dialects and forms.**
+
+## Updates
+1. New datasets is added for the evaluation (e.g. [arabic-quotes](https://huggingface.co/datasets/HeshamHaroon/arabic-quotes), [Moroccan_Arabic_Wikipedia_20230101_nobots](https://huggingface.co/datasets/SaiedAlshahrani/Moroccan_Arabic_Wikipedia_20230101_nobots)).
+1. `Fertility Score` is calculated by dividing the total number of tokens by the total number of words in the dataset (another way to interpret `Total Number of Tokens`).
+1. `Tokenize Tashkeel` is an indicator of whether the tokenizer maintains the tashkeel when tokenizing or not (`✅` for yes, `❌` for no).
+1. `Vocab Size` is the total number of tokens in the tokenizer's vocabulary (e.g. `10000` tokens).
+1. `Tokenizer Class` is the class of the tokenizer (e.g. `BertTokenizer` or `GPT2Tokenizer`)
 """
 
 with gr.Blocks() as demo:

@@ -188,7 +236,7 @@ with gr.Blocks() as demo:
             y_title=" ",
             width=1000,
             height=400,
+            tooltip=["📘 Vocab Size", "🪺 Fertility Score"],
             vertical=False,
             x_label_angle=30,
         )

@@ -196,10 +244,18 @@ with gr.Blocks() as demo:
             label="Model Name from Hugging Face (e.g. Xenova/gpt-4o)"
         )
         with gr.Row():
+            submit_new_model_btn = gr.Button(
+                value="Submit New Model", variant="primary", scale=3
+            )
             refresh_btn = gr.Button(value="Refresh", variant="secondary", scale=1)
     with gr.Tab(label="Try tokenizers"):
+        text = gr.Textbox(
+            label="Enter a text",
+            lines=5,
+            value="السلام عليكم ورحمة الله",
+            rtl=True,
+            text_align="right",
+        )
         dropdown = gr.Dropdown(
             label="Select a model",
             choices=df["📛 Models"].tolist(),

@@ -207,12 +263,18 @@ with gr.Blocks() as demo:
         )
         with gr.Row():
             submit_text_btn = gr.Button(value="Submit", variant="primary", scale=3)
+            checkbox = gr.Checkbox(
+                label="Better tokenization for Arabic Text", value=False, scale=1
+            )
         tokenized_textbox = gr.HighlightedText(label="Tokenized text")
 
+    submit_new_model_btn.click(
+        submit, model_name, outputs=[dataframe, barplot, dropdown]
+    )
     refresh_btn.click(refresh, outputs=[dataframe, barplot, dropdown])
+    submit_text_btn.click(
+        tokenize_text, inputs=[text, dropdown, checkbox], outputs=[tokenized_textbox]
+    )
 
 
 demo.launch()
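The core of this commit is the new `benchmark_tokenizer` helper, which produces two of the leaderboard columns: the fertility score (tokens produced per whitespace-delimited word, lower is better) and the tashkeel check (whether an encode/decode round trip preserves the diacritics). The following is a minimal sketch of those two checks, run on one model from the commit's initial list and on the same dummy sentence the app uses for the tashkeel test; it is illustrative only, since app.py computes the metrics over three concatenated Arabic datasets rather than a single sentence.

from transformers import AutoTokenizer

# One model from the commit's initial_list_of_models; any Hub tokenizer works here.
model_name = "asafaya/bert-base-arabic"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Illustrative sample only; app.py concatenates three Arabic datasets instead.
sample_text = "السَّلَامُ عَلَيْكُمْ وَرَحْمَةُ اللَّهِ وَبَرَكَاتُهُ"

# Fertility score: total number of tokens divided by total number of words.
num_tokens = len(tokenizer.tokenize(sample_text))
num_words = len(sample_text.split())
fertility_score = round(num_tokens / num_words, 3)

# Tashkeel check: does decoding the encoded text return the diacritics unchanged?
round_trip = tokenizer.decode(tokenizer.encode(sample_text), skip_special_tokens=True)
keeps_tashkeel = "✅" if round_trip == sample_text else "❌"

print(model_name, fertility_score, keeps_tashkeel)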
    	
        arabic_tokenizers_leaderboard.jsonl
    ADDED
    
@@ -0,0 +1,14 @@
+{"👳 Tokenize Tashkeel":"❌","📛 Models":"asafaya\/bert-base-arabic","🪺 Fertility Score":1.614,"➕ Total Number of Tokens":1242530,"📘 Vocab Size":32000,"Tokenizer Class":"BertTokenizerFast"}
+{"👳 Tokenize Tashkeel":"✅","📛 Models":"core42\/jais-13b","🪺 Fertility Score":1.668,"➕ Total Number of Tokens":1284508,"📘 Vocab Size":84992,"Tokenizer Class":"PreTrainedTokenizerFast"}
+{"👳 Tokenize Tashkeel":"✅","📛 Models":"core42\/jais-30b-chat-v3","🪺 Fertility Score":1.668,"➕ Total Number of Tokens":1284508,"📘 Vocab Size":84992,"Tokenizer Class":"PreTrainedTokenizerFast"}
+{"👳 Tokenize Tashkeel":"✅","📛 Models":"FreedomIntelligence\/AceGPT-v1.5-13B-Chat","🪺 Fertility Score":1.888,"➕ Total Number of Tokens":1453838,"📘 Vocab Size":44800,"Tokenizer Class":"LlamaTokenizerFast"}
+{"👳 Tokenize Tashkeel":"✅","📛 Models":"Xenova\/gpt-4o","🪺 Fertility Score":2.115,"➕ Total Number of Tokens":1628374,"📘 Vocab Size":200000,"Tokenizer Class":"GPT2TokenizerFast"}
+{"👳 Tokenize Tashkeel":"❌","📛 Models":"CohereForAI\/c4ai-command-r-v01","🪺 Fertility Score":2.154,"➕ Total Number of Tokens":1658463,"📘 Vocab Size":255000,"Tokenizer Class":"CohereTokenizerFast"}
+{"👳 Tokenize Tashkeel":"❌","📛 Models":"CohereForAI\/c4ai-command-r-plus","🪺 Fertility Score":2.154,"➕ Total Number of Tokens":1658463,"📘 Vocab Size":255000,"Tokenizer Class":"CohereTokenizerFast"}
+{"👳 Tokenize Tashkeel":"✅","📛 Models":"unsloth\/gemma-2b-bnb-4bit","🪺 Fertility Score":2.199,"➕ Total Number of Tokens":1692826,"📘 Vocab Size":256000,"Tokenizer Class":"GemmaTokenizerFast"}
+{"👳 Tokenize Tashkeel":"✅","📛 Models":"NousResearch\/Meta-Llama-3-8B","🪺 Fertility Score":2.374,"➕ Total Number of Tokens":1827816,"📘 Vocab Size":128000,"Tokenizer Class":"PreTrainedTokenizerFast"}
+{"👳 Tokenize Tashkeel":"❌","📛 Models":"Qwen\/Qwen1.5-7B-Chat","🪺 Fertility Score":2.444,"➕ Total Number of Tokens":1881958,"📘 Vocab Size":151643,"Tokenizer Class":"Qwen2TokenizerFast"}
+{"👳 Tokenize Tashkeel":"❌","📛 Models":"Qwen\/Qwen1.5-110B-Chat","🪺 Fertility Score":2.444,"➕ Total Number of Tokens":1881958,"📘 Vocab Size":151643,"Tokenizer Class":"Qwen2TokenizerFast"}
+{"👳 Tokenize Tashkeel":"✅","📛 Models":"FreedomIntelligence\/AceGPT-13B","🪺 Fertility Score":5.46,"➕ Total Number of Tokens":4203685,"📘 Vocab Size":32000,"Tokenizer Class":"LlamaTokenizerFast"}
+{"👳 Tokenize Tashkeel":"✅","📛 Models":"microsoft\/Phi-3-mini-128k-instruct","🪺 Fertility Score":5.46,"➕ Total Number of Tokens":4203685,"📘 Vocab Size":32000,"Tokenizer Class":"LlamaTokenizerFast"}
+{"👳 Tokenize Tashkeel":"✅","📛 Models":"01-ai\/Yi-1.5-34B-Chat","🪺 Fertility Score":6.674,"➕ Total Number of Tokens":5138447,"📘 Vocab Size":64000,"Tokenizer Class":"LlamaTokenizerFast"}
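Since the leaderboard is stored as plain JSON Lines, the committed file can be inspected outside the Space with the same call app.py uses to load it. A small sketch, assuming the file sits in the current working directory:

import pandas as pd

# Load and sort the leaderboard the same way app.py does on startup.
df = pd.read_json("arabic_tokenizers_leaderboard.jsonl", lines=True)
df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)
print(df[["📛 Models", "🪺 Fertility Score", "👳 Tokenize Tashkeel"]])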
 
			
