Commit d2e2dfe · Parent(s): 0c0efc4

refactor: improve tokenization for Arabic text

Files changed:
- app.py +121 -59
- arabic_tokenizers_leaderboard.jsonl +14 -0
    	
        app.py
    CHANGED
    
@@ -7,39 +7,74 @@ import random
 from pathlib import Path
 
 initial_list_of_models = [
+    "asafaya/bert-base-arabic",
     "Xenova/gpt-4o",
+    "FreedomIntelligence/AceGPT-v1.5-13B-Chat",
+    "FreedomIntelligence/AceGPT-13B",
+    "Qwen/Qwen1.5-7B-Chat",
+    "Qwen/Qwen1.5-110B-Chat",
+    "microsoft/Phi-3-mini-128k-instruct",
+    "unsloth/gemma-2b-bnb-4bit",
     "NousResearch/Meta-Llama-3-8B",
     "CohereForAI/c4ai-command-r-v01",
     "CohereForAI/c4ai-command-r-plus",
     "core42/jais-13b",
+    "core42/jais-30b-chat-v3",
 ]
 
-dataset = load_dataset("MohamedRashad/rasaif-translations", split="train")["arabic"]
 dataframe_path = Path(__file__).parent / "arabic_tokenizers_leaderboard.jsonl"
 if dataframe_path.exists():
     df = pd.read_json(dataframe_path, lines=True)
 else:
+    df = pd.DataFrame(
+        columns=[
+            "👳 Tokenize Tashkeel",
+            "📛 Models",
+            "🪺 Fertility Score",
+            "➕ Total Number of Tokens",
+            "📘 Vocab Size",
+            "Tokenizer Class",
+        ]
+    )
 
+# Datasets used for calculating the number of tokens
+arabic_dataset1 = load_dataset("MohamedRashad/rasaif-translations", split="train")["arabic"]
+arabic_dataset2 = load_dataset("HeshamHaroon/arabic-quotes", split="train")["quote"]
+arabic_dataset3 = load_dataset("SaiedAlshahrani/Moroccan_Arabic_Wikipedia_20230101_nobots", split="train")["text"]
+all_data = arabic_dataset1 + arabic_dataset2 + arabic_dataset3
+print(f"Total number of samples: {len(all_data)}")
+all_text = " ".join(all_data)
+all_words = all_text.split()
+
+def benchmark_tokenizer(model_name) -> float:
+    # Initialize the tokenizer
     tokenizer = AutoTokenizer.from_pretrained(
         model_name, use_fast=True, trust_remote_code=True
     )
     vocab_size = tokenizer.vocab_size
+    total_number_of_tokens = len(tokenizer.tokenize(all_text))
+
+    # Check if the tokenizer maintains the tashkeel
+    dummy_text = "السَّلَامُ عَلَيْكُمْ وَرَحْمَةُ اللَّهِ وَبَرَكَاتُهُ"
+    tokenized_text = tokenizer.decode(tokenizer.encode(dummy_text), skip_special_tokens=True)
+    tashkeel_maintainer = "✅" if tokenized_text == dummy_text else "❌"
+
+    return {
+        "👳 Tokenize Tashkeel": tashkeel_maintainer,
+        "📛 Models": model_name,
+        "🪺 Fertility Score": round(total_number_of_tokens / len(all_words), 3),
+        "📘 Vocab Size": vocab_size,
+        "➕ Total Number of Tokens": total_number_of_tokens,
+        "Tokenizer Class": tokenizer.__class__.__name__,
+    }
+
+
+for model_name in tqdm(initial_list_of_models):
+    if model_name in df["📛 Models"].values:
+        continue
+
+    benchmark_data = benchmark_tokenizer(model_name)
+    df = df._append(benchmark_data, ignore_index=True)
 
 # Sort the dataframe by the number of tokens
 df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)

@@ -47,59 +82,57 @@ df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)
 # Save the dataframe to a csv file
 df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)
 
+
 def submit(model_name):
     global df
     if model_name in df["📛 Models"].values:
+        return (
+            gr.Dataframe(df),
+            gr.BarPlot(df),
+            gr.Dropdown(choices=df["📛 Models"].tolist()),
+        )
-    df = df._append(
-        {
-            "📛 Models": model_name,
-            "➕ Total Number of Tokens": number_of_tokens,
-            "📘 Vocab Size": vocab_size,
-            "Tokenizer Class": tokenizer.__class__.__name__,
-        },
-        ignore_index=True,
-    )
+    benchmark_data = benchmark_tokenizer(model_name)
+    df = df._append(benchmark_data, ignore_index=True)
     df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)
     df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)
+    return (
+        gr.Dataframe(df),
+        gr.BarPlot(df),
+        gr.Dropdown(choices=df["📛 Models"].tolist()),
+    )
+
 
 def generate_distinct_colors(n):
     """Generate n visually distinct colors in hexadecimal format."""
     if n > 256**3:
         raise ValueError("Cannot generate more than 16,777,216 unique colors.")
+
     # To ensure colors are distinct, calculate an appropriate distance between colors
     # The cube root of 256**3 (total colors) divided by n gives a crude initial spacing estimate
-    spacing = int((256 * 256 * 256)**(1/3) / n**(1/3))
+    spacing = int((256 * 256 * 256) ** (1 / 3) / n ** (1 / 3))
     max_val = 256 - spacing
+
     # Set to keep track of used colors
     used_colors = set()
+
     # List to store the result colors
     result = []
+
     attempts = 0
     while len(result) < n:
         # Generate a color with a random start and controlled spacing
         r = random.randint(0, max_val)
         g = random.randint(0, max_val)
         b = random.randint(0, max_val)
+
         # Scale up by spacing to ensure minimum distance between colors
         r = min(255, r * spacing)
         g = min(255, g * spacing)
         b = min(255, b * spacing)
+
         # Format the color in hexadecimal
         color = f"#{r:02X}{g:02X}{b:02X}"
+
         # Ensure this color hasn't been used
         if color not in used_colors:
             used_colors.add(color)

@@ -111,29 +144,31 @@ def generate_distinct_colors(n):
             spacing = max(1, spacing - 1)
             max_val = 256 - spacing
             attempts = 0
+
     return result
 
+
 def decode_bpe_tokens(tokens):
     fixed_tokens = []
     for token in tokens:
         # Check if the token starts with the special BPE space character 'Ġ'
+        if token.startswith("Ġ"):
             # Process the rest of the token
             try:
                 # Decode the rest of the token from UTF-8 bytes understood as Latin-1 characters
+                fixed_token = " " + token[1:].encode("utf-8").decode("utf-8")
             except UnicodeDecodeError:
                 fixed_token = token  # Use the original token if decoding fails
         else:
             try:
                 # Directly encode and decode without misinterpretation steps
+                fixed_token = token.encode("utf-8").decode("utf-8")
             except UnicodeDecodeError:
                 fixed_token = token  # Use the original token if decoding fails
         fixed_tokens.append(fixed_token)
     return fixed_tokens
 
+
 def tokenize_text(text, chosen_model, better_tokenization=False):
     tokenizer = AutoTokenizer.from_pretrained(chosen_model)
     tokenized_text = decode_bpe_tokens(tokenizer.tokenize(text))

@@ -144,11 +179,13 @@ def tokenize_text(text, chosen_model, better_tokenization=False):
         for token in tokenized_text:
             correct_tokenized_text = ""
             for char in text:
-                correct_tokenized_text += char
+                correct_tokenized_text += char
+                current_token = decode_bpe_tokens(
+                    tokenizer.tokenize(correct_tokenized_text)
+                )
                 if current_token[0] == token:
                     final_tokenized_text.append(correct_tokenized_text)
-                    text = text[len(correct_tokenized_text):]
+                    text = text[len(correct_tokenized_text) :]
                     break
     else:
         final_tokenized_text = tokenized_text

@@ -158,19 +195,30 @@ def tokenize_text(text, chosen_model, better_tokenization=False):
     color_map = {}
     for idx, token in enumerate(final_tokenized_text):
         output.append((token, str(idx)))
-        color_map[str(idx+1)] = random_colors[idx % len(random_colors)]
+        color_map[str(idx + 1)] = random_colors[idx % len(random_colors)]
 
     return gr.HighlightedText(output, color_map)
 
+
 def refresh():
     global df
     df = pd.read_json(dataframe_path, lines=True)
+    return (
+        gr.Dataframe(df),
+        gr.BarPlot(df),
+        gr.Dropdown(choices=df["📛 Models"].tolist()),
+    )
 
-This dataset was chosen because it represents Arabic Fusha text in a small and consentrated manner.
+leaderboard_description = """The `Total Number of Tokens` in this leaderboard is based on the total number of tokens got from the Arabic section of [rasaif-translations](https://huggingface.co/datasets/MohamedRashad/rasaif-translations) dataset (This dataset was chosen because it represents Arabic Fusha text in a small and concentrated manner).
 
+**A tokenizer that scores high in this leaderboard will be efficient in parsing Arabic in its different dialects and forms.**
+
+## Updates
+1. New datasets is added for the evaluation (e.g. [arabic-quotes](https://huggingface.co/datasets/HeshamHaroon/arabic-quotes), [Moroccan_Arabic_Wikipedia_20230101_nobots](https://huggingface.co/datasets/SaiedAlshahrani/Moroccan_Arabic_Wikipedia_20230101_nobots)).
+1. `Fertility Score` is calculated by dividing the total number of tokens by the total number of words in the dataset (another way to interpret `Total Number of Tokens`).
+1. `Tokenize Tashkeel` is an indicator of whether the tokenizer maintains the tashkeel when tokenizing or not (`✅` for yes, `❌` for no).
+1. `Vocab Size` is the total number of tokens in the tokenizer's vocabulary (e.g. `10000` tokens).
+1. `Tokenizer Class` is the class of the tokenizer (e.g. `BertTokenizer` or `GPT2Tokenizer`)
 """
 
 with gr.Blocks() as demo:

@@ -188,7 +236,7 @@ with gr.Blocks() as demo:
             y_title=" ",
             width=1000,
             height=400,
+            tooltip=["📘 Vocab Size", "🪺 Fertility Score"],
             vertical=False,
             x_label_angle=30,
         )

@@ -196,10 +244,18 @@ with gr.Blocks() as demo:
             label="Model Name from Hugging Face (e.g. Xenova/gpt-4o)"
         )
         with gr.Row():
+            submit_new_model_btn = gr.Button(
+                value="Submit New Model", variant="primary", scale=3
+            )
             refresh_btn = gr.Button(value="Refresh", variant="secondary", scale=1)
     with gr.Tab(label="Try tokenizers"):
+        text = gr.Textbox(
+            label="Enter a text",
+            lines=5,
+            value="السلام عليكم ورحمة الله",
+            rtl=True,
+            text_align="right",
+        )
         dropdown = gr.Dropdown(
             label="Select a model",
             choices=df["📛 Models"].tolist(),

@@ -207,12 +263,18 @@ with gr.Blocks() as demo:
         )
         with gr.Row():
             submit_text_btn = gr.Button(value="Submit", variant="primary", scale=3)
+            checkbox = gr.Checkbox(
+                label="Better tokenization for Arabic Text", value=False, scale=1
+            )
         tokenized_textbox = gr.HighlightedText(label="Tokenized text")
 
+    submit_new_model_btn.click(
+        submit, model_name, outputs=[dataframe, barplot, dropdown]
+    )
     refresh_btn.click(refresh, outputs=[dataframe, barplot, dropdown])
+    submit_text_btn.click(
+        tokenize_text, inputs=[text, dropdown, checkbox], outputs=[tokenized_textbox]
+    )
 
 
 demo.launch()
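The core of this commit is the new `benchmark_tokenizer` helper, which produces two of the leaderboard columns: the fertility score (tokens produced per whitespace-delimited word, lower is better) and the tashkeel check (whether an encode/decode round trip preserves the diacritics). The following is a minimal sketch of those two checks, run on one model from the commit's initial list and on the same dummy sentence the app uses for the tashkeel test; it is illustrative only, since app.py computes the metrics over three concatenated Arabic datasets rather than a single sentence.

from transformers import AutoTokenizer

# One model from the commit's initial_list_of_models; any Hub tokenizer works here.
model_name = "asafaya/bert-base-arabic"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Illustrative sample only; app.py concatenates three Arabic datasets instead.
sample_text = "السَّلَامُ عَلَيْكُمْ وَرَحْمَةُ اللَّهِ وَبَرَكَاتُهُ"

# Fertility score: total number of tokens divided by total number of words.
num_tokens = len(tokenizer.tokenize(sample_text))
num_words = len(sample_text.split())
fertility_score = round(num_tokens / num_words, 3)

# Tashkeel check: does decoding the encoded text return the diacritics unchanged?
round_trip = tokenizer.decode(tokenizer.encode(sample_text), skip_special_tokens=True)
keeps_tashkeel = "✅" if round_trip == sample_text else "❌"

print(model_name, fertility_score, keeps_tashkeel)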
    	
        arabic_tokenizers_leaderboard.jsonl
    ADDED
    
@@ -0,0 +1,14 @@
+{"👳 Tokenize Tashkeel":"❌","📛 Models":"asafaya\/bert-base-arabic","🪺 Fertility Score":1.614,"➕ Total Number of Tokens":1242530,"📘 Vocab Size":32000,"Tokenizer Class":"BertTokenizerFast"}
+{"👳 Tokenize Tashkeel":"✅","📛 Models":"core42\/jais-13b","🪺 Fertility Score":1.668,"➕ Total Number of Tokens":1284508,"📘 Vocab Size":84992,"Tokenizer Class":"PreTrainedTokenizerFast"}
+{"👳 Tokenize Tashkeel":"✅","📛 Models":"core42\/jais-30b-chat-v3","🪺 Fertility Score":1.668,"➕ Total Number of Tokens":1284508,"📘 Vocab Size":84992,"Tokenizer Class":"PreTrainedTokenizerFast"}
+{"👳 Tokenize Tashkeel":"✅","📛 Models":"FreedomIntelligence\/AceGPT-v1.5-13B-Chat","🪺 Fertility Score":1.888,"➕ Total Number of Tokens":1453838,"📘 Vocab Size":44800,"Tokenizer Class":"LlamaTokenizerFast"}
+{"👳 Tokenize Tashkeel":"✅","📛 Models":"Xenova\/gpt-4o","🪺 Fertility Score":2.115,"➕ Total Number of Tokens":1628374,"📘 Vocab Size":200000,"Tokenizer Class":"GPT2TokenizerFast"}
+{"👳 Tokenize Tashkeel":"❌","📛 Models":"CohereForAI\/c4ai-command-r-v01","🪺 Fertility Score":2.154,"➕ Total Number of Tokens":1658463,"📘 Vocab Size":255000,"Tokenizer Class":"CohereTokenizerFast"}
+{"👳 Tokenize Tashkeel":"❌","📛 Models":"CohereForAI\/c4ai-command-r-plus","🪺 Fertility Score":2.154,"➕ Total Number of Tokens":1658463,"📘 Vocab Size":255000,"Tokenizer Class":"CohereTokenizerFast"}
+{"👳 Tokenize Tashkeel":"✅","📛 Models":"unsloth\/gemma-2b-bnb-4bit","🪺 Fertility Score":2.199,"➕ Total Number of Tokens":1692826,"📘 Vocab Size":256000,"Tokenizer Class":"GemmaTokenizerFast"}
+{"👳 Tokenize Tashkeel":"✅","📛 Models":"NousResearch\/Meta-Llama-3-8B","🪺 Fertility Score":2.374,"➕ Total Number of Tokens":1827816,"📘 Vocab Size":128000,"Tokenizer Class":"PreTrainedTokenizerFast"}
+{"👳 Tokenize Tashkeel":"❌","📛 Models":"Qwen\/Qwen1.5-7B-Chat","🪺 Fertility Score":2.444,"➕ Total Number of Tokens":1881958,"📘 Vocab Size":151643,"Tokenizer Class":"Qwen2TokenizerFast"}
+{"👳 Tokenize Tashkeel":"❌","📛 Models":"Qwen\/Qwen1.5-110B-Chat","🪺 Fertility Score":2.444,"➕ Total Number of Tokens":1881958,"📘 Vocab Size":151643,"Tokenizer Class":"Qwen2TokenizerFast"}
+{"👳 Tokenize Tashkeel":"✅","📛 Models":"FreedomIntelligence\/AceGPT-13B","🪺 Fertility Score":5.46,"➕ Total Number of Tokens":4203685,"📘 Vocab Size":32000,"Tokenizer Class":"LlamaTokenizerFast"}
+{"👳 Tokenize Tashkeel":"✅","📛 Models":"microsoft\/Phi-3-mini-128k-instruct","🪺 Fertility Score":5.46,"➕ Total Number of Tokens":4203685,"📘 Vocab Size":32000,"Tokenizer Class":"LlamaTokenizerFast"}
+{"👳 Tokenize Tashkeel":"✅","📛 Models":"01-ai\/Yi-1.5-34B-Chat","🪺 Fertility Score":6.674,"➕ Total Number of Tokens":5138447,"📘 Vocab Size":64000,"Tokenizer Class":"LlamaTokenizerFast"}
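Since the leaderboard is stored as plain JSON Lines, the committed file can be inspected outside the Space with the same call app.py uses to load it. A small sketch, assuming the file sits in the current working directory:

import pandas as pd

# Load and sort the leaderboard the same way app.py does on startup.
df = pd.read_json("arabic_tokenizers_leaderboard.jsonl", lines=True)
df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)
print(df[["📛 Models", "🪺 Fertility Score", "👳 Tokenize Tashkeel"]])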
 
			
