Commit
·
0c0efc4
1
Parent(s):
e4cac44
Add refresh functionality to update dataframe
Browse files
app.py
CHANGED
|
@@ -17,6 +17,7 @@ initial_list_of_models = [
|
|
| 17 |
dataset = load_dataset("MohamedRashad/rasaif-translations", split="train")["arabic"]
|
| 18 |
|
| 19 |
dataframe_path = Path(__file__).parent / "arabic_tokenizers_leaderboard.jsonl"
|
|
|
|
| 20 |
if dataframe_path.exists():
|
| 21 |
df = pd.read_json(dataframe_path, lines=True)
|
| 22 |
else:
|
|
@@ -161,8 +162,14 @@ def tokenize_text(text, chosen_model, better_tokenization=False):
|
|
| 161 |
|
| 162 |
return gr.HighlightedText(output, color_map)
|
| 163 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
# User-facing markdown shown with the leaderboard tab; explains how the
# "Total Number of Tokens" metric is computed and why the dataset was chosen.
# Fix: "consentrated" -> "concentrated" (typo in displayed text).
leaderboard_description = """The `Total Number of Tokens` in this leaderboard is based on the total number of tokens summed on the Arabic section of [rasaif-translations](https://huggingface.co/datasets/MohamedRashad/rasaif-translations) dataset.
This dataset was chosen because it represents Arabic Fusha text in a small and concentrated manner.
A tokenizer that scores high in this leaderboard will be efficient in parsing Arabic in its different dialects and forms.
"""
|
| 168 |
|
|
@@ -188,7 +195,9 @@ with gr.Blocks() as demo:
|
|
| 188 |
model_name = gr.Textbox(
|
| 189 |
label="Model Name from Hugging Face (e.g. Xenova/gpt-4o)"
|
| 190 |
)
|
| 191 |
-
|
|
|
|
|
|
|
| 192 |
with gr.Tab(label="Try tokenizers"):
|
| 193 |
text = gr.Textbox(label="Enter a text", lines=5, value="السلام عليكم ورحمة الله", rtl=True, text_align="right")
|
| 194 |
dropdown = gr.Dropdown(
|
|
@@ -202,6 +211,7 @@ with gr.Blocks() as demo:
|
|
| 202 |
tokenized_textbox = gr.HighlightedText(label="Tokenized text")
|
| 203 |
|
| 204 |
submit_new_model_btn.click(submit, model_name, outputs=[dataframe, barplot, dropdown])
|
|
|
|
| 205 |
submit_text_btn.click(tokenize_text, inputs=[text, dropdown, checkbox], outputs=[tokenized_textbox])
|
| 206 |
|
| 207 |
|
|
|
|
| 17 |
dataset = load_dataset("MohamedRashad/rasaif-translations", split="train")["arabic"]
|
| 18 |
|
| 19 |
dataframe_path = Path(__file__).parent / "arabic_tokenizers_leaderboard.jsonl"
|
| 20 |
+
|
| 21 |
if dataframe_path.exists():
|
| 22 |
df = pd.read_json(dataframe_path, lines=True)
|
| 23 |
else:
|
|
|
|
| 162 |
|
| 163 |
return gr.HighlightedText(output, color_map)
|
| 164 |
|
| 165 |
+
def refresh():
    """Reload the leaderboard JSONL from disk and rebuild the UI components.

    Re-reads ``dataframe_path`` into the module-level ``df`` and returns fresh
    Gradio components so the table, bar plot, and model dropdown all reflect
    the current on-disk leaderboard data.

    Returns:
        tuple: (gr.Dataframe, gr.BarPlot, gr.Dropdown) built from the
        reloaded dataframe; the dropdown choices come from the
        "📛 Models" column.
    """
    global df
    reloaded = pd.read_json(dataframe_path, lines=True)
    df = reloaded
    model_names = reloaded["📛 Models"].tolist()
    return (
        gr.Dataframe(reloaded),
        gr.BarPlot(reloaded),
        gr.Dropdown(choices=model_names),
    )
| 169 |
+
|
| 170 |
# User-facing markdown shown with the leaderboard tab; explains how the
# "Total Number of Tokens" metric is computed and why the dataset was chosen.
# Fix: "consentrated" -> "concentrated" (typo in displayed text).
leaderboard_description = """The `Total Number of Tokens` in this leaderboard is based on the total number of tokens summed on the Arabic section of [rasaif-translations](https://huggingface.co/datasets/MohamedRashad/rasaif-translations) dataset.
This dataset was chosen because it represents Arabic Fusha text in a small and concentrated manner.

A tokenizer that scores high in this leaderboard will be efficient in parsing Arabic in its different dialects and forms.
"""
|
| 175 |
|
|
|
|
| 195 |
model_name = gr.Textbox(
|
| 196 |
label="Model Name from Hugging Face (e.g. Xenova/gpt-4o)"
|
| 197 |
)
|
| 198 |
+
with gr.Row():
|
| 199 |
+
submit_new_model_btn = gr.Button(value="Submit New Model", variant="primary", scale=3)
|
| 200 |
+
refresh_btn = gr.Button(value="Refresh", variant="secondary", scale=1)
|
| 201 |
with gr.Tab(label="Try tokenizers"):
|
| 202 |
text = gr.Textbox(label="Enter a text", lines=5, value="السلام عليكم ورحمة الله", rtl=True, text_align="right")
|
| 203 |
dropdown = gr.Dropdown(
|
|
|
|
| 211 |
tokenized_textbox = gr.HighlightedText(label="Tokenized text")
|
| 212 |
|
| 213 |
submit_new_model_btn.click(submit, model_name, outputs=[dataframe, barplot, dropdown])
|
| 214 |
+
refresh_btn.click(refresh, outputs=[dataframe, barplot, dropdown])
|
| 215 |
submit_text_btn.click(tokenize_text, inputs=[text, dropdown, checkbox], outputs=[tokenized_textbox])
|
| 216 |
|
| 217 |
|