Commit
·
0c0efc4
1
Parent(s):
e4cac44
Add refresh functionality to update dataframe
Browse files
app.py
CHANGED
|
@@ -17,6 +17,7 @@ initial_list_of_models = [
|
|
| 17 |
dataset = load_dataset("MohamedRashad/rasaif-translations", split="train")["arabic"]
|
| 18 |
|
| 19 |
dataframe_path = Path(__file__).parent / "arabic_tokenizers_leaderboard.jsonl"
|
|
|
|
| 20 |
if dataframe_path.exists():
|
| 21 |
df = pd.read_json(dataframe_path, lines=True)
|
| 22 |
else:
|
|
@@ -161,8 +162,14 @@ def tokenize_text(text, chosen_model, better_tokenization=False):
|
|
| 161 |
|
| 162 |
return gr.HighlightedText(output, color_map)
|
| 163 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
# User-facing markdown shown with the leaderboard tab; explains how the
# "Total Number of Tokens" metric is computed and why the dataset was chosen.
# Fix: "consentrated" -> "concentrated" (typo in displayed text).
leaderboard_description = """The `Total Number of Tokens` in this leaderboard is based on the total number of tokens summed on the Arabic section of [rasaif-translations](https://huggingface.co/datasets/MohamedRashad/rasaif-translations) dataset.
This dataset was chosen because it represents Arabic Fusha text in a small and concentrated manner.
A tokenizer that scores high in this leaderboard will be efficient in parsing Arabic in its different dialects and forms.
"""
|
| 168 |
|
|
@@ -188,7 +195,9 @@ with gr.Blocks() as demo:
|
|
| 188 |
model_name = gr.Textbox(
|
| 189 |
label="Model Name from Hugging Face (e.g. Xenova/gpt-4o)"
|
| 190 |
)
|
| 191 |
-
|
|
|
|
|
|
|
| 192 |
with gr.Tab(label="Try tokenizers"):
|
| 193 |
text = gr.Textbox(label="Enter a text", lines=5, value="السلام عليكم ورحمة الله", rtl=True, text_align="right")
|
| 194 |
dropdown = gr.Dropdown(
|
|
@@ -202,6 +211,7 @@ with gr.Blocks() as demo:
|
|
| 202 |
tokenized_textbox = gr.HighlightedText(label="Tokenized text")
|
| 203 |
|
| 204 |
submit_new_model_btn.click(submit, model_name, outputs=[dataframe, barplot, dropdown])
|
|
|
|
| 205 |
submit_text_btn.click(tokenize_text, inputs=[text, dropdown, checkbox], outputs=[tokenized_textbox])
|
| 206 |
|
| 207 |
|
|
|
|
| 17 |
dataset = load_dataset("MohamedRashad/rasaif-translations", split="train")["arabic"]
|
| 18 |
|
| 19 |
dataframe_path = Path(__file__).parent / "arabic_tokenizers_leaderboard.jsonl"
|
| 20 |
+
|
| 21 |
if dataframe_path.exists():
|
| 22 |
df = pd.read_json(dataframe_path, lines=True)
|
| 23 |
else:
|
|
|
|
| 162 |
|
| 163 |
return gr.HighlightedText(output, color_map)
|
| 164 |
|
| 165 |
+
def refresh():
    """Reload the leaderboard JSONL from disk and rebuild the UI components.

    Re-reads ``dataframe_path`` into the module-level ``df`` and returns fresh
    Gradio components so the table, bar plot, and model dropdown all reflect
    the current on-disk leaderboard data.

    Returns:
        tuple: (gr.Dataframe, gr.BarPlot, gr.Dropdown) built from the
        reloaded dataframe; the dropdown choices come from the
        "📛 Models" column.
    """
    global df
    reloaded = pd.read_json(dataframe_path, lines=True)
    df = reloaded
    model_names = reloaded["📛 Models"].tolist()
    return (
        gr.Dataframe(reloaded),
        gr.BarPlot(reloaded),
        gr.Dropdown(choices=model_names),
    )
| 169 |
+
|
| 170 |
# User-facing markdown shown with the leaderboard tab; explains how the
# "Total Number of Tokens" metric is computed and why the dataset was chosen.
# Fix: "consentrated" -> "concentrated" (typo in displayed text).
leaderboard_description = """The `Total Number of Tokens` in this leaderboard is based on the total number of tokens summed on the Arabic section of [rasaif-translations](https://huggingface.co/datasets/MohamedRashad/rasaif-translations) dataset.
This dataset was chosen because it represents Arabic Fusha text in a small and concentrated manner.

A tokenizer that scores high in this leaderboard will be efficient in parsing Arabic in its different dialects and forms.
"""
|
| 175 |
|
|
|
|
| 195 |
model_name = gr.Textbox(
|
| 196 |
label="Model Name from Hugging Face (e.g. Xenova/gpt-4o)"
|
| 197 |
)
|
| 198 |
+
with gr.Row():
|
| 199 |
+
submit_new_model_btn = gr.Button(value="Submit New Model", variant="primary", scale=3)
|
| 200 |
+
refresh_btn = gr.Button(value="Refresh", variant="secondary", scale=1)
|
| 201 |
with gr.Tab(label="Try tokenizers"):
|
| 202 |
text = gr.Textbox(label="Enter a text", lines=5, value="السلام عليكم ورحمة الله", rtl=True, text_align="right")
|
| 203 |
dropdown = gr.Dropdown(
|
|
|
|
| 211 |
tokenized_textbox = gr.HighlightedText(label="Tokenized text")
|
| 212 |
|
| 213 |
submit_new_model_btn.click(submit, model_name, outputs=[dataframe, barplot, dropdown])
|
| 214 |
+
refresh_btn.click(refresh, outputs=[dataframe, barplot, dropdown])
|
| 215 |
submit_text_btn.click(tokenize_text, inputs=[text, dropdown, checkbox], outputs=[tokenized_textbox])
|
| 216 |
|
| 217 |
|