Commit · 76e4363
1 Parent(s): 3e57038
feat: Change layout, fix task order, fix colours for models, fix range
Browse files
- .gitignore +1 -0
- app.py +23 -17
.gitignore
CHANGED
|
@@ -1 +1,2 @@
|
|
| 1 |
.venv
|
|
|
|
|
|
| 1 |
.venv
|
| 2 |
+
__pycache__
|
app.py
CHANGED
|
@@ -159,14 +159,15 @@ def main() -> None:
|
|
| 159 |
"of different language models on different tasks. It is based on the "
|
| 160 |
"generative results from the [ScandEval benchmark](https://scandeval.com)."
|
| 161 |
)
|
| 162 |
-
with gr.Row():
|
| 163 |
-
with gr.Column():
|
| 164 |
language_names_dropdown = gr.Dropdown(
|
| 165 |
choices=all_languages,
|
| 166 |
multiselect=True,
|
| 167 |
label="Languages",
|
| 168 |
value=["Danish"],
|
| 169 |
interactive=True,
|
|
|
|
| 170 |
)
|
| 171 |
model_ids_dropdown = gr.Dropdown(
|
| 172 |
choices=danish_models,
|
|
@@ -174,17 +175,15 @@ def main() -> None:
|
|
| 174 |
label="Models",
|
| 175 |
value=["gpt-4-0613", "mistralai/Mistral-7B-v0.1"],
|
| 176 |
interactive=True,
|
|
|
|
| 177 |
)
|
| 178 |
use_win_ratio_checkbox = gr.Checkbox(
|
| 179 |
label="Compare models with win ratios (as opposed to raw scores)",
|
| 180 |
value=True,
|
| 181 |
interactive=True,
|
|
|
|
| 182 |
)
|
| 183 |
-
|
| 184 |
-
"<center>Made with ❤️ by the <a href=\"https://alexandra.dk\">"
|
| 185 |
-
"Alexandra Institute</a>.</center>"
|
| 186 |
-
)
|
| 187 |
-
with gr.Column():
|
| 188 |
plot = gr.Plot(
|
| 189 |
value=produce_radial_plot(
|
| 190 |
model_ids_dropdown.value,
|
|
@@ -193,6 +192,11 @@ def main() -> None:
|
|
| 193 |
results_dfs=results_dfs,
|
| 194 |
),
|
| 195 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
|
| 197 |
language_names_dropdown.change(
|
| 198 |
fn=partial(update_model_ids_dropdown, results_dfs=results_dfs),
|
|
@@ -371,7 +375,7 @@ def produce_radial_plot(
|
|
| 371 |
if model_id not in results_dfs_filtered[language].index:
|
| 372 |
continue
|
| 373 |
score = results_dfs_filtered[language].loc[model_id][task]
|
| 374 |
-
win_ratio = np.mean([
|
| 375 |
score >= other_score
|
| 376 |
for other_score in results_dfs_filtered[language][task].dropna()
|
| 377 |
])
|
|
@@ -383,22 +387,21 @@ def produce_radial_plot(
|
|
| 383 |
result_list.append(np.mean(scores))
|
| 384 |
results.append(result_list)
|
| 385 |
|
| 386 |
-
# Sort the results to avoid misleading radial plots
|
| 387 |
-
model_idx_with_highest_variance = np.argmax(
|
| 388 |
-
[np.std(result_list) for result_list in results]
|
| 389 |
-
)
|
| 390 |
-
sorted_idxs = np.argsort(results[model_idx_with_highest_variance])
|
| 391 |
-
results = [np.asarray(result_list)[sorted_idxs] for result_list in results]
|
| 392 |
-
tasks = np.asarray(tasks)[sorted_idxs]
|
| 393 |
-
|
| 394 |
# Add the results to a plotly figure
|
| 395 |
fig = go.Figure()
|
| 396 |
for model_id, result_list in zip(model_ids, results):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 397 |
fig.add_trace(go.Scatterpolar(
|
| 398 |
r=result_list,
|
| 399 |
theta=[task.name for task in tasks],
|
| 400 |
fill='toself',
|
| 401 |
name=model_id,
|
|
|
|
| 402 |
))
|
| 403 |
|
| 404 |
languages_str = ""
|
|
@@ -414,7 +417,10 @@ def produce_radial_plot(
|
|
| 414 |
|
| 415 |
# Builds the radial plot from the results
|
| 416 |
fig.update_layout(
|
| 417 |
-
polar=dict(radialaxis=dict(visible=True
|
|
|
|
|
|
|
|
|
|
| 418 |
)
|
| 419 |
|
| 420 |
logger.info("Successfully produced radial plot.")
|
|
|
|
| 159 |
"of different language models on different tasks. It is based on the "
|
| 160 |
"generative results from the [ScandEval benchmark](https://scandeval.com)."
|
| 161 |
)
|
| 162 |
+
with gr.Column():
|
| 163 |
+
with gr.Row():
|
| 164 |
language_names_dropdown = gr.Dropdown(
|
| 165 |
choices=all_languages,
|
| 166 |
multiselect=True,
|
| 167 |
label="Languages",
|
| 168 |
value=["Danish"],
|
| 169 |
interactive=True,
|
| 170 |
+
scale=2,
|
| 171 |
)
|
| 172 |
model_ids_dropdown = gr.Dropdown(
|
| 173 |
choices=danish_models,
|
|
|
|
| 175 |
label="Models",
|
| 176 |
value=["gpt-4-0613", "mistralai/Mistral-7B-v0.1"],
|
| 177 |
interactive=True,
|
| 178 |
+
scale=2,
|
| 179 |
)
|
| 180 |
use_win_ratio_checkbox = gr.Checkbox(
|
| 181 |
label="Compare models with win ratios (as opposed to raw scores)",
|
| 182 |
value=True,
|
| 183 |
interactive=True,
|
| 184 |
+
scale=1,
|
| 185 |
)
|
| 186 |
+
with gr.Row():
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
plot = gr.Plot(
|
| 188 |
value=produce_radial_plot(
|
| 189 |
model_ids_dropdown.value,
|
|
|
|
| 192 |
results_dfs=results_dfs,
|
| 193 |
),
|
| 194 |
)
|
| 195 |
+
with gr.Row():
|
| 196 |
+
gr.Markdown(
|
| 197 |
+
"<center>Made with ❤️ by the <a href=\"https://alexandra.dk\">"
|
| 198 |
+
"Alexandra Institute</a>.</center>"
|
| 199 |
+
)
|
| 200 |
|
| 201 |
language_names_dropdown.change(
|
| 202 |
fn=partial(update_model_ids_dropdown, results_dfs=results_dfs),
|
|
|
|
| 375 |
if model_id not in results_dfs_filtered[language].index:
|
| 376 |
continue
|
| 377 |
score = results_dfs_filtered[language].loc[model_id][task]
|
| 378 |
+
win_ratio = 100 * np.mean([
|
| 379 |
score >= other_score
|
| 380 |
for other_score in results_dfs_filtered[language][task].dropna()
|
| 381 |
])
|
|
|
|
| 387 |
result_list.append(np.mean(scores))
|
| 388 |
results.append(result_list)
|
| 389 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 390 |
# Add the results to a plotly figure
|
| 391 |
fig = go.Figure()
|
| 392 |
for model_id, result_list in zip(model_ids, results):
|
| 393 |
+
|
| 394 |
+
# Generate colour for model, as an RGB triplet. The same model will always
|
| 395 |
+
# have the same colour
|
| 396 |
+
random.seed(model_id)
|
| 397 |
+
r, g, b = tuple(random.randint(0, 255) for _ in range(3))
|
| 398 |
+
|
| 399 |
fig.add_trace(go.Scatterpolar(
|
| 400 |
r=result_list,
|
| 401 |
theta=[task.name for task in tasks],
|
| 402 |
fill='toself',
|
| 403 |
name=model_id,
|
| 404 |
+
line=dict(color=f'rgb({r}, {g}, {b})'),
|
| 405 |
))
|
| 406 |
|
| 407 |
languages_str = ""
|
|
|
|
| 417 |
|
| 418 |
# Builds the radial plot from the results
|
| 419 |
fig.update_layout(
|
| 420 |
+
polar=dict(radialaxis=dict(visible=True, range=[0, 100])),
|
| 421 |
+
showlegend=True,
|
| 422 |
+
title=title,
|
| 423 |
+
width=800,
|
| 424 |
)
|
| 425 |
|
| 426 |
logger.info("Successfully produced radial plot.")
|