Spaces:
Running
Running
Leaderboard, decimal places
Browse files
app.py
CHANGED
|
@@ -28,11 +28,9 @@ from common import (
|
|
| 28 |
)
|
| 29 |
from example_metrics import EXAMPLE_METRICS
|
| 30 |
|
| 31 |
-
import hashlib
|
| 32 |
-
|
| 33 |
|
| 34 |
# Model and ELO score data
|
| 35 |
-
DEFAULT_ELO =
|
| 36 |
K_FACTOR = 32 # Standard chess K-factor, adjust as needed
|
| 37 |
elo_scores = defaultdict(lambda: DEFAULT_ELO)
|
| 38 |
vote_counts = defaultdict(int)
|
|
@@ -210,7 +208,7 @@ def get_current_votes():
|
|
| 210 |
return get_votes(db)
|
| 211 |
|
| 212 |
|
| 213 |
-
def get_leaderboard():
|
| 214 |
"""Generate leaderboard data using fresh votes from MongoDB."""
|
| 215 |
# Get fresh voting data
|
| 216 |
voting_data = get_current_votes()
|
|
@@ -263,12 +261,16 @@ def get_leaderboard():
|
|
| 263 |
leaderboard = []
|
| 264 |
for model in model_data.keys():
|
| 265 |
votes = matches[model]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 266 |
elo = ratings[model]
|
| 267 |
ci = 1.96 * (400 / (votes + 1) ** 0.5) if votes > 0 else 0
|
| 268 |
data = {
|
| 269 |
"Model": model,
|
| 270 |
-
"ELO Score": f"{elo
|
| 271 |
-
"95% CI": f"±{ci
|
| 272 |
"# Votes": votes,
|
| 273 |
"Organization": model_data[model]["organization"],
|
| 274 |
"License": model_data[model]["license"],
|
|
@@ -532,12 +534,52 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
| 532 |
gr.Markdown(ACKNOWLEDGEMENTS)
|
| 533 |
|
| 534 |
with gr.TabItem("Leaderboard"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 535 |
stats_display = gr.Markdown()
|
| 536 |
leaderboard_table = gr.Dataframe(
|
| 537 |
headers=["Model", "ELO", "95% CI", "Matches", "Organization", "License"],
|
| 538 |
datatype=["str", "number", "str", "number", "str", "str", "str"],
|
| 539 |
)
|
| 540 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 541 |
with gr.TabItem("Policy"):
|
| 542 |
gr.Markdown(POLICY_CONTENT)
|
| 543 |
|
|
@@ -758,29 +800,6 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
| 758 |
outputs=[send_btn, regenerate_button],
|
| 759 |
)
|
| 760 |
|
| 761 |
-
# Update the leaderboard
|
| 762 |
-
def refresh_leaderboard():
|
| 763 |
-
"""Refresh the leaderboard data and stats."""
|
| 764 |
-
leaderboard = get_leaderboard()
|
| 765 |
-
data = [
|
| 766 |
-
[
|
| 767 |
-
entry["Model"],
|
| 768 |
-
float(entry["ELO Score"]),
|
| 769 |
-
entry["95% CI"],
|
| 770 |
-
entry["# Votes"],
|
| 771 |
-
entry["Organization"],
|
| 772 |
-
entry["License"],
|
| 773 |
-
]
|
| 774 |
-
for entry in leaderboard
|
| 775 |
-
]
|
| 776 |
-
stats = get_leaderboard_stats()
|
| 777 |
-
return [gr.update(value=data), gr.update(value=stats)]
|
| 778 |
-
|
| 779 |
-
# Add the load event at the very end, just before demo.launch()
|
| 780 |
-
demo.load(
|
| 781 |
-
fn=refresh_leaderboard, inputs=None, outputs=[leaderboard_table, stats_display]
|
| 782 |
-
)
|
| 783 |
-
|
| 784 |
# Add click handlers for metric buttons
|
| 785 |
outputs_list = [eval_prompt] + [var_input for _, var_input in variable_rows]
|
| 786 |
|
|
|
|
| 28 |
)
|
| 29 |
from example_metrics import EXAMPLE_METRICS
|
| 30 |
|
|
|
|
|
|
|
| 31 |
|
| 32 |
# Model and ELO score data
|
| 33 |
+
DEFAULT_ELO = 1200 # Starting ELO for new models
|
| 34 |
K_FACTOR = 32 # Standard chess K-factor, adjust as needed
|
| 35 |
elo_scores = defaultdict(lambda: DEFAULT_ELO)
|
| 36 |
vote_counts = defaultdict(int)
|
|
|
|
| 208 |
return get_votes(db)
|
| 209 |
|
| 210 |
|
| 211 |
+
def get_leaderboard(show_preliminary=True):
|
| 212 |
"""Generate leaderboard data using fresh votes from MongoDB."""
|
| 213 |
# Get fresh voting data
|
| 214 |
voting_data = get_current_votes()
|
|
|
|
| 261 |
leaderboard = []
|
| 262 |
for model in model_data.keys():
|
| 263 |
votes = matches[model]
|
| 264 |
+
# Skip models with < 500 votes if show_preliminary is False
|
| 265 |
+
if not show_preliminary and votes < 500:
|
| 266 |
+
continue
|
| 267 |
+
|
| 268 |
elo = ratings[model]
|
| 269 |
ci = 1.96 * (400 / (votes + 1) ** 0.5) if votes > 0 else 0
|
| 270 |
data = {
|
| 271 |
"Model": model,
|
| 272 |
+
"ELO Score": f"{int(elo)}",
|
| 273 |
+
"95% CI": f"±{int(ci)}",
|
| 274 |
"# Votes": votes,
|
| 275 |
"Organization": model_data[model]["organization"],
|
| 276 |
"License": model_data[model]["license"],
|
|
|
|
| 534 |
gr.Markdown(ACKNOWLEDGEMENTS)
|
| 535 |
|
| 536 |
with gr.TabItem("Leaderboard"):
|
| 537 |
+
with gr.Row():
|
| 538 |
+
with gr.Column(scale=1):
|
| 539 |
+
show_preliminary = gr.Checkbox(
|
| 540 |
+
label="Reveal preliminary results",
|
| 541 |
+
value=True, # Checked by default
|
| 542 |
+
info="Show all models, including models with fewer human ratings (< 500 votes)",
|
| 543 |
+
interactive=True
|
| 544 |
+
)
|
| 545 |
stats_display = gr.Markdown()
|
| 546 |
leaderboard_table = gr.Dataframe(
|
| 547 |
headers=["Model", "ELO", "95% CI", "Matches", "Organization", "License"],
|
| 548 |
datatype=["str", "number", "str", "number", "str", "str", "str"],
|
| 549 |
)
|
| 550 |
|
| 551 |
+
# Update refresh_leaderboard to use the checkbox value
|
| 552 |
+
def refresh_leaderboard(show_preliminary):
|
| 553 |
+
"""Refresh the leaderboard data and stats."""
|
| 554 |
+
leaderboard = get_leaderboard(show_preliminary)
|
| 555 |
+
data = [
|
| 556 |
+
[
|
| 557 |
+
entry["Model"],
|
| 558 |
+
float(entry["ELO Score"]),
|
| 559 |
+
entry["95% CI"],
|
| 560 |
+
entry["# Votes"],
|
| 561 |
+
entry["Organization"],
|
| 562 |
+
entry["License"],
|
| 563 |
+
]
|
| 564 |
+
for entry in leaderboard
|
| 565 |
+
]
|
| 566 |
+
stats = get_leaderboard_stats()
|
| 567 |
+
return [gr.update(value=data), gr.update(value=stats)]
|
| 568 |
+
|
| 569 |
+
# Add change handler for checkbox
|
| 570 |
+
show_preliminary.change(
|
| 571 |
+
fn=refresh_leaderboard,
|
| 572 |
+
inputs=[show_preliminary],
|
| 573 |
+
outputs=[leaderboard_table, stats_display]
|
| 574 |
+
)
|
| 575 |
+
|
| 576 |
+
# Update the load event
|
| 577 |
+
demo.load(
|
| 578 |
+
fn=refresh_leaderboard,
|
| 579 |
+
inputs=[show_preliminary],
|
| 580 |
+
outputs=[leaderboard_table, stats_display]
|
| 581 |
+
)
|
| 582 |
+
|
| 583 |
with gr.TabItem("Policy"):
|
| 584 |
gr.Markdown(POLICY_CONTENT)
|
| 585 |
|
|
|
|
| 800 |
outputs=[send_btn, regenerate_button],
|
| 801 |
)
|
| 802 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 803 |
# Add click handlers for metric buttons
|
| 804 |
outputs_list = [eval_prompt] + [var_input for _, var_input in variable_rows]
|
| 805 |
|