Spaces:
Runtime error
Runtime error
Commit
·
9b8ac8f
1
Parent(s):
bcda822
add app file
Browse files
- app.py +3 -9
- experiments.json +94 -5
app.py
CHANGED
|
@@ -5,19 +5,13 @@ import gradio as gr
|
|
| 5 |
from functools import lru_cache
|
| 6 |
|
| 7 |
# Load models and experiments
|
| 8 |
-
MODELS = [
|
| 9 |
-
"deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
|
| 10 |
-
"o3-mini-2025-01-31",
|
| 11 |
-
"meta-llama/Llama-3.3-70B-Instruct",
|
| 12 |
-
"moonshotai/Moonlight-16B-A3B-Instruct",
|
| 13 |
-
"gpt-4o",
|
| 14 |
-
"claude-3-7-sonnet-20250219",
|
| 15 |
-
"openai/gpt-4.5-preview-2025-02-27"
|
| 16 |
-
]
|
| 17 |
|
| 18 |
with open("experiments.json") as f:
|
| 19 |
experiments = json.load(f)
|
| 20 |
|
|
|
|
|
|
|
|
|
|
| 21 |
@lru_cache
|
| 22 |
def load_details_and_results(model, benchmark, experiment_tag):
|
| 23 |
def worker(example):
|
|
|
|
| 5 |
from functools import lru_cache
|
| 6 |
|
| 7 |
# Load models and experiments
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
with open("experiments.json") as f:
|
| 10 |
experiments = json.load(f)
|
| 11 |
|
| 12 |
+
MODELS = list(experiments.keys())
|
| 13 |
+
MODELS = [m for m in MODELS if m != "claude-3-7-sonnet-20250219"]
|
| 14 |
+
|
| 15 |
@lru_cache
|
| 16 |
def load_details_and_results(model, benchmark, experiment_tag):
|
| 17 |
def worker(example):
|
experiments.json
CHANGED
|
@@ -62,7 +62,8 @@
|
|
| 62 |
"extractive_match"
|
| 63 |
],
|
| 64 |
"tags": {
|
| 65 |
-
"
|
|
|
|
| 66 |
}
|
| 67 |
},
|
| 68 |
"gpqa_diamond": {
|
|
@@ -71,7 +72,8 @@
|
|
| 71 |
"extractive_match"
|
| 72 |
],
|
| 73 |
"tags": {
|
| 74 |
-
"
|
|
|
|
| 75 |
}
|
| 76 |
},
|
| 77 |
"aime_24": {
|
|
@@ -80,7 +82,8 @@
|
|
| 80 |
"extractive_match"
|
| 81 |
],
|
| 82 |
"tags": {
|
| 83 |
-
"
|
|
|
|
| 84 |
}
|
| 85 |
},
|
| 86 |
"aime_25": {
|
|
@@ -89,7 +92,8 @@
|
|
| 89 |
"extractive_match"
|
| 90 |
],
|
| 91 |
"tags": {
|
| 92 |
-
"
|
|
|
|
| 93 |
}
|
| 94 |
},
|
| 95 |
"ifeval": {
|
|
@@ -98,7 +102,8 @@
|
|
| 98 |
"prompt_level_strict_acc"
|
| 99 |
],
|
| 100 |
"tags": {
|
| 101 |
-
"
|
|
|
|
| 102 |
}
|
| 103 |
}
|
| 104 |
}
|
|
@@ -416,5 +421,89 @@
|
|
| 416 |
}
|
| 417 |
}
|
| 418 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 419 |
}
|
| 420 |
}
|
|
|
|
| 62 |
"extractive_match"
|
| 63 |
],
|
| 64 |
"tags": {
|
| 65 |
+
"default": "2025-02-25T14-35-15.137825",
|
| 66 |
+
"thinking": "2025-03-05T10-14-44.802711"
|
| 67 |
}
|
| 68 |
},
|
| 69 |
"gpqa_diamond": {
|
|
|
|
| 72 |
"extractive_match"
|
| 73 |
],
|
| 74 |
"tags": {
|
| 75 |
+
"default": "2025-02-25T12-43-49.294245",
|
| 76 |
+
"thinking": "2025-03-05T15-37-37.180318"
|
| 77 |
}
|
| 78 |
},
|
| 79 |
"aime_24": {
|
|
|
|
| 82 |
"extractive_match"
|
| 83 |
],
|
| 84 |
"tags": {
|
| 85 |
+
"default": "2025-02-25T12-37-52.771787",
|
| 86 |
+
"thinking": "2025-03-05T12-39-13.627801"
|
| 87 |
}
|
| 88 |
},
|
| 89 |
"aime_25": {
|
|
|
|
| 92 |
"extractive_match"
|
| 93 |
],
|
| 94 |
"tags": {
|
| 95 |
+
"default": "2025-02-25T12-37-52.771787",
|
| 96 |
+
"thinking": "2025-03-05T12-39-13.627801"
|
| 97 |
}
|
| 98 |
},
|
| 99 |
"ifeval": {
|
|
|
|
| 102 |
"prompt_level_strict_acc"
|
| 103 |
],
|
| 104 |
"tags": {
|
| 105 |
+
"default": "2025-02-25T12-24-45.750753",
|
| 106 |
+
"thinking": "2025-03-05T15-37-37.180318"
|
| 107 |
}
|
| 108 |
}
|
| 109 |
}
|
|
|
|
| 421 |
}
|
| 422 |
}
|
| 423 |
}
|
| 424 |
+
},
|
| 425 |
+
"openai/deepseek-ai/DeepSeek-R1": {
|
| 426 |
+
"display_name": "DeepSeek R1",
|
| 427 |
+
"provider": "deepseek",
|
| 428 |
+
"open": true,
|
| 429 |
+
"benchmarks": {
|
| 430 |
+
"math_500": {
|
| 431 |
+
"subset": "lighteval|math_500|0",
|
| 432 |
+
"metrics": ["extractive_match"],
|
| 433 |
+
"tags": {
|
| 434 |
+
"latest": "2025-03-04T17-06-33.124766"
|
| 435 |
+
}
|
| 436 |
+
},
|
| 437 |
+
"gpqa_diamond": {
|
| 438 |
+
"subset": "lighteval|gpqa:diamond|0",
|
| 439 |
+
"metrics": ["extractive_match"],
|
| 440 |
+
"tags": {
|
| 441 |
+
"latest": "2025-03-04T17-06-33.124766"
|
| 442 |
+
}
|
| 443 |
+
},
|
| 444 |
+
"aime_24": {
|
| 445 |
+
"subset": "lighteval|aime24|0",
|
| 446 |
+
"metrics": ["extractive_match"],
|
| 447 |
+
"tags": {
|
| 448 |
+
"latest": "2025-03-04T14-52-35.594174"
|
| 449 |
+
}
|
| 450 |
+
},
|
| 451 |
+
"aime_25": {
|
| 452 |
+
"subset": "lighteval|aime25|0",
|
| 453 |
+
"metrics": ["extractive_match"],
|
| 454 |
+
"tags": {
|
| 455 |
+
"latest": "2025-03-04T14-25-05.009799"
|
| 456 |
+
}
|
| 457 |
+
},
|
| 458 |
+
"ifeval": {
|
| 459 |
+
"subset": "extended|ifeval|0",
|
| 460 |
+
"metrics": ["prompt_level_strict_acc"],
|
| 461 |
+
"tags": {
|
| 462 |
+
"latest": "2025-03-04T15-24-42.488745"
|
| 463 |
+
}
|
| 464 |
+
}
|
| 465 |
+
}
|
| 466 |
+
},
|
| 467 |
+
"Qwen/QwQ-32B": {
|
| 468 |
+
"display_name": "QwQ 32B",
|
| 469 |
+
"provider": "Qwen",
|
| 470 |
+
"open": true,
|
| 471 |
+
"benchmarks": {
|
| 472 |
+
"math_500": {
|
| 473 |
+
"subset": "lighteval|math_500|0",
|
| 474 |
+
"metrics": ["extractive_match"],
|
| 475 |
+
"tags": {
|
| 476 |
+
"latest": "2025-03-07T11-04-40.089127"
|
| 477 |
+
}
|
| 478 |
+
},
|
| 479 |
+
"gpqa_diamond": {
|
| 480 |
+
"subset": "lighteval|gpqa:diamond|0",
|
| 481 |
+
"metrics": ["extractive_match"],
|
| 482 |
+
"tags": {
|
| 483 |
+
"latest": "2025-03-07T11-04-40.089127"
|
| 484 |
+
}
|
| 485 |
+
},
|
| 486 |
+
"aime_24": {
|
| 487 |
+
"subset": "lighteval|aime24|0",
|
| 488 |
+
"metrics": ["extractive_match"],
|
| 489 |
+
"tags": {
|
| 490 |
+
"latest": "2025-03-10T10-36-07.886033"
|
| 491 |
+
}
|
| 492 |
+
},
|
| 493 |
+
"aime_25": {
|
| 494 |
+
"subset": "lighteval|aime25|0",
|
| 495 |
+
"metrics": ["extractive_match"],
|
| 496 |
+
"tags": {
|
| 497 |
+
"latest": "2025-03-10T10-36-07.886033"
|
| 498 |
+
}
|
| 499 |
+
},
|
| 500 |
+
"ifeval": {
|
| 501 |
+
"subset": "extended|ifeval|0",
|
| 502 |
+
"metrics": ["prompt_level_strict_acc"],
|
| 503 |
+
"tags": {
|
| 504 |
+
"latest": "2025-03-07T11-04-40.089127"
|
| 505 |
+
}
|
| 506 |
+
}
|
| 507 |
+
}
|
| 508 |
}
|
| 509 |
}
|