Spaces:
Sleeping
Sleeping
shyuli
committed on
Commit
·
d1f8ae7
1
Parent(s):
237c85f
version v0.1
Browse files
eval-results/demo-leaderboard/gpt2-demo/results_2023-11-21T18-10-08.json
DELETED
|
@@ -1,15 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"config": {
|
| 3 |
-
"model_dtype": "torch.float16",
|
| 4 |
-
"model_name": "demo-leaderboard/gpt2-demo",
|
| 5 |
-
"model_sha": "ac3299b02780836378b9e1e68c6eead546e89f90"
|
| 6 |
-
},
|
| 7 |
-
"results": {
|
| 8 |
-
"anli_r1": {
|
| 9 |
-
"acc": 0
|
| 10 |
-
},
|
| 11 |
-
"logiqa": {
|
| 12 |
-
"acc_norm": 0.90
|
| 13 |
-
}
|
| 14 |
-
}
|
| 15 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval-results/demo-leaderboard/gpt2-demo/results_2023-11-22 15:46:20.425378.json
DELETED
|
@@ -1,33 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"results": {
|
| 3 |
-
"anli_r1": {
|
| 4 |
-
"acc": 0.4,
|
| 5 |
-
"acc_stderr": 0.11239029738980327
|
| 6 |
-
},
|
| 7 |
-
"logiqa": {
|
| 8 |
-
"acc": 0.35,
|
| 9 |
-
"acc_stderr": 0.10942433098048308,
|
| 10 |
-
"acc_norm": 0.3,
|
| 11 |
-
"acc_norm_stderr": 0.10513149660756933
|
| 12 |
-
}
|
| 13 |
-
},
|
| 14 |
-
"versions": {
|
| 15 |
-
"anli_r1": 0,
|
| 16 |
-
"logiqa": 0
|
| 17 |
-
},
|
| 18 |
-
"config": {
|
| 19 |
-
"model": "hf-causal-experimental",
|
| 20 |
-
"model_args": "pretrained=demo-leaderboard/gpt2-demo,revision=main,dtype=bfloat16",
|
| 21 |
-
"num_fewshot": 0,
|
| 22 |
-
"batch_size": 1,
|
| 23 |
-
"batch_sizes": [],
|
| 24 |
-
"device": "cpu",
|
| 25 |
-
"no_cache": true,
|
| 26 |
-
"limit": 20,
|
| 27 |
-
"bootstrap_iters": 100000,
|
| 28 |
-
"description_dict": null,
|
| 29 |
-
"model_dtype": "bfloat16",
|
| 30 |
-
"model_name": "demo-leaderboard/gpt2-demo",
|
| 31 |
-
"model_sha": "main"
|
| 32 |
-
}
|
| 33 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/about.py
CHANGED
|
@@ -44,14 +44,14 @@ We evaluate on a comprehensive set of benchmarks that test different aspects of
|
|
| 44 |
|
| 45 |
### General QA
|
| 46 |
- **NQ**: Natural Questions - QA based on real Google search queries from Wikipedia
|
| 47 |
-
- **TriviaQA**:
|
| 48 |
-
- **PopQA**:
|
| 49 |
|
| 50 |
### Multi-Hop QA
|
| 51 |
-
- **HotpotQA**:
|
| 52 |
-
- **
|
| 53 |
-
- **Musique**:
|
| 54 |
-
- **Bamboogle**:
|
| 55 |
|
| 56 |
### Novel Evaluation: FictionalHot
|
| 57 |
- **FictionalHot**: A closed-world benchmark grounding questions in synthetic fictional entities to mitigate data contamination and enable reproducible evaluation. Questions are transformed from real-world scenarios to fictional ones while preserving reasoning structure.
|
|
|
|
| 44 |
|
| 45 |
### General QA
|
| 46 |
- **NQ**: Natural Questions - QA based on real Google search queries from Wikipedia
|
| 47 |
+
- **TriviaQA**: A large-scale dataset with questions from trivia websites and competitions, featuring complex entity relationships.
|
| 48 |
+
- **PopQA**: A large-scale open-domain, entity-centric QA dataset (14k QA pairs), with questions generated by templating Wikidata knowledge tuples.
|
| 49 |
|
| 50 |
### Multi-Hop QA
|
| 51 |
+
- **HotpotQA**: The first large-scale dataset requiring reasoning across multiple Wikipedia paragraphs.
|
| 52 |
+
- **2Wiki**: A multi-hop QA dataset with explicit, annotated reasoning paths.
|
| 53 |
+
- **Musique**: A multi-hop QA benchmark of 2–4-hop questions constructed from five single-hop datasets.
|
| 54 |
+
- **Bamboogle**: A complex, cross-domain question set curated from queries that Google answers incorrectly to evaluate models’ compositional reasoning.
|
| 55 |
|
| 56 |
### Novel Evaluation: FictionalHot
|
| 57 |
- **FictionalHot**: A closed-world benchmark grounding questions in synthetic fictional entities to mitigate data contamination and enable reproducible evaluation. Questions are transformed from real-world scenarios to fictional ones while preserving reasoning structure.
|
src/display/formatting.py
CHANGED
|
@@ -2,11 +2,56 @@ def model_hyperlink(link, model_name):
|
|
| 2 |
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
| 3 |
|
| 4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
def make_clickable_model(model_name):
|
| 6 |
-
# Custom link mappings for each model
|
| 7 |
custom_links = {
|
| 8 |
-
"ReSeek-Qwen2.5-7b-Instruct": "https://
|
| 9 |
-
"ReSeek-Qwen2.5-3b-Instruct": "https://
|
| 10 |
"ZeroSearch-Qwen2.5-3b-Instruct": "https://huggingface.co/Alibaba-NLP/ZeroSearch_wiki_V2_Qwen2.5_3B_Instruct",
|
| 11 |
"ZeroSearch-Qwen2.5-7b-Instruct": "https://huggingface.co/Alibaba-NLP/ZeroSearch_wiki_V2_Qwen2.5_7B_Instruct",
|
| 12 |
"Search-R1-Qwen2.5-7b-Instruct": "https://huggingface.co/PeterJinGo/SearchR1-nq_hotpotqa_train-qwen2.5-7b-it-em-ppo",
|
|
@@ -19,12 +64,15 @@ def make_clickable_model(model_name):
|
|
| 19 |
"Direct-Inference-Qwen2.5-7b-Instruct": "",
|
| 20 |
}
|
| 21 |
|
|
|
|
|
|
|
|
|
|
| 22 |
if model_name in custom_links:
|
| 23 |
link = custom_links[model_name]
|
| 24 |
-
return model_hyperlink(link,
|
| 25 |
else:
|
| 26 |
-
# If no custom link, just return the
|
| 27 |
-
return
|
| 28 |
|
| 29 |
|
| 30 |
def styled_error(error):
|
|
|
|
| 2 |
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
| 3 |
|
| 4 |
|
| 5 |
+
def extract_method_name(full_model_name):
    """Extract method name with size from full model name (e.g., 'ReSeek-Qwen2.5-7b-Instruct' -> 'ReSeek-7b')."""
    tokens = full_model_name.split("-")
    prefix = []
    size_token = None

    for idx, token in enumerate(tokens):
        # A base-model family indicator terminates the method-name prefix.
        if token.lower() in ("qwen2.5", "qwen2", "qwen", "llama", "mistral", "phi"):
            # Scan the remaining tokens for a short size tag such as "7b" or "3b".
            size_token = next(
                (t.lower() for t in tokens[idx:] if t.lower().endswith("b") and len(t) <= 3),
                None,
            )
            break
        prefix.append(token)

    # No prefix collected (name starts with the base model, or is empty):
    # fall back to the untouched input.
    if not prefix:
        return full_model_name
    if size_token:
        return "-".join(prefix) + "-" + size_token
    return "-".join(prefix)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def extract_base_model(full_model_name):
    """Extract base model from full model name (e.g., 'ReSeek-Qwen2.5-7b-Instruct' -> 'Qwen2.5-7b-Instruct')."""
    indicators = {"qwen2.5", "qwen2", "qwen", "llama", "mistral", "phi"}
    tokens = full_model_name.split("-")

    # Index of the first token naming a known base-model family, if any.
    start = next(
        (i for i, tok in enumerate(tokens) if tok.lower() in indicators),
        None,
    )
    if start is None:
        return "Unknown"
    # Everything from the family indicator onwards is the base-model name.
    return "-".join(tokens[start:])
|
| 48 |
+
|
| 49 |
+
|
| 50 |
def make_clickable_model(model_name):
|
| 51 |
+
# Custom link mappings for each model (using full model name as key)
|
| 52 |
custom_links = {
|
| 53 |
+
"ReSeek-Qwen2.5-7b-Instruct": "https://huggingface.co/TencentBAC/ReSeek-qwen2.5-3b-em-grpo",
|
| 54 |
+
"ReSeek-Qwen2.5-3b-Instruct": "https://huggingface.co/TencentBAC/ReSeek-qwen2.5-7b-em-grpo",
|
| 55 |
"ZeroSearch-Qwen2.5-3b-Instruct": "https://huggingface.co/Alibaba-NLP/ZeroSearch_wiki_V2_Qwen2.5_3B_Instruct",
|
| 56 |
"ZeroSearch-Qwen2.5-7b-Instruct": "https://huggingface.co/Alibaba-NLP/ZeroSearch_wiki_V2_Qwen2.5_7B_Instruct",
|
| 57 |
"Search-R1-Qwen2.5-7b-Instruct": "https://huggingface.co/PeterJinGo/SearchR1-nq_hotpotqa_train-qwen2.5-7b-it-em-ppo",
|
|
|
|
| 64 |
"Direct-Inference-Qwen2.5-7b-Instruct": "",
|
| 65 |
}
|
| 66 |
|
| 67 |
+
# Extract just the method name (without base model)
|
| 68 |
+
method_name = extract_method_name(model_name)
|
| 69 |
+
|
| 70 |
if model_name in custom_links:
|
| 71 |
link = custom_links[model_name]
|
| 72 |
+
return model_hyperlink(link, method_name)
|
| 73 |
else:
|
| 74 |
+
# If no custom link, just return the method name
|
| 75 |
+
return method_name
|
| 76 |
|
| 77 |
|
| 78 |
def styled_error(error):
|
src/display/utils.py
CHANGED
|
@@ -27,9 +27,11 @@ class ColumnContent:
|
|
| 27 |
class AutoEvalColumn:
|
| 28 |
rank = ColumnContent("Rank", "number", True, never_hidden=True)
|
| 29 |
model = ColumnContent("Model", "markdown", True, never_hidden=True)
|
|
|
|
| 30 |
average = ColumnContent("Average Score", "number", True)
|
| 31 |
model_size = ColumnContent("Model Size", "str", True)
|
| 32 |
|
|
|
|
| 33 |
# Add task columns dynamically
|
| 34 |
for task in Tasks:
|
| 35 |
setattr(AutoEvalColumn, task.name, ColumnContent(task.value.col_name, "number", True))
|
|
|
|
| 27 |
class AutoEvalColumn:
|
| 28 |
rank = ColumnContent("Rank", "number", True, never_hidden=True)
|
| 29 |
model = ColumnContent("Model", "markdown", True, never_hidden=True)
|
| 30 |
+
base_model = ColumnContent("Base Model", "str", True)
|
| 31 |
average = ColumnContent("Average Score", "number", True)
|
| 32 |
model_size = ColumnContent("Model Size", "str", True)
|
| 33 |
|
| 34 |
+
|
| 35 |
# Add task columns dynamically
|
| 36 |
for task in Tasks:
|
| 37 |
setattr(AutoEvalColumn, task.name, ColumnContent(task.value.col_name, "number", True))
|
src/leaderboard/read_evals.py
CHANGED
|
@@ -7,7 +7,7 @@ from dataclasses import dataclass
|
|
| 7 |
import dateutil
|
| 8 |
import numpy as np
|
| 9 |
|
| 10 |
-
from src.display.formatting import make_clickable_model
|
| 11 |
from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
|
| 12 |
|
| 13 |
|
|
@@ -125,10 +125,14 @@ class EvalResult:
|
|
| 125 |
elif "7b" in self.full_model.lower():
|
| 126 |
model_size = "7B"
|
| 127 |
|
|
|
|
|
|
|
|
|
|
| 128 |
data_dict = {
|
| 129 |
"eval_name": self.eval_name, # not a column, just a save name,
|
| 130 |
AutoEvalColumn.rank.name: 0, # Will be set later based on average ranking
|
| 131 |
AutoEvalColumn.model.name: make_clickable_model(self.full_model),
|
|
|
|
| 132 |
AutoEvalColumn.model_size.name: model_size,
|
| 133 |
AutoEvalColumn.average.name: average,
|
| 134 |
}
|
|
|
|
| 7 |
import dateutil
|
| 8 |
import numpy as np
|
| 9 |
|
| 10 |
+
from src.display.formatting import make_clickable_model, extract_base_model
|
| 11 |
from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
|
| 12 |
|
| 13 |
|
|
|
|
| 125 |
elif "7b" in self.full_model.lower():
|
| 126 |
model_size = "7B"
|
| 127 |
|
| 128 |
+
# Extract base model
|
| 129 |
+
base_model = extract_base_model(self.full_model)
|
| 130 |
+
|
| 131 |
data_dict = {
|
| 132 |
"eval_name": self.eval_name, # not a column, just a save name,
|
| 133 |
AutoEvalColumn.rank.name: 0, # Will be set later based on average ranking
|
| 134 |
AutoEvalColumn.model.name: make_clickable_model(self.full_model),
|
| 135 |
+
AutoEvalColumn.base_model.name: base_model,
|
| 136 |
AutoEvalColumn.model_size.name: model_size,
|
| 137 |
AutoEvalColumn.average.name: average,
|
| 138 |
}
|