shyuli committed on
Commit d1f8ae7 · 1 Parent(s): 237c85f

version v0.1

eval-results/demo-leaderboard/gpt2-demo/results_2023-11-21T18-10-08.json DELETED
@@ -1,15 +0,0 @@
-{
-    "config": {
-        "model_dtype": "torch.float16",
-        "model_name": "demo-leaderboard/gpt2-demo",
-        "model_sha": "ac3299b02780836378b9e1e68c6eead546e89f90"
-    },
-    "results": {
-        "anli_r1": {
-            "acc": 0
-        },
-        "logiqa": {
-            "acc_norm": 0.90
-        }
-    }
-}
eval-results/demo-leaderboard/gpt2-demo/results_2023-11-22 15:46:20.425378.json DELETED
@@ -1,33 +0,0 @@
-{
-    "results": {
-        "anli_r1": {
-            "acc": 0.4,
-            "acc_stderr": 0.11239029738980327
-        },
-        "logiqa": {
-            "acc": 0.35,
-            "acc_stderr": 0.10942433098048308,
-            "acc_norm": 0.3,
-            "acc_norm_stderr": 0.10513149660756933
-        }
-    },
-    "versions": {
-        "anli_r1": 0,
-        "logiqa": 0
-    },
-    "config": {
-        "model": "hf-causal-experimental",
-        "model_args": "pretrained=demo-leaderboard/gpt2-demo,revision=main,dtype=bfloat16",
-        "num_fewshot": 0,
-        "batch_size": 1,
-        "batch_sizes": [],
-        "device": "cpu",
-        "no_cache": true,
-        "limit": 20,
-        "bootstrap_iters": 100000,
-        "description_dict": null,
-        "model_dtype": "bfloat16",
-        "model_name": "demo-leaderboard/gpt2-demo",
-        "model_sha": "main"
-    }
-}
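For context, both deleted files follow the leaderboard's eval-results schema: a `config` block identifying the model and a `results` block keyed by task. A minimal sketch of reading one back (the `load_eval_result` helper and the preference for `acc_norm` are illustrative assumptions, not part of this repo):

```python
import json

def load_eval_result(path: str) -> dict:
    """Hypothetical helper: flatten one results_*.json into {model, metrics}."""
    with open(path) as f:
        data = json.load(f)
    metrics = {}
    for task, scores in data["results"].items():
        # Prefer normalized accuracy when present (cf. the logiqa entries above)
        metrics[task] = scores.get("acc_norm", scores.get("acc"))
    return {"model": data["config"]["model_name"], "metrics": metrics}
```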
src/about.py CHANGED
@@ -44,14 +44,14 @@ We evaluate on a comprehensive set of benchmarks that test different aspects of
 
 ### General QA
 - **NQ**: Natural Questions - QA based on real Google search queries from Wikipedia
-- **TriviaQA**: Trivia questions requiring document-based answer extraction
-- **PopQA**: Popular culture QA testing knowledge breadth and parametric vs. non-parametric memory
+- **TriviaQA**: A large-scale dataset of questions from trivia websites and competitions, featuring complex entity relationships
+- **PopQA**: A large-scale open-domain, entity-centric QA dataset (14k QA pairs), with questions generated by templating Wikidata knowledge tuples
 
 ### Multi-Hop QA
-- **HotpotQA**: Complex QA requiring reasoning across multiple documents with explainable reasoning chains
-- **2wiki**: Multi-hop reasoning based on Wikipedia requiring compositional reasoning
-- **Musique**: Multi-step compositional reasoning QA via single-hop question composition
-- **Bamboogle**: Adversarial search QA designed to test compositionality gaps in language models
+- **HotpotQA**: The first large-scale dataset requiring reasoning across multiple Wikipedia paragraphs
+- **2Wiki**: A multi-hop QA dataset with explicit, annotated reasoning paths
+- **Musique**: A multi-hop QA benchmark of 2-4-hop questions constructed from five single-hop datasets
+- **Bamboogle**: A complex, cross-domain question set curated from queries that Google answers incorrectly, to evaluate models' compositional reasoning
 
 ### Novel Evaluation: FictionalHot
 - **FictionalHot**: A closed-world benchmark grounding questions in synthetic fictional entities to mitigate data contamination and enable reproducible evaluation. Questions are transformed from real-world scenarios to fictional ones while preserving reasoning structure.
src/display/formatting.py CHANGED
@@ -2,11 +2,56 @@ def model_hyperlink(link, model_name):
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
 
 
+def extract_method_name(full_model_name):
+    """Extract method name with size from full model name (e.g., 'ReSeek-Qwen2.5-7b-Instruct' -> 'ReSeek-7b')"""
+    # Split by '-' and extract method name + size
+    parts = full_model_name.split("-")
+    method_parts = []
+    size_part = None
+
+    for i, part in enumerate(parts):
+        # Check if this is a base model indicator
+        if part.lower() in ["qwen2.5", "qwen2", "qwen", "llama", "mistral", "phi"]:
+            # Look for the size part (like "7b", "3b") after the base model name
+            for j in range(i, len(parts)):
+                if parts[j].lower().endswith("b") and len(parts[j]) <= 3:
+                    size_part = parts[j].lower()
+                    break
+            break
+        method_parts.append(part)
+
+    if method_parts and size_part:
+        method_name = "-".join(method_parts) + "-" + size_part
+    elif method_parts:
+        method_name = "-".join(method_parts)
+    else:
+        method_name = full_model_name
+
+    return method_name
+
+
+def extract_base_model(full_model_name):
+    """Extract base model from full model name (e.g., 'ReSeek-Qwen2.5-7b-Instruct' -> 'Qwen2.5-7b-Instruct')"""
+    # Split by '-' and take parts from base model onwards
+    parts = full_model_name.split("-")
+    base_parts = []
+    found_base = False
+    for part in parts:
+        # Start collecting when we hit a base model indicator
+        if part.lower() in ["qwen2.5", "qwen2", "qwen", "llama", "mistral", "phi"]:
+            found_base = True
+        if found_base:
+            base_parts.append(part)
+
+    base_model = "-".join(base_parts) if base_parts else "Unknown"
+    return base_model
+
+
 def make_clickable_model(model_name):
-    # Custom link mappings for each model
+    # Custom link mappings for each model (using full model name as key)
     custom_links = {
-        "ReSeek-Qwen2.5-7b-Instruct": "https://your-custom-link.com/reseek-7b",
-        "ReSeek-Qwen2.5-3b-Instruct": "https://your-custom-link.com/reseek-3b",
+        "ReSeek-Qwen2.5-7b-Instruct": "https://huggingface.co/TencentBAC/ReSeek-qwen2.5-7b-em-grpo",
+        "ReSeek-Qwen2.5-3b-Instruct": "https://huggingface.co/TencentBAC/ReSeek-qwen2.5-3b-em-grpo",
         "ZeroSearch-Qwen2.5-3b-Instruct": "https://huggingface.co/Alibaba-NLP/ZeroSearch_wiki_V2_Qwen2.5_3B_Instruct",
         "ZeroSearch-Qwen2.5-7b-Instruct": "https://huggingface.co/Alibaba-NLP/ZeroSearch_wiki_V2_Qwen2.5_7B_Instruct",
         "Search-R1-Qwen2.5-7b-Instruct": "https://huggingface.co/PeterJinGo/SearchR1-nq_hotpotqa_train-qwen2.5-7b-it-em-ppo",
@@ -19,12 +64,15 @@ def make_clickable_model(model_name):
         "Direct-Inference-Qwen2.5-7b-Instruct": "",
     }
 
+    # Extract just the method name (without base model)
+    method_name = extract_method_name(model_name)
+
     if model_name in custom_links:
         link = custom_links[model_name]
-        return model_hyperlink(link, model_name)
+        return model_hyperlink(link, method_name)
     else:
-        # If no custom link, just return the model name
-        return model_name
+        # If no custom link, just return the method name
+        return method_name
 
 
 def styled_error(error):
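A quick sanity check of the two new helpers; the expected values below are derived by tracing the parsing logic above, not taken from the repository's tests:

```python
from src.display.formatting import extract_method_name, extract_base_model

# Method name keeps everything before the base-model indicator, plus the size
assert extract_method_name("ReSeek-Qwen2.5-7b-Instruct") == "ReSeek-7b"
assert extract_method_name("Direct-Inference-Qwen2.5-7b-Instruct") == "Direct-Inference-7b"

# Base model keeps everything from the indicator onwards
assert extract_base_model("ReSeek-Qwen2.5-7b-Instruct") == "Qwen2.5-7b-Instruct"

# Names without a recognized base model fall back to the full string / "Unknown"
assert extract_method_name("gpt2-demo") == "gpt2-demo"
assert extract_base_model("gpt2-demo") == "Unknown"
```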
src/display/utils.py CHANGED
@@ -27,9 +27,11 @@ class ColumnContent:
 class AutoEvalColumn:
     rank = ColumnContent("Rank", "number", True, never_hidden=True)
     model = ColumnContent("Model", "markdown", True, never_hidden=True)
+    base_model = ColumnContent("Base Model", "str", True)
     average = ColumnContent("Average Score", "number", True)
     model_size = ColumnContent("Model Size", "str", True)
 
+
 # Add task columns dynamically
 for task in Tasks:
     setattr(AutoEvalColumn, task.name, ColumnContent(task.value.col_name, "number", True))
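The `setattr` loop attaches one `ColumnContent` per task at import time, so the new `base_model` column sits alongside the dynamically generated task columns. A self-contained sketch of the pattern (the `Task`/`Tasks` stand-ins are assumptions; only `ColumnContent`'s field order is taken from this file):

```python
from dataclasses import dataclass
from enum import Enum

@dataclass
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    never_hidden: bool = False

@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str

class Tasks(Enum):
    task0 = Task("nq", "score", "NQ")              # illustrative entries only
    task1 = Task("hotpotqa", "score", "HotpotQA")

class AutoEvalColumn:
    model = ColumnContent("Model", "markdown", True, never_hidden=True)
    base_model = ColumnContent("Base Model", "str", True)

# Attach one column per task, mirroring the loop above
for task in Tasks:
    setattr(AutoEvalColumn, task.name, ColumnContent(task.value.col_name, "number", True))

print(AutoEvalColumn.task0.name)  # -> NQ
```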
src/leaderboard/read_evals.py CHANGED
@@ -7,7 +7,7 @@ from dataclasses import dataclass
 import dateutil
 import numpy as np
 
-from src.display.formatting import make_clickable_model
+from src.display.formatting import make_clickable_model, extract_base_model
 from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
 
 
@@ -125,10 +125,14 @@ class EvalResult:
         elif "7b" in self.full_model.lower():
             model_size = "7B"
 
+        # Extract base model
+        base_model = extract_base_model(self.full_model)
+
         data_dict = {
            "eval_name": self.eval_name,  # not a column, just a save name
            AutoEvalColumn.rank.name: 0,  # Will be set later based on average ranking
            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
+           AutoEvalColumn.base_model.name: base_model,
            AutoEvalColumn.model_size.name: model_size,
            AutoEvalColumn.average.name: average,
         }
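End to end, each leaderboard row now carries the extracted base model next to the clickable method name. A hedged sketch of the resulting row for one model (the save name and score are made up for illustration; the column display names come from `AutoEvalColumn` above):

```python
from src.display.formatting import make_clickable_model, extract_base_model

row = {
    "eval_name": "ReSeek-Qwen2.5-7b-Instruct",                       # illustrative save name
    "Rank": 0,                                                       # filled in later from the average
    "Model": make_clickable_model("ReSeek-Qwen2.5-7b-Instruct"),     # linked "ReSeek-7b"
    "Base Model": extract_base_model("ReSeek-Qwen2.5-7b-Instruct"),  # "Qwen2.5-7b-Instruct"
    "Model Size": "7B",
    "Average Score": 0.42,                                           # illustrative value only
}
```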