Spaces:
Sleeping
Sleeping
shyuli
committed on
Commit
·
d1f8ae7
1
Parent(s):
237c85f
version v0.1
Browse files
eval-results/demo-leaderboard/gpt2-demo/results_2023-11-21T18-10-08.json
DELETED
|
@@ -1,15 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"config": {
|
| 3 |
-
"model_dtype": "torch.float16",
|
| 4 |
-
"model_name": "demo-leaderboard/gpt2-demo",
|
| 5 |
-
"model_sha": "ac3299b02780836378b9e1e68c6eead546e89f90"
|
| 6 |
-
},
|
| 7 |
-
"results": {
|
| 8 |
-
"anli_r1": {
|
| 9 |
-
"acc": 0
|
| 10 |
-
},
|
| 11 |
-
"logiqa": {
|
| 12 |
-
"acc_norm": 0.90
|
| 13 |
-
}
|
| 14 |
-
}
|
| 15 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval-results/demo-leaderboard/gpt2-demo/results_2023-11-22 15:46:20.425378.json
DELETED
|
@@ -1,33 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"results": {
|
| 3 |
-
"anli_r1": {
|
| 4 |
-
"acc": 0.4,
|
| 5 |
-
"acc_stderr": 0.11239029738980327
|
| 6 |
-
},
|
| 7 |
-
"logiqa": {
|
| 8 |
-
"acc": 0.35,
|
| 9 |
-
"acc_stderr": 0.10942433098048308,
|
| 10 |
-
"acc_norm": 0.3,
|
| 11 |
-
"acc_norm_stderr": 0.10513149660756933
|
| 12 |
-
}
|
| 13 |
-
},
|
| 14 |
-
"versions": {
|
| 15 |
-
"anli_r1": 0,
|
| 16 |
-
"logiqa": 0
|
| 17 |
-
},
|
| 18 |
-
"config": {
|
| 19 |
-
"model": "hf-causal-experimental",
|
| 20 |
-
"model_args": "pretrained=demo-leaderboard/gpt2-demo,revision=main,dtype=bfloat16",
|
| 21 |
-
"num_fewshot": 0,
|
| 22 |
-
"batch_size": 1,
|
| 23 |
-
"batch_sizes": [],
|
| 24 |
-
"device": "cpu",
|
| 25 |
-
"no_cache": true,
|
| 26 |
-
"limit": 20,
|
| 27 |
-
"bootstrap_iters": 100000,
|
| 28 |
-
"description_dict": null,
|
| 29 |
-
"model_dtype": "bfloat16",
|
| 30 |
-
"model_name": "demo-leaderboard/gpt2-demo",
|
| 31 |
-
"model_sha": "main"
|
| 32 |
-
}
|
| 33 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/about.py
CHANGED
|
@@ -44,14 +44,14 @@ We evaluate on a comprehensive set of benchmarks that test different aspects of
|
|
| 44 |
|
| 45 |
### General QA
|
| 46 |
- **NQ**: Natural Questions - QA based on real Google search queries from Wikipedia
|
| 47 |
-
- **TriviaQA**:
|
| 48 |
-
- **PopQA**:
|
| 49 |
|
| 50 |
### Multi-Hop QA
|
| 51 |
-
- **HotpotQA**:
|
| 52 |
-
- **
|
| 53 |
-
- **Musique**:
|
| 54 |
-
- **Bamboogle**:
|
| 55 |
|
| 56 |
### Novel Evaluation: FictionalHot
|
| 57 |
- **FictionalHot**: A closed-world benchmark grounding questions in synthetic fictional entities to mitigate data contamination and enable reproducible evaluation. Questions are transformed from real-world scenarios to fictional ones while preserving reasoning structure.
|
|
|
|
| 44 |
|
| 45 |
### General QA
|
| 46 |
- **NQ**: Natural Questions - QA based on real Google search queries from Wikipedia
|
| 47 |
+
- **TriviaQA**: A large-scale dataset with questions from trivia websites and competitions, featuring complex entity relationships.
|
| 48 |
+
- **PopQA**: A large-scale open-domain, entity-centric QA dataset (14k QA pairs), with questions generated by templating Wikidata knowledge tuples.
|
| 49 |
|
| 50 |
### Multi-Hop QA
|
| 51 |
+
- **HotpotQA**: The first large-scale dataset requiring reasoning across multiple Wikipedia paragraphs.
|
| 52 |
+
- **2Wiki**: A multi-hop QA dataset with explicit, annotated reasoning paths.
|
| 53 |
+
- **Musique**: A multi-hop QA benchmark of 2–4-hop questions constructed from five single-hop datasets.
|
| 54 |
+
- **Bamboogle**: A complex, cross-domain question set curated from queries that Google answers incorrectly to evaluate models’ compositional reasoning.
|
| 55 |
|
| 56 |
### Novel Evaluation: FictionalHot
|
| 57 |
- **FictionalHot**: A closed-world benchmark grounding questions in synthetic fictional entities to mitigate data contamination and enable reproducible evaluation. Questions are transformed from real-world scenarios to fictional ones while preserving reasoning structure.
|
src/display/formatting.py
CHANGED
|
@@ -2,11 +2,56 @@ def model_hyperlink(link, model_name):
|
|
| 2 |
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
| 3 |
|
| 4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
def make_clickable_model(model_name):
|
| 6 |
-
# Custom link mappings for each model
|
| 7 |
custom_links = {
|
| 8 |
-
"ReSeek-Qwen2.5-7b-Instruct": "https://
|
| 9 |
-
"ReSeek-Qwen2.5-3b-Instruct": "https://
|
| 10 |
"ZeroSearch-Qwen2.5-3b-Instruct": "https://huggingface.co/Alibaba-NLP/ZeroSearch_wiki_V2_Qwen2.5_3B_Instruct",
|
| 11 |
"ZeroSearch-Qwen2.5-7b-Instruct": "https://huggingface.co/Alibaba-NLP/ZeroSearch_wiki_V2_Qwen2.5_7B_Instruct",
|
| 12 |
"Search-R1-Qwen2.5-7b-Instruct": "https://huggingface.co/PeterJinGo/SearchR1-nq_hotpotqa_train-qwen2.5-7b-it-em-ppo",
|
|
@@ -19,12 +64,15 @@ def make_clickable_model(model_name):
|
|
| 19 |
"Direct-Inference-Qwen2.5-7b-Instruct": "",
|
| 20 |
}
|
| 21 |
|
|
|
|
|
|
|
|
|
|
| 22 |
if model_name in custom_links:
|
| 23 |
link = custom_links[model_name]
|
| 24 |
-
return model_hyperlink(link,
|
| 25 |
else:
|
| 26 |
-
# If no custom link, just return the
|
| 27 |
-
return
|
| 28 |
|
| 29 |
|
| 30 |
def styled_error(error):
|
|
|
|
| 2 |
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
| 3 |
|
| 4 |
|
| 5 |
+
def extract_method_name(full_model_name):
    """Extract method name with size from full model name (e.g., 'ReSeek-Qwen2.5-7b-Instruct' -> 'ReSeek-7b')."""
    tokens = full_model_name.split("-")
    prefix = []
    size_token = None

    for idx, token in enumerate(tokens):
        # A base-model family indicator terminates the method-name prefix.
        if token.lower() in ("qwen2.5", "qwen2", "qwen", "llama", "mistral", "phi"):
            # Scan the remaining tokens for a short size tag such as "7b" or "3b".
            size_token = next(
                (t.lower() for t in tokens[idx:] if t.lower().endswith("b") and len(t) <= 3),
                None,
            )
            break
        prefix.append(token)

    # No prefix collected (name starts with the base model, or is empty):
    # fall back to the untouched input.
    if not prefix:
        return full_model_name
    if size_token:
        return "-".join(prefix) + "-" + size_token
    return "-".join(prefix)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def extract_base_model(full_model_name):
    """Extract base model from full model name (e.g., 'ReSeek-Qwen2.5-7b-Instruct' -> 'Qwen2.5-7b-Instruct')."""
    indicators = {"qwen2.5", "qwen2", "qwen", "llama", "mistral", "phi"}
    tokens = full_model_name.split("-")

    # Index of the first token naming a known base-model family, if any.
    start = next(
        (i for i, tok in enumerate(tokens) if tok.lower() in indicators),
        None,
    )
    if start is None:
        return "Unknown"
    # Everything from the family indicator onwards is the base-model name.
    return "-".join(tokens[start:])
|
| 48 |
+
|
| 49 |
+
|
| 50 |
def make_clickable_model(model_name):
|
| 51 |
+
# Custom link mappings for each model (using full model name as key)
|
| 52 |
custom_links = {
|
| 53 |
+
"ReSeek-Qwen2.5-7b-Instruct": "https://huggingface.co/TencentBAC/ReSeek-qwen2.5-3b-em-grpo",
|
| 54 |
+
"ReSeek-Qwen2.5-3b-Instruct": "https://huggingface.co/TencentBAC/ReSeek-qwen2.5-7b-em-grpo",
|
| 55 |
"ZeroSearch-Qwen2.5-3b-Instruct": "https://huggingface.co/Alibaba-NLP/ZeroSearch_wiki_V2_Qwen2.5_3B_Instruct",
|
| 56 |
"ZeroSearch-Qwen2.5-7b-Instruct": "https://huggingface.co/Alibaba-NLP/ZeroSearch_wiki_V2_Qwen2.5_7B_Instruct",
|
| 57 |
"Search-R1-Qwen2.5-7b-Instruct": "https://huggingface.co/PeterJinGo/SearchR1-nq_hotpotqa_train-qwen2.5-7b-it-em-ppo",
|
|
|
|
| 64 |
"Direct-Inference-Qwen2.5-7b-Instruct": "",
|
| 65 |
}
|
| 66 |
|
| 67 |
+
# Extract just the method name (without base model)
|
| 68 |
+
method_name = extract_method_name(model_name)
|
| 69 |
+
|
| 70 |
if model_name in custom_links:
|
| 71 |
link = custom_links[model_name]
|
| 72 |
+
return model_hyperlink(link, method_name)
|
| 73 |
else:
|
| 74 |
+
# If no custom link, just return the method name
|
| 75 |
+
return method_name
|
| 76 |
|
| 77 |
|
| 78 |
def styled_error(error):
|
src/display/utils.py
CHANGED
|
@@ -27,9 +27,11 @@ class ColumnContent:
|
|
| 27 |
class AutoEvalColumn:
|
| 28 |
rank = ColumnContent("Rank", "number", True, never_hidden=True)
|
| 29 |
model = ColumnContent("Model", "markdown", True, never_hidden=True)
|
|
|
|
| 30 |
average = ColumnContent("Average Score", "number", True)
|
| 31 |
model_size = ColumnContent("Model Size", "str", True)
|
| 32 |
|
|
|
|
| 33 |
# Add task columns dynamically
|
| 34 |
for task in Tasks:
|
| 35 |
setattr(AutoEvalColumn, task.name, ColumnContent(task.value.col_name, "number", True))
|
|
|
|
| 27 |
class AutoEvalColumn:
|
| 28 |
rank = ColumnContent("Rank", "number", True, never_hidden=True)
|
| 29 |
model = ColumnContent("Model", "markdown", True, never_hidden=True)
|
| 30 |
+
base_model = ColumnContent("Base Model", "str", True)
|
| 31 |
average = ColumnContent("Average Score", "number", True)
|
| 32 |
model_size = ColumnContent("Model Size", "str", True)
|
| 33 |
|
| 34 |
+
|
| 35 |
# Add task columns dynamically
|
| 36 |
for task in Tasks:
|
| 37 |
setattr(AutoEvalColumn, task.name, ColumnContent(task.value.col_name, "number", True))
|
src/leaderboard/read_evals.py
CHANGED
|
@@ -7,7 +7,7 @@ from dataclasses import dataclass
|
|
| 7 |
import dateutil
|
| 8 |
import numpy as np
|
| 9 |
|
| 10 |
-
from src.display.formatting import make_clickable_model
|
| 11 |
from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
|
| 12 |
|
| 13 |
|
|
@@ -125,10 +125,14 @@ class EvalResult:
|
|
| 125 |
elif "7b" in self.full_model.lower():
|
| 126 |
model_size = "7B"
|
| 127 |
|
|
|
|
|
|
|
|
|
|
| 128 |
data_dict = {
|
| 129 |
"eval_name": self.eval_name, # not a column, just a save name,
|
| 130 |
AutoEvalColumn.rank.name: 0, # Will be set later based on average ranking
|
| 131 |
AutoEvalColumn.model.name: make_clickable_model(self.full_model),
|
|
|
|
| 132 |
AutoEvalColumn.model_size.name: model_size,
|
| 133 |
AutoEvalColumn.average.name: average,
|
| 134 |
}
|
|
|
|
| 7 |
import dateutil
|
| 8 |
import numpy as np
|
| 9 |
|
| 10 |
+
from src.display.formatting import make_clickable_model, extract_base_model
|
| 11 |
from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
|
| 12 |
|
| 13 |
|
|
|
|
| 125 |
elif "7b" in self.full_model.lower():
|
| 126 |
model_size = "7B"
|
| 127 |
|
| 128 |
+
# Extract base model
|
| 129 |
+
base_model = extract_base_model(self.full_model)
|
| 130 |
+
|
| 131 |
data_dict = {
|
| 132 |
"eval_name": self.eval_name, # not a column, just a save name,
|
| 133 |
AutoEvalColumn.rank.name: 0, # Will be set later based on average ranking
|
| 134 |
AutoEvalColumn.model.name: make_clickable_model(self.full_model),
|
| 135 |
+
AutoEvalColumn.base_model.name: base_model,
|
| 136 |
AutoEvalColumn.model_size.name: model_size,
|
| 137 |
AutoEvalColumn.average.name: average,
|
| 138 |
}
|