from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str

# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    # General QA tasks
    nq = Task("nq", "exact_match", "NQ")
    triviaqa = Task("triviaqa", "exact_match", "TriviaQA")
    popqa = Task("popqa", "exact_match", "PopQA")

    # Multi-hop QA tasks
    hotpotqa = Task("hotpotqa", "exact_match", "HotpotQA")
    twowiki = Task("2wiki", "exact_match", "2wiki")
    musique = Task("musique", "exact_match", "Musique")
    bamboogle = Task("bamboogle", "exact_match", "Bamboogle")
    fictionalhot = Task("fictionalhot", "exact_match", "FictionalHot")
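
# Illustrative only (not part of the leaderboard logic): how a submission's
# "results" dict, shaped like the JSON example in EVALUATION_QUEUE_TEXT below,
# could be mapped to display columns via the Tasks enum. The helper name is a
# placeholder.
def _example_display_scores(results: dict) -> dict:
    """Return {display column: score} for every task present in `results`."""
    return {
        task.value.col_name: results[task.value.benchmark][task.value.metric]
        for task in Tasks
        if task.value.benchmark in results
    }
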
NUM_FEWSHOT = 0  # Change to match your few-shot setting
# ---------------------------------------------------
# Your leaderboard name
TITLE = """<h1 align="center" id="space-title">🔍 SearchAgent Leaderboard</h1>"""
# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
# 🔍 SearchAgent Leaderboard
This leaderboard evaluates the performance of **search-augmented question answering systems** across various tasks, ranging from simple factual QA to complex multi-hop reasoning. Our evaluation addresses the inconsistency in experimental settings across prior works by providing a standardized comparison framework.
## 📊 Evaluation Tasks
We evaluate on a comprehensive set of benchmarks that test different aspects of search-augmented QA:
### General QA
- **NQ**: Natural Questions - open-domain questions drawn from real Google search queries, with answers grounded in Wikipedia
- **TriviaQA**: Trivia questions requiring answer extraction from supporting documents
- **PopQA**: Entity-centric QA probing long-tail factual knowledge and the gap between parametric and non-parametric memory
### Multi-Hop QA
- **HotpotQA**: Complex QA requiring reasoning across multiple documents with explainable reasoning chains
- **2wiki**: 2WikiMultihopQA - multi-hop questions over Wikipedia that require compositional reasoning across articles
- **Musique**: Multi-hop QA built by composing single-hop questions into multi-step reasoning chains
- **Bamboogle**: Adversarial search QA designed to test compositionality gaps in language models
### Novel Evaluation: FictionalHot
- **FictionalHot**: A closed-world benchmark grounding questions in synthetic fictional entities to mitigate data contamination and enable reproducible evaluation. Questions are transformed from real-world scenarios to fictional ones while preserving reasoning structure.
## 🎯 Evaluation Metrics
Following standard practice, we use **Exact Match (EM)** as the primary metric. A prediction is counted as correct if its normalized string exactly matches any normalized reference answer (lowercasing, punctuation removal, and whitespace normalization).
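
For reference, a minimal sketch of the normalization and EM scoring described above (helper names are illustrative, not the leaderboard's exact implementation):

```python
import string

def normalize_answer(text: str) -> str:
    # Lowercase, drop punctuation, and collapse whitespace.
    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    return " ".join(text.split())

def exact_match(prediction: str, references: list[str]) -> float:
    # 1.0 if the normalized prediction equals any normalized reference, else 0.0.
    pred = normalize_answer(prediction)
    return float(any(pred == normalize_answer(ref) for ref in references))
```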
"""
# Which evaluations are you running? how can people reproduce what you have?
LLM_BENCHMARKS_TEXT = f"""
## 🔬 Evaluation Methodology
This leaderboard addresses the challenge of inconsistent experimental settings in search agent evaluation by providing standardized comparisons. Prior works vary significantly in:
1. **Corpora**: from static Wikipedia snapshots (2018, 2019) to live Internet access
2. **Test Sets**: from broad general-QA suites to focused multi-hop evaluation
3. **Training Regimes**: from no training at all to multi-dataset fine-tuning
4. **Metrics**: Exact Match, F1, substring matching, and LLM-as-a-judge evaluation
## 📋 Dataset Details & Challenges
### Data Contamination Problem
A critical issue in current benchmarks is **data contamination**, where high scores may reflect memorized pretraining knowledge rather than genuine procedural reasoning capabilities.
### Our Solution: FictionalHot
We introduce **FictionalHot**, a novel closed-world benchmark that:
- Grounds all questions in newly generated synthetic fictional entities
- Uses a three-step construction pipeline: sampling → GPT-based entity replacement → synthetic document generation
- Forces models to rely on procedural reasoning over provided documents
- Enables reproducible evaluation with a fixed knowledge source
### Benchmark Coverage
- **Corpus**: 2018 Wikipedia snapshot for reproducibility
- **Retrieval**: top-k = 3, with at most T = 4 tool-use turns per question
## 🔄 Experimental Setup
Following established practices, we:
- Fine-tune on unified NQ + HotpotQA training data
- Evaluate Qwen2.5-3B-Instruct and Qwen2.5-7B-Instruct as backbone models
- Use E5 embeddings as the retrieval backend
- Apply standard Exact Match evaluation with string normalization
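
Summarized as a configuration sketch (values taken from the text above; the exact E5 checkpoint is an assumption, shown only for illustration):

```python
# Illustrative evaluation configuration; not the leaderboard's actual code.
RETRIEVAL_TOP_K = 3                        # retrieval depth (top-k)
MAX_TOOL_TURNS = 4                         # maximum tool-use turns per question
RETRIEVER_MODEL = "intfloat/e5-base-v2"    # assumed E5 variant
CORPUS_SNAPSHOT = "wikipedia-2018"         # 2018 Wikipedia dump
EVAL_METRIC = "exact_match"
```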
"""
EVALUATION_QUEUE_TEXT = """
## 📣 Model Submission via Community
We now accept submissions via the Space's Community (Discussions). This keeps the process simple and transparent.
- Go to the Community tab of this leaderboard Space:
https://huggingface.co/spaces/TencentBAC/SearchAgent_Leaderboard
- Create a new Discussion with title:
`Submission: <YourMethod>-<model_name>-<model_size>`
- Include the following in the post:
- Model weights link (HF or GitHub)
- Short method description
- Evaluation JSON (inline or attached)
Example JSON:
```json
{
  "config": {
    "model_dtype": "torch.float16",
    "model_name": "YourMethod-Qwen2.5-7b-Instruct",
    "model_sha": "main"
  },
  "results": {
    "nq": {"exact_match": 0.45},
    "triviaqa": {"exact_match": 0.62},
    "popqa": {"exact_match": 0.38},
    "hotpotqa": {"exact_match": 0.41},
    "2wiki": {"exact_match": 0.33},
    "musique": {"exact_match": 0.15},
    "bamboogle": {"exact_match": 0.28},
    "fictionalhot": {"exact_match": 0.06}
  }
}
```
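
Before opening the Discussion, you can sanity-check your JSON locally. A minimal sketch (the path `results.json` is just an example):

```python
import json

REQUIRED_TASKS = [
    "nq", "triviaqa", "popqa", "hotpotqa",
    "2wiki", "musique", "bamboogle", "fictionalhot",
]

with open("results.json") as f:
    submission = json.load(f)

assert "config" in submission and "results" in submission
for task in REQUIRED_TASKS:
    score = submission["results"][task]["exact_match"]
    assert 0.0 <= score <= 1.0, f"{task}: exact_match should be a fraction in [0, 1]"
print("Submission JSON looks well-formed.")
```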
We will review your post and add your model to the leaderboard.
"""
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""
% Key Search-Augmented QA Methods
@article{luo2024search,
  title={Search-o1: Agentic Search-Enhanced Large Reasoning Models},
  author={Li, Xiaoxi and Dong, Guanting and Jin, Jiajie and Zhang, Yuyao and Zhou, Yujia and Zhu, Yutao and Zhang, Peitian and Dou, Zhicheng},
  journal={arXiv preprint arXiv:2501.05366},
  year={2025}
}
@article{songR1SearcherIncentivizingSearch2025,
  title={R1-Searcher: Incentivizing the Search Capability in LLMs via Reinforcement Learning},
  author={Song, Huatong and Jiang, Jinhao and Min, Yingqian and Chen, Jie and Chen, Zhipeng and Zhao, Wayne Xin and Fang, Lei and Wen, Ji-Rong},
  journal={arXiv preprint arXiv:2503.05592},
  year={2025}
}
@article{jin2025search,
  title={Search-R1: Training LLMs to Reason and Leverage Search Engines with Reinforcement Learning},
  author={Jin, Bowen and Zeng, Hansi and Yue, Zhenrui and Yoon, Jinsung and Arik, Sercan and Wang, Dong and Zamani, Hamed and Han, Jiawei},
  journal={arXiv preprint arXiv:2503.09516},
  year={2025}
}
@article{sunZeroSearchIncentivizeSearch2025,
  title={ZeroSearch: Incentivize the Search Capability of LLMs without Searching},
  author={Sun, Hao and Qiao, Zile and Guo, Jiayan and Fan, Xuanbo and Hou, Yingyan and Jiang, Yong and Xie, Pengjun and Zhang, Yan and Huang, Fei and Zhou, Jingren},
  journal={arXiv preprint arXiv:2505.04588},
  year={2025}
}
@article{zheng2025deepresearcher,
  title={DeepResearcher: Scaling Deep Research via Reinforcement Learning in Real-World Environments},
  author={Zheng, Yuxiang and Fu, Dayuan and Hu, Xiangkun and Cai, Xiaojie and Ye, Lyumanshan and Lu, Pengrui and Liu, Pengfei},
  journal={arXiv preprint arXiv:2504.03160},
  year={2025}
}

% Benchmark Datasets
@article{kwiatkowskiNaturalQuestionsBenchmark2019,
  title={Natural Questions: A Benchmark for Question Answering Research},
  author={Kwiatkowski, Tom and Palomaki, Jennimaria and Redfield, Olivia and Collins, Michael and Parikh, Ankur and Alberti, Chris and Epstein, Danielle and Polosukhin, Illia and Devlin, Jacob and Lee, Kenton and others},
  journal={Transactions of the Association for Computational Linguistics},
  volume={7},
  pages={453--466},
  year={2019}
}
@inproceedings{yangHotpotQADatasetDiverse2018,
  title={HotpotQA: A Dataset for Diverse, Explainable Multi-hop Question Answering},
  author={Yang, Zhilin and Qi, Peng and Zhang, Saizheng and Bengio, Yoshua and Cohen, William and Salakhutdinov, Ruslan and Manning, Christopher D.},
  booktitle={Proceedings of EMNLP},
  year={2018}
}
@article{trivediMuSiQueMultihopQuestions2022,
  title={MuSiQue: Multihop Questions via Single-hop Question Composition},
  author={Trivedi, Harsh and Balasubramanian, Niranjan and Khot, Tushar and Sabharwal, Ashish},
  journal={Transactions of the Association for Computational Linguistics},
  volume={10},
  pages={539--554},
  year={2022}
}
"""