import os
import json
import datetime
from email.utils import parseaddr

import gradio as gr
import pandas as pd
from datasets import load_dataset
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

from content import format_error, format_warning, format_log, TITLE


# Placeholder for the question_scorer function
def question_scorer(prediction, gold_answer):
    return 1 if prediction == gold_answer else 0
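

# Note: this exact-match stub is only a stand-in for local testing; the leaderboard's
# actual scoring function lives in scorer.py in this Space (linked in the submission
# instructions below).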

# Constants and Configuration
TOKEN = os.environ.get("TOKEN", None)
OWNER = "Ori"
DATA_DATASET = "Ori/AssistantBench_V1.0"
RESULTS_DATASET = "Ori/results"
SUBMISSION_DATASET = f"{OWNER}/submissions"
LEADERBOARD_PATH = f"{OWNER}/leaderboard"
api = HfApi()
YEAR_VERSION = "2024"

os.makedirs("scored", exist_ok=True)

# Load datasets
eval_results = load_dataset(
    RESULTS_DATASET,
    token=TOKEN,
    download_mode="force_redownload",
    ignore_verifications=True,
    trust_remote_code=True,
)
gold_results = load_dataset(DATA_DATASET, token=TOKEN, trust_remote_code=True)
gold_answers = {split: {row["id"]: row["answer"] for row in gold_results[split]} for split in ["test"]}

# Function to get dataframe from results
def get_dataframe_from_results(eval_results, split):
    local_df = eval_results[split]
    df = pd.DataFrame(local_df)
    df = df.sort_values(by=["Accuracy"], ascending=False)
    numeric_cols = [c for c in local_df.column_names if "score" in c]
    df[numeric_cols] = df[numeric_cols].multiply(100).round(decimals=2)
    return df


eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")


# Function to restart the space
def restart_space():
    api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)


TYPES = ["markdown", "number", "number", "number", "number", "str", "str"]

# Function to add a new evaluation
def add_new_eval(
    model_name: str,
    model_family: str,
    url: str,
    path_to_file: str,
    organization: str,
    mail: str,
):
    _, parsed_mail = parseaddr(mail)
    if "@" not in parsed_mail:
        return format_warning("Please provide a valid email address.")

    print("Adding new eval")

    if (model_name.lower() in {m.lower() for m in eval_results["test"]["Model Name"]}
            and organization.lower() in {o.lower() for o in eval_results["test"]["Organization"]}):
        return format_warning("This model has already been submitted.")

    if path_to_file is None:
        return format_warning("Please attach a file.")

    api.upload_file(
        repo_id=SUBMISSION_DATASET,
        path_or_fileobj=path_to_file.name,
        path_in_repo=f"{organization}/{model_name}/{YEAR_VERSION}_test_raw_{datetime.datetime.today()}.jsonl",
        repo_type="dataset",
        token=TOKEN,
    )
    file_path = path_to_file.name
    scores = 0
    num_questions = 0
    with open(f"scored/{organization}_{model_name}.jsonl", "w") as scored_file:
        with open(file_path, "r") as f:
            for ix, line in enumerate(f):
                try:
                    task = json.loads(line)
                except Exception:
                    return format_error(f"Line {ix} is incorrectly formatted. Please fix it and resubmit your file.")

                if "answer" not in task:
                    return format_error(f"Line {ix} contains no answer key. Please fix it and resubmit your file.")

                answer = task["answer"]
                task_id = task["id"]
                if task_id not in gold_answers["test"]:
                    return format_error(f"{task_id} not found in test set. Are you sure you submitted the correct file?")

                score = question_scorer(task["answer"], gold_answers["test"][task_id])
                scored_file.write(
                    json.dumps({
                        "id": task_id,
                        "model_answer": answer,
                        "score": score
                    }) + "\n"
                )
                scores += score
                num_questions += 1
    api.upload_file(
        repo_id=SUBMISSION_DATASET,
        path_or_fileobj=f"scored/{organization}_{model_name}.jsonl",
        path_in_repo=f"{organization}/{model_name}/{YEAR_VERSION}_test_scored_{datetime.datetime.today()}.jsonl",
        repo_type="dataset",
        token=TOKEN,
    )

    eval_entry = {
        "Model Name": model_name,
        "Model Family": model_family,
        "URL": url,
        "Organization": organization,
        "Accuracy": scores / num_questions if num_questions > 0 else 0,
        "Answer rate": scores / num_questions if num_questions > 0 else 0,
        "Precision": scores / num_questions if num_questions > 0 else 0,
        "EM": scores if num_questions > 0 else 0,
        "Cost": 0,  # Placeholder for cost, update with actual value if needed
    }
    eval_results["test"] = eval_results["test"].add_item(eval_entry)
    eval_results.push_to_hub(RESULTS_DATASET, config_name=YEAR_VERSION, token=TOKEN)

    return format_log(
        f"Model {model_name} submitted by {organization} successfully.\n"
        "Please wait a few hours and refresh the leaderboard to see your score displayed."
    )

# Function to refresh the results
def refresh():
    eval_results = load_dataset(
        RESULTS_DATASET,
        YEAR_VERSION,
        token=TOKEN,
        download_mode="force_redownload",
        ignore_verifications=True,
        trust_remote_code=True,
    )
    eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")
    return eval_dataframe_test

# Gradio interface
demo = gr.Blocks()
with demo:
    gr.HTML("<h1>AssistantBench</h1>")
    gr.Markdown("""
AssistantBench aims to evaluate the ability of web agents to assist with real and time-consuming tasks.
For more information, please check out our paper or the official website.
To download AssistantBench, press [here](https://huggingface.co/datasets/Ori/AssistantBench_V1.0).
    """)

    gr.HTML("<h2>AssistantBench Leaderboard</h2>")
    with gr.Tab("Results: Test"):
        leaderboard_table_test = gr.Dataframe(
            value=eval_dataframe_test, datatype=TYPES, interactive=False,
            column_widths=["20%"]
        )

    refresh_button = gr.Button("Refresh")
    refresh_button.click(
        refresh,
        inputs=[],
        outputs=[
            leaderboard_table_test,
        ],
    )

    gr.HTML("<h2>Making a New Submission</h2>")
    with gr.Accordion("Submit a new model for evaluation"):
        with gr.Row():
            gr.Markdown("""
To make a new submission, upload a predictions file. We support JSONL files with the following format:
```
{"id": "task_id_1", "answer": "Answer 1 from your model"}
{"id": "task_id_2", "answer": "Answer 2 from your model"}
```
Our scoring function can be found [here](https://huggingface.co/spaces/AssistantBench/leaderboard/blob/main/scorer.py).
            """)
        with gr.Row():
            with gr.Column():
                model_name_textbox = gr.Textbox(label="Model Name")
                model_family_textbox = gr.Textbox(label="Model Family")
                url_textbox = gr.Textbox(label="URL to Model Information")
            with gr.Column():
                organization = gr.Textbox(label="Organization")
                mail = gr.Textbox(
                    label="Contact Email (will be stored privately & used if there is an issue with your submission)")
                file_output = gr.File()

        submit_button = gr.Button("Submit Eval")
        submission_result = gr.Markdown()
        submit_button.click(
            add_new_eval,
            [
                model_name_textbox,
                model_family_textbox,
                url_textbox,
                file_output,
                organization,
                mail,  # add_new_eval expects the contact email as its last argument
            ],
            submission_result,
        )
    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_text = """@article{yoran-etal-2023-assistantbench,
title={AssistantBench: Can Web Agents Solve Realistic and Time-Consuming Tasks?},
author={Ori Yoran and Samuel Amouyal and Chaitanya Malaviya and Ben Bogin and Ofir Press and Jonathan Berant},
year={2024},
eprint={TODO},
archivePrefix={arXiv},
primaryClass={cs.CL}
}"""
            citation_button = gr.Textbox(
                value=citation_text,
                label="Citation",
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )
    gr.HTML(
        "<p>We would like to thank the GAIA team, whose leaderboard template this one builds on, and Hugging Face for hosting the leaderboard.</p>")

# Restart the Space every hour so the leaderboard reloads fresh results
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=3600)
scheduler.start()

demo.launch(debug=True)