from __future__ import annotations

import zipfile
from dataclasses import dataclass
from pathlib import Path

import gradio as gr
import pandas as pd
import website_texts
from apscheduler.schedulers.background import BackgroundScheduler
from constants import Constants, model_type_emoji
from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
from website_texts import (
    ABOUT_TEXT,
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    INTRODUCTION_TEXT,
    TITLE,
    VERSION_HISTORY_BUTTON_TEXT,
)

def get_model_family(model_name: str) -> str:
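    """Map a raw method name to its model family via known name prefixes."""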
    prefixes_mapping = {
        Constants.reference: ["AutoGluon"],
        Constants.neural_network: ["REALMLP", "TabM", "FASTAI", "MNCA", "NN_TORCH"],
        Constants.tree: ["GBM", "CAT", "EBM", "XGB", "XT", "RF"],
        Constants.foundational: ["TABDPT", "TABICL", "TABPFN"],
        Constants.baseline: ["KNN", "LR"],
    }
    for method_type, prefixes in prefixes_mapping.items():
        for prefix in prefixes:
            if prefix.lower() in model_name.lower():
                return method_type
    return Constants.other

def rename_map(model_name: str) -> str:
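    """Rewrite internal model prefixes (e.g., "GBM", "CAT") to display names."""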
    rename_map = {
        "TABM": "TabM",
        "REALMLP": "RealMLP",
        "GBM": "LightGBM",
        "CAT": "CatBoost",
        "XGB": "XGBoost",
        "XT": "ExtraTrees",
        "RF": "RandomForest",
        "MNCA": "ModernNCA",
        "NN_TORCH": "TorchMLP",
        "FASTAI": "FastaiMLP",
        "TABPFNV2": "TabPFNv2",
        "EBM": "EBM",
        "TABDPT": "TabDPT",
        "TABICL": "TabICL",
        "KNN": "KNN",
        "LR": "Linear",
    }
    for prefix in rename_map:
        if prefix in model_name:
            return model_name.replace(prefix, rename_map[prefix])
    return model_name

def load_data(filename: str, data_source: str = "data") -> pd.DataFrame:
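    """Load a leaderboard CSV and prepare it for display.

    Adds model family/type columns, an Elo 95% CI string, derived score columns,
    imputation markers, and hardware info, then selects, rounds, sorts, and
    renames the columns shown on the website.
    """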
    df_leaderboard = pd.read_csv(Path(__file__).parent / data_source / filename)

    # Add Model Family Information
    df_leaderboard["Type"] = df_leaderboard.loc[:, "method"].apply(
        lambda s: model_type_emoji[get_model_family(s)]
    )
    df_leaderboard["TypeName"] = df_leaderboard.loc[:, "method"].apply(
        lambda s: get_model_family(s)
    )
    df_leaderboard["method"] = df_leaderboard["method"].apply(rename_map)

    # elo,elo+,elo-,mrr
    df_leaderboard["Elo 95% CI"] = (
        "+"
        + df_leaderboard["elo+"].round(0).astype(int).astype(str)
        + "/-"
        + df_leaderboard["elo-"].round(0).astype(int).astype(str)
    )

    # select only the columns we want to display
    df_leaderboard["normalized-score"] = 1 - df_leaderboard["normalized-error"]
    df_leaderboard["hmr"] = 1 / df_leaderboard["mrr"]
    df_leaderboard["improvability"] = 100 * df_leaderboard["champ_delta"]

    # Imputed logic
    if "imputed" in df_leaderboard.columns:
        df_leaderboard["imputed"] = (100 * df_leaderboard["imputed"]).round(2)
        df_leaderboard["imputed_bool"] = False

        # Filter methods that are fully imputed.
        df_leaderboard = df_leaderboard[~(df_leaderboard["imputed"] == 100)]

        # Add imputed column and add name postfix
        imputed_mask = df_leaderboard["imputed"] != 0
        df_leaderboard.loc[imputed_mask, "imputed_bool"] = True
        df_leaderboard.loc[imputed_mask, "method"] = df_leaderboard.loc[
            imputed_mask, ["method", "imputed"]
        ].apply(lambda row: row["method"] + f" [{row['imputed']:.2f}% IMPUTED]", axis=1)
    else:
        df_leaderboard["imputed_bool"] = None
        df_leaderboard["imputed"] = None

    # Resolve GPU postfix
    gpu_postfix = "_GPU"
    df_leaderboard["Hardware"] = df_leaderboard["method"].apply(
        lambda x: "CPU" if gpu_postfix not in x else "GPU"
    )
    df_leaderboard["method"] = df_leaderboard["method"].str.replace(gpu_postfix, "")

    df_leaderboard = df_leaderboard.loc[
        :,
        [
            "Type",
            "TypeName",
            "method",
            "elo",
            "Elo 95% CI",
            "normalized-score",
            "rank",
            "hmr",
            "improvability",
            "median_time_train_s_per_1K",
            "median_time_infer_s_per_1K",
            "imputed",
            "imputed_bool",
            "Hardware",
        ],
    ]

    # round for better display
    df_leaderboard[["elo", "Elo 95% CI"]] = df_leaderboard[["elo", "Elo 95% CI"]].round(
        0
    )
    df_leaderboard[["median_time_train_s_per_1K", "rank", "hmr"]] = df_leaderboard[
        ["median_time_train_s_per_1K", "rank", "hmr"]
    ].round(2)
    df_leaderboard[
        ["normalized-score", "median_time_infer_s_per_1K", "improvability"]
    ] = df_leaderboard[
        ["normalized-score", "median_time_infer_s_per_1K", "improvability"]
    ].round(3)

    df_leaderboard = df_leaderboard.sort_values(by="elo", ascending=False)
    df_leaderboard = df_leaderboard.reset_index(drop=True)
    df_leaderboard = df_leaderboard.reset_index(names="#")
    # rename some columns
    return df_leaderboard.rename(
        columns={
            "median_time_train_s_per_1K": "Median Train Time (s/1K) [⬇️]",
            "median_time_infer_s_per_1K": "Median Predict Time (s/1K) [⬇️]",
            "method": "Model",
            "elo": "Elo [⬆️]",
            "rank": "Rank [⬇️]",
            "normalized-score": "Score [⬆️]",
            "hmr": "Harmonic Rank [⬇️]",
            "improvability": "Improvability (%) [⬇️]",
            "imputed": "Imputed (%) [⬇️]",
            "imputed_bool": "Imputed",
        }
    )

@dataclass
class LBContainer:
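    """Bundle a leaderboard's display name, data file, blurb, and overview image.

    The leaderboard DataFrame is loaded from `file_name` on construction.
    """
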
    name: str
    file_name: str
    blurb: str
    overview_image_name: str | None
    df_leaderboard: pd.DataFrame | None = None

    def __post_init__(self):
        self.df_leaderboard = load_data(self.file_name)

def make_overview_image(
    overview_image_name: str | None, data_source: str = "data"
) -> None:
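    """Unzip the overview plot for a leaderboard and display it as a Gradio image."""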
    if overview_image_name is None:
        return

    path_to_image = Path(__file__).parent / data_source / overview_image_name
    path_to_image_zip = path_to_image.with_suffix(".png.zip")
    with zipfile.ZipFile(path_to_image_zip, "r") as zipf:
        zipf.extractall(path_to_image.parent)

    gr.Image(
        str(path_to_image), label="Leaderboard Overview", show_label=True, height=550
    )

def make_overview_leaderboard(lbs: list[LBContainer]) -> gr.DataFrame:
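    """Build the cross-leaderboard overview table.

    Each leaderboard contributes one column with its Elo-based rank per model
    (excluding the reference pipeline). Models missing from a leaderboard are
    marked with "--", and the top three ranks per column are highlighted.
    """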
    # Create column per LB
    all_models = {
        m.split("[")[0].strip()
        for lb in lbs
        for m in lb.df_leaderboard[
            ~lb.df_leaderboard["TypeName"].isin(["Reference Pipeline"])
        ]["Model"]
        .unique()
        .tolist()
    }

    full_df = None
    for lb in lbs:
        df = lb.df_leaderboard.copy()
        df = df[~df["TypeName"].isin(["Reference Pipeline"])]
        df[lb.name] = df["Elo [⬆️]"].rank(ascending=False, method="first").astype(int)
        df = df.sort_values(by=lb.name, ascending=True)
        # Adding medal indicators does not work as it turns the column into
        # strings, which then no longer sort correctly.
        # df[lb.name] = df[lb.name].astype(str)
        # df[lb.name] = df[lb.name].replace({
        #     "1": "🥇 1",
        #     "2": "🥈 2",
        #     "3": "🥉 3",
        #     }
        # )
        df = df[["Type", "Model", lb.name]]
        # Remove imputed message.
        df["Model"] = (
            df["Model"].apply(lambda x: x.split("[")[0].strip()).astype("string")
        )

        if full_df is None:
            # TODO: add support in case a model did not run on the full LB.
            assert all_models.difference(set(df["Model"].unique())) == set()
            full_df = df
        else:
            df = df[["Model", lb.name]]
            df_models = set(df["Model"].unique())
            missing_models = all_models.difference(df_models)
            if missing_models:
                missing_models_df = pd.DataFrame(
                    [[mm, "--"] for mm in missing_models],
                    columns=["Model", lb.name],
                )
                df = pd.concat([df, missing_models_df], ignore_index=True)
                df["Model"] = df["Model"].astype("string")

            # Merge
            full_df = full_df.merge(df, how="left", on="Model", validate="1:1")

    medal_colors = ["#998A00", "#808080", "#8C5520"]

    # Highlight function
    def highlight_top3(col):
        styles = [""] * len(col)
        for index_i in range(len(col)):
            if (not isinstance(col.iloc[index_i], str)) and col.iloc[index_i] <= 3:
                styles[index_i] = (
                    f"background-color: {medal_colors[col.iloc[index_i] - 1]};"
                )
        return styles

    styler = full_df.style.apply(highlight_top3, axis=0, subset=[lb.name for lb in lbs])

    return gr.DataFrame(
        styler,
        pinned_columns=2,
        interactive=False,
        show_search="search",
        label="The ranking of all models (with imputation) across various leaderboards.",
    )

def make_leaderboard(df_leaderboard: pd.DataFrame) -> Leaderboard:
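    """Create the interactive leaderboard with type, tuning, and imputation filters."""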
    # -- Add filters
    df_leaderboard["TypeFilter"] = df_leaderboard["TypeName"].apply(
        lambda m: f"{m} {model_type_emoji[m]}"
    )
    df_leaderboard["Only Default"] = df_leaderboard["Model"].str.endswith("(default)")
    df_leaderboard["Only Tuned"] = df_leaderboard["Model"].str.endswith("(tuned)")
    df_leaderboard["Only Tuned + Ensemble"] = df_leaderboard["Model"].str.endswith(
        "(tuned + ensemble)"
    ) | df_leaderboard["Model"].str.endswith("(4h)")

    filter_columns = [
        ColumnFilter("TypeFilter", type="checkboxgroup", label="Model Types"),
        ColumnFilter("Only Default", type="boolean", default=False),
        ColumnFilter("Only Tuned", type="boolean", default=False),
        ColumnFilter("Only Tuned + Ensemble", type="boolean", default=False),
    ]

    # Add a filter for (not) imputed models
    if any(df_leaderboard["Imputed"]):
        df_leaderboard["Imputed"] = df_leaderboard["Imputed"].replace(
            {
                True: "Imputed",
                False: "Not Imputed",
            }
        )
        filter_columns.append(
            ColumnFilter(
                "Imputed",
                type="checkboxgroup",
                label="(Not) Imputed Models",
                info="We impute the performance of models that cannot run on all"
                " datasets due to task or dataset size constraints (e.g., TabPFN,"
                " TabICL), using the performance of a default RandomForest."
                " We add the postfix [X% IMPUTED] to a model's name if any of its"
                " results were imputed, where X% is the percentage of imputed"
                " datasets. In general, imputation hurts the reported performance,"
                " punishing the model for not being able to run on all datasets.",
            )
        )

    return Leaderboard(
        value=df_leaderboard,
        select_columns=SelectColumns(
            default_selection=list(df_leaderboard.columns),
            cant_deselect=["Type", "Model"],
            label="Select Columns to Display:",
        ),
        hide_columns=[
            "TypeName",
            "TypeFilter",
            "RefModel",
            "Only Default",
            "Only Tuned",
            "Only Tuned + Ensemble",
            "Imputed",
        ],
        search_columns=["Model", "TypeName"],
        filter_columns=filter_columns,
        bool_checkboxgroup_label="Custom Views (exclusive, only toggle one at a time):",
        height=800,
    )

def _get_lbs() -> tuple[LBContainer, ...]:
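    """Instantiate all leaderboard variants shown on the website."""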
    ta = LBContainer(
        name="Main",
        file_name="full-imputed/tabarena_leaderboard.csv",
        overview_image_name="full-imputed/tuning-impact-elo.png",
        blurb="Leaderboard for all datasets including all (imputed) models.",
    )
    ta_lite = LBContainer(
        name="Lite",
        file_name="lite/full-imputed/tabarena_leaderboard.csv",
        overview_image_name="lite/full-imputed/tuning-impact-elo.png",
        blurb="Leaderboard for one split (1st fold, 1st repeat) of all datasets including all (imputed) models.",
    )
    ta_clf = LBContainer(
        name="Classification",
        file_name="full-imputed-cls/tabarena_leaderboard.csv",
        overview_image_name="full-imputed-cls/tuning-impact-elo.png",
        blurb="Leaderboard for all 38 classification datasets including all (imputed) models.",
    )
    ta_reg = LBContainer(
        name="Regression",
        file_name="full-imputed-reg/tabarena_leaderboard.csv",
        # FIXME: get overview image without TabICL
        overview_image_name="full-imputed-reg/tuning-impact-elo.png",
        blurb="Leaderboard for all 13 regression datasets including all (imputed) models.",
    )
    ta_tabicl = LBContainer(
        name="TabICL-data",
        file_name="tabicl-imputed/tabarena_leaderboard.csv",
        overview_image_name="tabicl-imputed/tuning-impact-elo.png",
        blurb="Leaderboard for all 36 datasets within the constraints of TabICL including all (imputed) models.",
    )
    ta_tabpfn = LBContainer(
        name="TabPFN-data",
        file_name="tabpfn-imputed/tabarena_leaderboard.csv",
        overview_image_name="tabpfn-imputed/tuning-impact-elo.png",
        blurb="Leaderboard for all 33 datasets within the constraints of TabPFN including all (imputed) models.",
    )
    ta_tabpfn_tabicl = LBContainer(
        name="TabPFN/ICL-data",
        file_name="tabpfn-tabicl/tabarena_leaderboard.csv",
        overview_image_name="tabpfn-tabicl/tuning-impact-elo.png",
        blurb="Leaderboard for all 26 datasets within the constraints of TabPFN and TabICL including all models.",
    )
    return ta, ta_lite, ta_clf, ta_reg, ta_tabicl, ta_tabpfn, ta_tabpfn_tabicl

def main():
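    """Build and launch the Gradio app: intro accordions, the cross-leaderboard
    overview, one tab per leaderboard, and the archived v0.1 leaderboard.
    """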
| css = """ | |
| .markdown-text-box { | |
| padding: 4px; | |
| border-radius: 2px; | |
| } | |
| """ | |
| js_func = """ | |
| function refresh() { | |
| const url = new URL(window.location); | |
| if (url.searchParams.get('__theme') !== 'dark') { | |
| url.searchParams.set('__theme', 'dark'); | |
| window.location.href = url.href; | |
| } | |
| } | |
| """ | |
| demo = gr.Blocks(css=css, js=js_func, title="TabArena") | |
| with demo: | |
| gr.HTML(TITLE) | |
| # -- Introduction | |
| gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text") | |
| with gr.Row(): | |
| with gr.Column(), gr.Accordion("π Datasets", open=False): | |
| gr.Markdown( | |
| website_texts.OVERVIEW_DATASETS, elem_classes="markdown-text-box" | |
| ) | |
| with gr.Column(), gr.Accordion("π€ Models", open=False): | |
| gr.Markdown( | |
| website_texts.OVERVIEW_MODELS, elem_classes="markdown-text-box" | |
| ) | |
| with gr.Row(): | |
| with gr.Column(), gr.Accordion("π Metrics", open=False): | |
| gr.Markdown( | |
| website_texts.OVERVIEW_METRICS, elem_classes="markdown-text-box" | |
| ) | |
| with gr.Column(), gr.Accordion("π Reference Pipeline", open=False): | |
| gr.Markdown( | |
| website_texts.OVERVIEW_REF_PIPE, elem_classes="markdown-text-box" | |
| ) | |
| with gr.Row(), gr.Accordion("π More Details", open=False): | |
| gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text-box") | |
| with gr.Row(), gr.Accordion("π Citation", open=False): | |
| gr.Textbox( | |
| value=CITATION_BUTTON_TEXT, | |
| label=CITATION_BUTTON_LABEL, | |
| lines=7, | |
| elem_id="citation-button", | |
| show_copy_button=True, | |
| ) | |
        # -- Get all LBs we need:
        ta, ta_lite, ta_clf, ta_reg, ta_tabicl, ta_tabpfn, ta_tabpfn_tabicl = _get_lbs()

        # -- LB Overview
        gr.Markdown("## TabArena Overview")
        ordered_lbs = [
            ta,
            ta_clf,
            ta_reg,
            ta_tabicl,
            ta_tabpfn,
            ta_tabpfn_tabicl,
            ta_lite,
        ]
        make_overview_leaderboard(lbs=ordered_lbs)

        gr.Markdown("## TabArena Leaderboards")
        with gr.Tabs(elem_classes="tab-buttons"):
            for lb_id, lb in enumerate(ordered_lbs):
                with gr.TabItem(lb.name, elem_id="llm-benchmark-tab-table", id=lb_id):
                    gr.Markdown(lb.blurb, elem_classes="markdown-text")
                    make_overview_image(lb.overview_image_name)
                    make_leaderboard(lb.df_leaderboard)

        with gr.Row(), gr.Accordion("Version History", open=False):
            gr.Markdown(VERSION_HISTORY_BUTTON_TEXT, elem_classes="markdown-text")

        gr.Markdown("## Old Leaderboards")
        with (
            gr.Tabs(elem_classes="tab-buttons"),
            gr.TabItem("TabArena-v0.1", elem_id="llm-benchmark-tab-table", id=2),
        ):
            df_leaderboard = load_data(
                "tabarena_leaderboard.csv.zip", data_source="old_data/v0_1_0"
            )
            df_leaderboard["Imputed"] = False
            imputed_map = {
                "TabPFNv2": 35.29,
                "TabICL": 29.41,
            }
            for model_name, imputed_percentage in imputed_map.items():
                if imputed_percentage == 100:
                    # Filter methods that are fully imputed.
                    df_leaderboard = df_leaderboard[
                        ~df_leaderboard["Model"].str.startswith(model_name)
                    ]
                else:
                    mask = df_leaderboard["Model"].str.startswith(model_name)
                    df_leaderboard.loc[mask, "Model"] = (
                        df_leaderboard.loc[mask, "Model"]
                        + f" [{imputed_percentage:.2f}% IMPUTED]"
                    )
                    df_leaderboard.loc[mask, "Imputed"] = True
            # The GPU-postfix logic is incorrect for this old data, thus we
            # overwrite the Hardware column here. See paper for details.
            df_leaderboard["Hardware"] = None
            make_leaderboard(df_leaderboard)

    scheduler = BackgroundScheduler()
    # scheduler.add_job(restart_space, "interval", seconds=1800)
    scheduler.start()

    demo.queue(default_concurrency_limit=40).launch()

if __name__ == "__main__":
    main()