from __future__ import annotations

import zipfile
from dataclasses import dataclass
from pathlib import Path

import gradio as gr
import pandas as pd
import website_texts
from apscheduler.schedulers.background import BackgroundScheduler
from constants import Constants, model_type_emoji
from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
from website_texts import (
    ABOUT_TEXT,
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    INTRODUCTION_TEXT,
    TITLE,
    VERSION_HISTORY_BUTTON_TEXT,
)

def get_model_family(model_name: str) -> str:
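    """Map a raw method name to its model family via known name prefixes."""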
    prefixes_mapping = {
        Constants.reference: ["AutoGluon"],
        Constants.neural_network: ["REALMLP", "TabM", "FASTAI", "MNCA", "NN_TORCH"],
        Constants.tree: ["GBM", "CAT", "EBM", "XGB", "XT", "RF"],
        Constants.foundational: ["TABDPT", "TABICL", "TABPFN"],
        Constants.baseline: ["KNN", "LR"],
    }
    for method_type, prefixes in prefixes_mapping.items():
        for prefix in prefixes:
            if prefix.lower() in model_name.lower():
                return method_type
    return Constants.other

def rename_map(model_name: str) -> str:
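    """Rewrite internal model prefixes (e.g., "GBM", "CAT") to display names."""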
    rename_map = {
        "TABM": "TabM",
        "REALMLP": "RealMLP",
        "GBM": "LightGBM",
        "CAT": "CatBoost",
        "XGB": "XGBoost",
        "XT": "ExtraTrees",
        "RF": "RandomForest",
        "MNCA": "ModernNCA",
        "NN_TORCH": "TorchMLP",
        "FASTAI": "FastaiMLP",
        "TABPFNV2": "TabPFNv2",
        "EBM": "EBM",
        "TABDPT": "TabDPT",
        "TABICL": "TabICL",
        "KNN": "KNN",
        "LR": "Linear",
    }
    for prefix in rename_map:
        if prefix in model_name:
            return model_name.replace(prefix, rename_map[prefix])
    return model_name

def load_data(filename: str, data_source: str = "data") -> pd.DataFrame:
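    """Load a leaderboard CSV and prepare it for display.

    Adds model family/type columns, an Elo 95% CI string, derived score columns,
    imputation markers, and hardware info, then selects, rounds, sorts, and
    renames the columns shown on the website.
    """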
    df_leaderboard = pd.read_csv(Path(__file__).parent / data_source / filename)

    # Add Model Family Information
    df_leaderboard["Type"] = df_leaderboard.loc[:, "method"].apply(
        lambda s: model_type_emoji[get_model_family(s)]
    )
    df_leaderboard["TypeName"] = df_leaderboard.loc[:, "method"].apply(
        lambda s: get_model_family(s)
    )
    df_leaderboard["method"] = df_leaderboard["method"].apply(rename_map)

    # elo,elo+,elo-,mrr
    df_leaderboard["Elo 95% CI"] = (
        "+"
        + df_leaderboard["elo+"].round(0).astype(int).astype(str)
        + "/-"
        + df_leaderboard["elo-"].round(0).astype(int).astype(str)
    )

    # select only the columns we want to display
    df_leaderboard["normalized-score"] = 1 - df_leaderboard["normalized-error"]
    df_leaderboard["hmr"] = 1 / df_leaderboard["mrr"]
    df_leaderboard["improvability"] = 100 * df_leaderboard["champ_delta"]

    # Imputed logic
    if "imputed" in df_leaderboard.columns:
        df_leaderboard["imputed"] = (100 * df_leaderboard["imputed"]).round(2)
        df_leaderboard["imputed_bool"] = False

        # Filter methods that are fully imputed.
        df_leaderboard = df_leaderboard[~(df_leaderboard["imputed"] == 100)]

        # Add imputed column and add name postfix
        imputed_mask = df_leaderboard["imputed"] != 0
        df_leaderboard.loc[imputed_mask, "imputed_bool"] = True
        df_leaderboard.loc[imputed_mask, "method"] = df_leaderboard.loc[
            imputed_mask, ["method", "imputed"]
        ].apply(lambda row: row["method"] + f" [{row['imputed']:.2f}% IMPUTED]", axis=1)
    else:
        df_leaderboard["imputed_bool"] = None
        df_leaderboard["imputed"] = None

    # Resolve GPU postfix
    gpu_postfix = "_GPU"
    df_leaderboard["Hardware"] = df_leaderboard["method"].apply(
        lambda x: "CPU" if gpu_postfix not in x else "GPU"
    )
    df_leaderboard["method"] = df_leaderboard["method"].str.replace(gpu_postfix, "")

    df_leaderboard = df_leaderboard.loc[
        :,
        [
            "Type",
            "TypeName",
            "method",
            "elo",
            "Elo 95% CI",
            "normalized-score",
            "rank",
            "hmr",
            "improvability",
            "median_time_train_s_per_1K",
            "median_time_infer_s_per_1K",
            "imputed",
            "imputed_bool",
            "Hardware",
        ],
    ]

    # round for better display
    df_leaderboard[["elo", "Elo 95% CI"]] = df_leaderboard[["elo", "Elo 95% CI"]].round(
        0
    )
    df_leaderboard[["median_time_train_s_per_1K", "rank", "hmr"]] = df_leaderboard[
        ["median_time_train_s_per_1K", "rank", "hmr"]
    ].round(2)
    df_leaderboard[
        ["normalized-score", "median_time_infer_s_per_1K", "improvability"]
    ] = df_leaderboard[
        ["normalized-score", "median_time_infer_s_per_1K", "improvability"]
    ].round(3)

    df_leaderboard = df_leaderboard.sort_values(by="elo", ascending=False)
    df_leaderboard = df_leaderboard.reset_index(drop=True)
    df_leaderboard = df_leaderboard.reset_index(names="#")
    # rename some columns
    return df_leaderboard.rename(
        columns={
            "median_time_train_s_per_1K": "Median Train Time (s/1K) [⬇️]",
            "median_time_infer_s_per_1K": "Median Predict Time (s/1K) [⬇️]",
            "method": "Model",
            "elo": "Elo [⬆️]",
            "rank": "Rank [⬇️]",
            "normalized-score": "Score [⬆️]",
            "hmr": "Harmonic Rank [⬇️]",
            "improvability": "Improvability (%) [⬇️]",
            "imputed": "Imputed (%) [⬇️]",
            "imputed_bool": "Imputed",
        }
    )

@dataclass
class LBContainer:
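    """Bundle a leaderboard's display name, data file, blurb, and overview image.

    The leaderboard DataFrame is loaded from `file_name` on construction.
    """
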
    name: str
    file_name: str
    blurb: str
    overview_image_name: str | None
    df_leaderboard: pd.DataFrame | None = None

    def __post_init__(self):
        self.df_leaderboard = load_data(self.file_name)

def make_overview_image(
    overview_image_name: str | None, data_source: str = "data"
) -> None:
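    """Unzip the overview plot for a leaderboard and display it as a Gradio image."""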
    if overview_image_name is None:
        return

    path_to_image = Path(__file__).parent / data_source / overview_image_name
    path_to_image_zip = path_to_image.with_suffix(".png.zip")
    with zipfile.ZipFile(path_to_image_zip, "r") as zipf:
        zipf.extractall(path_to_image.parent)

    gr.Image(
        str(path_to_image), label="Leaderboard Overview", show_label=True, height=550
    )

def make_overview_leaderboard(lbs: list[LBContainer]) -> gr.DataFrame:
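    """Build the cross-leaderboard overview table.

    Each leaderboard contributes one column with its Elo-based rank per model
    (excluding the reference pipeline). Models missing from a leaderboard are
    marked with "--", and the top three ranks per column are highlighted.
    """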
    # Create column per LB
    all_models = {
        m.split("[")[0].strip()
        for lb in lbs
        for m in lb.df_leaderboard[
            ~lb.df_leaderboard["TypeName"].isin(["Reference Pipeline"])
        ]["Model"]
        .unique()
        .tolist()
    }

    full_df = None
    for lb in lbs:
        df = lb.df_leaderboard.copy()
        df = df[~df["TypeName"].isin(["Reference Pipeline"])]
        df[lb.name] = df["Elo [⬆️]"].rank(ascending=False, method="first").astype(int)
        df = df.sort_values(by=lb.name, ascending=True)
        # Adding medal indicators does not work as it turns the column into
        # strings, which then no longer sort correctly.
        # df[lb.name] = df[lb.name].astype(str)
        # df[lb.name] = df[lb.name].replace({
        #     "1": "🥇 1",
        #     "2": "🥈 2",
        #     "3": "🥉 3",
        #     }
        # )
        df = df[["Type", "Model", lb.name]]
        # Remove imputed message.
        df["Model"] = (
            df["Model"].apply(lambda x: x.split("[")[0].strip()).astype("string")
        )

        if full_df is None:
            # TODO: add support in case a model did not run on the full LB.
            assert all_models.difference(set(df["Model"].unique())) == set()
            full_df = df
        else:
            df = df[["Model", lb.name]]
            df_models = set(df["Model"].unique())
            missing_models = all_models.difference(df_models)
            if missing_models:
                missing_models_df = pd.DataFrame(
                    [[mm, "--"] for mm in missing_models],
                    columns=["Model", lb.name],
                )
                df = pd.concat([df, missing_models_df], ignore_index=True)
                df["Model"] = df["Model"].astype("string")

            # Merge
            full_df = full_df.merge(df, how="left", on="Model", validate="1:1")

    medal_colors = ["#998A00", "#808080", "#8C5520"]

    # Highlight function
    def highlight_top3(col):
        styles = [""] * len(col)
        for index_i in range(len(col)):
            if (not isinstance(col.iloc[index_i], str)) and col.iloc[index_i] <= 3:
                styles[index_i] = (
                    f"background-color: {medal_colors[col.iloc[index_i] - 1]};"
                )
        return styles

    styler = full_df.style.apply(highlight_top3, axis=0, subset=[lb.name for lb in lbs])

    return gr.DataFrame(
        styler,
        pinned_columns=2,
        interactive=False,
        show_search="search",
        label="The ranking of all models (with imputation) across various leaderboards.",
    )

def make_leaderboard(df_leaderboard: pd.DataFrame) -> Leaderboard:
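    """Create the interactive leaderboard with type, tuning, and imputation filters."""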
    # -- Add filters
    df_leaderboard["TypeFilter"] = df_leaderboard["TypeName"].apply(
        lambda m: f"{m} {model_type_emoji[m]}"
    )
    df_leaderboard["Only Default"] = df_leaderboard["Model"].str.endswith("(default)")
    df_leaderboard["Only Tuned"] = df_leaderboard["Model"].str.endswith("(tuned)")
    df_leaderboard["Only Tuned + Ensemble"] = df_leaderboard["Model"].str.endswith(
        "(tuned + ensemble)"
    ) | df_leaderboard["Model"].str.endswith("(4h)")

    filter_columns = [
        ColumnFilter("TypeFilter", type="checkboxgroup", label="Model Types"),
        ColumnFilter("Only Default", type="boolean", default=False),
        ColumnFilter("Only Tuned", type="boolean", default=False),
        ColumnFilter("Only Tuned + Ensemble", type="boolean", default=False),
    ]

    # Add a filter for (not) imputed models
    if any(df_leaderboard["Imputed"]):
        df_leaderboard["Imputed"] = df_leaderboard["Imputed"].replace(
            {
                True: "Imputed",
                False: "Not Imputed",
            }
        )
        filter_columns.append(
            ColumnFilter(
                "Imputed",
                type="checkboxgroup",
                label="(Not) Imputed Models",
                info="We impute the performance of models that cannot run on all"
                " datasets due to task or dataset size constraints (e.g., TabPFN,"
                " TabICL), using the performance of a default RandomForest."
                " We add the postfix [X% IMPUTED] to a model's name if any of its"
                " results were imputed, where X% is the percentage of imputed"
                " datasets. In general, imputation hurts the reported performance,"
                " punishing the model for not being able to run on all datasets.",
            )
        )

    return Leaderboard(
        value=df_leaderboard,
        select_columns=SelectColumns(
            default_selection=list(df_leaderboard.columns),
            cant_deselect=["Type", "Model"],
            label="Select Columns to Display:",
        ),
        hide_columns=[
            "TypeName",
            "TypeFilter",
            "RefModel",
            "Only Default",
            "Only Tuned",
            "Only Tuned + Ensemble",
            "Imputed",
        ],
        search_columns=["Model", "TypeName"],
        filter_columns=filter_columns,
        bool_checkboxgroup_label="Custom Views (exclusive, only toggle one at a time):",
        height=800,
    )

def _get_lbs() -> tuple[LBContainer, ...]:
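    """Instantiate all leaderboard variants shown on the website."""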
    ta = LBContainer(
        name="Main",
        file_name="full-imputed/tabarena_leaderboard.csv",
        overview_image_name="full-imputed/tuning-impact-elo.png",
        blurb="Leaderboard for all datasets including all (imputed) models.",
    )
    ta_lite = LBContainer(
        name="Lite",
        file_name="lite/full-imputed/tabarena_leaderboard.csv",
        overview_image_name="lite/full-imputed/tuning-impact-elo.png",
        blurb="Leaderboard for one split (1st fold, 1st repeat) of all datasets including all (imputed) models.",
    )
    ta_clf = LBContainer(
        name="Classification",
        file_name="full-imputed-cls/tabarena_leaderboard.csv",
        overview_image_name="full-imputed-cls/tuning-impact-elo.png",
        blurb="Leaderboard for all 38 classification datasets including all (imputed) models.",
    )
    ta_reg = LBContainer(
        name="Regression",
        file_name="full-imputed-reg/tabarena_leaderboard.csv",
        # FIXME: get overview image without TabICL
        overview_image_name="full-imputed-reg/tuning-impact-elo.png",
        blurb="Leaderboard for all 13 regression datasets including all (imputed) models.",
    )
    ta_tabicl = LBContainer(
        name="TabICL-data",
        file_name="tabicl-imputed/tabarena_leaderboard.csv",
        overview_image_name="tabicl-imputed/tuning-impact-elo.png",
        blurb="Leaderboard for all 36 datasets within the constraints of TabICL including all (imputed) models.",
    )
    ta_tabpfn = LBContainer(
        name="TabPFN-data",
        file_name="tabpfn-imputed/tabarena_leaderboard.csv",
        overview_image_name="tabpfn-imputed/tuning-impact-elo.png",
        blurb="Leaderboard for all 33 datasets within the constraints of TabPFN including all (imputed) models.",
    )
    ta_tabpfn_tabicl = LBContainer(
        name="TabPFN/ICL-data",
        file_name="tabpfn-tabicl/tabarena_leaderboard.csv",
        overview_image_name="tabpfn-tabicl/tuning-impact-elo.png",
        blurb="Leaderboard for all 26 datasets within the constraints of TabPFN and TabICL including all models.",
    )
    return ta, ta_lite, ta_clf, ta_reg, ta_tabicl, ta_tabpfn, ta_tabpfn_tabicl

def main():
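    """Build and launch the Gradio app: intro accordions, the cross-leaderboard
    overview, one tab per leaderboard, and the archived v0.1 leaderboard.
    """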
| css = """ | |
| .markdown-text-box { | |
| padding: 4px; | |
| border-radius: 2px; | |
| } | |
| """ | |
| js_func = """ | |
| function refresh() { | |
| const url = new URL(window.location); | |
| if (url.searchParams.get('__theme') !== 'dark') { | |
| url.searchParams.set('__theme', 'dark'); | |
| window.location.href = url.href; | |
| } | |
| } | |
| """ | |
| demo = gr.Blocks(css=css, js=js_func, title="TabArena") | |
| with demo: | |
| gr.HTML(TITLE) | |
| # -- Introduction | |
| gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text") | |
| with gr.Row(): | |
| with gr.Column(), gr.Accordion("π Datasets", open=False): | |
| gr.Markdown( | |
| website_texts.OVERVIEW_DATASETS, elem_classes="markdown-text-box" | |
| ) | |
| with gr.Column(), gr.Accordion("π€ Models", open=False): | |
| gr.Markdown( | |
| website_texts.OVERVIEW_MODELS, elem_classes="markdown-text-box" | |
| ) | |
| with gr.Row(): | |
| with gr.Column(), gr.Accordion("π Metrics", open=False): | |
| gr.Markdown( | |
| website_texts.OVERVIEW_METRICS, elem_classes="markdown-text-box" | |
| ) | |
| with gr.Column(), gr.Accordion("π Reference Pipeline", open=False): | |
| gr.Markdown( | |
| website_texts.OVERVIEW_REF_PIPE, elem_classes="markdown-text-box" | |
| ) | |
| with gr.Row(), gr.Accordion("π More Details", open=False): | |
| gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text-box") | |
| with gr.Row(), gr.Accordion("π Citation", open=False): | |
| gr.Textbox( | |
| value=CITATION_BUTTON_TEXT, | |
| label=CITATION_BUTTON_LABEL, | |
| lines=7, | |
| elem_id="citation-button", | |
| show_copy_button=True, | |
| ) | |
        # -- Get all LBs we need:
        ta, ta_lite, ta_clf, ta_reg, ta_tabicl, ta_tabpfn, ta_tabpfn_tabicl = _get_lbs()

        # -- LB Overview
        gr.Markdown("## TabArena Overview")
        ordered_lbs = [
            ta,
            ta_clf,
            ta_reg,
            ta_tabicl,
            ta_tabpfn,
            ta_tabpfn_tabicl,
            ta_lite,
        ]
        make_overview_leaderboard(lbs=ordered_lbs)

        gr.Markdown("## TabArena Leaderboards")
        with gr.Tabs(elem_classes="tab-buttons"):
            for lb_id, lb in enumerate(ordered_lbs):
                with gr.TabItem(lb.name, elem_id="llm-benchmark-tab-table", id=lb_id):
                    gr.Markdown(lb.blurb, elem_classes="markdown-text")
                    make_overview_image(lb.overview_image_name)
                    make_leaderboard(lb.df_leaderboard)

        with gr.Row(), gr.Accordion("Version History", open=False):
            gr.Markdown(VERSION_HISTORY_BUTTON_TEXT, elem_classes="markdown-text")

        gr.Markdown("## Old Leaderboards")
        with (
            gr.Tabs(elem_classes="tab-buttons"),
            gr.TabItem("TabArena-v0.1", elem_id="llm-benchmark-tab-table", id=2),
        ):
            df_leaderboard = load_data(
                "tabarena_leaderboard.csv.zip", data_source="old_data/v0_1_0"
            )
            df_leaderboard["Imputed"] = False
            imputed_map = {
                "TabPFNv2": 35.29,
                "TabICL": 29.41,
            }
            for model_name, imputed_percentage in imputed_map.items():
                if imputed_percentage == 100:
                    # Filter methods that are fully imputed.
                    df_leaderboard = df_leaderboard[
                        ~df_leaderboard["Model"].str.startswith(model_name)
                    ]
                else:
                    mask = df_leaderboard["Model"].str.startswith(model_name)
                    df_leaderboard.loc[mask, "Model"] = (
                        df_leaderboard.loc[mask, "Model"]
                        + f" [{imputed_percentage:.2f}% IMPUTED]"
                    )
                    df_leaderboard.loc[mask, "Imputed"] = True
            # The GPU-postfix logic is incorrect for this old data, thus we
            # overwrite the Hardware column here. See paper for details.
            df_leaderboard["Hardware"] = None
            make_leaderboard(df_leaderboard)

    scheduler = BackgroundScheduler()
    # scheduler.add_job(restart_space, "interval", seconds=1800)
    scheduler.start()

    demo.queue(default_concurrency_limit=40).launch()

if __name__ == "__main__":
    main()