SearchAgent_Leaderboard

Running

App Files Files Community

shyuli committed on Sep 29

Commit

542377d

1 Parent(s): c3f10b2

version v0.1

Browse files

Files changed (2) hide show

app.py +37 -0
src/about.py +3 -3

app.py CHANGED Viewed

@@ -125,13 +125,42 @@ except Exception as e:
     print(f"Could not setup eval results path: {e}")
 LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 (
     finished_eval_queue_df,
     running_eval_queue_df,
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 def init_leaderboard(dataframe):
@@ -161,19 +190,26 @@ def create_demo():
         gr.HTML(TITLE)
         with gr.Tabs(elem_classes="tab-buttons") as tabs:
             with gr.TabItem("🏅 SearchAgent Benchmark", elem_id="llm-benchmark-tab-table", id=0):
                 leaderboard = init_leaderboard(LEADERBOARD_DF)
                 gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
             with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
                 gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
             with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
                 with gr.Column():
                     with gr.Row():
                         gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
         with gr.Row():
             with gr.Accordion("📙 Citation", open=False):
                 gr.Textbox(
                     value=CITATION_BUTTON_TEXT,
@@ -182,6 +218,7 @@ def create_demo():
                     elem_id="citation-button",
                     show_copy_button=True,
                 )
     return demo

     print(f"Could not setup eval results path: {e}")
+def _debug_print_dataframe(name: str, dataframe: pd.DataFrame) -> None:  # Debug helper: print a "[debug]"-prefixed summary of `dataframe`, labelled with `name`; returns nothing.
+    if dataframe is None:  # NOTE(review): the parameter is annotated pd.DataFrame, yet None is handled explicitly here.
+        print(f"[debug] {name}: DataFrame is None")
+        return  # Nothing further to inspect without a frame.
+    print(f"[debug] {name}: shape={dataframe.shape}, columns={list(dataframe.columns)}")  # Always report shape and column names, even for an empty frame.
+    if not dataframe.empty:
+        preview = dataframe.head().to_dict(orient="records")  # Leading rows from head(), converted to one dict per row for printing.
+        print(f"[debug] {name}: head={preview}")
+    else:
+        print(f"[debug] {name}: DataFrame is empty")  # Empty frame: say so explicitly instead of printing an empty preview.
+def _debug_list_dir(label: str, path: str, limit: int = 10) -> None:  # Debug helper: print the entry count of `path` and its first `limit` entries, labelled with `label`.
+    try:
+        entries = os.listdir(path)  # Order of entries is whatever os.listdir returns; no sorting is applied here.
+        print(f"[debug] {label}: path={path}, count={len(entries)}, preview={entries[:limit]}")
+    except FileNotFoundError:  # A missing directory is reported, not raised.
+        print(f"[debug] {label}: path={path} not found")
+    except Exception as exc:  # Any other failure is also only printed, so this helper never propagates an Exception to the caller.
+        print(f"[debug] {label}: path={path} error={exc}")
+_debug_list_dir("EVAL_RESULTS", EVAL_RESULTS_PATH)  # Print the results directory listing before the leaderboard frame is built.
+_debug_list_dir("EVAL_QUEUE", EVAL_REQUESTS_PATH)  # Print the requests directory listing before the queue frames are built.
 LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+_debug_print_dataframe("LEADERBOARD", LEADERBOARD_DF)  # Print a summary of the frame later passed to init_leaderboard.
 (
     finished_eval_queue_df,
     running_eval_queue_df,
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)  # The helper returns three frames, unpacked as finished / running / pending.
+_debug_print_dataframe("EVAL_QUEUE_FINISHED", finished_eval_queue_df)  # Print a summary of each of the three queue frames in turn.
+_debug_print_dataframe("EVAL_QUEUE_RUNNING", running_eval_queue_df)
+_debug_print_dataframe("EVAL_QUEUE_PENDING", pending_eval_queue_df)
 def init_leaderboard(dataframe):
         gr.HTML(TITLE)
         with gr.Tabs(elem_classes="tab-buttons") as tabs:
+            print("[debug] Rendering leaderboard tab start")
             with gr.TabItem("🏅 SearchAgent Benchmark", elem_id="llm-benchmark-tab-table", id=0):
                 leaderboard = init_leaderboard(LEADERBOARD_DF)
                 gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+            print("[debug] Rendering leaderboard tab done")
+            print("[debug] Rendering about tab start")
             with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
                 gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+            print("[debug] Rendering about tab done")
+            print("[debug] Rendering submit tab start")
             with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
                 with gr.Column():
                     with gr.Row():
                         gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+            print("[debug] Rendering submit tab done")
         with gr.Row():
+            print("[debug] Rendering citation start")
             with gr.Accordion("📙 Citation", open=False):
                 gr.Textbox(
                     value=CITATION_BUTTON_TEXT,
                     elem_id="citation-button",
                     show_copy_button=True,
                 )
+            print("[debug] Rendering citation done")
     return demo

src/about.py CHANGED Viewed

@@ -42,12 +42,12 @@ This leaderboard evaluates the performance of **search-augmented question answer
 We evaluate on a comprehensive set of benchmarks that test different aspects of search-augmented QA:
-### General QA (Set A)
 - **NQ**: Natural Questions - QA based on real Google search queries from Wikipedia
 - **TriviaQA**: Trivia questions requiring document-based answer extraction
 - **PopQA**: Popular culture QA testing knowledge breadth and parametric vs. non-parametric memory
-### Multi-Hop QA (Set B)
 - **HotpotQA**: Complex QA requiring reasoning across multiple documents with explainable reasoning chains
 - **2wiki**: Multi-hop reasoning based on Wikipedia requiring compositional reasoning
 - **Musique**: Multi-step compositional reasoning QA via single-hop question composition
@@ -68,7 +68,7 @@ LLM_BENCHMARKS_TEXT = f"""
 This leaderboard addresses the challenge of inconsistent experimental settings in search agent evaluation by providing standardized comparisons. Prior works vary significantly in:
 1. **Corpora**: From static Wikipedia snapshots (2018, 2019) to live Internet access
-2. **Test Sets**: Broad evaluation (Set A) vs. focused multi-hop evaluation (Set B)
 3. **Training Regimes**: No training to multi-dataset fine-tuning approaches
 4. **Metrics**: Exact Match, F1, Substring matching, and LLM-as-a-judge evaluations

 We evaluate on a comprehensive set of benchmarks that test different aspects of search-augmented QA:
+### General QA
 - **NQ**: Natural Questions - QA based on real Google search queries from Wikipedia
 - **TriviaQA**: Trivia questions requiring document-based answer extraction
 - **PopQA**: Popular culture QA testing knowledge breadth and parametric vs. non-parametric memory
+### Multi-Hop QA
 - **HotpotQA**: Complex QA requiring reasoning across multiple documents with explainable reasoning chains
 - **2wiki**: Multi-hop reasoning based on Wikipedia requiring compositional reasoning
 - **Musique**: Multi-step compositional reasoning QA via single-hop question composition
 This leaderboard addresses the challenge of inconsistent experimental settings in search agent evaluation by providing standardized comparisons. Prior works vary significantly in:
 1. **Corpora**: From static Wikipedia snapshots (2018, 2019) to live Internet access
+2. **Test Sets**: Broad evaluation vs. focused multi-hop evaluation
 3. **Training Regimes**: No training to multi-dataset fine-tuning approaches
 4. **Metrics**: Exact Match, F1, Substring matching, and LLM-as-a-judge evaluations