shyuli committed on
Commit 542377d · 1 Parent(s): c3f10b2

version v0.1

Files changed (2)
  1. app.py +37 -0
  2. src/about.py +3 -3
app.py CHANGED
@@ -125,13 +125,42 @@ except Exception as e:
     print(f"Could not setup eval results path: {e}")


+def _debug_print_dataframe(name: str, dataframe: pd.DataFrame) -> None:
+    if dataframe is None:
+        print(f"[debug] {name}: DataFrame is None")
+        return
+    print(f"[debug] {name}: shape={dataframe.shape}, columns={list(dataframe.columns)}")
+    if not dataframe.empty:
+        preview = dataframe.head().to_dict(orient="records")
+        print(f"[debug] {name}: head={preview}")
+    else:
+        print(f"[debug] {name}: DataFrame is empty")
+
+
+def _debug_list_dir(label: str, path: str, limit: int = 10) -> None:
+    try:
+        entries = os.listdir(path)
+        print(f"[debug] {label}: path={path}, count={len(entries)}, preview={entries[:limit]}")
+    except FileNotFoundError:
+        print(f"[debug] {label}: path={path} not found")
+    except Exception as exc:
+        print(f"[debug] {label}: path={path} error={exc}")
+
+
+_debug_list_dir("EVAL_RESULTS", EVAL_RESULTS_PATH)
+_debug_list_dir("EVAL_QUEUE", EVAL_REQUESTS_PATH)
+
 LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+_debug_print_dataframe("LEADERBOARD", LEADERBOARD_DF)

 (
     finished_eval_queue_df,
     running_eval_queue_df,
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+_debug_print_dataframe("EVAL_QUEUE_FINISHED", finished_eval_queue_df)
+_debug_print_dataframe("EVAL_QUEUE_RUNNING", running_eval_queue_df)
+_debug_print_dataframe("EVAL_QUEUE_PENDING", pending_eval_queue_df)


 def init_leaderboard(dataframe):
@@ -161,19 +190,26 @@ def create_demo():
         gr.HTML(TITLE)

         with gr.Tabs(elem_classes="tab-buttons") as tabs:
+            print("[debug] Rendering leaderboard tab start")
             with gr.TabItem("🏅 SearchAgent Benchmark", elem_id="llm-benchmark-tab-table", id=0):
                 leaderboard = init_leaderboard(LEADERBOARD_DF)
                 gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+            print("[debug] Rendering leaderboard tab done")

+            print("[debug] Rendering about tab start")
             with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
                 gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+            print("[debug] Rendering about tab done")

+            print("[debug] Rendering submit tab start")
             with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
                 with gr.Column():
                     with gr.Row():
                         gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+            print("[debug] Rendering submit tab done")

         with gr.Row():
+            print("[debug] Rendering citation start")
             with gr.Accordion("📙 Citation", open=False):
                 gr.Textbox(
                     value=CITATION_BUTTON_TEXT,
@@ -182,6 +218,7 @@ def create_demo():
                     elem_id="citation-button",
                     show_copy_button=True,
                 )
+                print("[debug] Rendering citation done")

     return demo

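For reviewers who want to see what the new `[debug]` output looks like without launching the Space, here is a minimal, self-contained sketch. The helper body is copied from the hunk above; the toy DataFrame, its column names, and its values are invented purely for illustration.

```python
# Illustrative only: exercises the _debug_print_dataframe helper added in this
# commit against a made-up two-row DataFrame (columns and values are hypothetical).
import pandas as pd


def _debug_print_dataframe(name: str, dataframe: pd.DataFrame) -> None:
    if dataframe is None:
        print(f"[debug] {name}: DataFrame is None")
        return
    print(f"[debug] {name}: shape={dataframe.shape}, columns={list(dataframe.columns)}")
    if not dataframe.empty:
        preview = dataframe.head().to_dict(orient="records")
        print(f"[debug] {name}: head={preview}")
    else:
        print(f"[debug] {name}: DataFrame is empty")


toy = pd.DataFrame({"model": ["agent-a", "agent-b"], "NQ": [41.2, 38.7]})
_debug_print_dataframe("LEADERBOARD", toy)
# [debug] LEADERBOARD: shape=(2, 2), columns=['model', 'NQ']
# [debug] LEADERBOARD: head=[{'model': 'agent-a', 'NQ': 41.2}, {'model': 'agent-b', 'NQ': 38.7}]

_debug_print_dataframe("EVAL_QUEUE_PENDING", pd.DataFrame())
# [debug] EVAL_QUEUE_PENDING: shape=(0, 0), columns=[]
# [debug] EVAL_QUEUE_PENDING: DataFrame is empty
```

Because the `_debug_list_dir` and `_debug_print_dataframe` calls in the diff sit at module level, the same kind of output should appear in the Space's startup logs each time app.py is imported.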
src/about.py CHANGED
@@ -42,12 +42,12 @@ This leaderboard evaluates the performance of **search-augmented question answer

 We evaluate on a comprehensive set of benchmarks that test different aspects of search-augmented QA:

-### General QA (Set A)
+### General QA
 - **NQ**: Natural Questions - QA based on real Google search queries from Wikipedia
 - **TriviaQA**: Trivia questions requiring document-based answer extraction
 - **PopQA**: Popular culture QA testing knowledge breadth and parametric vs. non-parametric memory

-### Multi-Hop QA (Set B)
+### Multi-Hop QA
 - **HotpotQA**: Complex QA requiring reasoning across multiple documents with explainable reasoning chains
 - **2wiki**: Multi-hop reasoning based on Wikipedia requiring compositional reasoning
 - **Musique**: Multi-step compositional reasoning QA via single-hop question composition
@@ -68,7 +68,7 @@ LLM_BENCHMARKS_TEXT = f"""
 This leaderboard addresses the challenge of inconsistent experimental settings in search agent evaluation by providing standardized comparisons. Prior works vary significantly in:

 1. **Corpora**: From static Wikipedia snapshots (2018, 2019) to live Internet access
-2. **Test Sets**: Broad evaluation (Set A) vs. focused multi-hop evaluation (Set B)
+2. **Test Sets**: Broad evaluation vs. focused multi-hop evaluation
 3. **Training Regimes**: No training to multi-dataset fine-tuning approaches
 4. **Metrics**: Exact Match, F1, Substring matching, and LLM-as-a-judge evaluations

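Item 4 of the list in the hunk above names the usual open-domain QA scoring conventions. As a point of reference only (this is not code from this repository, and the function names are made up), a SQuAD-style implementation of the three string-matching metrics looks roughly like this:

```python
# Illustrative sketch of Exact Match, Substring matching, and token-level F1
# with the common SQuAD-style answer normalization. Hypothetical helper names;
# not taken from this repo's evaluation code.
import re
import string
from collections import Counter


def normalize(text: str) -> str:
    # Lowercase, drop punctuation, drop English articles, collapse whitespace.
    text = text.lower()
    text = "".join(ch for ch in text if ch not in string.punctuation)
    text = re.sub(r"\b(a|an|the)\b", " ", text)
    return " ".join(text.split())


def exact_match(prediction: str, gold: str) -> float:
    return float(normalize(prediction) == normalize(gold))


def substring_match(prediction: str, gold: str) -> float:
    # Sometimes called "cover exact match": the normalized gold answer appears
    # anywhere inside the normalized prediction.
    return float(normalize(gold) in normalize(prediction))


def f1(prediction: str, gold: str) -> float:
    pred_tokens = normalize(prediction).split()
    gold_tokens = normalize(gold).split()
    common = Counter(pred_tokens) & Counter(gold_tokens)
    overlap = sum(common.values())
    if overlap == 0:
        return 0.0
    precision = overlap / len(pred_tokens)
    recall = overlap / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)


print(exact_match("The Eiffel Tower", "eiffel tower"))                     # 1.0
print(substring_match("It is the Eiffel Tower in Paris", "Eiffel Tower"))  # 1.0
print(round(f1("Eiffel Tower in Paris", "the Eiffel Tower"), 2))           # 0.67
```

LLM-as-a-judge, the fourth convention mentioned, replaces these string comparisons with a model-graded correctness verdict, which is part of why the about text flags metric choice as a source of inconsistency across prior work.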