xhluca committed on
Commit
c6bd9a8
·
unverified ·
1 Parent(s): e56979c

refactor and update leaderboard

Browse files
Files changed (3) hide show
  1. app.py +31 -6
  2. results.csv +0 -16
  3. results.jsonl +16 -0
app.py CHANGED
@@ -1,18 +1,43 @@
1
  """
2
- gradio app that reads results.csv and display it in a table, title is "AgentRewardBench Leaderboard"
3
  """
4
- import gradio as gr
5
 
 
 
6
  import pandas as pd
7
 
 
8
  def load_data():
9
- # read the csv file
10
- df = pd.read_csv("./results.csv")
 
 
 
 
 
 
11
  # remove Recall and F1 columns
12
  df = df.drop(columns=["Recall", "F1"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  # return the dataframe
14
  return df
15
 
 
16
  with gr.Blocks() as demo:
17
  gr.Markdown(
18
  """
@@ -29,6 +54,6 @@ with gr.Blocks() as demo:
29
  """
30
  )
31
  df = load_data()
32
- table = gr.DataFrame(df, show_label=False)
33
 
34
- demo.queue(default_concurrency_limit=40).launch()
 
1
  """
2
+ gradio app that reads results.jsonl and displays it in a table, titled "AgentRewardBench Leaderboard"
3
  """
 
4
 
5
+ import gradio as gr
6
+ import json
7
  import pandas as pd
8
 
9
+
10
  def load_data():
11
+ # read the jsonl file
12
+ data = []
13
+ with open("./results.jsonl", "r", encoding="utf-8") as f:
14
+ for line in f:
15
+ if line.strip():
16
+ data.append(json.loads(line))
17
+
18
+ df = pd.DataFrame(data)
19
  # remove Recall and F1 columns
20
  df = df.drop(columns=["Recall", "F1"])
21
+
22
+ # if a field called "Overall" exists, sort the dataframe by "Overall"
23
+ if "Overall" in df.columns:
24
+ df = df.sort_values(by="Overall", ascending=False)
25
+
26
+ # if a field called "Project URL" exists, add a link to the project url
27
+ if "Project URL" in df.columns:
28
+ df["Author"] = "[" + df["Author"] + "](" + df["Project URL"] + ")"
29
+ # remove the Project URL column
30
+ df = df.drop(columns=["Project URL"])
31
+
32
+ # if a field called "Logs URL" exists, add a link to the logs url
33
+ if "Logs URL" in df.columns:
34
+ df["Logs"] = df["Logs URL"].apply(lambda x: f"[🔗]({x})" if x is not None else "✖️")
35
+ df = df.drop(columns=["Logs URL"])
36
+
37
  # return the dataframe
38
  return df
39
 
40
+
41
  with gr.Blocks() as demo:
42
  gr.Markdown(
43
  """
 
54
  """
55
  )
56
  df = load_data()
57
+ table = gr.DataFrame(df, show_label=False, datatype="markdown")
58
 
59
+ demo.queue(default_concurrency_limit=40).launch()
results.csv DELETED
@@ -1,16 +0,0 @@
1
- Judge,Overall,Recall,F1,AB,VWA,WA,Work,Work++,URL,Author
2
- Rule-based,83.8,55.9,67.1,25.0,85.2,79.0,100.0,83.3,https://arxiv.org/abs/2504.08942,Lù et al.
3
- WebJudge,73.7,-,-,66.7,69.8,72.6,92.3,75.0,https://arxiv.org/pdf/2504.01382,Xue et al.
4
- World-State-Model-7B,71.2,72.2,71.7,53.8,64.4,70.1,93.3,86.4,https://arxiv.org/abs/2508.04700,Sun et al.
5
- AER-C (GPT-4o),67.7,71.9,69.7,83.3,56.0,68.8,100.0,66.7,https://arxiv.org/abs/2504.08942,Lù et al.
6
- AER-V (GPT-4o),67.6,71.5,69.5,83.3,61.2,67.6,96.4,59.3,https://arxiv.org/abs/2504.08942,Lù et al.
7
- NNetNav (Llama-3.3 70B),52.5,82.4,64.1,20.8,54.5,54.3,77.3,43.2,https://arxiv.org/abs/2504.08942,Lù et al.
8
- Claude 3.7 S. (A),68.8,81.6,74.7,87.5,61.0,69.3,85.0,66.7,https://arxiv.org/abs/2504.08942,Lù et al.
9
- GPT-4o (A),69.8,83.1,75.9,77.8,63.0,70.2,94.6,63.0,https://arxiv.org/abs/2504.08942,Lù et al.
10
- GPT-4o Mini (A),61.5,86.1,71.7,80.0,57.9,63.5,84.2,49.4,https://arxiv.org/abs/2504.08942,Lù et al.
11
- Llama 3.3 (A),67.7,79.0,72.9,75.0,59.6,68.2,94.3,62.7,https://arxiv.org/abs/2504.08942,Lù et al.
12
- Qwen2.5-VL (A),64.3,89.8,75.0,72.7,59.3,63.6,87.2,60.3,https://arxiv.org/abs/2504.08942,Lù et al.
13
- Claude 3.7 S. (S),69.4,76.3,72.7,71.4,64.8,69.3,85.3,66.7,https://arxiv.org/abs/2504.08942,Lù et al.
14
- GPT-4o (S),68.1,80.3,73.7,77.8,60.7,69.9,93.8,59.6,https://arxiv.org/abs/2504.08942,Lù et al.
15
- GPT-4o Mini (S),64.5,78.3,70.8,80.0,57.4,66.9,90.3,54.8,https://arxiv.org/abs/2504.08942,Lù et al.
16
- Qwen2.5-VL (S),64.5,86.1,73.7,70.0,58.5,62.9,93.8,64.4,https://arxiv.org/abs/2504.08942,Lù et al.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results.jsonl ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"Judge": "Rule-based", "Overall": 83.8, "Recall": 55.9, "F1": 67.1, "AB": 25.0, "VWA": 85.2, "WA": 79.0, "Work": 100.0, "Work++": 83.3, "Author": "Benchmark authors", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
2
+ {"Judge": "World-State-Model-7B", "Overall": 71.2, "Recall": 72.2, "F1": 71.7, "AB": 53.8, "VWA": 64.4, "WA": 70.1, "Work": 93.3, "Work++": 86.4, "Author": "Sun et al.", "Project URL": "https://arxiv.org/abs/2508.04700", "Logs URL": "https://huggingface.co/datasets/Zery/WSM-7B-AgentRewardBench/tree/main"}
3
+ {"Judge": "AER-C (GPT-4o)", "Overall": 67.7, "Recall": 71.9, "F1": 69.7, "AB": 83.3, "VWA": 56.0, "WA": 68.8, "Work": 100.0, "Work++": 66.7, "Author": "Pan et al.", "Project URL": "https://arxiv.org/abs/2404.06474", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
4
+ {"Judge": "AER-V (GPT-4o)", "Overall": 67.6, "Recall": 71.5, "F1": 69.5, "AB": 83.3, "VWA": 61.2, "WA": 67.6, "Work": 96.4, "Work++": 59.3, "Author": "Pan et al.", "Project URL": "https://arxiv.org/abs/2404.06474", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
5
+ {"Judge": "NNetNav (Llama-3.3 70B)", "Overall": 52.5, "Recall": 82.4, "F1": 64.1, "AB": 20.8, "VWA": 54.5, "WA": 54.3, "Work": 77.3, "Work++": 43.2, "Author": "Murty et al.", "Project URL": "https://arxiv.org/abs/2410.02907", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
6
+ {"Judge": "Claude 3.7 S. (A)", "Overall": 68.8, "Recall": 81.6, "F1": 74.7, "AB": 87.5, "VWA": 61.0, "WA": 69.3, "Work": 85.0, "Work++": 66.7, "Author": "L\u00f9 et al.", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
7
+ {"Judge": "GPT-4o (A)", "Overall": 69.8, "Recall": 83.1, "F1": 75.9, "AB": 77.8, "VWA": 63.0, "WA": 70.2, "Work": 94.6, "Work++": 63.0, "Author": "L\u00f9 et al.", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
8
+ {"Judge": "GPT-4o Mini (A)", "Overall": 61.5, "Recall": 86.1, "F1": 71.7, "AB": 80.0, "VWA": 57.9, "WA": 63.5, "Work": 84.2, "Work++": 49.4, "Author": "L\u00f9 et al.", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
9
+ {"Judge": "Llama 3.3 (A)", "Overall": 67.7, "Recall": 79.0, "F1": 72.9, "AB": 75.0, "VWA": 59.6, "WA": 68.2, "Work": 94.3, "Work++": 62.7, "Author": "L\u00f9 et al.", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
10
+ {"Judge": "Qwen2.5-VL (A)", "Overall": 64.3, "Recall": 89.8, "F1": 75.0, "AB": 72.7, "VWA": 59.3, "WA": 63.6, "Work": 87.2, "Work++": 60.3, "Author": "L\u00f9 et al.", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
11
+ {"Judge": "Claude 3.7 S. (S)", "Overall": 69.4, "Recall": 76.3, "F1": 72.7, "AB": 71.4, "VWA": 64.8, "WA": 69.3, "Work": 85.3, "Work++": 66.7, "Author": "L\u00f9 et al.", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
12
+ {"Judge": "GPT-4o (S)", "Overall": 68.1, "Recall": 80.3, "F1": 73.7, "AB": 77.8, "VWA": 60.7, "WA": 69.9, "Work": 93.8, "Work++": 59.6, "Author": "L\u00f9 et al.", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
13
+ {"Judge": "GPT-4o Mini (S)", "Overall": 64.5, "Recall": 78.3, "F1": 70.8, "AB": 80.0, "VWA": 57.4, "WA": 66.9, "Work": 90.3, "Work++": 54.8, "Author": "L\u00f9 et al.", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
14
+ {"Judge": "Qwen2.5-VL (S)", "Overall": 64.5, "Recall": 86.1, "F1": 73.7, "AB": 70.0, "VWA": 58.5, "WA": 62.9, "Work": 93.8, "Work++": 64.4, "Author": "L\u00f9 et al.", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
15
+ {"Judge": "WebJudge-7B", "Overall": 75.7, "Recall": 58.0, "F1": 65.6, "AB": 80.0, "VWA": 66.7, "WA": 77.5, "Work": 100.0, "Work++": 70.0, "Author": "Xue et al.", "Project URL": "https://arxiv.org/pdf/2504.01382", "Logs URL": "https://github.com/OSU-NLP-Group/Online-Mind2Web/tree/main/data/evaluation_results/agent_reward_bench_evaluation_results"}
16
+ {"Judge": "WebJudge (o4-mini)", "Overall": 82.0, "Recall": 47.8, "F1": 60.4, "AB": 100.0, "VWA": 74.5, "WA": 81.2, "Work": 100.0, "Work++": 90.0, "Author": "Xue et al.", "Project URL": "https://arxiv.org/abs/2504.01382", "Logs URL": "https://github.com/OSU-NLP-Group/Online-Mind2Web/tree/main/data/evaluation_results/agent_reward_bench_evaluation_results"}