agent-reward-bench-leaderboard

Running

App Files Community

xhluca committed on Aug 17

Commit

c6bd9a8

unverified ·

1 Parent(s): e56979c

refactor and update leaderboard

Browse files

Files changed (3) hide show

app.py +31 -6
results.csv +0 -16
results.jsonl +16 -0

app.py CHANGED Viewed

@@ -1,18 +1,43 @@
 """
-gradio app that reads results.csv and display it in a table, title is "AgentRewardBench Leaderboard"
 """
-import gradio as gr
 import pandas as pd
 def load_data():
-    # read the csv file
-    df = pd.read_csv("./results.csv")
     # remove Recall and F1 columns
     df = df.drop(columns=["Recall", "F1"])
     # return the dataframe
     return df
 with gr.Blocks() as demo:
     gr.Markdown(
         """
@@ -29,6 +54,6 @@ with gr.Blocks() as demo:
         """
     )
     df = load_data()
-    table = gr.DataFrame(df, show_label=False)
-demo.queue(default_concurrency_limit=40).launch()

 """
+gradio app that reads results.jsonl and displays it in a table, title is "AgentRewardBench Leaderboard"
 """
+import gradio as gr
+import json
 import pandas as pd
 def load_data():
+    # read the jsonl file
+    data = []
+    with open("./results.jsonl", "r", encoding="utf-8") as f:
+        for line in f:
+            if line.strip():
+                data.append(json.loads(line))
+    df = pd.DataFrame(data)
     # remove Recall and F1 columns
     df = df.drop(columns=["Recall", "F1"])
+    # if a field called "Overall" exists, sort the dataframe by "Overall"
+    if "Overall" in df.columns:
+        df = df.sort_values(by="Overall", ascending=False)
+    # if a field called "Project URL" exists, add a link to the project url
+    if "Project URL" in df.columns:
+        df["Author"] = "[" + df["Author"] + "](" + df["Project URL"] + ")"
+        # remove the Project URL column
+        df = df.drop(columns=["Project URL"])
+    # if a field called "Logs URL" exists, add a link to the logs url
+    if "Logs URL" in df.columns:
+        df["Logs"] = df["Logs URL"].apply(lambda x: f"[🔗]({x})" if x is not None else "✖️")
+        df = df.drop(columns=["Logs URL"])
     # return the dataframe
     return df
 with gr.Blocks() as demo:
     gr.Markdown(
         """
         """
     )
     df = load_data()
+    table = gr.DataFrame(df, show_label=False, datatype="markdown")
+demo.queue(default_concurrency_limit=40).launch()

results.csv DELETED Viewed

@@ -1,16 +0,0 @@
-Judge,Overall,Recall,F1,AB,VWA,WA,Work,Work++,URL,Author
-Rule-based,83.8,55.9,67.1,25.0,85.2,79.0,100.0,83.3,https://arxiv.org/abs/2504.08942,Lù et al.
-WebJudge,73.7,-,-,66.7,69.8,72.6,92.3,75.0,https://arxiv.org/pdf/2504.01382,Xue et al.
-World-State-Model-7B,71.2,72.2,71.7,53.8,64.4,70.1,93.3,86.4,https://arxiv.org/abs/2508.04700,Sun et al.
-AER-C (GPT-4o),67.7,71.9,69.7,83.3,56.0,68.8,100.0,66.7,https://arxiv.org/abs/2504.08942,Lù et al.
-AER-V (GPT-4o),67.6,71.5,69.5,83.3,61.2,67.6,96.4,59.3,https://arxiv.org/abs/2504.08942,Lù et al.
-NNetNav (Llama-3.3 70B),52.5,82.4,64.1,20.8,54.5,54.3,77.3,43.2,https://arxiv.org/abs/2504.08942,Lù et al.
-Claude 3.7 S. (A),68.8,81.6,74.7,87.5,61.0,69.3,85.0,66.7,https://arxiv.org/abs/2504.08942,Lù et al.
-GPT-4o (A),69.8,83.1,75.9,77.8,63.0,70.2,94.6,63.0,https://arxiv.org/abs/2504.08942,Lù et al.
-GPT-4o Mini (A),61.5,86.1,71.7,80.0,57.9,63.5,84.2,49.4,https://arxiv.org/abs/2504.08942,Lù et al.
-Llama 3.3 (A),67.7,79.0,72.9,75.0,59.6,68.2,94.3,62.7,https://arxiv.org/abs/2504.08942,Lù et al.
-Qwen2.5-VL (A),64.3,89.8,75.0,72.7,59.3,63.6,87.2,60.3,https://arxiv.org/abs/2504.08942,Lù et al.
-Claude 3.7 S. (S),69.4,76.3,72.7,71.4,64.8,69.3,85.3,66.7,https://arxiv.org/abs/2504.08942,Lù et al.
-GPT-4o (S),68.1,80.3,73.7,77.8,60.7,69.9,93.8,59.6,https://arxiv.org/abs/2504.08942,Lù et al.
-GPT-4o Mini (S),64.5,78.3,70.8,80.0,57.4,66.9,90.3,54.8,https://arxiv.org/abs/2504.08942,Lù et al.
-Qwen2.5-VL (S),64.5,86.1,73.7,70.0,58.5,62.9,93.8,64.4,https://arxiv.org/abs/2504.08942,Lù et al.

results.jsonl ADDED Viewed

@@ -0,0 +1,16 @@

+{"Judge": "Rule-based", "Overall": 83.8, "Recall": 55.9, "F1": 67.1, "AB": 25.0, "VWA": 85.2, "WA": 79.0, "Work": 100.0, "Work++": 83.3, "Author": "Benchmark authors", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
+{"Judge": "World-State-Model-7B", "Overall": 71.2, "Recall": 72.2, "F1": 71.7, "AB": 53.8, "VWA": 64.4, "WA": 70.1, "Work": 93.3, "Work++": 86.4, "Author": "Sun et al.", "Project URL": "https://arxiv.org/abs/2508.04700", "Logs URL": "https://huggingface.co/datasets/Zery/WSM-7B-AgentRewardBench/tree/main"}
+{"Judge": "AER-C (GPT-4o)", "Overall": 67.7, "Recall": 71.9, "F1": 69.7, "AB": 83.3, "VWA": 56.0, "WA": 68.8, "Work": 100.0, "Work++": 66.7, "Author": "Pan et al.", "Project URL": "https://arxiv.org/abs/2404.06474", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
+{"Judge": "AER-V (GPT-4o)", "Overall": 67.6, "Recall": 71.5, "F1": 69.5, "AB": 83.3, "VWA": 61.2, "WA": 67.6, "Work": 96.4, "Work++": 59.3, "Author": "Pan et al.", "Project URL": "https://arxiv.org/abs/2404.06474", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
+{"Judge": "NNetNav (Llama-3.3 70B)", "Overall": 52.5, "Recall": 82.4, "F1": 64.1, "AB": 20.8, "VWA": 54.5, "WA": 54.3, "Work": 77.3, "Work++": 43.2, "Author": "Murty et al.", "Project URL": "https://arxiv.org/abs/2410.02907", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
+{"Judge": "Claude 3.7 S. (A)", "Overall": 68.8, "Recall": 81.6, "F1": 74.7, "AB": 87.5, "VWA": 61.0, "WA": 69.3, "Work": 85.0, "Work++": 66.7, "Author": "L\u00f9 et al.", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
+{"Judge": "GPT-4o (A)", "Overall": 69.8, "Recall": 83.1, "F1": 75.9, "AB": 77.8, "VWA": 63.0, "WA": 70.2, "Work": 94.6, "Work++": 63.0, "Author": "L\u00f9 et al.", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
+{"Judge": "GPT-4o Mini (A)", "Overall": 61.5, "Recall": 86.1, "F1": 71.7, "AB": 80.0, "VWA": 57.9, "WA": 63.5, "Work": 84.2, "Work++": 49.4, "Author": "L\u00f9 et al.", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
+{"Judge": "Llama 3.3 (A)", "Overall": 67.7, "Recall": 79.0, "F1": 72.9, "AB": 75.0, "VWA": 59.6, "WA": 68.2, "Work": 94.3, "Work++": 62.7, "Author": "L\u00f9 et al.", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
+{"Judge": "Qwen2.5-VL (A)", "Overall": 64.3, "Recall": 89.8, "F1": 75.0, "AB": 72.7, "VWA": 59.3, "WA": 63.6, "Work": 87.2, "Work++": 60.3, "Author": "L\u00f9 et al.", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
+{"Judge": "Claude 3.7 S. (S)", "Overall": 69.4, "Recall": 76.3, "F1": 72.7, "AB": 71.4, "VWA": 64.8, "WA": 69.3, "Work": 85.3, "Work++": 66.7, "Author": "L\u00f9 et al.", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
+{"Judge": "GPT-4o (S)", "Overall": 68.1, "Recall": 80.3, "F1": 73.7, "AB": 77.8, "VWA": 60.7, "WA": 69.9, "Work": 93.8, "Work++": 59.6, "Author": "L\u00f9 et al.", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
+{"Judge": "GPT-4o Mini (S)", "Overall": 64.5, "Recall": 78.3, "F1": 70.8, "AB": 80.0, "VWA": 57.4, "WA": 66.9, "Work": 90.3, "Work++": 54.8, "Author": "L\u00f9 et al.", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
+{"Judge": "Qwen2.5-VL (S)", "Overall": 64.5, "Recall": 86.1, "F1": 73.7, "AB": 70.0, "VWA": 58.5, "WA": 62.9, "Work": 93.8, "Work++": 64.4, "Author": "L\u00f9 et al.", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
+{"Judge": "WebJudge-7B", "Overall": 75.7, "Recall": 58.0, "F1": 65.6, "AB": 80.0, "VWA": 66.7, "WA": 77.5, "Work": 100.0, "Work++": 70.0, "Author": "Xue et al.", "Project URL": "https://arxiv.org/pdf/2504.01382", "Logs URL": "https://github.com/OSU-NLP-Group/Online-Mind2Web/tree/main/data/evaluation_results/agent_reward_bench_evaluation_results"}
+{"Judge": "WebJudge (o4-mini)", "Overall": 82.0, "Recall": 47.8, "F1": 60.4, "AB": 100.0, "VWA": 74.5, "WA": 81.2, "Work": 100.0, "Work++": 90.0, "Author": "Xue et al.", "Project URL": "https://arxiv.org/abs/2504.01382", "Logs URL": "https://github.com/OSU-NLP-Group/Online-Mind2Web/tree/main/data/evaluation_results/agent_reward_bench_evaluation_results"}