xhluca
committed on
refactor and update leaderboard
Browse files- app.py +31 -6
- results.csv +0 -16
- results.jsonl +16 -0
app.py
CHANGED
|
@@ -1,18 +1,43 @@
|
|
| 1 |
"""
|
| 2 |
-
gradio app that reads results.
|
| 3 |
"""
|
| 4 |
-
import gradio as gr
|
| 5 |
|
|
|
|
|
|
|
| 6 |
import pandas as pd
|
| 7 |
|
|
|
|
| 8 |
def load_data():
|
| 9 |
-
# read the
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
# remove Recall and F1 columns
|
| 12 |
df = df.drop(columns=["Recall", "F1"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
# return the dataframe
|
| 14 |
return df
|
| 15 |
|
|
|
|
| 16 |
with gr.Blocks() as demo:
|
| 17 |
gr.Markdown(
|
| 18 |
"""
|
|
@@ -29,6 +54,6 @@ with gr.Blocks() as demo:
|
|
| 29 |
"""
|
| 30 |
)
|
| 31 |
df = load_data()
|
| 32 |
-
table = gr.DataFrame(df, show_label=False)
|
| 33 |
|
| 34 |
-
demo.queue(default_concurrency_limit=40).launch()
|
|
|
|
| 1 |
"""
|
| 2 |
+
Gradio app that reads results.jsonl and displays it in a table; the title is "AgentRewardBench Leaderboard".
|
| 3 |
"""
|
|
|
|
| 4 |
|
| 5 |
+
import gradio as gr
|
| 6 |
+
import json
|
| 7 |
import pandas as pd
|
| 8 |
|
| 9 |
+
|
| 10 |
def load_data(path="./results.jsonl"):
    """Load leaderboard results from a JSON Lines file into a DataFrame.

    Each non-blank line of the file is parsed as one JSON object (one
    leaderboard row). The resulting table is then prepared for display:

    * the "Recall" and "F1" columns are dropped when present;
    * rows are sorted by "Overall" in descending order when that column
      exists;
    * when a "Project URL" column exists, each "Author" cell is turned into
      a markdown link to that URL and the "Project URL" column is removed;
    * when a "Logs URL" column exists, it is replaced by a "Logs" column
      holding a markdown link, or a cross mark for rows without a URL.

    Args:
        path: Location of the JSON Lines file. Defaults to "./results.jsonl",
            resolved against the current working directory.

    Returns:
        A pandas DataFrame with one row per record in the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # read the jsonl file, skipping blank lines
    with open(path, "r", encoding="utf-8") as f:
        records = [json.loads(line) for line in f if line.strip()]

    df = pd.DataFrame(records)

    # remove Recall and F1 columns; errors="ignore" keeps this consistent with
    # the optional handling of the other columns (and with an empty file)
    df = df.drop(columns=["Recall", "F1"], errors="ignore")

    # if a field called "Overall" exists, sort the dataframe by "Overall"
    if "Overall" in df.columns:
        df = df.sort_values(by="Overall", ascending=False)

    # if a field called "Project URL" exists, add a link to the project url
    if "Project URL" in df.columns:
        if "Author" in df.columns:
            # Build the link row by row: a record without a URL keeps its
            # plain author name instead of collapsing to NaN.
            df["Author"] = [
                f"[{author}]({url})"
                if pd.notna(author) and pd.notna(url)
                else author
                for author, url in zip(df["Author"], df["Project URL"])
            ]
        # remove the Project URL column
        df = df.drop(columns=["Project URL"])

    # if a field called "Logs URL" exists, add a link to the logs url
    if "Logs URL" in df.columns:
        # pd.notna covers both an explicit JSON null (None) and the NaN that
        # pandas inserts when a record omits the key entirely.
        df["Logs"] = df["Logs URL"].apply(
            lambda x: f"[🔗]({x})" if pd.notna(x) else "✖️"
        )
        df = df.drop(columns=["Logs URL"])

    # return the dataframe
    return df
|
| 39 |
|
| 40 |
+
|
| 41 |
with gr.Blocks() as demo:
|
| 42 |
gr.Markdown(
|
| 43 |
"""
|
|
|
|
| 54 |
"""
|
| 55 |
)
|
| 56 |
df = load_data()
|
| 57 |
+
table = gr.DataFrame(df, show_label=False, datatype="markdown")
|
| 58 |
|
| 59 |
+
demo.queue(default_concurrency_limit=40).launch()
|
results.csv
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
Judge,Overall,Recall,F1,AB,VWA,WA,Work,Work++,URL,Author
|
| 2 |
-
Rule-based,83.8,55.9,67.1,25.0,85.2,79.0,100.0,83.3,https://arxiv.org/abs/2504.08942,Lù et al.
|
| 3 |
-
WebJudge,73.7,-,-,66.7,69.8,72.6,92.3,75.0,https://arxiv.org/pdf/2504.01382,Xue et al.
|
| 4 |
-
World-State-Model-7B,71.2,72.2,71.7,53.8,64.4,70.1,93.3,86.4,https://arxiv.org/abs/2508.04700,Sun et al.
|
| 5 |
-
AER-C (GPT-4o),67.7,71.9,69.7,83.3,56.0,68.8,100.0,66.7,https://arxiv.org/abs/2504.08942,Lù et al.
|
| 6 |
-
AER-V (GPT-4o),67.6,71.5,69.5,83.3,61.2,67.6,96.4,59.3,https://arxiv.org/abs/2504.08942,Lù et al.
|
| 7 |
-
NNetNav (Llama-3.3 70B),52.5,82.4,64.1,20.8,54.5,54.3,77.3,43.2,https://arxiv.org/abs/2504.08942,Lù et al.
|
| 8 |
-
Claude 3.7 S. (A),68.8,81.6,74.7,87.5,61.0,69.3,85.0,66.7,https://arxiv.org/abs/2504.08942,Lù et al.
|
| 9 |
-
GPT-4o (A),69.8,83.1,75.9,77.8,63.0,70.2,94.6,63.0,https://arxiv.org/abs/2504.08942,Lù et al.
|
| 10 |
-
GPT-4o Mini (A),61.5,86.1,71.7,80.0,57.9,63.5,84.2,49.4,https://arxiv.org/abs/2504.08942,Lù et al.
|
| 11 |
-
Llama 3.3 (A),67.7,79.0,72.9,75.0,59.6,68.2,94.3,62.7,https://arxiv.org/abs/2504.08942,Lù et al.
|
| 12 |
-
Qwen2.5-VL (A),64.3,89.8,75.0,72.7,59.3,63.6,87.2,60.3,https://arxiv.org/abs/2504.08942,Lù et al.
|
| 13 |
-
Claude 3.7 S. (S),69.4,76.3,72.7,71.4,64.8,69.3,85.3,66.7,https://arxiv.org/abs/2504.08942,Lù et al.
|
| 14 |
-
GPT-4o (S),68.1,80.3,73.7,77.8,60.7,69.9,93.8,59.6,https://arxiv.org/abs/2504.08942,Lù et al.
|
| 15 |
-
GPT-4o Mini (S),64.5,78.3,70.8,80.0,57.4,66.9,90.3,54.8,https://arxiv.org/abs/2504.08942,Lù et al.
|
| 16 |
-
Qwen2.5-VL (S),64.5,86.1,73.7,70.0,58.5,62.9,93.8,64.4,https://arxiv.org/abs/2504.08942,Lù et al.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results.jsonl
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"Judge": "Rule-based", "Overall": 83.8, "Recall": 55.9, "F1": 67.1, "AB": 25.0, "VWA": 85.2, "WA": 79.0, "Work": 100.0, "Work++": 83.3, "Author": "Benchmark authors", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
|
| 2 |
+
{"Judge": "World-State-Model-7B", "Overall": 71.2, "Recall": 72.2, "F1": 71.7, "AB": 53.8, "VWA": 64.4, "WA": 70.1, "Work": 93.3, "Work++": 86.4, "Author": "Sun et al.", "Project URL": "https://arxiv.org/abs/2508.04700", "Logs URL": "https://huggingface.co/datasets/Zery/WSM-7B-AgentRewardBench/tree/main"}
|
| 3 |
+
{"Judge": "AER-C (GPT-4o)", "Overall": 67.7, "Recall": 71.9, "F1": 69.7, "AB": 83.3, "VWA": 56.0, "WA": 68.8, "Work": 100.0, "Work++": 66.7, "Author": "Pan et al.", "Project URL": "https://arxiv.org/abs/2404.06474", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
|
| 4 |
+
{"Judge": "AER-V (GPT-4o)", "Overall": 67.6, "Recall": 71.5, "F1": 69.5, "AB": 83.3, "VWA": 61.2, "WA": 67.6, "Work": 96.4, "Work++": 59.3, "Author": "Pan et al.", "Project URL": "https://arxiv.org/abs/2404.06474", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
|
| 5 |
+
{"Judge": "NNetNav (Llama-3.3 70B)", "Overall": 52.5, "Recall": 82.4, "F1": 64.1, "AB": 20.8, "VWA": 54.5, "WA": 54.3, "Work": 77.3, "Work++": 43.2, "Author": "Murty et al.", "Project URL": "https://arxiv.org/abs/2410.02907", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
|
| 6 |
+
{"Judge": "Claude 3.7 S. (A)", "Overall": 68.8, "Recall": 81.6, "F1": 74.7, "AB": 87.5, "VWA": 61.0, "WA": 69.3, "Work": 85.0, "Work++": 66.7, "Author": "L\u00f9 et al.", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
|
| 7 |
+
{"Judge": "GPT-4o (A)", "Overall": 69.8, "Recall": 83.1, "F1": 75.9, "AB": 77.8, "VWA": 63.0, "WA": 70.2, "Work": 94.6, "Work++": 63.0, "Author": "L\u00f9 et al.", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
|
| 8 |
+
{"Judge": "GPT-4o Mini (A)", "Overall": 61.5, "Recall": 86.1, "F1": 71.7, "AB": 80.0, "VWA": 57.9, "WA": 63.5, "Work": 84.2, "Work++": 49.4, "Author": "L\u00f9 et al.", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
|
| 9 |
+
{"Judge": "Llama 3.3 (A)", "Overall": 67.7, "Recall": 79.0, "F1": 72.9, "AB": 75.0, "VWA": 59.6, "WA": 68.2, "Work": 94.3, "Work++": 62.7, "Author": "L\u00f9 et al.", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
|
| 10 |
+
{"Judge": "Qwen2.5-VL (A)", "Overall": 64.3, "Recall": 89.8, "F1": 75.0, "AB": 72.7, "VWA": 59.3, "WA": 63.6, "Work": 87.2, "Work++": 60.3, "Author": "L\u00f9 et al.", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
|
| 11 |
+
{"Judge": "Claude 3.7 S. (S)", "Overall": 69.4, "Recall": 76.3, "F1": 72.7, "AB": 71.4, "VWA": 64.8, "WA": 69.3, "Work": 85.3, "Work++": 66.7, "Author": "L\u00f9 et al.", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
|
| 12 |
+
{"Judge": "GPT-4o (S)", "Overall": 68.1, "Recall": 80.3, "F1": 73.7, "AB": 77.8, "VWA": 60.7, "WA": 69.9, "Work": 93.8, "Work++": 59.6, "Author": "L\u00f9 et al.", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
|
| 13 |
+
{"Judge": "GPT-4o Mini (S)", "Overall": 64.5, "Recall": 78.3, "F1": 70.8, "AB": 80.0, "VWA": 57.4, "WA": 66.9, "Work": 90.3, "Work++": 54.8, "Author": "L\u00f9 et al.", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
|
| 14 |
+
{"Judge": "Qwen2.5-VL (S)", "Overall": 64.5, "Recall": 86.1, "F1": 73.7, "AB": 70.0, "VWA": 58.5, "WA": 62.9, "Work": 93.8, "Work++": 64.4, "Author": "L\u00f9 et al.", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"}
|
| 15 |
+
{"Judge": "WebJudge-7B", "Overall": 75.7, "Recall": 58.0, "F1": 65.6, "AB": 80.0, "VWA": 66.7, "WA": 77.5, "Work": 100.0, "Work++": 70.0, "Author": "Xue et al.", "Project URL": "https://arxiv.org/pdf/2504.01382", "Logs URL": "https://github.com/OSU-NLP-Group/Online-Mind2Web/tree/main/data/evaluation_results/agent_reward_bench_evaluation_results"}
|
| 16 |
+
{"Judge": "WebJudge (o4-mini)", "Overall": 82.0, "Recall": 47.8, "F1": 60.4, "AB": 100.0, "VWA": 74.5, "WA": 81.2, "Work": 100.0, "Work++": 90.0, "Author": "Xue et al.", "Project URL": "https://arxiv.org/abs/2504.01382", "Logs URL": "https://github.com/OSU-NLP-Group/Online-Mind2Web/tree/main/data/evaluation_results/agent_reward_bench_evaluation_results"}
|