Spaces:

SaylorTwift
/

OpenEvalsModelDetails

Runtime error

App Files Files Community

Linker1907 committed on Mar 4

Commit

bcda822

1 Parent(s): 114fe52

add app file

Browse files

Files changed (2) hide show

app.py +197 -0
experiments.json +420 -0

app.py ADDED Viewed

	@@ -0,0 +1,197 @@

+from datasets import load_dataset
+from collections import defaultdict
+import json
+import gradio as gr
+from functools import lru_cache
+# Load models and experiments
+MODELS = [
+    "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
+    "o3-mini-2025-01-31",
+    "meta-llama/Llama-3.3-70B-Instruct",
+    "moonshotai/Moonlight-16B-A3B-Instruct",
+    "gpt-4o",
+    "claude-3-7-sonnet-20250219",
+    "openai/gpt-4.5-preview-2025-02-27"
+]
+with open("experiments.json") as f:
+    experiments = json.load(f)
+@lru_cache
+def load_details_and_results(model, benchmark, experiment_tag):
+    def worker(example):
+        example["predictions"] = example["predictions"]
+        example["gold"] = example["gold"][0]
+        example["metrics"] = example["metrics"]
+        return example
+    repo = f"SaylorTwift/details_{model.replace('/', '__')}_private"
+    subset = experiments[model]["benchmarks"][benchmark]["subset"].replace("|", "_").replace(":", "_")
+    split = experiments[model]["benchmarks"][benchmark]["tags"][experiment_tag].replace("-", "_")
+    details = load_dataset(repo, subset, split=split)
+    results = load_dataset(repo, "results", split=split)
+    results = eval(results[0]["results"])
+    columns_to_keep = ['full_prompt', 'gold', 'metrics', 'predictions']
+    details = details.select_columns(columns_to_keep)
+    details = details.map(worker)
+    return details, results
+# Load all experiment details
+# Built eagerly at import time: experiment_details[model][subset] holds the details
+# dataset returned by load_details_and_results. The inner keys are the raw "subset"
+# strings from experiments.json, not the benchmark names used as keys there.
+experiment_details = defaultdict(dict)
+for model in MODELS:
+    for benchmark, benchmark_details in experiments[model]["benchmarks"].items():
+        subset = benchmark_details["subset"]
+        for experiment_tag in benchmark_details["tags"]:
+            # Only the per-example details are kept; the aggregate results are discarded.
+            details, _ = load_details_and_results(model, benchmark, experiment_tag)
+            # NOTE: the key does not include the tag, so with several tags per benchmark
+            # the last one iterated overwrites the earlier ones.
+            experiment_details[model][subset] = details
+def display_model_comparison(selected_models, benchmark, example_index):
+    if not selected_models:
+        return "Please select at least one model to compare."
+    outputs = []
+    for model in selected_models:
+        try:
+            example = experiment_details[model][benchmark][example_index]
+            outputs.append({
+                'Model': model.split('/')[-1],
+                'Prediction': example['predictions'][0] if example['predictions'] else '',
+                'Prompt': example['full_prompt'],
+                'Metrics': example['metrics'],
+                'Gold': example['gold']
+            })
+        except (KeyError, IndexError):
+            continue
+    if not outputs:
+        return "No results found for the selected combination."
+    # Create HTML output with all models
+    html_output = "<div style='max-width: 800px; margin: 0 auto;'>\n\n"
+    # Show gold answer at the top with distinct styling
+    if outputs:
+        html_output += "<div style='background: #e6f3e6; padding: 20px; border-radius: 10px; margin-bottom: 20px;'>\n"
+        html_output += "<h3 style='margin-top: 0;'>Ground Truth</h3>\n"
+        html_output += "<div style='overflow-x: auto; max-width: 100%;'>\n"
+        html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 0;'><code>{outputs[0]['Gold']}</code></pre>\n"
+        html_output += "</div>\n"
+        html_output += "</div>\n"
+    for output in outputs:
+        html_output += "<div style='background: #f5f5f5; padding: 20px; margin-bottom: 20px; border-radius: 10px;'>\n"
+        html_output += f"<h2 style='margin-top: 0;'>{output['Model']}</h2>\n"
+        # Format metrics as a clean table
+        html_output += "<details open style='margin-bottom: 15px;'>\n"
+        html_output += "<summary><h3 style='display: inline; margin: 0;'>Metrics</h3></summary>\n"
+        metrics = output['Metrics']
+        if isinstance(metrics, str):
+            metrics = eval(metrics)
+        html_output += "<div style='overflow-x: auto;'>\n"
+        html_output += "<table style='width: 100%; margin: 10px 0; border-collapse: collapse;'>\n"
+        for key, value in metrics.items():
+            if isinstance(value, float):
+                value = f"{value:.3f}"
+            html_output += f"<tr><td style='padding: 5px; border-bottom: 1px solid #ddd;'><strong>{key}</strong></td><td style='padding: 5px; border-bottom: 1px solid #ddd;'>{value}</td></tr>\n"
+        html_output += "</table>\n"
+        html_output += "</div>\n"
+        html_output += "</details>\n\n"
+        # Handle prompt formatting with better styling
+        html_output += "<details style='margin-bottom: 15px;'>\n"
+        html_output += "<summary><h3 style='display: inline; margin: 0;'>Prompt</h3></summary>\n"
+        html_output += "<div style='background: #ffffff; padding: 15px; border-radius: 5px; margin-top: 10px;'>\n"
+        prompt_text = output['Prompt']
+        if isinstance(prompt_text, list):
+            for i, msg in enumerate(prompt_text):
+                if isinstance(msg, dict) and 'content' in msg:
+                    role = msg.get('role', 'message').title()
+                    html_output += "<div style='margin-bottom: 10px;'>\n"
+                    html_output += f"<strong>{role}:</strong>\n"
+                    html_output += "<div style='overflow-x: auto;'>\n"
+                    html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0;'><code>{msg['content']}</code></pre>\n"
+                    html_output += "</div>\n"
+                    html_output += "</div>\n"
+                else:
+                    html_output += "<div style='margin-bottom: 10px;'>\n"
+                    html_output += "<div style='overflow-x: auto;'>\n"
+                    html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0;'><code>{json.dumps(msg, indent=2)}</code></pre>\n"
+                    html_output += "</div>\n"
+                    html_output += "</div>\n"
+        else:
+            html_output += "<div style='overflow-x: auto;'>\n"
+            if isinstance(prompt_text, dict) and 'content' in prompt_text:
+                html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0;'><code>{prompt_text['content']}</code></pre>\n"
+            else:
+                html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0;'><code>{prompt_text}</code></pre>\n"
+            html_output += "</div>\n"
+        html_output += "</div>\n"
+        html_output += "</details>\n\n"
+        # Style prediction output - now in a collapsible section
+        html_output += "<details open style='margin-bottom: 15px;'>\n"
+        html_output += "<summary><h3 style='display: inline; margin: 0;'>Prediction</h3>"
+        # Add word count in a muted style
+        word_count = len(output['Prediction'].split())
+        html_output += f"<span style='color: #666; font-size: 0.8em; margin-left: 10px;'>({word_count} words)</span>"
+        html_output += "</summary>\n"
+        html_output += "<div style='background: #ffffff; padding: 15px; border-radius: 5px; margin-top: 10px;'>\n"
+        html_output += "<div style='overflow-x: auto;'>\n"
+        html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 0;'><code>{output['Prediction']}</code></pre>\n"
+        html_output += "</div>\n"
+        html_output += "</div>\n"
+        html_output += "</details>\n"
+        html_output += "</div>\n\n"
+    html_output += "</div>"
+    return html_output
+# Get unique benchmarks
+available_benchmarks = list(set(
+    benchmark
+    for model in MODELS
+    for benchmark in experiment_details[model].keys()
+))
+# Create the Gradio interface
+demo = gr.Interface(
+    fn=display_model_comparison,
+    inputs=[
+        gr.Dropdown(
+            choices=sorted(MODELS),
+            label="Models",
+            multiselect=True,
+            value=MODELS,
+            info="Select models to compare"
+        ),
+        gr.Dropdown(
+            choices=sorted(available_benchmarks),
+            label="Benchmark",
+            value=sorted(available_benchmarks)[0] if available_benchmarks else None,
+            info="Choose the evaluation benchmark"
+        ),
+        gr.Number(
+            label="Example Index",
+            value=0,
+            step=1,
+            info="Navigate through different examples"
+        )
+    ],
+    outputs=gr.HTML(),
+    title="Model Generation Comparison",
+    description="Compare model outputs across different benchmarks and prompts",
+    theme=gr.themes.Soft(),
+    css="button { margin: 0 10px; padding: 5px 15px; }"
+)
+if __name__ == "__main__":
+    demo.launch()

experiments.json ADDED Viewed

	@@ -0,0 +1,420 @@

+{
+    "gpt-4o": {
+        "display_name": "gpt 4o",
+        "provider": "openai",
+        "open": false,
+        "benchmarks": {
+            "math_500": {
+                "subset": "lighteval|math_500|0",
+                "metrics": [
+                    "extractive_match"
+                ],
+                "tags": {
+                    "latest": "2025-02-26T10-14-16.106571"
+                }
+            },
+            "gpqa_diamond": {
+                "subset": "lighteval|gpqa:diamond|0",
+                "metrics": [
+                    "extractive_match"
+                ],
+                "tags": {
+                    "latest": "2025-02-26T10-14-16.106571"
+                }
+            },
+            "aime_24": {
+                "subset": "lighteval|aime24|0",
+                "metrics": [
+                    "extractive_match"
+                ],
+                "tags": {
+                    "latest": "2025-02-26T10-14-16.106571"
+                }
+            },
+            "aime_25": {
+                "subset": "lighteval|aime25|0",
+                "metrics": [
+                    "extractive_match"
+                ],
+                "tags": {
+                    "latest": "2025-02-26T10-14-16.106571"
+                }
+            },
+            "ifeval": {
+                "subset": "extended|ifeval|0",
+                "metrics": [
+                    "prompt_level_strict_acc"
+                ],
+                "tags": {
+                    "latest": "2025-02-26T10-14-16.106571"
+                }
+            }
+        }
+    },
+    "claude-3-7-sonnet-20250219": {
+        "display_name": "Claude 3.7 Sonnet",
+        "provider": "anthropic",
+        "open": false,
+        "benchmarks": {
+            "math_500": {
+                "subset": "lighteval|math_500|0",
+                "metrics": [
+                    "extractive_match"
+                ],
+                "tags": {
+                    "latest": "2025-02-25T14-35-15.137825"
+                }
+            },
+            "gpqa_diamond": {
+                "subset": "lighteval|gpqa:diamond|0",
+                "metrics": [
+                    "extractive_match"
+                ],
+                "tags": {
+                    "latest": "2025-02-25T12-43-49.294245"
+                }
+            },
+            "aime_24": {
+                "subset": "lighteval|aime24|0",
+                "metrics": [
+                    "extractive_match"
+                ],
+                "tags": {
+                    "latest": "2025-02-25T12-37-52.771787"
+                }
+            },
+            "aime_25": {
+                "subset": "lighteval|aime25|0",
+                "metrics": [
+                    "extractive_match"
+                ],
+                "tags": {
+                    "latest": "2025-02-25T12-37-52.771787"
+                }
+            },
+            "ifeval": {
+                "subset": "extended|ifeval|0",
+                "metrics": [
+                    "prompt_level_strict_acc"
+                ],
+                "tags": {
+                    "latest": "2025-02-25T12-24-45.750753"
+                }
+            }
+        }
+    },
+    "o3-mini-2025-01-31": {
+        "display_name": "o3-mini",
+        "provider": "openai",
+        "open": false,
+        "benchmarks": {
+            "math_500": {
+                "subset": "lighteval|math_500|0",
+                "metrics": [
+                    "extractive_match"
+                ],
+                "tags": {
+                    "latest": "2025-02-26T11-37-01.193437"
+                }
+            },
+            "gpqa_diamond": {
+                "subset": "lighteval|gpqa:diamond|0",
+                "metrics": [
+                    "extractive_match"
+                ],
+                "tags": {
+                    "latest": "2025-02-26T11-37-01.193437"
+                }
+            },
+            "aime_24": {
+                "subset": "lighteval|aime24|0",
+                "metrics": [
+                    "extractive_match"
+                ],
+                "tags": {
+                    "latest": "2025-02-26T11-37-01.193437"
+                }
+            },
+            "aime_25": {
+                "subset": "lighteval|aime25|0",
+                "metrics": [
+                    "extractive_match"
+                ],
+                "tags": {
+                    "latest": "2025-02-26T11-37-01.193437"
+                }
+            },
+            "ifeval": {
+                "subset": "extended|ifeval|0",
+                "metrics": [
+                    "prompt_level_strict_acc"
+                ],
+                "tags": {
+                    "latest": "2025-02-26T11-37-01.193437"
+                }
+            }
+        }
+    },
+    "moonshotai/Moonlight-16B-A3B-Instruct": {
+        "display_name": "Moonlight",
+        "provider": "moonshotai",
+        "open": true,
+        "benchmarks": {
+            "math_500": {
+                "subset": "lighteval|math_500|0",
+                "metrics": [
+                    "extractive_match"
+                ],
+                "tags": {
+                    "latest": "2025_02_26T13_32_06.104265"
+                }
+            },
+            "gpqa_diamond": {
+                "subset": "lighteval|gpqa:diamond|0",
+                "metrics": [
+                    "extractive_match"
+                ],
+                "tags": {
+                    "latest": "2025_02_26T13_32_06.104265"
+                }
+            },
+            "aime_24": {
+                "subset": "lighteval|aime24|0",
+                "metrics": [
+                    "extractive_match"
+                ],
+                "tags": {
+                    "latest": "2025_02_26T13_32_06.104265"
+                }
+            },
+            "aime_25": {
+                "subset": "lighteval|aime25|0",
+                "metrics": [
+                    "extractive_match"
+                ],
+                "tags": {
+                    "latest": "2025_02_26T13_32_06.104265"
+                }
+            },
+            "ifeval": {
+                "subset": "extended|ifeval|0",
+                "metrics": [
+                    "prompt_level_strict_acc"
+                ],
+                "tags": {
+                    "latest": "2025_02_26T13_32_06.104265"
+                }
+            }
+        }
+    },
+    "meta-llama/Llama-3.3-70B-Instruct": {
+        "display_name": "Llama 3.3 70B",
+        "provider": "meta",
+        "open": true,
+        "benchmarks": {
+            "math_500": {
+                "subset": "lighteval|math_500|0",
+                "metrics": ["extractive_match"],
+                "tags": {
+                    "latest": "2025-02-26T17-13-13.448521"
+                }
+            },
+            "gpqa_diamond": {
+                "subset": "lighteval|gpqa:diamond|0",
+                "metrics": ["extractive_match"],
+                "tags": {
+                    "latest": "2025-02-26T17-13-13.448521"
+                }
+            },
+            "aime_24": {
+                "subset": "lighteval|aime24|0",
+                "metrics": ["extractive_match"],
+                "tags": {
+                    "latest": "2025-02-26T17-13-13.448521"
+                }
+            },
+            "aime_25": {
+                "subset": "lighteval|aime25|0",
+                "metrics": ["extractive_match"],
+                "tags": {
+                    "latest": "2025-02-26T17-13-13.448521"
+                }
+            },
+            "ifeval": {
+                "subset": "extended|ifeval|0",
+                "metrics": ["prompt_level_strict_acc"],
+                "tags": {
+                    "latest": "2025-02-26T17-13-13.448521"
+                }
+            }
+        }
+    },
+    "deepseek-ai/DeepSeek-R1-Distill-Llama-70B": {
+        "display_name": "DeepSeek Llama 70B",
+        "provider": "deepseek",
+        "open": true,
+        "benchmarks": {
+            "math_500": {
+                "subset": "lighteval|math_500|0",
+                "metrics": ["extractive_match"],
+                "tags": {
+                    "latest": "2025-02-27T11-09-04.037858"
+                }
+            },
+            "gpqa_diamond": {
+                "subset": "lighteval|gpqa:diamond|0",
+                "metrics": ["extractive_match"],
+                "tags": {
+                    "latest": "2025-02-27T11-09-04.037858"
+                }
+            },
+            "aime_24": {
+                "subset": "lighteval|aime24|0",
+                "metrics": ["extractive_match"],
+                "tags": {
+                    "latest": "2025-02-27T11-09-04.037858"
+                }
+            },
+            "aime_25": {
+                "subset": "lighteval|aime25|0",
+                "metrics": ["extractive_match"],
+                "tags": {
+                    "latest": "2025-02-27T11-09-04.037858"
+                }
+            },
+            "ifeval": {
+                "subset": "extended|ifeval|0",
+                "metrics": ["prompt_level_strict_acc"],
+                "tags": {
+                    "latest": "2025-02-27T14-02-02.414381"
+                }
+            }
+        }
+    },
+    "qihoo360/TinyR1-32B-Preview": {
+        "display_name": "TinyR1 32B",
+        "provider": "qihoo360",
+        "open": true,
+        "benchmarks": {
+            "math_500": {
+                "subset": "lighteval|math_500|0",
+                "metrics": ["extractive_match"],
+                "tags": {
+                    "latest": "2025-02-27T13-32-41.564652"
+                }
+            },
+            "gpqa_diamond": {
+                "subset": "lighteval|gpqa:diamond|0",
+                "metrics": ["extractive_match"],
+                "tags": {
+                    "latest": "2025-02-27T13-32-41.564652"
+                }
+            },
+            "aime_24": {
+                "subset": "lighteval|aime24|0",
+                "metrics": ["extractive_match"],
+                "tags": {
+                    "latest": "2025-02-27T13-32-41.564652"
+                }
+            },
+            "aime_25": {
+                "subset": "lighteval|aime25|0",
+                "metrics": ["extractive_match"],
+                "tags": {
+                    "latest": "2025-02-27T13-32-41.564652"
+                }
+            },
+            "ifeval": {
+                "subset": "extended|ifeval|0",
+                "metrics": ["prompt_level_strict_acc"],
+                "tags": {
+                    "latest": "2025-02-27T13-32-41.564652"
+                }
+            }
+        }
+    },
+    "openai/gpt-4.5-preview-2025-02-27": {
+        "display_name": "gpt 4.5",
+        "provider": "openai",
+        "open": false,
+        "benchmarks": {
+            "math_500": {
+                "subset": "lighteval|math_500|0",
+                "metrics": ["extractive_match"],
+                "tags": {
+                    "latest": "2025-03-03T11-17-20.767980"
+                }
+            },
+            "gpqa_diamond": {
+                "subset": "lighteval|gpqa:diamond|0",
+                "metrics": ["extractive_match"],
+                "tags": {
+                    "latest": "2025-03-03T11-35-34.241611"
+                }
+            },
+            "aime_24": {
+                "subset": "lighteval|aime24|0",
+                "metrics": ["extractive_match"],
+                "tags": {
+                    "latest": "2025-03-03T11-15-32.836958"
+                }
+            },
+            "aime_25": {
+                "subset": "lighteval|aime25|0",
+                "metrics": ["extractive_match"],
+                "tags": {
+                    "latest": "2025-03-03T11-15-32.836958"
+                }
+            },
+            "ifeval": {
+                "subset": "extended|ifeval|0",
+                "metrics": ["prompt_level_strict_acc"],
+                "tags": {
+                    "latest": "2025-03-03T11-17-20.767980"
+                }
+            }
+        }
+    },
+    "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B": {
+        "display_name": "DeepSeek Qwen 32B",
+        "provider": "deepseek",
+        "open": true,
+        "benchmarks": {
+            "math_500": {
+                "subset": "lighteval|math_500|0",
+                "metrics": ["extractive_match"],
+                "tags": {
+                    "latest": "2025-03-03T14-51-09.849491"
+                }
+            },
+            "gpqa_diamond": {
+                "subset": "lighteval|gpqa:diamond|0",
+                "metrics": ["extractive_match"],
+                "tags": {
+                    "latest": "2025-03-03T14-51-09.849491"
+                }
+            },
+            "aime_24": {
+                "subset": "lighteval|aime24|0",
+                "metrics": ["extractive_match"],
+                "tags": {
+                    "latest": "2025-03-03T14-51-09.849491"
+                }
+            },
+            "aime_25": {
+                "subset": "lighteval|aime25|0",
+                "metrics": ["extractive_match"],
+                "tags": {
+                    "latest": "2025-03-03T14-51-09.849491"
+                }
+            },
+            "ifeval": {
+                "subset": "extended|ifeval|0",
+                "metrics": ["prompt_level_strict_acc"],
+                "tags": {
+                    "latest": "2025-03-03T15-06-10.838105"
+                }
+            }
+        }
+    }
+}