Spaces:

pelcra
/

llmlagbench

Running

App Files Files Community

llmlagbench / app.py

fzarnecki

Initial commit

f97f900 about 1 month ago

raw

history blame

24.2 kB

	import json
	import math
	from statistics import mean
	from datetime import datetime

	import pandas as pd
	import gradio as gr
	import plotly.graph_objects as go
	from gradio_leaderboard import Leaderboard

	from src.utils import(
	_safe_numeric,
	calculate_cumulative_average,
	create_dataframe,
	get_aggregated_columns,
	load_data,
	load_model_metadata,
	load_raw_model_data,
	build_year_column_mapping,
	)
	from content import LLMLAGBENCH_INTRO, LEADERBOARD_INTRO, MODEL_COMPARISON_INTRO, AUTHORS


	# Citation textbox content and label; both are empty placeholders for now.
	# TODO move to file
	CIT_BTN_TEXT = CIT_BTN_LABEL = ""


	### CONFIGURATION

	# Central settings dict, read throughout the module via cfg.get(...).
	cfg = dict(
	    data_path="data/leaderboard_graph_data.json",
	    metadata_path="data/model_metadata.json",
	    years=[str(year) for year in range(2021, 2026)],
	    months=["%02d" % month for month in range(1, 13)],
	)


	### CALLBACKS

	# updated update_dash to create interactive plot
	def update_dashboard(graph_years, graph_model_filter):
	    """
	    Build the leaderboard table and the interactive trend plot.

	    Args:
	        graph_years: list like ["2024", "2025"] from graph_year_selector;
	            a falsy value falls back to every year in cfg["years"].
	        graph_model_filter: list of models for the line plot; a falsy value
	            keeps every model.

	    Returns:
	        Tuple of (table DataFrame, Plotly Figure). The table ignores both
	        arguments: it always covers all configured years and all models.
	    """
	    # Table always shows all years and all models
	    table_years = cfg.get("years")
	    table_model_filter = None

	    # Default: show all years if none selected for graph
	    if not graph_years:
	        graph_years = cfg.get("years")

	    # Metadata columns in display order. "trend_changepoints" is only used
	    # for the plot markers, so the yearly table leaves it out.
	    metadata_cols = [
	        "Model",
	        "Overall Average",
	        "Provider cutoff",
	        "1st Detected cutoff",
	        "2nd Detected cutoff",
	        "Provider",
	        "Release date",
	        "Model cutoff",
	        "trend_changepoints",
	        "Parameters",
	    ]
	    table_metadata_cols = [c for c in metadata_cols if c != "trend_changepoints"]
	    # Global, chronologically sorted list of monthly columns
	    month_cols_in_order = cfg.get("aggregated_cols_month")

	    yearly_df = df.copy()
	    monthly_df = df.copy()
	    graph_df = df.copy()

	    # TODO if >1 year - aggregate the values to be per year, not per month
	    if len(table_years) > 1:
	        lb_cols = table_metadata_cols + [
	            y for y in cfg.get("aggregated_cols_year") if y in table_years
	        ]
	        yearly_df = yearly_df[lb_cols]

	    # Expand years into their YYYY_MM columns (for table), keeping only the
	    # months that actually exist in the dataframe, in chronological order
	    chosen_months = set()
	    for y in table_years:
	        chosen_months.update(year_to_columns.get(y, []))
	    cols = metadata_cols + table_years + [
	        c for c in month_cols_in_order
	        if c in chosen_months and c in monthly_df.columns
	    ]

	    # Filter by models for table if requested
	    if table_model_filter:
	        yearly_df = yearly_df[yearly_df["Model"].isin(table_model_filter)]
	        monthly_df = monthly_df[monthly_df["Model"].isin(table_model_filter)]

	    # Sort by Overall Average in descending order
	    yearly_df = yearly_df.sort_values(by="Overall Average", ascending=False)
	    monthly_df = monthly_df.sort_values(by="Overall Average", ascending=False)

	    # Reduce columns
	    monthly_df = monthly_df[cols]

	    # Build graph columns based on graph_years (separate dataframe)
	    graph_months = set()
	    for y in graph_years:
	        graph_months.update(year_to_columns.get(y, []))
	    graph_cols = metadata_cols + graph_years + [
	        c for c in month_cols_in_order
	        if c in graph_months and c in graph_df.columns
	    ]
	    graph_df = graph_df[graph_cols]
	    if graph_model_filter:
	        graph_df = graph_df[graph_df["Model"].isin(graph_model_filter)]

	    # Only monthly columns go on the x axis: drop metadata and yearly aggregates
	    non_plot_cols = set(metadata_cols) | set(graph_years)
	    x_labels = [c for c in graph_cols if c not in non_plot_cols]

	    # Tidy (long) dataframe with columns x, y, Model. Passing `columns`
	    # explicitly keeps those columns present when there are no records;
	    # without it the lineplot_df["x"] lookups below raise KeyError.
	    records = [
	        {"x": col, "y": _safe_numeric(row.get(col)), "Model": row["Model"]}
	        for _, row in graph_df.iterrows()
	        for col in x_labels
	    ]
	    lineplot_df = pd.DataFrame(records, columns=["x", "y", "Model"])

	    # Ensure chronological order using the global sorted list
	    plotted_months = set(lineplot_df["x"].unique())
	    chronological_order = [c for c in month_cols_in_order if c in plotted_months]
	    lineplot_df["x"] = pd.Categorical(lineplot_df["x"], categories=chronological_order, ordered=True)
	    lineplot_df = lineplot_df.sort_values(by="x")

	    # Build Plotly figure
	    fig = go.Figure()

	    for _, row in graph_df.iterrows():
	        model = row["Model"]
	        color = GLOBAL_MODEL_COLORS[model]
	        model_data = lineplot_df[lineplot_df["Model"] == model]

	        fig.add_trace(go.Scatter(
	            x=model_data["x"],
	            y=model_data["y"],
	            mode="lines",
	            name=model,
	            line=dict(width=2, color=color),
	            hovertemplate="Model: %{text}<br>x=%{x}<br>y=%{y}",
	            text=[model] * len(model_data),
	            showlegend=True,
	            line_shape='spline'
	        ))

	        # Highlight changepoints (can be multiple)
	        changepoints = row.get("trend_changepoints", [])
	        if not isinstance(changepoints, list):
	            continue
	        for idx, bp in enumerate(changepoints):
	            if bp not in model_data["x"].values:
	                continue
	            cp_row = model_data[model_data["x"] == bp]
	            # The first changepoint gets the large marker; later ones are smaller
	            marker_size = 12 if idx == 0 else 6
	            fig.add_trace(go.Scatter(
	                x=cp_row["x"],
	                y=cp_row["y"],
	                mode="markers",
	                marker=dict(
	                    size=marker_size,
	                    color=color,
	                    symbol="circle-open",
	                    line=dict(width=3, color="white")
	                ),
	                hovertemplate=f"<b>Trend Changepoint</b><br>Model: {model}<br>x=%{{x}}<br>y=%{{y}}",
	                showlegend=False
	            ))

	    # Style the figure & Lock axis order
	    fig.update_layout(
	        xaxis=dict(
	            categoryorder="array",
	            categoryarray=chronological_order,
	            title="Year_Month",
	            color="#e5e7eb",
	            gridcolor="#374151"
	        ),
	        yaxis=dict(title="Average Faithfulness (0-2 scale)", color="#e5e7eb", gridcolor="#374151"),
	        paper_bgcolor="#1f2937",
	        plot_bgcolor="#1f2937",
	        font=dict(family="IBM Plex Sans", size=12, color="#e5e7eb"),
	        hoverlabel=dict(bgcolor="#374151", font=dict(color="#e5e7eb"), bordercolor="#4b5563"),
	        margin=dict(l=40, r=20, t=60, b=40),
	        showlegend=True,
	        yaxis_range=[-0.1, 2.1],
	        xaxis_tickangle=-45
	    )

	    # Yearly table when several years are configured, monthly table otherwise
	    if len(table_years) > 1:
	        return yearly_df, fig
	    return monthly_df, fig


	def create_faithfulness_plot(model_name):
	    """
	    Create a Plotly figure showing faithfulness scores with segments and cumulative refusals.

	    Args:
	        model_name: Name of the model to plot

	    Returns:
	        Plotly Figure object; an empty Figure when model_name is falsy or
	        no raw data is returned for it
	    """
	    raw = load_raw_model_data(cfg.get("data_path"), model_name) if model_name else None
	    if not raw:
	        return go.Figure()

	    # Pull the series out of the raw record, defaulting to empty lists
	    dates = raw.get('dates', [])
	    faithfulness = raw.get('faithfulness', [])
	    cumulative_refusals = raw.get('cumulative_refusals', [])
	    segments = raw.get('segments', [])
	    changepoint_dates = raw.get('changepoint_dates', [])

	    # Upper bound of the refusals axis: the recorded total when present,
	    # else the largest cumulative refusal count (1 for an empty series)
	    fallback_total = max(cumulative_refusals) if cumulative_refusals else 1
	    total_obs = raw.get('total_observations', fallback_total)

	    # Running mean of the faithfulness scores
	    cumulative_avg_faithfulness = []
	    if faithfulness:
	        cumulative_avg_faithfulness = calculate_cumulative_average(faithfulness)

	    fig = go.Figure()

	    # Individual faithfulness observations
	    fig.add_trace(go.Scatter(
	        x=dates,
	        y=faithfulness,
	        mode='markers',
	        name='Faithfulness',
	        marker=dict(size=4, color='steelblue', opacity=0.6),
	        hovertemplate='Date: %{x}<br>Faithfulness: %{y}<extra></extra>',
	        yaxis='y',
	    ))

	    # Cumulative average (green curve)
	    if cumulative_avg_faithfulness:
	        fig.add_trace(go.Scatter(
	            x=dates,
	            y=cumulative_avg_faithfulness,
	            mode='lines',
	            name='Cumulative Average',
	            line=dict(color='#22c55e', width=2.5),
	            hovertemplate='Date: %{x}<br>Cumulative Avg: %{y:.3f}<extra></extra>',
	            yaxis='y',
	        ))

	    # One horizontal red line per segment at that segment's mean
	    for segment in segments:
	        segment_mean = segment['mean_faithfulness']
	        fig.add_trace(go.Scatter(
	            x=[segment['start_date'], segment['end_date']],
	            y=[segment_mean, segment_mean],
	            mode='lines',
	            name=f"Segment Mean ({segment_mean:.2f})",
	            line=dict(color='red', width=2),
	            hovertemplate=f"Mean: {segment_mean:.2f}<br>Refusal Rate: {segment['refusal_rate_percent']:.1f}%<extra></extra>",
	            yaxis='y',
	            showlegend=False,
	        ))

	    # Dashed vertical line at every changepoint
	    changepoint_line = dict(color='darkred', dash='dash', width=1.5)
	    for changepoint in changepoint_dates:
	        fig.add_vline(x=changepoint, line=changepoint_line, opacity=0.7)

	    # Cumulative refusals, drawn against the secondary (right) y-axis
	    fig.add_trace(go.Scatter(
	        x=dates,
	        y=cumulative_refusals,
	        mode='lines',
	        name='Cumulative Refusals',
	        line=dict(color='darkorange', width=2),
	        hovertemplate='Date: %{x}<br>Cumulative Refusals: %{y}<extra></extra>',
	        yaxis='y2',
	    ))

	    # Refusal-rate label centred over each segment
	    date_format = '%Y-%m-%d'
	    for segment in segments:
	        segment_start = datetime.strptime(segment['start_date'], date_format)
	        segment_end = datetime.strptime(segment['end_date'], date_format)
	        midpoint = segment_start + (segment_end - segment_start) / 2

	        fig.add_annotation(
	            x=midpoint.strftime(date_format),
	            y=1.85,
	            text=f"{segment['refusal_rate_percent']:.1f}%",
	            showarrow=False,
	            font=dict(size=10, color='#fbbf24', family='IBM Plex Sans'),
	            bgcolor='rgba(55, 65, 81, 0.9)',
	            bordercolor='#fbbf24',
	            borderwidth=1,
	            yref='y',
	        )

	    # Dark theme layout with dual y-axes
	    text_color = '#e5e7eb'
	    grid_color = '#374151'
	    background = '#1f2937'
	    fig.update_layout(
	        title=dict(
	            text=f"{model_name}: Faithfulness with PELT Changepoints",
	            x=0.5,
	            font=dict(color=text_color, size=14, family='IBM Plex Sans'),
	        ),
	        xaxis=dict(title='Date', color=text_color, gridcolor=grid_color, tickangle=-45),
	        yaxis=dict(
	            title='Faithfulness Score',
	            color=text_color,
	            gridcolor=grid_color,
	            range=[-0.05, 2.05],
	            side='left',
	        ),
	        yaxis2=dict(
	            title='Cumulative Refusals',
	            color='#fbbf24',
	            gridcolor=grid_color,
	            range=[0, total_obs],
	            overlaying='y',
	            side='right',
	        ),
	        paper_bgcolor=background,
	        plot_bgcolor=background,
	        font=dict(family='IBM Plex Sans', size=12, color=text_color),
	        hoverlabel=dict(bgcolor=grid_color, font=dict(color=text_color), bordercolor='#4b5563'),
	        margin=dict(l=60, r=60, t=80, b=80),
	        showlegend=False,
	        legend=dict(
	            x=0.02,
	            y=0.98,
	            bgcolor='rgba(55, 65, 81, 0.9)',
	            bordercolor='#4b5563',
	            borderwidth=1,
	        ),
	    )

	    return fig


	def update_model_comparison(split_enabled, model1, model2):
	    """
	    Update model comparison plots based on split checkbox and model selections.

	    Args:
	        split_enabled: Boolean indicating if 2 graphs should be shown
	        model1: First model name
	        model2: Second model name (only used if split_enabled)

	    Returns:
	        Tuple of (plot1, plot2, col_plot_1 update, col_model_2 update,
	        col_plot_2 update); the two column-2 updates are visible only when
	        split_enabled is truthy
	    """
	    # The first plot is always drawn
	    first_plot = create_faithfulness_plot(model1) if model1 else go.Figure()

	    if not split_enabled:
	        # Single-graph mode: blank second plot, second column hidden
	        return (
	            first_plot,
	            go.Figure(),
	            gr.update(visible=True),
	            gr.update(visible=False),
	            gr.update(visible=False),
	        )

	    # Side-by-side mode
	    second_plot = create_faithfulness_plot(model2) if model2 else go.Figure()
	    return (
	        first_plot,
	        second_plot,
	        gr.update(visible=True),
	        gr.update(visible=True),
	        gr.update(visible=True),
	    )


	def initialize_model_comparison():
	    """
	    Initialize model comparison section with random model selections on page load.

	    Returns:
	        Tuple of (model1_value, model2_value, plot1, plot2, col_plot_1_visible, col_model_2_visible, col_plot_2_visible)
	    """
	    import random

	    # Two distinct random models when possible, the only model when there
	    # is just one, otherwise nothing preselected
	    model1 = model2 = None
	    available = len(all_models)
	    if available >= 2:
	        model1, model2 = random.sample(all_models, 2)
	    elif available == 1:
	        model1 = all_models[0]

	    # Only the first plot is rendered here; the second column starts hidden
	    initial_plot = go.Figure()
	    if model1:
	        initial_plot = create_faithfulness_plot(model1)

	    return (
	        gr.update(value=model1),   # model_dropdown_1
	        gr.update(value=model2),   # model_dropdown_2
	        initial_plot,              # comparison_plot_1
	        go.Figure(),               # comparison_plot_2 (left empty)
	        gr.update(visible=True),   # col_plot_1
	        gr.update(visible=False),  # col_model_2
	        gr.update(visible=False),  # col_plot_2
	    )


	def initialize_main_dashboard(graph_year_selector_value):
	    """
	    Initialize main dashboard with random model selections on page load.

	    Args:
	        graph_year_selector_value: Selected years from the graph year selector

	    Returns:
	        Tuple of (graph_model_filter_value, leaderboard, line_plot)
	    """
	    import random

	    # Up to 5 random models for the initial graph (fewer if fewer exist)
	    sample_size = min(5, len(all_models))
	    random_graph_models = random.sample(all_models, sample_size) if sample_size > 0 else []

	    # Render table and plot for that selection
	    table, figure = update_dashboard(graph_year_selector_value, random_graph_models)

	    # First element preselects the sampled models in graph_model_filter
	    return gr.update(value=random_graph_models), table, figure


	### HELPER FUNCTIONS

	def generate_distinct_colors(n):
	    """
	    Generate n distinct colors using HSL color space.

	    Hues are spread evenly around the color wheel; saturation cycles
	    through 70/80/90 % and lightness alternates between 55/65 % so that
	    neighbouring hues are easier to tell apart.

	    Args:
	        n: Number of distinct colors needed

	    Returns:
	        List of n hex color strings like "#dc3b3b" (empty when n <= 0)
	    """
	    colors = []
	    for i in range(n):
	        hue = (i * 360 / n) % 360
	        saturation = 70 + (i % 3) * 10  # Vary saturation slightly for more distinction
	        lightness = 55 + (i % 2) * 10  # Vary lightness slightly

	        r, g, b = _hsl_to_rgb(hue / 360, saturation / 100, lightness / 100)

	        # Scale each 0..1 channel to 0..255 (truncating) and format as hex
	        colors.append(f"#{int(r * 255):02x}{int(g * 255):02x}{int(b * 255):02x}")

	    return colors


	def _hsl_to_rgb(h, s, lightness):
	    """Convert HSL components (each in 0..1) to an (r, g, b) tuple in 0..1."""
	    if s == 0:
	        # No saturation: a pure grey at the given lightness
	        return lightness, lightness, lightness

	    if lightness < 0.5:
	        q = lightness * (1 + s)
	    else:
	        q = lightness + s - lightness * s
	    p = 2 * lightness - q
	    return (
	        _hue_to_rgb(p, q, h + 1/3),
	        _hue_to_rgb(p, q, h),
	        _hue_to_rgb(p, q, h - 1/3),
	    )


	def _hue_to_rgb(p, q, t):
	    """Return one RGB channel (0..1) for hue offset t, given HSL bounds p and q."""
	    # Wrap the offset back into the 0..1 range
	    if t < 0:
	        t += 1
	    if t > 1:
	        t -= 1
	    if t < 1/6:
	        return p + (q - p) * 6 * t
	    if t < 1/2:
	        return q
	    if t < 2/3:
	        return p + (q - p) * (2/3 - t) * 6
	    return p


	### DATA PREP
	# Runs once at import time; the statement order below matters.

	# Raw leaderboard data and per-model metadata
	data = load_data(cfg.get("data_path"))
	model_metadata = load_model_metadata(cfg.get("metadata_path"))

	# Per-year column lookup, queried by update_dashboard via year_to_columns.get(year, [])
	year_to_columns = build_year_column_mapping(cfg.get("years"), cfg.get("months"))

	# Main DataFrame (new format doesn't need models_map or metrics).
	# Built before the aggregated column lists are stored in cfg.
	df = create_dataframe(cfg, data, model_metadata=model_metadata)

	# Aggregated column lists, kept both as module names and as cfg entries
	aggregated_cols_year, aggregated_cols_month = get_aggregated_columns(cfg.get("years"), year_to_columns)
	cfg.update(
	    aggregated_cols_year=aggregated_cols_year,
	    aggregated_cols_month=aggregated_cols_month,
	)

	# One color per model, assigned once so every plot uses the same mapping.
	# Model names are sorted so the assignment is stable between runs.
	all_models = sorted(df["Model"].unique().tolist())
	colors = generate_distinct_colors(len(all_models))
	GLOBAL_MODEL_COLORS = dict(zip(all_models, colors))


	### BUILD UI
	# Gradio Base theme with green primary/secondary hues, large corner radius
	# and small text; passed to gr.Blocks(theme=...) below.
	theme = gr.themes.Base(
	primary_hue="green",
	secondary_hue="green",
	radius_size="lg",
	text_size="sm",
	)

	# Custom CSS for scrollable dropdown and table styling
	# (passed to gr.Blocks(css=...); the "scrollable-dropdown" class is attached
	# to the model dropdowns through elem_classes)
	custom_css = """
	/* Limit the height of selected items in multiselect dropdown */
	.scrollable-dropdown .wrap-inner {
	max-height: 100px !important;
	overflow-y: auto !important;
	}
	/* Alternative selector for the selected items container */
	.scrollable-dropdown div[data-testid="block-label-inner"] ~ div {
	max-height: 100px !important;
	overflow-y: auto !important;
	}
	/* Style the leaderboard table background */
	.gradio-container .gr-table-wrap,
	.gradio-container .gr-dataframe,
	.gradio-leaderboard {
	background-color: #fafafa !important;
	}
	.gradio-container table {
	background-color: #fafafa !important;
	}
	/* Header row - gray background */
	.gradio-container table thead tr,
	.gradio-container table thead th {
	background-color: #f3f4f6 !important;
	}
	/* First column (td:first-child) - gray background for all rows */
	.gradio-container table tbody tr td:first-child {
	background-color: #f3f4f6 !important;
	}
	/* Odd rows - very light background (excluding first column) */
	.gradio-container table tbody tr:nth-child(odd) td {
	background-color: #fafafa !important;
	}
	/* Even rows - white background (excluding first column) */
	.gradio-container table tbody tr:nth-child(even) td {
	background-color: white !important;
	}
	/* Keep first column gray for both odd and even rows */
	.gradio-container table tbody tr:nth-child(odd) td:first-child,
	.gradio-container table tbody tr:nth-child(even) td:first-child {
	background-color: #f3f4f6 !important;
	}
	/* Hover effect for all rows */
	.gradio-container table tbody tr:hover td {
	background-color: #f3f4f6 !important;
	}
	/* Keep first column darker gray on hover */
	.gradio-container table tbody tr:hover td:first-child {
	background-color: #e5e7eb !important;
	}
	"""

	# JavaScript to force light mode
	# (passed to gr.Blocks(js=...); when the "__theme" query parameter is not
	# "light", the function sets it and navigates to the updated URL)
	js_func = """
	function refresh() {
	const url = new URL(window.location);
	if (url.searchParams.get('__theme') !== 'light') {
	url.searchParams.set('__theme', 'light');
	window.location.href = url.href;
	}
	}
	"""

	with gr.Blocks(theme=theme, css=custom_css, js=js_func) as demo:
	    # --- Page header and introduction ---
	    gr.Markdown(
	        """
	<div style='text-align: center;'>
	<h1>📶 LLMLagBench - When did your LLM go stale?</h1>
	</div>
	"""
	    )
	    gr.Markdown(AUTHORS)
	    gr.Markdown("<hr>")
	    gr.Markdown(LLMLAGBENCH_INTRO)
	    gr.Markdown("<hr>")

	    # --- Trend graph controls: years on the left, models on the right ---
	    with gr.Row():
	        with gr.Column(scale=1):
	            graph_year_selector = gr.CheckboxGroup(
	                choices=cfg.get("years"),
	                value=list(cfg.get("years")),  # every configured year starts selected
	                label="Select Years for Graph",
	            )
	        with gr.Column(scale=1):
	            graph_model_filter = gr.Dropdown(
	                choices=df["Model"].unique().tolist(),
	                multiselect=True,
	                filterable=True,
	                value=None,  # Will be set randomly on page load
	                label="Select Models for Graph",
	                elem_classes="scrollable-dropdown",
	            )

	    gr.Markdown("## Model Comparison with Trend Changepoints")
	    line_plot = gr.Plot(label="Model Trends")
	    gr.Markdown('<hr>')
	    gr.Markdown('<br>')

	    # --- Leaderboard table ---
	    gr.Markdown(LEADERBOARD_INTRO)

	    leaderboard = Leaderboard(
	        value=df,
	        search_columns=["Model"],
	        interactive=False,
	    )

	    # Either graph control re-renders both the leaderboard and the plot
	    dashboard_event = dict(
	        fn=update_dashboard,
	        inputs=[graph_year_selector, graph_model_filter],
	        outputs=[leaderboard, line_plot],
	    )
	    graph_year_selector.change(**dashboard_event)
	    graph_model_filter.change(**dashboard_event)

	    # Initialize main dashboard on load with random model selections
	    demo.load(
	        fn=initialize_main_dashboard,
	        inputs=[graph_year_selector],
	        outputs=[graph_model_filter, leaderboard, line_plot],
	    )

	    gr.Markdown('<hr>')
	    gr.Markdown('<br>')
	    gr.Markdown("## Model Comparison: Faithfulness to Ideal Answer with PELT Changepoints")
	    gr.Markdown(MODEL_COMPARISON_INTRO)

	    # --- Model comparison section ---
	    # NOTE(review): the checkbox starts checked while col_model_2 and
	    # col_plot_2 start hidden and initialize_model_comparison keeps them
	    # hidden - confirm which initial state is intended.
	    with gr.Row():
	        split_checkbox = gr.Checkbox(
	            label="Split into 2 segments",
	            value=True,
	            info="Enable to compare two models side by side",
	        )

	    with gr.Row():
	        with gr.Column(scale=1):
	            model_dropdown_1 = gr.Dropdown(
	                choices=all_models,
	                value=None,  # Will be set randomly on page load
	                label="Select Model 1",
	                filterable=True,
	                elem_classes="scrollable-dropdown",
	            )
	        with gr.Column(scale=1, visible=False) as col_model_2:
	            model_dropdown_2 = gr.Dropdown(
	                choices=all_models,
	                value=None,  # Will be set randomly on page load
	                label="Select Model 2",
	                filterable=True,
	                elem_classes="scrollable-dropdown",
	            )

	    with gr.Row():
	        with gr.Column(scale=1) as col_plot_1:
	            comparison_plot_1 = gr.Plot(label="Model Faithfulness Analysis")
	        with gr.Column(scale=1, visible=False) as col_plot_2:
	            comparison_plot_2 = gr.Plot(label="Model Faithfulness Analysis")

	    # The checkbox and both dropdowns all trigger the same refresh
	    comparison_outputs = [comparison_plot_1, comparison_plot_2, col_plot_1, col_model_2, col_plot_2]
	    for trigger in (split_checkbox, model_dropdown_1, model_dropdown_2):
	        trigger.change(
	            fn=update_model_comparison,
	            inputs=[split_checkbox, model_dropdown_1, model_dropdown_2],
	            outputs=list(comparison_outputs),
	        )

	    # Initialize model comparison on load with random model selections
	    demo.load(
	        fn=initialize_model_comparison,
	        inputs=None,
	        outputs=[model_dropdown_1, model_dropdown_2, *comparison_outputs],
	    )

	    # --- Citation ---
	    gr.Markdown('<hr>')
	    gr.Markdown('<br>')
	    with gr.Row():
	        with gr.Accordion("📙 Citation", open=False):
	            citation_button = gr.Textbox(
	                value=CIT_BTN_TEXT,
	                label=CIT_BTN_LABEL,
	                lines=20,
	                elem_id="citation-button",
	                show_copy_button=True,
	            )

	# Start the app only when this file is run as a script
	if __name__ == "__main__":
	    demo.launch()