#!/usr/bin/env python3
"""
Data Processing module for MTEB Turkish Leaderboard - HF Spaces Version

Simplified version for loading and processing CSV data.
"""

import html
import os

import pandas as pd
from matplotlib.colors import LinearSegmentedColormap
from pandas.io.formats.style import Styler

# CSV file path
CSV_FILE_PATH = "leaderboard_data.csv"

# Column layout of the leaderboard table, shared by the empty and populated
# DataFrames so both always expose the same schema.
_LEADERBOARD_COLUMNS = [
    "Rank",
    "Model",
    "Mean (Task)",
    "Mean (TaskType)",
    "Classification",
    "Clustering",
    "Pair Classification",
    "Retrieval",
    "STS",
    "Correlation",
    "Parameters",
    "Embed Dim",
    "Max Sequence Length",
    "Vocab Size",
]

# Leaderboard columns that hold scores and therefore get colour-coded.
_SCORE_COLUMNS = [
    "Mean (Task)",
    "Mean (TaskType)",
    "Classification",
    "Clustering",
    "Pair Classification",
    "Retrieval",
    "STS",
    "Correlation",
]

# Names of the CSV columns whose spelling differs from the leaderboard's.
_CSV_EMBEDDING_DIM_COL = 'Embedding Dim'
_CSV_MAX_SEQ_COL = 'Max Seq Length'
_CSV_PAIR_CLASSIFICATION_COL = 'Pair Classification'

# Inline CSS applied to every colour-coded score cell (background is added
# per cell).
_SCORE_CELL_STYLE = 'text-align: center; font-weight: bold; color: #333;'


def load_leaderboard_from_csv() -> pd.DataFrame:
    """Load leaderboard data from the CSV file at ``CSV_FILE_PATH``.

    The rows are converted with ``csv_to_leaderboard_format``, sorted by
    "Mean (Task)" in descending order and re-ranked starting at 1.

    Returns:
        The ranked leaderboard DataFrame. If the file is missing or any
        error occurs while reading/converting it, a message is printed and
        an empty DataFrame with the leaderboard columns is returned instead.
    """
    try:
        if not os.path.exists(CSV_FILE_PATH):
            print(f"❌ CSV file not found: {CSV_FILE_PATH}")
            return create_empty_leaderboard_dataframe()

        df = pd.read_csv(CSV_FILE_PATH)
        print(f"✅ Loaded {len(df)} records from {CSV_FILE_PATH}")

        # Convert to leaderboard format
        leaderboard_df = csv_to_leaderboard_format(df)

        # Sort by Mean (Task) score and add rankings
        leaderboard_df = leaderboard_df.sort_values(
            "Mean (Task)", ascending=False
        ).reset_index(drop=True)
        leaderboard_df["Rank"] = range(1, len(leaderboard_df) + 1)

        return leaderboard_df
    except Exception as e:
        # Deliberate best-effort boundary: the caller always gets a
        # DataFrame with the expected columns, never an exception.
        print(f"❌ Error loading CSV: {e}")
        return create_empty_leaderboard_dataframe()


def create_empty_leaderboard_dataframe() -> pd.DataFrame:
    """Create an empty DataFrame with proper leaderboard column structure."""
    return pd.DataFrame(columns=list(_LEADERBOARD_COLUMNS))


def _rounded(row: pd.Series, column: str, digits: int = 2) -> float:
    """Return ``row[column]`` converted to float and rounded to *digits*."""
    return round(float(row[column]), digits)


def _build_leaderboard_row(position: int, row: pd.Series) -> dict:
    """Translate one CSV row into a leaderboard row dict.

    Args:
        position: 1-based position of the row, used as the provisional rank.
        row: One row of the raw CSV DataFrame.
    """
    # Prepare model name for display
    model_name_clean = html.escape(row['Model'])

    # Create clickable HuggingFace link for model name
    hf_link = f"https://huggingface.co/{model_name_clean}"
    clickable_model = (
        f'<a href="{hf_link}" target="_blank">{model_name_clean}</a>'
    )

    correlation = row['Correlation']
    embedding_dim = row[_CSV_EMBEDDING_DIM_COL]
    max_seq = row[_CSV_MAX_SEQ_COL]
    has_vocab_size = 'Vocab Size' in row and not pd.isna(row['Vocab Size'])

    return {
        "Rank": position,  # Initial ranking, will be recalculated
        "Model": clickable_model,
        "Mean (Task)": _rounded(row, 'Mean (Task)'),
        "Mean (TaskType)": _rounded(row, 'Mean (TaskType)'),
        "Classification": _rounded(row, 'Classification'),
        "Clustering": _rounded(row, 'Clustering'),
        "Pair Classification": _rounded(row, _CSV_PAIR_CLASSIFICATION_COL),
        "Retrieval": _rounded(row, 'Retrieval'),
        "STS": _rounded(row, 'STS'),
        # Missing correlation is shown as the literal text "N/A".
        "Correlation": (
            "N/A" if pd.isna(correlation) else round(float(correlation), 3)
        ),
        "Parameters": row['Number of Parameters'],
        "Embed Dim": (
            0 if pd.isna(embedding_dim) else int(float(embedding_dim))
        ),
        "Max Sequence Length": (
            "N/A"
            if pd.isna(max_seq) or max_seq == "Unknown"
            else int(float(max_seq))
        ),
        "Vocab Size": int(float(row['Vocab Size'])) if has_vocab_size else 0,
    }


def csv_to_leaderboard_format(df: pd.DataFrame) -> pd.DataFrame:
    """Convert CSV data to leaderboard format.

    Args:
        df: Raw CSV DataFrame. Each row must provide 'Model', the score
            columns, 'Number of Parameters', 'Embedding Dim' and
            'Max Seq Length'; 'Vocab Size' is optional.

    Returns:
        A DataFrame with the leaderboard columns, one row per input row.
        An input without rows yields an empty DataFrame that still carries
        the leaderboard columns.
    """
    data = [
        _build_leaderboard_row(position, row)
        for position, (_, row) in enumerate(df.iterrows(), start=1)
    ]
    # Passing the column list keeps the schema intact even when `data` is
    # empty, so callers can sort by "Mean (Task)" unconditionally.
    return pd.DataFrame(data, columns=list(_LEADERBOARD_COLUMNS))


def create_excel_like_cmap():
    """Create Excel-like colormap for score visualization.

    Returns:
        A 256-step ``LinearSegmentedColormap`` going red -> yellow -> green.
    """
    colors = [
        (0.9, 0.1, 0.2),  # Red
        (1.0, 1.0, 0.0),  # Yellow
        (0 / 255, 176 / 255, 80 / 255),  # Excel-style Green
    ]
    return LinearSegmentedColormap.from_list("excel_like", colors, N=256)


def rgb_to_hex(rgb_tuple) -> str:
    """Convert RGB tuple to hex color.

    Args:
        rgb_tuple: Sequence whose first three items are the red, green and
            blue channels as floats; any further items (e.g. alpha) are
            ignored. Each channel is multiplied by 255 and truncated.

    Returns:
        The colour as a ``#rrggbb`` string.
    """
    r, g, b = (int(channel * 255) for channel in rgb_tuple[:3])
    return f"#{r:02x}{g:02x}{b:02x}"


def _score_to_hex(value, min_val: float, max_val: float, colormap) -> str:
    """Map *value* onto *colormap* and return the resulting hex colour.

    The value is normalised to the 0-1 range spanned by *min_val* and
    *max_val*; when the range is degenerate the midpoint 0.5 is used.

    Raises:
        ValueError, TypeError: If *value* cannot be converted to float.
    """
    if max_val > min_val:
        normalized = (float(value) - min_val) / (max_val - min_val)
    else:
        normalized = 0.5
    return rgb_to_hex(colormap(normalized))


def create_colored_cell(
    value: float, min_val: float, max_val: float, colormap
) -> str:
    """Create colored cell HTML for score visualization.

    Args:
        value: The score to display.
        min_val: Lower bound of the colour scale.
        max_val: Upper bound of the colour scale.
        colormap: Callable mapping a 0-1 float to an RGB(A) tuple.

    Returns:
        A ``<div>`` whose background colour encodes the score and whose
        ``data-sort`` attribute carries the raw value for numeric sorting.
        Missing ("N/A" / NaN) or non-numeric values are returned as plain
        ``str(value)`` without markup.
    """
    if pd.isna(value) or value == "N/A":
        return str(value)

    try:
        color_hex = _score_to_hex(value, min_val, max_val, colormap)
    except (ValueError, TypeError):
        return str(value)

    # Colored cell HTML with data-sort attribute for proper numeric sorting
    return (
        f'<div style="background-color: {color_hex}; {_SCORE_CELL_STYLE}" '
        f'data-sort="{value}">{value}</div>'
    )


def _format_score(val) -> str:
    """Format a score with two decimals, passing non-numeric text through.

    Score columns may hold placeholder strings such as "N/A"; a plain
    ``'{:.2f}'`` format spec would raise ``ValueError`` on those.
    """
    try:
        return f"{float(val):.2f}"
    except (TypeError, ValueError):
        return str(val)


def create_styled_leaderboard_dataframe(df: pd.DataFrame) -> Styler:
    """Create styled leaderboard dataframe with color coding using pandas Styler.

    Each score column present in *df* gets a per-cell background colour
    taken from the Excel-like colormap, scaled between that column's own
    minimum and maximum numeric value, and is displayed with two decimals.

    Args:
        df: Leaderboard DataFrame, e.g. from ``load_leaderboard_from_csv``.

    Returns:
        A pandas Styler wrapping *df*. For an empty *df* the unstyled
        ``df.style`` is returned.
    """
    if df.empty:
        return df.style

    colormap = create_excel_like_cmap()

    # Score columns to colorize (only those actually present)
    present_score_columns = [col for col in _SCORE_COLUMNS if col in df.columns]

    # Calculate min/max for each score column for normalization; columns
    # without a single numeric value get no colour range at all.
    color_ranges = {}
    for col in present_score_columns:
        numeric_values = pd.to_numeric(df[col], errors='coerce')
        if not numeric_values.isna().all():
            color_ranges[col] = {
                'min': numeric_values.min(),
                'max': numeric_values.max(),
            }

    def apply_color_gradient(val, col_name):
        """Return the CSS for one cell, or '' when it should stay unstyled."""
        if col_name not in color_ranges:
            return ''
        if pd.isna(val) or val == "N/A":
            return ''
        bounds = color_ranges[col_name]
        try:
            color_hex = _score_to_hex(val, bounds['min'], bounds['max'], colormap)
        except (ValueError, TypeError):
            return ''
        return f'background-color: {color_hex}; {_SCORE_CELL_STYLE}'

    # Apply styling to score columns using map (applymap is deprecated)
    styler = df.style
    for col in present_score_columns:
        # `c=col` binds the current column; a bare closure would see only
        # the last loop value.
        styler = styler.map(
            lambda val, c=col: apply_color_gradient(val, c), subset=[col]
        )

    # Format score columns to 2 decimal places. A callable is used instead
    # of a '{:.2f}' spec so "N/A" strings do not break rendering.
    if present_score_columns:
        styler = styler.format(
            {col: _format_score for col in present_score_columns},
            na_rep='N/A',
        )

    return styler