#!/usr/bin/env python3 """ Data Processing module for MTEB Turkish Leaderboard - HF Spaces Version Simplified version for loading and processing CSV data """ import os import pandas as pd from pandas.io.formats.style import Styler from matplotlib.colors import LinearSegmentedColormap import html # CSV file path CSV_FILE_PATH = "leaderboard_data.csv" def load_leaderboard_from_csv() -> pd.DataFrame: """Load leaderboard data from CSV file""" try: if not os.path.exists(CSV_FILE_PATH): print(f"❌ CSV file not found: {CSV_FILE_PATH}") return create_empty_leaderboard_dataframe() df = pd.read_csv(CSV_FILE_PATH) print(f"✅ Loaded {len(df)} records from {CSV_FILE_PATH}") # Convert to leaderboard format leaderboard_df = csv_to_leaderboard_format(df) # Sort by Mean (Task) score and add rankings leaderboard_df = leaderboard_df.sort_values("Mean (Task)", ascending=False).reset_index(drop=True) leaderboard_df["Rank"] = range(1, len(leaderboard_df) + 1) return leaderboard_df except Exception as e: print(f"❌ Error loading CSV: {e}") return create_empty_leaderboard_dataframe() def create_empty_leaderboard_dataframe() -> pd.DataFrame: """Create an empty DataFrame with proper leaderboard column structure""" return pd.DataFrame(columns=[ "Rank", "Model", "Mean (Task)", "Mean (TaskType)", "Classification", "Clustering", "Pair Classification", "Retrieval", "STS", "Correlation", "Parameters", "Embed Dim", "Max Sequence Length", "Vocab Size", ]) def csv_to_leaderboard_format(df: pd.DataFrame) -> pd.DataFrame: """Convert CSV data to leaderboard format""" data = [] for idx, row in df.iterrows(): model_name = row['Model'] # Prepare model name for display model_name_clean = html.escape(model_name) # Create clickable HuggingFace link for model name hf_link = f"https://huggingface.co/{model_name_clean}" clickable_model = f'{model_name_clean}' # Handle different column name variations embedding_dim_col = 'Embedding Dim' max_seq_col = 'Max Seq Length' pair_classification_col = 'Pair Classification' data_row = { "Rank": idx + 1, # Initial ranking, will be recalculated "Model": clickable_model, "Mean (Task)": round(float(row['Mean (Task)']), 2), "Mean (TaskType)": round(float(row['Mean (TaskType)']), 2), "Classification": round(float(row['Classification']), 2), "Clustering": round(float(row['Clustering']), 2), "Pair Classification": round(float(row[pair_classification_col]), 2), "Retrieval": round(float(row['Retrieval']), 2), "STS": round(float(row['STS']), 2), "Correlation": round(float(row['Correlation']), 3) if not pd.isna(row['Correlation']) else "N/A", "Parameters": row['Number of Parameters'], "Embed Dim": int(float(row[embedding_dim_col])) if not pd.isna(row[embedding_dim_col]) else 0, "Max Sequence Length": "N/A" if pd.isna(row[max_seq_col]) or row[max_seq_col] == "Unknown" else int(float(row[max_seq_col])), "Vocab Size": int(float(row['Vocab Size'])) if 'Vocab Size' in row and not pd.isna(row['Vocab Size']) else 0 } data.append(data_row) result_df = pd.DataFrame(data) return result_df def create_excel_like_cmap(): """Create Excel-like colormap for score visualization""" colors = [ (0.9, 0.1, 0.2), # Red (1.0, 1.0, 0.0), # Yellow (0/255, 176/255, 80/255) # Excel-style Green ] return LinearSegmentedColormap.from_list("excel_like", colors, N=256) def rgb_to_hex(rgb_tuple): """Convert RGB tuple to hex color""" r, g, b = [int(x * 255) for x in rgb_tuple[:3]] return f"#{r:02x}{g:02x}{b:02x}" def create_colored_cell(value: float, min_val: float, max_val: float, colormap) -> str: """Create colored cell HTML for score visualization""" if pd.isna(value) or value == "N/A": return str(value) try: # Normalize value to 0-1 range if max_val > min_val: normalized = (float(value) - min_val) / (max_val - min_val) else: normalized = 0.5 # Get color from colormap color_rgba = colormap(normalized) color_hex = rgb_to_hex(color_rgba) # Create colored cell HTML with data-sort attribute for proper numeric sorting return f'