# Mizan / evaluation_service.py
# nmmursit's picture
# Initial commit
# 9a235dc
#!/usr/bin/env python3
"""
Evaluation Service module for MTEB Turkish Leaderboard
Handles evaluation submissions and status tracking
"""
import time
import re
from typing import Optional, Tuple, List
import traceback
import pandas as pd
import gradio as gr
from api_client import send_evaluation_request_to_api, get_evaluation_status, cancel_evaluation_request
# Global state management for active evaluations
active_evaluations = {}  # request_id -> {"status": str, "model_name": str, "email": str, "start_time": float}
def get_active_evaluations_status() -> str:
    """Return a human-readable, line-per-request summary of tracked evaluations."""
    if not active_evaluations:
        return "🟒 No active evaluation requests"
    summary = [
        f"πŸ”„ {entry['model_name']} ({entry['email']}) - {req_id} "
        f"[{entry.get('status', 'PENDING')}] ({int(time.time() - entry['start_time'])}s)"
        for req_id, entry in active_evaluations.items()
    ]
    return "\n".join(summary)
def get_active_evaluations_with_cancel_options() -> Tuple[str, List[str]]:
    """Return the status summary together with dropdown labels for cancellation.

    Each label is formatted as "<request_id> - <model_name>" so the request id
    can be recovered later by splitting on " - ".
    """
    labels = [
        f"{req_id} - {entry['model_name']}"
        for req_id, entry in active_evaluations.items()
    ]
    return get_active_evaluations_status(), labels
def clear_active_evaluations() -> str:
    """Forget every tracked evaluation and report how many were removed."""
    global active_evaluations
    removed = len(active_evaluations)
    # Mutate in place (not rebind) so any external aliases also see the reset.
    active_evaluations.clear()
    return f"βœ… Cleared {removed} active evaluation(s) from tracking"
def cancel_active_evaluation(selection: str) -> str:
    """Cancel the evaluation picked in the UI.

    `selection` is expected in the form "<request_id> - <model_name>"; only
    the id portion is used for lookup and the API cancellation call.
    """
    if not selection:
        return "❌ No evaluation selected for cancellation"
    try:
        request_id = selection.split(" - ")[0]
        entry = active_evaluations.get(request_id)
        if entry is None:
            return f"❌ Evaluation {request_id} not found in active evaluations"
        # Ask the backend to cancel before dropping local tracking state.
        if not cancel_evaluation_request(request_id):
            return f"❌ Failed to cancel evaluation {request_id}. Check API connection."
        model_name = entry["model_name"]
        del active_evaluations[request_id]
        return f"βœ… Successfully cancelled evaluation for {model_name} (ID: {request_id})"
    except Exception as e:
        return f"❌ Error cancelling evaluation: {str(e)}"
def _validate_evaluation_request(model_name: str, email: Optional[str] = None) -> Optional[str]:
    """Validate a submission's model name and contact email.

    Args:
        model_name: HuggingFace model identifier, expected as "organization/model-name".
        email: Contact address for evaluation status notifications.

    Returns:
        An error message string (prefixed with "❌") describing the first
        failed check, or None when both values pass validation.
    """
    # Model name validation
    if not model_name or not model_name.strip():
        return "❌ Model name cannot be empty!"
    model_name = model_name.strip()
    # Check model name length (format: org/model-name)
    if len(model_name) < 3:
        return "❌ Model name too short!"
    if len(model_name) > 256:
        return "❌ Model name too long (maximum 256 characters)!"
    # Check for valid HuggingFace model name format (must be org/model)
    if '/' not in model_name:
        return "❌ Invalid model name format! Must include organization (e.g., organization/model-name)"
    # Exactly one namespace segment and one model segment, each limited to
    # alphanumerics plus . _ - (anchored so no extra slashes or spaces pass).
    if not re.match(r'^[a-zA-Z0-9._-]+/[a-zA-Z0-9._-]+$', model_name):
        return "❌ Invalid model name format! Use format: organization/model-name"
    # Email validation
    if not email or not email.strip():
        return "❌ Email address cannot be empty!"
    email = email.strip()
    # 254 chars is the practical maximum address length (RFC 5321/5322 limits)
    if len(email) > 254:
        return "❌ Email address too long!"
    # Simple pattern: local part, "@", domain with at least one dot and a
    # 2+ letter TLD — intentionally loose, not full RFC validation.
    email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
    if not re.match(email_pattern, email):
        return "❌ Invalid email address format!"
    return None
def submit_evaluation(model_name: str, email: str, batch_size: int, current_data: pd.DataFrame, progress=gr.Progress()) -> Tuple[str, Optional[pd.DataFrame]]:
    """Validate and forward an evaluation request to the backend API.

    Args:
        model_name: HuggingFace model id ("organization/model-name").
        email: Contact address for status notifications.
        batch_size: Batch size forwarded to the evaluation backend.
        current_data: Current leaderboard table, returned unchanged on success.
        progress: Gradio progress reporter (UI feedback only).

    Returns:
        Tuple of (status message, leaderboard DataFrame or None on failure).
    """
    try:
        # Input validation
        error_msg = _validate_evaluation_request(model_name, email)
        if error_msg:
            return error_msg, None
        # Show progress
        progress(0.1, desc="Sending evaluation request to API...")
        # Send request to API - regardless of backend response, show success to user.
        # Backend errors (like duplicate requests) are handled by the API and
        # communicated via email, so the response value is intentionally unused.
        send_evaluation_request_to_api(model_name, batch_size, email)
        progress(1.0, desc="Request submitted successfully!")
        # Return success message regardless of backend response
        success_msg = f"""
βœ… Evaluation request submitted successfully!
πŸ€– Model: {model_name}
πŸ“§ Email: {email}
πŸ“‹ Next Steps:
⏱️ Your request will be reviewed by our system
πŸ“§ You will receive email notifications about the status of your evaluation
πŸ”„ If you've submitted this model before, you'll be notified via email
Thank you for contributing to the Mizan Leaderboard!
"""
        return success_msg.strip(), current_data
    except Exception as e:
        # Log error for debugging
        print(f"❌ Error submitting evaluation: {str(e)}")
        traceback.print_exc()
        error_msg = f"""
❌ Failed to submit evaluation request
πŸ€– Model: {model_name}
πŸ“§ Email: {email}
⚠️ Error: Unable to connect to the evaluation service.
Please try again later or contact support if the problem persists.
"""
        return error_msg.strip(), None
def refresh_evaluation_status() -> str:
    """Poll the API for each tracked evaluation and record any status changes."""
    if not active_evaluations:
        return "🟒 No active evaluations to refresh"
    changed = 0
    for req_id, entry in active_evaluations.items():
        try:
            payload = get_evaluation_status(req_id)
            # Skip requests the API knows nothing about (or malformed replies).
            if not payload or "status" not in payload:
                continue
            previous = entry.get("status", "UNKNOWN")
            latest = payload["status"]
            if previous != latest:
                entry["status"] = latest
                changed += 1
                print(f"Status updated for {req_id}: {previous} -> {latest}")
        except Exception as e:
            # Best-effort refresh: one failing request must not stop the rest.
            print(f"Error refreshing status for {req_id}: {e}")
    return f"πŸ”„ Refreshed status for {len(active_evaluations)} evaluation(s). {changed} status change(s) detected."