|
|
|
|
|
""" |
|
|
Evaluation Service module for MTEB Turkish Leaderboard |
|
|
Handles evaluation submissions and status tracking |
|
|
""" |
|
|
|
|
|
import time |
|
|
import re |
|
|
from typing import Optional, Tuple, List |
|
|
import traceback |
|
|
import pandas as pd |
|
|
import gradio as gr |
|
|
|
|
|
from api_client import send_evaluation_request_to_api, get_evaluation_status, cancel_evaluation_request |
|
|
|
|
|
|
|
|
# In-memory registry of in-flight evaluation requests, keyed by request_id.
# Values are dicts read elsewhere in this module: "model_name", "email",
# "start_time" (epoch seconds from time.time()), and optionally "status".
# NOTE(review): nothing in this chunk inserts entries — presumably populated
# elsewhere in the submission flow; confirm against the caller.
active_evaluations = {}
|
|
|
|
|
|
|
|
def get_active_evaluations_status() -> str:
    """Build a one-line-per-request summary of every tracked evaluation."""
    if not active_evaluations:
        return "π’ No active evaluation requests"

    summary_lines = []
    for req_id, details in active_evaluations.items():
        # Elapsed time is recomputed per entry so later rows reflect "now".
        elapsed = int(time.time() - details["start_time"])
        state = details.get("status", "PENDING")
        summary_lines.append(
            f"π {details['model_name']} ({details['email']}) - {req_id} [{state}] ({elapsed}s)"
        )

    return "\n".join(summary_lines)
|
|
|
|
|
|
|
|
def get_active_evaluations_with_cancel_options() -> Tuple[str, List[str]]:
    """Return the status summary text plus dropdown labels for cancellation.

    Each label has the form "<request_id> - <model_name>", which
    cancel_active_evaluation() later parses back apart.
    """
    summary = get_active_evaluations_status()
    options = [
        f"{req_id} - {details['model_name']}"
        for req_id, details in active_evaluations.items()
    ]
    return summary, options
|
|
|
|
|
|
|
|
def clear_active_evaluations() -> str:
    """Drop every tracked evaluation and report how many were removed."""
    global active_evaluations
    removed = len(active_evaluations)
    active_evaluations.clear()
    return f"β Cleared {removed} active evaluation(s) from tracking"
|
|
|
|
|
|
|
|
def cancel_active_evaluation(selection: str) -> str:
    """Cancel the evaluation chosen in the UI.

    ``selection`` is a dropdown label of the form "<request_id> - <model_name>";
    only the request id portion is used.
    """
    if not selection:
        return "β No evaluation selected for cancellation"

    try:
        request_id = selection.partition(" - ")[0]

        if request_id not in active_evaluations:
            return f"β Evaluation {request_id} not found in active evaluations"

        if cancel_evaluation_request(request_id):
            # Remove from local tracking only after the API confirms the cancel.
            model_name = active_evaluations.pop(request_id)["model_name"]
            return f"β Successfully cancelled evaluation for {model_name} (ID: {request_id})"

        return f"β Failed to cancel evaluation {request_id}. Check API connection."
    except Exception as e:
        # UI boundary: surface the failure as a message rather than raising.
        return f"β Error cancelling evaluation: {str(e)}"
|
|
|
|
|
|
|
|
def _validate_evaluation_request(model_name: str, email: str = None) -> Optional[str]: |
|
|
"""Validate evaluation request parameters""" |
|
|
|
|
|
if not model_name or not model_name.strip(): |
|
|
return "β Model name cannot be empty!" |
|
|
|
|
|
model_name = model_name.strip() |
|
|
|
|
|
|
|
|
if len(model_name) < 3: |
|
|
return "β Model name too short!" |
|
|
|
|
|
if len(model_name) > 256: |
|
|
return "β Model name too long (maximum 256 characters)!" |
|
|
|
|
|
|
|
|
if '/' not in model_name: |
|
|
return "β Invalid model name format! Must include organization (e.g., organization/model-name)" |
|
|
|
|
|
if not re.match(r'^[a-zA-Z0-9._-]+/[a-zA-Z0-9._-]+$', model_name): |
|
|
return "β Invalid model name format! Use format: organization/model-name" |
|
|
|
|
|
|
|
|
if not email or not email.strip(): |
|
|
return "β Email address cannot be empty!" |
|
|
|
|
|
email = email.strip() |
|
|
|
|
|
if len(email) > 254: |
|
|
return "β Email address too long!" |
|
|
|
|
|
email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$' |
|
|
if not re.match(email_pattern, email): |
|
|
return "β Invalid email address format!" |
|
|
|
|
|
return None |
|
|
|
|
|
|
|
|
def submit_evaluation(model_name: str, email: str, batch_size: int, current_data: pd.DataFrame, progress=gr.Progress()) -> Tuple[str, Optional[pd.DataFrame]]:
    """Validate the request, forward it to the evaluation API, and return a
    user-facing message plus the (unchanged) leaderboard data.

    Returns (message, current_data) on success and (message, None) on any
    validation failure or API error.
    """
    try:
        validation_error = _validate_evaluation_request(model_name, email)
        if validation_error:
            return validation_error, None

        progress(0.1, desc="Sending evaluation request to API...")

        # Fire the request; the response itself is not used here — status
        # updates arrive via email / the status-refresh flow.
        api_response = send_evaluation_request_to_api(model_name, batch_size, email)

        progress(1.0, desc="Request submitted successfully!")

        success_msg = f"""
β Evaluation request submitted successfully!

π€ Model: {model_name}
π§ Email: {email}

π Next Steps:
β±οΈ Your request will be reviewed by our system
π§ You will receive email notifications about the status of your evaluation
π If you've submitted this model before, you'll be notified via email

Thank you for contributing to the Mizan Leaderboard!
"""
        return success_msg.strip(), current_data

    except Exception as e:
        # Boundary handler: log the traceback for operators, show a generic
        # message to the user.
        print(f"β Error submitting evaluation: {str(e)}")
        traceback.print_exc()

        error_msg = f"""
β Failed to submit evaluation request

π€ Model: {model_name}
π§ Email: {email}

β οΈ Error: Unable to connect to the evaluation service.

Please try again later or contact support if the problem persists.
"""
        return error_msg.strip(), None
|
|
|
|
|
|
|
|
def refresh_evaluation_status() -> str:
    """Poll the API once per tracked evaluation and record status changes."""
    if not active_evaluations:
        return "π’ No active evaluations to refresh"

    changes = 0
    for req_id, details in active_evaluations.items():
        try:
            payload = get_evaluation_status(req_id)
            if not payload or "status" not in payload:
                continue
            previous = details.get("status", "UNKNOWN")
            latest = payload["status"]
            if previous != latest:
                details["status"] = latest
                changes += 1
                print(f"Status updated for {req_id}: {previous} -> {latest}")
        except Exception as e:
            # A single failed lookup must not abort the whole refresh pass.
            print(f"Error refreshing status for {req_id}: {e}")

    return f"π Refreshed status for {len(active_evaluations)} evaluation(s). {changes} status change(s) detected."
|
|
|