from typing import Dict, Any, List

from sklearn.metrics import roc_auc_score, average_precision_score

from .evaluator import evaluate_model
from .schema import create_submission_record
from .dataset_storage import save_submission_to_dataset
from config.tasks import TOX21_TASKS


async def process_submission(
    model_name: str,
    hf_space_tag: str,
    model_description: str,
    organization: str,
    model_size: int,
    publication_title: str,
    publication_link: str,
    pretrained: bool,
    pretraining_data: str,
    zero_shot: bool,
    few_shot: bool,
    n_shot: str,
    smiles_list: List[str],
    true_labels: Dict[str, Dict[str, float]],
) -> Dict[str, Any]:
    """Process a complete submission, from model evaluation to metrics computation."""
    # Step 1: Evaluate the model on the submitted SMILES strings
    evaluation_result = await evaluate_model(hf_space_tag, smiles_list)

    # Step 2: Compute per-task and overall metrics
    metrics = compute_metrics(evaluation_result["results"], true_labels)

    # Step 3: Create the submission record
    record = create_submission_record(
        model_name=model_name,
        hf_space_tag=hf_space_tag,
        model_description=model_description,
        organization=organization,
        model_size=model_size,
        publication_title=publication_title,
        publication_link=publication_link,
        pretrained=pretrained,
        pretraining_data=pretraining_data,
        zero_shot=zero_shot,
        few_shot=few_shot,
        n_shot=n_shot,
        raw_predictions=evaluation_result["results"],
        computed_metrics=metrics,
        status="completed",
        approved=False,
    )

    # Step 4: Save the record to the HuggingFace dataset
    save_submission_to_dataset(record)

    return record


def compute_metrics(
    predictions: List[Dict[str, Any]],
    true_labels: Dict[str, Dict[str, float]],
) -> Dict[str, Any]:
    """Compute per-task ROC-AUC and delta-AUPRC from predictions and true labels."""
    task_metrics = {}

    if predictions:
        # Evaluate every Tox21 task defined in the config
        tasks = [task.key for task in TOX21_TASKS]

        for task in tasks:
            # Collect labels and predictions for this task; true_labels is keyed
            # by SMILES string, and molecules without a label for the task are skipped.
            y_true = []
            y_pred = []
            for sample in predictions:
                smiles = sample["smiles"]
                label = true_labels[smiles].get(task)
                if label is None:
                    continue
                y_true.append(int(label))
                y_pred.append(sample["raw_predictions"][task])

            # Both metrics are undefined without at least one positive and one
            # negative label, so skip tasks that are missing either class.
            if len(set(y_true)) < 2:
                continue

            # The AUPRC of a random classifier equals the positive rate; report
            # the improvement over that baseline as delta-AUPRC.
            random_clf_auprc = sum(y_true) / len(y_true)
            auprc = average_precision_score(y_true=y_true, y_score=y_pred)
            delta_auprc = auprc - random_clf_auprc
            roc_auc = roc_auc_score(y_true=y_true, y_score=y_pred)

            task_metrics[task] = {
                "roc_auc": roc_auc,
                "delta_auprc": delta_auprc,
            }

    # Overall score: unweighted average of the per-task scores
    if task_metrics:
        overall_roc_auc = sum(
            m["roc_auc"] for m in task_metrics.values()
        ) / len(task_metrics)
        overall_delta_auprc = sum(
            m["delta_auprc"] for m in task_metrics.values()
        ) / len(task_metrics)
        task_metrics["overall_score"] = {
            "roc_auc": overall_roc_auc,
            "delta_auprc": overall_delta_auprc,
        }

    return task_metrics
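

# Usage sketch (illustrative only): one way a caller might drive the pipeline.
# Everything below is hypothetical: the model name, Space tag, SMILES, and labels
# are placeholders, and the call assumes a live evaluator Space plus a true_labels
# mapping keyed by SMILES string with TOX21_TASKS-compatible task keys. Because
# this module uses relative imports, run it as a module (python -m ...) rather
# than as a standalone script.
if __name__ == "__main__":
    import asyncio

    example_smiles = ["CCO", "c1ccccc1"]  # placeholder molecules
    example_labels = {
        "CCO": {"NR-AR": 0.0, "SR-p53": 1.0},  # hypothetical labels
        "c1ccccc1": {"NR-AR": 1.0, "SR-p53": 0.0},
    }

    record = asyncio.run(
        process_submission(
            model_name="example-model",
            hf_space_tag="example-org/example-space",  # hypothetical Space
            model_description="Demo submission",
            organization="example-org",
            model_size=10_000_000,
            publication_title="",
            publication_link="",
            pretrained=True,
            pretraining_data="",
            zero_shot=True,
            few_shot=False,
            n_shot="0",
            smiles_list=example_smiles,
            true_labels=example_labels,
        )
    )
    print(record)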