"""
Data loading functionality for the Tox21 leaderboard.
Handles loading and processing results from HuggingFace datasets.
"""

import pandas as pd
from datasets import load_dataset, Dataset

from config.settings import RESULTS_DATASET, TEST_DATASET, HF_TOKEN
from config.tasks import TOX21_TASKS


def load_leaderboard_data() -> Dataset:
    """
    Load leaderboard data from HuggingFace dataset.

    Loads ``RESULTS_DATASET`` and returns its ``"test"`` split, printing
    diagnostic information (split size, first entry) along the way.

    Returns:
        The ``"test"`` split of the results dataset.

    Raises:
        ValueError: If the loaded dataset has no ``"test"`` split.
    """
    print(f"Loading dataset: {RESULTS_DATASET}")
    # Only report whether a token is configured; never print the token itself.
    print(f"Using HF token: {'Yes' if HF_TOKEN else 'No'}")

    # Load the dataset (token already set globally via login in settings)
    dataset = load_dataset(RESULTS_DATASET)
    print(f"Dataset loaded successfully. Keys: {dataset.keys()}")

    # Results are expected in the test split; fail early when it is missing.
    if "test" not in dataset:
        raise ValueError("Dataset does not contain a 'test' split.")

    results_data = dataset["test"]
    print(f"Test split has {len(results_data)} entries")
    if len(results_data) > 0:
        first_entry = results_data[0]
        print(f"First entry keys: {first_entry.keys()}")
        print(f"First entry: {first_entry}")
    return results_data


def load_test_dataset() -> tuple[list[str], list[dict[str, float]]]:
    """
    Load the SMILES strings and per-task labels of the test dataset.

    Loads the ``"test"`` split of ``TEST_DATASET`` and extracts, for every
    sample, the value stored under each task key listed in ``TOX21_TASKS``.

    Returns:
        A ``(smiles, labels)`` tuple: ``smiles`` holds the contents of the
        ``"smiles"`` column, and ``labels`` holds one dict per sample that
        maps each task key to that sample's value for the task.
    """
    # Get test smiles and labels (token already set globally via login in settings)
    dset = load_dataset(TEST_DATASET, split="test")
    tasks = [t.key for t in TOX21_TASKS]

    smiles = list(dset["smiles"])
    # Iterate the dataset directly: copying it into a list first would only
    # build a second, throwaway copy of every row.
    labels = [{task: sample[task] for task in tasks} for sample in dset]

    print("Loaded test dataset")
    return smiles, labels