import streamlit as st
import pandas as pd

st.set_page_config(page_title="Cyber Benchmark Hub: Leaderboard", layout="wide")
st.title("Cyber Benchmark Hub: Leaderboard")

with st.sidebar:
    st.image("https://cdn.prod.website-files.com/630f558f2a15ca1e88a2f774/631f1436ad7a0605fecc5e15_Logo.svg", use_container_width=True)
    st.markdown("[Priam.ai](https://www.priam.ai/)")
    st.divider()

    dataset_categories = ["Multiple Choice"]
    selected_category = st.selectbox("Select Dataset Category", dataset_categories, index=0)
    datasets_by_category = {
        "Multiple Choice": ["secQA", "CyberMetric80"],
    }
    dataset_choice = st.selectbox("Select Dataset", datasets_by_category[selected_category], index=0)
    st.divider()

    st.header("Filters & Options")
    if dataset_choice == "secQA":
        dataset_version = st.radio("Select Dataset Version", ["v1", "v2"])
    else:
        st.markdown("**Note:** Only one version of CyberMetric80 has been evaluated.")
        dataset_version = "v1"

    # For filtering the leaderboard by model type. The available model types
    # come from the CSV, which is loaded later, so reserve a placeholder here
    # and populate the filter once the data is available.
    source_filter_placeholder = st.empty()
| st.markdown("---") | |
| st.header("Test Parameters") | |
| test_params = pd.DataFrame({ | |
| "Value": [0, 1, 0, 1, 0] | |
| }, index=["Temperature", "n", "Presence Penalty", "Top_p", "Frequency Penalty"]) | |
| st.table(test_params) | |

# Function to estimate random baseline accuracy for MCQ datasets
def estimate_random_accuracy(questions):
    """
    Estimates the average accuracy when answering questions randomly.

    Args:
        questions: List of tuples where each tuple is (question_id, num_choices)

    Returns:
        The estimated average accuracy (probability of correct answers)
    """
    if not questions:
        return 0.0
    total_probability = 0.0
    for question_id, num_choices in questions:
        probability = 1.0 / num_choices
        total_probability += probability
    average_accuracy = total_probability / len(questions)
    return average_accuracy
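
# Worked example (illustrative values, not drawn from either dataset): two
# questions with 4 and 2 answer choices give an expected random accuracy of
# (1/4 + 1/2) / 2 = 0.375, i.e. estimate_random_accuracy([(1, 4), (2, 2)]) == 0.375.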

# Question metadata for the SECQA dataset: each tuple below is (question_id, num_choices).
# According to the dataset card, there are 242 questions in total.
total_questions = 242
questionnaire = [(1, 4), (2, 1), (3, 4), (4, 2), (5, 3), (6, 3), (7, 4), (8, 2), (9, 4), (10, 2), (11, 4), (12, 4), (13, 2), (14, 2), (15, 4), (16, 4), (17, 2), (18, 2), (19, 2), (20, 1), (21, 2), (22, 4), (23, 1), (24, 4), (25, 3), (26, 3), (27, 2), (28, 3), (29, 2), (30, 1), (31, 2), (32, 3), (33, 3), (34, 2), (35, 4), (36, 3), (37, 1), (38, 2), (39, 1), (40, 2), (41, 1), (42, 3), (43, 3), (44, 1), (45, 3), (46, 1), (47, 4), (48, 2), (49, 2), (50, 4), (51, 2), (52, 4), (53, 1), (54, 4), (55, 3), (56, 3), (57, 3), (58, 1), (59, 2), (60, 4), (61, 1), (62, 3), (63, 1), (64, 3), (65, 1), (66, 3), (67, 4), (68, 1), (69, 1), (70, 1), (71, 3), (72, 2), (73, 1), (74, 2), (75, 3), (76, 3), (77, 3), (78, 4), (79, 1), (80, 4), (81, 4), (82, 4), (83, 2), (84, 3), (85, 2), (86, 1), (87, 1), (88, 2), (89, 2), (90, 2), (91, 4), (92, 4), (93, 3), (94, 2), (95, 3), (96, 3), (97, 2), (98, 4), (99, 4), (100, 3), (101, 4), (102, 2), (103, 4), (104, 2), (105, 3), (106, 2), (107, 3), (108, 4), (109, 4), (110, 2)]
questionnairev2 = [(1, 4), (2, 4), (3, 2), (4, 3), (5, 2), (6, 4), (7, 3), (8, 2), (9, 3), (10, 2), (11, 1), (12, 2), (13, 3), (14, 2), (15, 4), (16, 2), (17, 2), (18, 4), (19, 4), (20, 3), (21, 4), (22, 3), (23, 3), (24, 3), (25, 1), (26, 1), (27, 2), (28, 2), (29, 2), (30, 2), (31, 2), (32, 4), (33, 3), (34, 3), (35, 3), (36, 3), (37, 4), (38, 3), (39, 3), (40, 4), (41, 1), (42, 2), (43, 3), (44, 2), (45, 1), (46, 1), (47, 2), (48, 4), (49, 2), (50, 1), (51, 3), (52, 1), (53, 4), (54, 4), (55, 2), (56, 3), (57, 2), (58, 2), (59, 1), (60, 3), (61, 3), (62, 1), (63, 2), (64, 2), (65, 3), (66, 4), (67, 3), (68, 3), (69, 1), (70, 1), (71, 3), (72, 1), (73, 2), (74, 4), (75, 4), (76, 1), (77, 4), (78, 4), (79, 3), (80, 1), (81, 2), (82, 2), (83, 3), (84, 2), (85, 1), (86, 2), (87, 4), (88, 2), (89, 2), (90, 4), (91, 3), (92, 2), (93, 1), (94, 2), (95, 3), (96, 1), (97, 1), (98, 4), (99, 1), (100, 1)]

random_accuracy = estimate_random_accuracy(questionnaire)
random_accuracyv2 = estimate_random_accuracy(questionnairev2)

# Determine file path based on dataset choice.
# For now, if dataset_choice is "secQA", we use "Benchmark.csv".
if dataset_choice == "secQA":
    file_path = "Benchmark.csv"  # Ensure this file is uploaded in your Hugging Face Space
elif dataset_choice == "CyberMetric80":
    file_path = "metric.csv"  # Placeholder: update with actual file paths for future datasets

# Function to load and clean CSV data
def load_data(file_path):
    df = pd.read_csv(file_path)
    # Remove any unnamed columns (caused by trailing commas)
    df = df.loc[:, ~df.columns.str.contains('Unnamed', na=False)]
    # Standardize column names
    df.columns = df.columns.str.strip()
    df.rename(columns={
        "model name": "Model",
        "source": "Type",
        "v1 metric": "V1 Accuracy",
        "v2 metric": "V2 Accuracy"
    }, inplace=True)
    # Convert percentage strings to floats (e.g., "100%" → 1.0)
    for col in ["V1 Accuracy", "V2 Accuracy"]:
        if col in df.columns:
            df[col] = df[col].astype(str).str.replace("%", "").str.strip()
            df[col] = pd.to_numeric(df[col], errors='coerce') / 100
    return df
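
# Illustrative sketch (assumed layout, not actual results) of the CSV that
# load_data() expects, based on the rename map above:
#
#   model name,source,v1 metric,v2 metric
#   example-model-a,open-source,90%,85%
#   example-model-b,closed-source,95%,92%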

# Load dataset
df = load_data(file_path)

# Update the source filter with the actual options from the data
source_filter = source_filter_placeholder.multiselect(
    "Select Model Type",
    options=df["Type"].unique().tolist(),
    default=df["Type"].unique().tolist()
)

# Apply filtering based on the sidebar selections; copy so the column
# assignments below do not trigger pandas chained-assignment warnings.
df_filtered = df[df["Type"].isin(source_filter)].copy() if source_filter else df.copy()

# Choose the correct metric version and compute the Accuracy column
if dataset_choice == "CyberMetric80":
    df_filtered["Accuracy"] = df_filtered["V1 Accuracy"]
else:
    df_filtered["Accuracy"] = df_filtered["V1 Accuracy"] if dataset_version == "v1" else df_filtered["V2 Accuracy"]

df_filtered = df_filtered[["Model", "Type", "Accuracy"]].dropna()  # Drop rows whose accuracy could not be parsed

# Sort by Accuracy descending
df_filtered = df_filtered.sort_values("Accuracy", ascending=False).reset_index(drop=True)

# Compute dense ranking so that models with equal accuracy share the same rank
df_filtered["Rank"] = df_filtered["Accuracy"].rank(method="dense", ascending=False).astype(int)
df_filtered = df_filtered[["Rank", "Model", "Type", "Accuracy"]]
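
# For example (illustrative numbers, not real results): accuracies of 0.90, 0.90
# and 0.85 receive dense ranks 1, 1 and 2; ties share a rank and no rank is skipped.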

tab1, tab2 = st.tabs(["Leaderboard", "About"])

with tab1:
    if dataset_choice == "secQA":
| st.markdown("#### [View the SECQA Dataset](https://huggingface.co/datasets/zefang-liu/secqa)") | |
    elif dataset_choice == "CyberMetric80":
        st.markdown("#### [View the CyberMetric Dataset](https://github.com/cybermetric/CyberMetric)")

    # Use columns to display leaderboard and model details side-by-side
    col1, col2 = st.columns([2, 1])

    with col1:
        st.subheader(f"Leaderboard for {dataset_choice.upper()} Version {dataset_version}")
        st.dataframe(df_filtered.style.hide(axis='index'))

    with col2:
        st.subheader("Model Details")
        selected_model = st.selectbox("Select a Model", df_filtered["Model"].tolist())
        model_details = df_filtered[df_filtered["Model"] == selected_model].iloc[0]
        st.write(f"**Model:** {model_details['Model']}")
        st.write(f"**Type:** {model_details['Type']}")
        st.write(f"**Accuracy:** {model_details['Accuracy']:.2%}")
        st.write(f"**Rank:** {model_details['Rank']}")

    st.divider()

    # Display the random baseline accuracy below the leaderboard
    if dataset_choice == "secQA":
        st.markdown("### Random Baseline Accuracy")
        st.markdown("**{:.2%}** (expected accuracy of random guessing on SECQA v1)".format(random_accuracy))
        st.markdown("**{:.2%}** (expected accuracy of random guessing on SECQA v2)".format(random_accuracyv2))

    # Footer
    st.markdown("---")
    st.info("More dataset benchmarks will be added to this hub in the future.")

with tab2:
    st.title("About the Cyber Benchmark Hub")
    st.markdown("""
Welcome to the **Cyber Benchmark Hub: Leaderboard**!

This application benchmarks language models on cybersecurity question-answering tasks, currently the [SECQA dataset](https://huggingface.co/datasets/zefang-liu/secqa) and the [CyberMetric](https://github.com/cybermetric/CyberMetric) benchmark. It provides an interactive interface to explore model accuracy, rank models, and understand how different model types perform on security-centric multiple-choice questions.

### Leaderboard Features
- Compare **different models** (e.g., GPT, Claude, Mistral) on SECQA v1 or v2.
- Filter by **model type/source** (open-source vs. closed-source).
- View **dense rankings**, where models with equal accuracy share the same rank.
- See detailed information for each model, including:
    - Accuracy score
    - Rank

### Random Baseline Accuracy
The app computes the **expected accuracy** if a model guessed randomly on all questions:
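
Concretely, if question *i* offers `c_i` answer choices, the baseline is the average of `1 / c_i` over all questions (for example, 25% when every question has four options). This is exactly what `estimate_random_accuracy` computes.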

This helps contextualize the actual performance of models.

### Built by
[Priam.ai](https://www.priam.ai/)

*This benchmark hub will continue to expand as more models and datasets are released in the cybersecurity NLP space.*
""")