gmancino-ball committed
Commit fba09af · verified · 1 Parent(s): 4e59e27

gmb/updates (#7)


- Refactor leaderboard (bdb48d3594e180083fa5c65bbe7c2f839a810ff9)

Files changed (3)
  1. app.py +586 -49
  2. metric.py +4 -2
  3. utils.py +85 -26
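Most of the app.py change replaces the old public/private toggle with a three-way dataset view (`public`, `private`, `private_only`) that only the admin login can switch away from `public`, plus a "top N" slider feeding the new table formatting. A condensed sketch of that gating, distilled from the diff below (the real code wraps this in `with st.sidebar:`, keeps the selection in `st.session_state`, and clears the `load_results` cache when the view changes; the sketch only assumes the standard Streamlit widgets already used in the file):

```python
import os
import streamlit as st

password = st.sidebar.text_input("Admin login:", type="password")
is_admin = password == os.getenv("HF_TOKEN")

# Non-admins only ever see the public leaderboard.
dataset_options = ["public", "private", "private_only"] if is_admin else ["public"]
dataset_view = st.sidebar.selectbox("Dataset View", options=dataset_options)

if dataset_view in ["private", "private_only"] and is_admin:
    split = dataset_view
    # The mean-of-top-N row is only offered on the private views.
    st.session_state["top_n"] = st.sidebar.slider("Mean of top N elements", 2, 10, 3)
else:
    split = "public"
    st.session_state["top_n"] = None

st.session_state["split"] = split
```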
app.py CHANGED
@@ -13,9 +13,8 @@ TASKS = {
13
  "video-challenge-task-1-config": ["source"],
14
  "video-challenge-task-2-config": ["source", "category"],
15
  }
16
- valid_splits = ["public", "private"]
17
- with st.sidebar:
18
- color_map =st.selectbox("colormap",["paired","category20","category20b","category20c","set2","set3"])
19
 
20
  #####################################################################
21
  ## Data loading ##
@@ -141,7 +140,13 @@ def make_roc_curves(task, submission_ids):
141
  # if rocs["team"].nunique() > 1:
142
  color_field = "team:N"
143
 
144
- roc_chart = alt.Chart(rocs).mark_line().encode(x="fpr", y="tpr", color=alt.Color(color_field, scale=alt.Scale(scheme=color_map)), detail="submission_id:N")
145
 
146
  return roc_chart
147
 
@@ -159,12 +164,15 @@ st.set_page_config(
159
 
160
  ## Pull new results or toggle private public if you are an owner
161
  with st.sidebar:
 
162
 
163
  hf_token = os.getenv("HF_TOKEN")
164
  st.session_state["hf_token"] = hf_token
165
  password = st.text_input("Admin login:", type="password")
166
 
 
167
  if password == hf_token:
 
168
  if st.button("Pull New Results"):
169
  with st.spinner("Pulling new results", show_time=True):
170
  try:
@@ -187,39 +195,127 @@ with st.sidebar:
187
  except Exception as e:
188
  st.error(f"Error starting background task: {e}")
189
 
190
- ## Initialize the toggle state in session_state if it doesn't exist
191
- if "private_view" not in st.session_state:
192
- st.session_state.private_view = False
193
-
194
- # Create the toggle widget
195
- # The 'value' parameter sets the initial state, here linked to session_state
196
- # The 'key' parameter is crucial for identifying the widget across reruns and linking to session_state
197
- toggle_value = st.toggle("Private Scores", value=st.session_state.private_view, key="private_view")
198
-
199
- # The 'toggle_value' variable will hold the current state of the toggle (True or False)
200
- if toggle_value:
201
- st.write("Showing **PRIVATE** scores.")
202
  else:
203
- st.write("Showing **PUBLIC** scores.")
204
-
205
- split = "public" if not toggle_value else "private"
 
 
 
 
 
 
 
 
 
 
 
206
  else:
207
  split = "public"
208
 
209
  st.session_state["split"] = split
210
 
211
 
212
- def show_dataframe_w_format(df,format="compact"):
213
- column_config = {c: st.column_config.NumberColumn(c,format=format) for c in df.columns}
214
- return st.dataframe(df,column_config=column_config)
215
 
216
  @st.fragment
217
  def show_leaderboard(task, score: str = "source"):
218
  split = st.session_state.get("split", "public")
219
  results = load_results(task, best_only=True)
220
  source_split_map = {}
221
- if split == "private":
222
- _sol_df = pd.read_csv(COMP_CACHE / task / "solution.csv")
223
  pairs_df = _sol_df[["source_og", "split"]].drop_duplicates()
224
  source_split_map = {x: y for x, y in zip(pairs_df["source_og"], pairs_df["split"])}
225
 
@@ -230,8 +326,8 @@ def show_leaderboard(task, score: str = "source"):
230
  # "pristine_accuracy",
231
  "auc",
232
  "total_time",
233
- "datetime",
234
- "fail_rate"
235
  ]
236
 
237
  column_config = {
@@ -271,7 +367,6 @@ def show_leaderboard(task, score: str = "source"):
271
  "🕒 Inference Time (s)",
272
  format="compact",
273
  # pinned=True,
274
-
275
  # width="small",
276
  ),
277
  "datetime": st.column_config.DatetimeColumn(
@@ -359,24 +454,26 @@ def show_leaderboard(task, score: str = "source"):
359
  if accuracy_types[granularity] == 0:
360
  "#### 👤 True Positive Rate | Generated Source"
361
  # st.dataframe(gen_tmp, column_config=column_config)
362
- show_dataframe_w_format(gen_tmp)
 
363
 
364
  "#### 🧑‍🎤 True Negative Rate | Real Source"
365
  # st.dataframe(real_tmp, column_config=column_config)
366
- show_dataframe_w_format(real_tmp)
367
 
368
  elif accuracy_types[granularity] == 1:
369
  "#### 👤 Balanced Accuracy | Generated Source"
370
  tnr = results[f"{split}_{score}_score"].loc[:, ["real_accuracy"]]
371
  gen_tmp[:] = (gen_tmp.values + tnr.values) / 2.0
372
  # st.dataframe(gen_tmp, column_config=column_config)
373
- show_dataframe_w_format(gen_tmp)
 
374
 
375
  "#### 🧑‍🎤 Balanced Accuracy | Real Source"
376
  tpr = results[f"{split}_{score}_score"].loc[:, ["generated_accuracy"]]
377
  real_tmp[:] = (real_tmp.values + tpr.values) / 2.0
378
  # st.dataframe(real_tmp, column_config=column_config)
379
- show_dataframe_w_format(real_tmp)
380
  else:
381
  cols = [c for c in results[f"{split}_{score}_score"].columns if "generated_conditional_auc" in c]
382
  col_names = [
@@ -405,10 +502,11 @@ def show_leaderboard(task, score: str = "source"):
405
 
406
  "#### 👤 Conditional AUC | Generated Source"
407
  # st.dataframe(gen_tmp, column_config=column_config)
408
- show_dataframe_w_format(gen_tmp)
 
409
  "#### 🧑‍🎤 Conditional AUC | Real Source"
410
  # st.dataframe(real_tmp, column_config=column_config)
411
- show_dataframe_w_format(real_tmp)
412
 
413
 
414
  def make_roc(results, show_text=False):
@@ -420,7 +518,7 @@ def make_roc(results, show_text=False):
420
  .encode(
421
  x=alt.X("FA:Q", title="🧑‍🎤 False Positive Rate", scale=alt.Scale(domain=[0.0, 1.0])),
422
  y=alt.Y("generated_accuracy:Q", title="👤 True Positive Rate", scale=alt.Scale(domain=[0.0, 1.0])),
423
- color=alt.Color('team:N', scale=alt.Scale(scheme=color_map)), # Color by categorical field
424
  size=alt.Size(
425
  "total_time:Q", title="🕒 Inference Time", scale=alt.Scale(rangeMin=100)
426
  ), # Size by quantitative field
@@ -440,7 +538,7 @@ def make_roc(results, show_text=False):
440
  .encode(
441
  x=alt.X("FA:Q", title="🧑‍🎤 False Positive Rate", scale=alt.Scale(domain=[0, 1])),
442
  y=alt.Y("generated_accuracy:Q", title="👤 True Positive Rate", scale=alt.Scale(domain=[0, 1])),
443
- color=alt.Color('team:N', scale=alt.Scale(scheme=color_map)), # Color by categorical field
444
  text="team",
445
  )
446
  )
@@ -469,7 +567,9 @@ def make_acc(results, show_text=False):
469
  title="Balanced Accuracy",
470
  scale=alt.Scale(domain=[0.4, 1]),
471
  ),
472
- color=alt.Color('team:N', scale=alt.Scale(scheme=color_map)), # Color by categorical field # Size by quantitative field
 
 
473
  )
474
  .properties(width=400, height=400, title="Inference Time vs Balanced Accuracy")
475
  )
@@ -492,7 +592,9 @@ def make_acc(results, show_text=False):
492
  title="Balanced Accuracy",
493
  scale=alt.Scale(domain=[0.4, 1]),
494
  ),
495
- color=alt.Color('team:N', scale=alt.Scale(scheme=color_map)), # Color by categorical field # Size by quantitative field
 
 
496
  text="team",
497
  )
498
  )
@@ -529,14 +631,29 @@ def show_augmentations(task, score):
529
  "Accuracy": 0,
530
  "AUC": 1,
531
  }
532
- granularity = st.radio(
533
- "accuracy type",
534
- list(accuracy_types.keys()),
535
- key=f"granularity-{task}-{score}",
536
- horizontal=True,
537
- label_visibility="collapsed",
538
- index=0,
539
- )
540
 
541
  ## Check cases
542
  if accuracy_types[granularity] == 0:
@@ -564,8 +681,20 @@ def show_augmentations(task, score):
564
  if "real_" in c and "accuracy" not in c and "conditional" not in c
565
  ]
566
  tmp = (gen_tmp + real_tmp) / 2.0
567
  # st.dataframe(tmp)
568
- show_dataframe_w_format(tmp)
 
569
 
570
  else:
571
  cols = [c for c in results[f"{split}_{score}_score"].columns if "conditional_auc" in c]
@@ -578,8 +707,20 @@ def show_augmentations(task, score):
578
  tmp.columns = col_names
579
 
580
  "#### Conditional AUC"
 
 
 
 
 
 
 
 
 
 
 
581
  # st.dataframe(tmp)
582
- show_dataframe_w_format(tmp)
 
583
 
584
 
585
  @st.fragment
@@ -658,8 +799,400 @@ updated = get_updated_time()
658
  st.markdown(updated)
659
 
660
 
661
- t1, t2, tp, volume_tab, all_submission_tab = st.tabs(
662
- ["**Task 1**", "**Task 2**","**Pilot Task**", "**Submission Volume**", "**All Submissions**"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
663
  )
664
 
665
  with t1:
@@ -671,6 +1204,10 @@ with t2:
671
  with tp:
672
  "*Detection of Synthetic Video Content. Video files are unmodified from the original output from the models or the real sources.*"
673
  make_plots_for_task(list(TASKS.keys())[0])
 
675
  with volume_tab:
676
  subs = get_volume()
 
13
  "video-challenge-task-1-config": ["source"],
14
  "video-challenge-task-2-config": ["source", "category"],
15
  }
16
+ valid_splits = ["public", "private", "private_only"]
17
+
 
18
 
19
  #####################################################################
20
  ## Data loading ##
 
140
  # if rocs["team"].nunique() > 1:
141
  color_field = "team:N"
142
 
143
+ roc_chart = (
144
+ alt.Chart(rocs)
145
+ .mark_line()
146
+ .encode(
147
+ x="fpr", y="tpr", color=alt.Color(color_field, scale=alt.Scale(scheme=color_map)), detail="submission_id:N"
148
+ )
149
+ )
150
 
151
  return roc_chart
152
 
 
164
 
165
  ## Pull new results or toggle private public if you are an owner
166
  with st.sidebar:
167
+ color_map = st.selectbox("colormap", ["paired", "category20", "category20b", "category20c", "set2", "set3"])
168
 
169
  hf_token = os.getenv("HF_TOKEN")
170
  st.session_state["hf_token"] = hf_token
171
  password = st.text_input("Admin login:", type="password")
172
 
173
+ dataset_options = ["public"]
174
  if password == hf_token:
175
+ dataset_options = ["public", "private", "private_only"]
176
  if st.button("Pull New Results"):
177
  with st.spinner("Pulling new results", show_time=True):
178
  try:
 
195
  except Exception as e:
196
  st.error(f"Error starting background task: {e}")
197
 
198
+ ## Initialize the dataset view state in session_state if it doesn't exist
199
+ if "dataset_view" not in st.session_state:
200
+ st.session_state.dataset_view = "public"
201
+
202
+ # Create the selectbox, ensuring the index is valid
203
+ current_view = st.session_state.dataset_view
204
+ valid_index = dataset_options.index(current_view) if current_view in dataset_options else 0
205
+
206
+ dataset_view = st.selectbox("Dataset View", options=dataset_options, index=valid_index, key="dataset_view")
207
+
208
+ # Display the current dataset view
209
+ if dataset_view == "private":
210
+ st.success("Showing **PRIVATE** scores (all data).")
211
+
212
+ # Visual indicator for admins in the UI
213
+ if password == hf_token:
214
+ st.info("🔐 Admin View: You have access to all data")
215
+
216
+ # Initialize the top_n parameter if not in session_state
217
+ if "top_n_value" not in st.session_state:
218
+ st.session_state.top_n_value = 3
219
+
220
+ # Add a slider to select the number of top elements to average
221
+ top_n_value = st.slider(
222
+ "Mean of top N elements",
223
+ min_value=2,
224
+ max_value=10,
225
+ value=st.session_state.top_n_value,
226
+ step=1,
227
+ help="Calculate the mean of the top N elements in each column",
228
+ key="top_n_value",
229
+ )
230
+ st.session_state["top_n"] = top_n_value
231
+ elif dataset_view == "private_only":
232
+ st.success("Showing **PRIVATE ONLY** scores (excluding public data).")
233
+
234
+ # Visual indicator for admins in the UI
235
+ if password == hf_token:
236
+ st.info("🔒 Admin View: You have access to private-only data")
237
+
238
+ # Initialize the top_n parameter if not in session_state
239
+ if "top_n_value" not in st.session_state:
240
+ st.session_state.top_n_value = 3
241
+
242
+ # Add a slider to select the number of top elements to average
243
+ top_n_value = st.slider(
244
+ "Mean of top N elements",
245
+ min_value=2,
246
+ max_value=10,
247
+ value=st.session_state.top_n_value,
248
+ step=1,
249
+ help="Calculate the mean of the top N elements in each column",
250
+ key="top_n_value",
251
+ )
252
+ st.session_state["top_n"] = top_n_value
253
  else:
254
+ st.info("Showing **PUBLIC** scores.")
255
+ st.session_state["top_n"] = None
256
+
257
+ # Ensure only admin users can access private data
258
+ if dataset_view in ["private", "private_only"] and password == hf_token:
259
+ split = dataset_view
260
+
261
+ # Clear the cache when the dataset view changes
262
+ previous_view = st.session_state.get("previous_dataset_view")
263
+ if previous_view != dataset_view:
264
+ load_results.clear()
265
+ st.session_state["previous_dataset_view"] = dataset_view
266
+ else:
267
+ split = "public"
268
  else:
269
  split = "public"
270
 
271
  st.session_state["split"] = split
272
 
273
 
274
+ def show_dataframe_w_format(df, format="compact", top_n=None):
275
+ """
276
+ Display a dataframe with formatted columns. If in private mode and top_n is provided,
277
+ adds a row showing the mean of the top n values for each column.
278
+
279
+ Args:
280
+ df: Pandas dataframe to display
281
+ format: Format string for number columns (default: "compact")
282
+ top_n: Optional number of top values to average per column
283
+ """
284
+ split = st.session_state.get("split", "public")
285
+
286
+ # Only add top-n mean row in private mode
287
+ if split in ["private", "private_only"] and top_n is not None and isinstance(top_n, int) and top_n > 0:
288
+ # Create a copy to avoid modifying the original
289
+ df_display = df.copy()
290
+
291
+ # Calculate the mean of top n values for each column
292
+ top_n_means = {}
293
+ for col in df.columns:
294
+ sorted_values = df[col].sort_values(ascending=False)
295
+ # Ensure we don't try to take more values than available
296
+ actual_n = min(top_n, len(sorted_values))
297
+ if actual_n > 0:
298
+ top_n_means[col] = sorted_values.iloc[:actual_n].mean()
299
+ else:
300
+ top_n_means[col] = float("nan")
301
+
302
+ # Add the mean row as a new row in the dataframe
303
+ top_n_means_df = pd.DataFrame([top_n_means], index=[f"Top-{top_n} Mean"])
304
+ df_display = pd.concat([top_n_means_df, df_display])
305
+ else:
306
+ df_display = df
307
+
308
+ column_config = {c: st.column_config.NumberColumn(c, format=format) for c in df_display.columns}
309
+ return st.dataframe(df_display, column_config=column_config)
310
+
311
 
312
  @st.fragment
313
  def show_leaderboard(task, score: str = "source"):
314
  split = st.session_state.get("split", "public")
315
  results = load_results(task, best_only=True)
316
  source_split_map = {}
317
+ if split in ["private", "private_only"]:
318
+ _sol_df = pd.read_csv(COMP_CACHE / task / "solution-processed.csv")
319
  pairs_df = _sol_df[["source_og", "split"]].drop_duplicates()
320
  source_split_map = {x: y for x, y in zip(pairs_df["source_og"], pairs_df["split"])}
321
 
 
326
  # "pristine_accuracy",
327
  "auc",
328
  "total_time",
329
+ "datetime",
330
+ "fail_rate",
331
  ]
332
 
333
  column_config = {
 
367
  "🕒 Inference Time (s)",
368
  format="compact",
369
  # pinned=True,
 
370
  # width="small",
371
  ),
372
  "datetime": st.column_config.DatetimeColumn(
 
454
  if accuracy_types[granularity] == 0:
455
  "#### 👤 True Positive Rate | Generated Source"
456
  # st.dataframe(gen_tmp, column_config=column_config)
457
+ top_n = st.session_state.get("top_n", None)
458
+ show_dataframe_w_format(gen_tmp, top_n=top_n)
459
 
460
  "#### 🧑‍🎤 True Negative Rate | Real Source"
461
  # st.dataframe(real_tmp, column_config=column_config)
462
+ show_dataframe_w_format(real_tmp, top_n=top_n)
463
 
464
  elif accuracy_types[granularity] == 1:
465
  "#### 👤 Balanced Accuracy | Generated Source"
466
  tnr = results[f"{split}_{score}_score"].loc[:, ["real_accuracy"]]
467
  gen_tmp[:] = (gen_tmp.values + tnr.values) / 2.0
468
  # st.dataframe(gen_tmp, column_config=column_config)
469
+ top_n = st.session_state.get("top_n", None)
470
+ show_dataframe_w_format(gen_tmp, top_n=top_n)
471
 
472
  "#### 🧑‍🎤 Balanced Accuracy | Real Source"
473
  tpr = results[f"{split}_{score}_score"].loc[:, ["generated_accuracy"]]
474
  real_tmp[:] = (real_tmp.values + tpr.values) / 2.0
475
  # st.dataframe(real_tmp, column_config=column_config)
476
+ show_dataframe_w_format(real_tmp, top_n=top_n)
477
  else:
478
  cols = [c for c in results[f"{split}_{score}_score"].columns if "generated_conditional_auc" in c]
479
  col_names = [
 
502
 
503
  "#### 👤 Conditional AUC | Generated Source"
504
  # st.dataframe(gen_tmp, column_config=column_config)
505
+ top_n = st.session_state.get("top_n", None)
506
+ show_dataframe_w_format(gen_tmp, top_n=top_n)
507
  "#### 🧑‍🎤 Conditional AUC | Real Source"
508
  # st.dataframe(real_tmp, column_config=column_config)
509
+ show_dataframe_w_format(real_tmp, top_n=top_n)
510
 
511
 
512
  def make_roc(results, show_text=False):
 
518
  .encode(
519
  x=alt.X("FA:Q", title="🧑‍🎤 False Positive Rate", scale=alt.Scale(domain=[0.0, 1.0])),
520
  y=alt.Y("generated_accuracy:Q", title="👤 True Positive Rate", scale=alt.Scale(domain=[0.0, 1.0])),
521
+ color=alt.Color("team:N", scale=alt.Scale(scheme=color_map)), # Color by categorical field
522
  size=alt.Size(
523
  "total_time:Q", title="🕒 Inference Time", scale=alt.Scale(rangeMin=100)
524
  ), # Size by quantitative field
 
538
  .encode(
539
  x=alt.X("FA:Q", title="🧑‍🎤 False Positive Rate", scale=alt.Scale(domain=[0, 1])),
540
  y=alt.Y("generated_accuracy:Q", title="👤 True Positive Rate", scale=alt.Scale(domain=[0, 1])),
541
+ color=alt.Color("team:N", scale=alt.Scale(scheme=color_map)), # Color by categorical field
542
  text="team",
543
  )
544
  )
 
567
  title="Balanced Accuracy",
568
  scale=alt.Scale(domain=[0.4, 1]),
569
  ),
570
+ color=alt.Color(
571
+ "team:N", scale=alt.Scale(scheme=color_map)
572
+ ), # Color by categorical field # Size by quantitative field
573
  )
574
  .properties(width=400, height=400, title="Inference Time vs Balanced Accuracy")
575
  )
 
592
  title="Balanced Accuracy",
593
  scale=alt.Scale(domain=[0.4, 1]),
594
  ),
595
+ color=alt.Color(
596
+ "team:N", scale=alt.Scale(scheme=color_map)
597
+ ), # Color by categorical field # Size by quantitative field
598
  text="team",
599
  )
600
  )
 
631
  "Accuracy": 0,
632
  "AUC": 1,
633
  }
634
+
635
+ # Create a row with two columns for controls
636
+ col1, col2 = st.columns([0.1, 0.9])
637
+
638
+ with col1:
639
+ granularity = st.radio(
640
+ "accuracy type",
641
+ list(accuracy_types.keys()),
642
+ key=f"granularity-{task}-{score}",
643
+ horizontal=True,
644
+ label_visibility="collapsed",
645
+ index=0,
646
+ )
647
+
648
+ show_deltas = False
649
+ if split in ["private", "private_only"]:
650
+ with col2:
651
+ # Add toggle for showing deltas from "none" column
652
+ show_deltas = st.toggle(
653
+ "Show deltas from 'none' (higher values mean 'none' was **lower**)",
654
+ value=False,
655
+ key=f"deltas-{task}-{score}",
656
+ )
657
 
658
  ## Check cases
659
  if accuracy_types[granularity] == 0:
 
681
  if "real_" in c and "accuracy" not in c and "conditional" not in c
682
  ]
683
  tmp = (gen_tmp + real_tmp) / 2.0
684
+
685
+ # If toggle is on and "none" column exists, calculate deltas from "none" column
686
+ if show_deltas and "none" in tmp.columns:
687
+ # Get the "none" column values
688
+ none_values = tmp["none"].copy()
689
+
690
+ # Calculate deltas: none - current_column
691
+ for col in tmp.columns:
692
+ if col != "none":
693
+ tmp[col] = -none_values + tmp[col]
694
+
695
  # st.dataframe(tmp)
696
+ top_n = st.session_state.get("top_n", None)
697
+ show_dataframe_w_format(tmp, top_n=top_n)
698
 
699
  else:
700
  cols = [c for c in results[f"{split}_{score}_score"].columns if "conditional_auc" in c]
 
707
  tmp.columns = col_names
708
 
709
  "#### Conditional AUC"
710
+
711
+ # If toggle is on and "none" column exists, calculate deltas from "none" column
712
+ if show_deltas and "none" in tmp.columns:
713
+ # Get the "none" column values
714
+ none_values = tmp["none"].copy()
715
+
716
+ # Calculate deltas: none - current_column
717
+ for col in tmp.columns:
718
+ if col != "none":
719
+ tmp[col] = -none_values + tmp[col]
720
+
721
  # st.dataframe(tmp)
722
+ top_n = st.session_state.get("top_n", None)
723
+ show_dataframe_w_format(tmp, top_n=top_n)
724
 
725
 
726
  @st.fragment
 
799
  st.markdown(updated)
800
 
801
 
802
+ @st.fragment
803
+ def show_task_comparison():
804
+ """Show summary tables for Task 1 and Task 2 side by side."""
805
+ split = st.session_state.get("split", "public")
806
+ color_map_choice = st.session_state.get("colormap", "paired")
807
+
808
+ task1_key = list(TASKS.keys())[1] # video-challenge-task-1-config
809
+ task2_key = list(TASKS.keys())[2] # video-challenge-task-2-config
810
+
811
+ task1_results = load_results(task1_key, best_only=True)
812
+ task2_results = load_results(task2_key, best_only=True)
813
+
814
+ cols = ["balanced_accuracy", "generated_accuracy", "real_accuracy", "auc", "total_time", "datetime", "fail_rate"]
815
+
816
+ column_config = {
817
+ "balanced_accuracy": st.column_config.NumberColumn(
818
+ "⚖️ Balanced Accuracy",
819
+ format="compact",
820
+ min_value=0,
821
+ max_value=1.0,
822
+ ),
823
+ "generated_accuracy": st.column_config.NumberColumn(
824
+ "👤 True Positive Rate",
825
+ format="compact",
826
+ min_value=0,
827
+ max_value=1.0,
828
+ ),
829
+ "real_accuracy": st.column_config.NumberColumn(
830
+ "🧑‍🎤 True Negative Rate",
831
+ format="compact",
832
+ min_value=0,
833
+ max_value=1.0,
834
+ ),
835
+ "auc": st.column_config.NumberColumn(
836
+ "📐 AUC",
837
+ format="compact",
838
+ min_value=0,
839
+ max_value=1.0,
840
+ ),
841
+ "total_time": st.column_config.NumberColumn(
842
+ "🕒 Inference Time (s)",
843
+ format="compact",
844
+ ),
845
+ "datetime": st.column_config.DatetimeColumn(
846
+ "🗓️ Submission Date",
847
+ format="YYYY-MM-DD",
848
+ ),
849
+ "fail_rate": st.column_config.NumberColumn(
850
+ "❌ Fail Rate",
851
+ format="compact",
852
+ ),
853
+ "task1_balanced_accuracy": st.column_config.NumberColumn(
854
+ "⚖️ Task 1 Balanced Accuracy",
855
+ format="compact",
856
+ min_value=0,
857
+ max_value=1.0,
858
+ ),
859
+ "task2_balanced_accuracy": st.column_config.NumberColumn(
860
+ "⚖️ Task 2 Balanced Accuracy",
861
+ format="compact",
862
+ min_value=0,
863
+ max_value=1.0,
864
+ ),
865
+ "difference": st.column_config.NumberColumn(
866
+ "⚖️ Difference (T1-T2)",
867
+ format="compact",
868
+ ),
869
+ "percent_change": st.column_config.NumberColumn(
870
+ "% Change",
871
+ format="+.2%",
872
+ ),
873
+ }
874
+
875
+ # Create tabs for different views
876
+ tables_tab, charts_tab, time_tab = st.tabs(["Tables", "Charts", "Performance Timeline"])
877
+
878
+ with tables_tab:
879
+ # Create two columns for side-by-side tables
880
+ st.subheader("Performance Comparison: Task 1 vs Task 2")
881
+ col1, col2 = st.columns(2)
882
+
883
+ with col1:
884
+ st.subheader("Task 1: Original Content")
885
+ st.dataframe(
886
+ task1_results[f"{split}_source_score"].loc[:, cols],
887
+ column_config=column_config,
888
+ use_container_width=True,
889
+ )
890
+
891
+ with col2:
892
+ st.subheader("Task 2: Post-processed Content")
893
+ st.dataframe(
894
+ task2_results[f"{split}_source_score"].loc[:, cols],
895
+ column_config=column_config,
896
+ use_container_width=True,
897
+ )
898
+
899
+ # Add a section for comparison of task performance differences
900
+ st.subheader("Performance Analysis")
901
+ st.markdown(
902
+ """
903
+ Performance comparison between Task 1 (original content) and
904
+ Task 2 (post-processed content). A positive difference indicates degraded performance
905
+ on post-processed content.
906
+ """
907
+ )
908
+
909
+ # Get the datasets for both tasks
910
+ task1_df = task1_results[f"{split}_source_score"].reset_index()
911
+ task2_df = task2_results[f"{split}_source_score"].reset_index()
912
+
913
+ # Create a combined dataframe for analysis
914
+ common_teams = set(task1_df["team"]) & set(task2_df["team"])
915
+
916
+ if common_teams:
917
+ # Filter to teams that appear in both tasks
918
+ task1_filtered = task1_df[task1_df["team"].isin(common_teams)]
919
+ task2_filtered = task2_df[task2_df["team"].isin(common_teams)]
920
+
921
+ # Create a comparison dataframe
922
+ comparison_df = pd.DataFrame(
923
+ {
924
+ "team": list(common_teams),
925
+ "task1_balanced_accuracy": [
926
+ task1_filtered[task1_filtered["team"] == team]["balanced_accuracy"].values[0]
927
+ for team in common_teams
928
+ ],
929
+ "task2_balanced_accuracy": [
930
+ task2_filtered[task2_filtered["team"] == team]["balanced_accuracy"].values[0]
931
+ for team in common_teams
932
+ ],
933
+ }
934
+ )
935
+
936
+ # Calculate differences and percentage changes
937
+ comparison_df["difference"] = (
938
+ comparison_df["task1_balanced_accuracy"] - comparison_df["task2_balanced_accuracy"]
939
+ )
940
+ comparison_df["percent_change"] = comparison_df["difference"] / comparison_df["task1_balanced_accuracy"]
941
+
942
+ # Sort by the absolute difference (to show biggest performance changes first)
943
+ comparison_df = comparison_df.sort_values(by="difference", ascending=False).reset_index(drop=True)
944
+
945
+ # Display the comparison table
946
+ show_dataframe_w_format(comparison_df, top_n=0)
947
+ else:
948
+ st.warning("No common teams found across both tasks.")
949
+
950
+ with charts_tab:
951
+ st.subheader("Team Performance Across Tasks")
952
+
953
+ # Get the datasets for both tasks if not already done
954
+ if "task1_df" not in locals():
955
+ task1_df = task1_results[f"{split}_source_score"].reset_index()
956
+ task2_df = task2_results[f"{split}_source_score"].reset_index()
957
+ common_teams = set(task1_df["team"]) & set(task2_df["team"])
958
+
959
+ if common_teams:
960
+ # Prepare data for the plot
961
+ plot_data = []
962
+
963
+ for team in common_teams:
964
+ # Get team's balanced accuracy for each task
965
+ task1_acc = task1_df[task1_df["team"] == team]["balanced_accuracy"].values[0]
966
+ task2_acc = task2_df[task2_df["team"] == team]["balanced_accuracy"].values[0]
967
+
968
+ # Add points for Task 1
969
+ plot_data.append({"team": team, "task": "Task 1", "balanced_accuracy": task1_acc})
970
+
971
+ # Add points for Task 2
972
+ plot_data.append({"team": team, "task": "Task 2", "balanced_accuracy": task2_acc})
973
+
974
+ plot_df = pd.DataFrame(plot_data)
975
+
976
+ # Create line chart connecting team performances
977
+ lines = (
978
+ alt.Chart(plot_df)
979
+ .mark_line(point=alt.OverlayMarkDef(filled=True, size=100), strokeDash=[4, 2], strokeWidth=2)
980
+ .encode(
981
+ x=alt.X("task:N", title="Task", sort=["Task 1", "Task 2"]),
982
+ y=alt.Y("balanced_accuracy:Q", title="Balanced Accuracy", scale=alt.Scale(domain=[0.4, 1.0])),
983
+ color=alt.Color(
984
+ "team:N", scale=alt.Scale(scheme=color_map_choice), legend=alt.Legend(title="Teams")
985
+ ),
986
+ tooltip=["team:N", "task:N", "balanced_accuracy:Q"],
987
+ )
988
+ .properties(width=700, height=500, title="Performance Changes Across Tasks")
989
+ )
990
+
991
+ st.altair_chart(lines, use_container_width=False)
992
+ else:
993
+ st.warning("No common teams found across both tasks.")
994
+
995
+ with time_tab:
996
+ st.subheader("Team Performance Timeline")
997
+
998
+ # Get full submission data (not just best_only) to analyze performance over time
999
+ task1_results_full = load_results(task1_key, best_only=False)
1000
+ task2_results_full = load_results(task2_key, best_only=False)
1001
+
1002
+ # We need to select specific task result keys based on what's available
1003
+ task1_result_key = f"{split}_source_score"
1004
+ task2_result_key = f"{split}_source_score"
1005
+
1006
+ # Check if we have data for both tasks
1007
+ if (
1008
+ task1_result_key in task1_results_full
1009
+ and task2_result_key in task2_results_full
1010
+ and not task1_results_full[task1_result_key].empty
1011
+ and not task2_results_full[task2_result_key].empty
1012
+ ):
1013
+
1014
+ # Extract datetime and make it datetime objects
1015
+ task1_time_df = task1_results_full[task1_result_key].reset_index().copy()
1016
+ task2_time_df = task2_results_full[task2_result_key].reset_index().copy()
1017
+
1018
+ # Ensure datetime column exists in both dataframes
1019
+ if "datetime" in task1_time_df.columns and "datetime" in task2_time_df.columns:
1020
+ # Convert string dates to datetime objects if they aren't already
1021
+ if pd.api.types.is_string_dtype(task1_time_df["datetime"]):
1022
+ task1_time_df["datetime"] = pd.to_datetime(task1_time_df["datetime"])
1023
+ if pd.api.types.is_string_dtype(task2_time_df["datetime"]):
1024
+ task2_time_df["datetime"] = pd.to_datetime(task2_time_df["datetime"])
1025
+
1026
+ # Make a list of unique teams across both tasks
1027
+ all_teams = sorted(
1028
+ list(set(list(task1_time_df["team"].unique()) + list(task2_time_df["team"].unique())))
1029
+ )
1030
+
1031
+ # Create a selectbox to select teams to display
1032
+ if len(all_teams) > 10: # If we have many teams, add a filter
1033
+ selected_teams = st.multiselect("Select Teams to Display", all_teams, default=all_teams[:5])
1034
+ if not selected_teams: # Default to first 5 if none selected
1035
+ selected_teams = all_teams[:5]
1036
+ else:
1037
+ selected_teams = all_teams
1038
+
1039
+ # Function to compute running max for each team
1040
+ def compute_running_max(df):
1041
+ # Group by team and sort by datetime
1042
+ result_df = df.copy()
1043
+ for team in result_df["team"].unique():
1044
+ team_data = result_df[result_df["team"] == team].copy()
1045
+ team_data = team_data.sort_values("datetime")
1046
+ # Calculate running maximum
1047
+ team_data["balanced_accuracy"] = team_data["balanced_accuracy"].cummax()
1048
+ # Update the original dataframe
1049
+ result_df.loc[team_data.index, "balanced_accuracy"] = team_data["balanced_accuracy"]
1050
+ return result_df
1051
+
1052
+ # Filter and compute running maximum for each task
1053
+ task1_filtered = task1_time_df[task1_time_df["team"].isin(selected_teams)].copy()
1054
+ task2_filtered = task2_time_df[task2_time_df["team"].isin(selected_teams)].copy()
1055
+
1056
+ if not task1_filtered.empty and not task2_filtered.empty:
1057
+ # Compute running maximum
1058
+ task1_max = compute_running_max(task1_filtered)
1059
+ task2_max = compute_running_max(task2_filtered)
1060
+
1061
+ # Create tabs for the two tasks
1062
+ task1_plot_tab, task2_plot_tab = st.tabs(["Task 1 Timeline", "Task 2 Timeline"])
1063
+
1064
+ # Create plot for Task 1
1065
+ with task1_plot_tab:
1066
+ st.subheader("Task 1: Original Content - Performance Over Time")
1067
+
1068
+ # Calculate max performance for baseline
1069
+ task1_max_performance = task1_time_df[
1070
+ task1_time_df["team"].apply(lambda x: x.lower()).isin(["baseline"])
1071
+ ]["balanced_accuracy"].max()
1072
+
1073
+ # Create baseline data
1074
+ baseline_data = pd.DataFrame(
1075
+ {
1076
+ "datetime": [task1_max["datetime"].min(), task1_max["datetime"].max()],
1077
+ "balanced_accuracy": [task1_max_performance, task1_max_performance],
1078
+ "label": ["Max Performance", "Max Performance"],
1079
+ }
1080
+ )
1081
+
1082
+ # Create baseline chart
1083
+ baseline_chart = (
1084
+ alt.Chart(baseline_data)
1085
+ .mark_line(strokeDash=[4, 4], color="black", strokeWidth=2)
1086
+ .encode(
1087
+ x="datetime:T",
1088
+ y="balanced_accuracy:Q",
1089
+ tooltip=alt.Tooltip("balanced_accuracy:Q", title="Baseline", format=".4f"),
1090
+ )
1091
+ )
1092
+
1093
+ # Create main chart
1094
+ task1_chart = (
1095
+ alt.Chart(task1_max)
1096
+ .mark_line(point=True)
1097
+ .encode(
1098
+ x=alt.X(
1099
+ "datetime:T",
1100
+ title="Submission Date",
1101
+ axis=alt.Axis(format="%b %d"), # Format as "Month Date"
1102
+ ),
1103
+ y=alt.Y(
1104
+ "balanced_accuracy:Q",
1105
+ title="Best Balanced Accuracy",
1106
+ scale=alt.Scale(domain=[0.4, 1.0]),
1107
+ ),
1108
+ color=alt.Color(
1109
+ "team:N", scale=alt.Scale(scheme=color_map_choice), legend=alt.Legend(title="Teams")
1110
+ ),
1111
+ tooltip=[
1112
+ "team:N",
1113
+ alt.Tooltip("datetime:T", title="Date", format="%b %d, %Y"),
1114
+ alt.Tooltip("balanced_accuracy:Q", title="Best Accuracy", format=".4f"),
1115
+ ],
1116
+ )
1117
+ .properties(width=800, height=500, title="Best Performance Over Time (Original Content)")
1118
+ .interactive()
1119
+ )
1120
+
1121
+ # Combine charts and display
1122
+ st.altair_chart(task1_chart + baseline_chart, use_container_width=False)
1123
+
1124
+ # Create plot for Task 2
1125
+ with task2_plot_tab:
1126
+ st.subheader("Task 2: Post-processed Content - Performance Over Time")
1127
+
1128
+ # Calculate max performance for baseline
1129
+ task2_max_performance = task2_time_df[
1130
+ task2_time_df["team"].apply(lambda x: x.lower()).isin(["baseline"])
1131
+ ]["balanced_accuracy"].max()
1132
+
1133
+ # Create baseline data
1134
+ baseline_data = pd.DataFrame(
1135
+ {
1136
+ "datetime": [task2_max["datetime"].min(), task2_max["datetime"].max()],
1137
+ "balanced_accuracy": [task2_max_performance, task2_max_performance],
1138
+ "label": ["Max Performance", "Max Performance"],
1139
+ }
1140
+ )
1141
+
1142
+ # Create baseline chart
1143
+ baseline_chart = (
1144
+ alt.Chart(baseline_data)
1145
+ .mark_line(strokeDash=[4, 4], color="black", strokeWidth=2)
1146
+ .encode(
1147
+ x="datetime:T",
1148
+ y="balanced_accuracy:Q",
1149
+ tooltip=alt.Tooltip("balanced_accuracy:Q", title="Baseline", format=".4f"),
1150
+ )
1151
+ )
1152
+
1153
+ # Create main chart
1154
+ task2_chart = (
1155
+ alt.Chart(task2_max)
1156
+ .mark_line(point=True)
1157
+ .encode(
1158
+ x=alt.X(
1159
+ "datetime:T",
1160
+ title="Submission Date",
1161
+ axis=alt.Axis(format="%b %d"), # Format as "Month Date"
1162
+ ),
1163
+ y=alt.Y(
1164
+ "balanced_accuracy:Q",
1165
+ title="Best Balanced Accuracy",
1166
+ scale=alt.Scale(domain=[0.4, 1.0]),
1167
+ ),
1168
+ color=alt.Color(
1169
+ "team:N", scale=alt.Scale(scheme=color_map_choice), legend=alt.Legend(title="Teams")
1170
+ ),
1171
+ tooltip=[
1172
+ "team:N",
1173
+ alt.Tooltip("datetime:T", title="Date", format="%b %d, %Y"),
1174
+ alt.Tooltip("balanced_accuracy:Q", title="Best Accuracy", format=".4f"),
1175
+ ],
1176
+ )
1177
+ .properties(
1178
+ width=800, height=500, title="Best Performance Over Time (Post-Processed Content)"
1179
+ )
1180
+ .interactive()
1181
+ )
1182
+
1183
+ # Combine charts and display
1184
+ st.altair_chart(task2_chart + baseline_chart, use_container_width=False)
1185
+
1186
+ else:
1187
+ st.warning("No data available for selected teams.")
1188
+ else:
1189
+ st.warning("Datetime information is not available in the dataset.")
1190
+ else:
1191
+ st.warning("Historical performance data is not available for both tasks.")
1192
+
1193
+
1194
+ t1, t2, tp, comparison_tab, volume_tab, all_submission_tab = st.tabs(
1195
+ ["**Task 1**", "**Task 2**", "**Pilot Task**", "**Compare Tasks**", "**Submission Volume**", "**All Submissions**"]
1196
  )
1197
 
1198
  with t1:
 
1204
  with tp:
1205
  "*Detection of Synthetic Video Content. Video files are unmodified from the original output from the models or the real sources.*"
1206
  make_plots_for_task(list(TASKS.keys())[0])
1207
+ if split in ["private", "private_only"]:
1208
+ with comparison_tab:
1209
+ "**Task 1 to Task 2 performance comparison.**"
1210
+ show_task_comparison()
1211
 
1212
  with volume_tab:
1213
  subs = get_volume()
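Within that refactor, `show_dataframe_w_format` now prepends a "Top-N Mean" row whenever a private view is active. A standalone sketch of just the pandas part (the Streamlit display and the split check are dropped so it runs on its own; the frame and values here are illustrative, not competition data):

```python
import pandas as pd

def prepend_top_n_mean(df: pd.DataFrame, top_n: int) -> pd.DataFrame:
    """Prepend a row holding the mean of the top-N values of each column."""
    top_n_means = {}
    for col in df.columns:
        sorted_values = df[col].sort_values(ascending=False)
        actual_n = min(top_n, len(sorted_values))  # never request more rows than exist
        top_n_means[col] = sorted_values.iloc[:actual_n].mean() if actual_n > 0 else float("nan")
    mean_row = pd.DataFrame([top_n_means], index=[f"Top-{top_n} Mean"])
    return pd.concat([mean_row, df])

scores = pd.DataFrame(
    {"none": [0.91, 0.85, 0.78], "blur": [0.80, 0.76, 0.74]},
    index=["team_a", "team_b", "team_c"],
)
print(prepend_top_n_mean(scores, top_n=2))  # mean of the two best values per column
```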
metric.py CHANGED
@@ -147,13 +147,15 @@ def _metric(
147
  ## Save data split
148
  evaluation["public_score"]["proportion"] = len(solution_df.query(f"split=='public'").copy()) / len(solution_df)
149
  evaluation["private_score"]["proportion"] = 1.0
 
150
 
151
- ## Public and private split
152
  public_df = solution_df.query("split=='public'").copy()
153
  private_df = solution_df.copy()
 
154
 
155
  ## Loop
156
- for split, dataframe in zip(["public", "private"], [public_df, private_df]):
157
  metrics = compute_metrics(
158
  df=dataframe.copy(), score_name=score_name if split == "public" else f"{score_name}_og", use_all=use_all
159
  )
 
147
  ## Save data split
148
  evaluation["public_score"]["proportion"] = len(solution_df.query(f"split=='public'").copy()) / len(solution_df)
149
  evaluation["private_score"]["proportion"] = 1.0
150
+ evaluation["private_only_score"]["proportion"] = len(solution_df.query(f"split=='private'").copy()) / len(solution_df)
151
 
152
+ ## Public, private, and private_only split
153
  public_df = solution_df.query("split=='public'").copy()
154
  private_df = solution_df.copy()
155
+ private_only_df = solution_df.query("split=='private'").copy()
156
 
157
  ## Loop
158
+ for split, dataframe in zip(["public", "private", "private_only"], [public_df, private_df, private_only_df]):
159
  metrics = compute_metrics(
160
  df=dataframe.copy(), score_name=score_name if split == "public" else f"{score_name}_og", use_all=use_all
161
  )
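The metric.py change runs the same scorer over a third slice of the solution file. A self-contained sketch of that bookkeeping, with a stub standing in for the real `compute_metrics` and a toy `solution_df` (both are stand-ins, not the actual competition data):

```python
import pandas as pd

def compute_metrics(df: pd.DataFrame, score_name: str) -> dict:
    # Stub in place of the real scorer defined elsewhere in metric.py.
    return {"n_rows": len(df), "scored_column": score_name}

solution_df = pd.DataFrame({
    "split": ["public", "public", "private", "private", "private"],
    "label": [1, 0, 1, 1, 0],
})

evaluation = {f"{k}_score": {} for k in ("public", "private", "private_only")}
evaluation["public_score"]["proportion"] = len(solution_df.query("split=='public'")) / len(solution_df)
evaluation["private_score"]["proportion"] = 1.0  # the "private" view still scores every row
evaluation["private_only_score"]["proportion"] = len(solution_df.query("split=='private'")) / len(solution_df)

public_df = solution_df.query("split=='public'").copy()
private_df = solution_df.copy()                          # full dataset
private_only_df = solution_df.query("split=='private'").copy()

score_name = "source"
for split, dataframe in zip(["public", "private", "private_only"],
                            [public_df, private_df, private_only_df]):
    # Public predictions are scored on `source`; the other views use `source_og`.
    col = score_name if split == "public" else f"{score_name}_og"
    evaluation[f"{split}_score"].update(compute_metrics(dataframe.copy(), col))

print(evaluation)
```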
utils.py CHANGED
@@ -22,7 +22,7 @@ def download_competition_data(competition_names: List[str]) -> None:
22
  local_dir=os.path.join(COMP_CACHE, repo_id),
23
  repo_type="dataset",
24
  token=os.environ.get("HF_TOKEN"),
25
- ignore_patterns="submission_logs/*"
26
  )
27
 
28
 
@@ -142,7 +142,7 @@ def extract_roc(results: Dict[str, Any]) -> Dict[str, Any]:
142
  return new
143
 
144
 
145
- def add_custom_submission(path_to_cache, path_to_subfile, threshold = 0):
146
  import pandas as pd
147
  import json
148
 
@@ -153,7 +153,7 @@ def add_custom_submission(path_to_cache, path_to_subfile, threshold = 0):
153
 
154
  team_id = "insiders-id-1-2-3"
155
  team_name = "insiders"
156
- submission_id = f"sub{threshold}".replace(".","")
157
 
158
  ## update teams
159
  teams = json.load(open(path_to_cache + "/teams.json"))
@@ -169,20 +169,22 @@ def add_custom_submission(path_to_cache, path_to_subfile, threshold = 0):
169
  if os.path.exists(submission_info_file):
170
  temp = json.load(open(submission_info_file))
171
  else:
172
- temp = {"id": team_id,"submissions": []}
173
-
174
- temp["submissions"].append({
175
- "datetime": "2025-09-22 14:42:14",
176
- "submission_id": submission_id,
177
- "submission_comment": "",
178
- "submission_repo": "",
179
- "space_id": "",
180
- "submitted_by": "na",
181
- "status": 3,
182
- "selected": True,
183
- "public_score": {},
184
- "private_score": {},
185
- })
 
 
186
 
187
  with open(submission_info_file, "w") as f:
188
  json.dump(temp, f)
@@ -191,11 +193,16 @@ def add_custom_submission(path_to_cache, path_to_subfile, threshold = 0):
191
  path_to_cache + f"/submissions/{team_id}-{submission_id}.csv", index=False
192
  )
193
 
 
194
  def create_custom_subs():
195
  import numpy as np
196
- for threshold in np.linspace(-6,0,10):
197
- add_custom_submission(path_to_cache="competition_cache/safe-challenge/video-challenge-task-1-config",
198
- path_to_subfile="competition_cache/custom/Scores-DSRI-brian.txt", threshold=threshold)
199
 
200
 
201
  if __name__ == "__main__":
@@ -206,15 +213,11 @@ if __name__ == "__main__":
206
  "safe-challenge/video-challenge-task-1-config",
207
  "safe-challenge/video-challenge-task-2-config",
208
  ]
209
- download_competition_data(competition_names=spaces)
210
 
211
-
212
  if os.environ.get("MAKE_CUSTOM"):
213
  print("adding custom subs")
214
  create_custom_subs()
215
-
216
-
217
-
218
 
219
  ## Loop
220
  for space in spaces:
@@ -263,7 +266,7 @@ if __name__ == "__main__":
263
  scores = ["source"]
264
  for score_name in scores:
265
  ## Loop and save by team
266
- public, private, rocs = [], [], []
267
  # for team_id, submission_set in submissions.items():
268
  for team_id, submission_set_ids in submission_summaries.query("status_reason=='SUCCESS'").groupby(
269
  "team_id"
@@ -298,6 +301,11 @@ if __name__ == "__main__":
298
  for key, value in results.items()
299
  if key in team_submissions
300
  }
301
 
302
  ## Add timing
303
  public_times = {
@@ -312,10 +320,18 @@ if __name__ == "__main__":
312
  ["submission_id", "private_time"]
313
  ].to_dict(orient="records")
314
  }
315
  for key in public_results.keys():
316
  public_results[key]["total_time"] = public_times[key]
317
  for key in private_results.keys():
318
  private_results[key]["total_time"] = private_times[key]
 
 
319
 
320
  ## Roc computations
321
  roc_results = {
@@ -425,9 +441,45 @@ if __name__ == "__main__":
425
  )
426
  private.append(private_df)
427
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
428
  ## Save as csvs
429
  public = pd.concat(public, axis=0, ignore_index=True).sort_values(by="balanced_accuracy", ascending=False)
430
  private = pd.concat(private, axis=0, ignore_index=True).sort_values(by="balanced_accuracy", ascending=False)
431
  rocs = pd.concat(rocs, axis=0, ignore_index=True).explode(["tpr", "fpr", "threshold"], ignore_index=True)
432
  public.to_csv(
433
  Path("competition_cache")
@@ -441,6 +493,13 @@ if __name__ == "__main__":
441
  / f"{str(local_dir).split('/')[-1]}_{score_name}_private_score.csv",
442
  index=False,
443
  )
444
  rocs.to_csv(
445
  Path("competition_cache") / "cached_results" / f"{str(local_dir).split('/')[-1]}_{score_name}_rocs.csv",
446
  index=False,
 
22
  local_dir=os.path.join(COMP_CACHE, repo_id),
23
  repo_type="dataset",
24
  token=os.environ.get("HF_TOKEN"),
25
+ ignore_patterns="submission_logs/*",
26
  )
27
 
28
 
 
142
  return new
143
 
144
 
145
+ def add_custom_submission(path_to_cache, path_to_subfile, threshold=0):
146
  import pandas as pd
147
  import json
148
 
 
153
 
154
  team_id = "insiders-id-1-2-3"
155
  team_name = "insiders"
156
+ submission_id = f"sub{threshold}".replace(".", "")
157
 
158
  ## update teams
159
  teams = json.load(open(path_to_cache + "/teams.json"))
 
169
  if os.path.exists(submission_info_file):
170
  temp = json.load(open(submission_info_file))
171
  else:
172
+ temp = {"id": team_id, "submissions": []}
173
+
174
+ temp["submissions"].append(
175
+ {
176
+ "datetime": "2025-09-22 14:42:14",
177
+ "submission_id": submission_id,
178
+ "submission_comment": "",
179
+ "submission_repo": "",
180
+ "space_id": "",
181
+ "submitted_by": "na",
182
+ "status": 3,
183
+ "selected": True,
184
+ "public_score": {},
185
+ "private_score": {},
186
+ }
187
+ )
188
 
189
  with open(submission_info_file, "w") as f:
190
  json.dump(temp, f)
 
193
  path_to_cache + f"/submissions/{team_id}-{submission_id}.csv", index=False
194
  )
195
 
196
+
197
  def create_custom_subs():
198
  import numpy as np
199
+
200
+ for threshold in np.linspace(-6, 0, 10):
201
+ add_custom_submission(
202
+ path_to_cache="competition_cache/safe-challenge/video-challenge-task-1-config",
203
+ path_to_subfile="competition_cache/custom/Scores-DSRI-brian.txt",
204
+ threshold=threshold,
205
+ )
206
 
207
 
208
  if __name__ == "__main__":
 
213
  "safe-challenge/video-challenge-task-1-config",
214
  "safe-challenge/video-challenge-task-2-config",
215
  ]
216
+ # download_competition_data(competition_names=spaces)
217
 
 
218
  if os.environ.get("MAKE_CUSTOM"):
219
  print("adding custom subs")
220
  create_custom_subs()
 
 
 
221
 
222
  ## Loop
223
  for space in spaces:
 
266
  scores = ["source"]
267
  for score_name in scores:
268
  ## Loop and save by team
269
+ public, private, private_only, rocs = [], [], [], []
270
  # for team_id, submission_set in submissions.items():
271
  for team_id, submission_set_ids in submission_summaries.query("status_reason=='SUCCESS'").groupby(
272
  "team_id"
 
301
  for key, value in results.items()
302
  if key in team_submissions
303
  }
304
+ private_only_results = {
305
+ key: prep_private(value["private_only_score"])
306
+ for key, value in results.items()
307
+ if key in team_submissions
308
+ }
309
 
310
  ## Add timing
311
  public_times = {
 
320
  ["submission_id", "private_time"]
321
  ].to_dict(orient="records")
322
  }
323
+ private_only_times = {
324
+ x["submission_id"]: x["private_time"] - x["public_time"]
325
+ for x in submission_summaries[submission_summaries["submission_id"].isin(results.keys())][
326
+ ["submission_id", "private_time", "public_time"]
327
+ ].to_dict(orient="records")
328
+ }
329
  for key in public_results.keys():
330
  public_results[key]["total_time"] = public_times[key]
331
  for key in private_results.keys():
332
  private_results[key]["total_time"] = private_times[key]
333
+ for key in private_only_results.keys():
334
+ private_only_results[key]["total_time"] = private_only_times[key]
335
 
336
  ## Roc computations
337
  roc_results = {
 
441
  )
442
  private.append(private_df)
443
 
444
+ ## Private ONLY results
445
+ private_only_df = pd.json_normalize(private_only_results.values())
446
+ private_only_df.insert(
447
+ loc=0,
448
+ column="submission_id",
449
+ value=list(private_only_results.keys()),
450
+ )
451
+ private_only_df.insert(
452
+ loc=0,
453
+ column="team",
454
+ value=[
455
+ teams[teams.id == member_map[team_submissions[submission_id]]].name.values[0]
456
+ for submission_id in private_only_results.keys()
457
+ ],
458
+ )
459
+ private_only_df.insert(
460
+ loc=0,
461
+ column="team_id",
462
+ value=[
463
+ teams[teams.id == member_map[team_submissions[submission_id]]].id.values[0]
464
+ for submission_id in private_only_results.keys()
465
+ ],
466
+ )
467
+ private_only_df.insert(
468
+ loc=0,
469
+ column="datetime",
470
+ value=[
471
+ submission_summaries[submission_summaries.submission_id == submission_id].datetime.values[0]
472
+ for submission_id in private_only_results.keys()
473
+ ],
474
+ )
475
+ private_only.append(private_only_df)
476
+
477
  ## Save as csvs
478
  public = pd.concat(public, axis=0, ignore_index=True).sort_values(by="balanced_accuracy", ascending=False)
479
  private = pd.concat(private, axis=0, ignore_index=True).sort_values(by="balanced_accuracy", ascending=False)
480
+ private_only = pd.concat(private_only, axis=0, ignore_index=True).sort_values(
481
+ by="balanced_accuracy", ascending=False
482
+ )
483
  rocs = pd.concat(rocs, axis=0, ignore_index=True).explode(["tpr", "fpr", "threshold"], ignore_index=True)
484
  public.to_csv(
485
  Path("competition_cache")
 
493
  / f"{str(local_dir).split('/')[-1]}_{score_name}_private_score.csv",
494
  index=False,
495
  )
496
+ private_only.to_csv(
497
+ Path("competition_cache")
498
+ / "cached_results"
499
+ / f"{str(local_dir).split('/')[-1]}_{score_name}_private_only_score.csv",
500
+ index=False,
501
+ )
502
+
503
  rocs.to_csv(
504
  Path("competition_cache") / "cached_results" / f"{str(local_dir).split('/')[-1]}_{score_name}_rocs.csv",
505
  index=False,