gmb/updates (#7)
Refactor leaderboard (bdb48d3594e180083fa5c65bbe7c2f839a810ff9)
app.py
CHANGED
@@ -13,9 +13,8 @@ TASKS = {
     "video-challenge-task-1-config": ["source"],
     "video-challenge-task-2-config": ["source", "category"],
 }
-valid_splits = ["public", "private"]
-
-color_map =st.selectbox("colormap",["paired","category20","category20b","category20c","set2","set3"])
+valid_splits = ["public", "private", "private_only"]
+
 
 #####################################################################
 ## Data loading ##
@@ -141,7 +140,13 @@ def make_roc_curves(task, submission_ids):
     # if rocs["team"].nunique() > 1:
     color_field = "team:N"
 
-    roc_chart = …
+    roc_chart = (
+        alt.Chart(rocs)
+        .mark_line()
+        .encode(
+            x="fpr", y="tpr", color=alt.Color(color_field, scale=alt.Scale(scheme=color_map)), detail="submission_id:N"
+        )
+    )
 
     return roc_chart
 
@@ -159,12 +164,15 @@ st.set_page_config(
 
 ## Pull new results or toggle private public if you are an owner
 with st.sidebar:
+    color_map = st.selectbox("colormap", ["paired", "category20", "category20b", "category20c", "set2", "set3"])
 
     hf_token = os.getenv("HF_TOKEN")
    st.session_state["hf_token"] = hf_token
    password = st.text_input("Admin login:", type="password")
 
+    dataset_options = ["public"]
    if password == hf_token:
+        dataset_options = ["public", "private", "private_only"]
        if st.button("Pull New Results"):
            with st.spinner("Pulling new results", show_time=True):
                try:
@@ -187,39 +195,127 @@ with st.sidebar:
                except Exception as e:
                    st.error(f"Error starting background task: {e}")
 
-        ## Initialize the …
-        if "…
-            st.session_state.…
-
-        # Create the …
-        …
+        ## Initialize the dataset view state in session_state if it doesn't exist
+        if "dataset_view" not in st.session_state:
+            st.session_state.dataset_view = "public"
+
+        # Create the selectbox, ensuring the index is valid
+        current_view = st.session_state.dataset_view
+        valid_index = dataset_options.index(current_view) if current_view in dataset_options else 0
+
+        dataset_view = st.selectbox("Dataset View", options=dataset_options, index=valid_index, key="dataset_view")
+
+        # Display the current dataset view
+        if dataset_view == "private":
+            st.success("Showing **PRIVATE** scores (all data).")
+
+            # Visual indicator for admins in the UI
+            if password == hf_token:
+                st.info("🔐 Admin View: You have access to all data")
+
+            # Initialize the top_n parameter if not in session_state
+            if "top_n_value" not in st.session_state:
+                st.session_state.top_n_value = 3
+
+            # Add a slider to select the number of top elements to average
+            top_n_value = st.slider(
+                "Mean of top N elements",
+                min_value=2,
+                max_value=10,
+                value=st.session_state.top_n_value,
+                step=1,
+                help="Calculate the mean of the top N elements in each column",
+                key="top_n_value",
+            )
+            st.session_state["top_n"] = top_n_value
+        elif dataset_view == "private_only":
+            st.success("Showing **PRIVATE ONLY** scores (excluding public data).")
+
+            # Visual indicator for admins in the UI
+            if password == hf_token:
+                st.info("🔒 Admin View: You have access to private-only data")
+
+            # Initialize the top_n parameter if not in session_state
+            if "top_n_value" not in st.session_state:
+                st.session_state.top_n_value = 3
+
+            # Add a slider to select the number of top elements to average
+            top_n_value = st.slider(
+                "Mean of top N elements",
+                min_value=2,
+                max_value=10,
+                value=st.session_state.top_n_value,
+                step=1,
+                help="Calculate the mean of the top N elements in each column",
+                key="top_n_value",
+            )
+            st.session_state["top_n"] = top_n_value
        else:
-            st.…
-            …
+            st.info("Showing **PUBLIC** scores.")
+            st.session_state["top_n"] = None
+
+        # Ensure only admin users can access private data
+        if dataset_view in ["private", "private_only"] and password == hf_token:
+            split = dataset_view
+
+            # Clear the cache when the dataset view changes
+            previous_view = st.session_state.get("previous_dataset_view")
+            if previous_view != dataset_view:
+                load_results.clear()
+            st.session_state["previous_dataset_view"] = dataset_view
+        else:
+            split = "public"
    else:
        split = "public"
 
    st.session_state["split"] = split
 
 
-def show_dataframe_w_format(df,format="compact"):
-    …
+def show_dataframe_w_format(df, format="compact", top_n=None):
+    """
+    Display a dataframe with formatted columns. If in private mode and top_n is provided,
+    adds a row showing the mean of the top n values for each column.
+
+    Args:
+        df: Pandas dataframe to display
+        format: Format string for number columns (default: "compact")
+        top_n: Optional number of top values to average per column
+    """
+    split = st.session_state.get("split", "public")
+
+    # Only add top-n mean row in private mode
+    if split in ["private", "private_only"] and top_n is not None and isinstance(top_n, int) and top_n > 0:
+        # Create a copy to avoid modifying the original
+        df_display = df.copy()
+
+        # Calculate the mean of top n values for each column
+        top_n_means = {}
+        for col in df.columns:
+            sorted_values = df[col].sort_values(ascending=False)
+            # Ensure we don't try to take more values than available
+            actual_n = min(top_n, len(sorted_values))
+            if actual_n > 0:
+                top_n_means[col] = sorted_values.iloc[:actual_n].mean()
+            else:
+                top_n_means[col] = float("nan")
+
+        # Add the mean row as a new row in the dataframe
+        top_n_means_df = pd.DataFrame([top_n_means], index=[f"Top-{top_n} Mean"])
+        df_display = pd.concat([top_n_means_df, df_display])
+    else:
+        df_display = df
+
+    column_config = {c: st.column_config.NumberColumn(c, format=format) for c in df_display.columns}
+    return st.dataframe(df_display, column_config=column_config)
+
 
 @st.fragment
 def show_leaderboard(task, score: str = "source"):
    split = st.session_state.get("split", "public")
    results = load_results(task, best_only=True)
    source_split_map = {}
-    if split …
-        _sol_df = pd.read_csv(COMP_CACHE / task / "solution.csv")
+    if split in ["private", "private_only"]:
+        _sol_df = pd.read_csv(COMP_CACHE / task / "solution-processed.csv")
        pairs_df = _sol_df[["source_og", "split"]].drop_duplicates()
        source_split_map = {x: y for x, y in zip(pairs_df["source_og"], pairs_df["split"])}
 
@@ -230,8 +326,8 @@ def show_leaderboard(task, score: str = "source"):
        # "pristine_accuracy",
        "auc",
        "total_time",
-        …
-        "fail_rate"
+        "datetime",
+        "fail_rate",
    ]
 
    column_config = {
@@ -271,7 +367,6 @@ def show_leaderboard(task, score: str = "source"):
            "🕒 Inference Time (s)",
            format="compact",
            # pinned=True,
-            …
            # width="small",
        ),
        "datetime": st.column_config.DatetimeColumn(
@@ -359,24 +454,26 @@ def show_leaderboard(task, score: str = "source"):
    if accuracy_types[granularity] == 0:
        "#### 👤 True Positive Rate | Generated Source"
        # st.dataframe(gen_tmp, column_config=column_config)
-        …
+        top_n = st.session_state.get("top_n", None)
+        show_dataframe_w_format(gen_tmp, top_n=top_n)
 
        "#### 🧑‍🎤 True Negative Rate | Real Source"
        # st.dataframe(real_tmp, column_config=column_config)
-        show_dataframe_w_format(real_tmp)
+        show_dataframe_w_format(real_tmp, top_n=top_n)
 
    elif accuracy_types[granularity] == 1:
        "#### 👤 Balanced Accuracy | Generated Source"
        tnr = results[f"{split}_{score}_score"].loc[:, ["real_accuracy"]]
        gen_tmp[:] = (gen_tmp.values + tnr.values) / 2.0
        # st.dataframe(gen_tmp, column_config=column_config)
-        …
+        top_n = st.session_state.get("top_n", None)
+        show_dataframe_w_format(gen_tmp, top_n=top_n)
 
        "#### 🧑‍🎤 Balanced Accuracy | Real Source"
        tpr = results[f"{split}_{score}_score"].loc[:, ["generated_accuracy"]]
        real_tmp[:] = (real_tmp.values + tpr.values) / 2.0
        # st.dataframe(real_tmp, column_config=column_config)
-        show_dataframe_w_format(real_tmp)
+        show_dataframe_w_format(real_tmp, top_n=top_n)
    else:
        cols = [c for c in results[f"{split}_{score}_score"].columns if "generated_conditional_auc" in c]
        col_names = [
@@ -405,10 +502,11 @@ def show_leaderboard(task, score: str = "source"):
 
        "#### 👤 Conditional AUC | Generated Source"
        # st.dataframe(gen_tmp, column_config=column_config)
-        …
+        top_n = st.session_state.get("top_n", None)
+        show_dataframe_w_format(gen_tmp, top_n=top_n)
        "#### 🧑‍🎤 Conditional AUC | Real Source"
        # st.dataframe(real_tmp, column_config=column_config)
-        show_dataframe_w_format(real_tmp)
+        show_dataframe_w_format(real_tmp, top_n=top_n)
 
 
 def make_roc(results, show_text=False):
@@ -420,7 +518,7 @@ def make_roc(results, show_text=False):
        .encode(
            x=alt.X("FA:Q", title="🧑‍🎤 False Positive Rate", scale=alt.Scale(domain=[0.0, 1.0])),
            y=alt.Y("generated_accuracy:Q", title="👤 True Positive Rate", scale=alt.Scale(domain=[0.0, 1.0])),
-            color=alt.Color(…
+            color=alt.Color("team:N", scale=alt.Scale(scheme=color_map)),  # Color by categorical field
            size=alt.Size(
                "total_time:Q", title="🕒 Inference Time", scale=alt.Scale(rangeMin=100)
            ),  # Size by quantitative field
@@ -440,7 +538,7 @@ def make_roc(results, show_text=False):
        .encode(
            x=alt.X("FA:Q", title="🧑‍🎤 False Positive Rate", scale=alt.Scale(domain=[0, 1])),
            y=alt.Y("generated_accuracy:Q", title="👤 True Positive Rate", scale=alt.Scale(domain=[0, 1])),
-            color=alt.Color(…
+            color=alt.Color("team:N", scale=alt.Scale(scheme=color_map)),  # Color by categorical field
            text="team",
        )
    )
@@ -469,7 +567,9 @@ def make_acc(results, show_text=False):
                title="Balanced Accuracy",
                scale=alt.Scale(domain=[0.4, 1]),
            ),
-            color=alt.Color(…
+            color=alt.Color(
+                "team:N", scale=alt.Scale(scheme=color_map)
+            ),  # Color by categorical field # Size by quantitative field
        )
        .properties(width=400, height=400, title="Inference Time vs Balanced Accuracy")
    )
@@ -492,7 +592,9 @@ def make_acc(results, show_text=False):
                title="Balanced Accuracy",
                scale=alt.Scale(domain=[0.4, 1]),
            ),
-            color=alt.Color(…
+            color=alt.Color(
+                "team:N", scale=alt.Scale(scheme=color_map)
+            ),  # Color by categorical field # Size by quantitative field
            text="team",
        )
    )
@@ -529,14 +631,29 @@ def show_augmentations(task, score):
        "Accuracy": 0,
        "AUC": 1,
    }
-    …
+
+    # Create a row with two columns for controls
+    col1, col2 = st.columns([0.1, 0.9])
+
+    with col1:
+        granularity = st.radio(
+            "accuracy type",
+            list(accuracy_types.keys()),
+            key=f"granularity-{task}-{score}",
+            horizontal=True,
+            label_visibility="collapsed",
+            index=0,
+        )
+
+    show_deltas = False
+    if split in ["private", "private_only"]:
+        with col2:
+            # Add toggle for showing deltas from "none" column
+            show_deltas = st.toggle(
+                "Show deltas from 'none' (higher values mean 'none' was **lower**)",
+                value=False,
+                key=f"deltas-{task}-{score}",
+            )
 
    ## Check cases
    if accuracy_types[granularity] == 0:
@@ -564,8 +681,20 @@ def show_augmentations(task, score):
            if "real_" in c and "accuracy" not in c and "conditional" not in c
        ]
        tmp = (gen_tmp + real_tmp) / 2.0
+
+        # If toggle is on and "none" column exists, calculate deltas from "none" column
+        if show_deltas and "none" in tmp.columns:
+            # Get the "none" column values
+            none_values = tmp["none"].copy()
+
+            # Calculate deltas: current_column - none
+            for col in tmp.columns:
+                if col != "none":
+                    tmp[col] = -none_values + tmp[col]
+
        # st.dataframe(tmp)
-        …
+        top_n = st.session_state.get("top_n", None)
+        show_dataframe_w_format(tmp, top_n=top_n)
 
    else:
        cols = [c for c in results[f"{split}_{score}_score"].columns if "conditional_auc" in c]
@@ -578,8 +707,20 @@ def show_augmentations(task, score):
        tmp.columns = col_names
 
        "#### Conditional AUC"
+
+        # If toggle is on and "none" column exists, calculate deltas from "none" column
+        if show_deltas and "none" in tmp.columns:
+            # Get the "none" column values
+            none_values = tmp["none"].copy()
+
+            # Calculate deltas: current_column - none
+            for col in tmp.columns:
+                if col != "none":
+                    tmp[col] = -none_values + tmp[col]
+
        # st.dataframe(tmp)
-        …
+        top_n = st.session_state.get("top_n", None)
+        show_dataframe_w_format(tmp, top_n=top_n)
 
 
 @st.fragment
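A minimal standalone sketch of the delta computation these two hunks add, run against a hypothetical augmentation table (in the app, tmp holds per-augmentation scores and "none" is the unaugmented column; the names and values below are made up):

import pandas as pd

tmp = pd.DataFrame({"none": [0.90, 0.80], "compression": [0.85, 0.70], "blur": [0.88, 0.78]})

# Same arithmetic as the diff: each column becomes its difference from "none".
none_values = tmp["none"].copy()
for col in tmp.columns:
    if col != "none":
        tmp[col] = -none_values + tmp[col]

print(tmp)  # negative values mean the augmentation lowered the score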
@@ -658,8 +799,400 @@ updated = get_updated_time()
 st.markdown(updated)
 
 
-…
-…
+@st.fragment
+def show_task_comparison():
+    """Show summary tables for Task 1 and Task 2 side by side."""
+    split = st.session_state.get("split", "public")
+    color_map_choice = st.session_state.get("colormap", "paired")
+
+    task1_key = list(TASKS.keys())[1]  # video-challenge-task-1-config
+    task2_key = list(TASKS.keys())[2]  # video-challenge-task-2-config
+
+    task1_results = load_results(task1_key, best_only=True)
+    task2_results = load_results(task2_key, best_only=True)
+
+    cols = ["balanced_accuracy", "generated_accuracy", "real_accuracy", "auc", "total_time", "datetime", "fail_rate"]
+
+    column_config = {
+        "balanced_accuracy": st.column_config.NumberColumn(
+            "⚖️ Balanced Accuracy",
+            format="compact",
+            min_value=0,
+            max_value=1.0,
+        ),
+        "generated_accuracy": st.column_config.NumberColumn(
+            "👤 True Positive Rate",
+            format="compact",
+            min_value=0,
+            max_value=1.0,
+        ),
+        "real_accuracy": st.column_config.NumberColumn(
+            "🧑‍🎤 True Negative Rate",
+            format="compact",
+            min_value=0,
+            max_value=1.0,
+        ),
+        "auc": st.column_config.NumberColumn(
+            "📐 AUC",
+            format="compact",
+            min_value=0,
+            max_value=1.0,
+        ),
+        "total_time": st.column_config.NumberColumn(
+            "🕒 Inference Time (s)",
+            format="compact",
+        ),
+        "datetime": st.column_config.DatetimeColumn(
+            "🗓️ Submission Date",
+            format="YYYY-MM-DD",
+        ),
+        "fail_rate": st.column_config.NumberColumn(
+            "❌ Fail Rate",
+            format="compact",
+        ),
+        "task1_balanced_accuracy": st.column_config.NumberColumn(
+            "⚖️ Task 1 Balanced Accuracy",
+            format="compact",
+            min_value=0,
+            max_value=1.0,
+        ),
+        "task2_balanced_accuracy": st.column_config.NumberColumn(
+            "⚖️ Task 2 Balanced Accuracy",
+            format="compact",
+            min_value=0,
+            max_value=1.0,
+        ),
+        "difference": st.column_config.NumberColumn(
+            "⚖️ Difference (T1-T2)",
+            format="compact",
+        ),
+        "percent_change": st.column_config.NumberColumn(
+            "% Change",
+            format="+.2%",
+        ),
+    }
+
+    # Create tabs for different views
+    tables_tab, charts_tab, time_tab = st.tabs(["Tables", "Charts", "Performance Timeline"])
+
+    with tables_tab:
+        # Create two columns for side-by-side tables
+        st.subheader("Performance Comparison: Task 1 vs Task 2")
+        col1, col2 = st.columns(2)
+
+        with col1:
+            st.subheader("Task 1: Original Content")
+            st.dataframe(
+                task1_results[f"{split}_source_score"].loc[:, cols],
+                column_config=column_config,
+                use_container_width=True,
+            )
+
+        with col2:
+            st.subheader("Task 2: Post-processed Content")
+            st.dataframe(
+                task2_results[f"{split}_source_score"].loc[:, cols],
+                column_config=column_config,
+                use_container_width=True,
+            )
+
+        # Add a section for comparison of task performance differences
+        st.subheader("Performance Analysis")
+        st.markdown(
+            """
+            Performance comparison between Task 1 (original content) and
+            Task 2 (post-processed content). A positive difference indicates degraded performance
+            on post-processed content.
+            """
+        )
+
+        # Get the datasets for both tasks
+        task1_df = task1_results[f"{split}_source_score"].reset_index()
+        task2_df = task2_results[f"{split}_source_score"].reset_index()
+
+        # Create a combined dataframe for analysis
+        common_teams = set(task1_df["team"]) & set(task2_df["team"])
+
+        if common_teams:
+            # Filter to teams that appear in both tasks
+            task1_filtered = task1_df[task1_df["team"].isin(common_teams)]
+            task2_filtered = task2_df[task2_df["team"].isin(common_teams)]
+
+            # Create a comparison dataframe
+            comparison_df = pd.DataFrame(
+                {
+                    "team": list(common_teams),
+                    "task1_balanced_accuracy": [
+                        task1_filtered[task1_filtered["team"] == team]["balanced_accuracy"].values[0]
+                        for team in common_teams
+                    ],
+                    "task2_balanced_accuracy": [
+                        task2_filtered[task2_filtered["team"] == team]["balanced_accuracy"].values[0]
+                        for team in common_teams
+                    ],
+                }
+            )
+
+            # Calculate differences and percentage changes
+            comparison_df["difference"] = (
+                comparison_df["task1_balanced_accuracy"] - comparison_df["task2_balanced_accuracy"]
+            )
+            comparison_df["percent_change"] = comparison_df["difference"] / comparison_df["task1_balanced_accuracy"]
+
+            # Sort by the absolute difference (to show biggest performance changes first)
+            comparison_df = comparison_df.sort_values(by="difference", ascending=False).reset_index(drop=True)
+
+            # Display the comparison table
+            show_dataframe_w_format(comparison_df, top_n=0)
+        else:
+            st.warning("No common teams found across both tasks.")
+
+    with charts_tab:
+        st.subheader("Team Performance Across Tasks")
+
+        # Get the datasets for both tasks if not already done
+        if "task1_df" not in locals():
+            task1_df = task1_results[f"{split}_source_score"].reset_index()
+            task2_df = task2_results[f"{split}_source_score"].reset_index()
+            common_teams = set(task1_df["team"]) & set(task2_df["team"])
+
+        if common_teams:
+            # Prepare data for the plot
+            plot_data = []
+
+            for team in common_teams:
+                # Get team's balanced accuracy for each task
+                task1_acc = task1_df[task1_df["team"] == team]["balanced_accuracy"].values[0]
+                task2_acc = task2_df[task2_df["team"] == team]["balanced_accuracy"].values[0]
+
+                # Add points for Task 1
+                plot_data.append({"team": team, "task": "Task 1", "balanced_accuracy": task1_acc})
+
+                # Add points for Task 2
+                plot_data.append({"team": team, "task": "Task 2", "balanced_accuracy": task2_acc})
+
+            plot_df = pd.DataFrame(plot_data)
+
+            # Create line chart connecting team performances
+            lines = (
+                alt.Chart(plot_df)
+                .mark_line(point=alt.OverlayMarkDef(filled=True, size=100), strokeDash=[4, 2], strokeWidth=2)
+                .encode(
+                    x=alt.X("task:N", title="Task", sort=["Task 1", "Task 2"]),
+                    y=alt.Y("balanced_accuracy:Q", title="Balanced Accuracy", scale=alt.Scale(domain=[0.4, 1.0])),
+                    color=alt.Color(
+                        "team:N", scale=alt.Scale(scheme=color_map_choice), legend=alt.Legend(title="Teams")
+                    ),
+                    tooltip=["team:N", "task:N", "balanced_accuracy:Q"],
+                )
+                .properties(width=700, height=500, title="Performance Changes Across Tasks")
+            )
+
+            st.altair_chart(lines, use_container_width=False)
+        else:
+            st.warning("No common teams found across both tasks.")
+
+    with time_tab:
+        st.subheader("Team Performance Timeline")
+
+        # Get full submission data (not just best_only) to analyze performance over time
+        task1_results_full = load_results(task1_key, best_only=False)
+        task2_results_full = load_results(task2_key, best_only=False)
+
+        # We need to select specific task result keys based on what's available
+        task1_result_key = f"{split}_source_score"
+        task2_result_key = f"{split}_source_score"
+
+        # Check if we have data for both tasks
+        if (
+            task1_result_key in task1_results_full
+            and task2_result_key in task2_results_full
+            and not task1_results_full[task1_result_key].empty
+            and not task2_results_full[task2_result_key].empty
+        ):
+
+            # Extract datetime and make it datetime objects
+            task1_time_df = task1_results_full[task1_result_key].reset_index().copy()
+            task2_time_df = task2_results_full[task2_result_key].reset_index().copy()
+
+            # Ensure datetime column exists in both dataframes
+            if "datetime" in task1_time_df.columns and "datetime" in task2_time_df.columns:
+                # Convert string dates to datetime objects if they aren't already
+                if pd.api.types.is_string_dtype(task1_time_df["datetime"]):
+                    task1_time_df["datetime"] = pd.to_datetime(task1_time_df["datetime"])
+                if pd.api.types.is_string_dtype(task2_time_df["datetime"]):
+                    task2_time_df["datetime"] = pd.to_datetime(task2_time_df["datetime"])
+
+                # Make a list of unique teams across both tasks
+                all_teams = sorted(
+                    list(set(list(task1_time_df["team"].unique()) + list(task2_time_df["team"].unique())))
+                )
+
+                # Create a selectbox to select teams to display
+                if len(all_teams) > 10:  # If we have many teams, add a filter
+                    selected_teams = st.multiselect("Select Teams to Display", all_teams, default=all_teams[:5])
+                    if not selected_teams:  # Default to first 5 if none selected
+                        selected_teams = all_teams[:5]
+                else:
+                    selected_teams = all_teams
+
+                # Function to compute running max for each team
+                def compute_running_max(df):
+                    # Group by team and sort by datetime
+                    result_df = df.copy()
+                    for team in result_df["team"].unique():
+                        team_data = result_df[result_df["team"] == team].copy()
+                        team_data = team_data.sort_values("datetime")
+                        # Calculate running maximum
+                        team_data["balanced_accuracy"] = team_data["balanced_accuracy"].cummax()
+                        # Update the original dataframe
+                        result_df.loc[team_data.index, "balanced_accuracy"] = team_data["balanced_accuracy"]
+                    return result_df
+
+                # Filter and compute running maximum for each task
+                task1_filtered = task1_time_df[task1_time_df["team"].isin(selected_teams)].copy()
+                task2_filtered = task2_time_df[task2_time_df["team"].isin(selected_teams)].copy()
+
+                if not task1_filtered.empty and not task2_filtered.empty:
+                    # Compute running maximum
+                    task1_max = compute_running_max(task1_filtered)
+                    task2_max = compute_running_max(task2_filtered)
+
+                    # Create tabs for the two tasks
+                    task1_plot_tab, task2_plot_tab = st.tabs(["Task 1 Timeline", "Task 2 Timeline"])
+
+                    # Create plot for Task 1
+                    with task1_plot_tab:
+                        st.subheader("Task 1: Original Content - Performance Over Time")
+
+                        # Calculate max performance for baseline
+                        task1_max_performance = task1_time_df[
+                            task1_time_df["team"].apply(lambda x: x.lower()).isin(["baseline"])
+                        ]["balanced_accuracy"].max()
+
+                        # Create baseline data
+                        baseline_data = pd.DataFrame(
+                            {
+                                "datetime": [task1_max["datetime"].min(), task1_max["datetime"].max()],
+                                "balanced_accuracy": [task1_max_performance, task1_max_performance],
+                                "label": ["Max Performance", "Max Performance"],
+                            }
+                        )
+
+                        # Create baseline chart
+                        baseline_chart = (
+                            alt.Chart(baseline_data)
+                            .mark_line(strokeDash=[4, 4], color="black", strokeWidth=2)
+                            .encode(
+                                x="datetime:T",
+                                y="balanced_accuracy:Q",
+                                tooltip=alt.Tooltip("balanced_accuracy:Q", title="Baseline", format=".4f"),
+                            )
+                        )
+
+                        # Create main chart
+                        task1_chart = (
+                            alt.Chart(task1_max)
+                            .mark_line(point=True)
+                            .encode(
+                                x=alt.X(
+                                    "datetime:T",
+                                    title="Submission Date",
+                                    axis=alt.Axis(format="%b %d"),  # Format as "Month Date"
+                                ),
+                                y=alt.Y(
+                                    "balanced_accuracy:Q",
+                                    title="Best Balanced Accuracy",
+                                    scale=alt.Scale(domain=[0.4, 1.0]),
+                                ),
+                                color=alt.Color(
+                                    "team:N", scale=alt.Scale(scheme=color_map_choice), legend=alt.Legend(title="Teams")
+                                ),
+                                tooltip=[
+                                    "team:N",
+                                    alt.Tooltip("datetime:T", title="Date", format="%b %d, %Y"),
+                                    alt.Tooltip("balanced_accuracy:Q", title="Best Accuracy", format=".4f"),
+                                ],
+                            )
+                            .properties(width=800, height=500, title="Best Performance Over Time (Original Content)")
+                            .interactive()
+                        )
+
+                        # Combine charts and display
+                        st.altair_chart(task1_chart + baseline_chart, use_container_width=False)
+
+                    # Create plot for Task 2
+                    with task2_plot_tab:
+                        st.subheader("Task 2: Post-processed Content - Performance Over Time")
+
+                        # Calculate max performance for baseline
+                        task2_max_performance = task2_time_df[
+                            task2_time_df["team"].apply(lambda x: x.lower()).isin(["baseline"])
+                        ]["balanced_accuracy"].max()
+
+                        # Create baseline data
+                        baseline_data = pd.DataFrame(
+                            {
+                                "datetime": [task2_max["datetime"].min(), task2_max["datetime"].max()],
+                                "balanced_accuracy": [task2_max_performance, task2_max_performance],
+                                "label": ["Max Performance", "Max Performance"],
+                            }
+                        )
+
+                        # Create baseline chart
+                        baseline_chart = (
+                            alt.Chart(baseline_data)
+                            .mark_line(strokeDash=[4, 4], color="black", strokeWidth=2)
+                            .encode(
+                                x="datetime:T",
+                                y="balanced_accuracy:Q",
+                                tooltip=alt.Tooltip("balanced_accuracy:Q", title="Baseline", format=".4f"),
+                            )
+                        )
+
+                        # Create main chart
+                        task2_chart = (
+                            alt.Chart(task2_max)
+                            .mark_line(point=True)
+                            .encode(
+                                x=alt.X(
+                                    "datetime:T",
+                                    title="Submission Date",
+                                    axis=alt.Axis(format="%b %d"),  # Format as "Month Date"
+                                ),
+                                y=alt.Y(
+                                    "balanced_accuracy:Q",
+                                    title="Best Balanced Accuracy",
+                                    scale=alt.Scale(domain=[0.4, 1.0]),
+                                ),
+                                color=alt.Color(
+                                    "team:N", scale=alt.Scale(scheme=color_map_choice), legend=alt.Legend(title="Teams")
+                                ),
+                                tooltip=[
+                                    "team:N",
+                                    alt.Tooltip("datetime:T", title="Date", format="%b %d, %Y"),
+                                    alt.Tooltip("balanced_accuracy:Q", title="Best Accuracy", format=".4f"),
+                                ],
+                            )
+                            .properties(
+                                width=800, height=500, title="Best Performance Over Time (Post-Processed Content)"
+                            )
+                            .interactive()
+                        )
+
+                        # Combine charts and display
+                        st.altair_chart(task2_chart + baseline_chart, use_container_width=False)
+
+                else:
+                    st.warning("No data available for selected teams.")
+            else:
+                st.warning("Datetime information is not available in the dataset.")
+        else:
+            st.warning("Historical performance data is not available for both tasks.")
+
+
+t1, t2, tp, comparison_tab, volume_tab, all_submission_tab = st.tabs(
+    ["**Task 1**", "**Task 2**", "**Pilot Task**", "**Compare Tasks**", "**Submission Volume**", "**All Submissions**"]
 )
 
 with t1:
@@ -671,6 +1204,10 @@ with t2:
 with tp:
    "*Detection of Synthetic Video Content. Video files are unmodified from the original output from the models or the real sources.*"
    make_plots_for_task(list(TASKS.keys())[0])
+if split in ["private", "private_only"]:
+    with comparison_tab:
+        "**Task 1 to Task 2 performance comparison.**"
+        show_task_comparison()
 
 with volume_tab:
    subs = get_volume()
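The timeline charts above rely on compute_running_max to turn raw submission history into best-so-far curves. A minimal sketch of the same idea using a groupby cummax (an equivalent of the loop in the diff), on hypothetical data:

import pandas as pd

history = pd.DataFrame(
    {
        "team": ["a", "a", "a", "b", "b"],
        "datetime": pd.to_datetime(["2025-01-01", "2025-01-03", "2025-01-05", "2025-01-02", "2025-01-04"]),
        "balanced_accuracy": [0.70, 0.65, 0.80, 0.60, 0.75],
    }
)

# Per team, sort by time and keep the best score seen so far.
history = history.sort_values(["team", "datetime"])
history["balanced_accuracy"] = history.groupby("team")["balanced_accuracy"].cummax()
print(history)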
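The top_n path added to show_dataframe_w_format reduces to a short pandas idiom; a minimal sketch on a hypothetical leaderboard frame (team names and scores invented for illustration):

import pandas as pd

df = pd.DataFrame(
    {"balanced_accuracy": [0.91, 0.84, 0.77], "auc": [0.95, 0.88, 0.81]},
    index=["team_a", "team_b", "team_c"],
)

top_n = 2
# Mean of the top-n values per column, prepended as a summary row (as in the diff).
top_n_means = {c: df[c].sort_values(ascending=False).iloc[:top_n].mean() for c in df.columns}
print(pd.concat([pd.DataFrame([top_n_means], index=[f"Top-{top_n} Mean"]), df]))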
metric.py
CHANGED
@@ -147,13 +147,15 @@ def _metric(
     ## Save data split
     evaluation["public_score"]["proportion"] = len(solution_df.query(f"split=='public'").copy()) / len(solution_df)
     evaluation["private_score"]["proportion"] = 1.0
+    evaluation["private_only_score"]["proportion"] = len(solution_df.query(f"split=='private'").copy()) / len(solution_df)
 
-    ## Public and …
+    ## Public, private, and private_only split
     public_df = solution_df.query("split=='public'").copy()
     private_df = solution_df.copy()
+    private_only_df = solution_df.query("split=='private'").copy()
 
     ## Loop
-    for split, dataframe in zip(["public", "private"], [public_df, private_df]):
+    for split, dataframe in zip(["public", "private", "private_only"], [public_df, private_df, private_only_df]):
         metrics = compute_metrics(
             df=dataframe.copy(), score_name=score_name if split == "public" else f"{score_name}_og", use_all=use_all
         )
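A toy illustration of the three views _metric now scores, with a hypothetical solution dataframe (only the split column matters here):

import pandas as pd

solution_df = pd.DataFrame({"split": ["public", "public", "private", "private", "private"]})

public_df = solution_df.query("split=='public'").copy()         # public rows only
private_df = solution_df.copy()                                 # "private" scores every row
private_only_df = solution_df.query("split=='private'").copy()  # new: private rows only

print(len(public_df) / len(solution_df))        # 0.4 -> public proportion
print(len(private_only_df) / len(solution_df))  # 0.6 -> private_only proportion; private stays 1.0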
utils.py
CHANGED
@@ -22,7 +22,7 @@ def download_competition_data(competition_names: List[str]) -> None:
         local_dir=os.path.join(COMP_CACHE, repo_id),
         repo_type="dataset",
         token=os.environ.get("HF_TOKEN"),
-        ignore_patterns="submission_logs/*"
+        ignore_patterns="submission_logs/*",
     )
 
 
@@ -142,7 +142,7 @@ def extract_roc(results: Dict[str, Any]) -> Dict[str, Any]:
     return new
 
 
-def add_custom_submission(path_to_cache, path_to_subfile, threshold = 0):
+def add_custom_submission(path_to_cache, path_to_subfile, threshold=0):
     import pandas as pd
     import json
 
@@ -153,7 +153,7 @@ def add_custom_submission(path_to_cache, path_to_subfile, threshold = 0):
 
     team_id = "insiders-id-1-2-3"
     team_name = "insiders"
-    submission_id = f"sub{threshold}".replace(".","")
+    submission_id = f"sub{threshold}".replace(".", "")
 
     ## update teams
     teams = json.load(open(path_to_cache + "/teams.json"))
@@ -169,20 +169,22 @@ def add_custom_submission(path_to_cache, path_to_subfile, threshold = 0):
     if os.path.exists(submission_info_file):
         temp = json.load(open(submission_info_file))
     else:
-        temp = {"id": team_id,"submissions": []}
-
-    temp["submissions"].append(
-        …
-    )
+        temp = {"id": team_id, "submissions": []}
+
+    temp["submissions"].append(
+        {
+            "datetime": "2025-09-22 14:42:14",
+            "submission_id": submission_id,
+            "submission_comment": "",
+            "submission_repo": "",
+            "space_id": "",
+            "submitted_by": "na",
+            "status": 3,
+            "selected": True,
+            "public_score": {},
+            "private_score": {},
+        }
+    )
 
     with open(submission_info_file, "w") as f:
         json.dump(temp, f)
@@ -191,11 +193,16 @@ def add_custom_submission(path_to_cache, path_to_subfile, threshold = 0):
         path_to_cache + f"/submissions/{team_id}-{submission_id}.csv", index=False
     )
 
+
 def create_custom_subs():
     import numpy as np
-    …
-    …
-    …
+
+    for threshold in np.linspace(-6, 0, 10):
+        add_custom_submission(
+            path_to_cache="competition_cache/safe-challenge/video-challenge-task-1-config",
+            path_to_subfile="competition_cache/custom/Scores-DSRI-brian.txt",
+            threshold=threshold,
+        )
 
 
 if __name__ == "__main__":
@@ -206,15 +213,11 @@ if __name__ == "__main__":
         "safe-challenge/video-challenge-task-1-config",
         "safe-challenge/video-challenge-task-2-config",
     ]
-    download_competition_data(competition_names=spaces)
+    # download_competition_data(competition_names=spaces)
 
-
     if os.environ.get("MAKE_CUSTOM"):
         print("adding custom subs")
         create_custom_subs()
-
-
-
 
     ## Loop
     for space in spaces:
@@ -263,7 +266,7 @@ if __name__ == "__main__":
     scores = ["source"]
     for score_name in scores:
         ## Loop and save by team
-        public, private, rocs = [], [], []
+        public, private, private_only, rocs = [], [], [], []
         # for team_id, submission_set in submissions.items():
         for team_id, submission_set_ids in submission_summaries.query("status_reason=='SUCCESS'").groupby(
             "team_id"
@@ -298,6 +301,11 @@ if __name__ == "__main__":
                 for key, value in results.items()
                 if key in team_submissions
             }
+            private_only_results = {
+                key: prep_private(value["private_only_score"])
+                for key, value in results.items()
+                if key in team_submissions
+            }
 
             ## Add timing
             public_times = {
@@ -312,10 +320,18 @@ if __name__ == "__main__":
                     ["submission_id", "private_time"]
                 ].to_dict(orient="records")
             }
+            private_only_times = {
+                x["submission_id"]: x["private_time"] - x["public_time"]
+                for x in submission_summaries[submission_summaries["submission_id"].isin(results.keys())][
+                    ["submission_id", "private_time", "public_time"]
+                ].to_dict(orient="records")
+            }
             for key in public_results.keys():
                 public_results[key]["total_time"] = public_times[key]
             for key in private_results.keys():
                 private_results[key]["total_time"] = private_times[key]
+            for key in private_only_results.keys():
+                private_only_results[key]["total_time"] = private_only_times[key]
 
             ## Roc computations
             roc_results = {
@@ -425,9 +441,45 @@ if __name__ == "__main__":
             )
             private.append(private_df)
 
+            ## Private ONLY results
+            private_only_df = pd.json_normalize(private_only_results.values())
+            private_only_df.insert(
+                loc=0,
+                column="submission_id",
+                value=list(private_only_results.keys()),
+            )
+            private_only_df.insert(
+                loc=0,
+                column="team",
+                value=[
+                    teams[teams.id == member_map[team_submissions[submission_id]]].name.values[0]
+                    for submission_id in private_only_results.keys()
+                ],
+            )
+            private_only_df.insert(
+                loc=0,
+                column="team_id",
+                value=[
+                    teams[teams.id == member_map[team_submissions[submission_id]]].id.values[0]
+                    for submission_id in private_only_results.keys()
+                ],
+            )
+            private_only_df.insert(
+                loc=0,
+                column="datetime",
+                value=[
+                    submission_summaries[submission_summaries.submission_id == submission_id].datetime.values[0]
+                    for submission_id in private_only_results.keys()
+                ],
+            )
+            private_only.append(private_only_df)
+
         ## Save as csvs
         public = pd.concat(public, axis=0, ignore_index=True).sort_values(by="balanced_accuracy", ascending=False)
         private = pd.concat(private, axis=0, ignore_index=True).sort_values(by="balanced_accuracy", ascending=False)
+        private_only = pd.concat(private_only, axis=0, ignore_index=True).sort_values(
+            by="balanced_accuracy", ascending=False
+        )
         rocs = pd.concat(rocs, axis=0, ignore_index=True).explode(["tpr", "fpr", "threshold"], ignore_index=True)
         public.to_csv(
             Path("competition_cache")
@@ -441,6 +493,13 @@ if __name__ == "__main__":
             / f"{str(local_dir).split('/')[-1]}_{score_name}_private_score.csv",
             index=False,
         )
+        private_only.to_csv(
+            Path("competition_cache")
+            / "cached_results"
+            / f"{str(local_dir).split('/')[-1]}_{score_name}_private_only_score.csv",
+            index=False,
+        )
+
         rocs.to_csv(
             Path("competition_cache") / "cached_results" / f"{str(local_dir).split('/')[-1]}_{score_name}_rocs.csv",
             index=False,
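The private_only timing in utils.py is derived rather than measured: the private pass covers all rows, so the private-only share is private_time minus public_time. A sketch with made-up records:

records = [
    {"submission_id": "sub-1", "public_time": 120.0, "private_time": 300.0},
    {"submission_id": "sub-2", "public_time": 95.5, "private_time": 240.5},
]
private_only_times = {r["submission_id"]: r["private_time"] - r["public_time"] for r in records}
print(private_only_times)  # {'sub-1': 180.0, 'sub-2': 145.0}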