gmancino-ball committed
Commit fba09af · verified · 1 Parent(s): 4e59e27

gmb/updates (#7)


- Refactor leaderboard (bdb48d3594e180083fa5c65bbe7c2f839a810ff9)

Files changed (3)
  1. app.py +586 -49
  2. metric.py +4 -2
  3. utils.py +85 -26
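Most of the app.py change replaces the old public/private toggle with a three-way dataset view (`public`, `private`, `private_only`) that only the admin login can switch away from `public`, plus a "top N" slider feeding the new table formatting. A condensed sketch of that gating, distilled from the diff below (the real code wraps this in `with st.sidebar:`, keeps the selection in `st.session_state`, and clears the `load_results` cache when the view changes; the sketch only assumes the standard Streamlit widgets already used in the file):

```python
import os
import streamlit as st

password = st.sidebar.text_input("Admin login:", type="password")
is_admin = password == os.getenv("HF_TOKEN")

# Non-admins only ever see the public leaderboard.
dataset_options = ["public", "private", "private_only"] if is_admin else ["public"]
dataset_view = st.sidebar.selectbox("Dataset View", options=dataset_options)

if dataset_view in ["private", "private_only"] and is_admin:
    split = dataset_view
    # The mean-of-top-N row is only offered on the private views.
    st.session_state["top_n"] = st.sidebar.slider("Mean of top N elements", 2, 10, 3)
else:
    split = "public"
    st.session_state["top_n"] = None

st.session_state["split"] = split
```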
app.py CHANGED
@@ -13,9 +13,8 @@ TASKS = {
13
  "video-challenge-task-1-config": ["source"],
14
  "video-challenge-task-2-config": ["source", "category"],
15
  }
16
- valid_splits = ["public", "private"]
17
- with st.sidebar:
18
- color_map =st.selectbox("colormap",["paired","category20","category20b","category20c","set2","set3"])
19
 
20
  #####################################################################
21
  ## Data loading ##
@@ -141,7 +140,13 @@ def make_roc_curves(task, submission_ids):
141
  # if rocs["team"].nunique() > 1:
142
  color_field = "team:N"
143
 
144
- roc_chart = alt.Chart(rocs).mark_line().encode(x="fpr", y="tpr", color=alt.Color(color_field, scale=alt.Scale(scheme=color_map)), detail="submission_id:N")
145
 
146
  return roc_chart
147
 
@@ -159,12 +164,15 @@ st.set_page_config(
159
 
160
  ## Pull new results or toggle private public if you are an owner
161
  with st.sidebar:
 
162
 
163
  hf_token = os.getenv("HF_TOKEN")
164
  st.session_state["hf_token"] = hf_token
165
  password = st.text_input("Admin login:", type="password")
166
 
 
167
  if password == hf_token:
 
168
  if st.button("Pull New Results"):
169
  with st.spinner("Pulling new results", show_time=True):
170
  try:
@@ -187,39 +195,127 @@ with st.sidebar:
187
  except Exception as e:
188
  st.error(f"Error starting background task: {e}")
189
 
190
- ## Initialize the toggle state in session_state if it doesn't exist
191
- if "private_view" not in st.session_state:
192
- st.session_state.private_view = False
193
-
194
- # Create the toggle widget
195
- # The 'value' parameter sets the initial state, here linked to session_state
196
- # The 'key' parameter is crucial for identifying the widget across reruns and linking to session_state
197
- toggle_value = st.toggle("Private Scores", value=st.session_state.private_view, key="private_view")
198
-
199
- # The 'toggle_value' variable will hold the current state of the toggle (True or False)
200
- if toggle_value:
201
- st.write("Showing **PRIVATE** scores.")
202
  else:
203
- st.write("Showing **PUBLIC** scores.")
204
-
205
- split = "public" if not toggle_value else "private"
 
 
 
 
 
 
 
 
 
 
 
206
  else:
207
  split = "public"
208
 
209
  st.session_state["split"] = split
210
 
211
 
212
- def show_dataframe_w_format(df,format="compact"):
213
- column_config = {c: st.column_config.NumberColumn(c,format=format) for c in df.columns}
214
- return st.dataframe(df,column_config=column_config)
215
 
216
  @st.fragment
217
  def show_leaderboard(task, score: str = "source"):
218
  split = st.session_state.get("split", "public")
219
  results = load_results(task, best_only=True)
220
  source_split_map = {}
221
- if split == "private":
222
- _sol_df = pd.read_csv(COMP_CACHE / task / "solution.csv")
223
  pairs_df = _sol_df[["source_og", "split"]].drop_duplicates()
224
  source_split_map = {x: y for x, y in zip(pairs_df["source_og"], pairs_df["split"])}
225
 
@@ -230,8 +326,8 @@ def show_leaderboard(task, score: str = "source"):
230
  # "pristine_accuracy",
231
  "auc",
232
  "total_time",
233
- "datetime",
234
- "fail_rate"
235
  ]
236
 
237
  column_config = {
@@ -271,7 +367,6 @@ def show_leaderboard(task, score: str = "source"):
271
  "🕒 Inference Time (s)",
272
  format="compact",
273
  # pinned=True,
274
-
275
  # width="small",
276
  ),
277
  "datetime": st.column_config.DatetimeColumn(
@@ -359,24 +454,26 @@ def show_leaderboard(task, score: str = "source"):
359
  if accuracy_types[granularity] == 0:
360
  "#### 👤 True Positive Rate | Generated Source"
361
  # st.dataframe(gen_tmp, column_config=column_config)
362
- show_dataframe_w_format(gen_tmp)
 
363
 
364
  "#### 🧑‍🎤 True Negative Rate | Real Source"
365
  # st.dataframe(real_tmp, column_config=column_config)
366
- show_dataframe_w_format(real_tmp)
367
 
368
  elif accuracy_types[granularity] == 1:
369
  "#### 👤 Balanced Accuracy | Generated Source"
370
  tnr = results[f"{split}_{score}_score"].loc[:, ["real_accuracy"]]
371
  gen_tmp[:] = (gen_tmp.values + tnr.values) / 2.0
372
  # st.dataframe(gen_tmp, column_config=column_config)
373
- show_dataframe_w_format(gen_tmp)
 
374
 
375
  "#### 🧑‍🎤 Balanced Accuracy | Real Source"
376
  tpr = results[f"{split}_{score}_score"].loc[:, ["generated_accuracy"]]
377
  real_tmp[:] = (real_tmp.values + tpr.values) / 2.0
378
  # st.dataframe(real_tmp, column_config=column_config)
379
- show_dataframe_w_format(real_tmp)
380
  else:
381
  cols = [c for c in results[f"{split}_{score}_score"].columns if "generated_conditional_auc" in c]
382
  col_names = [
@@ -405,10 +502,11 @@ def show_leaderboard(task, score: str = "source"):
405
 
406
  "#### 👤 Conditional AUC | Generated Source"
407
  # st.dataframe(gen_tmp, column_config=column_config)
408
- show_dataframe_w_format(gen_tmp)
 
409
  "#### 🧑‍🎤 Conditional AUC | Real Source"
410
  # st.dataframe(real_tmp, column_config=column_config)
411
- show_dataframe_w_format(real_tmp)
412
 
413
 
414
  def make_roc(results, show_text=False):
@@ -420,7 +518,7 @@ def make_roc(results, show_text=False):
420
  .encode(
421
  x=alt.X("FA:Q", title="🧑‍🎤 False Positive Rate", scale=alt.Scale(domain=[0.0, 1.0])),
422
  y=alt.Y("generated_accuracy:Q", title="👤 True Positive Rate", scale=alt.Scale(domain=[0.0, 1.0])),
423
- color=alt.Color('team:N', scale=alt.Scale(scheme=color_map)), # Color by categorical field
424
  size=alt.Size(
425
  "total_time:Q", title="🕒 Inference Time", scale=alt.Scale(rangeMin=100)
426
  ), # Size by quantitative field
@@ -440,7 +538,7 @@ def make_roc(results, show_text=False):
440
  .encode(
441
  x=alt.X("FA:Q", title="🧑‍🎤 False Positive Rate", scale=alt.Scale(domain=[0, 1])),
442
  y=alt.Y("generated_accuracy:Q", title="👤 True Positive Rate", scale=alt.Scale(domain=[0, 1])),
443
- color=alt.Color('team:N', scale=alt.Scale(scheme=color_map)), # Color by categorical field
444
  text="team",
445
  )
446
  )
@@ -469,7 +567,9 @@ def make_acc(results, show_text=False):
469
  title="Balanced Accuracy",
470
  scale=alt.Scale(domain=[0.4, 1]),
471
  ),
472
- color=alt.Color('team:N', scale=alt.Scale(scheme=color_map)), # Color by categorical field # Size by quantitative field
 
 
473
  )
474
  .properties(width=400, height=400, title="Inference Time vs Balanced Accuracy")
475
  )
@@ -492,7 +592,9 @@ def make_acc(results, show_text=False):
492
  title="Balanced Accuracy",
493
  scale=alt.Scale(domain=[0.4, 1]),
494
  ),
495
- color=alt.Color('team:N', scale=alt.Scale(scheme=color_map)), # Color by categorical field # Size by quantitative field
 
 
496
  text="team",
497
  )
498
  )
@@ -529,14 +631,29 @@ def show_augmentations(task, score):
529
  "Accuracy": 0,
530
  "AUC": 1,
531
  }
532
- granularity = st.radio(
533
- "accuracy type",
534
- list(accuracy_types.keys()),
535
- key=f"granularity-{task}-{score}",
536
- horizontal=True,
537
- label_visibility="collapsed",
538
- index=0,
539
- )
540
 
541
  ## Check cases
542
  if accuracy_types[granularity] == 0:
@@ -564,8 +681,20 @@ def show_augmentations(task, score):
564
  if "real_" in c and "accuracy" not in c and "conditional" not in c
565
  ]
566
  tmp = (gen_tmp + real_tmp) / 2.0
567
  # st.dataframe(tmp)
568
- show_dataframe_w_format(tmp)
 
569
 
570
  else:
571
  cols = [c for c in results[f"{split}_{score}_score"].columns if "conditional_auc" in c]
@@ -578,8 +707,20 @@ def show_augmentations(task, score):
578
  tmp.columns = col_names
579
 
580
  "#### Conditional AUC"
 
 
 
 
 
 
 
 
 
 
 
581
  # st.dataframe(tmp)
582
- show_dataframe_w_format(tmp)
 
583
 
584
 
585
  @st.fragment
@@ -658,8 +799,400 @@ updated = get_updated_time()
658
  st.markdown(updated)
659
 
660
 
661
- t1, t2, tp, volume_tab, all_submission_tab = st.tabs(
662
- ["**Task 1**", "**Task 2**","**Pilot Task**", "**Submission Volume**", "**All Submissions**"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
663
  )
664
 
665
  with t1:
@@ -671,6 +1204,10 @@ with t2:
671
  with tp:
672
  "*Detection of Synthetic Video Content. Video files are unmodified from the original output from the models or the real sources.*"
673
  make_plots_for_task(list(TASKS.keys())[0])
 
675
  with volume_tab:
676
  subs = get_volume()
 
13
  "video-challenge-task-1-config": ["source"],
14
  "video-challenge-task-2-config": ["source", "category"],
15
  }
16
+ valid_splits = ["public", "private", "private_only"]
17
+
 
18
 
19
  #####################################################################
20
  ## Data loading ##
 
140
  # if rocs["team"].nunique() > 1:
141
  color_field = "team:N"
142
 
143
+ roc_chart = (
144
+ alt.Chart(rocs)
145
+ .mark_line()
146
+ .encode(
147
+ x="fpr", y="tpr", color=alt.Color(color_field, scale=alt.Scale(scheme=color_map)), detail="submission_id:N"
148
+ )
149
+ )
150
 
151
  return roc_chart
152
 
 
164
 
165
  ## Pull new results or toggle private public if you are an owner
166
  with st.sidebar:
167
+ color_map = st.selectbox("colormap", ["paired", "category20", "category20b", "category20c", "set2", "set3"])
168
 
169
  hf_token = os.getenv("HF_TOKEN")
170
  st.session_state["hf_token"] = hf_token
171
  password = st.text_input("Admin login:", type="password")
172
 
173
+ dataset_options = ["public"]
174
  if password == hf_token:
175
+ dataset_options = ["public", "private", "private_only"]
176
  if st.button("Pull New Results"):
177
  with st.spinner("Pulling new results", show_time=True):
178
  try:
 
195
  except Exception as e:
196
  st.error(f"Error starting background task: {e}")
197
 
198
+ ## Initialize the dataset view state in session_state if it doesn't exist
199
+ if "dataset_view" not in st.session_state:
200
+ st.session_state.dataset_view = "public"
201
+
202
+ # Create the selectbox, ensuring the index is valid
203
+ current_view = st.session_state.dataset_view
204
+ valid_index = dataset_options.index(current_view) if current_view in dataset_options else 0
205
+
206
+ dataset_view = st.selectbox("Dataset View", options=dataset_options, index=valid_index, key="dataset_view")
207
+
208
+ # Display the current dataset view
209
+ if dataset_view == "private":
210
+ st.success("Showing **PRIVATE** scores (all data).")
211
+
212
+ # Visual indicator for admins in the UI
213
+ if password == hf_token:
214
+ st.info("🔐 Admin View: You have access to all data")
215
+
216
+ # Initialize the top_n parameter if not in session_state
217
+ if "top_n_value" not in st.session_state:
218
+ st.session_state.top_n_value = 3
219
+
220
+ # Add a slider to select the number of top elements to average
221
+ top_n_value = st.slider(
222
+ "Mean of top N elements",
223
+ min_value=2,
224
+ max_value=10,
225
+ value=st.session_state.top_n_value,
226
+ step=1,
227
+ help="Calculate the mean of the top N elements in each column",
228
+ key="top_n_value",
229
+ )
230
+ st.session_state["top_n"] = top_n_value
231
+ elif dataset_view == "private_only":
232
+ st.success("Showing **PRIVATE ONLY** scores (excluding public data).")
233
+
234
+ # Visual indicator for admins in the UI
235
+ if password == hf_token:
236
+ st.info("🔒 Admin View: You have access to private-only data")
237
+
238
+ # Initialize the top_n parameter if not in session_state
239
+ if "top_n_value" not in st.session_state:
240
+ st.session_state.top_n_value = 3
241
+
242
+ # Add a slider to select the number of top elements to average
243
+ top_n_value = st.slider(
244
+ "Mean of top N elements",
245
+ min_value=2,
246
+ max_value=10,
247
+ value=st.session_state.top_n_value,
248
+ step=1,
249
+ help="Calculate the mean of the top N elements in each column",
250
+ key="top_n_value",
251
+ )
252
+ st.session_state["top_n"] = top_n_value
253
  else:
254
+ st.info("Showing **PUBLIC** scores.")
255
+ st.session_state["top_n"] = None
256
+
257
+ # Ensure only admin users can access private data
258
+ if dataset_view in ["private", "private_only"] and password == hf_token:
259
+ split = dataset_view
260
+
261
+ # Clear the cache when the dataset view changes
262
+ previous_view = st.session_state.get("previous_dataset_view")
263
+ if previous_view != dataset_view:
264
+ load_results.clear()
265
+ st.session_state["previous_dataset_view"] = dataset_view
266
+ else:
267
+ split = "public"
268
  else:
269
  split = "public"
270
 
271
  st.session_state["split"] = split
272
 
273
 
274
+ def show_dataframe_w_format(df, format="compact", top_n=None):
275
+ """
276
+ Display a dataframe with formatted columns. If in private mode and top_n is provided,
277
+ adds a row showing the mean of the top n values for each column.
278
+
279
+ Args:
280
+ df: Pandas dataframe to display
281
+ format: Format string for number columns (default: "compact")
282
+ top_n: Optional number of top values to average per column
283
+ """
284
+ split = st.session_state.get("split", "public")
285
+
286
+ # Only add top-n mean row in private mode
287
+ if split in ["private", "private_only"] and top_n is not None and isinstance(top_n, int) and top_n > 0:
288
+ # Create a copy to avoid modifying the original
289
+ df_display = df.copy()
290
+
291
+ # Calculate the mean of top n values for each column
292
+ top_n_means = {}
293
+ for col in df.columns:
294
+ sorted_values = df[col].sort_values(ascending=False)
295
+ # Ensure we don't try to take more values than available
296
+ actual_n = min(top_n, len(sorted_values))
297
+ if actual_n > 0:
298
+ top_n_means[col] = sorted_values.iloc[:actual_n].mean()
299
+ else:
300
+ top_n_means[col] = float("nan")
301
+
302
+ # Add the mean row as a new row in the dataframe
303
+ top_n_means_df = pd.DataFrame([top_n_means], index=[f"Top-{top_n} Mean"])
304
+ df_display = pd.concat([top_n_means_df, df_display])
305
+ else:
306
+ df_display = df
307
+
308
+ column_config = {c: st.column_config.NumberColumn(c, format=format) for c in df_display.columns}
309
+ return st.dataframe(df_display, column_config=column_config)
310
+
311
 
312
  @st.fragment
313
  def show_leaderboard(task, score: str = "source"):
314
  split = st.session_state.get("split", "public")
315
  results = load_results(task, best_only=True)
316
  source_split_map = {}
317
+ if split in ["private", "private_only"]:
318
+ _sol_df = pd.read_csv(COMP_CACHE / task / "solution-processed.csv")
319
  pairs_df = _sol_df[["source_og", "split"]].drop_duplicates()
320
  source_split_map = {x: y for x, y in zip(pairs_df["source_og"], pairs_df["split"])}
321
 
 
326
  # "pristine_accuracy",
327
  "auc",
328
  "total_time",
329
+ "datetime",
330
+ "fail_rate",
331
  ]
332
 
333
  column_config = {
 
367
  "🕒 Inference Time (s)",
368
  format="compact",
369
  # pinned=True,
 
370
  # width="small",
371
  ),
372
  "datetime": st.column_config.DatetimeColumn(
 
454
  if accuracy_types[granularity] == 0:
455
  "#### 👤 True Positive Rate | Generated Source"
456
  # st.dataframe(gen_tmp, column_config=column_config)
457
+ top_n = st.session_state.get("top_n", None)
458
+ show_dataframe_w_format(gen_tmp, top_n=top_n)
459
 
460
  "#### 🧑‍🎤 True Negative Rate | Real Source"
461
  # st.dataframe(real_tmp, column_config=column_config)
462
+ show_dataframe_w_format(real_tmp, top_n=top_n)
463
 
464
  elif accuracy_types[granularity] == 1:
465
  "#### 👤 Balanced Accuracy | Generated Source"
466
  tnr = results[f"{split}_{score}_score"].loc[:, ["real_accuracy"]]
467
  gen_tmp[:] = (gen_tmp.values + tnr.values) / 2.0
468
  # st.dataframe(gen_tmp, column_config=column_config)
469
+ top_n = st.session_state.get("top_n", None)
470
+ show_dataframe_w_format(gen_tmp, top_n=top_n)
471
 
472
  "#### 🧑‍🎤 Balanced Accuracy | Real Source"
473
  tpr = results[f"{split}_{score}_score"].loc[:, ["generated_accuracy"]]
474
  real_tmp[:] = (real_tmp.values + tpr.values) / 2.0
475
  # st.dataframe(real_tmp, column_config=column_config)
476
+ show_dataframe_w_format(real_tmp, top_n=top_n)
477
  else:
478
  cols = [c for c in results[f"{split}_{score}_score"].columns if "generated_conditional_auc" in c]
479
  col_names = [
 
502
 
503
  "#### 👤 Conditional AUC | Generated Source"
504
  # st.dataframe(gen_tmp, column_config=column_config)
505
+ top_n = st.session_state.get("top_n", None)
506
+ show_dataframe_w_format(gen_tmp, top_n=top_n)
507
  "#### 🧑‍🎤 Conditional AUC | Real Source"
508
  # st.dataframe(real_tmp, column_config=column_config)
509
+ show_dataframe_w_format(real_tmp, top_n=top_n)
510
 
511
 
512
  def make_roc(results, show_text=False):
 
518
  .encode(
519
  x=alt.X("FA:Q", title="🧑‍🎤 False Positive Rate", scale=alt.Scale(domain=[0.0, 1.0])),
520
  y=alt.Y("generated_accuracy:Q", title="👤 True Positive Rate", scale=alt.Scale(domain=[0.0, 1.0])),
521
+ color=alt.Color("team:N", scale=alt.Scale(scheme=color_map)), # Color by categorical field
522
  size=alt.Size(
523
  "total_time:Q", title="🕒 Inference Time", scale=alt.Scale(rangeMin=100)
524
  ), # Size by quantitative field
 
538
  .encode(
539
  x=alt.X("FA:Q", title="🧑‍🎤 False Positive Rate", scale=alt.Scale(domain=[0, 1])),
540
  y=alt.Y("generated_accuracy:Q", title="👤 True Positive Rate", scale=alt.Scale(domain=[0, 1])),
541
+ color=alt.Color("team:N", scale=alt.Scale(scheme=color_map)), # Color by categorical field
542
  text="team",
543
  )
544
  )
 
567
  title="Balanced Accuracy",
568
  scale=alt.Scale(domain=[0.4, 1]),
569
  ),
570
+ color=alt.Color(
571
+ "team:N", scale=alt.Scale(scheme=color_map)
572
+ ), # Color by categorical field # Size by quantitative field
573
  )
574
  .properties(width=400, height=400, title="Inference Time vs Balanced Accuracy")
575
  )
 
592
  title="Balanced Accuracy",
593
  scale=alt.Scale(domain=[0.4, 1]),
594
  ),
595
+ color=alt.Color(
596
+ "team:N", scale=alt.Scale(scheme=color_map)
597
+ ), # Color by categorical field # Size by quantitative field
598
  text="team",
599
  )
600
  )
 
631
  "Accuracy": 0,
632
  "AUC": 1,
633
  }
634
+
635
+ # Create a row with two columns for controls
636
+ col1, col2 = st.columns([0.1, 0.9])
637
+
638
+ with col1:
639
+ granularity = st.radio(
640
+ "accuracy type",
641
+ list(accuracy_types.keys()),
642
+ key=f"granularity-{task}-{score}",
643
+ horizontal=True,
644
+ label_visibility="collapsed",
645
+ index=0,
646
+ )
647
+
648
+ show_deltas = False
649
+ if split in ["private", "private_only"]:
650
+ with col2:
651
+ # Add toggle for showing deltas from "none" column
652
+ show_deltas = st.toggle(
653
+ "Show deltas from 'none' (higher values mean 'none' was **lower**)",
654
+ value=False,
655
+ key=f"deltas-{task}-{score}",
656
+ )
657
 
658
  ## Check cases
659
  if accuracy_types[granularity] == 0:
 
681
  if "real_" in c and "accuracy" not in c and "conditional" not in c
682
  ]
683
  tmp = (gen_tmp + real_tmp) / 2.0
684
+
685
+ # If toggle is on and "none" column exists, calculate deltas from "none" column
686
+ if show_deltas and "none" in tmp.columns:
687
+ # Get the "none" column values
688
+ none_values = tmp["none"].copy()
689
+
690
+ # Calculate deltas: none - current_column
691
+ for col in tmp.columns:
692
+ if col != "none":
693
+ tmp[col] = -none_values + tmp[col]
694
+
695
  # st.dataframe(tmp)
696
+ top_n = st.session_state.get("top_n", None)
697
+ show_dataframe_w_format(tmp, top_n=top_n)
698
 
699
  else:
700
  cols = [c for c in results[f"{split}_{score}_score"].columns if "conditional_auc" in c]
 
707
  tmp.columns = col_names
708
 
709
  "#### Conditional AUC"
710
+
711
+ # If toggle is on and "none" column exists, calculate deltas from "none" column
712
+ if show_deltas and "none" in tmp.columns:
713
+ # Get the "none" column values
714
+ none_values = tmp["none"].copy()
715
+
716
+ # Calculate deltas: none - current_column
717
+ for col in tmp.columns:
718
+ if col != "none":
719
+ tmp[col] = -none_values + tmp[col]
720
+
721
  # st.dataframe(tmp)
722
+ top_n = st.session_state.get("top_n", None)
723
+ show_dataframe_w_format(tmp, top_n=top_n)
724
 
725
 
726
  @st.fragment
 
799
  st.markdown(updated)
800
 
801
 
802
+ @st.fragment
803
+ def show_task_comparison():
804
+ """Show summary tables for Task 1 and Task 2 side by side."""
805
+ split = st.session_state.get("split", "public")
806
+ color_map_choice = st.session_state.get("colormap", "paired")
807
+
808
+ task1_key = list(TASKS.keys())[1] # video-challenge-task-1-config
809
+ task2_key = list(TASKS.keys())[2] # video-challenge-task-2-config
810
+
811
+ task1_results = load_results(task1_key, best_only=True)
812
+ task2_results = load_results(task2_key, best_only=True)
813
+
814
+ cols = ["balanced_accuracy", "generated_accuracy", "real_accuracy", "auc", "total_time", "datetime", "fail_rate"]
815
+
816
+ column_config = {
817
+ "balanced_accuracy": st.column_config.NumberColumn(
818
+ "⚖️ Balanced Accuracy",
819
+ format="compact",
820
+ min_value=0,
821
+ max_value=1.0,
822
+ ),
823
+ "generated_accuracy": st.column_config.NumberColumn(
824
+ "👤 True Positive Rate",
825
+ format="compact",
826
+ min_value=0,
827
+ max_value=1.0,
828
+ ),
829
+ "real_accuracy": st.column_config.NumberColumn(
830
+ "🧑‍🎤 True Negative Rate",
831
+ format="compact",
832
+ min_value=0,
833
+ max_value=1.0,
834
+ ),
835
+ "auc": st.column_config.NumberColumn(
836
+ "📐 AUC",
837
+ format="compact",
838
+ min_value=0,
839
+ max_value=1.0,
840
+ ),
841
+ "total_time": st.column_config.NumberColumn(
842
+ "🕒 Inference Time (s)",
843
+ format="compact",
844
+ ),
845
+ "datetime": st.column_config.DatetimeColumn(
846
+ "🗓️ Submission Date",
847
+ format="YYYY-MM-DD",
848
+ ),
849
+ "fail_rate": st.column_config.NumberColumn(
850
+ "❌ Fail Rate",
851
+ format="compact",
852
+ ),
853
+ "task1_balanced_accuracy": st.column_config.NumberColumn(
854
+ "⚖️ Task 1 Balanced Accuracy",
855
+ format="compact",
856
+ min_value=0,
857
+ max_value=1.0,
858
+ ),
859
+ "task2_balanced_accuracy": st.column_config.NumberColumn(
860
+ "⚖️ Task 2 Balanced Accuracy",
861
+ format="compact",
862
+ min_value=0,
863
+ max_value=1.0,
864
+ ),
865
+ "difference": st.column_config.NumberColumn(
866
+ "⚖️ Difference (T1-T2)",
867
+ format="compact",
868
+ ),
869
+ "percent_change": st.column_config.NumberColumn(
870
+ "% Change",
871
+ format="+.2%",
872
+ ),
873
+ }
874
+
875
+ # Create tabs for different views
876
+ tables_tab, charts_tab, time_tab = st.tabs(["Tables", "Charts", "Performance Timeline"])
877
+
878
+ with tables_tab:
879
+ # Create two columns for side-by-side tables
880
+ st.subheader("Performance Comparison: Task 1 vs Task 2")
881
+ col1, col2 = st.columns(2)
882
+
883
+ with col1:
884
+ st.subheader("Task 1: Original Content")
885
+ st.dataframe(
886
+ task1_results[f"{split}_source_score"].loc[:, cols],
887
+ column_config=column_config,
888
+ use_container_width=True,
889
+ )
890
+
891
+ with col2:
892
+ st.subheader("Task 2: Post-processed Content")
893
+ st.dataframe(
894
+ task2_results[f"{split}_source_score"].loc[:, cols],
895
+ column_config=column_config,
896
+ use_container_width=True,
897
+ )
898
+
899
+ # Add a section for comparison of task performance differences
900
+ st.subheader("Performance Analysis")
901
+ st.markdown(
902
+ """
903
+ Performance comparison between Task 1 (original content) and
904
+ Task 2 (post-processed content). A positive difference indicates degraded performance
905
+ on post-processed content.
906
+ """
907
+ )
908
+
909
+ # Get the datasets for both tasks
910
+ task1_df = task1_results[f"{split}_source_score"].reset_index()
911
+ task2_df = task2_results[f"{split}_source_score"].reset_index()
912
+
913
+ # Create a combined dataframe for analysis
914
+ common_teams = set(task1_df["team"]) & set(task2_df["team"])
915
+
916
+ if common_teams:
917
+ # Filter to teams that appear in both tasks
918
+ task1_filtered = task1_df[task1_df["team"].isin(common_teams)]
919
+ task2_filtered = task2_df[task2_df["team"].isin(common_teams)]
920
+
921
+ # Create a comparison dataframe
922
+ comparison_df = pd.DataFrame(
923
+ {
924
+ "team": list(common_teams),
925
+ "task1_balanced_accuracy": [
926
+ task1_filtered[task1_filtered["team"] == team]["balanced_accuracy"].values[0]
927
+ for team in common_teams
928
+ ],
929
+ "task2_balanced_accuracy": [
930
+ task2_filtered[task2_filtered["team"] == team]["balanced_accuracy"].values[0]
931
+ for team in common_teams
932
+ ],
933
+ }
934
+ )
935
+
936
+ # Calculate differences and percentage changes
937
+ comparison_df["difference"] = (
938
+ comparison_df["task1_balanced_accuracy"] - comparison_df["task2_balanced_accuracy"]
939
+ )
940
+ comparison_df["percent_change"] = comparison_df["difference"] / comparison_df["task1_balanced_accuracy"]
941
+
942
+ # Sort by the absolute difference (to show biggest performance changes first)
943
+ comparison_df = comparison_df.sort_values(by="difference", ascending=False).reset_index(drop=True)
944
+
945
+ # Display the comparison table
946
+ show_dataframe_w_format(comparison_df, top_n=0)
947
+ else:
948
+ st.warning("No common teams found across both tasks.")
949
+
950
+ with charts_tab:
951
+ st.subheader("Team Performance Across Tasks")
952
+
953
+ # Get the datasets for both tasks if not already done
954
+ if "task1_df" not in locals():
955
+ task1_df = task1_results[f"{split}_source_score"].reset_index()
956
+ task2_df = task2_results[f"{split}_source_score"].reset_index()
957
+ common_teams = set(task1_df["team"]) & set(task2_df["team"])
958
+
959
+ if common_teams:
960
+ # Prepare data for the plot
961
+ plot_data = []
962
+
963
+ for team in common_teams:
964
+ # Get team's balanced accuracy for each task
965
+ task1_acc = task1_df[task1_df["team"] == team]["balanced_accuracy"].values[0]
966
+ task2_acc = task2_df[task2_df["team"] == team]["balanced_accuracy"].values[0]
967
+
968
+ # Add points for Task 1
969
+ plot_data.append({"team": team, "task": "Task 1", "balanced_accuracy": task1_acc})
970
+
971
+ # Add points for Task 2
972
+ plot_data.append({"team": team, "task": "Task 2", "balanced_accuracy": task2_acc})
973
+
974
+ plot_df = pd.DataFrame(plot_data)
975
+
976
+ # Create line chart connecting team performances
977
+ lines = (
978
+ alt.Chart(plot_df)
979
+ .mark_line(point=alt.OverlayMarkDef(filled=True, size=100), strokeDash=[4, 2], strokeWidth=2)
980
+ .encode(
981
+ x=alt.X("task:N", title="Task", sort=["Task 1", "Task 2"]),
982
+ y=alt.Y("balanced_accuracy:Q", title="Balanced Accuracy", scale=alt.Scale(domain=[0.4, 1.0])),
983
+ color=alt.Color(
984
+ "team:N", scale=alt.Scale(scheme=color_map_choice), legend=alt.Legend(title="Teams")
985
+ ),
986
+ tooltip=["team:N", "task:N", "balanced_accuracy:Q"],
987
+ )
988
+ .properties(width=700, height=500, title="Performance Changes Across Tasks")
989
+ )
990
+
991
+ st.altair_chart(lines, use_container_width=False)
992
+ else:
993
+ st.warning("No common teams found across both tasks.")
994
+
995
+ with time_tab:
996
+ st.subheader("Team Performance Timeline")
997
+
998
+ # Get full submission data (not just best_only) to analyze performance over time
999
+ task1_results_full = load_results(task1_key, best_only=False)
1000
+ task2_results_full = load_results(task2_key, best_only=False)
1001
+
1002
+ # We need to select specific task result keys based on what's available
1003
+ task1_result_key = f"{split}_source_score"
1004
+ task2_result_key = f"{split}_source_score"
1005
+
1006
+ # Check if we have data for both tasks
1007
+ if (
1008
+ task1_result_key in task1_results_full
1009
+ and task2_result_key in task2_results_full
1010
+ and not task1_results_full[task1_result_key].empty
1011
+ and not task2_results_full[task2_result_key].empty
1012
+ ):
1013
+
1014
+ # Extract datetime and make it datetime objects
1015
+ task1_time_df = task1_results_full[task1_result_key].reset_index().copy()
1016
+ task2_time_df = task2_results_full[task2_result_key].reset_index().copy()
1017
+
1018
+ # Ensure datetime column exists in both dataframes
1019
+ if "datetime" in task1_time_df.columns and "datetime" in task2_time_df.columns:
1020
+ # Convert string dates to datetime objects if they aren't already
1021
+ if pd.api.types.is_string_dtype(task1_time_df["datetime"]):
1022
+ task1_time_df["datetime"] = pd.to_datetime(task1_time_df["datetime"])
1023
+ if pd.api.types.is_string_dtype(task2_time_df["datetime"]):
1024
+ task2_time_df["datetime"] = pd.to_datetime(task2_time_df["datetime"])
1025
+
1026
+ # Make a list of unique teams across both tasks
1027
+ all_teams = sorted(
1028
+ list(set(list(task1_time_df["team"].unique()) + list(task2_time_df["team"].unique())))
1029
+ )
1030
+
1031
+ # Create a selectbox to select teams to display
1032
+ if len(all_teams) > 10: # If we have many teams, add a filter
1033
+ selected_teams = st.multiselect("Select Teams to Display", all_teams, default=all_teams[:5])
1034
+ if not selected_teams: # Default to first 5 if none selected
1035
+ selected_teams = all_teams[:5]
1036
+ else:
1037
+ selected_teams = all_teams
1038
+
1039
+ # Function to compute running max for each team
1040
+ def compute_running_max(df):
1041
+ # Group by team and sort by datetime
1042
+ result_df = df.copy()
1043
+ for team in result_df["team"].unique():
1044
+ team_data = result_df[result_df["team"] == team].copy()
1045
+ team_data = team_data.sort_values("datetime")
1046
+ # Calculate running maximum
1047
+ team_data["balanced_accuracy"] = team_data["balanced_accuracy"].cummax()
1048
+ # Update the original dataframe
1049
+ result_df.loc[team_data.index, "balanced_accuracy"] = team_data["balanced_accuracy"]
1050
+ return result_df
1051
+
1052
+ # Filter and compute running maximum for each task
1053
+ task1_filtered = task1_time_df[task1_time_df["team"].isin(selected_teams)].copy()
1054
+ task2_filtered = task2_time_df[task2_time_df["team"].isin(selected_teams)].copy()
1055
+
1056
+ if not task1_filtered.empty and not task2_filtered.empty:
1057
+ # Compute running maximum
1058
+ task1_max = compute_running_max(task1_filtered)
1059
+ task2_max = compute_running_max(task2_filtered)
1060
+
1061
+ # Create tabs for the two tasks
1062
+ task1_plot_tab, task2_plot_tab = st.tabs(["Task 1 Timeline", "Task 2 Timeline"])
1063
+
1064
+ # Create plot for Task 1
1065
+ with task1_plot_tab:
1066
+ st.subheader("Task 1: Original Content - Performance Over Time")
1067
+
1068
+ # Calculate max performance for baseline
1069
+ task1_max_performance = task1_time_df[
1070
+ task1_time_df["team"].apply(lambda x: x.lower()).isin(["baseline"])
1071
+ ]["balanced_accuracy"].max()
1072
+
1073
+ # Create baseline data
1074
+ baseline_data = pd.DataFrame(
1075
+ {
1076
+ "datetime": [task1_max["datetime"].min(), task1_max["datetime"].max()],
1077
+ "balanced_accuracy": [task1_max_performance, task1_max_performance],
1078
+ "label": ["Max Performance", "Max Performance"],
1079
+ }
1080
+ )
1081
+
1082
+ # Create baseline chart
1083
+ baseline_chart = (
1084
+ alt.Chart(baseline_data)
1085
+ .mark_line(strokeDash=[4, 4], color="black", strokeWidth=2)
1086
+ .encode(
1087
+ x="datetime:T",
1088
+ y="balanced_accuracy:Q",
1089
+ tooltip=alt.Tooltip("balanced_accuracy:Q", title="Baseline", format=".4f"),
1090
+ )
1091
+ )
1092
+
1093
+ # Create main chart
1094
+ task1_chart = (
1095
+ alt.Chart(task1_max)
1096
+ .mark_line(point=True)
1097
+ .encode(
1098
+ x=alt.X(
1099
+ "datetime:T",
1100
+ title="Submission Date",
1101
+ axis=alt.Axis(format="%b %d"), # Format as "Month Date"
1102
+ ),
1103
+ y=alt.Y(
1104
+ "balanced_accuracy:Q",
1105
+ title="Best Balanced Accuracy",
1106
+ scale=alt.Scale(domain=[0.4, 1.0]),
1107
+ ),
1108
+ color=alt.Color(
1109
+ "team:N", scale=alt.Scale(scheme=color_map_choice), legend=alt.Legend(title="Teams")
1110
+ ),
1111
+ tooltip=[
1112
+ "team:N",
1113
+ alt.Tooltip("datetime:T", title="Date", format="%b %d, %Y"),
1114
+ alt.Tooltip("balanced_accuracy:Q", title="Best Accuracy", format=".4f"),
1115
+ ],
1116
+ )
1117
+ .properties(width=800, height=500, title="Best Performance Over Time (Original Content)")
1118
+ .interactive()
1119
+ )
1120
+
1121
+ # Combine charts and display
1122
+ st.altair_chart(task1_chart + baseline_chart, use_container_width=False)
1123
+
1124
+ # Create plot for Task 2
1125
+ with task2_plot_tab:
1126
+ st.subheader("Task 2: Post-processed Content - Performance Over Time")
1127
+
1128
+ # Calculate max performance for baseline
1129
+ task2_max_performance = task2_time_df[
1130
+ task2_time_df["team"].apply(lambda x: x.lower()).isin(["baseline"])
1131
+ ]["balanced_accuracy"].max()
1132
+
1133
+ # Create baseline data
1134
+ baseline_data = pd.DataFrame(
1135
+ {
1136
+ "datetime": [task2_max["datetime"].min(), task2_max["datetime"].max()],
1137
+ "balanced_accuracy": [task2_max_performance, task2_max_performance],
1138
+ "label": ["Max Performance", "Max Performance"],
1139
+ }
1140
+ )
1141
+
1142
+ # Create baseline chart
1143
+ baseline_chart = (
1144
+ alt.Chart(baseline_data)
1145
+ .mark_line(strokeDash=[4, 4], color="black", strokeWidth=2)
1146
+ .encode(
1147
+ x="datetime:T",
1148
+ y="balanced_accuracy:Q",
1149
+ tooltip=alt.Tooltip("balanced_accuracy:Q", title="Baseline", format=".4f"),
1150
+ )
1151
+ )
1152
+
1153
+ # Create main chart
1154
+ task2_chart = (
1155
+ alt.Chart(task2_max)
1156
+ .mark_line(point=True)
1157
+ .encode(
1158
+ x=alt.X(
1159
+ "datetime:T",
1160
+ title="Submission Date",
1161
+ axis=alt.Axis(format="%b %d"), # Format as "Month Date"
1162
+ ),
1163
+ y=alt.Y(
1164
+ "balanced_accuracy:Q",
1165
+ title="Best Balanced Accuracy",
1166
+ scale=alt.Scale(domain=[0.4, 1.0]),
1167
+ ),
1168
+ color=alt.Color(
1169
+ "team:N", scale=alt.Scale(scheme=color_map_choice), legend=alt.Legend(title="Teams")
1170
+ ),
1171
+ tooltip=[
1172
+ "team:N",
1173
+ alt.Tooltip("datetime:T", title="Date", format="%b %d, %Y"),
1174
+ alt.Tooltip("balanced_accuracy:Q", title="Best Accuracy", format=".4f"),
1175
+ ],
1176
+ )
1177
+ .properties(
1178
+ width=800, height=500, title="Best Performance Over Time (Post-Processed Content)"
1179
+ )
1180
+ .interactive()
1181
+ )
1182
+
1183
+ # Combine charts and display
1184
+ st.altair_chart(task2_chart + baseline_chart, use_container_width=False)
1185
+
1186
+ else:
1187
+ st.warning("No data available for selected teams.")
1188
+ else:
1189
+ st.warning("Datetime information is not available in the dataset.")
1190
+ else:
1191
+ st.warning("Historical performance data is not available for both tasks.")
1192
+
1193
+
1194
+ t1, t2, tp, comparison_tab, volume_tab, all_submission_tab = st.tabs(
1195
+ ["**Task 1**", "**Task 2**", "**Pilot Task**", "**Compare Tasks**", "**Submission Volume**", "**All Submissions**"]
1196
  )
1197
 
1198
  with t1:
 
1204
  with tp:
1205
  "*Detection of Synthetic Video Content. Video files are unmodified from the original output from the models or the real sources.*"
1206
  make_plots_for_task(list(TASKS.keys())[0])
1207
+ if split in ["private", "private_only"]:
1208
+ with comparison_tab:
1209
+ "**Task 1 to Task 2 performance comparison.**"
1210
+ show_task_comparison()
1211
 
1212
  with volume_tab:
1213
  subs = get_volume()
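Within that refactor, `show_dataframe_w_format` now prepends a "Top-N Mean" row whenever a private view is active. A standalone sketch of just the pandas part (the Streamlit display and the split check are dropped so it runs on its own; the frame and values here are illustrative, not competition data):

```python
import pandas as pd

def prepend_top_n_mean(df: pd.DataFrame, top_n: int) -> pd.DataFrame:
    """Prepend a row holding the mean of the top-N values of each column."""
    top_n_means = {}
    for col in df.columns:
        sorted_values = df[col].sort_values(ascending=False)
        actual_n = min(top_n, len(sorted_values))  # never request more rows than exist
        top_n_means[col] = sorted_values.iloc[:actual_n].mean() if actual_n > 0 else float("nan")
    mean_row = pd.DataFrame([top_n_means], index=[f"Top-{top_n} Mean"])
    return pd.concat([mean_row, df])

scores = pd.DataFrame(
    {"none": [0.91, 0.85, 0.78], "blur": [0.80, 0.76, 0.74]},
    index=["team_a", "team_b", "team_c"],
)
print(prepend_top_n_mean(scores, top_n=2))  # mean of the two best values per column
```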
metric.py CHANGED
@@ -147,13 +147,15 @@ def _metric(
147
  ## Save data split
148
  evaluation["public_score"]["proportion"] = len(solution_df.query(f"split=='public'").copy()) / len(solution_df)
149
  evaluation["private_score"]["proportion"] = 1.0
 
150
 
151
- ## Public and private split
152
  public_df = solution_df.query("split=='public'").copy()
153
  private_df = solution_df.copy()
 
154
 
155
  ## Loop
156
- for split, dataframe in zip(["public", "private"], [public_df, private_df]):
157
  metrics = compute_metrics(
158
  df=dataframe.copy(), score_name=score_name if split == "public" else f"{score_name}_og", use_all=use_all
159
  )
 
147
  ## Save data split
148
  evaluation["public_score"]["proportion"] = len(solution_df.query(f"split=='public'").copy()) / len(solution_df)
149
  evaluation["private_score"]["proportion"] = 1.0
150
+ evaluation["private_only_score"]["proportion"] = len(solution_df.query(f"split=='private'").copy()) / len(solution_df)
151
 
152
+ ## Public, private, and private_only split
153
  public_df = solution_df.query("split=='public'").copy()
154
  private_df = solution_df.copy()
155
+ private_only_df = solution_df.query("split=='private'").copy()
156
 
157
  ## Loop
158
+ for split, dataframe in zip(["public", "private", "private_only"], [public_df, private_df, private_only_df]):
159
  metrics = compute_metrics(
160
  df=dataframe.copy(), score_name=score_name if split == "public" else f"{score_name}_og", use_all=use_all
161
  )
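The metric.py change runs the same scorer over a third slice of the solution file. A self-contained sketch of that bookkeeping, with a stub standing in for the real `compute_metrics` and a toy `solution_df` (both are stand-ins, not the actual competition data):

```python
import pandas as pd

def compute_metrics(df: pd.DataFrame, score_name: str) -> dict:
    # Stub in place of the real scorer defined elsewhere in metric.py.
    return {"n_rows": len(df), "scored_column": score_name}

solution_df = pd.DataFrame({
    "split": ["public", "public", "private", "private", "private"],
    "label": [1, 0, 1, 1, 0],
})

evaluation = {f"{k}_score": {} for k in ("public", "private", "private_only")}
evaluation["public_score"]["proportion"] = len(solution_df.query("split=='public'")) / len(solution_df)
evaluation["private_score"]["proportion"] = 1.0  # the "private" view still scores every row
evaluation["private_only_score"]["proportion"] = len(solution_df.query("split=='private'")) / len(solution_df)

public_df = solution_df.query("split=='public'").copy()
private_df = solution_df.copy()                          # full dataset
private_only_df = solution_df.query("split=='private'").copy()

score_name = "source"
for split, dataframe in zip(["public", "private", "private_only"],
                            [public_df, private_df, private_only_df]):
    # Public predictions are scored on `source`; the other views use `source_og`.
    col = score_name if split == "public" else f"{score_name}_og"
    evaluation[f"{split}_score"].update(compute_metrics(dataframe.copy(), col))

print(evaluation)
```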
utils.py CHANGED
@@ -22,7 +22,7 @@ def download_competition_data(competition_names: List[str]) -> None:
22
  local_dir=os.path.join(COMP_CACHE, repo_id),
23
  repo_type="dataset",
24
  token=os.environ.get("HF_TOKEN"),
25
- ignore_patterns="submission_logs/*"
26
  )
27
 
28
 
@@ -142,7 +142,7 @@ def extract_roc(results: Dict[str, Any]) -> Dict[str, Any]:
142
  return new
143
 
144
 
145
- def add_custom_submission(path_to_cache, path_to_subfile, threshold = 0):
146
  import pandas as pd
147
  import json
148
 
@@ -153,7 +153,7 @@ def add_custom_submission(path_to_cache, path_to_subfile, threshold = 0):
153
 
154
  team_id = "insiders-id-1-2-3"
155
  team_name = "insiders"
156
- submission_id = f"sub{threshold}".replace(".","")
157
 
158
  ## update teams
159
  teams = json.load(open(path_to_cache + "/teams.json"))
@@ -169,20 +169,22 @@ def add_custom_submission(path_to_cache, path_to_subfile, threshold = 0):
169
  if os.path.exists(submission_info_file):
170
  temp = json.load(open(submission_info_file))
171
  else:
172
- temp = {"id": team_id,"submissions": []}
173
-
174
- temp["submissions"].append({
175
- "datetime": "2025-09-22 14:42:14",
176
- "submission_id": submission_id,
177
- "submission_comment": "",
178
- "submission_repo": "",
179
- "space_id": "",
180
- "submitted_by": "na",
181
- "status": 3,
182
- "selected": True,
183
- "public_score": {},
184
- "private_score": {},
185
- })
 
 
186
 
187
  with open(submission_info_file, "w") as f:
188
  json.dump(temp, f)
@@ -191,11 +193,16 @@ def add_custom_submission(path_to_cache, path_to_subfile, threshold = 0):
191
  path_to_cache + f"/submissions/{team_id}-{submission_id}.csv", index=False
192
  )
193
 
 
194
  def create_custom_subs():
195
  import numpy as np
196
- for threshold in np.linspace(-6,0,10):
197
- add_custom_submission(path_to_cache="competition_cache/safe-challenge/video-challenge-task-1-config",
198
- path_to_subfile="competition_cache/custom/Scores-DSRI-brian.txt", threshold=threshold)
199
 
200
 
201
  if __name__ == "__main__":
@@ -206,15 +213,11 @@ if __name__ == "__main__":
206
  "safe-challenge/video-challenge-task-1-config",
207
  "safe-challenge/video-challenge-task-2-config",
208
  ]
209
- download_competition_data(competition_names=spaces)
210
 
211
-
212
  if os.environ.get("MAKE_CUSTOM"):
213
  print("adding custom subs")
214
  create_custom_subs()
215
-
216
-
217
-
218
 
219
  ## Loop
220
  for space in spaces:
@@ -263,7 +266,7 @@ if __name__ == "__main__":
263
  scores = ["source"]
264
  for score_name in scores:
265
  ## Loop and save by team
266
- public, private, rocs = [], [], []
267
  # for team_id, submission_set in submissions.items():
268
  for team_id, submission_set_ids in submission_summaries.query("status_reason=='SUCCESS'").groupby(
269
  "team_id"
@@ -298,6 +301,11 @@ if __name__ == "__main__":
298
  for key, value in results.items()
299
  if key in team_submissions
300
  }
301
 
302
  ## Add timing
303
  public_times = {
@@ -312,10 +320,18 @@ if __name__ == "__main__":
312
  ["submission_id", "private_time"]
313
  ].to_dict(orient="records")
314
  }
315
  for key in public_results.keys():
316
  public_results[key]["total_time"] = public_times[key]
317
  for key in private_results.keys():
318
  private_results[key]["total_time"] = private_times[key]
 
 
319
 
320
  ## Roc computations
321
  roc_results = {
@@ -425,9 +441,45 @@ if __name__ == "__main__":
425
  )
426
  private.append(private_df)
427
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
428
  ## Save as csvs
429
  public = pd.concat(public, axis=0, ignore_index=True).sort_values(by="balanced_accuracy", ascending=False)
430
  private = pd.concat(private, axis=0, ignore_index=True).sort_values(by="balanced_accuracy", ascending=False)
431
  rocs = pd.concat(rocs, axis=0, ignore_index=True).explode(["tpr", "fpr", "threshold"], ignore_index=True)
432
  public.to_csv(
433
  Path("competition_cache")
@@ -441,6 +493,13 @@ if __name__ == "__main__":
441
  / f"{str(local_dir).split('/')[-1]}_{score_name}_private_score.csv",
442
  index=False,
443
  )
444
  rocs.to_csv(
445
  Path("competition_cache") / "cached_results" / f"{str(local_dir).split('/')[-1]}_{score_name}_rocs.csv",
446
  index=False,
 
22
  local_dir=os.path.join(COMP_CACHE, repo_id),
23
  repo_type="dataset",
24
  token=os.environ.get("HF_TOKEN"),
25
+ ignore_patterns="submission_logs/*",
26
  )
27
 
28
 
 
142
  return new
143
 
144
 
145
+ def add_custom_submission(path_to_cache, path_to_subfile, threshold=0):
146
  import pandas as pd
147
  import json
148
 
 
153
 
154
  team_id = "insiders-id-1-2-3"
155
  team_name = "insiders"
156
+ submission_id = f"sub{threshold}".replace(".", "")
157
 
158
  ## update teams
159
  teams = json.load(open(path_to_cache + "/teams.json"))
 
169
  if os.path.exists(submission_info_file):
170
  temp = json.load(open(submission_info_file))
171
  else:
172
+ temp = {"id": team_id, "submissions": []}
173
+
174
+ temp["submissions"].append(
175
+ {
176
+ "datetime": "2025-09-22 14:42:14",
177
+ "submission_id": submission_id,
178
+ "submission_comment": "",
179
+ "submission_repo": "",
180
+ "space_id": "",
181
+ "submitted_by": "na",
182
+ "status": 3,
183
+ "selected": True,
184
+ "public_score": {},
185
+ "private_score": {},
186
+ }
187
+ )
188
 
189
  with open(submission_info_file, "w") as f:
190
  json.dump(temp, f)
 
193
  path_to_cache + f"/submissions/{team_id}-{submission_id}.csv", index=False
194
  )
195
 
196
+
197
  def create_custom_subs():
198
  import numpy as np
199
+
200
+ for threshold in np.linspace(-6, 0, 10):
201
+ add_custom_submission(
202
+ path_to_cache="competition_cache/safe-challenge/video-challenge-task-1-config",
203
+ path_to_subfile="competition_cache/custom/Scores-DSRI-brian.txt",
204
+ threshold=threshold,
205
+ )
206
 
207
 
208
  if __name__ == "__main__":
 
213
  "safe-challenge/video-challenge-task-1-config",
214
  "safe-challenge/video-challenge-task-2-config",
215
  ]
216
+ # download_competition_data(competition_names=spaces)
217
 
 
218
  if os.environ.get("MAKE_CUSTOM"):
219
  print("adding custom subs")
220
  create_custom_subs()
 
 
 
221
 
222
  ## Loop
223
  for space in spaces:
 
266
  scores = ["source"]
267
  for score_name in scores:
268
  ## Loop and save by team
269
+ public, private, private_only, rocs = [], [], [], []
270
  # for team_id, submission_set in submissions.items():
271
  for team_id, submission_set_ids in submission_summaries.query("status_reason=='SUCCESS'").groupby(
272
  "team_id"
 
301
  for key, value in results.items()
302
  if key in team_submissions
303
  }
304
+ private_only_results = {
305
+ key: prep_private(value["private_only_score"])
306
+ for key, value in results.items()
307
+ if key in team_submissions
308
+ }
309
 
310
  ## Add timing
311
  public_times = {
 
320
  ["submission_id", "private_time"]
321
  ].to_dict(orient="records")
322
  }
323
+ private_only_times = {
324
+ x["submission_id"]: x["private_time"] - x["public_time"]
325
+ for x in submission_summaries[submission_summaries["submission_id"].isin(results.keys())][
326
+ ["submission_id", "private_time", "public_time"]
327
+ ].to_dict(orient="records")
328
+ }
329
  for key in public_results.keys():
330
  public_results[key]["total_time"] = public_times[key]
331
  for key in private_results.keys():
332
  private_results[key]["total_time"] = private_times[key]
333
+ for key in private_only_results.keys():
334
+ private_only_results[key]["total_time"] = private_only_times[key]
335
 
336
  ## Roc computations
337
  roc_results = {
 
441
  )
442
  private.append(private_df)
443
 
444
+ ## Private ONLY results
445
+ private_only_df = pd.json_normalize(private_only_results.values())
446
+ private_only_df.insert(
447
+ loc=0,
448
+ column="submission_id",
449
+ value=list(private_only_results.keys()),
450
+ )
451
+ private_only_df.insert(
452
+ loc=0,
453
+ column="team",
454
+ value=[
455
+ teams[teams.id == member_map[team_submissions[submission_id]]].name.values[0]
456
+ for submission_id in private_only_results.keys()
457
+ ],
458
+ )
459
+ private_only_df.insert(
460
+ loc=0,
461
+ column="team_id",
462
+ value=[
463
+ teams[teams.id == member_map[team_submissions[submission_id]]].id.values[0]
464
+ for submission_id in private_only_results.keys()
465
+ ],
466
+ )
467
+ private_only_df.insert(
468
+ loc=0,
469
+ column="datetime",
470
+ value=[
471
+ submission_summaries[submission_summaries.submission_id == submission_id].datetime.values[0]
472
+ for submission_id in private_only_results.keys()
473
+ ],
474
+ )
475
+ private_only.append(private_only_df)
476
+
477
  ## Save as csvs
478
  public = pd.concat(public, axis=0, ignore_index=True).sort_values(by="balanced_accuracy", ascending=False)
479
  private = pd.concat(private, axis=0, ignore_index=True).sort_values(by="balanced_accuracy", ascending=False)
480
+ private_only = pd.concat(private_only, axis=0, ignore_index=True).sort_values(
481
+ by="balanced_accuracy", ascending=False
482
+ )
483
  rocs = pd.concat(rocs, axis=0, ignore_index=True).explode(["tpr", "fpr", "threshold"], ignore_index=True)
484
  public.to_csv(
485
  Path("competition_cache")
 
493
  / f"{str(local_dir).split('/')[-1]}_{score_name}_private_score.csv",
494
  index=False,
495
  )
496
+ private_only.to_csv(
497
+ Path("competition_cache")
498
+ / "cached_results"
499
+ / f"{str(local_dir).split('/')[-1]}_{score_name}_private_only_score.csv",
500
+ index=False,
501
+ )
502
+
503
  rocs.to_csv(
504
  Path("competition_cache") / "cached_results" / f"{str(local_dir).split('/')[-1]}_{score_name}_rocs.csv",
505
  index=False,