Commit · a90f2dd
1 Parent(s): 371767b
Startup model check: 2025-09-01 09:31:10

Files changed:
- logs/startup_update.log +0 -0
- output/model_results.json +2 -2
- src/predict/pipeline.py +89 -29
- wa.py +0 -0
logs/startup_update.log
CHANGED
Binary files a/logs/startup_update.log and b/logs/startup_update.log differ
output/model_results.json
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:aa6c9cdfa89c172663708c5987d5bd43c108003ba1310cd008090c614bb18ee1
+size 27298
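Note: output/model_results.json is tracked with Git LFS, so the commit rewrites only the pointer file shown above: the version line is unchanged, while the object hash (oid sha256:...) and size in bytes now reference the updated results blob. If the pointer fields are ever needed programmatically, a minimal sketch like the following (illustrative only, not part of this repository) can read them:

# Hypothetical helper: parse a Git LFS pointer file such as output/model_results.json.
# The keys ("version", "oid", "size") follow the pointer format shown in the diff above.
def read_lfs_pointer(path: str) -> dict:
    fields = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            if key:
                fields[key] = value
    return fields

pointer = read_lfs_pointer("output/model_results.json")
print(pointer.get("oid"), pointer.get("size"))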
src/predict/pipeline.py
CHANGED
@@ -190,6 +190,9 @@ class PredictionPipeline:

         should_retrain = self._should_retrain_models()

+        # Track best model across all evaluations
+        best_model_info = {'accuracy': 0, 'model_name': '', 'model': None}
+
         for i, model in enumerate(self.models):
             model_name = model.__class__.__name__
             print(f"\n--- Evaluating Model: {model_name} ---")
@@ -241,6 +244,21 @@
                 'total_fights': len(eval_fights),
                 'model_status': model_status
             }
+
+            # Track best model
+            if accuracy > best_model_info['accuracy']:
+                best_model_info['accuracy'] = accuracy
+                best_model_info['model_name'] = model_name
+                best_model_info['model'] = model
+
+        # Log best model to MLflow
+        if best_model_info['model'] is not None:
+            mlflow.set_experiment("UFC_Best_Models")
+            with mlflow.start_run(run_name="best_model_evaluation"):
+                mlflow.log_metric("best_accuracy", best_model_info['accuracy'])
+                mlflow.log_param("model_type", best_model_info['model_name'])
+                mlflow.sklearn.log_model(best_model_info['model'], "best_model")
+            print(f"Best model logged to MLflow: {best_model_info['model_name']} with {best_model_info['accuracy']:.2f}% accuracy")

         if detailed_report:
             self._report_detailed_results()
@@ -249,7 +267,7 @@

         # Only train and save models if retraining was performed
         if should_retrain:
-            self.
+            self._train_and_save_best_model(best_model_info)

     def run_kfold_cv(self, k: int = 3, holdout_events: int = 1):
         """Performs k-fold cross-validation where each fold is a set of events.
@@ -262,6 +280,9 @@
         # Initialize KFold splitter on events
         kf = KFold(n_splits=k, shuffle=True, random_state=42)

+        # Track best model across all folds
+        best_model_info = {'accuracy': 0, 'model_name': '', 'model': None}
+
         all_fold_metrics = []
         for fold_idx, (train_event_idx, test_event_idx) in enumerate(kf.split(event_list), start=1):
             train_events = [event_list[i] for i in train_event_idx]
@@ -298,19 +319,32 @@

                 acc = correct / len(test_set) if test_set else 0.0
                 fold_results[model_name] = acc
-
-                # Log metrics and model artifact
                 mlflow.log_metric(f"accuracy_{model_name}", acc)
-
+
+                # Update best model tracking
+                if acc > best_model_info['accuracy']:
+                    best_model_info['accuracy'] = acc
+                    best_model_info['model_name'] = model_name
+                    best_model_info['model'] = model

             all_fold_metrics.append(fold_results)

-
+        # Log the overall best model across all folds
+        if best_model_info['model'] is not None:
+            mlflow.set_experiment("UFC_Best_Models")
+            with mlflow.start_run(run_name="kfold_best_model"):
+                mlflow.log_metric("best_accuracy", best_model_info['accuracy'])
+                mlflow.log_param("model_type", best_model_info['model_name'])
+                mlflow.log_param("k_folds", k)
+                mlflow.sklearn.log_model(best_model_info['model'], "best_model")
+            print(f"Overall best model from k-fold CV: {best_model_info['model_name']} with {best_model_info['accuracy']:.2%} accuracy")
+
+        return all_fold_metrics, best_model_info

     def update_models_if_new_data(self):
         """
-        Checks for new data and retrains/saves
-        This
+        Checks for new data and retrains/saves the best model on the full dataset if needed.
+        This runs a quick evaluation to determine the best model.
         """
         print("\n--- Checking for Model Updates ---")

@@ -318,28 +352,50 @@
         missing_models = [m for m in self.models if not self._model_exists(m)]
         has_new_data = self._has_new_data_since_last_training()

-        if missing_models:
-
-
-
-
-
-
+        if missing_models or has_new_data:
+            print("Running quick evaluation to find best model...")
+
+            # Quick evaluation to find best model
+            self._load_and_split_data()
+            eval_fights = [f for f in self.test_fights if f['winner'] not in ["Draw", "NC", ""]]
+
+            best_model_info = {'accuracy': 0, 'model_name': '', 'model': None}
+
+            for model in self.models:
+                model_name = model.__class__.__name__
+                print(f"Evaluating {model_name}...")
+
+                model.train(self.train_fights)
+                correct = 0
+                for fight in eval_fights:
+                    prediction = model.predict(fight)
+                    if prediction.get('winner') == fight['winner']:
+                        correct += 1
+
+                accuracy = (correct / len(eval_fights)) * 100 if eval_fights else 0
+
+                if accuracy > best_model_info['accuracy']:
+                    best_model_info['accuracy'] = accuracy
+                    best_model_info['model_name'] = model_name
+                    best_model_info['model'] = model
+
+            print(f"Best model: {best_model_info['model_name']} with {best_model_info['accuracy']:.2f}% accuracy")
+            self._train_and_save_best_model(best_model_info)
         else:
             print("No new data detected. Models are already up-to-date.")

-    def
-        """Trains
-        print("\n\n--- Training and Saving
+    def _train_and_save_best_model(self, best_model_info):
+        """Trains only the best performing model on the full dataset and saves it."""
+        print("\n\n--- Training and Saving Best Model on Full Dataset ---")

         if not os.path.exists(FIGHTS_CSV_PATH):
-            print(f"Error: Fights data not found at '{FIGHTS_CSV_PATH}'. Cannot save
+            print(f"Error: Fights data not found at '{FIGHTS_CSV_PATH}'. Cannot save model.")
             return

         with open(FIGHTS_CSV_PATH, 'r', encoding='utf-8') as f:
             all_fights = list(csv.DictReader(f))

-        print(f"Training
+        print(f"Training best model on all {len(all_fights)} available fights...")

         if not os.path.exists(MODELS_DIR):
             os.makedirs(MODELS_DIR)
@@ -352,21 +408,25 @@
         latest_event_name = latest_fight['event_name']
         latest_event_date = latest_fight['event_date']

-
-
-
+        if best_model_info['model'] is not None:
+            model = best_model_info['model']
+            model_name = best_model_info['model_name']
+
+            print(f"\n--- Training Best Model: {model_name} ---")
             model.train(all_fights)

-            # Sanitize and save the model
-            file_name = f"{model_name}
+            # Sanitize and save the best model
+            file_name = f"best_{model_name}_{best_model_info['accuracy']:.2f}%.joblib"
             save_path = os.path.join(MODELS_DIR, file_name)
             joblib.dump(model, save_path)
-            print(f"
+            print(f"Best model saved successfully to {save_path} with {best_model_info['accuracy']:.2f}% accuracy")

-
-
-
-
+            # Save the last trained event info
+            if all_fights:
+                self._save_last_trained_event(latest_event_name, latest_event_date)
+                print(f"Updated last trained event: {latest_event_name} ({latest_event_date})")
+        else:
+            print("No best model found to train and save.")

     def _report_summary(self):
         """Prints a concise summary of model performance."""
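The change to src/predict/pipeline.py follows one shared pattern: each evaluation path fills a best_model_info dict ({'accuracy', 'model_name', 'model'}), and only the single best candidate is logged to MLflow and passed to _train_and_save_best_model. The sketch below reproduces that pattern in isolation. It is a minimal illustration, not the pipeline itself: the scikit-learn classifiers and synthetic data are stand-ins for the project's fight-prediction models, while the MLflow calls and the best_model_info structure mirror the diff above.

# Minimal standalone sketch of the best-model tracking + MLflow logging pattern
# added in this commit. The models and data below are placeholder assumptions.
import mlflow
import mlflow.sklearn
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, n_features=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

candidates = [LogisticRegression(max_iter=1000), RandomForestClassifier(n_estimators=100)]
best_model_info = {'accuracy': 0, 'model_name': '', 'model': None}

for model in candidates:
    model_name = model.__class__.__name__
    model.fit(X_train, y_train)
    accuracy = model.score(X_test, y_test) * 100  # percentage, as in the pipeline

    # Keep only the best-performing candidate
    if accuracy > best_model_info['accuracy']:
        best_model_info.update(accuracy=accuracy, model_name=model_name, model=model)

# Log a single "best model" run rather than one run per candidate
if best_model_info['model'] is not None:
    mlflow.set_experiment("UFC_Best_Models")  # experiment name taken from the diff
    with mlflow.start_run(run_name="best_model_evaluation"):
        mlflow.log_metric("best_accuracy", best_model_info['accuracy'])
        mlflow.log_param("model_type", best_model_info['model_name'])
        mlflow.sklearn.log_model(best_model_info['model'], "best_model")
    print(f"Best model: {best_model_info['model_name']} "
          f"({best_model_info['accuracy']:.2f}% accuracy)")

Logging only the best run keeps the MLflow experiment uncluttered; per-model accuracies remain available through the accuracy_{model_name} metrics that the k-fold loop still logs for every candidate.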
wa.py
ADDED
File without changes