# Page-header residue from the file viewer, kept as comments so the module parses:
#   tarekmasryo's picture
#   add app.py + requirements + sample CSV
#   cbbf2e1 verified
#   raw
#   history blame
#   3.9 kB
import json
import os
import tempfile

import gradio as gr
import joblib
import numpy as np
import pandas as pd
from huggingface_hub import hf_hub_download
# === Config ===
REPO_ID = "TarekMasryo/CreditCard-fraud-detection-ML"  # your model repo
MODEL_FILENAME = "model_rf_cal.joblib"  # using RF-Cal by default
META_FILENAME = "meta.json"

# Engineered features expected by the model.
# FEATURES is used to select (and therefore order) the columns passed to
# model.predict_proba, so keep this order stable.
PCA_FEATURES = [f"V{i}" for i in range(1, 29)]  # "V1" .. "V28"
ENGINEERED = [
    "Amount",
    "_log_amount",
    "Hour_from_start_mod24",
    "is_night_proxy",
    "is_business_hours_proxy",
]
FEATURES = PCA_FEATURES + ENGINEERED

# === Load model & thresholds ===
# Both artifacts are fetched from the Hub at import time.
model_path = hf_hub_download(REPO_ID, MODEL_FILENAME)
# SECURITY NOTE: joblib.load is pickle-based and can execute arbitrary code
# embedded in the file. Only point REPO_ID at a repository you trust.
model = joblib.load(model_path)

meta_path = hf_hub_download(REPO_ID, META_FILENAME)
# JSON is UTF-8 by specification; do not rely on the platform's locale encoding.
with open(meta_path, "r", encoding="utf-8") as f:
    meta = json.load(f)

# Default threshold (Validation P>=90%) for RF-Cal
DEFAULT_THR = float(meta["thresholds"]["rf_cal"]["p90"])
def ensure_engineered_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with derivable engineered features filled in.

    Columns that already exist are never overwritten, and the input frame is
    left untouched. The derivations are meant to mirror the ones used when the
    model was trained:

    * ``_log_amount`` is ``log1p(Amount)`` (needs ``Amount``).
    * ``Hour_from_start_mod24`` is ``floor(Time / 3600) % 24`` as an integer
      (needs ``Time``, in seconds from start).
    * ``is_night_proxy`` is 1 for hours 22-23 and 0-5, else 0.
    * ``is_business_hours_proxy`` is 1 for hours 9-17 inclusive, else 0.

    Features whose source column is absent are simply not added.
    """
    enriched = df.copy()
    hour_col = "Hour_from_start_mod24"

    if "Amount" in enriched.columns and "_log_amount" not in enriched.columns:
        amount = enriched["Amount"].astype(float)
        enriched["_log_amount"] = np.log1p(amount)

    if "Time" in enriched.columns and hour_col not in enriched.columns:
        elapsed_hours = np.floor(enriched["Time"].astype(float) / 3600)
        enriched[hour_col] = (elapsed_hours % 24).astype(int)

    # The two proxies only need the hour-of-day column, whether it was just
    # derived above or supplied by the caller.
    if hour_col in enriched.columns:
        hour = enriched[hour_col]
        if "is_night_proxy" not in enriched.columns:
            night_hours = [22, 23, 0, 1, 2, 3, 4, 5]
            enriched["is_night_proxy"] = hour.isin(night_hours).astype(int)
        if "is_business_hours_proxy" not in enriched.columns:
            in_office_hours = (hour >= 9) & (hour <= 17)
            enriched["is_business_hours_proxy"] = in_office_hours.astype(int)

    return enriched
def predict_csv(file, threshold: float, return_all_rows: bool):
    """Score an uploaded CSV with the fraud model and build the UI outputs.

    Args:
        file: Value of the upload component. Either a path string or an
            object exposing the path as ``.name``; ``None`` when nothing has
            been uploaded.
        threshold: Probability cut-off. Rows whose ``Fraud_Probability`` is
            greater than or equal to it get ``Prediction == 1``.
        return_all_rows: When true, both the preview and the downloadable CSV
            contain every row. Otherwise the preview shows the first 50 rows
            and the CSV holds the first 200.

    Returns:
        ``(display_df, out_path)``: the frame to show and the path of the CSV
        written for download, matching the two outputs wired to the button.

    Raises:
        gr.Error: If no file was uploaded, the CSV is empty, or required
            feature columns are missing and cannot be derived. Raising
            (rather than returning a lone string) keeps the handler
            consistent with its two declared outputs and shows the message
            in the UI.
    """
    preview_rows = 50
    download_rows = 200

    if file is None:
        raise gr.Error("Please upload a CSV file before clicking Predict.")

    # Depending on the Gradio version/configuration the upload arrives either
    # as a plain path or as a temp-file wrapper exposing the path via `.name`.
    csv_path = file if isinstance(file, (str, os.PathLike)) else file.name

    # Load data
    try:
        df = pd.read_csv(csv_path)
    except pd.errors.EmptyDataError as err:
        raise gr.Error("The uploaded CSV is empty.") from err
    df = ensure_engineered_columns(df)

    # Check required columns
    missing = [c for c in FEATURES if c not in df.columns]
    if missing:
        raise gr.Error(
            f"❌ Missing required columns: {missing}. Provide these or include 'Time' and 'Amount' so the app can derive engineered features."
        )
    if df.empty:
        # Header-only upload: there is nothing to score.
        raise gr.Error("The uploaded CSV contains no rows.")

    # Predict: column 1 of predict_proba is taken as the fraud probability.
    probs = model.predict_proba(df[FEATURES])[:, 1]
    preds = (probs >= threshold).astype(int)
    out = df.copy()
    out["Fraud_Probability"] = probs
    out["Prediction"] = preds

    # Write the download into a per-request temp directory so concurrent
    # requests cannot overwrite each other's file; the basename stays
    # "predictions.csv".
    out_dir = tempfile.mkdtemp(prefix="fraud_predictions_")
    out_path = os.path.join(out_dir, "predictions.csv")
    (out if return_all_rows else out.head(download_rows)).to_csv(out_path, index=False)

    # Display top rows + file for download
    display_df = out if return_all_rows else out.head(preview_rows)
    return display_df, out_path
# === Gradio UI ===
# Static texts are assembled up front so the layout below reads top to bottom.
_INTRO_MARKDOWN = (
    "Upload a CSV with transaction rows. The app loads a calibrated Random Forest model "
    "and applies the **validation P≥90% threshold** by default. "
    "Required columns: V1..V28, Amount, and either engineered features or a raw Time column "
    "(seconds from start) so the app can derive them."
)
_THRESHOLD_LABEL = f"Decision Threshold (default P≥90% = {DEFAULT_THR:.3f})"

with gr.Blocks() as demo:
    gr.Markdown("# 💳 Credit Card Fraud Detection — Calibrated RF (HF Model)")
    gr.Markdown(_INTRO_MARKDOWN)

    # Row 1: the CSV upload.
    with gr.Row():
        file_in = gr.File(
            label="Upload CSV",
            file_types=[".csv"],
        )

    # Row 2: decision threshold and the "all rows" toggle side by side.
    with gr.Row():
        thr = gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=DEFAULT_THR,
            step=0.001,
            label=_THRESHOLD_LABEL,
        )
        all_rows = gr.Checkbox(
            label="Return all rows (uncheck to preview first 50)",
            value=False,
        )

    btn = gr.Button("Predict")
    out_df = gr.Dataframe(label="Predictions")
    out_file = gr.File(label="Download predictions.csv")

    # predict_csv receives (file, threshold, return_all_rows) and fills the
    # table and the download slot, in that order.
    btn.click(
        predict_csv,
        inputs=[file_in, thr, all_rows],
        outputs=[out_df, out_file],
    )

if __name__ == "__main__":
    demo.launch()