File size: 3,900 Bytes
cbbf2e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102

import gradio as gr
import pandas as pd
import numpy as np
import joblib
import json
from huggingface_hub import hf_hub_download

# === Config ===
REPO_ID = "TarekMasryo/CreditCard-fraud-detection-ML"  # HF model repo holding the artifacts
MODEL_FILENAME = "model_rf_cal.joblib"                  # calibrated Random Forest (default choice)
META_FILENAME = "meta.json"                             # thresholds & metadata sidecar

# Feature columns the model was trained on: 28 anonymized PCA components
# plus amount/time-derived engineered columns.
PCA_FEATURES = ["V%d" % i for i in range(1, 29)]
ENGINEERED = [
    "Amount",
    "_log_amount",
    "Hour_from_start_mod24",
    "is_night_proxy",
    "is_business_hours_proxy",
]
FEATURES = [*PCA_FEATURES, *ENGINEERED]

# === Load model & thresholds from the Hub ===
model_path = hf_hub_download(REPO_ID, MODEL_FILENAME)
model = joblib.load(model_path)

meta_path = hf_hub_download(REPO_ID, META_FILENAME)
with open(meta_path, "r") as fh:
    meta = json.load(fh)

# Default operating point: the validation threshold achieving P>=90% for RF-Cal.
DEFAULT_THR = float(meta["thresholds"]["rf_cal"]["p90"])

def ensure_engineered_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Derive any missing engineered columns, mirroring the training pipeline.

    Columns already present are kept as-is; the input frame is never mutated.
    """
    out = df.copy()

    # Log-scaled amount smooths the heavy right tail of transaction amounts.
    if "_log_amount" not in out and "Amount" in out:
        out["_log_amount"] = np.log1p(out["Amount"].astype(float))

    # Hour of day derived from the raw Time column (seconds since dataset start).
    if "Hour_from_start_mod24" not in out and "Time" in out:
        hour_of_day = np.floor(out["Time"].astype(float) / 3600) % 24
        out["Hour_from_start_mod24"] = hour_of_day.astype(int)

    # Binary proxy: 1 when the transaction hour falls in the night window.
    if "is_night_proxy" not in out and "Hour_from_start_mod24" in out:
        night_hours = {22, 23, 0, 1, 2, 3, 4, 5}
        out["is_night_proxy"] = out["Hour_from_start_mod24"].isin(night_hours).astype(int)

    # Binary proxy: 1 when the transaction hour falls within 9..17 inclusive.
    if "is_business_hours_proxy" not in out and "Hour_from_start_mod24" in out:
        in_business = out["Hour_from_start_mod24"].between(9, 17)
        out["is_business_hours_proxy"] = in_business.astype(int)

    return out

def predict_csv(file, threshold: float, return_all_rows: bool):
    """Score an uploaded CSV of transactions with the calibrated RF model.

    Parameters
    ----------
    file : Gradio file payload — either an object with a ``.name`` path
        attribute or a plain filepath string (newer Gradio versions).
    threshold : probability cutoff above which a row is flagged as fraud.
    return_all_rows : if True, display and save every row; otherwise
        preview the first 50 rows and save the first 200.

    Returns
    -------
    tuple[pd.DataFrame, str | None]
        Predictions for display and a CSV path for download. On a
        validation failure, a one-row error DataFrame and ``None``.
    """
    # Newer Gradio versions pass a plain filepath string instead of a
    # tempfile wrapper exposing .name — accept both.
    csv_path = getattr(file, "name", file)
    df = pd.read_csv(csv_path)
    df = ensure_engineered_columns(df)

    # Validate that every feature the model expects is present.
    missing = [c for c in FEATURES if c not in df.columns]
    if missing:
        # Keep the output arity consistent with the (Dataframe, File) outputs
        # wired to this handler — returning a bare string would break the UI.
        msg = (f"❌ Missing required columns: {missing}. Provide these or include "
               "'Time' and 'Amount' so the app can derive engineered features.")
        return pd.DataFrame({"error": [msg]}), None

    # Probability of the positive (fraud) class, then hard decision at threshold.
    probs = model.predict_proba(df[FEATURES])[:, 1]
    preds = (probs >= threshold).astype(int)

    out = df.copy()
    out["Fraud_Probability"] = probs
    out["Prediction"] = preds

    # Save a CSV for download (first 200 rows unless all rows requested).
    # NOTE(review): the 200-row save cap differs from the 50-row preview —
    # confirm whether the download was meant to always hold every prediction.
    out_path = "predictions.csv"
    (out if return_all_rows else out.head(200)).to_csv(out_path, index=False)

    # Display all rows, or just the first 50 as a preview.
    display_df = out if return_all_rows else out.head(50)
    return display_df, out_path

# === Gradio UI ===
with gr.Blocks() as demo:
    gr.Markdown("# 💳 Credit Card Fraud Detection — Calibrated RF (HF Model)")
    gr.Markdown(
        "Upload a CSV with transaction rows. The app loads a calibrated Random Forest model "
        "and applies the **validation P≥90% threshold** by default. "
        "Required columns: V1..V28, Amount, and either engineered features or a raw Time column "
        "(seconds from start) so the app can derive them."
    )

    # Inputs: CSV upload, decision threshold (seeded with the P>=90%
    # validation threshold loaded from meta.json), and a toggle between
    # full output and a truncated preview.
    with gr.Row():
        file_in = gr.File(label="Upload CSV", file_types=[".csv"])
    with gr.Row():
        thr = gr.Slider(minimum=0.0, maximum=1.0, value=DEFAULT_THR, step=0.001,
                        label=f"Decision Threshold (default P≥90% = {DEFAULT_THR:.3f})")
        all_rows = gr.Checkbox(label="Return all rows (uncheck to preview first 50)", value=False)

    btn = gr.Button("Predict")

    # Outputs: an interactive table of scored rows plus a downloadable CSV.
    out_df = gr.Dataframe(label="Predictions")
    out_file = gr.File(label="Download predictions.csv")

    # Wire the button to the scoring function defined above.
    btn.click(fn=predict_csv, inputs=[file_in, thr, all_rows], outputs=[out_df, out_file])

# Launch the app only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()