Spaces:

tarekmasryo
/

fraud-detection-calibrated

Sleeping

App Files Files Community

fraud-detection-calibrated / app.py

tarekmasryo

add app.py + requirements + sample CSV

cbbf2e1 verified 3 months ago

raw

history blame

3.9 kB


	import gradio as gr
	import pandas as pd
	import numpy as np
	import joblib
	import json
	from huggingface_hub import hf_hub_download

	# === Config ===
	# Hub repository and artifact file names used when loading the model.
	REPO_ID = "TarekMasryo/CreditCard-fraud-detection-ML"  # your model repo
	MODEL_FILENAME = "model_rf_cal.joblib"  # using RF-Cal by default
	META_FILENAME = "meta.json"

	# Feature columns expected by the model, in order: V1..V28 first, then the
	# amount/time-derived columns.
	PCA_FEATURES = ["V" + str(i) for i in range(1, 29)]
	ENGINEERED = [
	    "Amount",
	    "_log_amount",
	    "Hour_from_start_mod24",
	    "is_night_proxy",
	    "is_business_hours_proxy",
	]
	FEATURES = [*PCA_FEATURES, *ENGINEERED]

	# === Load model & thresholds ===
	# Both artifacts are resolved through hf_hub_download when this module is
	# imported, so importing the app performs the download/cache lookup.
	model_path = hf_hub_download(REPO_ID, MODEL_FILENAME)
	# NOTE(security): joblib.load is pickle-based and can execute arbitrary code
	# from the file it reads. This is only acceptable while REPO_ID points at a
	# repository you control/trust; do not point it at untrusted artifacts.
	model = joblib.load(model_path)

	meta_path = hf_hub_download(REPO_ID, META_FILENAME)
	# Read the JSON explicitly as UTF-8 instead of relying on the platform's
	# default text encoding.
	with open(meta_path, "r", encoding="utf-8") as f:
	    meta = json.load(f)

	# Default threshold (Validation P>=90%) for RF-Cal
	DEFAULT_THR = float(meta["thresholds"]["rf_cal"]["p90"])

	def ensure_engineered_columns(df: pd.DataFrame) -> pd.DataFrame:
	    """Return a copy of ``df`` with absent engineered columns filled in.

	    Columns that already exist are never overwritten, and the input frame is
	    not modified. A column is only derived when its source column is present:

	    * ``_log_amount`` from ``Amount`` (``log1p`` of the float value);
	    * ``Hour_from_start_mod24`` from ``Time`` (whole hours of ``Time / 3600``,
	      modulo 24);
	    * ``is_night_proxy`` (hours 22-23 and 0-5) and
	      ``is_business_hours_proxy`` (hours 9-17 inclusive) from
	      ``Hour_from_start_mod24``.

	    The original note says this mirrors the training-time logic; that cannot
	    be verified from this file.
	    """
	    frame = df.copy()

	    def has(column):
	        # Looked up fresh on every call because columns are added as we go.
	        return column in frame.columns

	    if has("Amount") and not has("_log_amount"):
	        amounts = frame["Amount"].astype(float)
	        frame["_log_amount"] = np.log1p(amounts)

	    # Time is treated as seconds from the start of the recording.
	    if has("Time") and not has("Hour_from_start_mod24"):
	        elapsed_hours = np.floor(frame["Time"].astype(float) / 3600)
	        frame["Hour_from_start_mod24"] = (elapsed_hours % 24).astype(int)

	    # Both proxy flags depend on the hour column, whether given or derived.
	    if has("Hour_from_start_mod24"):
	        if not has("is_night_proxy"):
	            night_hours = [*range(22, 24), *range(0, 6)]
	            is_night = frame["Hour_from_start_mod24"].isin(night_hours)
	            frame["is_night_proxy"] = is_night.astype(int)

	        if not has("is_business_hours_proxy"):
	            in_office = frame["Hour_from_start_mod24"].between(9, 17)
	            frame["is_business_hours_proxy"] = in_office.astype(int)

	    return frame

	def predict_csv(file, threshold: float, return_all_rows: bool):
	    """Score an uploaded CSV and return a preview table plus a CSV to download.

	    Args:
	        file: The upload from the file input. Either an object exposing its
	            path as ``.name`` or a plain path string is accepted.
	        threshold: Probability cut-off; rows whose fraud probability is
	            greater than or equal to it get ``Prediction == 1``.
	        return_all_rows: When true, the whole scored frame is displayed and
	            written out. When false, the first 50 rows are displayed and the
	            first 200 rows are written to the CSV.

	    Returns:
	        A ``(display_df, out_path)`` tuple: the frame to show and the path of
	        the written ``predictions.csv``.

	    Raises:
	        gr.Error: If nothing was uploaded, or if required feature columns are
	            still missing after the engineered columns have been derived.
	    """
	    import os
	    import tempfile

	    if file is None:
	        # Clicking "Predict" without an upload used to crash on ``file.name``.
	        raise gr.Error("❌ No file uploaded. Please upload a CSV first.")

	    # Load data (the upload may arrive as a path string or a file wrapper).
	    csv_path = getattr(file, "name", file)
	    df = pd.read_csv(csv_path)
	    df = ensure_engineered_columns(df)

	    # Check required columns
	    missing = [c for c in FEATURES if c not in df.columns]
	    if missing:
	        # The click handler is wired to two outputs, so returning a bare
	        # string here would itself fail; raise a user-visible error instead.
	        raise gr.Error(
	            f"❌ Missing required columns: {missing}. Provide these or include 'Time' and 'Amount' so the app can derive engineered features."
	        )

	    # Predict: column 1 of predict_proba is used as the fraud probability.
	    probs = model.predict_proba(df[FEATURES])[:, 1]
	    preds = (probs >= threshold).astype(int)

	    out = df.assign(Fraud_Probability=probs, Prediction=preds)

	    # Write into a fresh directory per call so concurrent requests cannot
	    # overwrite each other's file; the download keeps the same file name.
	    out_dir = tempfile.mkdtemp(prefix="predictions_")
	    out_path = os.path.join(out_dir, "predictions.csv")
	    (out if return_all_rows else out.head(200)).to_csv(out_path, index=False)

	    # Display top rows + file for download
	    display_df = out if return_all_rows else out.head(50)
	    return display_df, out_path

	# === UI ===
	# Components are created in display order: header text, upload row,
	# threshold/options row, action button, then the two outputs.
	with gr.Blocks() as demo:
	    gr.Markdown("# 💳 Credit Card Fraud Detection — Calibrated RF (HF Model)")
	    intro_text = "".join([
	        "Upload a CSV with transaction rows. The app loads a calibrated Random Forest model ",
	        "and applies the validation P≥90% threshold by default. ",
	        "Required columns: V1..V28, Amount, and either engineered features or a raw Time column ",
	        "(seconds from start) so the app can derive them.",
	    ])
	    gr.Markdown(intro_text)

	    # Input: the CSV to score.
	    with gr.Row():
	        file_in = gr.File(label="Upload CSV", file_types=[".csv"])

	    # Options: decision threshold (pre-set to DEFAULT_THR) and row limit toggle.
	    with gr.Row():
	        thr_label = f"Decision Threshold (default P≥90% = {DEFAULT_THR:.3f})"
	        thr = gr.Slider(
	            minimum=0.0,
	            maximum=1.0,
	            step=0.001,
	            value=DEFAULT_THR,
	            label=thr_label,
	        )
	        all_rows = gr.Checkbox(
	            value=False,
	            label="Return all rows (uncheck to preview first 50)",
	        )

	    btn = gr.Button("Predict")

	    # Outputs: scored table and the CSV written by predict_csv.
	    out_df = gr.Dataframe(label="Predictions")
	    out_file = gr.File(label="Download predictions.csv")

	    btn.click(
	        predict_csv,
	        inputs=[file_in, thr, all_rows],
	        outputs=[out_df, out_file],
	    )

	if __name__ == "__main__":
	    demo.launch()