dungeon29 committed on
Commit
eb3c854
·
verified ·
1 Parent(s): 269756e

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +610 -0
app.py ADDED
@@ -0,0 +1,610 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ from transformers import AutoTokenizer
4
+ from huggingface_hub import hf_hub_download
5
+ import gradio as gr
6
+ import requests
7
+ import re
8
+ import time
9
+ import sys
10
+ import logging
11
+ import urllib3 # Import urllib3 to handle warnings
12
+
13
# --- Suppress specific noisy asyncio errors on shutdown ---
# On Python 3.10+ the "asyncio" logger is raised to WARNING, so lower-severity
# records from that logger are dropped.
if sys.version_info >= (3, 10):
    logging.getLogger("asyncio").setLevel(logging.WARNING)

# --- Suppress InsecureRequestWarning ---
# This is expected behavior for a Phishing Detector as we often scan sites with invalid SSL
# NOTE(review): no request made in this file visibly disables certificate
# verification — confirm this suppression is still needed.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
20
+
21
+ # --- import your architecture ---
22
+ # Make sure this file is in the repo (e.g., models/deberta_lstm_classifier.py)
23
+ # and update the import path accordingly.
24
+ from model import DeBERTaLSTMClassifier # <-- your class
25
+
26
+ # --- Import RAG modules ---
27
+ from rag_engine import RAGEngine
28
+ from llm_client import LLMClient
29
+
30
# --------- Config ----------
# REPO_ID / CKPT_NAME identify the checkpoint file fetched with hf_hub_download;
# MODEL_NAME is the tokenizer loaded with AutoTokenizer; LABELS gives the class
# names in the order of the model's output probabilities.
REPO_ID = "dungeon29/deberta-lstm-detect-phishing"
CKPT_NAME = "pytorch_model.bin"
MODEL_NAME = "microsoft/deberta-base"  # base tokenizer/backbone
LABELS = ["benign", "phishing"]  # adjust to your classes

# If your checkpoint contains hyperparams, you can fetch them like:
# checkpoint.get("config") or checkpoint.get("model_args")
# and pass into DeBERTaLSTMClassifier(**model_args)
39
+
40
# --------- Load model/tokenizer once (global) ----------
# Everything below runs at import time so the model is shared by all requests.
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

ckpt_path = hf_hub_download(repo_id=REPO_ID, filename=CKPT_NAME)
# NOTE(security): torch.load unpickles the file; only load checkpoints from a
# repository you control or trust.
# The checkpoint is read from disk exactly once and reused below for both the
# hyperparameters and the weights.
checkpoint = torch.load(ckpt_path, map_location=device)

# If you saved hyperparams in the checkpoint, use them:
model_args = checkpoint.get("model_args", {})  # e.g., {"lstm_hidden":256, "num_labels":2, ...}
model = DeBERTaLSTMClassifier(**model_args)

# Load weights
try:
    state_dict = checkpoint

    # Handle files saved as a full training checkpoint (weights stored under
    # the "model_state_dict" key) as well as bare state dicts.
    if "model_state_dict" in state_dict:
        state_dict = state_dict["model_state_dict"]

    # strict=False tolerates architecture drift, so surface what was skipped
    # instead of ignoring it silently.
    load_result = model.load_state_dict(state_dict, strict=False)
    missing_keys = list(getattr(load_result, "missing_keys", []))
    unexpected_keys = list(getattr(load_result, "unexpected_keys", []))
    if missing_keys:
        print(f"⚠️ Missing keys when loading weights: {missing_keys}")
    if unexpected_keys:
        print(f"⚠️ Unexpected keys when loading weights: {unexpected_keys}")

    # Check whether the attention layer had weights in the checkpoint.
    if hasattr(model, 'attention') and 'attention.weight' not in state_dict:
        print("⚠️ Loaded model without attention layer, using newly initialized attention weights")
    else:
        print("✅ Load weights successfully!")

except Exception as e:
    print(f"❌ Error when loading weights: {e}")
    raise  # re-raise with the original traceback

model.to(device).eval()
72
+
73
# --------- Initialize RAG & LLM ----------
# Both objects are created once at import time. The RAG engine must exist first
# because its vector_store attribute is handed to the LLM client.
print("Initializing RAG Engine (LangChain)...")
rag_engine = RAGEngine()
print("RAG Engine ready.")

# NOTE(review): this message names Qwen3-0.6B while the model selector in the UI
# offers "Qwen3-32B (API)" and "Qwen3-4B (Local)" — confirm which is current.
print("Initializing Qwen3-0.6B(GGUF) LLM (LangChain)...")
# Pass vector_store to LLMClient for RetrievalQA
llm_client = LLMClient(vector_store=rag_engine.vector_store)
print("LLM ready.")
82
+
83
+ # --------- Helper functions ----------
84
# Compiled once at import time instead of on every call.
_URL_PATTERN = re.compile(
    r'^https?://'  # http:// or https://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+'  # one or more domain labels...
    r'(?:[A-Z]{2,63}|XN--[A-Z0-9-]{1,59})\.?|'  # ...then a TLD (alphabetic or punycode)
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$',  # optional path / query, no whitespace
    re.IGNORECASE,
)


def is_url(text):
    """Return True when *text* is, in its entirety, an http(s) URL.

    The host may be a domain name, ``localhost`` or a dotted IPv4 literal,
    optionally followed by a port and a path/query without whitespace.
    Callers are expected to strip surrounding whitespace first.

    TLDs of up to 63 letters (and punycode ``xn--`` TLDs) are accepted; the
    previous 2-6 letter limit rejected valid hosts such as
    ``http://suspicious-site.example``.
    """
    return _URL_PATTERN.match(text) is not None
94
+
95
def fetch_html_content(url, timeout=10):
    """Download the raw HTML of *url* with curl_cffi, impersonating Chrome.

    Returns a ``(content, status)`` pair:

    * ``(response text, HTTP status code)`` on success;
    * ``(None, "Request error: <code>")`` when the status code is 400 or above;
    * ``(None, "Fetch error: <message>")`` when anything raises, including a
      failed import of curl_cffi. There is no fallback to another HTTP client.
    """
    try:
        # Imported lazily, so a missing curl_cffi is reported as a fetch error.
        from curl_cffi import requests as cffi_requests

        # Chrome impersonation is meant to get past basic anti-bot checks.
        reply = cffi_requests.get(url, impersonate="chrome", timeout=timeout)
        code = reply.status_code

        # Manual equivalent of raise_for_status().
        if code >= 400:
            return None, f"Request error: {code}"
        return reply.text, code
    except Exception as exc:
        return None, f"Fetch error: {exc}"
116
+
117
def predict_single_text(text, text_type="text"):
    """Run the classifier on one piece of text.

    ``text_type`` is accepted for the callers' benefit but is not used here.

    Returns ``(probs, tokens, has_attention, attention_weights)``:
    the softmax probabilities as a list, the tokenizer's tokens for the
    (truncated) input, whether the model returned attention weights, and those
    weights (``None`` when unavailable).
    """
    # Truncate to 512 tokens so long HTML documents still fit.
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )
    # DeBERTa typically doesn't use token_type_ids
    encoded.pop("token_type_ids", None)
    model_inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

    attention_weights = None
    has_attention = False

    with torch.no_grad():
        try:
            outcome = model(**model_inputs, return_attention=True)
        except TypeError:
            # Older model whose forward() has no return_attention parameter.
            logits = model(**model_inputs)
        else:
            if isinstance(outcome, tuple) and len(outcome) == 3:
                # Third element (backbone attentions) is not used.
                logits, attention_weights, _ = outcome
                has_attention = True
            else:
                logits = outcome

    probs = F.softmax(logits, dim=-1).squeeze(0).tolist()

    # Tokens are returned for visualization.
    token_ids = model_inputs['input_ids'].squeeze(0).tolist()
    tokens = tokenizer.convert_ids_to_tokens(token_ids)

    return probs, tokens, has_attention, attention_weights
154
+
155
def combine_predictions(url_probs, html_probs, url_weight=0.3, html_weight=0.7):
    """Blend URL-based and HTML-based class probabilities class by class.

    Each output entry is ``url_weight * url_probs[i] + html_weight * html_probs[i]``.
    Works for any number of classes (previously only the first two were
    combined). If the inputs differ in length, the result covers the classes
    present in both. The weights are applied as given and are not normalised.
    """
    return [
        url_weight * url_p + html_weight * html_p
        for url_p, html_p in zip(url_probs, html_probs)
    ]
162
+
163
+ # --------- Inference function ----------
164
def predict_fn(text: str):
    """Classify a URL or free text and return an HTML report.

    For input that ``is_url`` accepts, the URL string is classified, the page
    is fetched with ``fetch_html_content`` and, when content comes back, the
    URL and HTML predictions are blended with ``combine_predictions``.
    Any other input is classified as plain text.

    Returns:
        A single HTML string in every case (including the empty-input
        warning). The function is wired to one ``gr.HTML`` output, so the
        previous ``(label_dict, html)`` tuple on the success path did not match
        the component it feeds.

    Notes:
        * Tokens and fetch status are HTML-escaped because they can originate
          from the fetched (untrusted) page or from error messages.
        * Token importance is always paired with the token list the attention
          weights were computed for.
    """
    from html import escape  # stdlib; local import keeps this block self-contained

    if not text or not text.strip():
        return "<div style='color: red; padding: 20px; text-align: center;'>⚠️ Please enter a URL or text to analyze.</div>"

    special_tokens = ('[CLS]', '[SEP]', '[PAD]')
    candidate = text.strip()

    if is_url(candidate):
        url = candidate

        # Prediction for the URL string itself.
        url_probs, url_tokens, url_has_attention, url_attention = predict_single_text(url, "URL")

        # Try to fetch the page behind the URL.
        html_content, status = fetch_html_content(url)

        if html_content:
            # Prediction for the raw HTML.
            html_probs, html_tokens, html_has_attention, html_attention = predict_single_text(html_content, "HTML")

            probs = combine_predictions(url_probs, html_probs)
            tokens = url_tokens + ["[SEP]"] + html_tokens[:50]  # limit HTML tokens for display
            has_attention = url_has_attention or html_has_attention
            # Keep the weights together with the tokens they belong to.
            if url_has_attention:
                attention_weights, attention_tokens = url_attention, url_tokens
            else:
                attention_weights, attention_tokens = html_attention, html_tokens

            analysis_type = "Combined URL + HTML Analysis"
            fetch_status = f"✅ Successfully fetched HTML content (Status: {escape(str(status))})"
        else:
            # Fall back to URL-only analysis.
            probs = url_probs
            tokens = url_tokens
            has_attention = url_has_attention
            attention_weights = url_attention
            attention_tokens = url_tokens

            analysis_type = "URL-only Analysis"
            fetch_status = f"⚠️ Could not fetch HTML content: {escape(str(status))}"
    else:
        # Regular text.
        probs, tokens, has_attention, attention_weights = predict_single_text(text, "text")
        attention_tokens = tokens
        analysis_type = "Text Analysis"
        fetch_status = ""

    predicted_class = "phishing" if probs[1] > probs[0] else "benign"
    confidence = max(probs)
    content_token_count = sum(1 for t in tokens if t not in special_tokens)

    if predicted_class == "phishing":
        grad_start, grad_end, accent = '#8b0000', '#dc143c', '#ff4444'
        verdict = 'This appears to be a phishing attempt!'
    else:
        grad_start, grad_end, accent = '#006400', '#228b22', '#44ff44'
        verdict = '✅ This appears to be legitimate content.'

    # HTML fragments are collected and joined once at the end.
    parts = [f"""
    <div style="font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; background: #1e1e1e; padding: 20px; border-radius: 15px;">
    <div style="background: linear-gradient(135deg, {grad_start} 0%, {grad_end} 100%); padding: 25px; border-radius: 20px; color: white; text-align: center; margin-bottom: 20px; box-shadow: 0 8px 32px rgba(0,0,0,0.5); border: 2px solid {accent};">
    <h2 style="margin: 0 0 10px 0; font-size: 28px; color: white;">🔍 {analysis_type}</h2>
    <div style="font-size: 36px; font-weight: bold; margin: 10px 0; color: white;">
    {predicted_class.upper()}
    </div>
    <div style="font-size: 18px; color: #f0f0f0;">
    Confidence: {confidence:.1%}
    </div>
    <div style="margin-top: 15px; font-size: 14px; color: #e0e0e0;">
    {verdict}
    </div>
    </div>
    """]

    if fetch_status:
        parts.append(f"""
    <div style="background: #2d2d2d; padding: 15px; border-radius: 10px; margin: 15px 0; border-left: 4px solid #4caf50; color: #e0e0e0;">
    <strong>Fetch Status:</strong> {fetch_status}
    </div>
    """)

    if has_attention and attention_weights is not None:
        attention_scores = attention_weights.squeeze(0).tolist()

        skipped = special_tokens + ('<s>', '</s>')
        token_analysis = []
        for position, (token, score) in enumerate(zip(attention_tokens, attention_scores)):
            # Lenient filter: drop special/blank tokens and negligible scores.
            if token in skipped or not token.strip() or not score > 0.005:
                continue
            clean_token = token.replace(' ', '').replace('Ġ', '').strip()  # handle tokenizer prefixes
            if clean_token:
                token_analysis.append({
                    'token': clean_token,
                    'importance': score,
                    'position': position,
                })

        token_analysis.sort(key=lambda item: item['importance'], reverse=True)
        high_impact_count = sum(1 for item in token_analysis if item['importance'] > 0.05)

        parts.append(f"""
    <h3 style="color: #ffffff; margin: 20px 0 10px 0;">Top important tokens:</h3>
    <div style="background: #2d2d2d; padding: 15px; border-radius: 10px; margin: 15px 0; border-left: 4px solid #4caf50; color: #e0e0e0;">
    <strong>Analysis Info:</strong> Found {len(token_analysis)} important tokens out of {len(attention_tokens)} total tokens
    </div>
    <div style="font-family: Arial, sans-serif;">
    """)

        for rank, token_info in enumerate(token_analysis[:10], start=1):  # top 10 tokens
            bar_width = int(token_info['importance'] * 100)
            parts.append(f"""
    <div style="margin: 8px 0; display: flex; align-items: center; background: #2d2d2d; padding: 8px; border-radius: 8px; border-left: 4px solid {accent};">
    <div style="width: 30px; text-align: right; margin-right: 10px; font-weight: bold; color: #ffffff;">
    {rank}.
    </div>
    <div style="width: 120px; margin-right: 10px; font-weight: bold; color: #e0e0e0; text-align: right;">
    {escape(token_info['token'])}
    </div>
    <div style="width: 300px; background-color: #404040; border-radius: 10px; overflow: hidden; margin-right: 10px; border: 1px solid #555;">
    <div style="width: {bar_width}%; background-color: {accent}; height: 20px; border-radius: 10px; transition: width 0.3s ease;"></div>
    </div>
    <div style="color: #cccccc; font-size: 12px; font-weight: bold;">
    {token_info['importance']:.1%}
    </div>
    </div>
    """)

        parts.append("</div>\n")

        parts.append(f"""
    <h3 style="color: #ffffff; margin: 20px 0 10px 0;">Detailed analysis:</h3>
    <div style="font-family: Arial, sans-serif; background: linear-gradient(135deg, #1a237e 0%, #3949ab 100%); padding: 20px; border-radius: 15px; color: white; margin: 15px 0; border: 2px solid #3f51b5;">
    <h3 style="margin: 0 0 15px 0; color: white;">Statistical Overview</h3>
    <div style="display: grid; grid-template-columns: repeat(2, 1fr); gap: 15px;">
    <div style="background: rgba(255,255,255,0.1); padding: 15px; border-radius: 10px; border: 1px solid rgba(255,255,255,0.2);">
    <div style="font-size: 24px; font-weight: bold; color: white;">{content_token_count}</div>
    <div style="font-size: 14px; color: #e0e0e0;">Total tokens</div>
    </div>
    <div style="background: rgba(255,255,255,0.1); padding: 15px; border-radius: 10px; border: 1px solid rgba(255,255,255,0.2);">
    <div style="font-size: 24px; font-weight: bold; color: white;">{high_impact_count}</div>
    <div style="font-size: 14px; color: #e0e0e0;">High impact tokens (&gt;5%)</div>
    </div>
    </div>
    </div>
    <div style="font-family: Arial, sans-serif; margin: 15px 0; background: #2d2d2d; padding: 20px; border-radius: 15px; border: 1px solid #555;">
    <h3 style="color: #ffffff; margin-bottom: 15px;"> Prediction Confidence</h3>
    <div style="display: flex; justify-content: space-between; margin-bottom: 10px;">
    <span style="font-weight: bold; color: #ff4444;">Phishing</span>
    <span style="font-weight: bold; color: #44ff44;">Benign</span>
    </div>
    <div style="width: 100%; background-color: #404040; border-radius: 25px; overflow: hidden; height: 30px; border: 1px solid #666;">
    <div style="width: {probs[1]*100:.1f}%; background: linear-gradient(90deg, #ff4444 0%, #ff6666 100%); height: 100%; display: flex; align-items: center; justify-content: center; color: white; font-weight: bold; font-size: 14px;">
    {probs[1]:.1%}
    </div>
    </div>
    <div style="margin-top: 10px; text-align: center; color: #cccccc; font-size: 14px;">
    Benign: {probs[0]:.1%}
    </div>
    </div>
    """)
    else:
        # Fallback report when the model gives no attention weights.
        token_chips = ''.join(
            '<span style="background: #404040; color: #64b5f6; padding: 4px 8px; border-radius: 15px; font-size: 12px; border: 1px solid #666;">'
            f'{escape(token.replace(" ", ""))}</span>'
            for token in tokens
            if token not in special_tokens
        )
        parts.append(f"""
    <div style="background: linear-gradient(135deg, #1a237e 0%, #3949ab 100%); padding: 20px; border-radius: 15px; color: white; margin: 15px 0; border: 2px solid #3f51b5;">
    <h3 style="margin: 0 0 15px 0; color: white;">Basic Analysis</h3>
    <div style="display: grid; grid-template-columns: repeat(3, 1fr); gap: 15px;">
    <div style="background: rgba(255,255,255,0.1); padding: 15px; border-radius: 10px; text-align: center; border: 1px solid rgba(255,255,255,0.2);">
    <div style="font-size: 24px; font-weight: bold; color: white;">{probs[1]:.1%}</div>
    <div style="font-size: 14px; color: #e0e0e0;">Phishing</div>
    </div>
    <div style="background: rgba(255,255,255,0.1); padding: 15px; border-radius: 10px; text-align: center; border: 1px solid rgba(255,255,255,0.2);">
    <div style="font-size: 24px; font-weight: bold; color: white;">{probs[0]:.1%}</div>
    <div style="font-size: 14px; color: #e0e0e0;">Benign</div>
    </div>
    <div style="background: rgba(255,255,255,0.1); padding: 15px; border-radius: 10px; text-align: center; border: 1px solid rgba(255,255,255,0.2);">
    <div style="font-size: 24px; font-weight: bold; color: white;">{content_token_count}</div>
    <div style="font-size: 14px; color: #e0e0e0;">Tokens</div>
    </div>
    </div>
    </div>
    <div style="font-family: Arial, sans-serif; margin: 15px 0; background: #2d2d2d; padding: 20px; border-radius: 15px; border: 1px solid #555;">
    <h3 style="color: #ffffff; margin: 0 0 15px 0;">🔤 Tokens in text:</h3>
    <div style="display: flex; flex-wrap: wrap; gap: 8px;">{token_chips}</div>
    <div style="margin-top: 15px; padding: 10px; background: #3d2914; border-radius: 8px; border-left: 4px solid #ff9800;">
    <strong style="color: #ffcc02;">Debug info:</strong> <span style="color: #e0e0e0;">Found {len(tokens)} total tokens, {content_token_count} content tokens</span>
    </div>
    </div>
    <div style="background: #3d2914; padding: 15px; border-radius: 10px; border-left: 4px solid #ff9800; margin: 15px 0;">
    <p style="margin: 0; color: #ffcc02; font-size: 14px;">
    <strong>Note:</strong> Detailed attention weights analysis is not available for the current model.
    </p>
    </div>
    """)

    # Close the outer container opened in the header fragment.
    parts.append("</div>\n")

    return "".join(parts)
358
+
359
+ # --------- RAG Inference function ----------
360
def rag_predict_fn(text: str, model_selection: str):
    """Ask the LLM client to analyse a message or URL and render the answer as HTML.

    Steps:
        1. If the whole input is a URL (per ``is_url``), fetch it and add up to
           4000 characters of the page to the analysis context.
        2. Otherwise look for the first http(s) URL inside the text (including
           its path and query, minus trailing punctuation) and do the same.
        3. Call ``llm_client.analyze`` and parse ``CLASSIFICATION:``,
           ``CONFIDENCE SCORE:`` and ``EXPLANATION:`` fields from the reply.
           The explanation may span several lines; if none is found the full
           reply is shown.

    Returns:
        ``"Please enter text to analyze."`` for empty input, otherwise an HTML
        string. All dynamic values (LLM output, status message, model name) are
        HTML-escaped because they can be influenced by untrusted page content.
    """
    from html import escape  # stdlib; local import keeps this block self-contained

    if not text or not text.strip():
        return "Please enter text to analyze."

    start_time = time.time()

    input_text = text.strip()
    analysis_context = input_text
    status_msg = ""

    # 1. Direct URL Input
    if is_url(input_text):
        target_url = input_text
        print(f"🌐 Detected Direct URL: {target_url}")
        fetched_content, status = fetch_html_content(target_url)

        if fetched_content:
            truncated_content = fetched_content[:4000]
            analysis_context = f"URL: {target_url}\n\nWebsite Content:\n{truncated_content}\n..."
            status_msg = f"✅ Successfully fetched content from {target_url} (Status: {status})."
        else:
            analysis_context = f"URL: {target_url}\n\n(Could not fetch website content. Error: {status})"
            status_msg = f"⚠️ Failed to fetch URL content: {status}"

    # 2. Text with Embedded URL
    else:
        # First URL in the text, path and query included so the linked page
        # itself (not just the site root) is fetched.
        match = re.search(r'https?://[^\s<>"\']+', input_text)
        # Drop punctuation that belongs to the sentence, not the URL.
        target_url = match.group(0).rstrip('.,;:!?)') if match else ""

        if target_url:
            print(f"📧 Detected Embedded URL: {target_url}")
            fetched_content, status = fetch_html_content(target_url)

            if fetched_content:
                truncated_content = fetched_content[:4000]
                analysis_context = f"Input Text:\n{input_text}\n\n---\nExtracted URL Context ({target_url}):\n{truncated_content}\n..."
                status_msg = f"✅ Found & Fetched embedded URL: {target_url} (Status: {status})"
            else:
                status_msg = f"⚠️ Found embedded URL {target_url} but could not fetch content: {status}"
        else:
            status_msg = "📝 Analyzing raw text input (No active links found)."

    # The LLM client is the module-level instance; retrieval happens inside it.
    response = llm_client.analyze(analysis_context, model_selection=model_selection)
    response = "" if response is None else str(response)

    elapsed_time = time.time() - start_time

    # Parse the "FIELD: value" lines of the reply.
    classification = "UNKNOWN"
    confidence = "N/A"
    explanation_lines = []
    in_explanation = False
    for raw_line in response.split('\n'):
        line = raw_line.strip()
        upper_line = line.upper()
        if upper_line.startswith("CLASSIFICATION:"):
            classification = line.split(":", 1)[1].strip().upper()
            in_explanation = False
        elif upper_line.startswith("CONFIDENCE SCORE:"):
            confidence = line.split(":", 1)[1].strip()
            in_explanation = False
        elif upper_line.startswith("EXPLANATION:"):
            explanation_lines = [line.split(":", 1)[1].strip()]
            in_explanation = True
        elif in_explanation:
            # Continuation of a multi-line explanation.
            explanation_lines.append(line)

    explanation = "\n".join(explanation_lines).strip() or response

    # Determine Color/Icon
    if "PHISHING" in classification:
        label = "PHISHING"
        color_grad = "linear-gradient(135deg, #ff4b1f 0%, #ff9068 100%)"
        icon = "⛔"
        border_col = "#ff4b1f"
    elif "BENIGN" in classification:
        label = "BENIGN"
        color_grad = "linear-gradient(135deg, #11998e 0%, #38ef7d 100%)"
        icon = "✅"
        border_col = "#11998e"
    else:
        label = "UNCERTAIN"
        color_grad = "linear-gradient(135deg, #f8b500 0%, #fceabb 100%)"
        icon = "⚠️"
        border_col = "#f8b500"

    # Escape everything that did not originate in this function.
    safe_confidence = escape(confidence)
    safe_explanation = escape(explanation).replace("\n", "<br>")
    safe_status = escape(status_msg)
    safe_model = escape(str(model_selection))

    # HTML Output
    html_output = f"""
    <div style="font-family: 'Segoe UI', Roboto, Helvetica, Arial, sans-serif; max-width: 800px; margin: 0 auto; background: #1e1e1e; padding: 25px; border-radius: 16px; box-shadow: 0 10px 30px rgba(0,0,0,0.5); border: 1px solid #333;">
    <div style="background: {color_grad}; padding: 30px; border-radius: 12px; color: white; text-align: center; margin-bottom: 25px; box-shadow: 0 4px 15px rgba(0,0,0,0.3); position: relative; overflow: hidden;">
    <div style="position: relative; z-index: 2;">
    <h2 style="margin: 0 0 5px 0; font-size: 42px; font-weight: 800; letter-spacing: 1px; text-shadow: 0 2px 4px rgba(0,0,0,0.2);">{icon} {label}</h2>
    <div style="font-size: 24px; font-weight: 600; opacity: 0.95; margin-bottom: 15px;">Confidence: {safe_confidence}</div>
    <div style="background: rgba(0,0,0,0.2); padding: 15px; border-radius: 8px; text-align: left; font-size: 16px; line-height: 1.5; backdrop-filter: blur(5px);">
    <strong>Explanation:</strong><br>
    {safe_explanation}
    </div>
    </div>
    </div>

    <div style="display: flex; justify-content: space-between; align-items: center; color: #888; font-size: 13px; padding: 0 10px;">
    <div>
    ⏱️ Processing Time: <b>{elapsed_time:.2f}s</b>
    </div>
    <div>
    🛡️ CyberGuard AI Analysis via {safe_model}
    </div>
    </div>

    <div style="background: #2d2d2d; padding: 15px; border-radius: 8px; margin-top: 20px; border-left: 4px solid {border_col}; color: #ccc; font-size: 14px;">
    <strong>Input Status:</strong> {safe_status}<br>
    <span style="font-size: 12px; opacity: 0.7;">AI can make mistakes. Always verify critical URLs manually.</span>
    </div>
    </div>
    """

    return html_output
487
+
488
+ # --------- Gradio UI ----------
489
# Dark-theme CSS overrides; injected into the page through a <style> tag by the
# gr.HTML call at the top of the Blocks layout.
css_style="""
.gradio-container {
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
background-color: #1e1e1e !important;
color: #ffffff !important;
}
/* Customize Buttons */
.gradio-container button.primary, .gradio-container button.secondary {
background-color: #4a4a4a !important;
color: #ffffff !important;
border: 1px solid #666 !important;
}
.gradio-container button.primary:hover, .gradio-container button.secondary:hover {
background-color: #5a5a5a !important;
color: #ffffff !important;
}
/* Customize Textboxes (Inputs) */
.gradio-container textarea, .gradio-container input {
background-color: #3d3d3d !important;
color: #ffffff !important;
border: 1px solid #666 !important;
}
/* Customize Blocks/Panels */
.gradio-container .block {
background-color: #2d2d2d !important;
border: 1px solid #444 !important;
}
"""
517
# Two-tab Gradio UI: tab 1 calls predict_fn, tab 2 calls rag_predict_fn.
with gr.Blocks() as demo:
    # The custom CSS is delivered as an inline <style> element.
    gr.HTML(f"<style>{css_style}</style>")
    gr.Markdown("# 🛡️ Phishing Detector")

    with gr.Tabs():
        # --- Tab 1: Standard Detection ---
        with gr.TabItem("🔍 Standard Detection"):
            gr.Markdown("""
Enter a URL or text for analysis using the DeBERTa + LSTM model.

**Features:**
- **URL Analysis**: For URLs, the system will fetch HTML content and combine both URL and content analysis
- **Combined Prediction**: Uses weighted combination of URL structure and webpage content analysis
- **Visual Analysis**: Predict phishing/benign probability with visual charts
- **Token Importance**: Display the most important tokens in classification
- **Detailed Insights**: Comprehensive analysis of the impact of each token

**How it works for URLs:**
1. Analyze the URL structure itself
2. Fetch the webpage HTML content
3. Analyze the webpage content
4. Combine both results for final prediction (30% URL + 70% content)
""")

            with gr.Row():
                with gr.Column(scale=2):
                    input_box = gr.Textbox(
                        label="URL or text",
                        placeholder="Example: http://suspicious-site.example or paste any text",
                        lines=3
                    )
                    btn_submit = gr.Button("🔍 Analyze", variant="primary")

                    # Clickable sample inputs that populate input_box.
                    gr.Examples(
                        examples=[
                            ["http://rendmoiunserviceeee.com"],
                            ["https://www.google.com"],
                            ["Dear customer, your account has been suspended. Click here to verify your identity immediately."],
                            ["https://mail-secure-login-verify.example/path?token=suspicious"],
                            ["http://paypaI-security-update.net/login"],
                            ["Your package has been delivered successfully. Thank you for using our service."],
                            ["https://github.com/user/repo"],
                            ["Dear customer, your account has been suspended. Click here to verify."],
                        ],
                        inputs=input_box
                    )

                with gr.Column(scale=3):
                    output_html = gr.HTML(label="Analysis Result")

            # Single output component: whatever predict_fn returns is shown in output_html.
            btn_submit.click(fn=predict_fn, inputs=input_box, outputs=output_html)

        # --- Tab 2: LLM + RAG Analysis ---
        with gr.TabItem("🤖 AI Assistant (RAG)"):
            gr.Markdown("""
**AI Assistant** uses **Qwen3** + **LangChain** to explain *why* a message is suspicious.

""")

            with gr.Row():
                with gr.Column(scale=1):
                    # Model Selection
                    # The selected string is passed to rag_predict_fn as model_selection.
                    model_selector = gr.Radio(
                        choices=["Qwen3-32B (API)", "Qwen3-4B (Local)"],
                        value="Qwen3-32B (API)",
                        label="Model Selection",
                        info="Select 'API' for better reasoning (requires Internet) or 'Local' for offline privacy."
                    )

                    rag_input = gr.Textbox(
                        label="Suspicious Text/URL",
                        placeholder="Paste the email content or URL here...",
                        lines=5
                    )

                    btn_rag = gr.Button("🤖 Ask AI Assistant", variant="primary")

                    # Clickable sample inputs that populate rag_input.
                    gr.Examples(
                        examples=[
                            ["Your PayPal account has been suspended. Click http://paypal-verify.com to unlock."],
                            ["Tài khoản ngân hàng của bạn bị khóa. Nhấn vào đây để mở khóa ngay."],
                            ["Your package is ready for delivery. Track here: https://fedex-track.com"],
                        ],
                        inputs=rag_input
                    )

                with gr.Column(scale=1):
                    # Changed from gr.Markdown to gr.HTML for custom styling
                    rag_output = gr.HTML(label="AI Analysis")

            btn_rag.click(fn=rag_predict_fn, inputs=[rag_input, model_selector], outputs=rag_output)
608
+
609
# Start the Gradio server only when this file is run as a script;
# server-side rendering is explicitly turned off.
if __name__ == "__main__":
    demo.launch(ssr_mode=False)