Spaces:

melissachall
/

apt-classifier-demo

Sleeping

App Files Files Community

melissachall committed on Sep 9

Commit

298b5ca

verified ·

1 Parent(s): e044e45

Create app.py

Browse files

Files changed (1) hide show

app.py +968 -0

app.py ADDED Viewed

	@@ -0,0 +1,968 @@

+#!/usr/bin/env python3
+"""
+APT Classification System - Version corrigée et synchronisée
+Correction des incohérences entre Streamlit et Gradio
+"""
+import gradio as gr
+import torch
+import torch.nn as nn
+from transformers import AutoTokenizer, AutoModel, AutoConfig
+import numpy as np
+import json
+import time
+from datetime import datetime
+import plotly.graph_objects as go
+import re
+import requests
+import os
+import io
+from typing import Dict, List, Optional
+import logging
+from dataclasses import dataclass
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+@dataclass
+class ClassificationResult:
+    """Outcome of one classification run, as assembled by ``APTClassifier.classify``."""
+    # Name of the class with the highest probability.
+    predicted_class: str
+    # Probability assigned to ``predicted_class``.
+    confidence: float
+    # Class name -> probability for the (up to) five most probable classes.
+    top5_probabilities: Dict[str, float]
+    # Seconds elapsed inside ``classify`` (measured with ``time.time()``).
+    processing_time: float
+    # Indicator category -> regex matches found in the input text.
+    extracted_features: Dict[str, List[str]]
+    # Human-readable reasons tying the text to the predicted group's profile.
+    attribution_factors: List[str]
+    # ``datetime.now().isoformat()`` taken when the result is built.
+    timestamp: str
+class CySecBERTMaxPerformance(nn.Module):
+    """Version EXACTEMENT identique à Streamlit"""
+    def __init__(
+        self,
+        model_name: str = "markusbayer/CySecBERT",
+        num_classes: int = 12,  # ⚠️ IMPORTANT: Doit correspondre au modèle sauvegardé
+        max_length: int = 384,
+        dropout_rate: float = 0.15
+    ):
+        super(CySecBERTMaxPerformance, self).__init__()
+        self.model_name = model_name
+        self.num_classes = num_classes
+        self.max_length = max_length
+        # CySecBERT specialized for cybersecurity
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.config = AutoConfig.from_pretrained(model_name)
+        self.bert = AutoModel.from_pretrained(model_name)
+        # EXPANDED architecture for maximum capacity
+        self.dropout = nn.Dropout(dropout_rate)
+        self.intermediate1 = nn.Linear(self.config.hidden_size, 512)
+        self.intermediate_dropout1 = nn.Dropout(dropout_rate * 0.6)
+        self.intermediate2 = nn.Linear(512, 256)
+        self.intermediate_dropout2 = nn.Dropout(dropout_rate * 0.7)
+        # Batch normalization for stability
+        self.batch_norm1 = nn.BatchNorm1d(512)
+        self.batch_norm2 = nn.BatchNorm1d(256)
+        self.classifier = nn.Linear(256, num_classes)
+        # Optimized activations
+        self.relu = nn.ReLU()
+        self.gelu = nn.GELU()
+    def forward(self, input_ids, attention_mask):
+        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
+        # [CLS] token with minimal dropout
+        cls_output = outputs.last_hidden_state[:, 0]
+        cls_output = self.dropout(cls_output)
+        # First LARGE intermediate layer
+        intermediate1 = self.gelu(self.intermediate1(cls_output))
+        intermediate1 = self.intermediate_dropout1(intermediate1)
+        if intermediate1.size(0) > 1:
+            intermediate1 = self.batch_norm1(intermediate1)
+        # Second intermediate layer
+        intermediate2 = self.relu(self.intermediate2(intermediate1))
+        intermediate2 = self.intermediate_dropout2(intermediate2)
+        if intermediate2.size(0) > 1:
+            intermediate2 = self.batch_norm2(intermediate2)
+        # Final classification
+        logits = self.classifier(intermediate2)
+        return {
+            'logits': logits,
+            'probabilities': torch.softmax(logits, dim=-1)
+        }
+class APTClassifier:
+    def __init__(self):
+        """Set up static APT profiles and indicator regexes, then load the model.
+
+        Inference runs on CPU. The constructor ends by calling
+        ``self.load_model()``, which downloads the checkpoint and raises
+        ``RuntimeError`` when the model cannot be loaded.
+        """
+        self.device = torch.device('cpu')
+        self.model = None
+        self.class_names = []
+        self.label_encoder = None
+        # APT profiles, identical to the Streamlit version (corrected).
+        # Keys are compared against the checkpoint's class names in load_model.
+        self.apt_profiles = {
+            'APT1': {
+                'country': 'China',
+                'flag': '🇨🇳',
+                'aliases': ['Comment Crew', 'Comment Group', 'PLA Unit 61398', 'Shanghai Group'],
+                'description': 'Chinese cyber espionage group attributed to the People\'s Liberation Army Unit 61398. Known for large-scale intellectual property theft and targeting of over 140 organizations across 20 industries.',
+                'first_observed': '2006',
+                'attribution_confidence': 'High',
+                'sponsor': 'State-sponsored (PLA Unit 61398)',
+                'malware': ['WEBC2', 'BACKDOOR.BARKIOFORK', 'AURIGA', 'BANGAT', 'BISCUIT'],
+                'tools': ['HTRAN', 'GSECDUMP', 'GETMAIL', 'MAPIGET'],
+                'targets': ['Intellectual property', 'Government agencies', 'Industrial companies', 'Legal services', 'IT companies'],
+                'sectors': ['Information Technology', 'Energy', 'Financial Services', 'Government', 'Healthcare'],
+                'regions': ['United States', 'Canada', 'United Kingdom', 'India'],
+                'ttps': ['T1566.001', 'T1059.003', 'T1071.001', 'T1083', 'T1005'],
+                'mitre_groups': ['G0006'],
+                'notable_campaigns': ['Operation Aurora (2009)', 'RSA SecurID breach (2011)', 'Elderwood campaigns'],
+                'motivations': ['Espionage', 'Intellectual property theft'],
+                'sophistication': 'Medium to High'
+            },
+            'APT28': {
+                'country': 'Russia',
+                'flag': '🇷🇺',
+                'aliases': ['Fancy Bear', 'Sofacy', 'Sednit', 'STRONTIUM', 'Pawn Storm', 'Swallowtail'],
+                'description': 'Russian military intelligence cyber operations unit attributed to GRU Unit 26165. Highly sophisticated group known for targeting government, military, and security organizations worldwide.',
+                'first_observed': '2007',
+                'attribution_confidence': 'High',
+                'sponsor': 'State-sponsored (GRU Unit 26165)',
+                'malware': ['X-Agent', 'Sofacy', 'GAMEFISH', 'Zebrocy', 'CHOPSTICK', 'EVILTOSS'],
+                'tools': ['Responder', 'Mimikatz', 'Compiled HTML Help', 'PowerShell Empire'],
+                'targets': ['Government agencies', 'Military organizations', 'Defense contractors', 'Aerospace', 'Media'],
+                'sectors': ['Government', 'Defense', 'Aerospace', 'Media', 'Think Tanks'],
+                'regions': ['United States', 'Europe', 'Asia-Pacific', 'Middle East'],
+                'ttps': ['T1566.001', 'T1059.001', 'T1055', 'T1027', 'T1083', 'T1203'],
+                'mitre_groups': ['G0007'],
+                'notable_campaigns': ['DNC hack (2016)', 'Olympic Destroyer (2018)', 'UEFI rootkit campaigns'],
+                'motivations': ['Espionage', 'Political influence', 'Military intelligence'],
+                'sophistication': 'Very High'
+            },
+            'APT29': {
+                'country': 'Russia',
+                'flag': '🇷🇺',
+                'aliases': ['Cozy Bear', 'The Dukes', 'NOBELIUM', 'Midnight Blizzard', 'UNC2452'],
+                'description': 'Russian foreign intelligence service (SVR) cyber unit. Extremely sophisticated group known for stealth, persistence, and advanced techniques in espionage operations.',
+                'first_observed': '2008',
+                'attribution_confidence': 'High',
+                'sponsor': 'State-sponsored (SVR)',
+                'malware': ['HAMMERTOSS', 'COZYCAR', 'SeaDuke', 'SUNBURST', 'TEARDROP', 'BEACON'],
+                'tools': ['PowerShell', 'WMI', 'Cobalt Strike', 'AdFind', 'BloodHound'],
+                'targets': ['Government agencies', 'Think tanks', 'Healthcare organizations', 'Technology companies'],
+                'sectors': ['Government', 'Healthcare', 'Technology', 'Research', 'NGOs'],
+                'regions': ['United States', 'Europe', 'Global'],
+                'ttps': ['T1566.002', 'T1071.001', 'T1055', 'T1027', 'T1078', 'T1490'],
+                'mitre_groups': ['G0016'],
+                'notable_campaigns': ['SolarWinds supply chain attack (2020)', 'COVID-19 research targeting', 'Azure/M365 attacks'],
+                'motivations': ['Espionage', 'Intelligence gathering', 'Political influence'],
+                'sophistication': 'Very High'
+            },
+            'Lazarus': {
+                'country': 'North Korea',
+                'flag': '🇰🇵',
+                'aliases': ['Lazarus Group', 'Hidden Cobra', 'ZINC', 'TEMP.Hermit', 'Labyrinth Chollima'],
+                'description': 'North Korean state-sponsored hacking group known for financially motivated attacks, cryptocurrency theft, and destructive operations. Connected to RGB (Reconnaissance General Bureau).',
+                'first_observed': '2009',
+                'attribution_confidence': 'High',
+                'sponsor': 'State-sponsored (RGB)',
+                'malware': ['WannaCry', 'HOPLIGHT', 'TYPEFRAME', 'BADCALL', 'FALLCHILL', 'ELECTRICFISH'],
+                'tools': ['PowerShell', 'Mimikatz', 'PsExec', 'Living-off-the-land binaries'],
+                'targets': ['Financial institutions', 'Cryptocurrency exchanges', 'Entertainment companies', 'Defense contractors'],
+                'sectors': ['Financial Services', 'Entertainment', 'Cryptocurrency', 'Defense', 'Healthcare'],
+                'regions': ['Global', 'South Korea', 'United States', 'Europe'],
+                'ttps': ['T1566.001', 'T1059.003', 'T1055', 'T1027', 'T1486', 'T1490'],
+                'mitre_groups': ['G0032'],
+                'notable_campaigns': ['Sony Pictures attack (2014)', 'WannaCry ransomware (2017)', 'SWIFT banking attacks'],
+                'motivations': ['Financial gain', 'Espionage', 'Destruction', 'Sanctions evasion'],
+                'sophistication': 'High'
+            },
+            'Equation': {
+                'country': 'United States (suspected)',
+                'flag': '🇺🇸',
+                'aliases': ['Equation Group', 'EQGRP', 'Tilded Team'],
+                'description': 'Highly sophisticated cyber espionage group suspected to be linked to the NSA. Known for advanced persistent threats, zero-day exploits, and firmware-level implants.',
+                'first_observed': '2001',
+                'attribution_confidence': 'Medium',
+                'sponsor': 'State-sponsored (suspected NSA)',
+                'malware': ['DOUBLEFANTASY', 'EQUATIONDRUG', 'GRAYFISH', 'FANNY', 'STUXNET'],
+                'tools': ['EternalBlue', 'EternalRomance', 'DoublePulsar', 'FuzzBunch'],
+                'targets': ['High-value targets', 'Government agencies', 'Telecommunications', 'Research institutions'],
+                'sectors': ['Government', 'Telecommunications', 'Research', 'Technology', 'Energy'],
+                'regions': ['Middle East', 'Asia', 'Europe', 'Global'],
+                'ttps': ['T1055', 'T1027', 'T1083', 'T1068', 'T1542.009', 'T1014'],
+                'mitre_groups': ['G0020'],
+                'notable_campaigns': ['Operation Equation (2008-2015)', 'STUXNET collaboration', 'Flame malware'],
+                'motivations': ['Espionage', 'Intelligence gathering', 'Sabotage'],
+                'sophistication': 'Extremely High'
+            },
+            'Carbanak': {
+                'country': 'International',
+                'flag': '🌍',
+                'aliases': ['FIN7', 'Carbanak Group', 'Anunak', 'Carbon Spider'],
+                'description': 'Financially motivated cybercriminal organization responsible for stealing over $1 billion from financial institutions worldwide through ATM and point-of-sale attacks.',
+                'first_observed': '2013',
+                'attribution_confidence': 'High',
+                'sponsor': 'Cybercriminal',
+                'malware': ['Carbanak', 'CARBANAK', 'HALFBAKED', 'BABYMETAL', 'GRIFFON'],
+                'tools': ['Cobalt Strike', 'Mimikatz', 'PowerShell Empire', 'Metasploit'],
+                'targets': ['Financial institutions', 'Banks', 'Payment processors', 'Hospitality', 'Retail'],
+                'sectors': ['Financial Services', 'Hospitality', 'Retail', 'Restaurant'],
+                'regions': ['Global', 'United States', 'Europe', 'Asia'],
+                'ttps': ['T1566.001', 'T1059.003', 'T1055', 'T1027', 'T1021.001', 'T1083'],
+                'mitre_groups': ['G0008', 'G0046'],
+                'notable_campaigns': ['Carbanak banking attacks', 'FIN7 point-of-sale attacks', 'Restaurant POS campaigns'],
+                'motivations': ['Financial gain'],
+                'sophistication': 'High'
+            },
+            'APT40': {
+                'country': 'China',
+                'flag': '🇨🇳',
+                'aliases': ['Leviathan', 'TEMP.Periscope', 'TEMP.Jumper', 'Kryptonite Panda'],
+                'description': 'Chinese state-sponsored cyber espionage group focused on maritime industries, engineering companies, and research organizations to support China\'s Belt and Road Initiative.',
+                'first_observed': '2013',
+                'attribution_confidence': 'High',
+                'sponsor': 'State-sponsored (MSS Hainan)',
+                'malware': ['BADFLICK', 'PHOTO', 'HOMEFRY', 'MURKYTOP', 'LUNCHMONEY'],
+                'tools': ['China Chopper', 'Mimikatz', 'PowerShell', 'WMI'],
+                'targets': ['Maritime industries', 'Engineering companies', 'Research organizations', 'Government agencies'],
+                'sectors': ['Maritime', 'Engineering', 'Research', 'Government', 'Healthcare'],
+                'regions': ['United States', 'Europe', 'Asia-Pacific'],
+                'ttps': ['T1566.001', 'T1190', 'T1059.003', 'T1055', 'T1027'],
+                'mitre_groups': ['G0065'],
+                'notable_campaigns': ['Maritime industry targeting', 'COVID-19 research theft', 'Belt and Road surveillance'],
+                'motivations': ['Espionage', 'Economic advantage', 'Strategic intelligence'],
+                'sophistication': 'High'
+            }
+        }
+        # Cybersecurity indicator regexes (identical to the Streamlit version),
+        # keyed by the category name reported in extracted features.
+        self.security_indicators = {
+            'malware': r'\b(trojan|virus|worm|ransomware|backdoor|rootkit|spyware|adware|botnet|rat|loader)\b',
+            'techniques': r'\bT\d{4}(\.\d{3})?\b',
+            'domains': r'\b[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b',
+            'ips': r'\b(?:\d{1,3}\.){3}\d{1,3}\b',
+            'hashes': r'\b[a-fA-F0-9]{32,64}\b',
+            'cve': r'\bCVE-\d{4}-\d{4,}\b',
+            'tools': r'\b(cobalt strike|metasploit|mimikatz|powershell|psexec|wmi|bloodhound)\b'
+        }
+        # Downloads and loads the checkpoint; raises RuntimeError on failure.
+        self.load_model()
+    def download_model_from_hf(self):
+        """Téléchargement robuste avec vérification du checksum"""
+        try:
+            model_url = "https://huggingface.co/melissachall/cysecbert-apt-classifier/resolve/main/best_cysecbert_max_performance.pt"
+            logger.info(f"Downloading model from: {model_url}")
+            response = requests.get(model_url, timeout=300, stream=True)
+            if response.status_code == 200:
+                model_path = "downloaded_model.pt"
+                total_size = int(response.headers.get('content-length', 0))
+                with open(model_path, "wb") as f:
+                    downloaded = 0
+                    for chunk in response.iter_content(chunk_size=8192):
+                        if chunk:
+                            f.write(chunk)
+                            downloaded += len(chunk)
+                            if total_size > 0:
+                                percent = (downloaded / total_size) * 100
+                                if downloaded % 1000000 == 0:  # Log every MB
+                                    logger.info(f"Download progress: {percent:.1f}%")
+                logger.info(f"✅ Model downloaded: {downloaded} bytes")
+                # ✅ VALIDATION CRITIQUE DU MODÈLE TÉLÉCHARGÉ
+                try:
+                    test_checkpoint = torch.load(model_path, map_location='cpu', weights_only=False)
+                    # Vérifier que les champs critiques existent
+                    required_fields = ['model_state_dict', 'class_names']
+                    missing_fields = [field for field in required_fields if field not in test_checkpoint]
+                    if missing_fields:
+                        raise ValueError(f"Missing critical fields in checkpoint: {missing_fields}")
+                    class_names = test_checkpoint.get('class_names', [])
+                    if len(class_names) == 0:
+                        raise ValueError("Checkpoint has empty class_names")
+                    logger.info(f"✅ Model validation passed. Classes: {class_names}")
+                    return model_path
+                except Exception as e:
+                    logger.error(f"❌ Downloaded model validation failed: {e}")
+                    if os.path.exists(model_path):
+                        os.remove(model_path)
+                    return None
+            else:
+                logger.error(f"❌ HTTP {response.status_code} for {model_url}")
+                return None
+        except Exception as e:
+            logger.error(f"❌ Download error: {e}")
+            return None
+    def load_model(self):
+        """Chargement EXACTEMENT identique à Streamlit"""
+        try:
+            # Étape 1: Télécharger le modèle
+            model_path = self.download_model_from_hf()
+            if not model_path or not os.path.exists(model_path):
+                raise RuntimeError("❌ Cannot download model from HuggingFace")
+            # Étape 2: Charger le checkpoint
+            logger.info(f"Loading checkpoint from {model_path}")
+            checkpoint = torch.load(model_path, map_location=self.device, weights_only=False)
+            # ✅ ÉTAPE CRITIQUE: Récupérer les métadonnées EXACTEMENT comme Streamlit
+            self.class_names = checkpoint.get('class_names', [])
+            if not self.class_names:
+                raise RuntimeError("❌ Checkpoint missing class_names. Upload complete .pt file with metadata.")
+            self.label_encoder = checkpoint.get('label_encoder')
+            num_classes = len(self.class_names)
+            logger.info(f"✅ Class names from checkpoint: {self.class_names}")
+            logger.info(f"✅ Number of classes: {num_classes}")
+            # ✅ VÉRIFICATION: Les class_names doivent correspondre aux profils
+            profile_classes = set(self.apt_profiles.keys())
+            checkpoint_classes = set(self.class_names)
+            if profile_classes != checkpoint_classes:
+                logger.warning(f"⚠️ MISMATCH DETECTED!")
+                logger.warning(f"   Profile classes: {profile_classes}")
+                logger.warning(f"   Checkpoint classes: {checkpoint_classes}")
+                logger.warning(f"   Missing in profiles: {checkpoint_classes - profile_classes}")
+                logger.warning(f"   Extra in profiles: {profile_classes - checkpoint_classes}")
+            # Étape 3: Créer le modèle avec les bonnes dimensions
+            self.model = CySecBERTMaxPerformance(
+                num_classes=num_classes,
+                dropout_rate=checkpoint.get('config', {}).get('dropout_rate', 0.15)
+            ).to(self.device)
+            # Étape 4: Charger les poids
+            if 'model_state_dict' not in checkpoint:
+                raise RuntimeError("❌ Checkpoint missing model_state_dict")
+            self.model.load_state_dict(checkpoint['model_state_dict'])
+            self.model.eval()
+            logger.info("✅ MODEL LOADED SUCCESSFULLY - IDENTICAL TO STREAMLIT!")
+            # Nettoyage
+            if os.path.exists(model_path):
+                os.remove(model_path)
+            return True
+        except Exception as e:
+            logger.error(f"❌ Model loading error: {e}")
+            raise RuntimeError(f"Cannot load model: {e}")
+    def extract_features(self, text: str) -> Dict[str, List[str]]:
+        """IDENTIQUE à Streamlit"""
+        features = {}
+        text_lower = text.lower()
+        for feature_type, pattern in self.security_indicators.items():
+            matches = re.findall(pattern, text_lower, re.IGNORECASE)
+            features[feature_type] = list(set(matches))[:10]  # Limit to 10 items
+        return features
+    def get_attribution_factors(self, text: str, predicted_class: str) -> List[str]:
+        """IDENTIQUE à Streamlit"""
+        factors = []
+        text_lower = text.lower()
+        if predicted_class in self.apt_profiles:
+            profile = self.apt_profiles[predicted_class]
+            # Check for group mentions
+            if predicted_class.lower() in text_lower:
+                factors.append(f"Direct mention of {predicted_class}")
+            # Check for aliases
+            for alias in profile.get('aliases', []):
+                if alias.lower() in text_lower:
+                    factors.append(f"Alias detected: {alias}")
+            # Check for known malware
+            for malware in profile.get('malware', []):
+                if malware.lower() in text_lower:
+                    factors.append(f"Known malware: {malware}")
+            # Check for tools
+            for tool in profile.get('tools', []):
+                if tool.lower() in text_lower:
+                    factors.append(f"Known tool: {tool}")
+            # Check for target sectors
+            for target in profile.get('targets', []):
+                if target.lower() in text_lower:
+                    factors.append(f"Target sector match: {target}")
+            # Check for TTPs
+            for ttp in profile.get('ttps', []):
+                if ttp in text:
+                    factors.append(f"MITRE technique: {ttp}")
+        return factors
+    def classify(self, text: str, confidence_threshold: float = 0.5) -> ClassificationResult:
+        """Classification EXACTEMENT identique à Streamlit"""
+        start_time = time.time()
+        # Vérifications strictes
+        if self.model is None:
+            raise RuntimeError("❌ Model not loaded")
+        if not hasattr(self.model, 'tokenizer') or self.model.tokenizer is None:
+            raise RuntimeError("❌ Tokenizer not loaded")
+        logger.info("🚀 Using CySecBERTMaxPerformance (identical to Streamlit)")
+        # Tokenisation IDENTIQUE
+        encoding = self.model.tokenizer(
+            text,
+            max_length=self.model.max_length,
+            padding='max_length',
+            truncation=True,
+            return_tensors='pt'
+        )
+        input_ids = encoding['input_ids'].to(self.device)
+        attention_mask = encoding['attention_mask'].to(self.device)
+        # Prédiction
+        with torch.no_grad():
+            outputs = self.model(input_ids, attention_mask)
+            probabilities = outputs['probabilities'].cpu().numpy()[0]
+        # Top 5 IDENTIQUE
+        top5_indices = np.argsort(probabilities)[::-1][:5]
+        predicted_class = self.class_names[top5_indices[0]]
+        confidence = float(probabilities[top5_indices[0]])
+        # Distribution top 5
+        top5_probabilities = {
+            self.class_names[idx]: float(probabilities[idx])
+            for idx in top5_indices
+        }
+        logger.info(f"✅ Prediction: {predicted_class} ({confidence:.1%})")
+        logger.info(f"✅ Top 5: {top5_probabilities}")
+        # Features et attribution
+        extracted_features = self.extract_features(text)
+        attribution_factors = self.get_attribution_factors(text, predicted_class)
+        processing_time = time.time() - start_time
+        return ClassificationResult(
+            predicted_class=predicted_class,
+            confidence=confidence,
+            top5_probabilities=top5_probabilities,
+            processing_time=processing_time,
+            extracted_features=extracted_features,
+            attribution_factors=attribution_factors,
+            timestamp=datetime.now().isoformat()
+        )
+# ===== FONCTIONS UTILITAIRES IDENTIQUES =====
+def process_uploaded_file(uploaded_file):
+    """Process uploaded file and extract text content"""
+    if uploaded_file is None:
+        return ""
+    try:
+        file_name = uploaded_file.name.lower()
+        if file_name.endswith('.txt'):
+            content = uploaded_file.read()
+            if isinstance(content, bytes):
+                return content.decode('utf-8', errors='ignore')
+            return str(content)
+        elif file_name.endswith('.json'):
+            content = uploaded_file.read()
+            if isinstance(content, bytes):
+                content = content.decode('utf-8')
+            try:
+                json_data = json.loads(content)
+                text_fields = []
+                def extract_text_from_json(obj, depth=0):
+                    if depth > 3:
+                        return
+                    if isinstance(obj, dict):
+                        for key, value in obj.items():
+                            if isinstance(value, str) and len(value) > 10:
+                                text_fields.append(f"{key}: {value}")
+                            elif isinstance(value, (dict, list)):
+                                extract_text_from_json(value, depth + 1)
+                    elif isinstance(obj, list):
+                        for item in obj:
+                            extract_text_from_json(item, depth + 1)
+                extract_text_from_json(json_data)
+                return "\n".join(text_fields)
+            except:
+                return content
+        else:
+            # Generic text extraction
+            content = uploaded_file.read()
+            if isinstance(content, bytes):
+                return content.decode('utf-8', errors='ignore')
+            return str(content)
+    except Exception as e:
+        logger.error(f"File processing error: {e}")
+        return f"Error processing file: {str(e)}"
+def create_prediction_plot(top5_probs):
+    """Créer le graphique des top 5 prédictions"""
+    fig = go.Figure(go.Bar(
+        x=list(top5_probs.values()),
+        y=list(top5_probs.keys()),
+        orientation='h',
+        marker=dict(
+            color=['#667eea', '#764ba2', '#f093fb', '#f5576c', '#4facfe'][:len(top5_probs)],
+            line=dict(color='rgba(50,50,50,0.8)', width=1)
+        ),
+        text=[f"{prob:.2%}" for prob in top5_probs.values()],
+        textposition='auto',
+        textfont=dict(size=12, color='white')
+    ))
+    fig.update_layout(
+        title=dict(
+            text="🎯 Top 5 APT Group Predictions",
+            font=dict(size=18, color='#2c3e50'),
+            x=0.5
+        ),
+        xaxis=dict(
+            title=dict(text="Confidence Score", font=dict(size=14)),
+            tickfont=dict(size=12),
+            range=[0, max(top5_probs.values()) * 1.1]
+        ),
+        yaxis=dict(
+            title=dict(text="APT Groups", font=dict(size=14)),
+            tickfont=dict(size=12)
+        ),
+        height=400,
+        margin=dict(l=100, r=50, t=80, b=50),
+        plot_bgcolor='rgba(248,249,250,0.8)',
+        paper_bgcolor='white'
+    )
+    return fig
+def format_apt_profile(predicted_class, classifier):
+    """Formater le profil APT (IDENTIQUE à Streamlit)"""
+    if predicted_class not in classifier.apt_profiles:
+        return f"<div style='padding: 1rem; background: #f8d7da; border-radius: 8px; color: #721c24;'>⚠️ No profile available for '{predicted_class}'. This might indicate a class mapping issue.</div>"
+    profile = classifier.apt_profiles[predicted_class]
+    html = f"""
+    <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 2rem; border-radius: 15px; margin: 1rem 0; box-shadow: 0 10px 30px rgba(0,0,0,0.15);">
+        <h3 style="margin: 0 0 1.5rem 0; font-size: 1.8rem; text-align: center;">{profile.get('flag', '🌍')} {predicted_class} - Complete Profile</h3>
+        <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 2rem;">
+            <div>
+                <h4 style="color: #ffd700; margin-bottom: 1rem;">📋 Basic Information</h4>
+                <p><strong>Origin:</strong> {profile.get('country', 'Unknown')}</p>
+                <p><strong>First Observed:</strong> {profile.get('first_observed', 'Unknown')}</p>
+                <p><strong>Attribution Confidence:</strong> {profile.get('attribution_confidence', 'Unknown')}</p>
+                <p><strong>Sponsor:</strong> {profile.get('sponsor', 'Unknown')}</p>
+                <p><strong>Sophistication:</strong> {profile.get('sophistication', 'Unknown')}</p>
+                <h4 style="color: #ffd700; margin: 1rem 0;">🎭 Known Aliases</h4>
+                <ul style="margin: 0.5rem 0;">
+                    {''.join([f"<li style='margin: 0.3rem 0;'>{alias}</li>" for alias in profile.get('aliases', [])])}
+                </ul>
+                <h4 style="color: #ffd700; margin: 1rem 0;">🦠 Associated Malware</h4>
+                <div style="display: flex; flex-wrap: wrap; gap: 0.5rem;">
+                    {''.join([f"<span style='background: rgba(255,255,255,0.2); padding: 0.3rem 0.6rem; border-radius: 12px; font-size: 0.9rem;'>{malware}</span>" for malware in profile.get('malware', [])])}
+                </div>
+            </div>
+            <div>
+                <h4 style="color: #ffd700; margin-bottom: 1rem;">🎯 Typical Targets</h4>
+                <ul style="margin: 0.5rem 0;">
+                    {''.join([f"<li style='margin: 0.3rem 0;'>{target}</li>" for target in profile.get('targets', [])])}
+                </ul>
+                <h4 style="color: #ffd700; margin: 1rem 0;">🛠️ Known Tools</h4>
+                <div style="display: flex; flex-wrap: wrap; gap: 0.5rem;">
+                    {''.join([f"<span style='background: rgba(255,255,255,0.2); padding: 0.3rem 0.6rem; border-radius: 12px; font-size: 0.9rem;'>{tool}</span>" for tool in profile.get('tools', [])])}
+                </div>
+                <h4 style="color: #ffd700; margin: 1rem 0;">⚙️ MITRE ATT&CK TTPs</h4>
+                <div style="display: flex; flex-wrap: wrap; gap: 0.5rem;">
+                    {''.join([f"<span style='background: rgba(255,255,255,0.3); padding: 0.3rem 0.6rem; border-radius: 12px; font-family: monospace; font-size: 0.9rem;'>{ttp}</span>" for ttp in profile.get('ttps', [])])}
+                </div>
+            </div>
+        </div>
+        <div style="margin-top: 1.5rem; padding: 1rem; background: rgba(255,255,255,0.1); border-radius: 10px;">
+            <h4 style="color: #ffd700; margin: 0 0 0.5rem 0;">📖 Description</h4>
+            <p style="line-height: 1.6; margin: 0;">{profile.get('description', 'No description available')}</p>
+        </div>
+        <div style="margin-top: 1rem; padding: 1rem; background: rgba(255,255,255,0.1); border-radius: 10px;">
+            <h4 style="color: #ffd700; margin: 0 0 0.5rem 0;">🚨 Notable Campaigns</h4>
+            <ul style="margin: 0.5rem 0;">
+                {''.join([f"<li style='margin: 0.3rem 0;'>{campaign}</li>" for campaign in profile.get('notable_campaigns', [])])}
+            </ul>
+        </div>
+    </div>
+    """
+    return html
+def classify_text(text, uploaded_file, confidence_threshold, show_features, show_attribution, show_profile):
+    """Fonction principale de classification - IDENTIQUE à Streamlit"""
+    # Traitement de l'input
+    input_text = ""
+    file_processed = False
+    if uploaded_file is not None:
+        file_content = process_uploaded_file(uploaded_file)
+        if file_content.strip():
+            input_text = file_content
+            file_processed = True
+    if not input_text and text.strip():
+        input_text = text
+    if not input_text:
+        return (
+            "Please enter text or upload a file",
+            "No confidence",
+            None,
+            "0.000s",
+            "No input provided",
+            "No features extracted" if show_features else "",
+            "No attribution factors" if show_attribution else "",
+            "No profile available" if show_profile else "",
+            {},
+            "File processed successfully" if file_processed else ""
+        )
+    # Classification
+    try:
+        classifier = APTClassifier()
+        result = classifier.classify(input_text, confidence_threshold)
+        # Formatage des résultats
+        confidence_color = "#27ae60" if result.confidence > 0.8 else "#f39c12" if result.confidence > 0.6 else "#e74c3c"
+        main_result = f"""
+        <div style="background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%); padding: 2rem; border-radius: 15px; border-left: 6px solid #667eea; box-shadow: 0 8px 25px rgba(0,0,0,0.1);">
+            <div style="text-align: center; margin-bottom: 1.5rem;">
+                <h2 style="color: #2c3e50; margin: 0;">🛡️ APT Classification Result</h2>
+                <p style="color: #7f8c8d; margin: 0.5rem 0;">✅ Model synchronized with Streamlit version</p>
+            </div>
+            <div style="display: grid; grid-template-columns: 1fr 1fr 1fr; gap: 1rem; margin-bottom: 1rem;">
+                <div style="text-align: center; padding: 1rem; background: white; border-radius: 10px; box-shadow: 0 2px 10px rgba(0,0,0,0.1);">
+                    <h3 style="color: #667eea; margin: 0 0 0.5rem 0;">Predicted Group</h3>
+                    <p style="font-size: 1.5rem; font-weight: bold; color: #2c3e50; margin: 0;">{result.predicted_class}</p>
+                </div>
+                <div style="text-align: center; padding: 1rem; background: white; border-radius: 10px; box-shadow: 0 2px 10px rgba(0,0,0,0.1);">
+                    <h3 style="color: #667eea; margin: 0 0 0.5rem 0;">Confidence</h3>
+                    <p style="font-size: 1.5rem; font-weight: bold; color: {confidence_color}; margin: 0;">{result.confidence:.2%}</p>
+                </div>
+                <div style="text-align: center; padding: 1rem; background: white; border-radius: 10px; box-shadow: 0 2px 10px rgba(0,0,0,0.1);">
+                    <h3 style="color: #667eea; margin: 0 0 0.5rem 0;">Processing Time</h3>
+                    <p style="font-size: 1.5rem; font-weight: bold; color: #2c3e50; margin: 0;">{result.processing_time:.3f}s</p>
+                </div>
+            </div>
+            <div style="text-align: center; padding: 1rem; background: rgba(102, 126, 234, 0.1); border-radius: 10px;">
+                <p style="margin: 0; color: #667eea;"><strong>Analysis completed at:</strong> {datetime.fromisoformat(result.timestamp).strftime('%H:%M:%S UTC')}</p>
+            </div>
+        </div>
+        """
+        # Graphique
+        plot = create_prediction_plot(result.top5_probabilities)
+        # Features
+        features_html = ""
+        if show_features and any(result.extracted_features.values()):
+            features_html = "<div style='background: #f8f9fa; padding: 1.5rem; border-radius: 10px; border-left: 4px solid #17a2b8;'>"
+            features_html += "<h4 style='color: #2c3e50; margin-bottom: 1rem;'>🔍 Extracted Cybersecurity Features</h4>"
+            icon_map = {'malware': '🦠', 'techniques': '⚙️', 'domains': '🌐', 'ips': '🔢', 'hashes': '#️⃣', 'cve': '🚨', 'tools': '🛠️'}
+            for feature_type, feature_list in result.extracted_features.items():
+                if feature_list:
+                    icon = icon_map.get(feature_type, '📌')
+                    features_html += f"<p><strong>{icon} {feature_type.title()}:</strong></p>"
+                    for feature in feature_list[:5]:
+                        features_html += f"<span style='background: #e9ecef; padding: 0.3rem 0.6rem; margin: 0.2rem; border-radius: 12px; font-family: monospace; font-size: 0.9rem; display: inline-block;'>{feature}</span>"
+            features_html += "</div>"
+        # Attribution
+        attribution_html = ""
+        if show_attribution and result.attribution_factors:
+            attribution_html = "<div style='background: #f8f9fa; padding: 1.5rem; border-radius: 10px; border-left: 4px solid #28a745;'>"
+            attribution_html += "<h4 style='color: #2c3e50; margin-bottom: 1rem;'>🎯 Attribution Factors</h4>"
+            for factor in result.attribution_factors:
+                attribution_html += f"<div style='background: #e8f5e8; padding: 0.8rem; margin: 0.5rem 0; border-radius: 8px; border-left: 3px solid #28a745;'>{factor}</div>"
+            attribution_html += "</div>"
+        # Profil
+        profile_html = ""
+        if show_profile:
+            profile_html = format_apt_profile(result.predicted_class, classifier)
+        # Export data
+        export_data = {
+            'predicted_class': result.predicted_class,
+            'confidence': result.confidence,
+            'processing_time': result.processing_time,
+            'top5_probabilities': result.top5_probabilities,
+            'extracted_features': result.extracted_features,
+            'attribution_factors': result.attribution_factors,
+            'timestamp': result.timestamp,
+            'model_info': 'CySecBERTMaxPerformance - Synchronized with Streamlit'
+        }
+        file_status = f"✅ File '{uploaded_file.name}' processed successfully ({len(input_text)} characters)" if file_processed else ""
+        return (
+            result.predicted_class,
+            f"{result.confidence:.2%}",
+            plot,
+            f"{result.processing_time:.3f}s",
+            main_result,
+            features_html,
+            attribution_html,
+            profile_html,
+            export_data,
+            file_status
+        )
+    except Exception as e:
+        error_msg = f"❌ Classification error: {str(e)}"
+        logger.error(error_msg)
+        return (
+            "Error",
+            "0%",
+            None,
+            "0.000s",
+            f"<div style='background: #f8d7da; padding: 1rem; border-radius: 8px; color: #721c24;'>{error_msg}</div>",
+            "",
+            "",
+            "",
+            {},
+            ""
+        )
+# ===== GRADIO INTERFACE =====
+# Custom stylesheet passed to gr.Blocks below: sets the container font and
+# width, and defines the header (.header-*) and status banner (.status-success)
+# classes used by the gr.HTML snippets in the interface.
+css = """
+@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
+.gradio-container {
+    font-family: 'Inter', sans-serif !important;
+    max-width: 1200px !important;
+    margin: 0 auto !important;
+}
+.header-container {
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+    padding: 3rem 2rem;
+    border-radius: 20px;
+    text-align: center;
+    margin-bottom: 2rem;
+    box-shadow: 0 15px 35px rgba(102, 126, 234, 0.3);
+}
+.header-title {
+    color: white;
+    font-size: 3.5rem;
+    font-weight: 700;
+    margin: 0;
+    text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
+}
+.header-subtitle {
+    color: rgba(255,255,255,0.95);
+    font-size: 1.3rem;
+    font-weight: 400;
+    margin: 1rem 0 0 0;
+}
+.status-success {
+    background: linear-gradient(135deg, #28a745 0%, #20c997 100%);
+    color: white;
+    padding: 1rem 2rem;
+    border-radius: 10px;
+    margin: 1rem 0;
+    text-align: center;
+    font-weight: 500;
+    box-shadow: 0 4px 15px rgba(40, 167, 69, 0.3);
+}
+"""
+# Sample reports offered as clickable examples for the text box.
+# The original note said these target the 7 groups of the Streamlit version;
+# NOTE(review): only five samples are listed here - confirm that is intended.
+example_texts = [
+    "Advanced persistent threat campaign attributed to APT28 (Fancy Bear) targeting government entities. Spear-phishing emails deliver X-Agent payload with T1566.001 techniques. Network analysis reveals C2 communications consistent with Sofacy operations and GRU Unit 26165 characteristics.",
+    "Financial institutions targeted by Lazarus Group operations. Watering hole attacks deploy custom malware for cryptocurrency theft. TTPs include T1566.001 and T1059.003, consistent with Hidden Cobra methodologies. HOPLIGHT and TYPEFRAME malware observed.",
+    "Government agencies report sophisticated malware attributed to APT29 (Cozy Bear). Advanced T1566.002 techniques with HAMMERTOSS C2. Campaign characteristics consistent with Russian SVR operations. NOBELIUM techniques with SUNBURST and TEARDROP malware detected.",
+    "Chinese cyber espionage operations attributed to APT1 targeting intellectual property. Comment Crew techniques observed with WEBC2 and BACKDOOR.BARKIOFORK malware. Campaign consistent with PLA Unit 61398 operations targeting industrial companies.",
+    "Advanced threat operations attributed to Equation Group. Zero-day exploits and firmware-level implants detected. DOUBLEFANTASY and EQUATIONDRUG malware with characteristics consistent with NSA-linked operations."
+]
+# Main Gradio interface. Components are created inside nested context
+# managers, so the order of the statements below is the on-screen layout.
+with gr.Blocks(theme=gr.themes.Soft(), css=css, title="APT Classification System - Fixed") as demo:
+    # Header banner (styled by the .header-* classes in `css`)
+    gr.HTML("""
+    <div class="header-container">
+        <h1 class="header-title">🛡️ APT Classification System</h1>
+        <p class="header-subtitle">CySecBERTMaxPerformance - Synchronized with Streamlit</p>
+        <p style="color: rgba(255,255,255,0.9); margin: 0.5rem 0;">🔄 Fixed version - Identical behavior to Streamlit interface</p>
+    </div>
+    """)
+    # Static status banner (text is hard-coded, not computed at runtime)
+    gr.HTML("""
+    <div class="status-success">
+        ✅ <strong>SYNCHRONIZED MODEL</strong> • Fixed class mapping • Identical to Streamlit version
+    </div>
+    """)
+    with gr.Row():
+        # Main input column
+        with gr.Column(scale=2):
+            gr.Markdown("### 📝 Threat Intelligence Input")
+            with gr.Tab("Text Input"):
+                text_input = gr.Textbox(
+                    lines=6,
+                    placeholder="Describe the cybersecurity incident, including TTPs, malware, targets, and attribution indicators...",
+                    label="Incident Description",
+                    show_label=False
+                )
+                # Each example is wrapped in a one-element list because the
+                # examples feed a single input component.
+                gr.Examples(
+                    examples=[[text] for text in example_texts],
+                    inputs=text_input,
+                    label="📚 Test Cases for APT Groups"
+                )
+            with gr.Tab("File Upload"):
+                file_input = gr.File(
+                    file_types=[".txt", ".log", ".json", ".csv"],
+                    label="Upload Threat Intelligence Report",
+                    file_count="single"
+                )
+                # Filled by classify_text with the upload status message
+                file_status = gr.HTML("")
+        # Configuration column
+        with gr.Column(scale=1):
+            gr.Markdown("### ⚙️ Analysis Configuration")
+            confidence_threshold = gr.Slider(
+                0.0, 1.0, value=0.3, step=0.05,
+                label="🎯 Confidence Threshold",
+                info="Minimum confidence for predictions"
+            )
+            gr.Markdown("### 📊 Display Options")
+            # These three toggles map to the show_* parameters of classify_text
+            show_features = gr.Checkbox(value=True, label="🔍 Extract Features")
+            show_attribution = gr.Checkbox(value=True, label="🎯 Show Attribution")
+            show_profile = gr.Checkbox(value=True, label="📋 Complete Profile")
+            analyze_button = gr.Button(
+                "🔍 ANALYZE THREAT",
+                variant="primary",
+                size="lg"
+            )
+            gr.Markdown("### 📈 Model Information")
+            # Static description of the model (hard-coded text)
+            gr.HTML("""
+            <div style="background: #f8f9fa; padding: 1rem; border-radius: 8px; font-size: 0.9rem;">
+                <strong>Status:</strong> ✅ Fixed & Synchronized<br>
+                <strong>Architecture:</strong> CySecBERT + Custom Layers<br>
+                <strong>Classes:</strong> 7 APT groups (same as Streamlit)<br>
+                <strong>Source:</strong> HuggingFace Hub download<br>
+                <strong>Validation:</strong> Class mapping verified
+            </div>
+            """)
+    # Results section
+    gr.Markdown("## 📊 Analysis Results")
+    with gr.Row():
+        predicted_class = gr.Textbox(label="🎯 Predicted APT Group", interactive=False)
+        confidence_display = gr.Textbox(label="📊 Confidence Score", interactive=False)
+        processing_time = gr.Textbox(label="⚡ Processing Time", interactive=False)
+    # Main results
+    main_result = gr.HTML(label="Main Results")
+    # Visualization
+    prediction_plot = gr.Plot(label="Top 5 Predictions Visualization")
+    # Additional results
+    with gr.Row():
+        features_output = gr.HTML(label="Extracted Features")
+        attribution_output = gr.HTML(label="Attribution Factors")
+    # APT Profile
+    profile_output = gr.HTML(label="Complete APT Profile")
+    # Export: the JSON component is created hidden (visible=False), so only
+    # the heading is rendered in this section.
+    gr.Markdown("### 💾 Export Results")
+    export_data = gr.JSON(label="Analysis Data", visible=False)
+    # Event handler: `inputs` follow the parameter order of classify_text and
+    # `outputs` follow the order of the 10-tuple it returns.
+    analyze_button.click(
+        classify_text,
+        inputs=[text_input, file_input, confidence_threshold, show_features, show_attribution, show_profile],
+        outputs=[predicted_class, confidence_display, prediction_plot, processing_time, main_result, features_output, attribution_output, profile_output, export_data, file_status]
+    )
+if __name__ == "__main__":
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,
+        show_error=True
+    )