Spaces:

mekbus
/

xss-php

Sleeping

App Files Files Community

daniel committed 6 days ago

Commit

c3717d3

0 Parent(s):

Clean Space - loads model from Hub

Browse files

Files changed (14) hide show

.gitattributes +2 -0
.gitignore +11 -0
Dockerfile +20 -0
README.md +47 -0
api/__init__.py +1 -0
api/dependencies.py +32 -0
api/main.py +56 -0
api/routes/__init__.py +1 -0
api/routes/health.py +38 -0
api/routes/scan.py +245 -0
api/services/__init__.py +1 -0
api/services/model_service.py +165 -0
requirements.txt +7 -0
test_api.py +94 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ *.bin filter=lfs diff=lfs merge=lfs -text
2	+ *.safetensors filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,11 @@

+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+env/
+venv/
+models/
+*.log
+.env
+.DS_Store

Dockerfile ADDED Viewed

	@@ -0,0 +1,20 @@

+# Base image: slim variant of the official Python 3.11 image.
+FROM python:3.11-slim
+# NOTE(review): this working directory is superseded by `WORKDIR $HOME/app` below.
+WORKDIR /app
+# Install the C/C++ compilers and drop the apt package lists in the same layer
+# (presumably needed to build native extensions from requirements.txt - confirm).
+RUN apt-get update && apt-get install -y gcc g++ && rm -rf /var/lib/apt/lists/*
+# Create a user with UID 1000 and run every following step as that non-root user.
+RUN useradd -m -u 1000 user
+USER user
+# Put the user's ~/.local/bin first on PATH.
+ENV HOME=/home/user PATH=/home/user/.local/bin:$PATH
+WORKDIR $HOME/app
+# Install the Python dependencies, then copy only the `api` package into the image.
+COPY --chown=user requirements.txt .
+RUN pip install --no-cache-dir --upgrade pip && pip install --no-cache-dir -r requirements.txt
+COPY --chown=user api/ ./api/
+# Model loaded from HuggingFace Hub at runtime
+# Only the PHP repository is set here; api/services/model_service.py falls back
+# to its built-in default when JS_MODEL_REPO is unset.
+ENV PHP_MODEL_REPO=mekbus/codebert-xss-php
+# Serve the FastAPI app with uvicorn on all interfaces, port 7860.
+EXPOSE 7860
+CMD ["python", "-m", "uvicorn", "api.main:app", "--host", "0.0.0.0", "--port", "7860"]

README.md ADDED Viewed

	@@ -0,0 +1,47 @@

+# XSS Vulnerability Scanner API
+A FastAPI-based API for detecting XSS vulnerabilities in JavaScript and PHP code using fine-tuned CodeBERT models.
+## Features
+- **JavaScript XSS Detection** - Trained on 14,000+ patterns
+- **PHP XSS Detection** - Trained on 9,700+ balanced patterns
+- **Multi-vulnerability detection** - Finds multiple vulnerabilities per file
+- **Chunking support** - Handles large files by splitting into chunks
+## API Endpoints
+### Health Check
+```
+GET /api/v1/health
+```
+### Scan Code
+```
+POST /api/v1/scan
+{
+    "code": "<?php echo $_GET['name']; ?>",
+    "language": "php"
+}
+```
+### Languages Supported
+- `php` - PHP code
+- `js` / `javascript` - JavaScript code
+## Models
+This Space uses fine-tuned CodeBERT models:
+- PHP Model: `checkpoint-1867` (92% accuracy on test cases)
+- JS Model: `best_model` (trained on 14k real-world patterns)
+## Local Development
+```bash
+pip install -r requirements.txt
+python -m api.main
+```
+## License
+MIT License

api/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # Empty __init__ files for Python modules

api/dependencies.py ADDED Viewed

	@@ -0,0 +1,32 @@

+"""
+Dependency injection for FastAPI
+"""
+from typing import Optional
+from fastapi import HTTPException
+from api.services.model_service import ModelService
+# Global model service instance
+model_service: Optional[ModelService] = None
+def get_model_service() -> ModelService:
+    """Dependency injection for model service"""
+    if model_service is None:
+        raise HTTPException(status_code=503, detail="Models not loaded")
+    return model_service
+async def initialize_models():
+    """Initialize models on startup"""
+    global model_service
+    print("🚀 Loading CodeBERT models...")
+    model_service = ModelService()
+    await model_service.load_models()
+    print("✅ Models loaded successfully!")
+def cleanup_models():
+    """Drop the shared model service when the application shuts down."""
+    global model_service
+    print("👋 Shutting down...")
+    # With the reference cleared, get_model_service() answers 503 again.
+    model_service = None

api/main.py ADDED Viewed

	@@ -0,0 +1,56 @@

+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+import uvicorn
+from contextlib import asynccontextmanager
+from api.routes import scan, health
+from api import dependencies
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Load models on startup, cleanup on shutdown"""
+    await dependencies.initialize_models()
+    yield
+    dependencies.cleanup_models()
+# Application object; model loading and cleanup are delegated to `lifespan`.
+app = FastAPI(
+    title="XSS Detection API",
+    description="CodeBERT-based XSS vulnerability detection for PHP and JavaScript",
+    version="1.0.0",
+    lifespan=lifespan
+)
+# CORS configuration
+# NOTE(review): wildcard origins are combined with allow_credentials=True here.
+# Browsers reject `Access-Control-Allow-Origin: *` on credentialed requests, and
+# no endpoint in this package reads cookies or auth headers - confirm whether
+# credentials are needed before shipping this policy.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # In production, replace with your frontend URL
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Include routers
+# Both routers share the /api/v1 prefix (e.g. /api/v1/scan, /api/v1/health).
+app.include_router(scan.router, prefix="/api/v1", tags=["scan"])
+app.include_router(health.router, prefix="/api/v1", tags=["health"])
+@app.get("/")
+async def root():
+    return {
+        "service": "XSS Detection API",
+        "version": "1.0.0",
+        "status": "running",
+        "docs": "/docs"
+    }
+# Development entry point (`python -m api.main`): serves on port 8080 with
+# auto-reload enabled. The Docker image starts uvicorn itself on port 7860.
+if __name__ == "__main__":
+    uvicorn.run(
+        "api.main:app",
+        host="0.0.0.0",
+        port=8080,
+        reload=True,
+        log_level="info"
+    )

api/routes/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # Empty __init__ file

api/routes/health.py ADDED Viewed

	@@ -0,0 +1,38 @@

+from fastapi import APIRouter, Depends
+from pydantic import BaseModel
+from api.dependencies import get_model_service
+from api.services.model_service import ModelService
+router = APIRouter()
+# Response body of GET /health.
+class HealthResponse(BaseModel):
+    # Always "healthy" when produced by health_check.
+    status: str
+    # True when the corresponding model attribute on ModelService is not None.
+    php_model_loaded: bool
+    js_model_loaded: bool
+    # str() of the torch device the service selected.
+    device: str
+@router.get("/health", response_model=HealthResponse)
+async def health_check(model_service: ModelService = Depends(get_model_service)):
+    """
+    Health check endpoint for load balancer
+    """
+    return HealthResponse(
+        status="healthy",
+        php_model_loaded=model_service.php_model is not None,
+        js_model_loaded=model_service.js_model is not None,
+        device=str(model_service.device)
+    )
+@router.get("/metrics")
+async def metrics():
+    """
+    Prometheus metrics endpoint (placeholder)
+    """
+    return {
+        "status": "ok",
+        "metrics": "Not implemented yet"
+    }

api/routes/scan.py ADDED Viewed

	@@ -0,0 +1,245 @@

+from fastapi import APIRouter, HTTPException, Depends
+from pydantic import BaseModel, Field
+from typing import List, Optional
+from enum import Enum
+from api.dependencies import get_model_service
+from api.services.model_service import ModelService
+router = APIRouter()
+# Languages accepted in ScanRequest.language. "js" and "javascript" are
+# separate members; ModelService.predict_multi routes both to the JS model.
+class LanguageEnum(str, Enum):
+    PHP = "php"
+    JS = "js"
+    JAVASCRIPT = "javascript"
+# One reported finding inside a scanned file.
+class VulnerabilityDetail(BaseModel):
+    type: str = "xss"
+    # scan_code assigns "critical", "high", "medium" or "low" from the confidence.
+    severity: str
+    # First line of the flagged range as reported by the model service.
+    line_number: Optional[int] = None
+    description: str
+    # scan_code caps this at 500 characters.
+    code_snippet: str
+    # Language-specific remediation hint from _get_suggestion.
+    suggestion: str
+# Request body for POST /scan; also the element type of BatchScanRequest.files.
+class ScanRequest(BaseModel):
+    code: str = Field(..., description="Source code to analyze")
+    language: LanguageEnum = Field(..., description="Programming language (php or js)")
+    # Not read by the scan handlers in this module.
+    file_path: Optional[str] = Field(None, description="File path for context")
+# Result of scanning one file.
+class ScanResult(BaseModel):
+    is_vulnerable: bool
+    # scan_code fills this with predict_multi's max_confidence.
+    confidence: float
+    # "VULNERABLE" or "SAFE" from scan_code; "ERROR" for a failed batch item.
+    label: str
+    vulnerabilities: List[VulnerabilityDetail] = []
+    # Elapsed time measured inside scan_code, in milliseconds.
+    processing_time_ms: Optional[int] = None
+    # Always False in this module.
+    cached: bool = False
+# Request body for POST /scan/batch: the files to scan, processed in order.
+class BatchScanRequest(BaseModel):
+    files: List[ScanRequest]
+# Response body for POST /scan/batch.
+class BatchScanResult(BaseModel):
+    # Random UUID4 string generated per batch request by scan_batch.
+    job_id: str
+    # Number of files in the request.
+    total_files: int
+    # One entry per requested file, in request order.
+    results: List[ScanResult]
+@router.post("/scan", response_model=ScanResult)
+async def scan_code(
+    request: ScanRequest,
+    model_service: ModelService = Depends(get_model_service)
+):
+    """
+    Analyze a single code snippet for XSS vulnerabilities
+    """
+    try:
+        import time
+        start = time.time()
+        # Run prediction with multi-vulnerability support
+        result = model_service.predict_multi(
+            request.code,
+            request.language.value
+        )
+        # Build vulnerability list from all detected vulnerabilities
+        vulnerabilities = []
+        for vuln_info in result['vulnerabilities']:
+            confidence = vuln_info['confidence']
+            # Determine severity based on confidence
+            if confidence >= 0.95:
+                severity = "critical"
+            elif confidence >= 0.85:
+                severity = "high"
+            elif confidence >= 0.70:
+                severity = "medium"
+            else:
+                severity = "low"
+            # Get code snippet for this line range
+            lines = request.code.split('\n')
+            start_line = vuln_info['start_line']
+            end_line = min(vuln_info['end_line'], len(lines))
+            code_snippet = '\n'.join(lines[start_line-1:min(start_line+5, end_line)])
+            vuln = VulnerabilityDetail(
+                type="xss",
+                severity=severity,
+                line_number=start_line,
+                description=f"Potential XSS vulnerability detected with {confidence:.1%} confidence (lines {start_line}-{end_line})",
+                code_snippet=code_snippet[:500],  # Limit snippet length
+                suggestion=_get_suggestion(request.language.value)
+            )
+            vulnerabilities.append(vuln)
+        processing_time = int((time.time() - start) * 1000)
+        # Use max confidence for overall result
+        max_confidence = result['max_confidence']
+        is_vulnerable = result['is_vulnerable']
+        label = "VULNERABLE" if is_vulnerable else "SAFE"
+        return ScanResult(
+            is_vulnerable=is_vulnerable,
+            confidence=max_confidence,
+            label=label,
+            vulnerabilities=vulnerabilities,
+            processing_time_ms=processing_time,
+            cached=False
+        )
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+@router.post("/scan/batch", response_model=BatchScanResult)
+async def scan_batch(
+    request: BatchScanRequest,
+    model_service: ModelService = Depends(get_model_service)
+):
+    """
+    Analyze multiple code files in batch
+    """
+    import uuid
+    job_id = str(uuid.uuid4())
+    results = []
+    for file_request in request.files:
+        try:
+            result = await scan_code(file_request, model_service)
+            results.append(result)
+        except Exception as e:
+            # Add error result
+            results.append(ScanResult(
+                is_vulnerable=False,
+                confidence=0.0,
+                label="ERROR",
+                vulnerabilities=[],
+                processing_time_ms=0,
+                cached=False
+            ))
+    return BatchScanResult(
+        job_id=job_id,
+        total_files=len(request.files),
+        results=results
+    )
+def _extract_vulnerable_code(code: str, language: str) -> tuple:
+    """
+    Extract the most likely vulnerable code snippet and line number.
+    Returns (code_snippet, line_number)
+    """
+    import re
+    lines = code.split('\n')
+    # Define vulnerable patterns by language
+    if language == "php":
+        patterns = [
+            # Direct output of user input superglobals
+            r'echo\s+\$_(GET|POST|REQUEST|COOKIE)',
+            r'print\s+\$_(GET|POST|REQUEST|COOKIE)',
+            # Echo with array access (database output) - common stored XSS
+            r'echo\s+["\'].*\.\s*\$\w+\[',
+            r'echo\s+["\'].*\$\w+\[.*\]',
+            # Print with concatenation
+            r'print\s+["\'].*\.\s*\$',
+            # Unescaped variable in echo
+            r'echo\s+\$\w+\s*;',
+            r'print\s+\$\w+\s*;',
+            # Short echo tag with variable
+            r'<\?=\s*\$\w+',
+            # Dangerous functions
+            r'eval\s*\(',
+            r'innerHTML\s*=',
+            # SQL with user input (can lead to stored XSS)
+            r'query\s*\(.*\$_(GET|POST|REQUEST)',
+            r'INSERT INTO.*\$\w+',
+            r'mysql_query\s*\(.*\$',
+            # Direct concatenation in HTML
+            r'echo\s+["\']<[^>]+>\s*["\'].*\.\s*\$',
+        ]
+    else:  # JavaScript
+        patterns = [
+            r'innerHTML\s*=',
+            r'outerHTML\s*=',
+            r'document\.write\s*\(',
+            r'eval\s*\(',
+            r'\.html\s*\(',  # jQuery
+            r'insertAdjacentHTML\s*\(',
+            r'location\s*=.*\+',  # URL manipulation
+            r'window\.location\s*=',
+        ]
+    # Search for patterns and find matching lines
+    for i, line in enumerate(lines, 1):
+        for pattern in patterns:
+            if re.search(pattern, line, re.IGNORECASE):
+                # Get context: 2 lines before and after
+                start = max(0, i - 3)
+                end = min(len(lines), i + 2)
+                context_lines = lines[start:end]
+                # Mark the vulnerable line
+                snippet = '\n'.join(context_lines)
+                return snippet, i
+    # If no specific pattern found, skip comments and find real code
+    for i, line in enumerate(lines, 1):
+        stripped = line.strip()
+        # Skip empty lines, comments, and PHP opening tag
+        if (stripped and
+            not stripped.startswith('//') and
+            not stripped.startswith('/*') and
+            not stripped.startswith('*') and
+            not stripped.startswith('#') and
+            stripped != '<?php' and
+            not stripped.startswith('/**')):
+            # Found first real code line, get context
+            start = max(0, i - 1)
+            end = min(len(lines), i + 5)
+            context_lines = lines[start:end]
+            snippet = '\n'.join(context_lines)
+            return snippet, i
+    # Fallback: return truncated code
+    return code[:300] + "..." if len(code) > 300 else code, 1
+def _get_suggestion(language: str) -> str:
+    """Get language-specific security suggestion"""
+    if language == "php":
+        return "Use htmlspecialchars($var, ENT_QUOTES, 'UTF-8') for output encoding"
+    elif language in ["js", "javascript"]:
+        return "Use textContent instead of innerHTML, or sanitize with DOMPurify"
+    return "Sanitize user input before output"

api/services/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # Empty __init__ file

api/services/model_service.py ADDED Viewed

	@@ -0,0 +1,165 @@

+"""
+Model service for XSS detection - loads model from Hugging Face Hub
+"""
+import os
+import re
+import torch
+from typing import Tuple, List
+from transformers import RobertaTokenizer, RobertaForSequenceClassification
+class ModelService:
+    def __init__(self):
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        print(f"Using device: {self.device}")
+        # Load tokenizer
+        self.tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
+        # Load PHP model from HuggingFace Hub
+        php_model_repo = os.getenv('PHP_MODEL_REPO', 'mekbus/codebert-xss-php')
+        try:
+            self.php_model = RobertaForSequenceClassification.from_pretrained(php_model_repo)
+            self.php_model.to(self.device)
+            self.php_model.eval()
+            print(f"✅ PHP model loaded from {php_model_repo}")
+        except Exception as e:
+            print(f"⚠️  PHP model not found: {e}")
+            self.php_model = None
+        # Load JS model from HuggingFace Hub
+        js_model_repo = os.getenv('JS_MODEL_REPO', 'mekbus/codebert-xss-js')
+        try:
+            self.js_model = RobertaForSequenceClassification.from_pretrained(js_model_repo)
+            self.js_model.to(self.device)
+            self.js_model.eval()
+            print(f"✅ JS model loaded from {js_model_repo}")
+        except Exception as e:
+            print(f"⚠️  JS model not found: {e}")
+            self.js_model = None
+    def extract_php_blocks(self, code: str) -> str:
+        """Extract PHP code from mixed PHP/HTML and remove comments"""
+        php_blocks = re.findall(r'<\?(?:php)?(.*?)(?:\?>|$)', code, re.DOTALL | re.IGNORECASE)
+        if php_blocks:
+            processed_blocks = []
+            for block in php_blocks:
+                block = block.strip()
+                if block.startswith('='):
+                    block = 'echo ' + block[1:].strip() + ';'
+                processed_blocks.append(block)
+            php_code = '\n'.join(processed_blocks)
+        else:
+            php_code = code
+        # Remove comments
+        php_code = re.sub(r'/\*.*?\*/', '', php_code, flags=re.DOTALL)
+        php_code = re.sub(r'//.*$', '', php_code, flags=re.MULTILINE)
+        php_code = re.sub(r'#.*$', '', php_code, flags=re.MULTILINE)
+        php_code = re.sub(r'\n\s*\n+', '\n', php_code.strip())
+        return php_code
+    def extract_js_code(self, code: str) -> str:
+        """Extract and clean JavaScript code"""
+        code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)
+        code = re.sub(r'//.*$', '', code, flags=re.MULTILINE)
+        code = re.sub(r'\n\s*\n+', '\n', code.strip())
+        return code
+    def chunk_code(self, code: str, max_tokens: int = 400, overlap: int = 50) -> List[str]:
+        """Split large code into overlapping chunks"""
+        lines = code.split('\n')
+        chunks = []
+        max_lines = 50
+        overlap_lines = 6
+        i = 0
+        while i < len(lines):
+            chunk_lines = lines[i:i + max_lines]
+            chunk = '\n'.join(chunk_lines)
+            if chunk.strip():
+                chunks.append(chunk)
+            i += max_lines - overlap_lines
+        return chunks if chunks else [code]
+    def predict_single(self, code: str, model) -> Tuple[float, float]:
+        """Make a single prediction"""
+        inputs = self.tokenizer(
+            code,
+            return_tensors='pt',
+            truncation=True,
+            max_length=512,
+            padding=True
+        )
+        inputs = {k: v.to(self.device) for k, v in inputs.items()}
+        with torch.no_grad():
+            outputs = model(**inputs)
+            probs = torch.softmax(outputs.logits, dim=1)
+            return probs[0][0].item(), probs[0][1].item()
+    def predict(self, code: str, language: str) -> Tuple[bool, float, str]:
+        """Predict if code is vulnerable"""
+        result = self.predict_multi(code, language)
+        if result['vulnerabilities']:
+            max_vuln = max(result['vulnerabilities'], key=lambda x: x['confidence'])
+            return True, max_vuln['confidence'], "VULNERABLE"
+        else:
+            return False, result['max_confidence'], "SAFE"
+    def predict_multi(self, code: str, language: str) -> dict:
+        """Predict vulnerabilities - returns multiple if found"""
+        if language == 'php':
+            model = self.php_model
+            code = self.extract_php_blocks(code)
+        elif language in ['js', 'javascript']:
+            model = self.js_model
+            code = self.extract_js_code(code)
+        else:
+            raise ValueError(f"Unsupported language: {language}")
+        if model is None:
+            raise RuntimeError(f"{language.upper()} model not loaded")
+        vulnerabilities = []
+        max_vuln_prob = 0.0
+        threshold = 0.5
+        use_chunking = len(code) > 2500
+        if use_chunking:
+            chunks = self.chunk_code(code)
+            print(f"📄 Large {language.upper()} file: {len(chunks)} chunks")
+            lines = code.split('\n')
+            for i, chunk in enumerate(chunks):
+                safe_prob, vuln_prob = self.predict_single(chunk, model)
+                if vuln_prob > max_vuln_prob:
+                    max_vuln_prob = vuln_prob
+                if vuln_prob >= threshold:
+                    start_line = i * 44 + 1
+                    end_line = min(start_line + 49, len(lines))
+                    vulnerabilities.append({
+                        'chunk_id': i + 1,
+                        'start_line': start_line,
+                        'end_line': end_line,
+                        'confidence': vuln_prob
+                    })
+        else:
+            safe_prob, vuln_prob = self.predict_single(code, model)
+            max_vuln_prob = vuln_prob
+            if vuln_prob >= threshold:
+                vulnerabilities.append({
+                    'chunk_id': 1,
+                    'start_line': 1,
+                    'end_line': len(code.split('\n')),
+                    'confidence': vuln_prob
+                })
+        return {
+            'is_vulnerable': len(vulnerabilities) > 0,
+            'max_confidence': max_vuln_prob,
+            'vulnerabilities': vulnerabilities
+        }

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+fastapi==0.104.1
+uvicorn[standard]==0.24.0
+transformers==4.35.2
+torch==2.1.1
+pydantic==2.5.0
+python-multipart==0.0.6
+huggingface-hub>=0.19.0

test_api.py ADDED Viewed

	@@ -0,0 +1,94 @@

+"""
+Test script for XSS Detection API
+Run this after starting the server to verify everything works
+"""
+import requests
+import json
+BASE_URL = "http://localhost:8080/api/v1"
+def test_health():
+    """Test health endpoint"""
+    print("🔍 Testing health endpoint...")
+    response = requests.get(f"{BASE_URL}/health")
+    print(f"Status: {response.status_code}")
+    print(f"Response: {json.dumps(response.json(), indent=2)}\n")
+def test_php_vulnerable():
+    """Test PHP vulnerable code"""
+    print("🔍 Testing PHP vulnerable code...")
+    payload = {
+        "code": "<?php echo $_GET['input']; ?>",
+        "language": "php",
+        "file_path": "test.php"
+    }
+    response = requests.post(f"{BASE_URL}/scan", json=payload)
+    print(f"Status: {response.status_code}")
+    print(f"Response: {json.dumps(response.json(), indent=2)}\n")
+def test_php_safe():
+    """Test PHP safe code"""
+    print("🔍 Testing PHP safe code...")
+    payload = {
+        "code": "<?php echo htmlspecialchars($_GET['input'], ENT_QUOTES, 'UTF-8'); ?>",
+        "language": "php",
+        "file_path": "safe.php"
+    }
+    response = requests.post(f"{BASE_URL}/scan", json=payload)
+    print(f"Status: {response.status_code}")
+    print(f"Response: {json.dumps(response.json(), indent=2)}\n")
+def test_js_vulnerable():
+    """Test JS vulnerable code"""
+    print("🔍 Testing JS vulnerable code...")
+    payload = {
+        "code": "document.getElementById('output').innerHTML = userInput;",
+        "language": "js",
+        "file_path": "test.js"
+    }
+    response = requests.post(f"{BASE_URL}/scan", json=payload)
+    print(f"Status: {response.status_code}")
+    print(f"Response: {json.dumps(response.json(), indent=2)}\n")
+def test_batch():
+    """Test batch scanning"""
+    print("🔍 Testing batch scan...")
+    payload = {
+        "files": [
+            {
+                "code": "<?php echo $_POST['name']; ?>",
+                "language": "php"
+            },
+            {
+                "code": "<?php echo htmlspecialchars($_POST['name']); ?>",
+                "language": "php"
+            }
+        ]
+    }
+    response = requests.post(f"{BASE_URL}/scan/batch", json=payload)
+    print(f"Status: {response.status_code}")
+    print(f"Response: {json.dumps(response.json(), indent=2)}\n")
+if __name__ == "__main__":
+    print("🚀 Starting API tests...\n")
+    try:
+        test_health()
+        test_php_vulnerable()
+        test_php_safe()
+        test_js_vulnerable()
+        test_batch()
+        print("✅ All tests completed!")
+    except requests.exceptions.ConnectionError:
+        print("❌ Error: Cannot connect to API server")
+        print("Make sure the server is running: python -m api.main")
+    except Exception as e:
+        print(f"❌ Error: {e}")