Spaces:

Really-amin
/

Hoghoghi

Paused

App Files Files Community

Really-amin committed on Aug 4

Commit

bd97f78

verified ·

1 Parent(s): 43ccd9f

Upload main.py

Browse files

Files changed (1) hide show

main.py +273 -0

main.py ADDED Viewed

	@@ -0,0 +1,273 @@

+import os
+import tempfile
+import logging
+import traceback
+from pathlib import Path
+from typing import Dict, Any
+from datetime import datetime
+from fastapi import FastAPI, File, UploadFile, HTTPException, Request
+from fastapi.staticfiles import StaticFiles
+from fastapi.responses import HTMLResponse, FileResponse, JSONResponse
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+# تنظیم logging بر اساس متغیر محیطی LOG_LEVEL
+log_level = os.getenv("LOG_LEVEL", "INFO").upper()
+logging.basicConfig(level=getattr(logging, log_level, logging.INFO))
+logger = logging.getLogger(__name__)
+# بارگذاری کتابخانه‌های پردازش PDF و تصویر
+try:
+    import fitz  # PyMuPDF
+    from PIL import Image
+    import numpy as np
+    PDF_AVAILABLE = True
+    logger.info("✅ PDF processing libraries loaded")
+except ImportError as e:
+    PDF_AVAILABLE = False
+    logger.warning(f"⚠️ PDF libraries not available: {e}")
+# بارگذاری مدل‌های ML
+try:
+    from transformers import TrOCRProcessor, VisionEncoderDecoderModel
+    import torch
+    ML_AVAILABLE = True
+    logger.info("✅ ML libraries loaded")
+except ImportError as e:
+    ML_AVAILABLE = False
+    logger.warning(f"⚠️ ML libraries not available: {e}")
+# OCR response model: the payload returned by the OCR extraction endpoints.
+class OCRResponse(BaseModel):
+    # False when extraction failed (the failure text is then in metadata["error"]).
+    success: bool
+    # Extracted text; the error paths in this file return an empty string.
+    text: str
+    # Backend that produced the result; this file uses "PyMuPDF", "TrOCR",
+    # "Basic" (image opened but no OCR model loaded) and "error".
+    method: str
+    # Free-form details about the run (page counts, image size/mode, error text).
+    metadata: Dict[str, Any]
+# System status model: the payload returned by GET /system/status.
+class SystemStatus(BaseModel):
+    # Overall status label; the endpoint in this file always sends "healthy".
+    status: str
+    # Per-service details keyed by service name (availability flag plus a label).
+    services: Dict[str, Any]
+    # ISO-8601 timestamp produced with datetime.now().isoformat().
+    timestamp: str
+# سرویس OCR
+class OCRService:
+    def __init__(self):
+        self.model = None
+        self.processor = None
+        self.model_loaded = False
+    async def _load_model_async(self):
+        try:
+            logger.info("Loading TrOCR model...")
+            model_name = "microsoft/trocr-base-printed"
+            self.processor = TrOCRProcessor.from_pretrained(model_name)
+            self.model = VisionEncoderDecoderModel.from_pretrained(model_name)
+            self.model_loaded = True
+            logger.info("✅ TrOCR model loaded successfully")
+        except Exception as e:
+            logger.error(f"❌ Failed to load TrOCR model: {e}")
+            self.model_loaded = False
+    async def extract_text_from_pdf(self, file_path: str) -> OCRResponse:
+        if not PDF_AVAILABLE:
+            return OCRResponse(success=False, text="", method="error", metadata={"error": "PDF processing not available"})
+        try:
+            doc = fitz.open(file_path)
+            pages_text = []
+            total_chars = 0
+            total_pages = doc.page_count
+            for page_num in range(min(total_pages, 10)):
+                page = doc[page_num]
+                text = page.get_text()
+                pages_text.append(text)
+                total_chars += len(text)
+            doc.close()
+            full_text = "\n\n--- Page Break ---\n\n".join(pages_text)
+            return OCRResponse(
+                success=True,
+                text=full_text,
+                method="PyMuPDF",
+                metadata={
+                    "pages_processed": len(pages_text),
+                    "total_pages": total_pages,
+                    "total_characters": total_chars,
+                    "file_size_kb": os.path.getsize(file_path) / 1024
+                }
+            )
+        except Exception as e:
+            logger.error(f"PDF processing error: {e}")
+            return OCRResponse(success=False, text="", method="error", metadata={"error": str(e)})
+    async def extract_text_from_image(self, file_path: str) -> OCRResponse:
+        try:
+            image = Image.open(file_path)
+            if self.model_loaded and self.processor and self.model:
+                pixel_values = self.processor(images=image, return_tensors="pt").pixel_values
+                generated_ids = self.model.generate(pixel_values)
+                generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+                return OCRResponse(
+                    success=True,
+                    text=generated_text,
+                    method="TrOCR",
+                    metadata={
+                        "image_size": image.size,
+                        "image_mode": image.mode,
+                        "model": "microsoft/trocr-base-printed"
+                    }
+                )
+            else:
+                return OCRResponse(
+                    success=True,
+                    text=f"Image processed: {image.size} pixels, {image.mode} mode\nTrOCR model not loaded - text extraction limited",
+                    method="Basic",
+                    metadata={
+                        "image_size": image.size,
+                        "image_mode": image.mode,
+                        "note": "TrOCR model not available"
+                    }
+                )
+        except Exception as e:
+            logger.error(f"Image processing error: {e}")
+            return OCRResponse(success=False, text="", method="error", metadata={"error": str(e)})
+ocr_service = OCRService()
+app = FastAPI(
+    title="Legal Dashboard API",
+    description="Advanced Legal Document Processing System",
+    version="2.0.0",
+    docs_url="/api/docs",
+    redoc_url="/api/redoc"
+)
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# تنظیم مسیر دایرکتوری فرانت‌اند استاتیک
+BASE_DIR = Path(__file__).parent
+frontend_dir = BASE_DIR / "frontend"
+if frontend_dir.exists():
+    logger.info(f"✅ Frontend directory found: {frontend_dir}")
+    app.mount("/static", StaticFiles(directory=frontend_dir), name="static")
+else:
+    logger.warning("⚠️ Frontend directory not found. UI will not load correctly.")
+@app.on_event("startup")
+async def startup_event():
+    if ML_AVAILABLE:
+        try:
+            logger.info("🚀 Loading OCR models on startup...")
+            await ocr_service._load_model_async()
+        except Exception as e:
+            logger.error(f"❌ Failed to load models on startup: {e}")
+@app.get("/", response_class=HTMLResponse)
+async def root():
+    html_file = frontend_dir / "index.html"
+    if html_file.exists():
+        return FileResponse(html_file)
+    return HTMLResponse("""
+        <h1>⚠️ Frontend not found</h1>
+        <p>Please ensure 'frontend/index.html' exists in the project root.</p>
+    """)
+@app.get("/health")
+async def health_check():
+    return {
+        "status": "healthy",
+        "message": "Legal Dashboard is running",
+        "timestamp": datetime.now().isoformat(),
+        "services": {
+            "pdf_processing": PDF_AVAILABLE,
+            "ml_models": ML_AVAILABLE,
+            "ocr_model_loaded": ocr_service.model_loaded
+        }
+    }
+@app.get("/system/status", response_model=SystemStatus)
+async def get_system_status():
+    return SystemStatus(
+        status="healthy",
+        services={
+            "pdf_processing": {
+                "available": PDF_AVAILABLE,
+                "status": "✅ Available" if PDF_AVAILABLE else "❌ Not Available"
+            },
+            "ml_models": {
+                "available": ML_AVAILABLE,
+                "status": "✅ Available" if ML_AVAILABLE else "❌ Not Available"
+            },
+            "ocr_model": {
+                "loaded": ocr_service.model_loaded,
+                "status": "✅ Loaded" if ocr_service.model_loaded else "⏳ Loading..." if ML_AVAILABLE else "❌ Not Available"
+            }
+        },
+        timestamp=datetime.now().isoformat()
+    )
+@app.post("/api/ocr/extract-pdf", response_model=OCRResponse)
+async def extract_pdf_text(file: UploadFile = File(...)):
+    if not file.filename.lower().endswith('.pdf'):
+        raise HTTPException(status_code=400, detail="File must be a PDF")
+    temp_path = None
+    try:
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
+            content = await file.read()
+            temp_file.write(content)
+            temp_path = temp_file.name
+        return await ocr_service.extract_text_from_pdf(temp_path)
+    finally:
+        if temp_path and os.path.exists(temp_path):
+            os.unlink(temp_path)
+@app.post("/api/ocr/extract-image", response_model=OCRResponse)
+async def extract_image_text(file: UploadFile = File(...)):
+    allowed_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff']
+    if not any(file.filename.lower().endswith(ext) for ext in allowed_extensions):
+        raise HTTPException(status_code=400, detail="File must be an image (JPG, PNG, BMP, TIFF)")
+    temp_path = None
+    try:
+        file_extension = Path(file.filename).suffix
+        with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as temp_file:
+            content = await file.read()
+            temp_file.write(content)
+            temp_path = temp_file.name
+        return await ocr_service.extract_text_from_image(temp_path)
+    finally:
+        if temp_path and os.path.exists(temp_path):
+            os.unlink(temp_path)
+@app.get("/api/test")
+async def test_endpoint():
+    return {
+        "message": "API is working!",
+        "pdf_available": PDF_AVAILABLE,
+        "ml_available": ML_AVAILABLE,
+        "ocr_model_loaded": ocr_service.model_loaded,
+        "timestamp": datetime.now().isoformat()
+    }
+@app.exception_handler(Exception)
+async def global_exception_handler(request: Request, exc: Exception):
+    logger.error(f"Global exception: {exc}")
+    logger.error(traceback.format_exc())
+    return JSONResponse(
+        status_code=500,
+        content={
+            "error": "Internal server error",
+            "message": str(exc),
+            "path": str(request.url)
+        }
+    )
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=False, log_level="info")