rudradcruze committed
Commit 1c25c67 · 1 Parent(s): ff10d38

upload toxicity api application
.env_example ADDED
@@ -0,0 +1 @@
+ HF_TOKEN="your_huggingface_token_here"
.gitignore ADDED
@@ -0,0 +1,427 @@
+ # ==============================================================================
+ # TOXICITY PREDICTION API - .GITIGNORE
+ # ==============================================================================
+
+ # Byte-compiled / optimized / DLL files
+ *__pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ Pipfile.lock
+
+ # poetry
+ poetry.lock
+
+ # pdm
+ .pdm.toml
+
+ # PEP 582
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ .idea/
+
+ # ==============================================================================
+ # MACHINE LEARNING & DATA SCIENCE SPECIFIC
+ # ==============================================================================
+
+ # Model files (commented out since we need to deploy them)
+ # *.pt
+ # *.pth
+ # *.pkl
+ # *.joblib
+ # *.h5
+ # *.hdf5
+
+ # Datasets (keep models but ignore large datasets)
+ data/
+ dataset/
+ datasets/
+ *.csv
+ *.tsv
+ *.json
+ *.jsonl
+ *.parquet
+
+ # Large files
+ *.zip
+ *.tar.gz
+ *.rar
+ *.7z
+
+ # Training outputs
+ logs/
+ runs/
+ experiments/
+ outputs/
+ checkpoints/
+ wandb/
+ mlruns/
+
+ # Tensorboard logs
+ events.out.tfevents.*
+
+ # ==============================================================================
+ # HUGGINGFACE & API SPECIFIC
+ # ==============================================================================
+
+ # HuggingFace cache
+ .cache/
+ transformers_cache/
+ huggingface_hub/
+
+ # API keys and tokens (CRITICAL SECURITY)
+ .env
+ .env.local
+ .env.development
+ .env.test
+ .env.production
+ *.token
+ *_token
+ api_keys.txt
+ secrets.txt
+ credentials.json
+ config.json
+
+ # HuggingFace specific
+ hf_token.txt
+ huggingface_token
+ .huggingface_token
+
+ # ==============================================================================
+ # GRADIO SPECIFIC
+ # ==============================================================================
+
+ # Gradio temporary files
+ gradio_cached_examples/
+ flagged/
+ gradio_queue.db
+
+ # ==============================================================================
+ # OPERATING SYSTEM FILES
+ # ==============================================================================
+
+ # macOS
+ .DS_Store
+ .AppleDouble
+ .LSOverride
+ Icon?
+ ._*
+ .DocumentRevisions-V100
+ .fseventsd
+ .Spotlight-V100
+ .TemporaryItems
+ .Trashes
+ .VolumeIcon.icns
+ .com.apple.timemachine.donotpresent
+ .AppleDB
+ .AppleDesktop
+ Network Trash Folder
+ Temporary Items
+ .apdisk
+
+ # Windows
+ Thumbs.db
+ Thumbs.db:encryptable
+ ehthumbs.db
+ ehthumbs_vista.db
+ *.tmp
+ *.temp
+ Desktop.ini
+ $RECYCLE.BIN/
+ *.cab
+ *.msi
+ *.msix
+ *.msm
+ *.msp
+ *.lnk
+
+ # Linux
+ *~
+ .fuse_hidden*
+ .directory
+ .Trash-*
+ .nfs*
+
+ # ==============================================================================
+ # IDE AND EDITOR FILES
+ # ==============================================================================
+
+ # Visual Studio Code
+ .vscode/
+ *.code-workspace
+
+ # JetBrains IDEs
+ .idea/
+ *.iws
+ *.iml
+ *.ipr
+
+ # Sublime Text
+ *.sublime-project
+ *.sublime-workspace
+
+ # Vim
+ *.swp
+ *.swo
+ *~
+ .viminfo
+
+ # Emacs
+ *~
+ \#*\#
+ /.emacs.desktop
+ /.emacs.desktop.lock
+ *.elc
+ auto-save-list
+ tramp
+ .\#*
+
+ # Atom
+ .atom/
+
+ # ==============================================================================
+ # DEVELOPMENT AND TESTING
+ # ==============================================================================
+
+ # Testing
+ .tox/
+ .coverage
+ htmlcov/
+ .pytest_cache/
+ test_results/
+ test_outputs/
+
+ # Local development
+ local/
+ tmp/
+ temp/
+ .tmp/
+ .temp/
+
+ # Backup files
+ *.bak
+ *.backup
+ *.old
+ *_backup
+ *_old
+
+ # ==============================================================================
+ # PROJECT SPECIFIC
+ # ==============================================================================
+
+ # Original dataset folder (if you have it locally)
+ Original/
+ original_dataset/
+
+ # Feature extraction outputs (if regenerating)
+ feature_extraction_outputs/
+ extracted_features/
+
+ # Training artifacts (if retraining)
+ training_logs/
+ model_checkpoints/
+ training_outputs/
+
+ # Test images and results
+ test_images/
+ test_results/
+ prediction_outputs/
+
+ # Documentation builds
+ docs/build/
+ documentation/build/
+
+ # Deployment artifacts (optional)
+ deployment_logs/
+ build_logs/
+
+ # Personal notes and scratch files
+ notes.txt
+ todo.txt
+ scratch.py
+ test.py
+ debug.py
+ playground.py
+
+ # ==============================================================================
+ # SECURITY SENSITIVE FILES (CRITICAL)
+ # ==============================================================================
+
+ # Never commit these files containing sensitive information
+ **/secrets/**
+ **/credentials/**
+ **/*_secret*
+ **/*_key*
+ **/*_password*
+ **/*_token*
+ **/*credentials*
+ private_key*
+ public_key*
+ *.pem
+ *.key
+ *.crt
+ *.cert
+
+ # ==============================================================================
+ # LARGE FILES AND BINARIES
+ # ==============================================================================
+
+ # Large model files (ignored: the API downloads them from the HuggingFace Hub at runtime)
+ models/*.pt
+ models/*.pth
+ model_cache/*.pt
+ model_cache/*.pth
+ models/
+ model_cache/
+ *.bin
+ *.pt
+ *.pkl
+ *.h5
+ *.onnx
+
+ # Videos and large media
+ *.mp4
+ *.avi
+ *.mov
+ *.mkv
+ *.webm
+ *.gif
+
+ # Large images (keep examples small)
+ # *.png
+ # *.jpg
+ # *.jpeg
+ # *.tiff
+ # *.bmp
+
+ models/feature_extractor.pt
+ models/feature_scaler.pt
+ models/multi_head_self_attention_classifier.pt
+ *model_cache
+ venv
+
+ # ==============================================================================
+ # END OF .GITIGNORE
+ # ==============================================================================
Dockerfile ADDED
@@ -0,0 +1,39 @@
+ FROM python:3.10-slim
+
+ # Set environment variables
+ ENV PYTHONUNBUFFERED=1
+ ENV PYTHONDONTWRITEBYTECODE=1
+
+ # Create a non-root user for security
+ RUN useradd -m -u 1000 user
+ USER user
+
+ # Set PATH for user local binaries
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ # Set working directory
+ WORKDIR /app
+
+ # Copy requirements first for better Docker layer caching
+ COPY --chown=user requirements.txt requirements.txt
+
+ # Install Python dependencies
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+ # Create models directory with proper permissions
+ RUN mkdir -p /app/models
+
+ # Copy utils directory (model classes)
+ COPY --chown=user ./utils /app/utils
+
+ # Copy main application
+ COPY --chown=user ./app.py /app/
+
+ # Copy any additional files you might have
+ COPY --chown=user ./*.py /app/
+
+ # Expose port 7860 (required for HuggingFace Spaces)
+ EXPOSE 7860
+
+ # Command to run the application
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
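+
+ # Local sanity check (a sketch: the image tag is illustrative and assumes Docker
+ # is installed and HF_TOKEN holds a valid HuggingFace access token):
+ #   docker build -t toxicity-api .
+ #   docker run -p 7860:7860 -e HF_TOKEN=your_huggingface_token_here toxicity-api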
LICENSE ADDED
@@ -0,0 +1,66 @@
+ MIT License
+
+ Copyright (c) 2025 CAMLAs (Computer Vision and Machine Learning Lab)
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
+ ---
+
+ MEDICAL DISCLAIMER:
+
+ This software is intended for research and educational purposes only.
+ It is NOT intended for clinical diagnosis, medical decision-making, or
+ patient care. The software should NOT be used as a substitute for
+ professional medical advice, diagnosis, or treatment.
+
+ Users of this software acknowledge that:
+
+ 1. The software is experimental and may contain errors or inaccuracies
+ 2. Medical decisions should always be made by qualified healthcare professionals
+ 3. The developers and CAMLAs organization are not responsible for any
+    medical decisions or outcomes resulting from the use of this software
+ 4. Users assume all risks associated with the use of this software
+
+ By using this software, you agree to these terms and acknowledge that you
+ understand the limitations and appropriate use cases for this technology.
+
+ ---
+
+ ATTRIBUTION:
+
+ If you use this software in academic research, please cite:
+
+ CAMLAs Research Team. (2025). Toxicity Prediction API using ProtBERT
+ Embeddings and MHSA-GRU Classifier. HuggingFace Spaces.
+ https://huggingface.co/spaces/camlas/toxicity
+
+ ---
+
+ THIRD-PARTY LICENSES:
+
+ This software uses the following third-party libraries and frameworks:
+
+ - PyTorch: BSD-style license (https://github.com/pytorch/pytorch/blob/master/LICENSE)
+ - timm: Apache License 2.0 (https://github.com/rwightman/pytorch-image-models/blob/master/LICENSE)
+ - scikit-learn: BSD License (https://github.com/scikit-learn/scikit-learn/blob/main/COPYING)
+ - Gradio: Apache License 2.0 (https://github.com/gradio-app/gradio/blob/main/LICENSE)
+ - NumPy: BSD License (https://github.com/numpy/numpy/blob/main/LICENSE.txt)
+ - Pillow: HPND License (https://github.com/python-pillow/Pillow/blob/main/LICENSE)
+
+ All third-party libraries retain their original licenses and copyrights.
README.md CHANGED
@@ -1,11 +1,388 @@
  ---
- title: Toxicity
- emoji: 👀
  colorFrom: green
- colorTo: green
  sdk: docker
  pinned: false
- short_description: Toxicity Prediction API
  ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
  ---
+ title: Toxicity Prediction API
+ description: A FastAPI-based REST API for predicting protein sequence toxicity using ProtBERT embeddings and an MHSA-GRU classifier.
+ short_description: Toxicity Prediction API
+ version: 1.0.0
+ emoji: 🧬
  colorFrom: green
+ colorTo: blue
  sdk: docker
+ app_file: app.py
  pinned: false
+ license: mit
+ tags:
+ - protein-toxicity
+ - protbert
+ - mhsa-gru
+ - pytorch
+ - fastapi
+ ---
+
+ # Toxicity Prediction API
+
+ A FastAPI-based REST API for predicting protein sequence toxicity using ProtBERT embeddings and an MHSA-GRU classifier.
+
+ Developed by the CAMLAs research team - [Francis Rudra D Cruze](https://linkedin.com/in/rudradcruze).
+
+ ## 🚀 Features
+
+ - **ProtBERT Feature Extraction**: Uses a state-of-the-art protein language model
+ - **MHSA-GRU Classification**: Multi-Head Self-Attention with GRU for accurate predictions
+ - **Single & Batch Predictions**: Process one or multiple sequences
+ - **HuggingFace Integration**: Automatic model loading from a private repository
+ - **Production Ready**: Health checks, error handling, and comprehensive logging
+
+ ## 📋 Requirements
+
+ - Python 3.8+
+ - CUDA-capable GPU (optional, but recommended)
+ - HuggingFace account with access to the private repository
+
+ ## 🔧 Installation
+
+ 1. **Clone the repository**
+
+ ```bash
+ git clone https://huggingface.co/spaces/camlas/toxicity
+ cd toxicity
+ ```
+
+ 2. **Create a virtual environment**
+
+ ```bash
+ python -m venv venv
+ source venv/bin/activate  # On Windows: venv\Scripts\activate
+ ```
+
+ 3. **Install dependencies**
+
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ 4. **Create a `.env` file**
+
+ ```bash
+ echo "HF_TOKEN=your_huggingface_token_here" > .env
+ ```
+
+ Get your HuggingFace token from: https://huggingface.co/settings/tokens
+
+ ## 🎯 Usage
+
+ ### Start the API Server
+
+ ```bash
+ python app.py
+ ```
+
+ Or run uvicorn directly:
+
+ ```bash
+ uvicorn app:app --host 0.0.0.0 --port 8000 --reload
+ ```
+
+ The API will be available at `http://localhost:8000`.
+
+ ### Run Tests
+
+ ```bash
+ python test_api.py
+ ```
+
+ ## 📡 API Endpoints
+
+ ### 1. Root Endpoint
+
+ **GET** `/`
+
+ Returns API information and available endpoints.
+
+ ```bash
+ curl http://localhost:8000/
+ ```
+
+ ### 2. Health Check
+
+ **GET** `/health`
+
+ Checks the API status and whether all models are loaded.
+
+ ```bash
+ curl http://localhost:8000/health
+ ```
+
+ **Response:**
+
+ ```json
+ {
+   "status_code": 200,
+   "status": "healthy",
+   "service": "Toxicity Prediction API",
+   "api_version": "1.0.0",
+   "model_version": "MHSA-GRU-Transformer-v1.0",
+   "models_loaded": true,
+   "device": "cuda",
+   "timestamp": "2025-01-21T10:30:00Z"
+ }
+ ```
+
+ ### 3. Single Prediction
+
+ **POST** `/predict`
+
+ Predicts toxicity for a single protein sequence.
+
+ **Request:**
+
+ ```bash
+ curl -X POST http://localhost:8000/predict \
+   -H "Content-Type: application/json" \
+   -d '{"sequence": "MKTAYIAKQRQISFVKSHFSRQLE"}'
+ ```
+
+ **Response:**
+
+ ```json
+ {
+   "status_code": 200,
+   "status": "success",
+   "success": true,
+   "data": {
+     "sequence": "MKTAYIAKQRQISFVKSHFSRQLE",
+     "sequence_length": 24,
+     "prediction": {
+       "predicted_class": "Toxic",
+       "confidence": 0.85,
+       "confidence_level": "high",
+       "toxicity_score": 0.925,
+       "non_toxicity_score": 0.075
+     },
+     "metadata": {
+       "embedding_model": "ProtBERT",
+       "embedding_type": "Bert",
+       "model_version": "MHSA-GRU-Transformer-v1.0",
+       "device": "cuda"
+     }
+   },
+   "timestamp": "2025-01-21T10:30:00Z",
+   "api_version": "1.0.0",
+   "processing_time_ms": 45.2
+ }
+ ```
+
+ ### 4. Batch Prediction
+
+ **POST** `/predict/batch`
+
+ Predicts toxicity for multiple sequences at once.
+
+ **Request (Postman/cURL):**
+
+ ```bash
+ curl -X POST http://localhost:8000/predict/batch \
+   -H "Content-Type: application/json" \
+   -d '{
+     "sequences": [
+       "MLLPATMSDKPDMAEIEKFDKSKLKKTETQEKNPLPSKETIEQEKQAGES",
+       "MFGLPQQEVSEEEKRAHQEQTEKTLKQAAYVAAFLWVSPMIWHLVKKQWK",
+       "MKTAYIAKQRQISFVKSHFSRQLE"
+     ]
+   }'
+ ```
+
+ **Request Body (JSON):**
+
+ ```json
+ {
+   "sequences": [
+     "MLLPATMSDKPDMAEIEKFDKSKLKKTETQEKNPLPSKETIEQEKQAGES",
+     "MFGLPQQEVSEEEKRAHQEQTEKTLKQAAYVAAFLWVSPMIWHLVKKQWK"
+   ]
+ }
+ ```
+
+ **Response:**
+
+ ```json
+ {
+   "status_code": 200,
+   "status": "success",
+   "success": true,
+   "data": {
+     "total_sequences": 2,
+     "results": [
+       {
+         "sequence": "MLLPATMSDKPDMAEIEKFDKSKLKKTETQEKNPLPSKETIEQEKQAGES",
+         "sequence_length": 51,
+         "predicted_class": "Toxic",
+         "toxicity_score": 0.925,
+         "confidence": 0.85
+       },
+       {
+         "sequence": "MFGLPQQEVSEEEKRAHQEQTEKTLKQAAYVAAFLWVSPMIWHLVKKQWK",
+         "sequence_length": 51,
+         "predicted_class": "Non-Toxic",
+         "toxicity_score": 0.125,
+         "confidence": 0.75
+       }
+     ],
+     "metadata": {
+       "embedding_model": "ProtBERT",
+       "embedding_type": "Bert",
+       "model_version": "MHSA-GRU-Transformer-v1.0",
+       "device": "cuda"
+     }
+   },
+   "timestamp": "2025-01-21T10:30:00Z",
+   "api_version": "1.0.0",
+   "processing_time_ms": 125.8
+ }
+ ```
+
+ ## 🐍 Python Usage Examples
+
+ ### Single Prediction
+
+ ```python
+ import requests
+
+ response = requests.post(
+     "http://localhost:8000/predict",
+     json={"sequence": "MKTAYIAKQRQISFVKSHFSRQLE"}
+ )
+
+ result = response.json()
+ print(f"Predicted Class: {result['data']['prediction']['predicted_class']}")
+ print(f"Toxicity Score: {result['data']['prediction']['toxicity_score']:.4f}")
+ print(f"Confidence: {result['data']['prediction']['confidence']:.4f}")
+ ```
+
+ ### Batch Prediction
+
+ ```python
+ import requests
+
+ sequences = [
+     "MKTAYIAKQRQISFVKSHFSRQLE",
+     "ARNDCEQGHILKMFPSTWYV",
+     "MVHLTPEEKS"
+ ]
+
+ response = requests.post(
+     "http://localhost:8000/predict/batch",
+     json={"sequences": sequences}
+ )
+
+ results = response.json()
+ for i, pred in enumerate(results['data']['results'], 1):
+     print(f"Sequence {i}: {pred['predicted_class']} ({pred['toxicity_score']:.4f})")
+ ```
+
+ ## 📁 Project Structure
+
+ ```
+ toxicity-api/
+ ├── app.py              # Main FastAPI application
+ ├── requirements.txt    # Python dependencies
+ ├── test_api.py         # Test suite
+ ├── .env                # Environment variables (create this)
+ ├── models/             # Downloaded models (auto-created)
+ └── README.md           # This file
+ ```
+
+ ## 🔒 HuggingFace Repository Structure
+
+ Your private repository `camlas/toxicity` should contain:
+
+ ```
+ camlas/toxicity/
+ ├── mhsa_gru_classifier.pth    # Trained MHSA-GRU model
+ ├── scaler.pkl                 # Feature scaler
+ ├── config.json                # ProtBERT config
+ ├── model.safetensors          # ProtBERT weights
+ ├── vocab.txt                  # ProtBERT vocabulary
+ ├── tokenizer_config.json      # Tokenizer configuration
+ └── special_tokens_map.json    # Special tokens mapping
+ ```
+
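+ At startup the API fetches these files with `huggingface_hub`; the equivalent manual download of a single file looks like this (a minimal sketch, assuming `HF_TOKEN` grants read access to the repository):
+
+ ```python
+ import os
+ from huggingface_hub import hf_hub_download
+
+ # Downloads into the local HuggingFace cache and returns the file path
+ path = hf_hub_download(
+     repo_id="camlas/toxicity",
+     filename="mhsa_gru_classifier.pth",
+     repo_type="model",
+     token=os.getenv("HF_TOKEN"),
+ )
+ print(path)
+ ```
+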
+ ## 🎨 Model Architecture
+
+ 1. **Feature Extraction**: ProtBERT (1024-dimensional [CLS] embeddings)
+ 2. **Feature Scaling**: StandardScaler
+ 3. **Classification**: MHSA-GRU
+    - Multi-Head Self-Attention (3 layers)
+    - GRU (2 layers)
+    - Fully connected layers with dropout
+
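+ The three stages map directly onto the inference path in `app.py`. A minimal sketch of the flow, assuming the tokenizer, ProtBERT model, scaler, and classifier are already loaded (the helper name `predict_toxicity` is illustrative):
+
+ ```python
+ import torch
+
+ def predict_toxicity(sequence, tokenizer, protbert, scaler, classifier, device):
+     # 1. ProtBERT expects space-separated residues: "MKT..." -> "M K T ..."
+     spaced = " ".join(sequence.upper().strip())
+     inputs = tokenizer(spaced, return_tensors="pt", truncation=True, max_length=512)
+     inputs = {k: v.to(device) for k, v in inputs.items()}
+     with torch.no_grad():
+         # [CLS] token embedding -> shape (1, 1024)
+         features = protbert(**inputs).last_hidden_state[:, 0, :].cpu().numpy()
+
+     # 2. Apply the StandardScaler fitted during training
+     scaled = scaler.transform(features)
+
+     # 3. The MHSA-GRU head outputs a sigmoid toxicity probability
+     with torch.no_grad():
+         probability = classifier(torch.FloatTensor(scaled).to(device)).item()
+     return ("Toxic" if probability > 0.5 else "Non-Toxic", probability)
+ ```
+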
+ ## ⚠️ Error Codes
+
+ - `MISSING_SEQUENCE`: No sequence provided in request
+ - `SEQUENCE_TOO_SHORT`: Sequence length < 10 amino acids
+ - `MODEL_NOT_LOADED`: Models failed to load from HuggingFace
+ - `INTERNAL_ERROR`: Unexpected server error
+
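+ Error payloads are returned under FastAPI's `detail` key, so clients can branch on `error_code`. A short sketch against the `/predict` endpoint:
+
+ ```python
+ import requests
+
+ resp = requests.post("http://localhost:8000/predict", json={"sequence": "MKT"})
+ if resp.status_code != 200:
+     detail = resp.json().get("detail", {})
+     if detail.get("error_code") == "SEQUENCE_TOO_SHORT":
+         print("Sequence must be at least 10 amino acids long")
+     else:
+         print(f"Request failed: {detail.get('error')}")
+ ```
+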
+ ## 📊 Performance
+
+ - Single prediction: ~40-50 ms (GPU)
+ - Batch prediction (10 sequences): ~100-150 ms (GPU)
+ - Model loading time: ~10-15 seconds (first run)
+
+ ## 🐛 Troubleshooting
+
+ ### Models not loading
+
+ 1. Check your HuggingFace token in `.env`
+ 2. Verify you have access to the private repository
+ 3. Check your internet connection
+ 4. Look at the console logs for specific errors
+
+ ### CUDA out of memory
+
+ - Reduce the batch size
+ - Use the CPU instead: set `device = torch.device("cpu")` in `app.py` (see the sketch below)
+ - Process sequences one at a time
+
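+ The CPU fallback is a one-line change where the device is selected (the variable name matches the app code):
+
+ ```python
+ import torch
+
+ # Force CPU inference regardless of CUDA availability
+ device = torch.device("cpu")
+ ```
+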
+ ### Slow predictions
+
+ - Ensure the GPU is being used (check the `/health` endpoint)
+ - The first prediction is always slower (model initialization)
+
+ ## 🌐 Public Usage Guidelines
+
+ - **Free to Use**: No authentication or API keys required.
+ - **Rate Limiting**: Fair usage is expected. Please do not abuse the service.
+ - **Educational Purpose**: Designed for research and educational use.
+ - **Medical Disclaimer**: Not for clinical diagnosis. See the disclaimer below.
+ - **Availability**: Best-effort uptime, not guaranteed 24/7.
+
+ ## ⚠️ Medical Disclaimer
+
+ **IMPORTANT**: This API is designed for **research and educational purposes only**. It should **NOT** be used for clinical diagnosis or medical decision-making. Always consult qualified medical professionals for diagnostic decisions.
+
+ ## 🏢 About CAMLAs
+
+ **CAMLAs** (Centre for Advanced Machine Learning & Applications) is a research organization focused on advancing AI applications in medical imaging and healthcare.
+
+ **Team Members:**
+
+ - **S M Hasan Mahmud** – Principal Investigator & Supervisor
+   _Roles:_ Writing – Original Draft, Writing – Review & Editing, Conceptualization, Supervision, Project Administration
+
+ - **Francis Rudra D Cruze** – Lead Developer & Researcher
+   _Roles:_ Methodology, Software, Formal Analysis, Investigation, Resources, Visualization
+
+ ## 📞 Support & Contact
+
+ - **Issues**: [GitHub Repository Issues](https://github.com/camlas/ovarian-cancer)
+ - **Email**: [email protected]
+ - **Documentation**: This README
+ - **API Status**: Check the `/health` endpoint
+ - **Website Integration**: Perfect for ovarian.francisrudra.com
+
+ ## 📄 License
+
+ This project is licensed under the MIT License - see the LICENSE file for details.
+
  ---
 
+ **CAMLAs** - Centre for Advanced Machine Learning & Applications
+ _Advancing Medical AI Research with a Public FastAPI Service_ 🌐🚀
app-worked-backup-1.py ADDED
@@ -0,0 +1,304 @@
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel
+ import torch
+ import numpy as np
+ import os
+ import time
+ import joblib
+ from pathlib import Path
+ from datetime import datetime, timezone
+ from typing import Optional
+ from contextlib import asynccontextmanager
+ from dotenv import load_dotenv
+ import shutil
+ from huggingface_hub import hf_hub_download
+
+ # Transformers imports specifically for ProtBERT
+ from transformers import BertTokenizer, BertModel
+
+ # Import your custom model structure
+ from utils.model_classes import MHSA_GRU
+
+ load_dotenv()
+
+ # ========================= CONFIGURATION ==========================
+
+ # Repository details (where your trained classifier/scaler live)
+ MODEL_REPO = {
+     "repo_id": "camlas/toxicity",
+     "files": {
+         "classifier": "mhsa_gru_classifier.pth",
+         "scaler": "scaler.pkl"
+     }
+ }
+
+ # Feature extraction config - updated for ProtBERT
+ TRANSFORMER_CONFIG = {
+     "model_name": "Rostlab/prot_bert",
+     "model_type": "ProtBERT",
+     "tokenizer_class": BertTokenizer,
+     "model_class": BertModel
+ }
+
+ CLASSES = ["Non-Toxic", "Toxic"]
+ API_VERSION = "2.0.0-protbert"
+ MODEL_VERSION = "ProtBERT-MHSA-GRU-v1"
+
+ # Global variables to hold loaded models
+ models = {
+     "transformer": None,
+     "tokenizer": None,
+     "classifier": None,
+     "scaler": None
+ }
+
+ # Device selection
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # ========================= HELPER FUNCTIONS =========================
+
+ def ensure_models_directory():
+     models_dir = "models"
+     Path(models_dir).mkdir(exist_ok=True)
+     return models_dir
+
+ def download_model_from_hub(model_key: str) -> Optional[str]:
+     """Download custom trained models (classifier/scaler) from the private HF repo"""
+     try:
+         filename = MODEL_REPO["files"][model_key]
+         repo_id = MODEL_REPO["repo_id"]
+         models_dir = ensure_models_directory()
+         local_path = os.path.join(models_dir, filename)
+
+         # If the file exists locally, use it
+         if os.path.exists(local_path):
+             print(f"✅ Found {model_key} locally: {local_path}")
+             return local_path
+
+         print(f"📥 Downloading {model_key} from {repo_id}...")
+         token = os.getenv("HF_TOKEN")
+
+         if not token:
+             print("⚠️ Warning: HF_TOKEN not found in .env. Private repos will fail.")
+
+         temp_path = hf_hub_download(
+             repo_id=repo_id,
+             filename=filename,
+             repo_type="model",
+             token=token
+         )
+         shutil.copy2(temp_path, local_path)
+         return local_path
+     except Exception as e:
+         print(f"❌ Error downloading {model_key}: {e}")
+         return None
+
+ def load_feature_extractor():
+     """Load the ProtBERT model from HuggingFace"""
+     print(f"🔄 Loading Transformer: {TRANSFORMER_CONFIG['model_name']}...")
+     try:
+         # Load specifically with do_lower_case=False for ProtBERT
+         tokenizer = TRANSFORMER_CONFIG['tokenizer_class'].from_pretrained(
+             TRANSFORMER_CONFIG['model_name'],
+             do_lower_case=False
+         )
+         model = TRANSFORMER_CONFIG['model_class'].from_pretrained(
+             TRANSFORMER_CONFIG['model_name']
+         )
+         model.to(device)
+         model.eval()
+
+         models["tokenizer"] = tokenizer
+         models["transformer"] = model
+         print("✅ ProtBERT Transformer loaded successfully")
+         return True
+     except Exception as e:
+         print(f"❌ Error loading Transformer: {e}")
+         return False
+
+ def load_classifier_and_scaler():
+     """Load the custom MHSA-GRU classifier and scaler"""
+     try:
+         # 1. Load scaler
+         scaler_path = download_model_from_hub("scaler")
+         if scaler_path:
+             models["scaler"] = joblib.load(scaler_path)
+             print("✅ Scaler loaded")
+
+         # 2. Load classifier
+         clf_path = download_model_from_hub("classifier")
+         if clf_path:
+             # ProtBERT output dimension is 1024
+             input_dim = 1024
+
+             print(f"ℹ️ Initializing MHSA_GRU with input_dim={input_dim} (ProtBERT)")
+
+             classifier = MHSA_GRU(
+                 input_dim=input_dim,
+                 hidden_dim=256,  # Matching your training code
+                 num_heads=8,
+                 num_gru_layers=2,
+                 dropout=0.3
+             )
+
+             state_dict = torch.load(clf_path, map_location=device)
+             classifier.load_state_dict(state_dict)
+             classifier.to(device)
+             classifier.eval()
+             models["classifier"] = classifier
+             print("✅ Classifier loaded")
+
+         return models["scaler"] is not None and models["classifier"] is not None
+     except Exception as e:
+         print(f"❌ Error loading custom models: {e}")
+         return False
+
+ def preprocess_sequence(sequence: str):
+     """
+     Preprocess a sequence for ProtBERT.
+     ProtBERT expects spaces between amino acids: 'M K T A Y...'
+     """
+     # Clean and uppercase
+     sequence = sequence.upper().strip().replace('\n', '').replace('\r', '')
+
+     # Add spaces between residues
+     spaced_sequence = " ".join(list(sequence))
+     return spaced_sequence
+
+ def extract_features(sequence: str):
+     """Run a sequence through ProtBERT to get the [CLS] embedding"""
+     tokenizer = models["tokenizer"]
+     model = models["transformer"]
+
+     processed_seq = preprocess_sequence(sequence)
+
+     inputs = tokenizer(
+         [processed_seq],
+         return_tensors="pt",
+         padding=True,
+         truncation=True,
+         max_length=512  # ProtBERT max length
+     )
+     inputs = {k: v.to(device) for k, v in inputs.items()}
+
+     with torch.no_grad():
+         outputs = model(**inputs)
+
+     # Extract the [CLS] token embedding (index 0)
+     # shape: (batch_size, hidden_dim) -> (1, 1024)
+     features = outputs.last_hidden_state[:, 0, :]
+
+     return features.cpu().numpy()
+
+ # ========================= FASTAPI LIFESPAN =========================
+
+ @asynccontextmanager
+ async def lifespan(app: FastAPI):
+     print("🚀 Starting Toxicity Detection API (ProtBERT Edition)...")
+
+     # Check that utils/model_classes.py exists
+     if not os.path.exists("utils/model_classes.py"):
+         print("❌ Error: utils/model_classes.py not found. Please create it.")
+
+     success_tf = load_feature_extractor()
+     success_custom = load_classifier_and_scaler()
+
+     if not (success_tf and success_custom):
+         print("⚠️ Warning: Not all models loaded successfully")
+     yield
+     print("🔄 Shutting down API...")
+
+ app = FastAPI(
+     title="Peptide Toxicity Detection API",
+     description="API using ProtBERT features + MHSA-GRU classifier",
+     version=API_VERSION,
+     lifespan=lifespan
+ )
+
+ # ========================= PYDANTIC MODELS =========================
+
+ class SequenceRequest(BaseModel):
+     sequence: str
+
+ class PredictionResponse(BaseModel):
+     sequence_preview: str
+     is_toxic: bool
+     label: str
+     score: float
+     confidence_level: str
+     model_used: str
+     processing_time_ms: float
+     timestamp: str
+
+ # ========================= ENDPOINTS =========================
+
+ @app.get("/")
+ async def root():
+     return {"message": "Toxicity Detection API is running. Use /predict to analyze sequences."}
+
+ @app.get("/health")
+ async def health_check():
+     loaded = all(v is not None for v in models.values())
+     return {
+         "status": "healthy" if loaded else "degraded",
+         "models_loaded": {k: v is not None for k, v in models.items()},
+         "device": str(device),
+         "model_version": MODEL_VERSION,
+         "feature_extractor": TRANSFORMER_CONFIG["model_name"]
+     }
+
+ @app.post("/predict", response_model=PredictionResponse)
+ async def predict(request: SequenceRequest):
+     start_time = time.time()
+
+     if not all(models.values()):
+         raise HTTPException(status_code=503, detail="Models are not fully initialized.")
+
+     if not request.sequence:
+         raise HTTPException(status_code=400, detail="Empty sequence provided.")
+
+     try:
+         # 1. Extract features (ProtBERT [CLS] token)
+         # This handles the 'M K T' spacing internally
+         raw_features = extract_features(request.sequence)
+
+         # 2. Scale features with the scaler loaded from your repo
+         scaled_features = models["scaler"].transform(raw_features)
+
+         # 3. Predict (MHSA-GRU)
+         features_tensor = torch.FloatTensor(scaled_features).to(device)
+
+         with torch.no_grad():
+             # Get probability (sigmoid output)
+             probability = models["classifier"](features_tensor).item()
+
+         # 4. Interpret results (threshold 0.5)
+         prediction_class = 1 if probability > 0.5 else 0
+         predicted_label = CLASSES[prediction_class]
+
+         # Confidence calculation
+         confidence_score = abs(probability - 0.5) * 2
+         confidence_level = "High" if confidence_score > 0.8 else "Medium" if confidence_score > 0.5 else "Low"
+
+         processing_time = round((time.time() - start_time) * 1000, 2)
+
+         return PredictionResponse(
+             sequence_preview=request.sequence[:20] + "..." if len(request.sequence) > 20 else request.sequence,
+             is_toxic=(prediction_class == 1),
+             label=predicted_label,
+             score=probability,
+             confidence_level=confidence_level,
+             model_used="ProtBERT + MHSA-GRU",
+             processing_time_ms=processing_time,
+             timestamp=datetime.now(timezone.utc).isoformat()
+         )
+
+     except Exception as e:
+         print(f"Error during prediction: {e}")
+         raise HTTPException(status_code=500, detail=f"Prediction failed: {str(e)}")
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=8000)
app-worked-backup-2.py ADDED
@@ -0,0 +1,702 @@
1
+ from fastapi import FastAPI, HTTPException
2
+ from pydantic import BaseModel
3
+ import torch
4
+ import torch.nn as nn
5
+ import numpy as np
6
+ from typing import Optional, List
7
+ import time
8
+ from datetime import datetime, timezone
9
+ import os
10
+ import warnings
11
+ from huggingface_hub import hf_hub_download
12
+ from contextlib import asynccontextmanager
13
+ import uvicorn
14
+ from dotenv import load_dotenv
15
+ import shutil
16
+ import joblib
17
+ from pathlib import Path
18
+ from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel, DistilBertTokenizer, DistilBertModel
19
+
20
+ load_dotenv()
21
+ warnings.filterwarnings('ignore')
22
+
23
+ # ========================= MODEL CLASSES =========================
24
+ class MultiHeadSelfAttention(nn.Module):
25
+ """Multi-Head Self-Attention mechanism"""
26
+ def __init__(self, embed_dim, num_heads, dropout=0.3):
27
+ super(MultiHeadSelfAttention, self).__init__()
28
+ self.attention = nn.MultiheadAttention(
29
+ embed_dim=embed_dim,
30
+ num_heads=num_heads,
31
+ dropout=dropout,
32
+ batch_first=True
33
+ )
34
+ self.layer_norm = nn.LayerNorm(embed_dim)
35
+ self.dropout = nn.Dropout(dropout)
36
+
37
+ def forward(self, x):
38
+ attn_output, _ = self.attention(x, x, x)
39
+ x = self.layer_norm(x + self.dropout(attn_output))
40
+ return x
41
+
42
+
43
+ class MHSA_GRU(nn.Module):
44
+ """Multi-Head Self-Attention with GRU model"""
45
+ def __init__(self, input_dim, hidden_dim=256, num_heads=8, num_gru_layers=2, dropout=0.3):
46
+ super(MHSA_GRU, self).__init__()
47
+
48
+ self.input_dim = input_dim
49
+ self.hidden_dim = hidden_dim
50
+
51
+ self.input_projection = nn.Linear(input_dim, hidden_dim)
52
+ self.mhsa1 = MultiHeadSelfAttention(hidden_dim, num_heads, dropout)
53
+ self.mhsa2 = MultiHeadSelfAttention(hidden_dim, num_heads, dropout)
54
+
55
+ self.gru = nn.GRU(
56
+ input_size=hidden_dim,
57
+ hidden_size=hidden_dim,
58
+ num_layers=num_gru_layers,
59
+ batch_first=True,
60
+ dropout=dropout if num_gru_layers > 1 else 0,
61
+ bidirectional=False
62
+ )
63
+
64
+ self.mhsa3 = MultiHeadSelfAttention(hidden_dim, num_heads, dropout)
65
+ self.dropout = nn.Dropout(dropout)
66
+
67
+ self.fc1 = nn.Linear(hidden_dim, hidden_dim // 2)
68
+ self.fc2 = nn.Linear(hidden_dim // 2, hidden_dim // 4)
69
+ self.fc3 = nn.Linear(hidden_dim // 4, 1)
70
+
71
+ self.bn1 = nn.BatchNorm1d(hidden_dim // 2)
72
+ self.bn2 = nn.BatchNorm1d(hidden_dim // 4)
73
+
74
+ def forward(self, x):
75
+ batch_size = x.size(0)
76
+ x = self.input_projection(x)
77
+ x = x.unsqueeze(1)
78
+
79
+ x = self.mhsa1(x)
80
+ x = self.mhsa2(x)
81
+ gru_out, hidden = self.gru(x)
82
+ x = self.mhsa3(gru_out)
83
+ x = x[:, -1, :]
84
+
85
+ x = self.dropout(x)
86
+ x = torch.relu(self.bn1(self.fc1(x)))
87
+ x = self.dropout(x)
88
+ x = torch.relu(self.bn2(self.fc2(x)))
89
+ x = self.dropout(x)
90
+ x = self.fc3(x)
91
+
92
+ return torch.sigmoid(x)
93
+
94
+
95
+ # ========================= CONFIGURATION =========================
96
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
97
+
98
+ API_VERSION = "1.0.0"
99
+ MODEL_VERSION = "MHSA-GRU-Transformer-v1.0"
100
+
101
+ # Model repository configuration
102
+ MODEL_REPO = {
103
+ "repo_id": "camlas/toxicity",
104
+ "files": {
105
+ "classifier": "mhsa_gru_classifier.pth",
106
+ "scaler": "scaler.pkl",
107
+ "config": "config.json",
108
+ "model_weights": "model.safetensors",
109
+ "vocab": "vocab.txt",
110
+ "tokenizer_config": "tokenizer_config.json",
111
+ "special_tokens_map": "special_tokens_map.json"
112
+ }
113
+ }
114
+
115
+ # Global model variables
116
+ classifier = None
117
+ scaler = None
118
+ transformer_model = None
119
+ transformer_tokenizer = None
120
+ EMBEDDING_TYPE = "Bert"
121
+ MODEL_NAME = "ProtBERT"
122
+
123
+
124
+ # ========================= PYDANTIC MODELS =========================
125
+ class SequenceRequest(BaseModel):
126
+ sequence: str
127
+
128
+
129
+ class BatchSequenceRequest(BaseModel):
130
+ sequences: List[str]
131
+
132
+
133
+ class PredictionResponse(BaseModel):
134
+ status_code: int
135
+ status: str
136
+ success: bool
137
+ data: Optional[dict] = None
138
+ error: Optional[str] = None
139
+ error_code: Optional[str] = None
140
+ timestamp: str
141
+ api_version: str
142
+ processing_time_ms: float
143
+
144
+
145
+ class HealthResponse(BaseModel):
146
+ status_code: int
147
+ status: str
148
+ service: str
149
+ api_version: str
150
+ model_version: str
151
+ models_loaded: bool
152
+ models_loaded_count: int
153
+ total_models_required: int
154
+ model_sources: dict
155
+ repository_info: dict
156
+ device: str
157
+ timestamp: str
158
+
159
+
160
+ # ========================= HELPER FUNCTIONS =========================
161
+ def create_kmers(sequence, k=6):
162
+ """Convert DNA sequence to k-mer tokens (for DNABERT)"""
163
+ kmers = []
164
+ for i in range(len(sequence) - k + 1):
165
+ kmer = sequence[i:i+k]
166
+ kmers.append(kmer)
167
+ return ' '.join(kmers)
168
+
169
+
170
+ def ensure_models_directory():
171
+ models_dir = "models"
172
+ if not os.path.exists(models_dir):
173
+ os.makedirs(models_dir)
174
+ print(f"✅ Created {models_dir} directory")
175
+ return models_dir
176
+
177
+
178
+ def download_model_from_hub(model_name: str) -> Optional[str]:
179
+ """Download individual model files from HuggingFace Hub"""
180
+ try:
181
+ if model_name not in MODEL_REPO["files"]:
182
+ raise ValueError(f"Unknown model: {model_name}")
183
+
184
+ filename = MODEL_REPO["files"][model_name]
185
+ repo_id = MODEL_REPO["repo_id"]
186
+ models_dir = ensure_models_directory()
187
+ local_path = os.path.join(models_dir, filename)
188
+
189
+ if os.path.exists(local_path):
190
+ print(f"✅ Found {model_name} in local models directory: {local_path}")
191
+ return local_path
192
+
193
+ print(f"📥 Downloading {model_name} ({filename}) from {repo_id}...")
194
+ token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
195
+
196
+ if not token:
197
+ print("⚠️ Warning: No HF token found. This may fail for private repositories.")
198
+
199
+ temp_model_path = hf_hub_download(
200
+ repo_id=repo_id,
201
+ filename=filename,
202
+ repo_type="model",
203
+ token=token
204
+ )
205
+
206
+ shutil.copy2(temp_model_path, local_path)
207
+ print(f"✅ {model_name} downloaded and stored!")
208
+ return local_path
209
+
210
+ except Exception as e:
211
+ print(f"❌ Error downloading {model_name}: {e}")
212
+ return None
213
+
214
+
215
+ def extract_features_from_sequence(sequence: str):
216
+ """Extract features from sequence using ProtBERT"""
217
+ global transformer_model, transformer_tokenizer
218
+
219
+ if transformer_model is None or transformer_tokenizer is None:
220
+ raise ValueError("ProtBERT model not loaded")
221
+
222
+ # ProtBERT expects sequences with spaces between amino acids
223
+ # Convert "MKTAYIAKQR" to "M K T A Y I A K Q R"
224
+ processed_seq = ' '.join(list(sequence.upper()))
225
+
226
+ # Tokenize
227
+ inputs = transformer_tokenizer(
228
+ processed_seq,
229
+ return_tensors="pt",
230
+ padding=True,
231
+ truncation=True,
232
+ max_length=512
233
+ )
234
+ inputs = {k: v.to(device) for k, v in inputs.items()}
235
+
236
+ # Extract features
237
+ with torch.no_grad():
238
+ outputs = transformer_model(**inputs)
239
+ # Use [CLS] token embedding
240
+ cls_embeddings = outputs.last_hidden_state[:, 0, :]
241
+
242
+ return cls_embeddings.cpu().numpy()
243
+
244
+
245
+ def load_all_models():
246
+ """Load all models from HuggingFace Hub"""
247
+ global classifier, scaler, transformer_model, transformer_tokenizer
248
+
249
+ models_dir = ensure_models_directory()
250
+ models_loaded = {
251
+ "classifier": False,
252
+ "scaler": False,
253
+ "transformer_model": False,
254
+ "transformer_tokenizer": False
255
+ }
256
+
257
+ print(f"🚀 Loading models from {MODEL_REPO['repo_id']}...")
258
+ print("=" * 60)
259
+
260
+ try:
261
+ # Download all necessary files
262
+ print("📥 Downloading ProtBERT model files...")
263
+
264
+ files_to_download = ["config", "model_weights", "vocab",
265
+ "tokenizer_config", "special_tokens_map"]
266
+
267
+ for file_key in files_to_download:
268
+ download_model_from_hub(file_key)
269
+
270
+ # Load ProtBERT Tokenizer
271
+ print("🔄 Loading ProtBERT tokenizer...")
272
+ try:
273
+ transformer_tokenizer = BertTokenizer.from_pretrained(
274
+ models_dir,
275
+ do_lower_case=False,
276
+ local_files_only=True
277
+ )
278
+ models_loaded["transformer_tokenizer"] = True
279
+ print("✅ ProtBERT tokenizer loaded!")
280
+ except Exception as e:
281
+ print(f"❌ Error loading tokenizer: {e}")
282
+ # Try loading from HuggingFace directly
283
+ print("🔄 Trying to load tokenizer directly from HuggingFace...")
284
+ token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
285
+ transformer_tokenizer = BertTokenizer.from_pretrained(
286
+ MODEL_REPO["repo_id"],
287
+ do_lower_case=False,
288
+ token=token
289
+ )
290
+ models_loaded["transformer_tokenizer"] = True
291
+ print("✅ ProtBERT tokenizer loaded from HuggingFace!")
292
+
293
+ # Load ProtBERT Model
294
+ print("🔄 Loading ProtBERT model...")
295
+ try:
296
+ transformer_model = BertModel.from_pretrained(
297
+ models_dir,
298
+ local_files_only=True
299
+ )
300
+ models_loaded["transformer_model"] = True
301
+ print("✅ ProtBERT model loaded!")
302
+ except Exception as e:
303
+ print(f"❌ Error loading model: {e}")
304
+ # Try loading from HuggingFace directly
305
+ print("🔄 Trying to load model directly from HuggingFace...")
306
+ token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
307
+ transformer_model = BertModel.from_pretrained(
308
+ MODEL_REPO["repo_id"],
309
+ token=token
310
+ )
311
+ models_loaded["transformer_model"] = True
312
+ print("✅ ProtBERT model loaded from HuggingFace!")
313
+
314
+ transformer_model.to(device)
315
+ transformer_model.eval()
316
+
317
+ # Load Classifier
318
+ print("🔄 Loading classifier (MHSA-GRU)...")
319
+ clf_path = os.path.join(models_dir, MODEL_REPO["files"]["classifier"])
320
+
321
+ if not os.path.exists(clf_path):
322
+ print("📥 Classifier not found locally, downloading...")
323
+ clf_path = download_model_from_hub("classifier")
324
+
325
+ if clf_path and os.path.exists(clf_path):
326
+ checkpoint = torch.load(clf_path, map_location=device, weights_only=False)
327
+
328
+ # Handle different checkpoint formats
329
+ if 'input_dim' in checkpoint:
330
+ input_dim = checkpoint['input_dim']
331
+ else:
332
+ # ProtBERT embedding size is 1024
333
+ input_dim = 1024
334
+
335
+ classifier = MHSA_GRU(input_dim, hidden_dim=256)
336
+
337
+ # Load state dict
338
+ if 'model_state_dict' in checkpoint:
339
+ classifier.load_state_dict(checkpoint['model_state_dict'])
340
+ else:
341
+ classifier.load_state_dict(checkpoint)
342
+
343
+ classifier.to(device)
344
+ classifier.eval()
345
+ models_loaded["classifier"] = True
346
+ print(f"✅ Classifier loaded! (input_dim: {input_dim})")
347
+
348
+ # Load Scaler
349
+ print("🔄 Loading feature scaler...")
350
+ scaler_path = os.path.join(models_dir, MODEL_REPO["files"]["scaler"])
351
+
352
+ if not os.path.exists(scaler_path):
353
+ print("📥 Scaler not found locally, downloading...")
354
+ scaler_path = download_model_from_hub("scaler")
355
+
356
+ if scaler_path and os.path.exists(scaler_path):
357
+ scaler = joblib.load(scaler_path)
358
+ models_loaded["scaler"] = True
359
+ print("✅ Scaler loaded!")
360
+
361
+ loaded_count = sum(models_loaded.values())
362
+ total_count = len(models_loaded)
363
+
364
+ print(f"\n📊 Model Loading Summary:")
365
+ print(f" • Successfully loaded: {loaded_count}/{total_count}")
366
+ print(f" • Repository: {MODEL_REPO['repo_id']}")
367
+ print(f" • Embedding Model: {MODEL_NAME}")
368
+ print(f" • Device: {device}")
369
+
370
+ critical_models = ["classifier", "scaler", "transformer_model", "transformer_tokenizer"]
371
+ critical_loaded = all(models_loaded[m] for m in critical_models)
372
+
373
+ if critical_loaded:
374
+ print("🎉 All critical models loaded successfully!")
375
+ return True
376
+ else:
377
+ print("⚠️ Some critical models failed to load")
378
+ print(f" Models status: {models_loaded}")
379
+ return False
380
+
381
+ except Exception as e:
382
+ print(f"❌ Error loading models: {e}")
383
+ import traceback
384
+ traceback.print_exc()
385
+ return False
386
+
387
+
388
+ # ========================= FASTAPI APPLICATION =========================
389
+ @asynccontextmanager
390
+ async def lifespan(app: FastAPI):
391
+ # Startup
392
+ print("🚀 Starting Toxicity Prediction API...")
393
+ success = load_all_models()
394
+ if not success:
395
+ print("⚠️ Warning: Not all models loaded successfully")
396
+ yield
397
+ # Shutdown
398
+ print("🔄 Shutting down API...")
399
+
400
+
401
+ app = FastAPI(
402
+ title="Toxicity Prediction API",
403
+ description="API for toxicity prediction using MHSA-GRU with Transformer embeddings",
404
+ version="1.0.0",
405
+ lifespan=lifespan
406
+ )
407
+
408
+
409
+ @app.get("/")
410
+ async def root():
411
+ return {
412
+ "message": "Toxicity Prediction API",
413
+ "version": API_VERSION,
414
+ "endpoints": {
415
+ "/predict": "POST - Predict toxicity for a single sequence",
416
+ "/predict/batch": "POST - Predict toxicity for multiple sequences",
417
+ "/health": "GET - Check API health and model status"
418
+ }
419
+ }
420
+
421
+
422
+ @app.post("/predict", response_model=PredictionResponse)
423
+ async def predict(request: SequenceRequest):
424
+ start_time = time.time()
425
+ timestamp = datetime.now(timezone.utc).isoformat()
426
+
427
+ try:
428
+ if not request.sequence or len(request.sequence) == 0:
429
+ raise HTTPException(
430
+ status_code=400,
431
+ detail={
432
+ "status_code": 400,
433
+ "status": "error",
434
+ "success": False,
435
+ "error": "No sequence provided",
436
+ "error_code": "MISSING_SEQUENCE",
437
+ "timestamp": timestamp,
438
+ "api_version": API_VERSION,
439
+ "processing_time_ms": round((time.time() - start_time) * 1000, 2)
440
+ }
441
+ )
442
+
443
+ # Check if models are loaded
444
+ if classifier is None or scaler is None or transformer_model is None:
445
+ raise HTTPException(
446
+ status_code=503,
447
+ detail={
448
+ "status_code": 503,
449
+ "status": "error",
450
+ "success": False,
451
+ "error": "Models not loaded properly",
452
+ "error_code": "MODEL_NOT_LOADED",
453
+ "timestamp": timestamp,
454
+ "api_version": API_VERSION,
455
+ "processing_time_ms": round((time.time() - start_time) * 1000, 2)
456
+ }
457
+ )
458
+
459
+ # Validate sequence
460
+ sequence = request.sequence.upper().strip()
461
+ if len(sequence) < 10:
462
+ raise HTTPException(
463
+ status_code=400,
464
+ detail={
465
+ "status_code": 400,
466
+ "status": "error",
467
+ "success": False,
468
+ "error": "Sequence too short (minimum 10 characters)",
469
+ "error_code": "SEQUENCE_TOO_SHORT",
470
+ "timestamp": timestamp,
471
+ "api_version": API_VERSION,
472
+ "processing_time_ms": round((time.time() - start_time) * 1000, 2)
473
+ }
474
+ )
475
+
476
+ # Step 1: Extract features using ProtBERT
477
+ features = extract_features_from_sequence(sequence)
478
+
479
+ # Step 2: Scale features
480
+ scaled_features = scaler.transform(features)
481
+
482
+ # Step 3: Predict using MHSA-GRU
483
+ features_tensor = torch.FloatTensor(scaled_features).to(device)
484
+
485
+ with torch.no_grad():
486
+ probability = classifier(features_tensor).cpu().numpy()[0, 0]
487
+
488
+ # Determine prediction
489
+ prediction_class = 1 if probability > 0.5 else 0
490
+ predicted_label = "Toxic" if prediction_class == 1 else "Non-Toxic"
491
+ confidence = float(abs(probability - 0.5) * 2)
492
+
493
+ # Determine confidence level
494
+ if confidence > 0.8:
495
+ confidence_level = "high"
496
+ elif confidence > 0.6:
497
+ confidence_level = "medium"
498
+ else:
499
+ confidence_level = "low"
500
+
501
+ processing_time = round((time.time() - start_time) * 1000, 2)
502
+
503
+ return PredictionResponse(
504
+ status_code=200,
505
+ status="success",
506
+ success=True,
507
+ data={
508
+ "sequence": sequence[:100] + "..." if len(sequence) > 100 else sequence,
509
+ "sequence_length": len(sequence),
510
+ "prediction": {
511
+ "predicted_class": predicted_label,
512
+ "confidence": confidence,
513
+ "confidence_level": confidence_level,
514
+ "toxicity_score": float(probability),
515
+ "non_toxicity_score": float(1 - probability)
516
+ },
517
+ "metadata": {
518
+ "embedding_model": MODEL_NAME,
519
+ "embedding_type": EMBEDDING_TYPE,
520
+ "model_version": MODEL_VERSION,
521
+ "device": str(device)
522
+ }
523
+ },
524
+ timestamp=timestamp,
525
+ api_version=API_VERSION,
526
+ processing_time_ms=processing_time
527
+ )
528
+
529
+ except HTTPException:
530
+ raise
531
+ except Exception as e:
532
+ processing_time = round((time.time() - start_time) * 1000, 2)
533
+ raise HTTPException(
534
+ status_code=500,
535
+ detail={
536
+ "status_code": 500,
537
+ "status": "error",
538
+ "success": False,
539
+ "error": f"Internal server error: {str(e)}",
540
+ "error_code": "INTERNAL_ERROR",
541
+ "timestamp": timestamp,
542
+ "api_version": API_VERSION,
543
+ "processing_time_ms": processing_time
544
+ }
545
+ )
546
+
547
+
548
+ @app.post("/predict/batch", response_model=PredictionResponse)
549
+ async def predict_batch(request: BatchSequenceRequest):
550
+ start_time = time.time()
551
+ timestamp = datetime.now(timezone.utc).isoformat()
552
+
553
+ try:
554
+ if not request.sequences or len(request.sequences) == 0:
555
+ raise HTTPException(
556
+ status_code=400,
557
+ detail={
558
+ "status_code": 400,
559
+ "status": "error",
560
+ "success": False,
561
+ "error": "No sequences provided",
562
+ "error_code": "MISSING_SEQUENCES",
563
+ "timestamp": timestamp,
564
+ "api_version": API_VERSION,
565
+ "processing_time_ms": round((time.time() - start_time) * 1000, 2)
566
+ }
567
+ )
568
+
569
+ # Check if models are loaded
570
+ if classifier is None or scaler is None or transformer_model is None:
571
+ raise HTTPException(
572
+ status_code=503,
573
+ detail={
574
+ "status_code": 503,
575
+ "status": "error",
576
+ "success": False,
577
+ "error": "Models not loaded properly",
578
+ "error_code": "MODEL_NOT_LOADED",
579
+ "timestamp": timestamp,
580
+ "api_version": API_VERSION,
581
+ "processing_time_ms": round((time.time() - start_time) * 1000, 2)
582
+ }
583
+ )
584
+
585
+ results = []
586
+
587
+ for seq in request.sequences:
588
+ sequence = seq.upper().strip()
589
+
590
+ # Extract features using ProtBERT
591
+ features = extract_features_from_sequence(sequence)
592
+ scaled_features = scaler.transform(features)
593
+ features_tensor = torch.FloatTensor(scaled_features).to(device)
594
+
595
+ with torch.no_grad():
596
+ probability = classifier(features_tensor).cpu().numpy()[0, 0]
597
+
598
+ prediction_class = 1 if probability > 0.5 else 0
599
+ predicted_label = "Toxic" if prediction_class == 1 else "Non-Toxic"
600
+ confidence = float(abs(probability - 0.5) * 2)
601
+
602
+ results.append({
603
+ "sequence": sequence[:100] + "..." if len(sequence) > 100 else sequence,
604
+ "sequence_length": len(sequence),
605
+ "predicted_class": predicted_label,
606
+ "toxicity_score": float(probability),
607
+ "confidence": confidence
608
+ })
609
+
610
+ processing_time = round((time.time() - start_time) * 1000, 2)
611
+
612
+ return PredictionResponse(
613
+ status_code=200,
614
+ status="success",
615
+ success=True,
616
+ data={
617
+ "total_sequences": len(request.sequences),
618
+ "results": results,
619
+ "metadata": {
620
+ "embedding_model": MODEL_NAME,
621
+ "embedding_type": EMBEDDING_TYPE,
622
+ "model_version": MODEL_VERSION,
623
+ "device": str(device)
624
+ }
625
+ },
626
+ timestamp=timestamp,
627
+ api_version=API_VERSION,
628
+ processing_time_ms=processing_time
629
+ )
630
+
631
+ except HTTPException:
632
+ raise
633
+ except Exception as e:
634
+ processing_time = round((time.time() - start_time) * 1000, 2)
635
+ raise HTTPException(
636
+ status_code=500,
637
+ detail={
638
+ "status_code": 500,
639
+ "status": "error",
640
+ "success": False,
641
+ "error": f"Internal server error: {str(e)}",
642
+ "error_code": "INTERNAL_ERROR",
643
+ "timestamp": timestamp,
644
+ "api_version": API_VERSION,
645
+ "processing_time_ms": processing_time
646
+ }
647
+ )
648
+
649
+
650
+ @app.get("/health", response_model=HealthResponse)
651
+ async def health_check():
652
+ models_loaded = all([
653
+ classifier is not None,
654
+ scaler is not None,
655
+ transformer_model is not None,
656
+ transformer_tokenizer is not None
657
+ ])
658
+
659
+ model_sources = {
660
+ "classifier": {
661
+ "loaded": classifier is not None,
662
+ "source": "huggingface_hub",
663
+ "repository": MODEL_REPO["repo_id"]
664
+ },
665
+ "scaler": {
666
+ "loaded": scaler is not None,
667
+ "source": "huggingface_hub",
668
+ "repository": MODEL_REPO["repo_id"]
669
+ },
670
+ "transformer_model": {
671
+ "loaded": transformer_model is not None,
672
+ "model_name": MODEL_NAME,
673
+ "source": "huggingface_hub",
674
+ "repository": MODEL_REPO["repo_id"]
675
+ }
676
+ }
677
+
678
+ repository_info = {
679
+ "repository_id": MODEL_REPO["repo_id"],
680
+ "embedding_type": EMBEDDING_TYPE,
681
+ "model_name": MODEL_NAME,
682
+ "total_models": len(MODEL_REPO["files"])
683
+ }
684
+
685
+ return HealthResponse(
686
+ status_code=200 if models_loaded else 503,
687
+ status="healthy" if models_loaded else "unhealthy",
688
+ service="Toxicity Prediction API",
689
+ api_version=API_VERSION,
690
+ model_version=MODEL_VERSION,
691
+ models_loaded=models_loaded,
692
+ models_loaded_count=sum(1 for m in (classifier, scaler, transformer_model, transformer_tokenizer) if m is not None),  # include the tokenizer so the count can reach total_models_required=4
693
+ total_models_required=4,
694
+ model_sources=model_sources,
695
+ repository_info=repository_info,
696
+ device=str(device),
697
+ timestamp=datetime.now(timezone.utc).isoformat()
698
+ )
699
+
700
+
701
+ if __name__ == "__main__":
702
+ uvicorn.run(app, host="0.0.0.0", port=8000)
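For reference, a minimal client sketch for the endpoints above — assuming the service is running locally on the host/port from the uvicorn.run call (http://localhost:8000); the request body and response shape follow the handlers shown above, and requests is already listed in requirements.txt:

import requests

# Single-sequence prediction against /predict.
resp = requests.post(
    "http://localhost:8000/predict",
    json={"sequence": "MKTAYIAKQRQISFVKSHFSRQLE"},
    timeout=60,
)
pred = resp.json()["data"]["prediction"]
print(pred["predicted_class"], pred["toxicity_score"], pred["confidence_level"])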
app.py ADDED
@@ -0,0 +1,813 @@
1
+ from fastapi import FastAPI, HTTPException
2
+ from pydantic import BaseModel
3
+ import torch
4
+ import torch.nn as nn
5
+ import numpy as np
6
+ from typing import Optional, List
7
+ import time
8
+ from datetime import datetime, timezone
9
+ import os
10
+ import warnings
11
+ from huggingface_hub import hf_hub_download
12
+ from contextlib import asynccontextmanager
13
+ import uvicorn
14
+ from dotenv import load_dotenv
15
+ import shutil
16
+ import joblib
17
+ from pathlib import Path
18
+ from transformers import BertTokenizer, BertModel
19
+ from utils.model_classes import MHSA_GRU, MultiHeadSelfAttention
20
+
21
+ load_dotenv()
22
+ warnings.filterwarnings('ignore')
23
+
24
+ # ========================= CONFIGURATION =========================
25
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
26
+
27
+ API_VERSION = "1.0.0"
28
+ MODEL_VERSION = "MHSA-GRU-Transformer-v1.0"
29
+
30
+ # Model repository configuration
31
+ MODEL_REPO = {
32
+ "repo_id": "camlas/toxicity",
33
+ "files": {
34
+ "classifier": "mhsa_gru_classifier.pth",
35
+ "scaler": "scaler.pkl",
36
+ "config": "config.json",
37
+ "model_weights": "model.safetensors",
38
+ "vocab": "vocab.txt",
39
+ "tokenizer_config": "tokenizer_config.json",
40
+ "special_tokens_map": "special_tokens_map.json"
41
+ }
42
+ }
43
+
44
+ # Global model variables
45
+ classifier = None
46
+ scaler = None
47
+ transformer_model = None
48
+ transformer_tokenizer = None
49
+ EMBEDDING_TYPE = "Bert"
50
+ MODEL_NAME = "ProtBERT"
51
+
52
+
53
+ # ========================= PYDANTIC MODELS =========================
54
+ class SequenceRequest(BaseModel):
55
+ sequence: str
56
+
57
+
58
+ class BatchSequenceRequest(BaseModel):
59
+ sequences: List[str]
60
+
61
+
62
+ class PredictionResponse(BaseModel):
63
+ status_code: int
64
+ status: str
65
+ success: bool
66
+ data: Optional[dict] = None
67
+ error: Optional[str] = None
68
+ error_code: Optional[str] = None
69
+ timestamp: str
70
+ api_version: str
71
+ processing_time_ms: float
72
+
73
+
74
+ class HealthResponse(BaseModel):
75
+ status_code: int
76
+ status: str
77
+ service: str
78
+ api_version: str
79
+ model_version: str
80
+ models_loaded: bool
81
+ models_loaded_count: int
82
+ total_models_required: int
83
+ model_sources: dict
84
+ repository_info: dict
85
+ device: str
86
+ timestamp: str
87
+
88
+
89
+ # ========================= HELPER FUNCTIONS =========================
90
+ def create_kmers(sequence, k=6):
91
+ """Convert a DNA sequence to overlapping k-mer tokens (DNABERT-style input; unused by the ProtBERT protein pipeline below)"""
92
+ kmers = []
93
+ for i in range(len(sequence) - k + 1):
94
+ kmer = sequence[i:i+k]
95
+ kmers.append(kmer)
96
+ return ' '.join(kmers)
97
+
98
+
99
+ def ensure_models_directory():
100
+ models_dir = "models"
101
+ if not os.path.exists(models_dir):
102
+ os.makedirs(models_dir)
103
+ print(f"✅ Created {models_dir} directory")
104
+ return models_dir
105
+
106
+
107
+ def download_model_from_hub(model_name: str) -> Optional[str]:
108
+ """Download individual model files from HuggingFace Hub"""
109
+ try:
110
+ if model_name not in MODEL_REPO["files"]:
111
+ raise ValueError(f"Unknown model: {model_name}")
112
+
113
+ filename = MODEL_REPO["files"][model_name]
114
+ repo_id = MODEL_REPO["repo_id"]
115
+ models_dir = ensure_models_directory()
116
+ local_path = os.path.join(models_dir, filename)
117
+
118
+ if os.path.exists(local_path):
119
+ print(f"✅ Found {model_name} in local models directory: {local_path}")
120
+ return local_path
121
+
122
+ print(f"📥 Downloading {model_name} ({filename}) from {repo_id}...")
123
+ token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
124
+
125
+ if not token:
126
+ print("⚠️ Warning: No HF token found. This may fail for private repositories.")
127
+
128
+ temp_model_path = hf_hub_download(
129
+ repo_id=repo_id,
130
+ filename=filename,
131
+ repo_type="model",
132
+ token=token
133
+ )
134
+
135
+ shutil.copy2(temp_model_path, local_path)
136
+ print(f"✅ {model_name} downloaded and stored!")
137
+ return local_path
138
+
139
+ except Exception as e:
140
+ print(f"❌ Error downloading {model_name}: {e}")
141
+ return None
142
+
143
+
144
+ def extract_features_from_sequence(sequence: str):
145
+ """Extract features from sequence using ProtBERT"""
146
+ global transformer_model, transformer_tokenizer
147
+
148
+ if transformer_model is None or transformer_tokenizer is None:
149
+ raise ValueError("ProtBERT model not loaded")
150
+
151
+ # ProtBERT expects sequences with spaces between amino acids
152
+ # Convert "MKTAYIAKQR" to "M K T A Y I A K Q R"
153
+ processed_seq = ' '.join(list(sequence.upper()))
154
+
155
+ # Tokenize
156
+ inputs = transformer_tokenizer(
157
+ processed_seq,
158
+ return_tensors="pt",
159
+ padding=True,
160
+ truncation=True,
161
+ max_length=512
162
+ )
163
+ inputs = {k: v.to(device) for k, v in inputs.items()}
164
+
165
+ # Extract features
166
+ with torch.no_grad():
167
+ outputs = transformer_model(**inputs)
168
+ # Use [CLS] token embedding
169
+ cls_embeddings = outputs.last_hidden_state[:, 0, :]
170
+
171
+ return cls_embeddings.cpu().numpy()
172
+
173
+
174
+ def load_all_models():
175
+ """Load all models from HuggingFace Hub"""
176
+ global classifier, scaler, transformer_model, transformer_tokenizer
177
+
178
+ models_dir = ensure_models_directory()
179
+ models_loaded = {
180
+ "classifier": False,
181
+ "scaler": False,
182
+ "transformer_model": False,
183
+ "transformer_tokenizer": False
184
+ }
185
+
186
+ print(f"🚀 Loading models from {MODEL_REPO['repo_id']}...")
187
+ print("=" * 60)
188
+
189
+ try:
190
+ # Download all necessary files
191
+ print("📥 Downloading ProtBERT model files...")
192
+
193
+ files_to_download = ["config", "model_weights", "vocab",
194
+ "tokenizer_config", "special_tokens_map"]
195
+
196
+ for file_key in files_to_download:
197
+ download_model_from_hub(file_key)
198
+
199
+ # Load ProtBERT Tokenizer
200
+ print("🔄 Loading ProtBERT tokenizer...")
201
+ try:
202
+ transformer_tokenizer = BertTokenizer.from_pretrained(
203
+ models_dir,
204
+ do_lower_case=False,
205
+ local_files_only=True
206
+ )
207
+ models_loaded["transformer_tokenizer"] = True
208
+ print("✅ ProtBERT tokenizer loaded!")
209
+ except Exception as e:
210
+ print(f"❌ Error loading tokenizer: {e}")
211
+ # Try loading from HuggingFace directly
212
+ print("🔄 Trying to load tokenizer directly from HuggingFace...")
213
+ token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
214
+ transformer_tokenizer = BertTokenizer.from_pretrained(
215
+ MODEL_REPO["repo_id"],
216
+ do_lower_case=False,
217
+ token=token
218
+ )
219
+ models_loaded["transformer_tokenizer"] = True
220
+ print("✅ ProtBERT tokenizer loaded from HuggingFace!")
221
+
222
+ # Load ProtBERT Model
223
+ print("🔄 Loading ProtBERT model...")
224
+ try:
225
+ transformer_model = BertModel.from_pretrained(
226
+ models_dir,
227
+ local_files_only=True
228
+ )
229
+ models_loaded["transformer_model"] = True
230
+ print("✅ ProtBERT model loaded!")
231
+ except Exception as e:
232
+ print(f"❌ Error loading model: {e}")
233
+ # Try loading from HuggingFace directly
234
+ print("🔄 Trying to load model directly from HuggingFace...")
235
+ token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
236
+ transformer_model = BertModel.from_pretrained(
237
+ MODEL_REPO["repo_id"],
238
+ token=token
239
+ )
240
+ models_loaded["transformer_model"] = True
241
+ print("✅ ProtBERT model loaded from HuggingFace!")
242
+
243
+ transformer_model.to(device)
244
+ transformer_model.eval()
245
+
246
+ # Load Classifier
247
+ print("🔄 Loading classifier (MHSA-GRU)...")
248
+ clf_path = os.path.join(models_dir, MODEL_REPO["files"]["classifier"])
249
+
250
+ if not os.path.exists(clf_path):
251
+ print("📥 Classifier not found locally, downloading...")
252
+ clf_path = download_model_from_hub("classifier")
253
+
254
+ if clf_path and os.path.exists(clf_path):
255
+ checkpoint = torch.load(clf_path, map_location=device, weights_only=False)
256
+
257
+ # Handle different checkpoint formats
258
+ if 'input_dim' in checkpoint:
259
+ input_dim = checkpoint['input_dim']
260
+ else:
261
+ # ProtBERT embedding size is 1024
262
+ input_dim = 1024
263
+
264
+ classifier = MHSA_GRU(input_dim, hidden_dim=256)
265
+
266
+ # Load state dict
267
+ if 'model_state_dict' in checkpoint:
268
+ classifier.load_state_dict(checkpoint['model_state_dict'])
269
+ else:
270
+ classifier.load_state_dict(checkpoint)
271
+
272
+ classifier.to(device)
273
+ classifier.eval()
274
+ models_loaded["classifier"] = True
275
+ print(f"✅ Classifier loaded! (input_dim: {input_dim})")
276
+
277
+ # Load Scaler
278
+ print("🔄 Loading feature scaler...")
279
+ scaler_path = os.path.join(models_dir, MODEL_REPO["files"]["scaler"])
280
+
281
+ if not os.path.exists(scaler_path):
282
+ print("📥 Scaler not found locally, downloading...")
283
+ scaler_path = download_model_from_hub("scaler")
284
+
285
+ if scaler_path and os.path.exists(scaler_path):
286
+ scaler = joblib.load(scaler_path)
287
+ models_loaded["scaler"] = True
288
+ print("✅ Scaler loaded!")
289
+
290
+ loaded_count = sum(models_loaded.values())
291
+ total_count = len(models_loaded)
292
+
293
+ print(f"\n📊 Model Loading Summary:")
294
+ print(f" • Successfully loaded: {loaded_count}/{total_count}")
295
+ print(f" • Repository: {MODEL_REPO['repo_id']}")
296
+ print(f" • Embedding Model: {MODEL_NAME}")
297
+ print(f" • Device: {device}")
298
+
299
+ critical_models = ["classifier", "scaler", "transformer_model", "transformer_tokenizer"]
300
+ critical_loaded = all(models_loaded[m] for m in critical_models)
301
+
302
+ if critical_loaded:
303
+ print("🎉 All critical models loaded successfully!")
304
+ return True
305
+ else:
306
+ print("⚠️ Some critical models failed to load")
307
+ print(f" Models status: {models_loaded}")
308
+ return False
309
+
310
+ except Exception as e:
311
+ print(f"❌ Error loading models: {e}")
312
+ import traceback
313
+ traceback.print_exc()
314
+ return False
315
+
316
+
317
+ # ========================= FASTAPI APPLICATION =========================
318
+ @asynccontextmanager
319
+ async def lifespan(app: FastAPI):
320
+ # Startup
321
+ print("🚀 Starting Toxicity Prediction API...")
322
+ success = load_all_models()
323
+ if not success:
324
+ print("⚠️ Warning: Not all models loaded successfully")
325
+ yield
326
+ # Shutdown
327
+ print("🔄 Shutting down API...")
328
+
329
+
330
+ app = FastAPI(
331
+ title="Toxicity Prediction API",
332
+ description="API for toxicity prediction using MHSA-GRU with Transformer embeddings",
333
+ version="1.0.0",
334
+ lifespan=lifespan
335
+ )
336
+
337
+
338
+ @app.get("/")
339
+ async def root():
340
+ return {
341
+ "message": "Toxicity Prediction API",
342
+ "version": API_VERSION,
343
+ "endpoints": {
344
+ "/predict": "POST - Predict toxicity for a single sequence",
345
+ "/predict/batch": "POST - Predict toxicity for multiple sequences",
346
+ "/example": "GET - Try the API with a hardcoded example sequence",
347
+ "/health": "GET - Check API health and model status"
348
+ },
349
+ "example_usage": {
350
+ "single": {
351
+ "method": "POST",
352
+ "url": "/predict",
353
+ "body": {"sequence": "MKTAYIAKQRQISFVKSHFSRQLE"}
354
+ },
355
+ "batch": {
356
+ "method": "POST",
357
+ "url": "/predict/batch",
358
+ "body": {
359
+ "sequences": [
360
+ "MLLPATMSDKPDMAEIEKFDKSKLKKTETQEKNPLPSKETIEQEKQAGES",
361
+ "MFGLPQQEVSEEEKRAHQEQTEKTLKQAAYVAAFLWVSPMIWHLVKKQWK"
362
+ ]
363
+ }
364
+ },
365
+ "example": {
366
+ "method": "GET",
367
+ "url": "/example",
368
+ "description": "No input needed - just call this endpoint"
369
+ }
370
+ }
371
+ }
372
+
373
+
374
+ @app.post("/predict", response_model=PredictionResponse)
375
+ async def predict(request: SequenceRequest):
376
+ start_time = time.time()
377
+ timestamp = datetime.now(timezone.utc).isoformat()
378
+
379
+ try:
380
+ if not request.sequence or len(request.sequence) == 0:
381
+ raise HTTPException(
382
+ status_code=400,
383
+ detail={
384
+ "status_code": 400,
385
+ "status": "error",
386
+ "success": False,
387
+ "error": "No sequence provided",
388
+ "error_code": "MISSING_SEQUENCE",
389
+ "timestamp": timestamp,
390
+ "api_version": API_VERSION,
391
+ "processing_time_ms": round((time.time() - start_time) * 1000, 2)
392
+ }
393
+ )
394
+
395
+ # Check if models are loaded
396
+ if classifier is None or scaler is None or transformer_model is None:
397
+ raise HTTPException(
398
+ status_code=503,
399
+ detail={
400
+ "status_code": 503,
401
+ "status": "error",
402
+ "success": False,
403
+ "error": "Models not loaded properly",
404
+ "error_code": "MODEL_NOT_LOADED",
405
+ "timestamp": timestamp,
406
+ "api_version": API_VERSION,
407
+ "processing_time_ms": round((time.time() - start_time) * 1000, 2)
408
+ }
409
+ )
410
+
411
+ # Validate sequence
412
+ sequence = request.sequence.upper().strip()
413
+ if len(sequence) < 10:
414
+ raise HTTPException(
415
+ status_code=400,
416
+ detail={
417
+ "status_code": 400,
418
+ "status": "error",
419
+ "success": False,
420
+ "error": "Sequence too short (minimum 10 characters)",
421
+ "error_code": "SEQUENCE_TOO_SHORT",
422
+ "timestamp": timestamp,
423
+ "api_version": API_VERSION,
424
+ "processing_time_ms": round((time.time() - start_time) * 1000, 2)
425
+ }
426
+ )
427
+
428
+ # Step 1: Extract features using ProtBERT
429
+ features = extract_features_from_sequence(sequence)
430
+
431
+ # Step 2: Scale features
432
+ scaled_features = scaler.transform(features)
433
+
434
+ # Step 3: Predict using MHSA-GRU
435
+ features_tensor = torch.FloatTensor(scaled_features).to(device)
436
+
437
+ with torch.no_grad():
438
+ probability = classifier(features_tensor).cpu().numpy()[0, 0]
439
+
440
+ # Determine prediction
441
+ prediction_class = 1 if probability > 0.5 else 0
442
+ predicted_label = "Toxic" if prediction_class == 1 else "Non-Toxic"
443
+ confidence = float(abs(probability - 0.5) * 2)
444
+
445
+ # Determine confidence level
446
+ if confidence > 0.8:
447
+ confidence_level = "high"
448
+ elif confidence > 0.6:
449
+ confidence_level = "medium"
450
+ else:
451
+ confidence_level = "low"
452
+
453
+ processing_time = round((time.time() - start_time) * 1000, 2)
454
+
455
+ return PredictionResponse(
456
+ status_code=200,
457
+ status="success",
458
+ success=True,
459
+ data={
460
+ "sequence": sequence[:100] + "..." if len(sequence) > 100 else sequence,
461
+ "sequence_length": len(sequence),
462
+ "prediction": {
463
+ "predicted_class": predicted_label,
464
+ "confidence": confidence,
465
+ "confidence_level": confidence_level,
466
+ "toxicity_score": float(probability),
467
+ "non_toxicity_score": float(1 - probability)
468
+ },
469
+ "metadata": {
470
+ "embedding_model": MODEL_NAME,
471
+ "embedding_type": EMBEDDING_TYPE,
472
+ "model_version": MODEL_VERSION,
473
+ "device": str(device)
474
+ }
475
+ },
476
+ timestamp=timestamp,
477
+ api_version=API_VERSION,
478
+ processing_time_ms=processing_time
479
+ )
480
+
481
+ except HTTPException:
482
+ raise
483
+ except Exception as e:
484
+ processing_time = round((time.time() - start_time) * 1000, 2)
485
+ raise HTTPException(
486
+ status_code=500,
487
+ detail={
488
+ "status_code": 500,
489
+ "status": "error",
490
+ "success": False,
491
+ "error": f"Internal server error: {str(e)}",
492
+ "error_code": "INTERNAL_ERROR",
493
+ "timestamp": timestamp,
494
+ "api_version": API_VERSION,
495
+ "processing_time_ms": processing_time
496
+ }
497
+ )
498
+
499
+
500
+ @app.post("/predict/batch", response_model=PredictionResponse)
501
+ async def predict_batch(request: BatchSequenceRequest):
502
+ """
503
+ Predict toxicity for multiple sequences at once.
504
+
505
+ Example request body:
506
+ {
507
+ "sequences": [
508
+ "MLLPATMSDKPDMAEIEKFDKSKLKKTETQEKNPLPSKETIEQEKQAGES",
509
+ "MFGLPQQEVSEEEKRAHQEQTEKTLKQAAYVAAFLWVSPMIWHLVKKQWK"
510
+ ]
511
+ }
512
+ """
513
+ start_time = time.time()
514
+ timestamp = datetime.now(timezone.utc).isoformat()
515
+
516
+ try:
517
+ if not request.sequences or len(request.sequences) == 0:
518
+ raise HTTPException(
519
+ status_code=400,
520
+ detail={
521
+ "status_code": 400,
522
+ "status": "error",
523
+ "success": False,
524
+ "error": "No sequences provided",
525
+ "error_code": "MISSING_SEQUENCES",
526
+ "timestamp": timestamp,
527
+ "api_version": API_VERSION,
528
+ "processing_time_ms": round((time.time() - start_time) * 1000, 2)
529
+ }
530
+ )
531
+
532
+ # Check if models are loaded
533
+ if classifier is None or scaler is None or transformer_model is None:
534
+ raise HTTPException(
535
+ status_code=503,
536
+ detail={
537
+ "status_code": 503,
538
+ "status": "error",
539
+ "success": False,
540
+ "error": "Models not loaded properly",
541
+ "error_code": "MODEL_NOT_LOADED",
542
+ "timestamp": timestamp,
543
+ "api_version": API_VERSION,
544
+ "processing_time_ms": round((time.time() - start_time) * 1000, 2)
545
+ }
546
+ )
547
+
548
+ results = []
549
+
550
+ for idx, seq in enumerate(request.sequences, 1):
551
+ try:
552
+ sequence = seq.upper().strip()
553
+
554
+ # Validate sequence length
555
+ if len(sequence) < 10:
556
+ results.append({
557
+ "sequence_index": idx,
558
+ "sequence": sequence[:100] + "..." if len(sequence) > 100 else sequence,
559
+ "sequence_length": len(sequence),
560
+ "error": "Sequence too short (minimum 10 characters)",
561
+ "predicted_class": None,
562
+ "toxicity_score": None,
563
+ "confidence": None
564
+ })
565
+ continue
566
+
567
+ # Extract features using ProtBERT
568
+ features = extract_features_from_sequence(sequence)
569
+ scaled_features = scaler.transform(features)
570
+ features_tensor = torch.FloatTensor(scaled_features).to(device)
571
+
572
+ with torch.no_grad():
573
+ probability = classifier(features_tensor).cpu().numpy()[0, 0]
574
+
575
+ prediction_class = 1 if probability > 0.5 else 0
576
+ predicted_label = "Toxic" if prediction_class == 1 else "Non-Toxic"
577
+ confidence = float(abs(probability - 0.5) * 2)
578
+
579
+ # Determine confidence level
580
+ if confidence > 0.8:
581
+ confidence_level = "high"
582
+ elif confidence > 0.6:
583
+ confidence_level = "medium"
584
+ else:
585
+ confidence_level = "low"
586
+
587
+ results.append({
588
+ "sequence_index": idx,
589
+ "sequence": sequence[:100] + "..." if len(sequence) > 100 else sequence,
590
+ "sequence_length": len(sequence),
591
+ "predicted_class": predicted_label,
592
+ "toxicity_score": float(probability),
593
+ "non_toxicity_score": float(1 - probability),
594
+ "confidence": confidence,
595
+ "confidence_level": confidence_level,
596
+ "error": None
597
+ })
598
+
599
+ except Exception as e:
600
+ # Handle individual sequence errors without stopping the batch
601
+ results.append({
602
+ "sequence_index": idx,
603
+ "sequence": seq[:100] + "..." if len(seq) > 100 else seq,
604
+ "sequence_length": len(seq),
605
+ "error": f"Error processing sequence: {str(e)}",
606
+ "predicted_class": None,
607
+ "toxicity_score": None,
608
+ "confidence": None
609
+ })
610
+
611
+ processing_time = round((time.time() - start_time) * 1000, 2)
612
+
613
+ # Count successful predictions
614
+ successful_predictions = sum(1 for r in results if r.get("predicted_class") is not None)
615
+
616
+ return PredictionResponse(
617
+ status_code=200,
618
+ status="success",
619
+ success=True,
620
+ data={
621
+ "total_sequences": len(request.sequences),
622
+ "successful_predictions": successful_predictions,
623
+ "failed_predictions": len(request.sequences) - successful_predictions,
624
+ "results": results,
625
+ "metadata": {
626
+ "embedding_model": MODEL_NAME,
627
+ "embedding_type": EMBEDDING_TYPE,
628
+ "model_version": MODEL_VERSION,
629
+ "device": str(device)
630
+ }
631
+ },
632
+ timestamp=timestamp,
633
+ api_version=API_VERSION,
634
+ processing_time_ms=processing_time
635
+ )
636
+
637
+ except HTTPException:
638
+ raise
639
+ except Exception as e:
640
+ processing_time = round((time.time() - start_time) * 1000, 2)
641
+ raise HTTPException(
642
+ status_code=500,
643
+ detail={
644
+ "status_code": 500,
645
+ "status": "error",
646
+ "success": False,
647
+ "error": f"Internal server error: {str(e)}",
648
+ "error_code": "INTERNAL_ERROR",
649
+ "timestamp": timestamp,
650
+ "api_version": API_VERSION,
651
+ "processing_time_ms": processing_time
652
+ }
653
+ )
654
+
655
+ @app.get("/example", response_model=PredictionResponse)
656
+ async def predict_example():
657
+ """
658
+ Predict using a hardcoded example protein sequence.
659
+ No input required - just call this endpoint to see how the API works.
660
+
661
+ Example sequence: MLLPATMSDKPDMAEIEKFDKSKLKKTETQEKNPLPSKETIEQEKQAGES
662
+ """
663
+ start_time = time.time()
664
+ timestamp = datetime.now(timezone.utc).isoformat()
665
+
666
+ # Hardcoded example sequence
667
+ EXAMPLE_SEQUENCE = "MLLPATMSDKPDMAEIEKFDKSKLKKTETQEKNPLPSKETIEQEKQAGES"
668
+
669
+ try:
670
+ # Check if models are loaded
671
+ if classifier is None or scaler is None or transformer_model is None:
672
+ raise HTTPException(
673
+ status_code=503,
674
+ detail={
675
+ "status_code": 503,
676
+ "status": "error",
677
+ "success": False,
678
+ "error": "Models not loaded properly",
679
+ "error_code": "MODEL_NOT_LOADED",
680
+ "timestamp": timestamp,
681
+ "api_version": API_VERSION,
682
+ "processing_time_ms": round((time.time() - start_time) * 1000, 2)
683
+ }
684
+ )
685
+
686
+ sequence = EXAMPLE_SEQUENCE.upper().strip()
687
+
688
+ # Step 1: Extract features using ProtBERT
689
+ features = extract_features_from_sequence(sequence)
690
+
691
+ # Step 2: Scale features
692
+ scaled_features = scaler.transform(features)
693
+
694
+ # Step 3: Predict using MHSA-GRU
695
+ features_tensor = torch.FloatTensor(scaled_features).to(device)
696
+
697
+ with torch.no_grad():
698
+ probability = classifier(features_tensor).cpu().numpy()[0, 0]
699
+
700
+ # Determine prediction
701
+ prediction_class = 1 if probability > 0.5 else 0
702
+ predicted_label = "Toxic" if prediction_class == 1 else "Non-Toxic"
703
+ confidence = float(abs(probability - 0.5) * 2)
704
+
705
+ # Determine confidence level
706
+ if confidence > 0.8:
707
+ confidence_level = "high"
708
+ elif confidence > 0.6:
709
+ confidence_level = "medium"
710
+ else:
711
+ confidence_level = "low"
712
+
713
+ processing_time = round((time.time() - start_time) * 1000, 2)
714
+
715
+ return PredictionResponse(
716
+ status_code=200,
717
+ status="success",
718
+ success=True,
719
+ data={
720
+ "note": "This is an example prediction using a hardcoded sequence",
721
+ "sequence": sequence,
722
+ "sequence_length": len(sequence),
723
+ "prediction": {
724
+ "predicted_class": predicted_label,
725
+ "confidence": confidence,
726
+ "confidence_level": confidence_level,
727
+ "toxicity_score": float(probability),
728
+ "non_toxicity_score": float(1 - probability)
729
+ },
730
+ "metadata": {
731
+ "embedding_model": MODEL_NAME,
732
+ "embedding_type": EMBEDDING_TYPE,
733
+ "model_version": MODEL_VERSION,
734
+ "device": str(device),
735
+ "source": "hardcoded_example"
736
+ }
737
+ },
738
+ timestamp=timestamp,
739
+ api_version=API_VERSION,
740
+ processing_time_ms=processing_time
741
+ )
742
+
743
+ except HTTPException:
744
+ raise
745
+ except Exception as e:
746
+ processing_time = round((time.time() - start_time) * 1000, 2)
747
+ raise HTTPException(
748
+ status_code=500,
749
+ detail={
750
+ "status_code": 500,
751
+ "status": "error",
752
+ "success": False,
753
+ "error": f"Internal server error: {str(e)}",
754
+ "error_code": "INTERNAL_ERROR",
755
+ "timestamp": timestamp,
756
+ "api_version": API_VERSION,
757
+ "processing_time_ms": processing_time
758
+ }
759
+ )
760
+
761
+ @app.get("/health", response_model=HealthResponse)
762
+ async def health_check():
763
+ models_loaded = all([
764
+ classifier is not None,
765
+ scaler is not None,
766
+ transformer_model is not None,
767
+ transformer_tokenizer is not None
768
+ ])
769
+
770
+ model_sources = {
771
+ "classifier": {
772
+ "loaded": classifier is not None,
773
+ "source": "huggingface_hub",
774
+ "repository": MODEL_REPO["repo_id"]
775
+ },
776
+ "scaler": {
777
+ "loaded": scaler is not None,
778
+ "source": "huggingface_hub",
779
+ "repository": MODEL_REPO["repo_id"]
780
+ },
781
+ "transformer_model": {
782
+ "loaded": transformer_model is not None,
783
+ "model_name": MODEL_NAME,
784
+ "source": "huggingface_hub",
785
+ "repository": MODEL_REPO["repo_id"]
786
+ }
787
+ }
788
+
789
+ repository_info = {
790
+ "repository_id": MODEL_REPO["repo_id"],
791
+ "embedding_type": EMBEDDING_TYPE,
792
+ "model_name": MODEL_NAME,
793
+ "total_models": len(MODEL_REPO["files"])
794
+ }
795
+
796
+ return HealthResponse(
797
+ status_code=200 if models_loaded else 503,
798
+ status="healthy" if models_loaded else "unhealthy",
799
+ service="Toxicity Prediction API",
800
+ api_version=API_VERSION,
801
+ model_version=MODEL_VERSION,
802
+ models_loaded=models_loaded,
803
+ models_loaded_count=sum(1 for m in (classifier, scaler, transformer_model, transformer_tokenizer) if m is not None),  # include the tokenizer so the count can reach total_models_required=4
804
+ total_models_required=4,
805
+ model_sources=model_sources,
806
+ repository_info=repository_info,
807
+ device=str(device),
808
+ timestamp=datetime.now(timezone.utc).isoformat()
809
+ )
810
+
811
+
812
+ if __name__ == "__main__":
813
+ uvicorn.run(app, host="0.0.0.0", port=8000)
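app.py extends the earlier handlers with per-sequence error handling in /predict/batch and a zero-input /example demo endpoint. A usage sketch under the same local-deployment assumption:

import requests

BASE = "http://localhost:8000"  # assumed local deployment

# Batch prediction: a too-short sequence produces a per-item error entry
# instead of failing the whole request (see the per-sequence try/except above).
resp = requests.post(f"{BASE}/predict/batch", json={"sequences": [
    "MLLPATMSDKPDMAEIEKFDKSKLKKTETQEKNPLPSKETIEQEKQAGES",
    "SHORT",  # fewer than 10 characters
]})
for item in resp.json()["data"]["results"]:
    print(item["sequence_index"], item.get("predicted_class"), item.get("error"))

# Built-in demo and the health probe:
print(requests.get(f"{BASE}/example").json()["data"]["prediction"]["predicted_class"])
print(requests.get(f"{BASE}/health").json()["status"])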
convert_base64.ipynb ADDED
@@ -0,0 +1,72 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "81ff91ce53ae83fe",
7
+ "metadata": {
8
+ "ExecuteTime": {
9
+ "end_time": "2025-07-10T07:07:50.829656Z",
10
+ "start_time": "2025-07-10T07:07:50.824248Z"
11
+ }
12
+ },
13
+ "outputs": [],
14
+ "source": [
15
+ "import base64"
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "code",
20
+ "execution_count": null,
21
+ "id": "initial_id",
22
+ "metadata": {
23
+ "ExecuteTime": {
24
+ "end_time": "2025-07-10T07:08:19.010102Z",
25
+ "start_time": "2025-07-10T07:08:19.004314Z"
26
+ },
27
+ "collapsed": true
28
+ },
29
+ "outputs": [],
30
+ "source": [
31
+ "with open(\"examples/cancer_example.jpg\", \"rb\") as f:\n",
32
+ " encoded = base64.b64encode(f.read()).decode()"
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "execution_count": null,
38
+ "id": "35cac43020ae6db3",
39
+ "metadata": {
40
+ "ExecuteTime": {
41
+ "end_time": "2025-07-10T07:08:35.977343Z",
42
+ "start_time": "2025-07-10T07:08:35.973715Z"
43
+ }
44
+ },
45
+ "outputs": [],
46
+ "source": [
47
+ "print(encoded)"
48
+ ]
49
+ }
50
+ ],
51
+ "metadata": {
52
+ "kernelspec": {
53
+ "display_name": "3.12.2",
54
+ "language": "python",
55
+ "name": "python3"
56
+ },
57
+ "language_info": {
58
+ "codemirror_mode": {
59
+ "name": "ipython",
60
+ "version": 3
61
+ },
62
+ "file_extension": ".py",
63
+ "mimetype": "text/x-python",
64
+ "name": "python",
65
+ "nbconvert_exporter": "python",
66
+ "pygments_lexer": "ipython3",
67
+ "version": "3.12.2"
68
+ }
69
+ },
70
+ "nbformat": 4,
71
+ "nbformat_minor": 5
72
+ }
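The notebook base64-encodes an example image into a JSON-safe string. For completeness, a sketch of the reverse step (the output path is illustrative):

import base64

with open("examples/cancer_example.jpg", "rb") as f:
    encoded = base64.b64encode(f.read()).decode()

# Decode back to bytes and verify the round trip.
decoded = base64.b64decode(encoded)
with open("examples/roundtrip.jpg", "wb") as f:  # hypothetical output path
    f.write(decoded)
assert decoded[:2] == b"\xff\xd8"  # JPEG data starts with the FF D8 marker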
images/camlas-background.png ADDED
requirements.txt ADDED
@@ -0,0 +1,19 @@
1
+ torch
2
+ torchvision
3
+ huggingface_hub
4
+ numpy<2.3.0
5
+ pandas
6
+ scikit-learn
7
+ Pillow
8
+ matplotlib
9
+ seaborn
10
+ plotly
11
+ requests
12
+ python-dotenv  # provides `from dotenv import load_dotenv` used in app.py
13
+ fastapi
14
+ uvicorn[standard]
15
+ pydantic
16
+ timm
17
+ python-multipart
18
+ transformers
19
+ # opencv-python
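One dependency note: app.py imports joblib, which is not listed here and is installed only transitively via scikit-learn, so pinning it explicitly would be safer. A quick sanity check after pip install -r requirements.txt:

# Verify that the core imports used by app.py resolve in this environment.
import torch, transformers, fastapi, dotenv, joblib  # joblib arrives via scikit-learn
print("torch", torch.__version__, "| transformers", transformers.__version__)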
utils/model_classes.py ADDED
@@ -0,0 +1,72 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ class MultiHeadSelfAttention(nn.Module):
5
+ """Multi-Head Self-Attention mechanism"""
6
+ def __init__(self, embed_dim, num_heads, dropout=0.3):
7
+ super(MultiHeadSelfAttention, self).__init__()
8
+ self.attention = nn.MultiheadAttention(
9
+ embed_dim=embed_dim,
10
+ num_heads=num_heads,
11
+ dropout=dropout,
12
+ batch_first=True
13
+ )
14
+ self.layer_norm = nn.LayerNorm(embed_dim)
15
+ self.dropout = nn.Dropout(dropout)
16
+
17
+ def forward(self, x):
18
+ attn_output, _ = self.attention(x, x, x)
19
+ x = self.layer_norm(x + self.dropout(attn_output))
20
+ return x
21
+
22
+
23
+ class MHSA_GRU(nn.Module):
24
+ """Multi-Head Self-Attention with GRU model"""
25
+ def __init__(self, input_dim, hidden_dim=256, num_heads=8, num_gru_layers=2, dropout=0.3):
26
+ super(MHSA_GRU, self).__init__()
27
+
28
+ self.input_dim = input_dim
29
+ self.hidden_dim = hidden_dim
30
+
31
+ self.input_projection = nn.Linear(input_dim, hidden_dim)
32
+ self.mhsa1 = MultiHeadSelfAttention(hidden_dim, num_heads, dropout)
33
+ self.mhsa2 = MultiHeadSelfAttention(hidden_dim, num_heads, dropout)
34
+
35
+ self.gru = nn.GRU(
36
+ input_size=hidden_dim,
37
+ hidden_size=hidden_dim,
38
+ num_layers=num_gru_layers,
39
+ batch_first=True,
40
+ dropout=dropout if num_gru_layers > 1 else 0,
41
+ bidirectional=False
42
+ )
43
+
44
+ self.mhsa3 = MultiHeadSelfAttention(hidden_dim, num_heads, dropout)
45
+ self.dropout = nn.Dropout(dropout)
46
+
47
+ self.fc1 = nn.Linear(hidden_dim, hidden_dim // 2)
48
+ self.fc2 = nn.Linear(hidden_dim // 2, hidden_dim // 4)
49
+ self.fc3 = nn.Linear(hidden_dim // 4, 1)
50
+
51
+ self.bn1 = nn.BatchNorm1d(hidden_dim // 2)
52
+ self.bn2 = nn.BatchNorm1d(hidden_dim // 4)
53
+
54
+ def forward(self, x):
55
+ batch_size = x.size(0)
56
+ x = self.input_projection(x)
57
+ x = x.unsqueeze(1)
58
+
59
+ x = self.mhsa1(x)
60
+ x = self.mhsa2(x)
61
+ gru_out, hidden = self.gru(x)
62
+ x = self.mhsa3(gru_out)
63
+ x = x[:, -1, :]
64
+
65
+ x = self.dropout(x)
66
+ x = torch.relu(self.bn1(self.fc1(x)))
67
+ x = self.dropout(x)
68
+ x = torch.relu(self.bn2(self.fc2(x)))
69
+ x = self.dropout(x)
70
+ x = self.fc3(x)
71
+
72
+ return torch.sigmoid(x)
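A minimal smoke test for the classifier head, assuming ProtBERT's 1024-dimensional [CLS] embeddings as input (the default input_dim that app.py falls back to when the checkpoint stores none):

import torch
from utils.model_classes import MHSA_GRU

model = MHSA_GRU(input_dim=1024, hidden_dim=256).eval()  # eval() fixes BatchNorm/Dropout for inference
with torch.no_grad():
    out = model(torch.randn(4, 1024))  # a batch of 4 scaled feature vectors
print(out.shape)  # torch.Size([4, 1]); sigmoid probabilities in [0, 1]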