Commit e5d40e3 · 1 parent: 8d272fe
fix: Update Gradio to 4.44.1 and improve interface
- .github/workflows/ci.yml +142 -0
- .pre-commit-config.yaml +69 -0
- README.md +159 -52
- app.py +0 -148
- docs/api/README.md +121 -0
- docs/guides/developer_guide.md +362 -0
- docs/guides/user_guide.md +164 -0
- examples/api_client.py +127 -0
- examples/llava_demo.ipynb +1 -0
- examples/process_image.py +103 -0
- pyproject.toml +181 -0
- requirements-dev.txt +38 -0
- requirements.txt +18 -19
- src/__init__.py +0 -0
- src/api/__init__.py +0 -0
- src/api/app.py +159 -0
- src/configs/__init__.py +0 -0
- src/configs/settings.py +46 -0
- src/models/__init__.py +0 -0
- src/models/llava_model.py +88 -0
- main.py → src/models/main.py +0 -0
- src/requirements.txt +26 -0
- src/utils/__init__.py +0 -0
- src/utils/logging.py +51 -0
- tests/test_model.py +67 -0
.github/workflows/ci.yml
ADDED
@@ -0,0 +1,142 @@
+name: CI
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.8", "3.9", "3.10", "3.11"]
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: 'pip'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pip install -r requirements-dev.txt
+
+      - name: Run pre-commit hooks
+        run: |
+          pre-commit install
+          pre-commit run --all-files
+
+      - name: Run tests
+        run: |
+          pytest --cov=src --cov-report=xml
+
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v4
+        with:
+          file: ./coverage.xml
+          fail_ci_if_error: true
+
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+          cache: 'pip'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements-dev.txt
+
+      - name: Run black
+        run: black --check src tests
+
+      - name: Run isort
+        run: isort --check-only src tests
+
+      - name: Run flake8
+        run: flake8 src tests
+
+      - name: Run mypy
+        run: mypy src
+
+  build:
+    needs: [test, lint]
+    runs-on: ubuntu-latest
+    if: github.event_name == 'push' && github.ref == 'refs/heads/main'
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+          cache: 'pip'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install build twine
+
+      - name: Build package
+        run: python -m build
+
+      - name: Check package
+        run: twine check dist/*
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: dist
+          path: dist/
+
+  deploy:
+    needs: build
+    runs-on: ubuntu-latest
+    if: github.event_name == 'push' && github.ref == 'refs/heads/main'
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Download artifacts
+        uses: actions/download-artifact@v4
+        with:
+          name: dist
+          path: dist
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+          cache: 'pip'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install twine
+
+      - name: Deploy to PyPI
+        env:
+          TWINE_USERNAME: __token__
+          TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
+        run: twine upload dist/*
+
+      - name: Deploy to Hugging Face
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: |
+          pip install huggingface_hub
+          huggingface-cli login --token $HF_TOKEN
+          huggingface-cli upload Prashant26am/llava-chat dist/* --repo-type space
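Note: the deploy job drives the Space upload through `huggingface-cli`; the same step can be scripted with the `huggingface_hub` Python API. A minimal sketch, assuming the same `HF_TOKEN` secret and Space ID as the workflow above:

```python
# Sketch: upload the built artifacts to the Space with the huggingface_hub
# Python API instead of the CLI. Assumes HF_TOKEN is set in the environment.
import os

from huggingface_hub import HfApi

api = HfApi(token=os.environ["HF_TOKEN"])
api.upload_folder(
    folder_path="dist",                 # artifacts produced by `python -m build`
    repo_id="Prashant26am/llava-chat",  # the Space targeted by the workflow
    repo_type="space",
)
```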
.pre-commit-config.yaml
ADDED
@@ -0,0 +1,69 @@
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.5.0
+    hooks:
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: check-yaml
+      - id: check-added-large-files
+      - id: check-ast
+      - id: check-json
+      - id: check-merge-conflict
+      - id: detect-private-key
+      - id: debug-statements
+
+  - repo: https://github.com/psf/black
+    rev: 24.1.1
+    hooks:
+      - id: black
+        language_version: python3.8
+
+  - repo: https://github.com/pycqa/isort
+    rev: 5.13.2
+    hooks:
+      - id: isort
+        args: ["--profile", "black"]
+
+  - repo: https://github.com/pycqa/flake8
+    rev: 7.0.0
+    hooks:
+      - id: flake8
+        additional_dependencies:
+          - flake8-docstrings
+          - flake8-bugbear
+          - flake8-comprehensions
+          - flake8-simplify
+          - flake8-unused-arguments
+          - flake8-variables-names
+          - pep8-naming
+
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.8.0
+    hooks:
+      - id: mypy
+        additional_dependencies:
+          - types-Pillow
+          - types-requests
+          - types-setuptools
+          - types-urllib3
+
+  - repo: https://github.com/asottile/pyupgrade
+    rev: v3.15.0
+    hooks:
+      - id: pyupgrade
+        args: [--py38-plus]
+
+  - repo: https://github.com/PyCQA/bandit
+    rev: 1.7.7
+    hooks:
+      - id: bandit
+        args: ["-c", "pyproject.toml"]
+        additional_dependencies: ["bandit[toml]"]
+
+  - repo: https://github.com/pre-commit/mirrors-prettier
+    rev: v4.0.0-alpha.8
+    hooks:
+      - id: prettier
+        types_or: [javascript, jsx, ts, tsx, json, css, scss, md, yaml, yml]
+        additional_dependencies:
README.md
CHANGED
@@ -1,52 +1,159 @@
+# LLaVA Implementation
+
+[License: MIT](https://opensource.org/licenses/MIT)
+[Python 3.8+](https://www.python.org/downloads/)
+[Gradio](https://gradio.app/)
+[Live on Hugging Face Spaces](https://huggingface.co/spaces/Prashant26am/llava-chat)
+
+A modern implementation of LLaVA (Large Language and Vision Assistant) with a beautiful web interface. This project combines state-of-the-art vision and language models to create an interactive AI assistant that can understand and discuss images.
+
+## 🌟 Features
+
+- **Modern Web Interface**
+  - Beautiful Gradio-based UI
+  - Real-time image analysis
+  - Interactive chat experience
+  - Responsive design
+
+- **Advanced AI Capabilities**
+  - CLIP ViT-L/14 vision encoder
+  - Vicuna-7B language model
+  - Multimodal understanding
+  - Natural conversation flow
+
+- **Developer Friendly**
+  - Clean, modular codebase
+  - Comprehensive documentation
+  - Easy deployment options
+  - Extensible architecture
+
+## 📋 Project Structure
+
+```
+llava_implementation/
+├── src/            # Source code
+│   ├── api/        # API endpoints and FastAPI app
+│   ├── models/     # Model implementations
+│   ├── utils/      # Utility functions
+│   └── configs/    # Configuration files
+├── tests/          # Test suite
+├── docs/           # Documentation
+│   ├── api/        # API documentation
+│   ├── examples/   # Usage examples
+│   └── guides/     # User and developer guides
+├── assets/         # Static assets
+│   ├── images/     # Example images
+│   └── icons/      # UI icons
+├── scripts/        # Utility scripts
+└── examples/       # Example images for the web interface
+```
+
+## 🚀 Quick Start
+
+### Prerequisites
+
+- Python 3.8+
+- CUDA-capable GPU (recommended)
+- Git
+
+### Installation
+
+1. Clone the repository:
+```bash
+git clone https://github.com/Prashant-ambati/llava-implementation.git
+cd llava-implementation
+```
+
+2. Create and activate a virtual environment:
+```bash
+python -m venv venv
+source venv/bin/activate  # On Windows: venv\Scripts\activate
+```
+
+3. Install dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+### Running Locally
+
+1. Start the development server:
+```bash
+python src/api/app.py
+```
+
+2. Open your browser and navigate to:
+```
+http://localhost:7860
+```
+
+## 🌐 Web Deployment
+
+### Hugging Face Spaces
+
+The application is deployed on Hugging Face Spaces:
+- [Live Demo](https://huggingface.co/spaces/Prashant26am/llava-chat)
+- Automatic deployment from main branch
+- Free GPU resources
+- Public API access
+
+### Local Deployment
+
+For local deployment:
+```bash
+# Build the application
+python -m build
+
+# Run with production settings
+python src/api/app.py --production
+```
+
+## 📚 Documentation
+
+- [API Documentation](docs/api/README.md)
+- [User Guide](docs/guides/user_guide.md)
+- [Developer Guide](docs/guides/developer_guide.md)
+- [Examples](docs/examples/README.md)
+
+## 🛠️ Development
+
+### Running Tests
+
+```bash
+pytest tests/
+```
+
+### Code Style
+
+This project follows PEP 8 guidelines. To check your code:
+
+```bash
+flake8 src/
+black src/
+```
+
+### Contributing
+
+1. Fork the repository
+2. Create a feature branch
+3. Commit your changes
+4. Push to the branch
+5. Create a Pull Request
+
+## 📝 License
+
+This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+## 🙏 Acknowledgments
+
+- [LLaVA Paper](https://arxiv.org/abs/2304.08485) by Microsoft Research
+- [Gradio](https://gradio.app/) for the web interface
+- [Hugging Face](https://huggingface.co/) for model hosting
+- [Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) for the language model
+- [CLIP](https://openai.com/research/clip) for the vision model
+
+## 📞 Contact
+
+- GitHub Issues: [Report a bug](https://github.com/Prashant-ambati/llava-implementation/issues)
+- Email: [Your Email]
+- Twitter: [@YourTwitter]
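Note: besides the browser UI, the deployed Space can be queried programmatically. A minimal sketch using the `gradio_client` package; the endpoint names and argument order are not documented in this README, so discover them first with `view_api()`:

```python
# Sketch: call the hosted Space from Python (pip install gradio_client).
# The concrete endpoint signature is an assumption; view_api() prints the
# real one for the deployed interface.
from gradio_client import Client

client = Client("Prashant26am/llava-chat")
print(client.view_api())  # lists endpoints, parameter names, and types
# result = client.predict(..., api_name="/predict")  # fill in per view_api() output
```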
app.py
DELETED
@@ -1,148 +0,0 @@
-from fastapi import FastAPI, UploadFile, File, HTTPException
-from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import JSONResponse
-import os
-import tempfile
-from typing import Optional
-from pydantic import BaseModel
-import torch
-import gradio as gr
-from models.llava import LLaVA
-
-# Initialize model globally
-model = None
-
-def initialize_model():
-    global model
-    try:
-        model = LLaVA(
-            vision_model_path="openai/clip-vit-large-patch14-336",
-            language_model_path="lmsys/vicuna-7b-v1.5",
-            device="cuda" if torch.cuda.is_available() else "cpu",
-            load_in_8bit=True
-        )
-        print(f"Model initialized on {model.device}")
-        return True
-    except Exception as e:
-        print(f"Error initializing model: {e}")
-        return False
-
-def process_image(image, prompt, max_new_tokens=256, temperature=0.7, top_p=0.9):
-    if not model:
-        return "Error: Model not initialized"
-
-    try:
-        # Save the uploaded image temporarily
-        with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as temp_file:
-            image.save(temp_file.name)
-            temp_path = temp_file.name
-
-        # Generate response
-        response = model.generate_from_image(
-            image_path=temp_path,
-            prompt=prompt,
-            max_new_tokens=max_new_tokens,
-            temperature=temperature,
-            top_p=top_p
-        )
-
-        # Clean up temporary file
-        os.unlink(temp_path)
-        return response
-
-    except Exception as e:
-        return f"Error processing image: {str(e)}"
-
-# Create Gradio interface
-def create_interface():
-    with gr.Blocks(title="LLaVA Chat", theme=gr.themes.Soft()) as demo:
-        gr.Markdown("""
-        # LLaVA Chat
-        Upload an image and chat with LLaVA about it. This model can understand and describe images, answer questions about them, and engage in visual conversations.
-        """)
-
-        with gr.Row():
-            with gr.Column(scale=1):
-                image_input = gr.Image(type="pil", label="Upload Image")
-                prompt_input = gr.Textbox(
-                    label="Ask about the image",
-                    placeholder="What can you see in this image?",
-                    lines=3
-                )
-
-                with gr.Accordion("Advanced Settings", open=False):
-                    max_tokens = gr.Slider(
-                        minimum=32,
-                        maximum=512,
-                        value=256,
-                        step=32,
-                        label="Max New Tokens"
-                    )
-                    temperature = gr.Slider(
-                        minimum=0.1,
-                        maximum=1.0,
-                        value=0.7,
-                        step=0.1,
-                        label="Temperature"
-                    )
-                    top_p = gr.Slider(
-                        minimum=0.1,
-                        maximum=1.0,
-                        value=0.9,
-                        step=0.1,
-                        label="Top P"
-                    )
-
-                submit_btn = gr.Button("Generate Response", variant="primary")
-
-            with gr.Column(scale=1):
-                output = gr.Textbox(
-                    label="Model Response",
-                    lines=10,
-                    show_copy_button=True
-                )
-
-        # Set up the submit action
-        submit_btn.click(
-            fn=process_image,
-            inputs=[image_input, prompt_input, max_tokens, temperature, top_p],
-            outputs=output
-        )
-
-        # Add examples
-        gr.Examples(
-            examples=[
-                ["examples/cat.jpg", "What can you see in this image?"],
-                ["examples/landscape.jpg", "Describe this scene in detail."],
-                ["examples/food.jpg", "What kind of food is this and how would you describe it?"]
-            ],
-            inputs=[image_input, prompt_input]
-        )
-
-    return demo
-
-# Create FastAPI app
-app = FastAPI(title="LLaVA Web Interface")
-
-# Configure CORS
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-
-# Create Gradio app
-demo = create_interface()
-
-# Mount Gradio app
-app = gr.mount_gradio_app(app, demo, path="/")
-
-if __name__ == "__main__":
-    # Initialize model
-    if initialize_model():
-        import uvicorn
-        uvicorn.run(app, host="0.0.0.0", port=7860)  # Hugging Face Spaces uses port 7860
-    else:
-        print("Failed to initialize model. Exiting...")
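Note: the deleted handler writes the PIL image to a named temporary file because the old model API took a path, but it leaks that file if `generate_from_image` raises. A small sketch of the same pattern with guaranteed cleanup, for reference only (the replacement `src/api/app.py` added by this commit is not shown here):

```python
# Temp-file round-trip with cleanup in a finally block, so the PNG is removed
# even when generation fails. Illustrative; not code from this commit.
import os
import tempfile

def with_temp_image(image, fn):
    """Save a PIL image to a temporary PNG, call fn(path), always clean up."""
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
        image.save(tmp.name)
        path = tmp.name
    try:
        return fn(path)
    finally:
        os.unlink(path)
```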
docs/api/README.md
ADDED
@@ -0,0 +1,121 @@
+# LLaVA API Documentation
+
+## Overview
+
+The LLaVA API provides a simple interface for interacting with the LLaVA model through a Gradio web interface. The API allows users to upload images and receive AI-generated responses about the image content.
+
+## API Endpoints
+
+### Web Interface
+
+The main interface is served at the root URL (`/`) and provides the following components:
+
+#### Input Components
+
+1. **Image Upload**
+   - Type: Image uploader
+   - Format: PIL Image
+   - Purpose: Upload an image for analysis
+
+2. **Prompt Input**
+   - Type: Text input
+   - Purpose: Enter questions or prompts about the image
+   - Default placeholder: "What can you see in this image?"
+
+3. **Generation Parameters**
+   - Max New Tokens (64-2048, default: 512)
+   - Temperature (0.1-1.0, default: 0.7)
+   - Top P (0.1-1.0, default: 0.9)
+
+#### Output Components
+
+1. **Response**
+   - Type: Text output
+   - Purpose: Displays the model's response
+   - Features: Copy button, scrollable
+
+## Usage Examples
+
+### Basic Usage
+
+1. Upload an image using the image uploader
+2. Enter a prompt in the text input
+3. Click "Generate Response"
+4. View the response in the output box
+
+### Example Prompts
+
+- "What can you see in this image?"
+- "Describe this scene in detail"
+- "What emotions does this image convey?"
+- "What's happening in this picture?"
+- "Can you identify any objects or people in this image?"
+
+## Error Handling
+
+The API handles various error cases:
+
+1. **Invalid Images**
+   - Returns an error message if the image is invalid or corrupted
+   - Supports common image formats (JPEG, PNG, etc.)
+
+2. **Empty Prompts**
+   - Returns an error message if no prompt is provided
+   - Prompts should be non-empty strings
+
+3. **Model Errors**
+   - Returns descriptive error messages for model-related issues
+   - Includes logging for debugging
+
+## Configuration
+
+The API can be configured through environment variables or the settings file:
+
+- `API_HOST`: Server host (default: "0.0.0.0")
+- `API_PORT`: Server port (default: 7860)
+- `GRADIO_THEME`: Interface theme (default: "soft")
+- `DEFAULT_MAX_NEW_TOKENS`: Default token limit (default: 512)
+- `DEFAULT_TEMPERATURE`: Default temperature (default: 0.7)
+- `DEFAULT_TOP_P`: Default top-p value (default: 0.9)
+
+## Development
+
+### Running Locally
+
+```bash
+python src/api/app.py
+```
+
+### Running Tests
+
+```bash
+pytest tests/
+```
+
+### Code Style
+
+The project follows PEP 8 guidelines. To check your code:
+
+```bash
+flake8 src/
+black src/
+```
+
+## Security Considerations
+
+1. The API is designed for public use but should be deployed behind appropriate security measures
+2. Input validation is performed on all user inputs
+3. Large file uploads are handled safely
+4. Error messages are sanitized to prevent information leakage
+
+## Rate Limiting
+
+Currently, no rate limiting is implemented. Consider implementing rate limiting for production deployments.
+
+## Future Improvements
+
+1. Add authentication
+2. Implement rate limiting
+3. Add batch processing capabilities
+4. Support for video input
+5. Real-time streaming responses
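Note: the Configuration section above mirrors the `src/configs/settings.py` module added in this commit (+46 lines, not shown). A minimal sketch of how such a settings module could read those variables, using the documented names and defaults; the actual implementation may differ:

```python
# Illustrative settings module driven by the environment variables documented above.
import os

API_HOST: str = os.getenv("API_HOST", "0.0.0.0")
API_PORT: int = int(os.getenv("API_PORT", "7860"))
GRADIO_THEME: str = os.getenv("GRADIO_THEME", "soft")
DEFAULT_MAX_NEW_TOKENS: int = int(os.getenv("DEFAULT_MAX_NEW_TOKENS", "512"))
DEFAULT_TEMPERATURE: float = float(os.getenv("DEFAULT_TEMPERATURE", "0.7"))
DEFAULT_TOP_P: float = float(os.getenv("DEFAULT_TOP_P", "0.9"))
```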
docs/guides/developer_guide.md
ADDED
@@ -0,0 +1,362 @@
+# LLaVA Implementation Developer Guide
+
+## Overview
+
+This guide is intended for developers who want to contribute to or extend the LLaVA implementation. The project is structured as a Python package with a Gradio web interface, using modern best practices and tools.
+
+## Project Structure
+
+```
+llava_implementation/
+├── src/                      # Source code
+│   ├── api/                  # API endpoints and FastAPI app
+│   │   ├── __init__.py
+│   │   └── app.py            # Gradio interface
+│   ├── models/               # Model implementations
+│   │   ├── __init__.py
+│   │   └── llava_model.py    # LLaVA model wrapper
+│   ├── utils/                # Utility functions
+│   │   ├── __init__.py
+│   │   └── logging.py        # Logging utilities
+│   └── configs/              # Configuration files
+│       ├── __init__.py
+│       └── settings.py       # Application settings
+├── tests/                    # Test suite
+│   ├── __init__.py
+│   └── test_model.py         # Model tests
+├── docs/                     # Documentation
+│   ├── api/                  # API documentation
+│   ├── examples/             # Usage examples
+│   └── guides/               # User and developer guides
+├── assets/                   # Static assets
+│   ├── images/               # Example images
+│   └── icons/                # UI icons
+├── scripts/                  # Utility scripts
+└── examples/                 # Example images for the web interface
+```
+
+## Development Setup
+
+### Prerequisites
+
+- Python 3.8+
+- Git
+- CUDA-capable GPU (recommended)
+- Virtual environment tool (venv, conda, etc.)
+
+### Installation
+
+1. Clone the repository:
+```bash
+git clone https://github.com/Prashant-ambati/llava-implementation.git
+cd llava-implementation
+```
+
+2. Create and activate a virtual environment:
+```bash
+python -m venv venv
+source venv/bin/activate  # On Windows: venv\Scripts\activate
+```
+
+3. Install development dependencies:
+```bash
+pip install -r requirements.txt
+pip install -r requirements-dev.txt  # Development dependencies
+```
+
+### Development Tools
+
+1. **Code Formatting**
+   - Black for code formatting
+   - isort for import sorting
+   - flake8 for linting
+
+2. **Testing**
+   - pytest for testing
+   - pytest-cov for coverage
+   - pytest-mock for mocking
+
+3. **Type Checking**
+   - mypy for static type checking
+   - types-* packages for type hints
+
+## Code Style
+
+### Python Style Guide
+
+1. Follow PEP 8 guidelines
+2. Use type hints
+3. Write docstrings (Google style)
+4. Keep functions focused and small
+5. Use meaningful variable names
+
+### Example
+
+```python
+from typing import Optional, List
+from PIL import Image
+
+def process_image(
+    image: Image.Image,
+    prompt: str,
+    max_tokens: Optional[int] = None
+) -> List[str]:
+    """
+    Process an image with the given prompt.
+
+    Args:
+        image: Input image as PIL Image
+        prompt: Text prompt for the model
+        max_tokens: Optional maximum tokens to generate
+
+    Returns:
+        List of generated responses
+
+    Raises:
+        ValueError: If image is invalid
+        RuntimeError: If model fails to process
+    """
+    # Implementation
+```
+
+## Testing
+
+### Running Tests
+
+```bash
+# Run all tests
+pytest
+
+# Run with coverage
+pytest --cov=src
+
+# Run specific test file
+pytest tests/test_model.py
+
+# Run with verbose output
+pytest -v
+```
+
+### Writing Tests
+
+1. Use pytest fixtures
+2. Mock external dependencies
+3. Test edge cases
+4. Include both unit and integration tests
+
+Example test:
+```python
+import pytest
+from PIL import Image
+
+def test_process_image(model, sample_image):
+    """Test image processing functionality."""
+    prompt = "What color is this image?"
+    response = model.process_image(
+        image=sample_image,
+        prompt=prompt
+    )
+    assert isinstance(response, str)
+    assert len(response) > 0
+```
+
+## Model Development
+
+### Adding New Models
+
+1. Create a new model class in `src/models/`
+2. Implement required methods
+3. Add tests
+4. Update documentation
+
+Example:
+```python
+class NewModel:
+    """New model implementation."""
+
+    def __init__(self, config: dict):
+        """Initialize the model."""
+        self.config = config
+        self.model = self._load_model()
+
+    def process(self, *args, **kwargs):
+        """Process inputs and generate output."""
+        pass
+```
+
+### Model Configuration
+
+1. Add configuration in `src/configs/settings.py`
+2. Use environment variables for secrets
+3. Document all parameters
+
+## API Development
+
+### Adding New Endpoints
+
+1. Create new endpoint in `src/api/app.py`
+2. Add input validation
+3. Implement error handling
+4. Add tests
+5. Update documentation
+
+### Error Handling
+
+1. Use custom exceptions
+2. Implement proper logging
+3. Return appropriate status codes
+4. Include error messages
+
+Example:
+```python
+class ModelError(Exception):
+    """Base exception for model errors."""
+    pass
+
+def process_request(request):
+    try:
+        result = model.process(request)
+        return result
+    except ModelError as e:
+        logger.error(f"Model error: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+```
+
+## Deployment
+
+### Local Deployment
+
+1. Build the package:
+```bash
+python -m build
+```
+
+2. Run the server:
+```bash
+python src/api/app.py
+```
+
+### Hugging Face Spaces
+
+1. Update `README.md` with Space metadata
+2. Ensure all dependencies are in `requirements.txt`
+3. Test the Space locally
+4. Push changes to the Space
+
+### Production Deployment
+
+1. Set up proper logging
+2. Configure security measures
+3. Implement rate limiting
+4. Set up monitoring
+5. Use environment variables
+
+## Contributing
+
+### Workflow
+
+1. Fork the repository
+2. Create a feature branch
+3. Make changes
+4. Run tests
+5. Update documentation
+6. Create a pull request
+
+### Pull Request Process
+
+1. Update documentation
+2. Add tests
+3. Ensure CI passes
+4. Get code review
+5. Address feedback
+6. Merge when approved
+
+## Performance Optimization
+
+### Model Optimization
+
+1. Use model quantization
+2. Implement caching
+3. Batch processing
+4. GPU optimization
+
+### API Optimization
+
+1. Response compression
+2. Request validation
+3. Connection pooling
+4. Caching strategies
+
+## Security
+
+### Best Practices
+
+1. Input validation
+2. Error handling
+3. Rate limiting
+4. Secure configuration
+5. Regular updates
+
+### Security Checklist
+
+- [ ] Validate all inputs
+- [ ] Sanitize outputs
+- [ ] Use secure dependencies
+- [ ] Implement rate limiting
+- [ ] Set up monitoring
+- [ ] Regular security audits
+
+## Monitoring and Logging
+
+### Logging
+
+1. Use structured logging
+2. Include context
+3. Set appropriate levels
+4. Rotate logs
+
+### Monitoring
+
+1. Track key metrics
+2. Set up alerts
+3. Monitor resources
+4. Track errors
+
+## Future Development
+
+### Planned Features
+
+1. Video support
+2. Batch processing
+3. Model fine-tuning
+4. API authentication
+5. Advanced caching
+
+### Contributing Ideas
+
+1. Open issues
+2. Discuss in PRs
+3. Join discussions
+4. Share use cases
+
+## Resources
+
+### Documentation
+
+- [Python Documentation](https://docs.python.org/)
+- [Gradio Documentation](https://gradio.app/docs/)
+- [Hugging Face Docs](https://huggingface.co/docs)
+- [Pytest Documentation](https://docs.pytest.org/)
+
+### Tools
+
+- [Black](https://black.readthedocs.io/)
+- [isort](https://pycqa.github.io/isort/)
+- [flake8](https://flake8.pycqa.org/)
+- [mypy](https://mypy.readthedocs.io/)
+
+### Community
+
+- [GitHub Issues](https://github.com/Prashant-ambati/llava-implementation/issues)
+- [Hugging Face Forums](https://discuss.huggingface.co/)
+- [Stack Overflow](https://stackoverflow.com/)
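Note: the example test in the guide uses `model` and `sample_image` fixtures that the guide never defines. A minimal `conftest.py` sketch that would make it runnable; the fixture bodies are assumptions, not code from `tests/test_model.py`:

```python
# Hypothetical tests/conftest.py supplying the fixtures the guide's test assumes.
import pytest
from PIL import Image

@pytest.fixture
def sample_image():
    """A small solid-color image, enough to exercise the pipeline."""
    return Image.new("RGB", (64, 64), color="red")

@pytest.fixture
def model(mocker):
    """A stand-in model so the test runs without GPU weights (uses pytest-mock)."""
    fake = mocker.Mock()
    fake.process_image.return_value = "A red square."
    return fake
```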
docs/guides/user_guide.md
ADDED
@@ -0,0 +1,164 @@
+# LLaVA Chat User Guide
+
+## Introduction
+
+Welcome to LLaVA Chat! This guide will help you get started with using our AI-powered image understanding and chat interface. LLaVA (Large Language and Vision Assistant) combines advanced vision and language models to provide detailed analysis and natural conversations about images.
+
+## Getting Started
+
+### Accessing the Interface
+
+1. Visit our [Hugging Face Space](https://huggingface.co/spaces/Prashant26am/llava-chat)
+2. Wait for the interface to load (this may take a few moments as the model initializes)
+3. You're ready to start chatting with images!
+
+### Basic Usage
+
+1. **Upload an Image**
+   - Click the image upload area or drag and drop an image
+   - Supported formats: JPEG, PNG, GIF
+   - Maximum file size: 10MB
+
+2. **Enter Your Prompt**
+   - Type your question or prompt in the text box
+   - Be specific about what you want to know
+   - You can ask multiple questions about the same image
+
+3. **Adjust Parameters** (Optional)
+   - Click "Generation Parameters" to expand
+   - Modify settings to control the response:
+     - Max New Tokens: Longer responses (64-2048)
+     - Temperature: More creative responses (0.1-1.0)
+     - Top P: More diverse responses (0.1-1.0)
+
+4. **Generate Response**
+   - Click the "Generate Response" button
+   - Wait for the model to process (usually a few seconds)
+   - Read the response in the output box
+   - Use the copy button to save the response
+
+## Best Practices
+
+### Writing Effective Prompts
+
+1. **Be Specific**
+   - Instead of "What's in this image?", try "What objects can you identify in this image?"
+   - Instead of "Describe this", try "Describe the scene, focusing on the main subject"
+
+2. **Ask Follow-up Questions**
+   - "What emotions does this image convey?"
+   - "Can you identify any specific details about [object]?"
+   - "How would you describe the composition of this image?"
+
+3. **Use Natural Language**
+   - Write as if you're talking to a person
+   - Feel free to ask for clarification or more details
+   - You can have a conversation about the image
+
+### Example Prompts
+
+1. **General Analysis**
+   - "What can you see in this image?"
+   - "Describe this scene in detail"
+   - "What's the main subject of this image?"
+
+2. **Specific Details**
+   - "What colors are prominent in this image?"
+   - "Can you identify any text or signs in the image?"
+   - "What time of day does this image appear to be taken?"
+
+3. **Emotional Response**
+   - "What mood or atmosphere does this image convey?"
+   - "How does this image make you feel?"
+   - "What emotions might this image evoke in viewers?"
+
+4. **Technical Analysis**
+   - "What's the composition of this image?"
+   - "How would you describe the lighting in this image?"
+   - "What camera angle or perspective is used?"
+
+## Troubleshooting
+
+### Common Issues
+
+1. **Image Not Loading**
+   - Check file format (JPEG, PNG, GIF)
+   - Ensure file size is under 10MB
+   - Try refreshing the page
+
+2. **Slow Response**
+   - Reduce image size
+   - Simplify your prompt
+   - Check your internet connection
+
+3. **Unexpected Responses**
+   - Try rephrasing your prompt
+   - Adjust generation parameters
+   - Be more specific in your question
+
+### Getting Help
+
+If you encounter any issues:
+1. Check this guide for solutions
+2. Visit our [GitHub repository](https://github.com/Prashant-ambati/llava-implementation)
+3. Open an issue on GitHub
+4. Contact us through Hugging Face
+
+## Advanced Usage
+
+### Parameter Tuning
+
+1. **Max New Tokens**
+   - Lower values (64-256): Short, concise responses
+   - Medium values (256-512): Balanced responses
+   - Higher values (512+): Detailed, comprehensive responses
+
+2. **Temperature**
+   - Lower values (0.1-0.3): More focused, deterministic responses
+   - Medium values (0.4-0.7): Balanced creativity
+   - Higher values (0.8-1.0): More creative, diverse responses
+
+3. **Top P**
+   - Lower values (0.1-0.3): More focused word choice
+   - Medium values (0.4-0.7): Balanced diversity
+   - Higher values (0.8-1.0): More diverse word choice
+
+### Tips for Better Results
+
+1. **Image Quality**
+   - Use clear, well-lit images
+   - Ensure the subject is clearly visible
+   - Avoid heavily edited or filtered images
+
+2. **Prompt Engineering**
+   - Start with simple questions
+   - Build up to more complex queries
+   - Use follow-up questions for details
+
+3. **Response Management**
+   - Copy important responses
+   - Save interesting conversations
+   - Compare responses with different parameters
+
+## Privacy and Ethics
+
+1. **Image Privacy**
+   - Don't upload sensitive or private images
+   - Be mindful of copyright
+   - Respect others' privacy
+
+2. **Responsible Use**
+   - Use the tool ethically
+   - Don't use for harmful purposes
+   - Respect content guidelines
+
+## Future Updates
+
+We're constantly improving LLaVA Chat. Planned features include:
+1. Support for video input
+2. Batch image processing
+3. More advanced parameter controls
+4. Additional model options
+5. Enhanced response formatting
+
+Stay tuned for updates!
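Note: the Parameter Tuning section maps directly onto standard `transformers`-style sampling arguments. A hedged sketch of bundling the three sliders into `generate()` keyword arguments; how the app actually wires them lives in `src/models/llava_model.py`, which is not shown here:

```python
# Illustrative helper: turn the UI's three sliders into generate() kwargs.
# do_sample must be True for temperature/top_p to have any effect.
from typing import Any, Dict

def generation_kwargs(max_new_tokens: int = 512,
                      temperature: float = 0.7,
                      top_p: float = 0.9) -> Dict[str, Any]:
    return {
        "do_sample": True,
        "max_new_tokens": max_new_tokens,  # guide range: 64-2048
        "temperature": temperature,        # 0.1-0.3 focused, 0.8-1.0 creative
        "top_p": top_p,                    # nucleus sampling: word-choice diversity
    }

# e.g. model.generate(**inputs, **generation_kwargs(temperature=0.2))
```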
examples/api_client.py
ADDED
@@ -0,0 +1,127 @@
+"""
+Example API client for the LLaVA model.
+"""
+
+import argparse
+import json
+from pathlib import Path
+from typing import Dict, Any, Optional
+
+import requests
+from PIL import Image
+import base64
+from io import BytesIO
+
+def encode_image(image_path: str) -> str:
+    """
+    Encode an image to base64 string.
+
+    Args:
+        image_path: Path to the image file
+
+    Returns:
+        str: Base64 encoded image
+    """
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode('utf-8')
+
+def process_image(
+    api_url: str,
+    image_path: str,
+    prompt: str,
+    max_new_tokens: Optional[int] = None,
+    temperature: Optional[float] = None,
+    top_p: Optional[float] = None
+) -> Dict[str, Any]:
+    """
+    Process an image using the LLaVA API.
+
+    Args:
+        api_url: URL of the API endpoint
+        image_path: Path to the input image
+        prompt: Text prompt for the model
+        max_new_tokens: Optional maximum tokens to generate
+        temperature: Optional sampling temperature
+        top_p: Optional top-p sampling parameter
+
+    Returns:
+        Dict containing the API response
+    """
+    # Prepare the request payload
+    payload = {
+        "image": encode_image(image_path),
+        "prompt": prompt
+    }
+
+    # Add optional parameters if provided
+    if max_new_tokens is not None:
+        payload["max_new_tokens"] = max_new_tokens
+    if temperature is not None:
+        payload["temperature"] = temperature
+    if top_p is not None:
+        payload["top_p"] = top_p
+
+    try:
+        # Send the request
+        response = requests.post(api_url, json=payload)
+        response.raise_for_status()
+        return response.json()
+
+    except requests.exceptions.RequestException as e:
+        print(f"Error making request: {e}")
+        if hasattr(e.response, 'text'):
+            print(f"Response: {e.response.text}")
+        raise
+
+def save_response(response: Dict[str, Any], output_path: Optional[str] = None):
+    """
+    Save or print the API response.
+
+    Args:
+        response: API response dictionary
+        output_path: Optional path to save the response
+    """
+    if output_path:
+        with open(output_path, 'w') as f:
+            json.dump(response, f, indent=2)
+        print(f"Saved response to {output_path}")
+    else:
+        print("\nAPI Response:")
+        print("-" * 50)
+        print(json.dumps(response, indent=2))
+        print("-" * 50)
+
+def main():
+    """Main function to process images using the API."""
+    parser = argparse.ArgumentParser(description="Process images using LLaVA API")
+    parser.add_argument("image_path", type=str, help="Path to the input image")
+    parser.add_argument("prompt", type=str, help="Text prompt for the model")
+    parser.add_argument("--api-url", type=str, default="http://localhost:7860/api/process",
+                        help="URL of the API endpoint")
+    parser.add_argument("--max-tokens", type=int, help="Maximum tokens to generate")
+    parser.add_argument("--temperature", type=float, help="Sampling temperature")
+    parser.add_argument("--top-p", type=float, help="Top-p sampling parameter")
+    parser.add_argument("--output", type=str, help="Path to save the response")
+
+    args = parser.parse_args()
+
+    try:
+        # Process image
+        response = process_image(
+            api_url=args.api_url,
+            image_path=args.image_path,
+            prompt=args.prompt,
+            max_new_tokens=args.max_tokens,
+            temperature=args.temperature,
+            top_p=args.top_p
+        )
+
+        # Save or print response
+        save_response(response, args.output)
+
+    except Exception as e:
+        print(f"Error: {str(e)}")
+        raise
+
+if __name__ == "__main__":
+    main()
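Note: a short programmatic usage of the client above, assuming the server is running locally; the `/api/process` path is the script's own default, not independently documented:

```python
# Calling the example client from Python instead of the CLI.
from examples.api_client import process_image, save_response

response = process_image(
    api_url="http://localhost:7860/api/process",  # the script's default endpoint
    image_path="examples/cat.jpg",                # sample image referenced by the old UI examples
    prompt="What can you see in this image?",
    temperature=0.7,
)
save_response(response)  # prints the JSON when no output path is given
```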
examples/llava_demo.ipynb
ADDED
@@ -0,0 +1 @@
examples/process_image.py
ADDED
@@ -0,0 +1,103 @@
+"""
+Example script for processing images with the LLaVA model.
+"""
+
+import argparse
+from pathlib import Path
+from PIL import Image
+
+from src.models.llava_model import LLaVAModel
+from src.configs.settings import DEFAULT_MAX_NEW_TOKENS, DEFAULT_TEMPERATURE, DEFAULT_TOP_P
+from src.utils.logging import setup_logging, get_logger
+
+# Set up logging
+setup_logging()
+logger = get_logger(__name__)
+
+def process_image(
+    image_path: str,
+    prompt: str,
+    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
+    temperature: float = DEFAULT_TEMPERATURE,
+    top_p: float = DEFAULT_TOP_P
+) -> str:
+    """
+    Process an image with the LLaVA model.
+
+    Args:
+        image_path: Path to the input image
+        prompt: Text prompt for the model
+        max_new_tokens: Maximum number of tokens to generate
+        temperature: Sampling temperature
+        top_p: Top-p sampling parameter
+
+    Returns:
+        str: Model response
+    """
+    try:
+        # Load image
+        image = Image.open(image_path)
+        logger.info(f"Loaded image from {image_path}")
+
+        # Initialize model
+        model = LLaVAModel()
+        logger.info("Model initialized")
+
+        # Generate response
+        response = model(
+            image=image,
+            prompt=prompt,
+            max_new_tokens=max_new_tokens,
+            temperature=temperature,
+            top_p=top_p
+        )
+        logger.info("Generated response")
+
+        return response
+
+    except Exception as e:
+        logger.error(f"Error processing image: {str(e)}")
+        raise
+
+def main():
+    """Main function to process images from command line."""
+    parser = argparse.ArgumentParser(description="Process images with LLaVA model")
+    parser.add_argument("image_path", type=str, help="Path to the input image")
+    parser.add_argument("prompt", type=str, help="Text prompt for the model")
+    parser.add_argument("--max-tokens", type=int, default=DEFAULT_MAX_NEW_TOKENS,
+                        help="Maximum number of tokens to generate")
+    parser.add_argument("--temperature", type=float, default=DEFAULT_TEMPERATURE,
+                        help="Sampling temperature")
+    parser.add_argument("--top-p", type=float, default=DEFAULT_TOP_P,
+                        help="Top-p sampling parameter")
+    parser.add_argument("--output", type=str, help="Path to save the response")
+
+    args = parser.parse_args()
+
+    try:
+        # Process image
+        response = process_image(
+            image_path=args.image_path,
+            prompt=args.prompt,
+            max_new_tokens=args.max_tokens,
+            temperature=args.temperature,
+            top_p=args.top_p
+        )
+
+        # Print or save response
+        if args.output:
+            output_path = Path(args.output)
+            output_path.write_text(response)
+            logger.info(f"Saved response to {output_path}")
+        else:
+            print("\nModel Response:")
+            print("-" * 50)
+            print(response)
+            print("-" * 50)
+
+    except Exception as e:
+        logger.error(f"Error: {str(e)}")
+        raise
+
+if __name__ == "__main__":
+    main()
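Note: the script above instantiates `LLaVAModel()` and calls the instance directly. The wrapper lives in `src/models/llava_model.py` (+88 lines, not shown); a stub recording only the call signature the script assumes:

```python
# Interface stub inferred from examples/process_image.py; not the real wrapper.
from PIL import Image

class LLaVAModel:
    def __call__(
        self,
        image: Image.Image,
        prompt: str,
        max_new_tokens: int = 512,
        temperature: float = 0.7,
        top_p: float = 0.9,
    ) -> str:
        """Return the model's text response for an image/prompt pair."""
        raise NotImplementedError  # see src/models/llava_model.py in this commit
```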
pyproject.toml
ADDED
@@ -0,0 +1,181 @@
+[build-system]
+requires = ["setuptools>=61.0", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "llava-implementation"
+version = "0.1.0"
+description = "A modern implementation of LLaVA with a beautiful web interface"
+readme = "README.md"
+requires-python = ">=3.8"
+license = {text = "MIT"}
+authors = [
+    {name = "Prashant Ambati", email = "[email protected]"}
+]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+]
+dependencies = [
+    "torch>=2.0.0",
+    "torchvision>=0.15.0",
+    "transformers>=4.36.0",
+    "accelerate>=0.25.0",
+    "pillow>=10.0.0",
+    "numpy>=1.24.0",
+    "tqdm>=4.65.0",
+    "matplotlib>=3.7.0",
+    "opencv-python>=4.8.0",
+    "einops>=0.7.0",
+    "timm>=0.9.0",
+    "sentencepiece>=0.1.99",
+    "peft>=0.7.0",
+    "bitsandbytes>=0.41.0",
+    "safetensors>=0.4.0",
+    "gradio==4.44.1",
+    "fastapi>=0.109.0",
+    "uvicorn>=0.27.0",
+    "python-multipart>=0.0.6",
+    "pydantic>=2.5.0",
+    "python-jose>=3.3.0",
+    "passlib>=1.7.4",
+    "bcrypt>=4.0.1",
+    "aiofiles>=23.2.0",
+    "httpx>=0.26.0",
+]
+
+[project.optional-dependencies]
+dev = [
+    "pytest>=8.0.0",
+    "pytest-cov>=4.1.0",
+    "pytest-mock>=3.12.0",
+    "pytest-asyncio>=0.23.5",
+    "pytest-xdist>=3.5.0",
+    "black>=24.1.1",
+    "isort>=5.13.2",
+    "flake8>=7.0.0",
+    "mypy>=1.8.0",
+    "types-Pillow>=10.2.0.20240106",
+    "types-requests>=2.31.0.20240125",
+    "sphinx>=7.2.6",
+    "sphinx-rtd-theme>=2.0.0",
+    "sphinx-autodoc-typehints>=2.0.1",
+    "sphinx-copybutton>=0.5.2",
+    "sphinx-tabs>=3.4.4",
+    "pre-commit>=3.6.0",
+    "ipython>=8.21.0",
+    "jupyter>=1.0.0",
+    "notebook>=7.0.7",
+    "ipykernel>=6.29.0",
+    "build>=1.0.3",
+    "twine>=4.0.2",
+    "wheel>=0.42.0",
+    "memory-profiler>=0.61.0",
+    "line-profiler>=4.1.2",
+    "debugpy>=1.8.0",
+]
+
+[project.urls]
+Homepage = "https://github.com/Prashant-ambati/llava-implementation"
+Documentation = "https://github.com/Prashant-ambati/llava-implementation#readme"
+Repository = "https://github.com/Prashant-ambati/llava-implementation.git"
+Issues = "https://github.com/Prashant-ambati/llava-implementation/issues"
+"Bug Tracker" = "https://github.com/Prashant-ambati/llava-implementation/issues"
+
+[tool.setuptools]
+packages = ["src"]
+
+[tool.black]
+line-length = 88
+target-version = ["py38"]
+include = '\.pyi?$'
+
+[tool.isort]
+profile = "black"
+multi_line_output = 3
+include_trailing_comma = true
+force_grid_wrap = 0
+use_parentheses = true
+ensure_newline_before_comments = true
+line_length = 88
+
+[tool.mypy]
+python_version = "3.8"
+warn_return_any = true
+warn_unused_configs = true
+disallow_untyped_defs = true
+disallow_incomplete_defs = true
+check_untyped_defs = true
+disallow_untyped_decorators = true
+no_implicit_optional = true
+warn_redundant_casts = true
+warn_unused_ignores = true
+warn_no_return = true
+warn_unreachable = true
+strict_optional = true
+
+[tool.pytest.ini_options]
+minversion = "6.0"
+addopts = "-ra -q --cov=src"
+testpaths = [
+    "tests",
+]
+python_files = ["test_*.py"]
+python_classes = ["Test*"]
+python_functions = ["test_*"]
+
+[tool.coverage.run]
+source = ["src"]
+branch = true
+
+[tool.coverage.report]
+exclude_lines = [
+    "pragma: no cover",
+    "def __repr__",
+    "if self.debug:",
+    "raise NotImplementedError",
+    "if __name__ == .__main__.:",
+    "pass",
+    "raise ImportError",
+]
+show_missing = true
+fail_under = 80
+
+[tool.bandit]
+exclude_dirs = ["tests", "docs"]
+skips = ["B101"]
+
+[tool.ruff]
+line-length = 88
+target-version = "py38"
+select = [
+    "E",    # pycodestyle errors
+    "W",    # pycodestyle warnings
+    "F",    # pyflakes
+    "I",    # isort
+    "B",    # flake8-bugbear
+    "C4",   # flake8-comprehensions
+    "UP",   # pyupgrade
+    "N",    # pep8-naming
+    "PL",   # pylint
+    "RUF",  # ruff-specific rules
+]
+ignore = [
+    "E501",  # line length violations
+    "B008",  # do not perform function calls in argument defaults
+]
+
+[tool.ruff.isort]
+known-first-party = ["src"]
+
+[tool.ruff.mccabe]
+max-complexity = 10
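After an editable install (pip install -e ".[dev]" also pulls in the optional dev group), the metadata declared above can be sanity-checked from the standard library. A small sketch using importlib.metadata:

    from importlib.metadata import metadata, version

    # Both values come straight from the [project] table in pyproject.toml.
    print(version("llava-implementation"))            # 0.1.0
    print(metadata("llava-implementation")["Summary"])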
requirements-dev.txt
ADDED
@@ -0,0 +1,38 @@
+# Testing
+pytest==8.0.0
+pytest-cov==4.1.0
+pytest-mock==3.12.0
+pytest-asyncio==0.23.5
+pytest-xdist==3.5.0
+
+# Code Quality
+black==24.1.1
+isort==5.13.2
+flake8==7.0.0
+mypy==1.8.0
+types-Pillow==10.2.0.20240106
+types-requests==2.31.0.20240125
+
+# Documentation
+sphinx==7.2.6
+sphinx-rtd-theme==2.0.0
+sphinx-autodoc-typehints==2.0.1
+sphinx-copybutton==0.5.2
+sphinx-tabs==3.4.4
+
+# Development Tools
+pre-commit==3.6.0
+ipython==8.21.0
+jupyter==1.0.0
+notebook==7.0.7
+ipykernel==6.29.0
+
+# Build Tools
+build==1.0.3
+twine==4.0.2
+wheel==0.42.0
+
+# Monitoring and Debugging
+memory-profiler==0.61.0
+line-profiler==4.1.2
+debugpy==1.8.0
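Because [tool.pytest.ini_options] in pyproject.toml above already sets addopts = "-ra -q --cov=src", a bare pytest run picks up coverage of src/ automatically. The same run can be triggered from Python; a sketch (pytest.main is pytest's public entry point and returns an exit code):

    import pytest

    # The addopts from pyproject.toml apply here too.
    raise SystemExit(pytest.main(["tests"]))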
requirements.txt
CHANGED
@@ -1,26 +1,25 @@
 torch>=2.0.0
 torchvision>=0.15.0
-transformers>=4.
-accelerate>=0.
-pillow>=
+transformers>=4.36.0
+accelerate>=0.25.0
+pillow>=10.0.0
 numpy>=1.24.0
 tqdm>=4.65.0
 matplotlib>=3.7.0
-opencv-python>=4.
-einops>=0.
+opencv-python>=4.8.0
+einops>=0.7.0
 timm>=0.9.0
 sentencepiece>=0.1.99
-
-
-
-
-fastapi
-uvicorn
-python-multipart
-pydantic
-python-jose
-passlib
-bcrypt
-aiofiles
-
-httpx==0.25.2
+peft>=0.7.0
+bitsandbytes>=0.41.0
+safetensors>=0.4.0
+gradio==4.44.1
+fastapi>=0.109.0
+uvicorn>=0.27.0
+python-multipart>=0.0.6
+pydantic>=2.5.0
+python-jose>=3.3.0
+passlib>=1.7.4
+bcrypt>=4.0.1
+aiofiles>=23.2.0
+httpx>=0.26.0
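Note that gradio is the only exactly-pinned dependency while everything else uses lower bounds, so a quick post-install check can catch a resolver pulling in a different build. A one-line sketch:

    import gradio

    # The interface code targets this exact release; fail fast on anything else.
    assert gradio.__version__ == "4.44.1", f"unexpected gradio {gradio.__version__}"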
src/__init__.py
ADDED
File without changes

src/api/__init__.py
ADDED
File without changes
src/api/app.py
ADDED
@@ -0,0 +1,159 @@
+"""
+Gradio interface for the LLaVA model.
+"""
+
+import gradio as gr
+from PIL import Image
+
+from ..configs.settings import (
+    GRADIO_THEME,
+    GRADIO_TITLE,
+    GRADIO_DESCRIPTION,
+    DEFAULT_MAX_NEW_TOKENS,
+    DEFAULT_TEMPERATURE,
+    DEFAULT_TOP_P,
+    API_HOST,
+    API_PORT,
+    API_WORKERS,
+    API_RELOAD
+)
+from ..models.llava_model import LLaVAModel
+from ..utils.logging import setup_logging, get_logger
+
+# Set up logging
+setup_logging()
+logger = get_logger(__name__)
+
+# Initialize model
+model = LLaVAModel()
+
+def process_image(
+    image: Image.Image,
+    prompt: str,
+    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
+    temperature: float = DEFAULT_TEMPERATURE,
+    top_p: float = DEFAULT_TOP_P
+) -> str:
+    """
+    Process an image with the LLaVA model.
+
+    Args:
+        image: Input image
+        prompt: Text prompt
+        max_new_tokens: Maximum number of tokens to generate
+        temperature: Sampling temperature
+        top_p: Top-p sampling parameter
+
+    Returns:
+        str: Model response
+    """
+    try:
+        logger.info(f"Processing image with prompt: {prompt[:100]}...")
+        response = model(
+            image=image,
+            prompt=prompt,
+            max_new_tokens=max_new_tokens,
+            temperature=temperature,
+            top_p=top_p
+        )
+        logger.info("Successfully generated response")
+        return response
+    except Exception as e:
+        logger.error(f"Error processing image: {str(e)}")
+        return f"Error: {str(e)}"
+
+def create_interface() -> gr.Blocks:
+    """Create and return the Gradio interface."""
+    with gr.Blocks(theme=GRADIO_THEME) as interface:
+        gr.Markdown(f"""# {GRADIO_TITLE}
+
+{GRADIO_DESCRIPTION}
+
+## Example Prompts
+
+Try these prompts to get started:
+- "What can you see in this image?"
+- "Describe this scene in detail"
+- "What emotions does this image convey?"
+- "What's happening in this picture?"
+- "Can you identify any objects or people in this image?"
+
+## Usage Instructions
+
+1. Upload an image using the image uploader
+2. Enter your prompt in the text box
+3. (Optional) Adjust the generation parameters
+4. Click "Generate Response" to get LLaVA's analysis
+""")
+
+        with gr.Row():
+            with gr.Column():
+                # Input components
+                image_input = gr.Image(type="pil", label="Upload Image")
+                prompt_input = gr.Textbox(
+                    label="Prompt",
+                    placeholder="What can you see in this image?",
+                    lines=3
+                )
+
+                with gr.Accordion("Generation Parameters", open=False):
+                    max_tokens = gr.Slider(
+                        minimum=64,
+                        maximum=2048,
+                        value=DEFAULT_MAX_NEW_TOKENS,
+                        step=64,
+                        label="Max New Tokens"
+                    )
+                    temperature = gr.Slider(
+                        minimum=0.1,
+                        maximum=1.0,
+                        value=DEFAULT_TEMPERATURE,
+                        step=0.1,
+                        label="Temperature"
+                    )
+                    top_p = gr.Slider(
+                        minimum=0.1,
+                        maximum=1.0,
+                        value=DEFAULT_TOP_P,
+                        step=0.1,
+                        label="Top P"
+                    )
+
+                generate_btn = gr.Button("Generate Response", variant="primary")
+
+            with gr.Column():
+                # Output component
+                output = gr.Textbox(
+                    label="Response",
+                    lines=10,
+                    show_copy_button=True
+                )
+
+        # Set up event handlers
+        generate_btn.click(
+            fn=process_image,
+            inputs=[
+                image_input,
+                prompt_input,
+                max_tokens,
+                temperature,
+                top_p
+            ],
+            outputs=output
+        )
+
+    return interface
+
+def main():
+    """Run the Gradio interface."""
+    interface = create_interface()
+    interface.launch(
+        server_name=API_HOST,
+        server_port=API_PORT,
+        share=True,
+        show_error=True,
+        show_api=False
+    )
+
+if __name__ == "__main__":
+    main()
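Since process_image accepts a plain PIL image, the app logic can be smoke-tested without starting the Gradio server. A minimal sketch (LLaVAModel is constructed at import time, so this still loads the model weights):

    from PIL import Image

    from src.api.app import process_image

    # Synthetic image; any PIL.Image works since gr.Image(type="pil") hands one over.
    img = Image.new("RGB", (336, 336), "blue")
    print(process_image(img, "What color is this image?"))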
src/configs/__init__.py
ADDED
File without changes
src/configs/settings.py
ADDED
@@ -0,0 +1,46 @@
+"""
+Configuration settings for the LLaVA implementation.
+"""
+
+import os
+from pathlib import Path
+
+# Project paths
+PROJECT_ROOT = Path(__file__).parent.parent.parent
+SRC_DIR = PROJECT_ROOT / "src"
+ASSETS_DIR = PROJECT_ROOT / "assets"
+EXAMPLES_DIR = PROJECT_ROOT / "examples"
+
+# Model settings
+MODEL_NAME = "liuhaotian/llava-v1.5-7b"
+MODEL_REVISION = "main"
+DEVICE = "cuda" if os.environ.get("CUDA_VISIBLE_DEVICES") else "cpu"
+
+# Generation settings
+DEFAULT_MAX_NEW_TOKENS = 512
+DEFAULT_TEMPERATURE = 0.7
+DEFAULT_TOP_P = 0.9
+
+# API settings
+API_HOST = "0.0.0.0"
+API_PORT = 7860
+API_WORKERS = 1
+API_RELOAD = True
+
+# Gradio settings
+GRADIO_THEME = "soft"
+GRADIO_TITLE = "LLaVA Chat"
+GRADIO_DESCRIPTION = """
+A powerful multimodal AI assistant that can understand and discuss images.
+Upload any image and chat with LLaVA about it!
+"""
+
+# Logging settings
+LOG_LEVEL = "INFO"
+LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+LOG_DIR = PROJECT_ROOT / "logs"
+LOG_FILE = LOG_DIR / "app.log"
+
+# Create necessary directories
+for directory in [ASSETS_DIR, EXAMPLES_DIR, LOG_DIR]:
+    directory.mkdir(parents=True, exist_ok=True)
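The DEVICE heuristic keys off CUDA_VISIBLE_DEVICES, which can misreport: the variable is often unset on GPU machines and may be set on hosts without working drivers. A more direct probe is to ask torch itself; a sketch, at the cost of importing torch from the settings module:

    import torch

    # torch.cuda.is_available() checks for a usable CUDA runtime, not just an env var.
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"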
src/models/__init__.py
ADDED
File without changes
src/models/llava_model.py
ADDED
@@ -0,0 +1,88 @@
+"""
+LLaVA model implementation.
+"""
+
+import torch
+from transformers import AutoProcessor, AutoModelForCausalLM
+from PIL import Image
+
+from ..configs.settings import MODEL_NAME, MODEL_REVISION, DEVICE
+from ..utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+class LLaVAModel:
+    """LLaVA model wrapper class."""
+
+    def __init__(self):
+        """Initialize the LLaVA model and processor."""
+        logger.info(f"Initializing LLaVA model from {MODEL_NAME}")
+        self.processor = AutoProcessor.from_pretrained(
+            MODEL_NAME,
+            revision=MODEL_REVISION,
+            trust_remote_code=True
+        )
+        self.model = AutoModelForCausalLM.from_pretrained(
+            MODEL_NAME,
+            revision=MODEL_REVISION,
+            torch_dtype=torch.float16,
+            device_map="auto",
+            trust_remote_code=True
+        )
+        logger.info("Model initialization complete")
+
+    def generate_response(
+        self,
+        image: Image.Image,
+        prompt: str,
+        max_new_tokens: int = 512,
+        temperature: float = 0.7,
+        top_p: float = 0.9
+    ) -> str:
+        """
+        Generate a response for the given image and prompt.
+
+        Args:
+            image: Input image as PIL Image
+            prompt: Text prompt for the model
+            max_new_tokens: Maximum number of tokens to generate
+            temperature: Sampling temperature
+            top_p: Top-p sampling parameter
+
+        Returns:
+            str: Generated response
+        """
+        try:
+            # Prepare inputs (keyword args guard against positional-order changes across transformers versions)
+            inputs = self.processor(
+                text=prompt,
+                images=image,
+                return_tensors="pt"
+            ).to(DEVICE)
+
+            # Generate response
+            with torch.no_grad():
+                outputs = self.model.generate(
+                    **inputs,
+                    max_new_tokens=max_new_tokens,
+                    temperature=temperature,
+                    top_p=top_p,
+                    do_sample=True
+                )
+
+            # Decode and return response
+            response = self.processor.decode(
+                outputs[0],
+                skip_special_tokens=True
+            )
+
+            logger.debug(f"Generated response: {response[:100]}...")
+            return response
+
+        except Exception as e:
+            logger.error(f"Error generating response: {str(e)}")
+            raise
+
+    def __call__(self, *args, **kwargs):
+        """Convenience method to call generate_response."""
+        return self.generate_response(*args, **kwargs)
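One caveat on the loader above: liuhaotian/llava-v1.5-7b ships weights in the original LLaVA repository layout, and AutoModelForCausalLM may fail to map it even with trust_remote_code=True. If loading fails, the transformers-native conversion is a close substitute; a hedged sketch (the llava-hf checkpoint and its USER/ASSISTANT prompt template are assumptions about that conversion, not part of this repo):

    import torch
    from PIL import Image
    from transformers import AutoProcessor, LlavaForConditionalGeneration

    model_id = "llava-hf/llava-1.5-7b-hf"  # transformers-native LLaVA v1.5 weights
    processor = AutoProcessor.from_pretrained(model_id)
    model = LlavaForConditionalGeneration.from_pretrained(
        model_id, torch_dtype=torch.float16, device_map="auto"
    )

    image = Image.new("RGB", (336, 336), "red")  # placeholder input
    prompt = "USER: <image>\nWhat color is this image? ASSISTANT:"
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_new_tokens=64)
    print(processor.decode(output[0], skip_special_tokens=True))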
main.py → src/models/main.py
RENAMED
File without changes
src/requirements.txt
ADDED
@@ -0,0 +1,26 @@
+torch>=2.0.0
+torchvision>=0.15.0
+transformers>=4.30.0
+accelerate>=0.20.0
+pillow>=9.0.0
+numpy>=1.24.0
+tqdm>=4.65.0
+matplotlib>=3.7.0
+opencv-python>=4.7.0
+einops>=0.6.0
+timm>=0.9.0
+sentencepiece>=0.1.99
+gradio>=3.35.0
+peft>=0.4.0
+bitsandbytes>=0.40.0
+safetensors>=0.3.1
+fastapi==0.104.1
+uvicorn==0.24.0
+python-multipart==0.0.6
+pydantic==2.5.2
+python-jose==3.3.0
+passlib==1.7.4
+bcrypt==4.0.1
+aiofiles==23.2.1
+python-dotenv==1.0.0
+httpx==0.25.2
src/utils/__init__.py
ADDED
File without changes
src/utils/logging.py
ADDED
@@ -0,0 +1,53 @@
+"""
+Logging utilities for the LLaVA implementation.
+"""
+
+import logging
+import sys
+from typing import Optional
+
+from ..configs.settings import LOG_LEVEL, LOG_FORMAT, LOG_FILE
+
+def setup_logging(name: Optional[str] = None) -> logging.Logger:
+    """
+    Set up logging configuration for the application.
+
+    Args:
+        name: Optional name for the logger. If None, configures the root logger.
+
+    Returns:
+        logging.Logger: Configured logger instance.
+    """
+    # Create logger; bail out early so repeated calls don't attach duplicate handlers
+    logger = logging.getLogger(name)
+    if logger.handlers:
+        return logger
+    logger.setLevel(LOG_LEVEL)
+
+    # Create formatters
+    formatter = logging.Formatter(LOG_FORMAT)
+
+    # Create handlers
+    console_handler = logging.StreamHandler(sys.stdout)
+    console_handler.setFormatter(formatter)
+
+    file_handler = logging.FileHandler(LOG_FILE)
+    file_handler.setFormatter(formatter)
+
+    # Add handlers to logger
+    logger.addHandler(console_handler)
+    logger.addHandler(file_handler)
+
+    return logger
+
+def get_logger(name: Optional[str] = None) -> logging.Logger:
+    """
+    Get a logger instance with the specified name.
+
+    Args:
+        name: Optional name for the logger. If None, returns the root logger.
+
+    Returns:
+        logging.Logger: Logger instance.
+    """
+    return logging.getLogger(name)
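Typical usage pairs one setup_logging() call at an entry point with get_logger(__name__) in each module; child loggers carry no handlers of their own and propagate records up to the configured root handlers. A short sketch:

    from src.utils.logging import setup_logging, get_logger

    setup_logging()              # configures root handlers once (console + logs/app.log)
    log = get_logger(__name__)   # per-module child logger
    log.info("pipeline started")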
tests/test_model.py
ADDED
@@ -0,0 +1,67 @@
+"""
+Tests for the LLaVA model implementation.
+"""
+
+import pytest
+from PIL import Image
+import torch
+
+from src.models.llava_model import LLaVAModel
+from src.configs.settings import DEFAULT_MAX_NEW_TOKENS, DEFAULT_TEMPERATURE, DEFAULT_TOP_P
+
+@pytest.fixture
+def model():
+    """Fixture to provide a model instance."""
+    return LLaVAModel()
+
+@pytest.fixture
+def sample_image():
+    """Fixture to provide a sample image."""
+    # Create a simple test image
+    return Image.new('RGB', (224, 224), color='red')
+
+def test_model_initialization(model):
+    """Test that the model initializes correctly."""
+    assert model is not None
+    assert model.processor is not None
+    assert model.model is not None
+
+def test_model_device(model):
+    """Test that the model is on the correct device."""
+    assert next(model.model.parameters()).device.type in ['cuda', 'cpu']
+
+def test_generate_response(model, sample_image):
+    """Test that the model can generate responses."""
+    prompt = "What color is this image?"
+    response = model.generate_response(
+        image=sample_image,
+        prompt=prompt,
+        max_new_tokens=DEFAULT_MAX_NEW_TOKENS,
+        temperature=DEFAULT_TEMPERATURE,
+        top_p=DEFAULT_TOP_P
+    )
+
+    assert isinstance(response, str)
+    assert len(response) > 0
+
+def test_generate_response_with_invalid_image(model):
+    """Test that the model handles invalid images correctly."""
+    with pytest.raises(Exception):
+        model.generate_response(
+            image=None,
+            prompt="What color is this image?",
+            max_new_tokens=DEFAULT_MAX_NEW_TOKENS,
+            temperature=DEFAULT_TEMPERATURE,
+            top_p=DEFAULT_TOP_P
+        )
+
+def test_generate_response_with_empty_prompt(model, sample_image):
+    """Test that the model handles empty prompts correctly."""
+    with pytest.raises(Exception):
+        model.generate_response(
+            image=sample_image,
+            prompt="",
+            max_new_tokens=DEFAULT_MAX_NEW_TOKENS,
+            temperature=DEFAULT_TEMPERATURE,
+            top_p=DEFAULT_TOP_P
+        )
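All of these tests instantiate the real model, which makes the suite slow and network-bound. With pytest-mock (already pinned in requirements-dev.txt) the wrapper's plumbing can be exercised without loading weights; a hypothetical lightweight test along these lines:

    from src.models.llava_model import LLaVAModel

    def test_call_delegates_to_generate_response(mocker):
        """__call__ should forward straight to generate_response."""
        model = LLaVAModel.__new__(LLaVAModel)  # skip __init__, so no weights load
        mocker.patch.object(model, "generate_response", return_value="ok")
        assert model(image=None, prompt="hi") == "ok"
        model.generate_response.assert_called_once_with(image=None, prompt="hi")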