# syntax=docker/dockerfile:1
FROM python:3.11-slim

# Set working directory
WORKDIR /app

# Install system build dependencies (needed to compile llama-cpp-python).
# --no-install-recommends and same-layer cleanup keep the layer minimal.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    cmake \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first so this layer stays cached until requirements.txt changes
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Install llama-cpp-python with server support (quoted: [server] is a shell glob)
# NOTE(review): pin a version (e.g. "llama-cpp-python[server]==x.y.z") for reproducible builds
RUN pip install --no-cache-dir "llama-cpp-python[server]"

# Copy model and scripts already owned by the runtime UID.
# --chown avoids a follow-up `RUN chown -R`, which would duplicate the
# entire payload (including the multi-GB GGUF model) in a second layer.
COPY --chown=1000:1000 . .

# Create non-root user and drop privileges
RUN useradd -m -u 1000 appuser
USER appuser

# Expose port (documentation only; publish with -p/-P at run time)
EXPOSE 8000

# Health check: python:3.11-slim does NOT include curl, so the original
# curl-based probe always failed. Use the stdlib instead; urlopen raises
# (non-zero exit) on connection errors and non-2xx responses.
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health', timeout=5)" || exit 1

# Start server (exec form: server runs as PID 1 and receives SIGTERM)
CMD ["python", "-m", "llama_cpp.server", \
     "--model", "Qwen3-4B-Function-Calling-Pro.gguf", \
     "--host", "0.0.0.0", \
     "--port", "8000", \
     "--n_ctx", "2048", \
     "--n_threads", "8", \
     "--temperature", "0.7"]