# Kirim-V1-Base Docker Image
# Optimized for inference with CUDA support

FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04

# Set environment variables
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV CUDA_HOME=/usr/local/cuda
ENV PATH=${CUDA_HOME}/bin:${PATH}
ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}

# Set working directory
WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y \
    python3.10 \
    python3-pip \
    python3-dev \
    git \
    wget \
    curl \
    vim \
    build-essential \
    cmake \
    ninja-build \
    && rm -rf /var/lib/apt/lists/*

# Upgrade pip and core packaging tools
RUN pip3 install --no-cache-dir --upgrade pip setuptools wheel

# Install PyTorch with CUDA 12.1 support
RUN pip3 install --no-cache-dir \
    torch==2.1.0 \
    torchvision==0.16.0 \
    torchaudio==2.1.0 \
    --index-url https://download.pytorch.org/whl/cu121

# Copy requirements first so this layer is cached when only code changes
COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt

# Install additional optimization libraries.
# flash-attn needs --no-build-isolation (it compiles against the torch
# installed above), so install it in its own command; otherwise the flag
# would also apply to xformers.
RUN pip3 install --no-cache-dir flash-attn==2.3.0 --no-build-isolation && \
    pip3 install --no-cache-dir xformers==0.0.22

# Copy model files and scripts
COPY . .

# Create directories for model cache and outputs
RUN mkdir -p /app/models /app/outputs /root/.cache/huggingface

# Set HuggingFace cache directories
ENV HF_HOME=/root/.cache/huggingface
ENV TRANSFORMERS_CACHE=/app/models

# Expose port for API (if needed)
EXPOSE 8000

# Health check: fail if CUDA is not available. Using assert (rather than
# printing the result) makes the exit code non-zero when the check fails;
# a bare print always exits 0, so the check could never report unhealthy.
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD python3 -c "import torch; assert torch.cuda.is_available()" || exit 1

# Default command - interactive chat
CMD ["python3", "inference.py", "--chat"]

# Alternative commands:
# For API server:       CMD ["python3", "api_server.py"]
# For single inference: CMD ["python3", "inference.py", "--prompt", "你好"]
# For quantized model:  CMD ["python3", "inference.py", "--load_in_4bit", "--chat"]
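
# Example build/run commands (a minimal sketch: the image tag and host paths
# are placeholders, and --gpus all assumes the NVIDIA Container Toolkit is
# installed on the host):
#
#   docker build -t kirim-v1-base .
#
#   # Interactive chat (default CMD)
#   docker run --gpus all -it kirim-v1-base
#
#   # API server, with the exposed port published
#   docker run --gpus all -p 8000:8000 kirim-v1-base python3 api_server.py
#
#   # Reuse the host's HuggingFace cache to avoid re-downloading weights
#   docker run --gpus all -it \
#     -v $HOME/.cache/huggingface:/root/.cache/huggingface \
#     kirim-v1-base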