# Multi-stage build for DeepXR/Helion-2.5-Rnd
# Optimized for production inference with vLLM
# Stage 1: Base image with CUDA and Python
FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04 AS base
# Set environment variables
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 \
    CUDA_HOME=/usr/local/cuda \
    TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" \
    FORCE_CUDA=1 \
    MAX_JOBS=8
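# TORCH_CUDA_ARCH_LIST above spans Volta (7.0) through Hopper (9.0) with a PTX
# fallback; trimming it to the target GPU speeds up any source builds of CUDA
# extensions. MAX_JOBS caps parallel compile jobs for those builds.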
# Install system dependencies
RUN apt-get update && apt-get install -y \
    python3.10 \
    python3-pip \
    python3.10-dev \
    git \
    wget \
    curl \
    vim \
    build-essential \
    cmake \
    ninja-build \
    ccache \
    libssl-dev \
    libffi-dev \
    libjpeg-dev \
    libpng-dev \
    libgomp1 \
    && rm -rf /var/lib/apt/lists/*
# Update pip and install build tools
RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel
# Stage 2: Build dependencies
FROM base AS builder
WORKDIR /build
# Install PyTorch with CUDA support
RUN pip install --no-cache-dir \
    torch==2.2.0 \
    torchvision==0.17.0 \
    torchaudio==2.2.0 \
    --index-url https://download.pytorch.org/whl/cu121
# Install vLLM and core dependencies
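# NOTE: vLLM and transformers declare their own dependency ranges (including
# torch and tokenizers); if pip's resolver reports conflicts with the pins
# below, align them with the ranges vllm==0.3.3 actually requires.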
RUN pip install --no-cache-dir \
    vllm==0.3.3 \
    transformers==4.40.0 \
    tokenizers==0.15.2 \
    sentencepiece==0.2.0 \
    accelerate==0.28.0 \
    bitsandbytes==0.43.0 \
    safetensors==0.4.2 \
    huggingface-hub==0.21.4
# Install additional ML libraries
RUN pip install --no-cache-dir \
    numpy==1.26.4 \
    scipy==1.12.0 \
    pandas==2.2.1 \
    scikit-learn==1.4.1 \
    pydantic==2.6.4 \
    fastapi==0.110.0 \
    "uvicorn[standard]==0.29.0" \
    aiohttp==3.9.3 \
    "ray[default]==2.10.0"
# Install monitoring and optimization tools
RUN pip install --no-cache-dir \
    prometheus-client==0.20.0 \
    gputil==1.4.0 \
    psutil==5.9.8 \
    py-cpuinfo==9.0.0 \
    pynvml==11.5.0
# Stage 3: Final runtime image
FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04
# Set runtime environment variables
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 \
    CUDA_HOME=/usr/local/cuda \
    MODEL_NAME=DeepXR/Helion-2.5-Rnd \
    MODEL_PATH=/models/helion \
    PORT=8000 \
    HOST=0.0.0.0 \
    TENSOR_PARALLEL_SIZE=2 \
    MAX_MODEL_LEN=131072 \
    GPU_MEMORY_UTILIZATION=0.95 \
    WORKERS=1
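# Assuming inference.server forwards these flags to vLLM: TENSOR_PARALLEL_SIZE
# must match the number of visible GPUs (set to 1 on single-GPU hosts),
# GPU_MEMORY_UTILIZATION is the fraction of VRAM vLLM may reserve, and
# MAX_MODEL_LEN caps the context window in tokens.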
# Install runtime dependencies only
RUN apt-get update && apt-get install -y \
    python3.10 \
    python3-pip \
    curl \
    vim \
    libgomp1 \
    && rm -rf /var/lib/apt/lists/*
# Copy Python packages from builder
COPY --from=builder /usr/local/lib/python3.10/dist-packages /usr/local/lib/python3.10/dist-packages
COPY --from=builder /usr/local/bin /usr/local/bin
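# Copying site-packages and console scripts from the devel-based builder keeps
# compilers, headers, and build caches out of the runtime image.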
# Create application directory
WORKDIR /app
# Create necessary directories
RUN mkdir -p /models/helion /app/inference /app/logs /app/cache
# Copy inference code
COPY ./inference /app/inference
COPY ./model_config.yaml /app/
COPY ./config.json /app/
# Set permissions
RUN chmod +x /app/inference/*.py
# Create non-root user for security
RUN useradd -m -u 1000 helion && \
    chown -R helion:helion /app /models
USER helion
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:${PORT}/health || exit 1
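# The shell-form health check expands ${PORT} at run time; raise
# --start-period if model load regularly takes longer than 60s.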
# Expose ports
EXPOSE 8000 8001 8002
# Set default command (shell form, so the ENV values above expand at runtime;
# the JSON exec form would pass the literal strings "${MODEL_PATH}" etc.)
CMD python3 -m inference.server \
    --model "${MODEL_PATH}" \
    --host "${HOST}" \
    --port "${PORT}" \
    --tensor-parallel-size "${TENSOR_PARALLEL_SIZE}" \
    --max-model-len "${MAX_MODEL_LEN}" \
    --gpu-memory-utilization "${GPU_MEMORY_UTILIZATION}"
# Labels
LABEL maintainer="DeepXR Team" \
    version="2.5.0-rnd" \
    description="Helion-2.5 Research & Development Model - Advanced Language Model" \
    model="DeepXR/Helion-2.5-Rnd" \
    license="Apache-2.0"
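# Example usage (image tag and host paths are illustrative; model weights are
# expected to be mounted at MODEL_PATH):
#   docker build -t deepxr/helion-2.5-rnd .
#   docker run --gpus all -p 8000:8000 \
#     -e TENSOR_PARALLEL_SIZE=1 \
#     -v /path/to/helion-weights:/models/helion \
#     deepxr/helion-2.5-rnd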