# Multi-stage build for DeepXR/Helion-2.5-Rnd
# Optimized for production inference with vLLM
# Stage 1: Base image with CUDA and Python
FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04 AS base
# Set environment variables
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 \
    CUDA_HOME=/usr/local/cuda \
    TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" \
    FORCE_CUDA=1 \
    MAX_JOBS=8
# Install system dependencies
RUN apt-get update && apt-get install -y \
    python3.10 \
    python3-pip \
    python3.10-dev \
    git \
    wget \
    curl \
    vim \
    build-essential \
    cmake \
    ninja-build \
    ccache \
    libssl-dev \
    libffi-dev \
    libjpeg-dev \
    libpng-dev \
    libgomp1 \
    && rm -rf /var/lib/apt/lists/*
# Update pip and install build tools
RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel
# Stage 2: Build dependencies
FROM base AS builder
WORKDIR /build
# Install PyTorch with CUDA support
RUN pip install --no-cache-dir \
    torch==2.2.0 \
    torchvision==0.17.0 \
    torchaudio==2.2.0 \
    --index-url https://download.pytorch.org/whl/cu121
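# (The cu121 wheel index matches the CUDA 12.1.1 base image above; keep the
# two in sync if the base image tag ever changes.)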
# Install vLLM and core dependencies
RUN pip install --no-cache-dir \
    vllm==0.3.3 \
    transformers==4.40.0 \
    tokenizers==0.15.2 \
    sentencepiece==0.2.0 \
    accelerate==0.28.0 \
    bitsandbytes==0.43.0 \
    safetensors==0.4.2 \
    huggingface-hub==0.21.4
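# Note: each vLLM release pins its own torch requirement; if vllm==0.3.3
# resolves a different torch than the 2.2.0 installed above, pip may replace
# the wheel. Verifying the final torch/vllm pairing at build time is advisable.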
# Install additional ML libraries
RUN pip install --no-cache-dir \
    numpy==1.26.4 \
    scipy==1.12.0 \
    pandas==2.2.1 \
    scikit-learn==1.4.1 \
    pydantic==2.6.4 \
    fastapi==0.110.0 \
    uvicorn[standard]==0.29.0 \
    aiohttp==3.9.3 \
    ray[default]==2.10.0
# Install monitoring and optimization tools
RUN pip install --no-cache-dir \
    prometheus-client==0.20.0 \
    gputil==1.4.0 \
    psutil==5.9.8 \
    py-cpuinfo==9.0.0 \
    pynvml==11.5.0
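# Optional build-time sanity check (a minimal sketch; uncomment to confirm the
# CUDA torch build and the vllm import both resolve inside the image. No GPU
# is needed here: torch.version.cuda is set for any CUDA wheel):
# RUN python3 -c "import torch, vllm; assert torch.version.cuda is not None"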
# Stage 3: Final runtime image
FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04
# Set runtime environment variables
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 \
    CUDA_HOME=/usr/local/cuda \
    MODEL_NAME=DeepXR/Helion-2.5-Rnd \
    MODEL_PATH=/models/helion \
    PORT=8000 \
    HOST=0.0.0.0 \
    TENSOR_PARALLEL_SIZE=2 \
    MAX_MODEL_LEN=131072 \
    GPU_MEMORY_UTILIZATION=0.95 \
    WORKERS=1
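# Note: TENSOR_PARALLEL_SIZE=2 assumes at least two visible GPUs at runtime;
# override it on single-GPU hosts, e.g. `docker run -e TENSOR_PARALLEL_SIZE=1`.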
# Install runtime dependencies only
RUN apt-get update && apt-get install -y \
    python3.10 \
    python3-pip \
    curl \
    vim \
    libgomp1 \
    && rm -rf /var/lib/apt/lists/*
# Copy Python packages from builder
COPY --from=builder /usr/local/lib/python3.10/dist-packages /usr/local/lib/python3.10/dist-packages
COPY --from=builder /usr/local/bin /usr/local/bin
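# (The -runtime base omits the CUDA compiler toolchain; copying the prebuilt
# packages from the -devel builder stage keeps the final image much smaller.)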
# Create application directory
WORKDIR /app
# Create necessary directories
RUN mkdir -p /models/helion /app/inference /app/logs /app/cache
# Copy inference code
COPY ./inference /app/inference
COPY ./model_config.yaml /app/
COPY ./config.json /app/
# Set permissions
RUN chmod +x /app/inference/*.py
# Create non-root user for security
RUN useradd -m -u 1000 helion && \
    chown -R helion:helion /app /models
USER helion
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:${PORT}/health || exit 1
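# (Assumes inference.server exposes a GET /health endpoint; adjust the path
# if the server reports health elsewhere.)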
# Expose ports
EXPOSE 8000 8001 8002
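# 8000 is the API port set above; this file does not define what 8001/8002
# serve (auxiliary endpoints such as Prometheus metrics are a plausible use).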
# Set default command (shell form so the ENV values above are expanded;
# the JSON exec form does not perform variable substitution)
CMD python3 -m inference.server \
    --model "${MODEL_PATH}" \
    --host "${HOST}" \
    --port "${PORT}" \
    --tensor-parallel-size "${TENSOR_PARALLEL_SIZE}" \
    --max-model-len "${MAX_MODEL_LEN}" \
    --gpu-memory-utilization "${GPU_MEMORY_UTILIZATION}"
# Labels
LABEL maintainer="DeepXR Team" \
      version="2.5.0-rnd" \
      description="Helion-2.5 Research & Development Model - Advanced Language Model" \
      model="DeepXR/Helion-2.5-Rnd" \
      license="Apache-2.0"