# Multi-stage build for DeepXR/Helion-2.5-Rnd
# Optimized for production inference with vLLM
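#
# Example usage (a sketch; the image tag and host weight path are assumptions):
#   docker build -t deepxr/helion-2.5-rnd .
#   docker run --gpus all -p 8000:8000 \
#     -v /path/to/helion-weights:/models/helion \
#     deepxr/helion-2.5-rnd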

# Stage 1: Base image with CUDA and Python
FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04 AS base

# Set environment variables
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 \
    CUDA_HOME=/usr/local/cuda \
    TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" \
    FORCE_CUDA=1 \
    MAX_JOBS=8

# Install system dependencies
RUN apt-get update && apt-get install -y \
    python3.10 \
    python3-pip \
    python3.10-dev \
    git \
    wget \
    curl \
    vim \
    build-essential \
    cmake \
    ninja-build \
    ccache \
    libssl-dev \
    libffi-dev \
    libjpeg-dev \
    libpng-dev \
    libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# Update pip and install build tools
RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel

# Stage 2: Build dependencies
FROM base AS builder

WORKDIR /build

# Install PyTorch with CUDA support
RUN pip install --no-cache-dir \
    torch==2.2.0 \
    torchvision==0.17.0 \
    torchaudio==2.2.0 \
    --index-url https://download.pytorch.org/whl/cu121
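
# NOTE: vLLM pins its own compatible torch release; if that pin differs from the
# version installed above, pip will swap torch out in the next step. Verify the
# torch/vLLM pairing against vLLM's requirements when bumping either pin.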

# Install vLLM and core dependencies
RUN pip install --no-cache-dir \
    vllm==0.3.3 \
    transformers==4.40.0 \
    tokenizers==0.15.2 \
    sentencepiece==0.2.0 \
    accelerate==0.28.0 \
    bitsandbytes==0.43.0 \
    safetensors==0.4.2 \
    huggingface-hub==0.21.4

# Install additional ML libraries
RUN pip install --no-cache-dir \
    numpy==1.26.4 \
    scipy==1.12.0 \
    pandas==2.2.1 \
    scikit-learn==1.4.1 \
    pydantic==2.6.4 \
    fastapi==0.110.0 \
    uvicorn[standard]==0.29.0 \
    aiohttp==3.9.3 \
    ray[default]==2.10.0

# Install monitoring and optimization tools
RUN pip install --no-cache-dir \
    prometheus-client==0.20.0 \
    gputil==1.4.0 \
    psutil==5.9.8 \
    py-cpuinfo==9.0.0 \
    pynvml==11.5.0

# Stage 3: Final runtime image
FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04

# Set runtime environment variables
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 \
    CUDA_HOME=/usr/local/cuda \
    MODEL_NAME=DeepXR/Helion-2.5-Rnd \
    MODEL_PATH=/models/helion \
    PORT=8000 \
    HOST=0.0.0.0 \
    TENSOR_PARALLEL_SIZE=2 \
    MAX_MODEL_LEN=131072 \
    GPU_MEMORY_UTILIZATION=0.95 \
    WORKERS=1
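
# Any of these defaults can be overridden at container start, e.g. (values are
# illustrative):
#   docker run --gpus all -e TENSOR_PARALLEL_SIZE=4 -e GPU_MEMORY_UTILIZATION=0.90 ...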

# Install runtime dependencies only
RUN apt-get update && apt-get install -y \
    python3.10 \
    python3-pip \
    curl \
    vim \
    libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# Copy Python packages from builder
COPY --from=builder /usr/local/lib/python3.10/dist-packages /usr/local/lib/python3.10/dist-packages
COPY --from=builder /usr/local/bin /usr/local/bin

# Create application directory
WORKDIR /app

# Create necessary directories
RUN mkdir -p /models/helion /app/inference /app/logs /app/cache
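
# NOTE: /app/cache is created but no ENV above points at it; if the inference
# server relies on Hugging Face caching, setting HF_HOME=/app/cache (an
# assumption about the server's behavior) would direct downloads here.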

# Copy inference code
COPY ./inference /app/inference
COPY ./model_config.yaml /app/
COPY ./config.json /app/

# Set permissions
RUN chmod +x /app/inference/*.py

# Create non-root user for security
RUN useradd -m -u 1000 helion && \
    chown -R helion:helion /app /models

USER helion

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:${PORT}/health || exit 1
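
# The same endpoint can be probed from the host once the port is published:
#   curl -f http://localhost:8000/health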

# Expose ports
EXPOSE 8000 8001 8002

# Set default command (shell form, so the ENV values above are expanded at
# runtime; exec-form JSON arrays pass "${VAR}" through literally)
CMD python3 -m inference.server \
    --model "${MODEL_PATH}" \
    --host "${HOST}" \
    --port "${PORT}" \
    --tensor-parallel-size "${TENSOR_PARALLEL_SIZE}" \
    --max-model-len "${MAX_MODEL_LEN}" \
    --gpu-memory-utilization "${GPU_MEMORY_UTILIZATION}"

# Labels
LABEL maintainer="DeepXR Team" \
      version="2.5.0-rnd" \
      description="Helion-2.5 Research & Development Model - Advanced Language Model" \
      model="DeepXR/Helion-2.5-Rnd" \
      license="Apache-2.0"