|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---- Base stage: CUDA 12.1.1 + cuDNN 8 devel image (pinned tag) ----
FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04 AS base

# Build-time only: ARG keeps noninteractive apt behavior out of the
# runtime environment of any image derived from this stage.
ARG DEBIAN_FRONTEND=noninteractive

# TORCH_CUDA_ARCH_LIST: SM targets for any CUDA extensions compiled here
# (Volta through Hopper, plus PTX for forward compatibility).
# MAX_JOBS caps parallel compile jobs for native extension builds.
ENV PYTHONUNBUFFERED=1 \
    CUDA_HOME=/usr/local/cuda \
    TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" \
    FORCE_CUDA=1 \
    MAX_JOBS=8
|
|
|
|
|
|
|
|
# OS toolchain and Python 3.10 needed to build native wheels.
# --no-install-recommends keeps the layer minimal; apt lists are removed
# in the same layer so they never persist in the image.
# Packages sorted alphabetically for diffability.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    ccache \
    cmake \
    curl \
    git \
    libffi-dev \
    libgomp1 \
    libjpeg-dev \
    libpng-dev \
    libssl-dev \
    ninja-build \
    python3-pip \
    python3.10 \
    python3.10-dev \
    vim \
    wget \
    && rm -rf /var/lib/apt/lists/*

# Up-to-date packaging tools before any wheel builds/installs below.
RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel
|
|
|
|
|
|
|
|
# ---- Builder stage: resolve all Python dependencies on the devel base ----
FROM base AS builder

WORKDIR /build

# Install PyTorch first, pinned and pulled from the CUDA 12.1 wheel index
# so the GPU (cu121) builds are selected instead of the default CPU wheels.
RUN pip install --no-cache-dir \
    torch==2.2.0 \
    torchvision==0.17.0 \
    torchaudio==2.2.0 \
    --index-url https://download.pytorch.org/whl/cu121
|
|
|
|
|
|
|
|
# Serving/model stack.
# NOTE(review): vllm 0.3.3 declares its own torch pin (2.1.x) and pip may
# downgrade the torch 2.2.0 installed above when resolving it — verify the
# final torch version in the built image, or align the two pins.
RUN pip install --no-cache-dir \
    vllm==0.3.3 \
    transformers==4.40.0 \
    tokenizers==0.15.2 \
    sentencepiece==0.2.0 \
    accelerate==0.28.0 \
    bitsandbytes==0.43.0 \
    safetensors==0.4.2 \
    huggingface-hub==0.21.4

# Scientific stack and API server dependencies (separate layer so a bump
# here does not invalidate the serving-stack layer above).
RUN pip install --no-cache-dir \
    numpy==1.26.4 \
    scipy==1.12.0 \
    pandas==2.2.1 \
    scikit-learn==1.4.1 \
    pydantic==2.6.4 \
    fastapi==0.110.0 \
    uvicorn[standard]==0.29.0 \
    aiohttp==3.9.3 \
    ray[default]==2.10.0

# Monitoring / hardware-introspection utilities.
RUN pip install --no-cache-dir \
    prometheus-client==0.20.0 \
    gputil==1.4.0 \
    psutil==5.9.8 \
    py-cpuinfo==9.0.0 \
    pynvml==11.5.0
|
|
|
|
|
|
|
|
# ---- Runtime stage: slimmer CUDA runtime image (no compilers/headers) ----
FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04

# Build-time only; must not leak into the container's runtime environment.
ARG DEBIAN_FRONTEND=noninteractive

# Server defaults — all overridable at `docker run -e ...`.
# These names are consumed by the CMD at the bottom of the file.
ENV PYTHONUNBUFFERED=1 \
    CUDA_HOME=/usr/local/cuda \
    MODEL_NAME=DeepXR/Helion-2.5-Rnd \
    MODEL_PATH=/models/helion \
    PORT=8000 \
    HOST=0.0.0.0 \
    TENSOR_PARALLEL_SIZE=2 \
    MAX_MODEL_LEN=131072 \
    GPU_MEMORY_UTILIZATION=0.95 \
    WORKERS=1
|
|
|
|
|
|
|
|
# Minimal runtime OS deps: python3.10 to run the server, curl for the
# HEALTHCHECK, libgomp1 for OpenMP-linked wheels.
# NOTE(review): vim is a debugging convenience only — consider removing it
# from production builds to shrink the attack surface.
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    libgomp1 \
    python3-pip \
    python3.10 \
    vim \
    && rm -rf /var/lib/apt/lists/*

# Copy the fully-resolved site-packages and console entry points from the
# builder stage so no compilers or headers are needed here.
COPY --from=builder /usr/local/lib/python3.10/dist-packages /usr/local/lib/python3.10/dist-packages
COPY --from=builder /usr/local/bin /usr/local/bin
|
|
|
|
|
|
|
|
WORKDIR /app

# Create the unprivileged runtime user BEFORE copying files so ownership can
# be set directly with COPY --chown. The original post-hoc `RUN chown -R`
# duplicated every copied file into an extra layer, doubling their size.
RUN useradd -m -u 1000 helion && \
    mkdir -p /models/helion /app/inference /app/logs /app/cache && \
    chown -R helion:helion /app /models

COPY --chown=helion:helion ./inference /app/inference
COPY --chown=helion:helion ./model_config.yaml /app/
COPY --chown=helion:helion ./config.json /app/

# Scripts are invoked via `python3 -m`, but keep them directly executable too.
RUN chmod +x /app/inference/*.py
|
|
|
|
|
# Drop root for everything from here on.
USER helion

# Probe the server's own health endpoint; the 60s start period allows time
# for model load before failures count against the retry budget.
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:${PORT}/health || exit 1

# Documentation only (does not publish ports): 8000 is the API port ($PORT
# default). NOTE(review): 8001/8002 usage is not visible in this file —
# confirm against the inference server before relying on them.
EXPOSE 8000 8001 8002
|
|
|
|
|
|
|
|
# Shell form is required here: exec-form (JSON array) CMD performs NO
# environment-variable expansion, so the original passed the literal strings
# "${MODEL_PATH}", "${PORT}", etc. to the server. `exec` replaces the shell
# so python3 runs as PID 1 and receives SIGTERM from `docker stop`.
CMD exec python3 -m inference.server \
    --model "${MODEL_PATH}" \
    --host "${HOST}" \
    --port "${PORT}" \
    --tensor-parallel-size "${TENSOR_PARALLEL_SIZE}" \
    --max-model-len "${MAX_MODEL_LEN}" \
    --gpu-memory-utilization "${GPU_MEMORY_UTILIZATION}"
|
|
|
|
|
|
|
|
# Image metadata. Existing custom keys are kept for compatibility with any
# tooling that already reads them; OCI-standard equivalents added alongside.
LABEL maintainer="DeepXR Team" \
      version="2.5.0-rnd" \
      description="Helion-2.5 Research & Development Model - Advanced Language Model" \
      model="DeepXR/Helion-2.5-Rnd" \
      license="Apache-2.0" \
      org.opencontainers.image.description="Helion-2.5 Research & Development Model - Advanced Language Model" \
      org.opencontainers.image.version="2.5.0-rnd" \
      org.opencontainers.image.licenses="Apache-2.0"