# NOTE: non-Dockerfile residue from the hosting page (Spaces UI chrome:
# status labels, file size, commit hashes, line-number gutter) was captured
# during extraction and has been commented out; the Dockerfile starts below.
# Parameterize the base image so builds can pin a specific release tag or
# digest (e.g. --build-arg BASE_IMAGE=…:v0.10.0@sha256:…) for reproducibility.
# ":latest" is kept as the default only for backward compatibility — pin it
# in CI; an untagged/latest base makes rebuilds non-deterministic (hadolint DL3007).
ARG BASE_IMAGE=public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest
FROM ${BASE_IMAGE}
# -----------------------------
# ENV Variables
# -----------------------------
# DEBIAN_FRONTEND is deliberately NOT exported here: baking it into the
# runtime environment is a Dockerfile anti-pattern, and the apt-get RUN
# below already sets it inline for the one step that needs it.
#
# Hugging Face cache location (model downloads land here at runtime).
ENV HF_HOME=/opt/hf
# CPU-backend tuning for vLLM, grouped in one instruction.
# The duplicate OMP_NUM_THREADS=2 from the original has been removed.
# NOTE(review): VLLM_CPU_OMP_THREADS_BIND=0-4 names five CPUs while the
# mock lscpu below advertises only 0-3 — confirm the intended range.
ENV OMP_NUM_THREADS=2 \
    VLLM_CPU_KVCACHE_SPACE=8 \
    VLLM_CPU_OMP_THREADS_BIND=0-4 \
    VLLM_WORKER_MULTIPROC_METHOD=spawn \
    NUMA_DISABLE=1
# Extra arguments picked up by the launch script at runtime.
ENV VLLM_ARGS="--dtype auto"
# -----------------------------
# Install dependencies
# -----------------------------
# util-linux provides the real lscpu (shadowed later by the mock), tini is a
# minimal init, numactl/procps are CPU/process tooling, curl+CA certs for
# HTTPS fetches. update+install+cleanup stay in one layer so the apt cache
# never persists into the image.
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        numactl \
        procps \
        tini \
        util-linux \
    && rm -rf /var/lib/apt/lists/*
# -----------------------------
# Install vLLM
# -----------------------------
# RUN python3 -m pip install --no-cache-dir vllm==0.10.0
# -----------------------------
# Create mock lscpu
# -----------------------------
# Install a fake lscpu that always reports a fixed 4-CPU / 1-socket /
# 1-NUMA-node topology. A single printf writes the whole script in one
# pass instead of a chain of echo appends; the generated file content is
# identical to the original.
# NOTE(review): the mock ignores every lscpu flag (-J, -p, -e, …) and always
# prints the same block — confirm the callers only invoke plain `lscpu`.
RUN mkdir -p /usr/local/bin && \
    printf '%s\n' \
        '#!/bin/bash' \
        'cat <<EOF' \
        '{' \
        ' "CPU(s)": "4",' \
        ' "On-line CPU(s) list": "0-3",' \
        ' "Thread(s) per core": "1",' \
        ' "Core(s) per socket": "4",' \
        ' "Socket(s)": "1",' \
        ' "NUMA node(s)": "1"' \
        '}' \
        'EOF' \
        > /usr/local/bin/lscpu && \
    chmod +x /usr/local/bin/lscpu
# Make sure our mock is used first
# /usr/local/bin typically already precedes /usr/bin on PATH, but prepending
# it explicitly guarantees the mock lscpu above shadows the real
# /usr/bin/lscpu regardless of the base image's PATH ordering.
ENV PATH=/usr/local/bin:$PATH
# -----------------------------
# Expose port
# -----------------------------
# EXPOSE is documentation only — it does not publish the port; the server
# must still bind 0.0.0.0:7860 (7860 is the conventional HF Spaces port).
EXPOSE 7860
# -----------------------------
# Checkpoints
# -----------------------------
# Build-time sanity checks: OS identity, vllm CLI presence/version, and the
# installed Python package set. Chained into a single RUN so they add one
# layer instead of four; any failing command aborts the build early.
RUN cat /etc/os-release && \
    vllm -v && \
    pip show vllm && \
    pip list
# -----------------------------
# Start vLLM
# -----------------------------
# WORKDIR comes first: it creates /workspace if missing, so the subsequent
# COPY lands *inside* the directory. (The original copied to bare
# "/workspace" before WORKDIR — if that path did not pre-exist, COPY would
# create a FILE named /workspace and the build would break.)
WORKDIR /workspace
# --chmod guarantees the script is executable regardless of its mode on the
# build host.
COPY --chmod=755 start_server.sh ./
# tini (installed above via apt) runs as PID 1 to reap zombies and forward
# SIGTERM to the server, so `docker stop` shuts it down cleanly.
# NOTE(review): start_server.sh's contents are not visible here — assumes it
# has a valid shebang; confirm.
ENTRYPOINT ["/usr/bin/tini", "--", "./start_server.sh"]
# CMD ["./start_server.sh"]
# RUN python3 -u -m vllm.entrypoints.openai.api_server --model "unsloth/Llama-3.2-1B-bnb-4bit" --host 0.0.0.0 --port 7860 --tensor-parallel-size 1 --gpu-memory-utilization 0.0 2>/dev/null
# CMD ["python3", "-m", "vllm.entrypoints.openai.api_server", \
# "--model", "unsloth/Llama-3.2-1B-bnb-4bit", \
# "--host", "0.0.0.0", \
# "--port", "7860", \
# "--tensor-parallel-size", "1", \
# "--gpu-memory-utilization", "0.0"]