FROM vllm/vllm-openai:latest

# Expose the API port (vLLM's default is 8000; we use 7860 here)
EXPOSE 7860

# Environment variables for vLLM
# Listen on all interfaces
ENV HOST=0.0.0.0
ENV PORT=7860

# Disable history/persistence equivalent
# (vLLM doesn't store chat history by default, but we avoid caching between runs)
ENV VLLM_DISABLE_LOGGING=true
ENV VLLM_NO_DISK_CACHE=true
ENV TRANSFORMERS_CACHE=/tmp/.vllm/models

# Create a temporary model directory intended for RAM-backed storage
RUN mkdir -p /tmp/.vllm/models && \
    chmod -R 777 /tmp/.vllm/models

# Optional: declare the directory as a volume so it can be mounted as tmpfs at runtime
VOLUME ["/tmp/.vllm/models"]

# Remove any persistent model cache left in the image
RUN rm -rf /root/.cache && mkdir -p /root/.cache && chmod -R 777 /root/.cache

# Pull Llama-2-7b from Hugging Face and run.
# The Hugging Face token must be passed as a build arg or env var.
ARG HF_TOKEN
ENV HF_TOKEN=${HF_TOKEN}

# By default vLLM downloads the model at startup;
# the base image's entrypoint starts the OpenAI-compatible API server with these args
CMD ["--model", "meta-llama/Llama-2-7b-hf", "--host", "0.0.0.0", "--port", "7860"]
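
# Example build/run commands (a minimal sketch; the image tag "vllm-llama2" and the
# tmpfs size are assumptions, and <your_hf_token> is a placeholder for your own token):
#
#   docker build --build-arg HF_TOKEN=<your_hf_token> -t vllm-llama2 .
#   docker run --gpus all -p 7860:7860 --tmpfs /tmp/.vllm/models:size=16g vllm-llama2
#
# Mounting /tmp/.vllm/models as tmpfs at run time keeps downloaded weights in RAM only,
# matching the ephemeral-storage intent of the VOLUME declaration above.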