FROM vllm/vllm-openai:latest

# Expose the API port (vLLM's default is 8000; we use 7860 here)
EXPOSE 7860

# Environment variables for vLLM
# Listen on all interfaces
ENV HOST=0.0.0.0
ENV PORT=7860

# Disable history/persistence equivalent
# (vLLM doesn't store chat history by default, but we avoid caching between runs)
ENV VLLM_DISABLE_LOGGING=true
ENV VLLM_NO_DISK_CACHE=true
ENV TRANSFORMERS_CACHE=/tmp/.vllm/models

# Create a temporary model directory intended for RAM-backed storage
RUN mkdir -p /tmp/.vllm/models && \
    chmod -R 777 /tmp/.vllm/models

# Optional: declare the directory as a volume so it can be mounted as tmpfs at runtime
VOLUME ["/tmp/.vllm/models"]

# Remove any persistent model cache left in the image
RUN rm -rf /root/.cache && mkdir -p /root/.cache && chmod -R 777 /root/.cache

# Pull Llama-2-7b from Hugging Face and run.
# The Hugging Face token must be passed as a build arg or env var.
ARG HF_TOKEN
ENV HF_TOKEN=${HF_TOKEN}

# By default vLLM downloads the model at startup;
# the base image's entrypoint starts the OpenAI-compatible API server with these args
CMD ["--model", "meta-llama/Llama-2-7b-hf", "--host", "0.0.0.0", "--port", "7860"]
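
# Example build/run commands (a minimal sketch; the image tag "vllm-llama2" and the
# tmpfs size are assumptions, and <your_hf_token> is a placeholder for your own token):
#
#   docker build --build-arg HF_TOKEN=<your_hf_token> -t vllm-llama2 .
#   docker run --gpus all -p 7860:7860 --tmpfs /tmp/.vllm/models:size=16g vllm-llama2
#
# Mounting /tmp/.vllm/models as tmpfs at run time keeps downloaded weights in RAM only,
# matching the ephemeral-storage intent of the VOLUME declaration above.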