binary1ne committed
Commit c9e3139 · verified · Parent: 249e668

Create Dockerfile

Files changed (1)
Dockerfile +33 -0
Dockerfile ADDED
@@ -0,0 +1,33 @@
+ FROM vllm/vllm-openai:latest
+
+ # Expose the API port (vLLM's default is 8000; this image uses 7860)
+ EXPOSE 7860
+
+ # Environment variables for vLLM
+ # Listen on all interfaces
+ ENV HOST=0.0.0.0
+ ENV PORT=7860
+
+ # Disable history/persistence equivalents
+ # (vLLM doesn't store chat history by default, but avoid caching between runs)
+ ENV VLLM_DISABLE_LOGGING=true
+ ENV VLLM_NO_DISK_CACHE=true
+ ENV TRANSFORMERS_CACHE=/tmp/.vllm/models
+
+ # Create a temporary model directory (RAM-backed if mounted as tmpfs)
+ RUN mkdir -p /tmp/.vllm/models && \
+     chmod -R 777 /tmp/.vllm/models
+
+ # Optional: declare as a volume so it can be mounted as tmpfs for ephemeral storage
+ VOLUME ["/tmp/.vllm/models"]
+
+ # Remove any persistent model cache
+ RUN rm -rf /root/.cache && mkdir -p /root/.cache && chmod -R 777 /root/.cache
+
+ # Pull Llama-2-7b from Hugging Face and run
+ # The Hugging Face token must be passed as a build arg or env var
+ ARG HF_TOKEN
+ ENV HF_TOKEN=${HF_TOKEN}
+
+ # By default, vLLM downloads the model at startup
+ CMD ["--model", "meta-llama/Llama-2-7b-hf", "--host", "0.0.0.0", "--port", "7860"]
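
For reference, a minimal sketch of how this image might be built and run. The vllm-llama2 tag, the GPU flag, and the tmpfs mount are illustrative assumptions, not part of the commit; passing HF_TOKEN with -e at run time avoids baking the token into an image layer, which the ARG/ENV pair above would otherwise do:

  # Illustrative build; the image tag is an assumption
  docker build -t vllm-llama2 .

  # Illustrative run: token supplied at run time, model dir mounted as tmpfs (RAM-backed)
  docker run --gpus all -p 7860:7860 \
    -e HF_TOKEN=<your_hf_token> \
    --tmpfs /tmp/.vllm/models \
    vllm-llama2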
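
Once the server is up, a quick smoke test against the OpenAI-compatible completions endpoint served by the vllm/vllm-openai base image (the prompt and max_tokens values here are arbitrary examples):

  curl http://localhost:7860/v1/completions \
    -H "Content-Type: application/json" \
    -d '{"model": "meta-llama/Llama-2-7b-hf", "prompt": "Hello", "max_tokens": 16}'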