binary1ne committed
Commit c9e3139 · verified · Parent: 249e668

Create Dockerfile

Files changed (1)
Dockerfile +33 -0
Dockerfile ADDED
@@ -0,0 +1,33 @@
+ FROM vllm/vllm-openai:latest
+
+ # Expose the API port (vLLM's default is 8000; this image uses 7860)
+ EXPOSE 7860
+
+ # Environment variables for vLLM
+ # Listen on all interfaces
+ ENV HOST=0.0.0.0
+ ENV PORT=7860
+
+ # Disable history/persistence equivalents
+ # (vLLM doesn't store chat history by default, but avoid caching between runs)
+ ENV VLLM_DISABLE_LOGGING=true
+ ENV VLLM_NO_DISK_CACHE=true
+ ENV TRANSFORMERS_CACHE=/tmp/.vllm/models
+
+ # Create a temporary model directory (RAM-backed if mounted as tmpfs)
+ RUN mkdir -p /tmp/.vllm/models && \
+     chmod -R 777 /tmp/.vllm/models
+
+ # Optional: declare as a volume so it can be mounted as tmpfs for ephemeral storage
+ VOLUME ["/tmp/.vllm/models"]
+
+ # Remove any persistent model cache
+ RUN rm -rf /root/.cache && mkdir -p /root/.cache && chmod -R 777 /root/.cache
+
+ # Pull Llama-2-7b from Hugging Face and run
+ # The Hugging Face token must be passed as a build arg or env var
+ ARG HF_TOKEN
+ ENV HF_TOKEN=${HF_TOKEN}
+
+ # By default, vLLM downloads the model at startup
+ CMD ["--model", "meta-llama/Llama-2-7b-hf", "--host", "0.0.0.0", "--port", "7860"]
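
For reference, a minimal sketch of how this image might be built and run. The vllm-llama2 tag, the GPU flag, and the tmpfs mount are illustrative assumptions, not part of the commit; passing HF_TOKEN with -e at run time avoids baking the token into an image layer, which the ARG/ENV pair above would otherwise do:

  # Illustrative build; the image tag is an assumption
  docker build -t vllm-llama2 .

  # Illustrative run: token supplied at run time, model dir mounted as tmpfs (RAM-backed)
  docker run --gpus all -p 7860:7860 \
    -e HF_TOKEN=<your_hf_token> \
    --tmpfs /tmp/.vllm/models \
    vllm-llama2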
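
Once the server is up, a quick smoke test against the OpenAI-compatible completions endpoint served by the vllm/vllm-openai base image (the prompt and max_tokens values here are arbitrary examples):

  curl http://localhost:7860/v1/completions \
    -H "Content-Type: application/json" \
    -d '{"model": "meta-llama/Llama-2-7b-hf", "prompt": "Hello", "max_tokens": 16}'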