# start_server.sh — launch a vLLM OpenAI-compatible API server.
#
# NOTE(review): reconstructed from a mangled diff-view paste; line 1 of the
# original file (presumably a bash shebang) is not visible here — confirm.

# Fail fast: -e exit on error, -u error on unset variables, pipefail so a
# pipeline fails if any stage fails. (Assumes a bash shebang; drop
# 'pipefail' if this actually runs under plain /bin/sh.)
set -euo pipefail

# Stream Python output immediately instead of block-buffering, so server
# logs appear in the console in real time.
export PYTHONUNBUFFERED=1

# ================================
# Previous CPU-offload configuration — kept for reference, currently disabled.
# ================================
# MODEL_NAME="unsloth/Llama-3.2-3B-bnb-4bit"
# HOST="0.0.0.0"
# VLLM_PORT="7860"
# CPU_KVCACHE_SPACE="8"   # in GiB
# DTYPE="auto"            # auto, float16, float32, etc.
# VLLM_EXTRA_ARGS=""      # add extra vLLM flags here if needed
#
# echo "[vLLM] Starting server with:"
# echo "  MODEL_NAME=$MODEL_NAME"
# echo "  HOST=$HOST"
# echo "  VLLM_PORT=$VLLM_PORT"
# echo "  CPU_KVCACHE_SPACE=${CPU_KVCACHE_SPACE}GiB"
# echo "  DTYPE=$DTYPE"
# echo "  EXTRA_ARGS=$VLLM_EXTRA_ARGS"
#
# # Warn if /sys not mounted (lscpu detection will fail)
# if [ ! -e /sys/devices/system/cpu/possible ]; then
#   echo "[WARN] /sys not mounted — CPU topology detection may fail."
#   echo "       Run with: docker run -v /sys:/sys:ro ..."
# fi
#
# # Start the vLLM CPU server (logs will stream to console)
# exec python3 -u -m vllm.entrypoints.openai.api_server \
#   --model "$MODEL_NAME" \
#   --host "$HOST" \
#   --port "$VLLM_PORT" \
#   --cpu-offload \
#   --cpu-kv-cache-space "$CPU_KVCACHE_SPACE" \
#   --dtype "$DTYPE" \
#   $VLLM_EXTRA_ARGS

# Start the vLLM OpenAI-compatible server (logs stream to the console).
# Fixes vs. the pasted version:
#   - 'exec' replaces this shell with the server process, so signals
#     (SIGTERM from `docker stop`, Ctrl-C, etc.) reach it directly and no
#     orphaned wrapper shell lingers.
#   - stderr is no longer discarded: the previous '2>/dev/null' silently
#     hid every startup error and all server logging.
# NOTE(review): --gpu-memory-utilization 0.0 looks like CPU-only intent —
# confirm vLLM accepts 0.0 here rather than requiring a CPU device flag.
exec python3 -u -m vllm.entrypoints.openai.api_server \
  --model "unsloth/Llama-3.2-1B-bnb-4bit" \
  --host 0.0.0.0 \
  --port 7860 \
  --tensor-parallel-size 1 \
  --gpu-memory-utilization 0.0