#!/bin/bash
set -e
export PYTHONUNBUFFERED=1

# ================================
# Fixed configuration for your setup
# ================================
MODEL_NAME="unsloth/Llama-3.2-1B-bnb-4bit"   # or unsloth/Llama-3.2-3B-bnb-4bit for the larger variant
HOST="0.0.0.0"
VLLM_PORT="7860"
CPU_KVCACHE_SPACE="8"    # in GiB
DTYPE="auto"             # auto, float16, float32, etc.
VLLM_EXTRA_ARGS=""       # add extra vLLM flags here if needed

echo "[vLLM] Starting server with:"
echo "  MODEL_NAME=$MODEL_NAME"
echo "  HOST=$HOST"
echo "  VLLM_PORT=$VLLM_PORT"
echo "  CPU_KVCACHE_SPACE=${CPU_KVCACHE_SPACE}GiB"
echo "  DTYPE=$DTYPE"
echo "  EXTRA_ARGS=$VLLM_EXTRA_ARGS"

# Warn if /sys is not mounted (lscpu-based CPU topology detection will fail)
if [ ! -e /sys/devices/system/cpu/possible ]; then
  echo "[WARN] /sys not mounted: CPU topology detection may fail."
  echo "       Run with: docker run -v /sys:/sys:ro ..."
fi

# The CPU build of vLLM takes its KV-cache budget (in GiB) from this
# environment variable rather than from a CLI flag.
export VLLM_CPU_KVCACHE_SPACE="$CPU_KVCACHE_SPACE"

# Start the vLLM OpenAI-compatible server. Keep stderr attached so startup
# errors are visible, and exec so signals reach the vLLM process directly.
exec python3 -u -m vllm.entrypoints.openai.api_server \
  --model "$MODEL_NAME" \
  --host "$HOST" \
  --port "$VLLM_PORT" \
  --dtype "$DTYPE" \
  --tensor-parallel-size 1 \
  $VLLM_EXTRA_ARGS
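
# Quick smoke test, run from another shell once the server reports ready.
# /v1/models and /v1/completions are standard routes of vLLM's
# OpenAI-compatible API; adjust host, port, and model name if you changed
# the configuration above.
#
#   curl http://localhost:7860/v1/models
#   curl http://localhost:7860/v1/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "unsloth/Llama-3.2-1B-bnb-4bit", "prompt": "Hello,", "max_tokens": 16}'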