#!/bin/bash
#
# Launch a vLLM OpenAI-compatible API server via `exec`, so the server
# replaces this shell (signals are delivered directly to the server —
# useful as a container entrypoint).
#
# Configurable environment variables (defaults applied below):
#   MODEL_NAME              - HF model id or local path
#   HOST                    - bind address
#   VLLM_PORT               - listen port
#   TP_SIZE                 - tensor-parallel size (number of GPUs)
#   GPU_MEMORY_UTILIZATION  - fraction of GPU memory vLLM may use

# NOTE: `set -e` must be its own line — appended to the shebang it is
# part of the `#!` comment and never executes. Also enable -u (error on
# unset vars) and pipefail for defensive strictness.
set -euo pipefail

# Defaults if not passed in
MODEL_NAME="${MODEL_NAME:-unsloth/llama-2-7b-bnb-4bit}"
HOST="${HOST:-0.0.0.0}"
VLLM_PORT="${VLLM_PORT:-8000}"
TP_SIZE="${TP_SIZE:-1}"
GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.90}"

# Log the effective configuration before starting.
echo "[vLLM] Starting server with:"
echo "  MODEL_NAME=$MODEL_NAME"
echo "  HOST=$HOST"
echo "  VLLM_PORT=$VLLM_PORT"
echo "  TP_SIZE=$TP_SIZE"
echo "  GPU_MEMORY_UTILIZATION=$GPU_MEMORY_UTILIZATION"

# exec: replace this shell with the server process.
exec python3 -m vllm.entrypoints.openai.api_server \
  --model "$MODEL_NAME" \
  --host "$HOST" \
  --port "$VLLM_PORT" \
  --tensor-parallel-size "$TP_SIZE" \
  --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION"