cguna frreiss committed on
Commit
77cec27
·
verified ·
1 Parent(s): 013699d

New alora script (#24)


- Add script for running vLLM with aLoRA adapters (7f3e4f6bae45d03525f3fd4bb5a8a24e580265de)


Co-authored-by: Fred Reiss <[email protected]>

Files changed (1)
  1. run_vllm_alora.sh +48 -0
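Once the script below is running, each discovered adapter is served under its own name through vLLM's OpenAI-compatible API, authenticated with the `VLLM_API_KEY` the script exports. A hedged sketch of what a request could look like; the adapter name `answerability` is a hypothetical placeholder, while the port and API key match the script's defaults:

```shell
#!/bin/bash
# Build a chat-completions request against the server started by
# run_vllm_alora.sh. "answerability" is a hypothetical adapter name; replace
# it with one of the adapter directories the script actually discovers.
PORT=55555
ADAPTER=answerability

PAYLOAD=$(cat <<EOF
{
  "model": "${ADAPTER}",
  "messages": [{"role": "user", "content": "Is the question answerable from the context?"}],
  "max_tokens": 64
}
EOF
)

# The actual call (requires the server to be up):
#   curl -s "http://localhost:${PORT}/v1/chat/completions" \
#     -H "Authorization: Bearer rag_intrinsics_1234" \
#     -H "Content-Type: application/json" \
#     -d "${PAYLOAD}"
echo "${PAYLOAD}"
```

Setting `"model"` to the adapter name (rather than the base model) is what routes the request through that aLoRA adapter.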
run_vllm_alora.sh ADDED
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+################################################################################
+# Shell script that starts a copy of vLLM with a base model plus all the
+# available aLoRA adapters in this repository.
+#
+# To run this script:
+#   1. Create and activate a Python virtual environment using a tool such as
+#      miniforge, uv, or venv.
+#   2. Install the fork of vLLM that supports aLoRA on your machine
+#      (`VLLM_USE_PRECOMPILED=1 pip install git+https://github.com/tdoublep/vllm.git@alora`)
+#   3. Install the Hugging Face CLI (`pip install -U "huggingface_hub[cli]"`)
+#   4. Download the intrinsics library by running:
+#      hf download ibm-granite/rag-intrinsics-lib --local-dir ./rag-intrinsics-lib
+#   5. Edit the constants BASE_MODEL_NAME, BASE_MODEL_ORG, and PORT as needed.
+#   6. Run this script from the root of your local copy of rag-intrinsics-lib.
+################################################################################
+
+BASE_MODEL_NAME=granite-3.3-8b-instruct
+BASE_MODEL_ORG=ibm-granite
+PORT=55555
+
+export VLLM_API_KEY=rag_intrinsics_1234
+
+# Find all aLoRA adapters for the target base model. Note that this can be
+# edited to serve both aLoRA and LoRA adapters simultaneously.
+ALORAS=""
+for item in ./*; do
+    # Remove the "./" prefix
+    name=$(basename -- "${item}")
+    if [ -d "./${name}/alora/${BASE_MODEL_NAME}" ]; then
+        ALORAS+="${name}=./${name}/alora/${BASE_MODEL_NAME} "
+    fi
+done
+
+
+CMD="vllm serve ${BASE_MODEL_ORG}/${BASE_MODEL_NAME} \
+    --port ${PORT} \
+    --gpu-memory-utilization 0.45 \
+    --max-model-len 8192 \
+    --enable-lora \
+    --enable-activated-lora \
+    --enable-prefix-caching \
+    --max-lora-rank 64 \
+    --lora-modules ${ALORAS}"
+
+echo $CMD
+$CMD
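The adapter-discovery loop in the script can be exercised on its own against a throwaway directory tree. A minimal sketch, assuming a layout like rag-intrinsics-lib's `<adapter>/alora/<base-model>` convention; the adapter names `answerability` and `citations` are made up for illustration:

```shell
#!/bin/bash
# Sketch of the adapter-discovery loop from run_vllm_alora.sh, run in a
# temporary directory. Adapter names here are placeholders, not a claim
# about the actual contents of the repository.
set -euo pipefail

BASE_MODEL_NAME=granite-3.3-8b-instruct
WORKDIR=$(mktemp -d)
cd "${WORKDIR}"

# Lay out two fake aLoRA adapters and one directory that should be skipped.
mkdir -p "answerability/alora/${BASE_MODEL_NAME}"
mkdir -p "citations/alora/${BASE_MODEL_NAME}"
mkdir -p "not_an_adapter/docs"

# Same loop as the script: collect name=path pairs for --lora-modules.
ALORAS=""
for item in ./*; do
    name=$(basename -- "${item}")
    if [ -d "./${name}/alora/${BASE_MODEL_NAME}" ]; then
        ALORAS+="${name}=./${name}/alora/${BASE_MODEL_NAME} "
    fi
done

echo "${ALORAS}"
```

Only directories containing an `alora/${BASE_MODEL_NAME}` subdirectory are picked up, so unrelated folders in the repository root are ignored automatically.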