Upload folder using huggingface_hub
Browse files- evaluate_qwen2.5_bf16.sh +98 -0
- evaluate_qwen2.5_w4a16.sh +141 -0
- evaluate_qwen2.5_w8a16.sh +144 -0
- evaluate_qwen2.5_w8a8.sh +171 -0
- quantize_qwen2.5_fp8.sh +39 -0
- quantize_qwen2.5_w4a16.sh +49 -0
- quantize_qwen2.5_w8a16.sh +190 -0
- quantize_qwen2.5_w8a8.sh +109 -0
- quantize_qwen2.5_w8a8_sq.sh +52 -0
evaluate_qwen2.5_bf16.sh
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
source ~/environments/clearml/bin/activate
|
| 2 |
+
|
| 3 |
+
for size in 0.5B 1.5B 3B 7B
|
| 4 |
+
do
|
| 5 |
+
|
| 6 |
+
model="Qwen2.5-${size}"
|
| 7 |
+
|
| 8 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 9 |
+
--model-id "Qwen/"$model \
|
| 10 |
+
--queue-name oneshot-a100x1 \
|
| 11 |
+
--project-name "LLM reference/vllm" \
|
| 12 |
+
--task-name "${model}/openllm" \
|
| 13 |
+
--benchmark-tasks openllm \
|
| 14 |
+
--max-model-len 4096 \
|
| 15 |
+
--add-bos-token \
|
| 16 |
+
--batch-size auto \
|
| 17 |
+
--enable-chunked-prefill \
|
| 18 |
+
--gpu-memory-utilization 0.9 \
|
| 19 |
+
--max-num-batched-tokens 256
|
| 20 |
+
|
| 21 |
+
model="Qwen2.5-${size}-Instruct"
|
| 22 |
+
|
| 23 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 24 |
+
--model-id "Qwen/"$model \
|
| 25 |
+
--queue-name oneshot-a100x1 \
|
| 26 |
+
--project-name "LLM reference/vllm" \
|
| 27 |
+
--task-name "${model}/openllm" \
|
| 28 |
+
--benchmark-tasks openllm \
|
| 29 |
+
--max-model-len 4096 \
|
| 30 |
+
--add-bos-token \
|
| 31 |
+
--batch-size auto \
|
| 32 |
+
--enable-chunked-prefill \
|
| 33 |
+
--gpu-memory-utilization 0.9 \
|
| 34 |
+
--max-num-batched-tokens 256
|
| 35 |
+
|
| 36 |
+
done
|
| 37 |
+
|
| 38 |
+
size=32B
|
| 39 |
+
model="Qwen2.5-${size}"
|
| 40 |
+
|
| 41 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 42 |
+
--model-id "Qwen/"$model \
|
| 43 |
+
--queue-name oneshot-a100x2 \
|
| 44 |
+
--project-name "LLM reference/vllm" \
|
| 45 |
+
--task-name "${model}/openllm" \
|
| 46 |
+
--benchmark-tasks openllm \
|
| 47 |
+
--max-model-len 4096 \
|
| 48 |
+
--add-bos-token \
|
| 49 |
+
--batch-size auto \
|
| 50 |
+
--enable-chunked-prefill \
|
| 51 |
+
--gpu-memory-utilization 0.9 \
|
| 52 |
+
--max-num-batched-tokens 256
|
| 53 |
+
|
| 54 |
+
model="Qwen2.5-${size}-Instruct"
|
| 55 |
+
|
| 56 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 57 |
+
--model-id "Qwen/"$model \
|
| 58 |
+
--queue-name oneshot-a100x2 \
|
| 59 |
+
--project-name "LLM reference/vllm" \
|
| 60 |
+
--task-name "${model}/openllm" \
|
| 61 |
+
--benchmark-tasks openllm \
|
| 62 |
+
--max-model-len 4096 \
|
| 63 |
+
--add-bos-token \
|
| 64 |
+
--batch-size auto \
|
| 65 |
+
--enable-chunked-prefill \
|
| 66 |
+
--gpu-memory-utilization 0.9 \
|
| 67 |
+
--max-num-batched-tokens 256
|
| 68 |
+
|
| 69 |
+
size=72B
|
| 70 |
+
model="Qwen2.5-${size}"
|
| 71 |
+
|
| 72 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 73 |
+
--model-id "Qwen/"$model \
|
| 74 |
+
--queue-name oneshot-a100x4 \
|
| 75 |
+
--project-name "LLM reference/vllm" \
|
| 76 |
+
--task-name "${model}/openllm" \
|
| 77 |
+
--benchmark-tasks openllm \
|
| 78 |
+
--max-model-len 4096 \
|
| 79 |
+
--add-bos-token \
|
| 80 |
+
--batch-size auto \
|
| 81 |
+
--enable-chunked-prefill \
|
| 82 |
+
--gpu-memory-utilization 0.9 \
|
| 83 |
+
--max-num-batched-tokens 256
|
| 84 |
+
|
| 85 |
+
model="Qwen2.5-${size}-Instruct"
|
| 86 |
+
|
| 87 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 88 |
+
--model-id "Qwen/"$model \
|
| 89 |
+
--queue-name oneshot-a100x4 \
|
| 90 |
+
--project-name "LLM reference/vllm" \
|
| 91 |
+
--task-name "${model}/openllm" \
|
| 92 |
+
--benchmark-tasks openllm \
|
| 93 |
+
--max-model-len 4096 \
|
| 94 |
+
--add-bos-token \
|
| 95 |
+
--batch-size auto \
|
| 96 |
+
--enable-chunked-prefill \
|
| 97 |
+
--gpu-memory-utilization 0.9 \
|
| 98 |
+
--max-num-batched-tokens 256
|
evaluate_qwen2.5_w4a16.sh
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
source ~/environments/clearml/bin/activate
|
| 2 |
+
|
| 3 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 4 |
+
--model-id ece15d6b3b574a9ba3062f8922dda9d9 \
|
| 5 |
+
--clearml-model \
|
| 6 |
+
--queue-name oneshot-a100x1 \
|
| 7 |
+
--project-name "LLM quantization - W4A16/llmcompressor/Qwen2.5" \
|
| 8 |
+
--task-name "Qwen2.5-0.5B/openllm/vllm" \
|
| 9 |
+
--benchmark-tasks openllm \
|
| 10 |
+
--max-model-len 4096 \
|
| 11 |
+
--add-bos-token \
|
| 12 |
+
--batch-size auto \
|
| 13 |
+
--enable-chunked-prefill \
|
| 14 |
+
--gpu-memory-utilization 0.9 \
|
| 15 |
+
--max-num-batched-tokens 256
|
| 16 |
+
|
| 17 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 18 |
+
--model-id b3c075c7685a42c798ebe91bdb1bb140 \
|
| 19 |
+
--clearml-model \
|
| 20 |
+
--queue-name oneshot-a100x1 \
|
| 21 |
+
--project-name "LLM quantization - W4A16/llmcompressor/Qwen2.5" \
|
| 22 |
+
--task-name "Qwen2.5-0.5B-Instruct/openllm/vllm" \
|
| 23 |
+
--benchmark-tasks openllm \
|
| 24 |
+
--max-model-len 4096 \
|
| 25 |
+
--add-bos-token \
|
| 26 |
+
--batch-size auto \
|
| 27 |
+
--enable-chunked-prefill \
|
| 28 |
+
--gpu-memory-utilization 0.9 \
|
| 29 |
+
--max-num-batched-tokens 256
|
| 30 |
+
|
| 31 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 32 |
+
--model-id 70d4376f576241eb9ca066ce2864c517 \
|
| 33 |
+
--clearml-model \
|
| 34 |
+
--queue-name oneshot-a100x1 \
|
| 35 |
+
--project-name "LLM quantization - W4A16/llmcompressor/Qwen2.5" \
|
| 36 |
+
--task-name "Qwen2.5-1.5B/openllm/vllm" \
|
| 37 |
+
--benchmark-tasks openllm \
|
| 38 |
+
--max-model-len 4096 \
|
| 39 |
+
--add-bos-token \
|
| 40 |
+
--batch-size auto \
|
| 41 |
+
--enable-chunked-prefill \
|
| 42 |
+
--gpu-memory-utilization 0.9 \
|
| 43 |
+
--max-num-batched-tokens 256
|
| 44 |
+
|
| 45 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 46 |
+
--model-id 328b46a2fc3d431c95a8d8900f1b88ce \
|
| 47 |
+
--clearml-model \
|
| 48 |
+
--queue-name oneshot-a100x1 \
|
| 49 |
+
--project-name "LLM quantization - W4A16/llmcompressor/Qwen2.5" \
|
| 50 |
+
--task-name "Qwen2.5-1.5B-Instruct/openllm/vllm" \
|
| 51 |
+
--benchmark-tasks openllm \
|
| 52 |
+
--max-model-len 4096 \
|
| 53 |
+
--add-bos-token \
|
| 54 |
+
--batch-size auto \
|
| 55 |
+
--enable-chunked-prefill \
|
| 56 |
+
--gpu-memory-utilization 0.9 \
|
| 57 |
+
--max-num-batched-tokens 256
|
| 58 |
+
|
| 59 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 60 |
+
--model-id 29785c1921db4680a1bcbb1c32b1bb0c \
|
| 61 |
+
--clearml-model \
|
| 62 |
+
--queue-name oneshot-a100x1 \
|
| 63 |
+
--project-name "LLM quantization - W4A16/llmcompressor/Qwen2.5" \
|
| 64 |
+
--task-name "Qwen2.5-3B/openllm/vllm" \
|
| 65 |
+
--benchmark-tasks openllm \
|
| 66 |
+
--max-model-len 4096 \
|
| 67 |
+
--add-bos-token \
|
| 68 |
+
--batch-size auto \
|
| 69 |
+
--enable-chunked-prefill \
|
| 70 |
+
--gpu-memory-utilization 0.9 \
|
| 71 |
+
--max-num-batched-tokens 256
|
| 72 |
+
|
| 73 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 74 |
+
--model-id bb496bef644e4336afba44944aa8e915 \
|
| 75 |
+
--clearml-model \
|
| 76 |
+
--queue-name oneshot-a100x1 \
|
| 77 |
+
--project-name "LLM quantization - W4A16/llmcompressor/Qwen2.5" \
|
| 78 |
+
--task-name "Qwen2.5-3B-Instruct/openllm/vllm" \
|
| 79 |
+
--benchmark-tasks openllm \
|
| 80 |
+
--max-model-len 4096 \
|
| 81 |
+
--add-bos-token \
|
| 82 |
+
--batch-size auto \
|
| 83 |
+
--enable-chunked-prefill \
|
| 84 |
+
--gpu-memory-utilization 0.9 \
|
| 85 |
+
--max-num-batched-tokens 256
|
| 86 |
+
|
| 87 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 88 |
+
--model-id 7ab087ff9ada4033a52350078a2a665a \
|
| 89 |
+
--clearml-model \
|
| 90 |
+
--queue-name oneshot-a100x1 \
|
| 91 |
+
--project-name "LLM quantization - W4A16/llmcompressor/Qwen2.5" \
|
| 92 |
+
--task-name "Qwen2.5-7B/openllm/vllm" \
|
| 93 |
+
--benchmark-tasks openllm \
|
| 94 |
+
--max-model-len 4096 \
|
| 95 |
+
--add-bos-token \
|
| 96 |
+
--batch-size auto \
|
| 97 |
+
--enable-chunked-prefill \
|
| 98 |
+
--gpu-memory-utilization 0.9 \
|
| 99 |
+
--max-num-batched-tokens 256
|
| 100 |
+
|
| 101 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 102 |
+
--model-id 0a0cf6bf30c94a3ea96babf1c92f23a2 \
|
| 103 |
+
--clearml-model \
|
| 104 |
+
--queue-name oneshot-a100x1 \
|
| 105 |
+
--project-name "LLM quantization - W4A16/llmcompressor/Qwen2.5" \
|
| 106 |
+
--task-name "Qwen2.5-7B-Instruct/openllm/vllm" \
|
| 107 |
+
--benchmark-tasks openllm \
|
| 108 |
+
--max-model-len 4096 \
|
| 109 |
+
--add-bos-token \
|
| 110 |
+
--batch-size auto \
|
| 111 |
+
--enable-chunked-prefill \
|
| 112 |
+
--gpu-memory-utilization 0.9 \
|
| 113 |
+
--max-num-batched-tokens 256
|
| 114 |
+
|
| 115 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 116 |
+
--model-id 7da4e5e0e2db49c984d3e195ab09ae09 \
|
| 117 |
+
--clearml-model \
|
| 118 |
+
--queue-name oneshot-a100x1 \
|
| 119 |
+
--project-name "LLM quantization - W4A16/llmcompressor/Qwen2.5" \
|
| 120 |
+
--task-name "Qwen2.5-32B/openllm/vllm" \
|
| 121 |
+
--benchmark-tasks openllm \
|
| 122 |
+
--max-model-len 4096 \
|
| 123 |
+
--add-bos-token \
|
| 124 |
+
--batch-size auto \
|
| 125 |
+
--enable-chunked-prefill \
|
| 126 |
+
--gpu-memory-utilization 0.9 \
|
| 127 |
+
--max-num-batched-tokens 256
|
| 128 |
+
|
| 129 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 130 |
+
--model-id 79b0a7340f2247068940085e66a7e17b \
|
| 131 |
+
--clearml-model \
|
| 132 |
+
--queue-name oneshot-a100x1 \
|
| 133 |
+
--project-name "LLM quantization - W4A16/llmcompressor/Qwen2.5" \
|
| 134 |
+
--task-name "Qwen2.5-32B-Instruct/openllm/vllm" \
|
| 135 |
+
--benchmark-tasks openllm \
|
| 136 |
+
--max-model-len 4096 \
|
| 137 |
+
--add-bos-token \
|
| 138 |
+
--batch-size auto \
|
| 139 |
+
--enable-chunked-prefill \
|
| 140 |
+
--gpu-memory-utilization 0.9 \
|
| 141 |
+
--max-num-batched-tokens 256
|
evaluate_qwen2.5_w8a16.sh
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
source ~/environments/clearml/bin/activate
|
| 2 |
+
|
| 3 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 4 |
+
--model-id 0fe5857173ac484a89316214b14fcf96 \
|
| 5 |
+
--clearml-model \
|
| 6 |
+
--queue-name oneshot-a100x2 \
|
| 7 |
+
--project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
|
| 8 |
+
--task-name "Qwen2.5-72B-Instruct/openllm/vllm" \
|
| 9 |
+
--benchmark-tasks openllm \
|
| 10 |
+
--max-model-len 4096 \
|
| 11 |
+
--add-bos-token \
|
| 12 |
+
--batch-size auto \
|
| 13 |
+
--enable-chunked-prefill \
|
| 14 |
+
--gpu-memory-utilization 0.9 \
|
| 15 |
+
--max-num-batched-tokens 256
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
<<END
|
| 19 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 20 |
+
--model-id 6a4ecaa68a6e45ea80c62680b0a65aa0 \
|
| 21 |
+
--clearml-model \
|
| 22 |
+
--queue-name oneshot-a100x2 \
|
| 23 |
+
--project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
|
| 24 |
+
--task-name "Qwen2.5-72B/openllm/vllm" \
|
| 25 |
+
--benchmark-tasks openllm \
|
| 26 |
+
--max-model-len 4096 \
|
| 27 |
+
--add-bos-token \
|
| 28 |
+
--batch-size auto \
|
| 29 |
+
--enable-chunked-prefill \
|
| 30 |
+
--gpu-memory-utilization 0.9 \
|
| 31 |
+
--max-num-batched-tokens 256
|
| 32 |
+
|
| 33 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 34 |
+
--model-id c67be85bc77f462c93381280019dea1d \
|
| 35 |
+
--clearml-model \
|
| 36 |
+
--queue-name oneshot-a100x4 \
|
| 37 |
+
--project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
|
| 38 |
+
--task-name "Qwen2.5-7B/openllm/vllm" \
|
| 39 |
+
--benchmark-tasks openllm \
|
| 40 |
+
--max-model-len 4096 \
|
| 41 |
+
--add-bos-token \
|
| 42 |
+
--batch-size auto \
|
| 43 |
+
--enable-chunked-prefill \
|
| 44 |
+
--gpu-memory-utilization 0.9 \
|
| 45 |
+
--max-num-batched-tokens 256
|
| 46 |
+
|
| 47 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 48 |
+
--model-id effae214cc464181a92d5a57df10f3d6 \
|
| 49 |
+
--clearml-model \
|
| 50 |
+
--queue-name oneshot-a100x1 \
|
| 51 |
+
--project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
|
| 52 |
+
--task-name "Qwen2.5-0.5B/openllm/vllm" \
|
| 53 |
+
--benchmark-tasks openllm \
|
| 54 |
+
--max-model-len 4096 \
|
| 55 |
+
--add-bos-token \
|
| 56 |
+
--batch-size auto \
|
| 57 |
+
--enable-chunked-prefill \
|
| 58 |
+
--gpu-memory-utilization 0.9 \
|
| 59 |
+
--max-num-batched-tokens 256
|
| 60 |
+
|
| 61 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 62 |
+
--model-id 21dd39ef3013401d84b258410647e847 \
|
| 63 |
+
--clearml-model \
|
| 64 |
+
--queue-name oneshot-a100x1 \
|
| 65 |
+
--project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
|
| 66 |
+
--task-name "Qwen2.5-0.5B-Instruct/openllm/vllm" \
|
| 67 |
+
--benchmark-tasks openllm \
|
| 68 |
+
--max-model-len 4096 \
|
| 69 |
+
--add-bos-token \
|
| 70 |
+
--batch-size auto \
|
| 71 |
+
--enable-chunked-prefill \
|
| 72 |
+
--gpu-memory-utilization 0.9 \
|
| 73 |
+
--max-num-batched-tokens 256
|
| 74 |
+
|
| 75 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 76 |
+
--model-id 324acdd4c7c4426dbdfeb29667dc4b53 \
|
| 77 |
+
--clearml-model \
|
| 78 |
+
--queue-name oneshot-a100x1 \
|
| 79 |
+
--project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
|
| 80 |
+
--task-name "Qwen2.5-1.5B/openllm/vllm" \
|
| 81 |
+
--benchmark-tasks openllm \
|
| 82 |
+
--max-model-len 4096 \
|
| 83 |
+
--add-bos-token \
|
| 84 |
+
--batch-size auto \
|
| 85 |
+
--enable-chunked-prefill \
|
| 86 |
+
--gpu-memory-utilization 0.9 \
|
| 87 |
+
--max-num-batched-tokens 256
|
| 88 |
+
|
| 89 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 90 |
+
--model-id 82cbaa6e27c84f08ac10e9f115034b0b \
|
| 91 |
+
--clearml-model \
|
| 92 |
+
--queue-name oneshot-a100x1 \
|
| 93 |
+
--project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
|
| 94 |
+
--task-name "Qwen2.5-1.5B-Instruct/openllm/vllm" \
|
| 95 |
+
--benchmark-tasks openllm \
|
| 96 |
+
--max-model-len 4096 \
|
| 97 |
+
--add-bos-token \
|
| 98 |
+
--batch-size auto \
|
| 99 |
+
--enable-chunked-prefill \
|
| 100 |
+
--gpu-memory-utilization 0.9 \
|
| 101 |
+
--max-num-batched-tokens 256
|
| 102 |
+
|
| 103 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 104 |
+
--model-id 337f5b50610443c7ad2a380dce8e0be8 \
|
| 105 |
+
--clearml-model \
|
| 106 |
+
--queue-name oneshot-a100x1 \
|
| 107 |
+
--project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
|
| 108 |
+
--task-name "Qwen2.5-3B/openllm/vllm" \
|
| 109 |
+
--benchmark-tasks openllm \
|
| 110 |
+
--max-model-len 4096 \
|
| 111 |
+
--add-bos-token \
|
| 112 |
+
--batch-size auto \
|
| 113 |
+
--enable-chunked-prefill \
|
| 114 |
+
--gpu-memory-utilization 0.9 \
|
| 115 |
+
--max-num-batched-tokens 256
|
| 116 |
+
|
| 117 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 118 |
+
--model-id a8ebd9cae5324572906d50f95eeee5dd \
|
| 119 |
+
--clearml-model \
|
| 120 |
+
--queue-name oneshot-a100x1 \
|
| 121 |
+
--project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
|
| 122 |
+
--task-name "Qwen2.5-3B-Instruct/openllm/vllm" \
|
| 123 |
+
--benchmark-tasks openllm \
|
| 124 |
+
--max-model-len 4096 \
|
| 125 |
+
--add-bos-token \
|
| 126 |
+
--batch-size auto \
|
| 127 |
+
--enable-chunked-prefill \
|
| 128 |
+
--gpu-memory-utilization 0.9 \
|
| 129 |
+
--max-num-batched-tokens 256
|
| 130 |
+
|
| 131 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 132 |
+
--model-id 5190911e94a340988dac223c252e72a2 \
|
| 133 |
+
--clearml-model \
|
| 134 |
+
--queue-name oneshot-a100x1 \
|
| 135 |
+
--project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
|
| 136 |
+
--task-name "Qwen2.5-7B-Instruct/openllm/vllm" \
|
| 137 |
+
--benchmark-tasks openllm \
|
| 138 |
+
--max-model-len 4096 \
|
| 139 |
+
--add-bos-token \
|
| 140 |
+
--batch-size auto \
|
| 141 |
+
--enable-chunked-prefill \
|
| 142 |
+
--gpu-memory-utilization 0.9 \
|
| 143 |
+
--max-num-batched-tokens 256
|
| 144 |
+
END
|
evaluate_qwen2.5_w8a8.sh
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
source ~/environments/clearml/bin/activate
|
| 2 |
+
|
| 3 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 4 |
+
--model-id 0909a970c53644f593c62b2e076ee763 \
|
| 5 |
+
--clearml-model \
|
| 6 |
+
--queue-name oneshot-a100x1 \
|
| 7 |
+
--project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
|
| 8 |
+
--task-name "Qwen2.5-7B/openllm/vllm" \
|
| 9 |
+
--benchmark-tasks openllm \
|
| 10 |
+
--max-model-len 4096 \
|
| 11 |
+
--add-bos-token \
|
| 12 |
+
--batch-size auto \
|
| 13 |
+
--enable-chunked-prefill \
|
| 14 |
+
--gpu-memory-utilization 0.9 \
|
| 15 |
+
--max-num-batched-tokens 256
|
| 16 |
+
|
| 17 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 18 |
+
--model-id 0323219f7a8b4e938c2583ab17851b91 \
|
| 19 |
+
--clearml-model \
|
| 20 |
+
--queue-name oneshot-a100x2 \
|
| 21 |
+
--project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
|
| 22 |
+
--task-name "Qwen2.5-72B/openllm/vllm" \
|
| 23 |
+
--benchmark-tasks openllm \
|
| 24 |
+
--max-model-len 4096 \
|
| 25 |
+
--add-bos-token \
|
| 26 |
+
--batch-size auto \
|
| 27 |
+
--enable-chunked-prefill \
|
| 28 |
+
--gpu-memory-utilization 0.9 \
|
| 29 |
+
--max-num-batched-tokens 256
|
| 30 |
+
|
| 31 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 32 |
+
--model-id 31c634f743fe458687556bc85a6aeb32 \
|
| 33 |
+
--clearml-model \
|
| 34 |
+
--queue-name oneshot-a100x2 \
|
| 35 |
+
--project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
|
| 36 |
+
--task-name "Qwen2.5-72B-Instruct/openllm/vllm" \
|
| 37 |
+
--benchmark-tasks openllm \
|
| 38 |
+
--max-model-len 4096 \
|
| 39 |
+
--add-bos-token \
|
| 40 |
+
--batch-size auto \
|
| 41 |
+
--enable-chunked-prefill \
|
| 42 |
+
--gpu-memory-utilization 0.9 \
|
| 43 |
+
--max-num-batched-tokens 256
|
| 44 |
+
|
| 45 |
+
<<END
|
| 46 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 47 |
+
--model-id ee78f00c9e924db19e8d763442f45209 \
|
| 48 |
+
--clearml-model \
|
| 49 |
+
--queue-name oneshot-a100x1 \
|
| 50 |
+
--project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
|
| 51 |
+
--task-name "Qwen2.5-0.5B/openllm/vllm" \
|
| 52 |
+
--benchmark-tasks openllm \
|
| 53 |
+
--max-model-len 4096 \
|
| 54 |
+
--add-bos-token \
|
| 55 |
+
--batch-size auto \
|
| 56 |
+
--enable-chunked-prefill \
|
| 57 |
+
--gpu-memory-utilization 0.9 \
|
| 58 |
+
--max-num-batched-tokens 256
|
| 59 |
+
|
| 60 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 61 |
+
--model-id 5baf53c17abf460d95a5d04bf4ec184d \
|
| 62 |
+
--clearml-model \
|
| 63 |
+
--queue-name oneshot-a100x1 \
|
| 64 |
+
--project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
|
| 65 |
+
--task-name "Qwen2.5-0.5B-Instruct/openllm/vllm" \
|
| 66 |
+
--benchmark-tasks openllm \
|
| 67 |
+
--max-model-len 4096 \
|
| 68 |
+
--add-bos-token \
|
| 69 |
+
--batch-size auto \
|
| 70 |
+
--enable-chunked-prefill \
|
| 71 |
+
--gpu-memory-utilization 0.9 \
|
| 72 |
+
--max-num-batched-tokens 256
|
| 73 |
+
|
| 74 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 75 |
+
--model-id 0dd5e7b10a97426b92d3f7278a1c47ad \
|
| 76 |
+
--clearml-model \
|
| 77 |
+
--queue-name oneshot-a100x1 \
|
| 78 |
+
--project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
|
| 79 |
+
--task-name "Qwen2.5-1.5B/openllm/vllm" \
|
| 80 |
+
--benchmark-tasks openllm \
|
| 81 |
+
--max-model-len 4096 \
|
| 82 |
+
--add-bos-token \
|
| 83 |
+
--batch-size auto \
|
| 84 |
+
--enable-chunked-prefill \
|
| 85 |
+
--gpu-memory-utilization 0.9 \
|
| 86 |
+
--max-num-batched-tokens 256
|
| 87 |
+
|
| 88 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 89 |
+
--model-id c63fe381f1e94c2880ce6f8c15465ed4 \
|
| 90 |
+
--clearml-model \
|
| 91 |
+
--queue-name oneshot-a100x1 \
|
| 92 |
+
--project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
|
| 93 |
+
--task-name "Qwen2.5-1.5B-Instruct/openllm/vllm" \
|
| 94 |
+
--benchmark-tasks openllm \
|
| 95 |
+
--max-model-len 4096 \
|
| 96 |
+
--add-bos-token \
|
| 97 |
+
--batch-size auto \
|
| 98 |
+
--enable-chunked-prefill \
|
| 99 |
+
--gpu-memory-utilization 0.9 \
|
| 100 |
+
--max-num-batched-tokens 256
|
| 101 |
+
|
| 102 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 103 |
+
--model-id a1ed1bec7c4540eba02cae98c03cada9 \
|
| 104 |
+
--clearml-model \
|
| 105 |
+
--queue-name oneshot-a100x1 \
|
| 106 |
+
--project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
|
| 107 |
+
--task-name "Qwen2.5-3B/openllm/vllm" \
|
| 108 |
+
--benchmark-tasks openllm \
|
| 109 |
+
--max-model-len 4096 \
|
| 110 |
+
--add-bos-token \
|
| 111 |
+
--batch-size auto \
|
| 112 |
+
--enable-chunked-prefill \
|
| 113 |
+
--gpu-memory-utilization 0.9 \
|
| 114 |
+
--max-num-batched-tokens 256
|
| 115 |
+
|
| 116 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 117 |
+
--model-id d7392721b76c4a299ac494c7b3ce1299 \
|
| 118 |
+
--clearml-model \
|
| 119 |
+
--queue-name oneshot-a100x1 \
|
| 120 |
+
--project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
|
| 121 |
+
--task-name "Qwen2.5-3B-Instruct/openllm/vllm" \
|
| 122 |
+
--benchmark-tasks openllm \
|
| 123 |
+
--max-model-len 4096 \
|
| 124 |
+
--add-bos-token \
|
| 125 |
+
--batch-size auto \
|
| 126 |
+
--enable-chunked-prefill \
|
| 127 |
+
--gpu-memory-utilization 0.9 \
|
| 128 |
+
--max-num-batched-tokens 256
|
| 129 |
+
|
| 130 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 131 |
+
--model-id 71606d92b022423591547683b81ebc91 \
|
| 132 |
+
--clearml-model \
|
| 133 |
+
--queue-name oneshot-a100x1 \
|
| 134 |
+
--project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
|
| 135 |
+
--task-name "Qwen2.5-7B-Instruct/openllm/vllm" \
|
| 136 |
+
--benchmark-tasks openllm \
|
| 137 |
+
--max-model-len 4096 \
|
| 138 |
+
--add-bos-token \
|
| 139 |
+
--batch-size auto \
|
| 140 |
+
--enable-chunked-prefill \
|
| 141 |
+
--gpu-memory-utilization 0.9 \
|
| 142 |
+
--max-num-batched-tokens 256
|
| 143 |
+
|
| 144 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 145 |
+
--model-id c9094d7778584311894b490d5c0d7e66 \
|
| 146 |
+
--clearml-model \
|
| 147 |
+
--queue-name oneshot-a100x1 \
|
| 148 |
+
--project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
|
| 149 |
+
--task-name "Qwen2.5-32B/openllm/vllm" \
|
| 150 |
+
--benchmark-tasks openllm \
|
| 151 |
+
--max-model-len 4096 \
|
| 152 |
+
--add-bos-token \
|
| 153 |
+
--batch-size auto \
|
| 154 |
+
--enable-chunked-prefill \
|
| 155 |
+
--gpu-memory-utilization 0.9 \
|
| 156 |
+
--max-num-batched-tokens 256
|
| 157 |
+
|
| 158 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 159 |
+
--model-id d2164c0031f2445eb726fc94c94210bd \
|
| 160 |
+
--clearml-model \
|
| 161 |
+
--queue-name oneshot-a100x1 \
|
| 162 |
+
--project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
|
| 163 |
+
--task-name "Qwen2.5-32B-Instruct/openllm/vllm" \
|
| 164 |
+
--benchmark-tasks openllm \
|
| 165 |
+
--max-model-len 4096 \
|
| 166 |
+
--add-bos-token \
|
| 167 |
+
--batch-size auto \
|
| 168 |
+
--enable-chunked-prefill \
|
| 169 |
+
--gpu-memory-utilization 0.9 \
|
| 170 |
+
--max-num-batched-tokens 256
|
| 171 |
+
END
|
quantize_qwen2.5_fp8.sh
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
source ~/environments/clearml/bin/activate
|
| 3 |
+
|
| 4 |
+
recipe_template=$(cat <<'EOF'
|
| 5 |
+
quant_stage:
|
| 6 |
+
quant_modifiers:
|
| 7 |
+
QuantizationModifier:
|
| 8 |
+
ignore: ["lm_head"]
|
| 9 |
+
scheme: FP8
|
| 10 |
+
targets: ["Linear"]
|
| 11 |
+
observer: "mse"
|
| 12 |
+
EOF
|
| 13 |
+
)
|
| 14 |
+
|
| 15 |
+
for size in 0.5B 1.5B 3B 7B 32B 72B
|
| 16 |
+
do
|
| 17 |
+
for version in base instruct
|
| 18 |
+
do
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
if [ $version = "base" ]; then
|
| 22 |
+
model="Qwen2.5-${size}"
|
| 23 |
+
else
|
| 24 |
+
model="Qwen2.5-${size}-Instruct"
|
| 25 |
+
fi
|
| 26 |
+
|
| 27 |
+
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196__damp01"
|
| 28 |
+
|
| 29 |
+
python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
|
| 30 |
+
--model-id "Qwen/"$model \
|
| 31 |
+
--project-name "LLM quantization - FP8/llmcompressor/Qwen2.5" \
|
| 32 |
+
--task-prefix $prefix \
|
| 33 |
+
--recipe "${recipe}" \
|
| 34 |
+
--num-samples 512 \
|
| 35 |
+
--max-seq-len 8196 \
|
| 36 |
+
--tags "Qwen2.5" "W4A16" "calibration" $size "MSE" $version
|
| 37 |
+
|
| 38 |
+
done
|
| 39 |
+
done
|
quantize_qwen2.5_w4a16.sh
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
source ~/environments/clearml/bin/activate
|
| 3 |
+
|
| 4 |
+
recipe_template=$(cat <<'EOF'
|
| 5 |
+
quant_stage:
|
| 6 |
+
quant_modifiers:
|
| 7 |
+
GPTQModifier:
|
| 8 |
+
sequential_update: true
|
| 9 |
+
dampening_frac: 0.1
|
| 10 |
+
ignore: ["lm_head"]
|
| 11 |
+
config_groups:
|
| 12 |
+
group_0:
|
| 13 |
+
weights:
|
| 14 |
+
num_bits: 4
|
| 15 |
+
type: "int"
|
| 16 |
+
symmetric: true
|
| 17 |
+
strategy: "group"
|
| 18 |
+
group_size: 128
|
| 19 |
+
actorder: "group"
|
| 20 |
+
targets: ["Linear"]
|
| 21 |
+
observer: "mse"
|
| 22 |
+
EOF
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
for size in 0.5B 1.5B 3B 7B 32B 72B
|
| 26 |
+
do
|
| 27 |
+
for version in base instruct
|
| 28 |
+
do
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
if [ $version = "base" ]; then
|
| 32 |
+
model="Qwen2.5-${size}"
|
| 33 |
+
else
|
| 34 |
+
model="Qwen2.5-${size}-Instruct"
|
| 35 |
+
fi
|
| 36 |
+
|
| 37 |
+
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196__damp01"
|
| 38 |
+
|
| 39 |
+
python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
|
| 40 |
+
--model-id "Qwen/"$model \
|
| 41 |
+
--project-name "LLM quantization - W4A16/llmcompressor/Qwen2.5" \
|
| 42 |
+
--task-prefix $prefix \
|
| 43 |
+
--recipe "${recipe}" \
|
| 44 |
+
--num-samples 512 \
|
| 45 |
+
--max-seq-len 8196 \
|
| 46 |
+
--tags "Qwen2.5" "W4A16" "calibration" $size "MSE" $version
|
| 47 |
+
|
| 48 |
+
done
|
| 49 |
+
done
|
quantize_qwen2.5_w8a16.sh
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Queue W8A16 GPTQ quantization jobs for Qwen2.5 checkpoints via the ClearML
# oneshot pipeline. Currently active: 3B instruct only; earlier runs are kept
# below in an inert heredoc for reference.
source ~/environments/clearml/bin/activate

# W8A16 GPTQ recipe with the MSE observer; lm_head is left unquantized.
# BUGFIX: the original assigned this heredoc to `recipe_template` but then
# passed --recipe "${recipe}", which expanded to an empty string. This script
# does no per-run templating, so assign directly to `recipe`.
recipe=$(cat <<'EOF'
quant_stage:
  quant_modifiers:
    GPTQModifier:
      sequential_update: true
      dampening_frac: 0.1
      ignore: ["lm_head"]
      scheme: "W8A16"
      targets: "Linear"
      observer: "mse"
EOF
)

for size in 3B
do
for version in instruct
do

    if [ $version = "base" ]; then
        model="Qwen2.5-${size}"
    else
        model="Qwen2.5-${size}-Instruct"
    fi

    prefix="${model//./_}""__llm_compressor__calibration__mse__1024__8196_damp01"

    # NOTE(review): project name says "W8A8" although this is a W8A16 run
    # (tags and the historical runs below use W8A16) — confirm whether this
    # destination is intentional.
    python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
        --model-id "Qwen/"$model \
        --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
        --task-prefix $prefix \
        --recipe "${recipe}" \
        --num-samples 1024 \
        --max-seq-len 8196 \
        --tags "Qwen2.5" "W8A16" "calibration" $size "MSE"

done
done

# Historical runs, never executed. ROBUSTNESS: the original used a bare,
# unquoted `<<END` heredoc as a block comment; an unquoted delimiter keeps
# expansions live in the heredoc body, so any $(...) there would execute.
# Feeding it to `:` with a quoted delimiter makes it fully inert.
: <<'END'
size="72B"
model="Qwen2.5-${size}-Instruct"
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196"

python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
    --model-id "Qwen/"$model \
    --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
    --oneshot-queue oneshot-a100x4 \
    --evaluation-queue oneshot-a100x4 \
    --task-prefix $prefix \
    --recipe "/network/alexandre/quantization/recipe_w8a16_damp01_mse_sequential.yaml" \
    --num-samples 512 \
    --max-seq-len 8196 \
    --tags "Qwen2.5" "W8A16" "calibration" $size "MSE" "Instruct"

size="32B"
model="Qwen2.5-${size}"
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196"

python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
    --model-id "Qwen/"$model \
    --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
    --oneshot-queue oneshot-a100x2 \
    --evaluation-queue oneshot-a100x2 \
    --task-prefix $prefix \
    --recipe "/network/alexandre/quantization/recipe_w8a16_damp01_mse_sequential.yaml" \
    --num-samples 512 \
    --max-seq-len 8196 \
    --tags "Qwen2.5" "W8A16" "calibration" $size "MSE"

model="Qwen2.5-${size}-Instruct"
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196"

python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
    --model-id "Qwen/"$model \
    --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
    --oneshot-queue oneshot-a100x2 \
    --evaluation-queue oneshot-a100x2 \
    --task-prefix $prefix \
    --recipe "/network/alexandre/quantization/recipe_w8a16_damp01_mse_sequential.yaml" \
    --num-samples 512 \
    --max-seq-len 8196 \
    --tags "Qwen2.5" "W8A16" "calibration" $size "MSE" "Instruct"

size="72B"
model="Qwen2.5-${size}"
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196"

python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
    --model-id "Qwen/"$model \
    --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
    --oneshot-queue oneshot-a100x4 \
    --evaluation-queue oneshot-a100x2 \
    --task-prefix $prefix \
    --recipe "/network/alexandre/quantization/recipe_w8a16_damp01_mse_sequential.yaml" \
    --num-samples 512 \
    --max-seq-len 8196 \
    --tags "Qwen2.5" "W8A16" "calibration" $size "MSE"

model="Qwen2.5-${size}-Instruct"
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196"

python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
    --model-id "Qwen/"$model \
    --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
    --oneshot-queue oneshot-a100x4 \
    --evaluation-queue oneshot-a100x2 \
    --task-prefix $prefix \
    --recipe "/network/alexandre/quantization/recipe_w8a16_mse_damp01.yaml" \
    --num-samples 512 \
    --max-seq-len 8196 \
    --tags "Qwen2.5" "W8A16" "calibration" $size "MSE" "Instruct"

size="7B"
model="Qwen2.5-${size}"
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196"

python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
    --model-id "Qwen/"$model \
    --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
    --task-prefix $prefix \
    --recipe "/network/alexandre/quantization/recipe_w8a16_damp01_mse_sequential.yaml" \
    --num-samples 512 \
    --max-seq-len 8196 \
    --tags "Qwen2.5" "W8A16" "calibration" $size "MSE"

for size in 0.5B 1.5B 3B 7B
do

model="Qwen2.5-${size}"
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196"

python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
    --model-id "Qwen/"$model \
    --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
    --task-prefix $prefix \
    --recipe "/network/alexandre/quantization/recipe_w8a16_mse.yaml" \
    --num-samples 512 \
    --max-seq-len 8196 \
    --tags "Qwen2.5" "w8a16" "calibration" $size "MSE"

model="Qwen2.5-${size}-Instruct"
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196"

python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
    --model-id "Qwen/"$model \
    --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
    --task-prefix $prefix \
    --recipe "/network/alexandre/quantization/recipe_w8a16_mse.yaml" \
    --num-samples 512 \
    --max-seq-len 8196 \
    --tags "Qwen2.5" "w8a16" "calibration" $size "MSE" "Instruct"

done

for size in 32B 72B
do

model="Qwen2.5-${size}"
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196"

python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
    --model-id "Qwen/"$model \
    --project-name "LLM quantization - w8a16/llmcompressor/Qwen2.5" \
    --task-prefix $prefix \
    --oneshot-queue oneshot-a100x2 \
    --evaluation-queue oneshot-a100x2 \
    --recipe "/network/alexandre/quantization/recipe_w8a16_mse.yaml" \
    --num-samples 512 \
    --max-seq-len 8196 \
    --tags "Qwen2.5" "w8a16" "calibration" $size "MSE"

model="Qwen2.5-${size}-Instruct"
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196"

python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
    --model-id "Qwen/"$model \
    --project-name "LLM quantization - w8a16/llmcompressor/Qwen2.5" \
    --task-prefix $prefix \
    --oneshot-queue oneshot-a100x2 \
    --evaluation-queue oneshot-a100x2 \
    --recipe "/network/alexandre/quantization/recipe_w8a16_mse.yaml" \
    --num-samples 512 \
    --max-seq-len 8196 \
    --tags "Qwen2.5" "w8a16" "calibration" $size "MSE" "Instruct"

done
END
|
quantize_qwen2.5_w8a8.sh
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Queue W8A8 GPTQ quantization jobs for selected Qwen2.5 checkpoints via the
# ClearML oneshot pipeline. Active: 7B base (default queue) and 72B base +
# instruct (a100x4 queues); earlier sweeps are kept inert below.
source ~/environments/clearml/bin/activate

size="7B"
model="Qwen2.5-${size}"
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196"

python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
    --model-id "Qwen/"$model \
    --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
    --task-prefix $prefix \
    --recipe "/network/alexandre/quantization/recipe_w8a8_mse_damp01.yaml" \
    --num-samples 512 \
    --max-seq-len 8196 \
    --tags "Qwen2.5" "W8A8" "calibration" $size "MSE"

# 72B needs multi-GPU queues for both the oneshot and the evaluation stage.
size="72B"
model="Qwen2.5-${size}"
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196"

python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
    --model-id "Qwen/"$model \
    --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
    --oneshot-queue oneshot-a100x4 \
    --evaluation-queue oneshot-a100x4 \
    --task-prefix $prefix \
    --recipe "/network/alexandre/quantization/recipe_w8a8_mse_damp01.yaml" \
    --num-samples 512 \
    --max-seq-len 8196 \
    --tags "Qwen2.5" "W8A8" "calibration" $size "MSE"

model="Qwen2.5-${size}-Instruct"
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196"

python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
    --model-id "Qwen/"$model \
    --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
    --oneshot-queue oneshot-a100x4 \
    --evaluation-queue oneshot-a100x4 \
    --task-prefix $prefix \
    --recipe "/network/alexandre/quantization/recipe_w8a8_mse_damp01.yaml" \
    --num-samples 512 \
    --max-seq-len 8196 \
    --tags "Qwen2.5" "W8A8" "calibration" $size "MSE" "Instruct"


# Historical runs, never executed. ROBUSTNESS: the original used a bare,
# unquoted `<<END` heredoc as a block comment; an unquoted delimiter keeps
# expansions live in the heredoc body, so any $(...) there would execute.
# Feeding it to `:` with a quoted delimiter makes it fully inert.
: <<'END'
for size in 0.5B 1.5B 3B 7B
do

model="Qwen2.5-${size}"
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196"

python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
    --model-id "Qwen/"$model \
    --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
    --task-prefix $prefix \
    --recipe "/network/alexandre/quantization/recipe_w8a8_mse.yaml" \
    --num-samples 512 \
    --max-seq-len 8196 \
    --tags "Qwen2.5" "W8A8" "calibration" $size "MSE"

model="Qwen2.5-${size}-Instruct"
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196"

python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
    --model-id "Qwen/"$model \
    --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
    --task-prefix $prefix \
    --recipe "/network/alexandre/quantization/recipe_w8a8_mse.yaml" \
    --num-samples 512 \
    --max-seq-len 8196 \
    --tags "Qwen2.5" "W8A8" "calibration" $size "MSE" "Instruct"

done

for size in 32B 72B
do

model="Qwen2.5-${size}"
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196"

python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
    --model-id "Qwen/"$model \
    --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
    --task-prefix $prefix \
    --oneshot-queue oneshot-a100x2 \
    --evaluation-queue oneshot-a100x2 \
    --recipe "/network/alexandre/quantization/recipe_w8a8_mse.yaml" \
    --num-samples 512 \
    --max-seq-len 8196 \
    --tags "Qwen2.5" "W8A8" "calibration" $size "MSE"

model="Qwen2.5-${size}-Instruct"
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196"

python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
    --model-id "Qwen/"$model \
    --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
    --task-prefix $prefix \
    --oneshot-queue oneshot-a100x2 \
    --evaluation-queue oneshot-a100x2 \
    --recipe "/network/alexandre/quantization/recipe_w8a8_mse.yaml" \
    --num-samples 512 \
    --max-seq-len 8196 \
    --tags "Qwen2.5" "W8A8" "calibration" $size "MSE" "Instruct"

done
END
|
quantize_qwen2.5_w8a8_sq.sh
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Queue W8A8 SmoothQuant + GPTQ quantization jobs for Qwen2.5 checkpoints
# via the ClearML oneshot pipeline, sweeping the SmoothQuant strength.
source ~/environments/clearml/bin/activate


# Recipe template: SmoothQuant (strength filled in per run) followed by
# W8A8 GPTQ with the MSE observer; lm_head stays unquantized. The quoted
# delimiter keeps the YAML literal.
recipe_template=$(cat <<'EOF'
quant_stage:
  quant_modifiers:
    SmoothQuantModifier:
      smoothing_strength: 0.0
      mappings:
        - [["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"], "re:.*input_layernorm"]
        - [["re:.*gate_proj", "re:.*up_proj"], "re:.*post_attention_layernorm"]
        - [["re:.*down_proj"], "re:.*up_proj"]
    GPTQModifier:
      sequential_update: true
      dampening_frac: 0.05
      ignore: ["lm_head"]
      scheme: "W8A8"
      targets: "Linear"
      observer: "mse"
EOF
)

for size in 3B; do
  for version in base; do
    for sq in 0.9; do

      # Inject the sweep value with bash substitution (first match only —
      # same effect as the original sed, without spawning a process).
      recipe="${recipe_template/smoothing_strength: 0.0/smoothing_strength: ${sq}}"

      model="Qwen2.5-${size}"
      if [ $version != "base" ]; then
          model="Qwen2.5-${size}-Instruct"
      fi

      # Task prefix encodes model + calibration settings; "0.9" becomes
      # "09" so the name stays dot-free.
      prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196_damp005_sq${sq//0./0}"

      python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
          --model-id "Qwen/"$model \
          --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
          --task-prefix $prefix \
          --recipe "${recipe}" \
          --num-samples 512 \
          --max-seq-len 8196 \
          --tags "Qwen2.5" "W8A8" "calibration" $size "MSE" "SQ"

    done
  done
done
|