Upload folder using huggingface_hub
Browse files- evaluate_qwen2.5_bf16.sh +98 -0
- evaluate_qwen2.5_w4a16.sh +141 -0
- evaluate_qwen2.5_w8a16.sh +144 -0
- evaluate_qwen2.5_w8a8.sh +171 -0
- quantize_qwen2.5_fp8.sh +39 -0
- quantize_qwen2.5_w4a16.sh +49 -0
- quantize_qwen2.5_w8a16.sh +190 -0
- quantize_qwen2.5_w8a8.sh +109 -0
- quantize_qwen2.5_w8a8_sq.sh +52 -0
evaluate_qwen2.5_bf16.sh
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
source ~/environments/clearml/bin/activate
|
| 2 |
+
|
| 3 |
+
for size in 0.5B 1.5B 3B 7B
|
| 4 |
+
do
|
| 5 |
+
|
| 6 |
+
model="Qwen2.5-${size}"
|
| 7 |
+
|
| 8 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 9 |
+
--model-id "Qwen/"$model \
|
| 10 |
+
--queue-name oneshot-a100x1 \
|
| 11 |
+
--project-name "LLM reference/vllm" \
|
| 12 |
+
--task-name "${model}/openllm" \
|
| 13 |
+
--benchmark-tasks openllm \
|
| 14 |
+
--max-model-len 4096 \
|
| 15 |
+
--add-bos-token \
|
| 16 |
+
--batch-size auto \
|
| 17 |
+
--enable-chunked-prefill \
|
| 18 |
+
--gpu-memory-utilization 0.9 \
|
| 19 |
+
--max-num-batched-tokens 256
|
| 20 |
+
|
| 21 |
+
model="Qwen2.5-${size}-Instruct"
|
| 22 |
+
|
| 23 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 24 |
+
--model-id "Qwen/"$model \
|
| 25 |
+
--queue-name oneshot-a100x1 \
|
| 26 |
+
--project-name "LLM reference/vllm" \
|
| 27 |
+
--task-name "${model}/openllm" \
|
| 28 |
+
--benchmark-tasks openllm \
|
| 29 |
+
--max-model-len 4096 \
|
| 30 |
+
--add-bos-token \
|
| 31 |
+
--batch-size auto \
|
| 32 |
+
--enable-chunked-prefill \
|
| 33 |
+
--gpu-memory-utilization 0.9 \
|
| 34 |
+
--max-num-batched-tokens 256
|
| 35 |
+
|
| 36 |
+
done
|
| 37 |
+
|
| 38 |
+
size=32B
|
| 39 |
+
model="Qwen2.5-${size}"
|
| 40 |
+
|
| 41 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 42 |
+
--model-id "Qwen/"$model \
|
| 43 |
+
--queue-name oneshot-a100x2 \
|
| 44 |
+
--project-name "LLM reference/vllm" \
|
| 45 |
+
--task-name "${model}/openllm" \
|
| 46 |
+
--benchmark-tasks openllm \
|
| 47 |
+
--max-model-len 4096 \
|
| 48 |
+
--add-bos-token \
|
| 49 |
+
--batch-size auto \
|
| 50 |
+
--enable-chunked-prefill \
|
| 51 |
+
--gpu-memory-utilization 0.9 \
|
| 52 |
+
--max-num-batched-tokens 256
|
| 53 |
+
|
| 54 |
+
model="Qwen2.5-${size}-Instruct"
|
| 55 |
+
|
| 56 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 57 |
+
--model-id "Qwen/"$model \
|
| 58 |
+
--queue-name oneshot-a100x2 \
|
| 59 |
+
--project-name "LLM reference/vllm" \
|
| 60 |
+
--task-name "${model}/openllm" \
|
| 61 |
+
--benchmark-tasks openllm \
|
| 62 |
+
--max-model-len 4096 \
|
| 63 |
+
--add-bos-token \
|
| 64 |
+
--batch-size auto \
|
| 65 |
+
--enable-chunked-prefill \
|
| 66 |
+
--gpu-memory-utilization 0.9 \
|
| 67 |
+
--max-num-batched-tokens 256
|
| 68 |
+
|
| 69 |
+
size=72B
|
| 70 |
+
model="Qwen2.5-${size}"
|
| 71 |
+
|
| 72 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 73 |
+
--model-id "Qwen/"$model \
|
| 74 |
+
--queue-name oneshot-a100x4 \
|
| 75 |
+
--project-name "LLM reference/vllm" \
|
| 76 |
+
--task-name "${model}/openllm" \
|
| 77 |
+
--benchmark-tasks openllm \
|
| 78 |
+
--max-model-len 4096 \
|
| 79 |
+
--add-bos-token \
|
| 80 |
+
--batch-size auto \
|
| 81 |
+
--enable-chunked-prefill \
|
| 82 |
+
--gpu-memory-utilization 0.9 \
|
| 83 |
+
--max-num-batched-tokens 256
|
| 84 |
+
|
| 85 |
+
model="Qwen2.5-${size}-Instruct"
|
| 86 |
+
|
| 87 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 88 |
+
--model-id "Qwen/"$model \
|
| 89 |
+
--queue-name oneshot-a100x4 \
|
| 90 |
+
--project-name "LLM reference/vllm" \
|
| 91 |
+
--task-name "${model}/openllm" \
|
| 92 |
+
--benchmark-tasks openllm \
|
| 93 |
+
--max-model-len 4096 \
|
| 94 |
+
--add-bos-token \
|
| 95 |
+
--batch-size auto \
|
| 96 |
+
--enable-chunked-prefill \
|
| 97 |
+
--gpu-memory-utilization 0.9 \
|
| 98 |
+
--max-num-batched-tokens 256
|
evaluate_qwen2.5_w4a16.sh
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
source ~/environments/clearml/bin/activate
|
| 2 |
+
|
| 3 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 4 |
+
--model-id ece15d6b3b574a9ba3062f8922dda9d9 \
|
| 5 |
+
--clearml-model \
|
| 6 |
+
--queue-name oneshot-a100x1 \
|
| 7 |
+
--project-name "LLM quantization - W4A16/llmcompressor/Qwen2.5" \
|
| 8 |
+
--task-name "Qwen2.5-0.5B/openllm/vllm" \
|
| 9 |
+
--benchmark-tasks openllm \
|
| 10 |
+
--max-model-len 4096 \
|
| 11 |
+
--add-bos-token \
|
| 12 |
+
--batch-size auto \
|
| 13 |
+
--enable-chunked-prefill \
|
| 14 |
+
--gpu-memory-utilization 0.9 \
|
| 15 |
+
--max-num-batched-tokens 256
|
| 16 |
+
|
| 17 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 18 |
+
--model-id b3c075c7685a42c798ebe91bdb1bb140 \
|
| 19 |
+
--clearml-model \
|
| 20 |
+
--queue-name oneshot-a100x1 \
|
| 21 |
+
--project-name "LLM quantization - W4A16/llmcompressor/Qwen2.5" \
|
| 22 |
+
--task-name "Qwen2.5-0.5B-Instruct/openllm/vllm" \
|
| 23 |
+
--benchmark-tasks openllm \
|
| 24 |
+
--max-model-len 4096 \
|
| 25 |
+
--add-bos-token \
|
| 26 |
+
--batch-size auto \
|
| 27 |
+
--enable-chunked-prefill \
|
| 28 |
+
--gpu-memory-utilization 0.9 \
|
| 29 |
+
--max-num-batched-tokens 256
|
| 30 |
+
|
| 31 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 32 |
+
--model-id 70d4376f576241eb9ca066ce2864c517 \
|
| 33 |
+
--clearml-model \
|
| 34 |
+
--queue-name oneshot-a100x1 \
|
| 35 |
+
--project-name "LLM quantization - W4A16/llmcompressor/Qwen2.5" \
|
| 36 |
+
--task-name "Qwen2.5-1.5B/openllm/vllm" \
|
| 37 |
+
--benchmark-tasks openllm \
|
| 38 |
+
--max-model-len 4096 \
|
| 39 |
+
--add-bos-token \
|
| 40 |
+
--batch-size auto \
|
| 41 |
+
--enable-chunked-prefill \
|
| 42 |
+
--gpu-memory-utilization 0.9 \
|
| 43 |
+
--max-num-batched-tokens 256
|
| 44 |
+
|
| 45 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 46 |
+
--model-id 328b46a2fc3d431c95a8d8900f1b88ce \
|
| 47 |
+
--clearml-model \
|
| 48 |
+
--queue-name oneshot-a100x1 \
|
| 49 |
+
--project-name "LLM quantization - W4A16/llmcompressor/Qwen2.5" \
|
| 50 |
+
--task-name "Qwen2.5-1.5B-Instruct/openllm/vllm" \
|
| 51 |
+
--benchmark-tasks openllm \
|
| 52 |
+
--max-model-len 4096 \
|
| 53 |
+
--add-bos-token \
|
| 54 |
+
--batch-size auto \
|
| 55 |
+
--enable-chunked-prefill \
|
| 56 |
+
--gpu-memory-utilization 0.9 \
|
| 57 |
+
--max-num-batched-tokens 256
|
| 58 |
+
|
| 59 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 60 |
+
--model-id 29785c1921db4680a1bcbb1c32b1bb0c \
|
| 61 |
+
--clearml-model \
|
| 62 |
+
--queue-name oneshot-a100x1 \
|
| 63 |
+
--project-name "LLM quantization - W4A16/llmcompressor/Qwen2.5" \
|
| 64 |
+
--task-name "Qwen2.5-3B/openllm/vllm" \
|
| 65 |
+
--benchmark-tasks openllm \
|
| 66 |
+
--max-model-len 4096 \
|
| 67 |
+
--add-bos-token \
|
| 68 |
+
--batch-size auto \
|
| 69 |
+
--enable-chunked-prefill \
|
| 70 |
+
--gpu-memory-utilization 0.9 \
|
| 71 |
+
--max-num-batched-tokens 256
|
| 72 |
+
|
| 73 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 74 |
+
--model-id bb496bef644e4336afba44944aa8e915 \
|
| 75 |
+
--clearml-model \
|
| 76 |
+
--queue-name oneshot-a100x1 \
|
| 77 |
+
--project-name "LLM quantization - W4A16/llmcompressor/Qwen2.5" \
|
| 78 |
+
--task-name "Qwen2.5-3B-Instruct/openllm/vllm" \
|
| 79 |
+
--benchmark-tasks openllm \
|
| 80 |
+
--max-model-len 4096 \
|
| 81 |
+
--add-bos-token \
|
| 82 |
+
--batch-size auto \
|
| 83 |
+
--enable-chunked-prefill \
|
| 84 |
+
--gpu-memory-utilization 0.9 \
|
| 85 |
+
--max-num-batched-tokens 256
|
| 86 |
+
|
| 87 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 88 |
+
--model-id 7ab087ff9ada4033a52350078a2a665a \
|
| 89 |
+
--clearml-model \
|
| 90 |
+
--queue-name oneshot-a100x1 \
|
| 91 |
+
--project-name "LLM quantization - W4A16/llmcompressor/Qwen2.5" \
|
| 92 |
+
--task-name "Qwen2.5-7B/openllm/vllm" \
|
| 93 |
+
--benchmark-tasks openllm \
|
| 94 |
+
--max-model-len 4096 \
|
| 95 |
+
--add-bos-token \
|
| 96 |
+
--batch-size auto \
|
| 97 |
+
--enable-chunked-prefill \
|
| 98 |
+
--gpu-memory-utilization 0.9 \
|
| 99 |
+
--max-num-batched-tokens 256
|
| 100 |
+
|
| 101 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 102 |
+
--model-id 0a0cf6bf30c94a3ea96babf1c92f23a2 \
|
| 103 |
+
--clearml-model \
|
| 104 |
+
--queue-name oneshot-a100x1 \
|
| 105 |
+
--project-name "LLM quantization - W4A16/llmcompressor/Qwen2.5" \
|
| 106 |
+
--task-name "Qwen2.5-7B-Instruct/openllm/vllm" \
|
| 107 |
+
--benchmark-tasks openllm \
|
| 108 |
+
--max-model-len 4096 \
|
| 109 |
+
--add-bos-token \
|
| 110 |
+
--batch-size auto \
|
| 111 |
+
--enable-chunked-prefill \
|
| 112 |
+
--gpu-memory-utilization 0.9 \
|
| 113 |
+
--max-num-batched-tokens 256
|
| 114 |
+
|
| 115 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 116 |
+
--model-id 7da4e5e0e2db49c984d3e195ab09ae09 \
|
| 117 |
+
--clearml-model \
|
| 118 |
+
--queue-name oneshot-a100x1 \
|
| 119 |
+
--project-name "LLM quantization - W4A16/llmcompressor/Qwen2.5" \
|
| 120 |
+
--task-name "Qwen2.5-32B/openllm/vllm" \
|
| 121 |
+
--benchmark-tasks openllm \
|
| 122 |
+
--max-model-len 4096 \
|
| 123 |
+
--add-bos-token \
|
| 124 |
+
--batch-size auto \
|
| 125 |
+
--enable-chunked-prefill \
|
| 126 |
+
--gpu-memory-utilization 0.9 \
|
| 127 |
+
--max-num-batched-tokens 256
|
| 128 |
+
|
| 129 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 130 |
+
--model-id 79b0a7340f2247068940085e66a7e17b \
|
| 131 |
+
--clearml-model \
|
| 132 |
+
--queue-name oneshot-a100x1 \
|
| 133 |
+
--project-name "LLM quantization - W4A16/llmcompressor/Qwen2.5" \
|
| 134 |
+
--task-name "Qwen2.5-32B-Instruct/openllm/vllm" \
|
| 135 |
+
--benchmark-tasks openllm \
|
| 136 |
+
--max-model-len 4096 \
|
| 137 |
+
--add-bos-token \
|
| 138 |
+
--batch-size auto \
|
| 139 |
+
--enable-chunked-prefill \
|
| 140 |
+
--gpu-memory-utilization 0.9 \
|
| 141 |
+
--max-num-batched-tokens 256
|
evaluate_qwen2.5_w8a16.sh
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
source ~/environments/clearml/bin/activate
|
| 2 |
+
|
| 3 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 4 |
+
--model-id 0fe5857173ac484a89316214b14fcf96 \
|
| 5 |
+
--clearml-model \
|
| 6 |
+
--queue-name oneshot-a100x2 \
|
| 7 |
+
--project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
|
| 8 |
+
--task-name "Qwen2.5-72B-Instruct/openllm/vllm" \
|
| 9 |
+
--benchmark-tasks openllm \
|
| 10 |
+
--max-model-len 4096 \
|
| 11 |
+
--add-bos-token \
|
| 12 |
+
--batch-size auto \
|
| 13 |
+
--enable-chunked-prefill \
|
| 14 |
+
--gpu-memory-utilization 0.9 \
|
| 15 |
+
--max-num-batched-tokens 256
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
<<END
|
| 19 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 20 |
+
--model-id 6a4ecaa68a6e45ea80c62680b0a65aa0 \
|
| 21 |
+
--clearml-model \
|
| 22 |
+
--queue-name oneshot-a100x2 \
|
| 23 |
+
--project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
|
| 24 |
+
--task-name "Qwen2.5-72B/openllm/vllm" \
|
| 25 |
+
--benchmark-tasks openllm \
|
| 26 |
+
--max-model-len 4096 \
|
| 27 |
+
--add-bos-token \
|
| 28 |
+
--batch-size auto \
|
| 29 |
+
--enable-chunked-prefill \
|
| 30 |
+
--gpu-memory-utilization 0.9 \
|
| 31 |
+
--max-num-batched-tokens 256
|
| 32 |
+
|
| 33 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 34 |
+
--model-id c67be85bc77f462c93381280019dea1d \
|
| 35 |
+
--clearml-model \
|
| 36 |
+
--queue-name oneshot-a100x4 \
|
| 37 |
+
--project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
|
| 38 |
+
--task-name "Qwen2.5-7B/openllm/vllm" \
|
| 39 |
+
--benchmark-tasks openllm \
|
| 40 |
+
--max-model-len 4096 \
|
| 41 |
+
--add-bos-token \
|
| 42 |
+
--batch-size auto \
|
| 43 |
+
--enable-chunked-prefill \
|
| 44 |
+
--gpu-memory-utilization 0.9 \
|
| 45 |
+
--max-num-batched-tokens 256
|
| 46 |
+
|
| 47 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 48 |
+
--model-id effae214cc464181a92d5a57df10f3d6 \
|
| 49 |
+
--clearml-model \
|
| 50 |
+
--queue-name oneshot-a100x1 \
|
| 51 |
+
--project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
|
| 52 |
+
--task-name "Qwen2.5-0.5B/openllm/vllm" \
|
| 53 |
+
--benchmark-tasks openllm \
|
| 54 |
+
--max-model-len 4096 \
|
| 55 |
+
--add-bos-token \
|
| 56 |
+
--batch-size auto \
|
| 57 |
+
--enable-chunked-prefill \
|
| 58 |
+
--gpu-memory-utilization 0.9 \
|
| 59 |
+
--max-num-batched-tokens 256
|
| 60 |
+
|
| 61 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 62 |
+
--model-id 21dd39ef3013401d84b258410647e847 \
|
| 63 |
+
--clearml-model \
|
| 64 |
+
--queue-name oneshot-a100x1 \
|
| 65 |
+
--project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
|
| 66 |
+
--task-name "Qwen2.5-0.5B-Instruct/openllm/vllm" \
|
| 67 |
+
--benchmark-tasks openllm \
|
| 68 |
+
--max-model-len 4096 \
|
| 69 |
+
--add-bos-token \
|
| 70 |
+
--batch-size auto \
|
| 71 |
+
--enable-chunked-prefill \
|
| 72 |
+
--gpu-memory-utilization 0.9 \
|
| 73 |
+
--max-num-batched-tokens 256
|
| 74 |
+
|
| 75 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 76 |
+
--model-id 324acdd4c7c4426dbdfeb29667dc4b53 \
|
| 77 |
+
--clearml-model \
|
| 78 |
+
--queue-name oneshot-a100x1 \
|
| 79 |
+
--project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
|
| 80 |
+
--task-name "Qwen2.5-1.5B/openllm/vllm" \
|
| 81 |
+
--benchmark-tasks openllm \
|
| 82 |
+
--max-model-len 4096 \
|
| 83 |
+
--add-bos-token \
|
| 84 |
+
--batch-size auto \
|
| 85 |
+
--enable-chunked-prefill \
|
| 86 |
+
--gpu-memory-utilization 0.9 \
|
| 87 |
+
--max-num-batched-tokens 256
|
| 88 |
+
|
| 89 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 90 |
+
--model-id 82cbaa6e27c84f08ac10e9f115034b0b \
|
| 91 |
+
--clearml-model \
|
| 92 |
+
--queue-name oneshot-a100x1 \
|
| 93 |
+
--project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
|
| 94 |
+
--task-name "Qwen2.5-1.5B-Instruct/openllm/vllm" \
|
| 95 |
+
--benchmark-tasks openllm \
|
| 96 |
+
--max-model-len 4096 \
|
| 97 |
+
--add-bos-token \
|
| 98 |
+
--batch-size auto \
|
| 99 |
+
--enable-chunked-prefill \
|
| 100 |
+
--gpu-memory-utilization 0.9 \
|
| 101 |
+
--max-num-batched-tokens 256
|
| 102 |
+
|
| 103 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 104 |
+
--model-id 337f5b50610443c7ad2a380dce8e0be8 \
|
| 105 |
+
--clearml-model \
|
| 106 |
+
--queue-name oneshot-a100x1 \
|
| 107 |
+
--project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
|
| 108 |
+
--task-name "Qwen2.5-3B/openllm/vllm" \
|
| 109 |
+
--benchmark-tasks openllm \
|
| 110 |
+
--max-model-len 4096 \
|
| 111 |
+
--add-bos-token \
|
| 112 |
+
--batch-size auto \
|
| 113 |
+
--enable-chunked-prefill \
|
| 114 |
+
--gpu-memory-utilization 0.9 \
|
| 115 |
+
--max-num-batched-tokens 256
|
| 116 |
+
|
| 117 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 118 |
+
--model-id a8ebd9cae5324572906d50f95eeee5dd \
|
| 119 |
+
--clearml-model \
|
| 120 |
+
--queue-name oneshot-a100x1 \
|
| 121 |
+
--project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
|
| 122 |
+
--task-name "Qwen2.5-3B-Instruct/openllm/vllm" \
|
| 123 |
+
--benchmark-tasks openllm \
|
| 124 |
+
--max-model-len 4096 \
|
| 125 |
+
--add-bos-token \
|
| 126 |
+
--batch-size auto \
|
| 127 |
+
--enable-chunked-prefill \
|
| 128 |
+
--gpu-memory-utilization 0.9 \
|
| 129 |
+
--max-num-batched-tokens 256
|
| 130 |
+
|
| 131 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 132 |
+
--model-id 5190911e94a340988dac223c252e72a2 \
|
| 133 |
+
--clearml-model \
|
| 134 |
+
--queue-name oneshot-a100x1 \
|
| 135 |
+
--project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
|
| 136 |
+
--task-name "Qwen2.5-7B-Instruct/openllm/vllm" \
|
| 137 |
+
--benchmark-tasks openllm \
|
| 138 |
+
--max-model-len 4096 \
|
| 139 |
+
--add-bos-token \
|
| 140 |
+
--batch-size auto \
|
| 141 |
+
--enable-chunked-prefill \
|
| 142 |
+
--gpu-memory-utilization 0.9 \
|
| 143 |
+
--max-num-batched-tokens 256
|
| 144 |
+
END
|
evaluate_qwen2.5_w8a8.sh
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
source ~/environments/clearml/bin/activate
|
| 2 |
+
|
| 3 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 4 |
+
--model-id 0909a970c53644f593c62b2e076ee763 \
|
| 5 |
+
--clearml-model \
|
| 6 |
+
--queue-name oneshot-a100x1 \
|
| 7 |
+
--project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
|
| 8 |
+
--task-name "Qwen2.5-7B/openllm/vllm" \
|
| 9 |
+
--benchmark-tasks openllm \
|
| 10 |
+
--max-model-len 4096 \
|
| 11 |
+
--add-bos-token \
|
| 12 |
+
--batch-size auto \
|
| 13 |
+
--enable-chunked-prefill \
|
| 14 |
+
--gpu-memory-utilization 0.9 \
|
| 15 |
+
--max-num-batched-tokens 256
|
| 16 |
+
|
| 17 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 18 |
+
--model-id 0323219f7a8b4e938c2583ab17851b91 \
|
| 19 |
+
--clearml-model \
|
| 20 |
+
--queue-name oneshot-a100x2 \
|
| 21 |
+
--project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
|
| 22 |
+
--task-name "Qwen2.5-72B/openllm/vllm" \
|
| 23 |
+
--benchmark-tasks openllm \
|
| 24 |
+
--max-model-len 4096 \
|
| 25 |
+
--add-bos-token \
|
| 26 |
+
--batch-size auto \
|
| 27 |
+
--enable-chunked-prefill \
|
| 28 |
+
--gpu-memory-utilization 0.9 \
|
| 29 |
+
--max-num-batched-tokens 256
|
| 30 |
+
|
| 31 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 32 |
+
--model-id 31c634f743fe458687556bc85a6aeb32 \
|
| 33 |
+
--clearml-model \
|
| 34 |
+
--queue-name oneshot-a100x2 \
|
| 35 |
+
--project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
|
| 36 |
+
--task-name "Qwen2.5-72B-Instruct/openllm/vllm" \
|
| 37 |
+
--benchmark-tasks openllm \
|
| 38 |
+
--max-model-len 4096 \
|
| 39 |
+
--add-bos-token \
|
| 40 |
+
--batch-size auto \
|
| 41 |
+
--enable-chunked-prefill \
|
| 42 |
+
--gpu-memory-utilization 0.9 \
|
| 43 |
+
--max-num-batched-tokens 256
|
| 44 |
+
|
| 45 |
+
<<END
|
| 46 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 47 |
+
--model-id ee78f00c9e924db19e8d763442f45209 \
|
| 48 |
+
--clearml-model \
|
| 49 |
+
--queue-name oneshot-a100x1 \
|
| 50 |
+
--project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
|
| 51 |
+
--task-name "Qwen2.5-0.5B/openllm/vllm" \
|
| 52 |
+
--benchmark-tasks openllm \
|
| 53 |
+
--max-model-len 4096 \
|
| 54 |
+
--add-bos-token \
|
| 55 |
+
--batch-size auto \
|
| 56 |
+
--enable-chunked-prefill \
|
| 57 |
+
--gpu-memory-utilization 0.9 \
|
| 58 |
+
--max-num-batched-tokens 256
|
| 59 |
+
|
| 60 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 61 |
+
--model-id 5baf53c17abf460d95a5d04bf4ec184d \
|
| 62 |
+
--clearml-model \
|
| 63 |
+
--queue-name oneshot-a100x1 \
|
| 64 |
+
--project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
|
| 65 |
+
--task-name "Qwen2.5-0.5B-Instruct/openllm/vllm" \
|
| 66 |
+
--benchmark-tasks openllm \
|
| 67 |
+
--max-model-len 4096 \
|
| 68 |
+
--add-bos-token \
|
| 69 |
+
--batch-size auto \
|
| 70 |
+
--enable-chunked-prefill \
|
| 71 |
+
--gpu-memory-utilization 0.9 \
|
| 72 |
+
--max-num-batched-tokens 256
|
| 73 |
+
|
| 74 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 75 |
+
--model-id 0dd5e7b10a97426b92d3f7278a1c47ad \
|
| 76 |
+
--clearml-model \
|
| 77 |
+
--queue-name oneshot-a100x1 \
|
| 78 |
+
--project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
|
| 79 |
+
--task-name "Qwen2.5-1.5B/openllm/vllm" \
|
| 80 |
+
--benchmark-tasks openllm \
|
| 81 |
+
--max-model-len 4096 \
|
| 82 |
+
--add-bos-token \
|
| 83 |
+
--batch-size auto \
|
| 84 |
+
--enable-chunked-prefill \
|
| 85 |
+
--gpu-memory-utilization 0.9 \
|
| 86 |
+
--max-num-batched-tokens 256
|
| 87 |
+
|
| 88 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 89 |
+
--model-id c63fe381f1e94c2880ce6f8c15465ed4 \
|
| 90 |
+
--clearml-model \
|
| 91 |
+
--queue-name oneshot-a100x1 \
|
| 92 |
+
--project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
|
| 93 |
+
--task-name "Qwen2.5-1.5B-Instruct/openllm/vllm" \
|
| 94 |
+
--benchmark-tasks openllm \
|
| 95 |
+
--max-model-len 4096 \
|
| 96 |
+
--add-bos-token \
|
| 97 |
+
--batch-size auto \
|
| 98 |
+
--enable-chunked-prefill \
|
| 99 |
+
--gpu-memory-utilization 0.9 \
|
| 100 |
+
--max-num-batched-tokens 256
|
| 101 |
+
|
| 102 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 103 |
+
--model-id a1ed1bec7c4540eba02cae98c03cada9 \
|
| 104 |
+
--clearml-model \
|
| 105 |
+
--queue-name oneshot-a100x1 \
|
| 106 |
+
--project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
|
| 107 |
+
--task-name "Qwen2.5-3B/openllm/vllm" \
|
| 108 |
+
--benchmark-tasks openllm \
|
| 109 |
+
--max-model-len 4096 \
|
| 110 |
+
--add-bos-token \
|
| 111 |
+
--batch-size auto \
|
| 112 |
+
--enable-chunked-prefill \
|
| 113 |
+
--gpu-memory-utilization 0.9 \
|
| 114 |
+
--max-num-batched-tokens 256
|
| 115 |
+
|
| 116 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 117 |
+
--model-id d7392721b76c4a299ac494c7b3ce1299 \
|
| 118 |
+
--clearml-model \
|
| 119 |
+
--queue-name oneshot-a100x1 \
|
| 120 |
+
--project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
|
| 121 |
+
--task-name "Qwen2.5-3B-Instruct/openllm/vllm" \
|
| 122 |
+
--benchmark-tasks openllm \
|
| 123 |
+
--max-model-len 4096 \
|
| 124 |
+
--add-bos-token \
|
| 125 |
+
--batch-size auto \
|
| 126 |
+
--enable-chunked-prefill \
|
| 127 |
+
--gpu-memory-utilization 0.9 \
|
| 128 |
+
--max-num-batched-tokens 256
|
| 129 |
+
|
| 130 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 131 |
+
--model-id 71606d92b022423591547683b81ebc91 \
|
| 132 |
+
--clearml-model \
|
| 133 |
+
--queue-name oneshot-a100x1 \
|
| 134 |
+
--project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
|
| 135 |
+
--task-name "Qwen2.5-7B-Instruct/openllm/vllm" \
|
| 136 |
+
--benchmark-tasks openllm \
|
| 137 |
+
--max-model-len 4096 \
|
| 138 |
+
--add-bos-token \
|
| 139 |
+
--batch-size auto \
|
| 140 |
+
--enable-chunked-prefill \
|
| 141 |
+
--gpu-memory-utilization 0.9 \
|
| 142 |
+
--max-num-batched-tokens 256
|
| 143 |
+
|
| 144 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 145 |
+
--model-id c9094d7778584311894b490d5c0d7e66 \
|
| 146 |
+
--clearml-model \
|
| 147 |
+
--queue-name oneshot-a100x1 \
|
| 148 |
+
--project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
|
| 149 |
+
--task-name "Qwen2.5-32B/openllm/vllm" \
|
| 150 |
+
--benchmark-tasks openllm \
|
| 151 |
+
--max-model-len 4096 \
|
| 152 |
+
--add-bos-token \
|
| 153 |
+
--batch-size auto \
|
| 154 |
+
--enable-chunked-prefill \
|
| 155 |
+
--gpu-memory-utilization 0.9 \
|
| 156 |
+
--max-num-batched-tokens 256
|
| 157 |
+
|
| 158 |
+
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
|
| 159 |
+
--model-id d2164c0031f2445eb726fc94c94210bd \
|
| 160 |
+
--clearml-model \
|
| 161 |
+
--queue-name oneshot-a100x1 \
|
| 162 |
+
--project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
|
| 163 |
+
--task-name "Qwen2.5-32B-Instruct/openllm/vllm" \
|
| 164 |
+
--benchmark-tasks openllm \
|
| 165 |
+
--max-model-len 4096 \
|
| 166 |
+
--add-bos-token \
|
| 167 |
+
--batch-size auto \
|
| 168 |
+
--enable-chunked-prefill \
|
| 169 |
+
--gpu-memory-utilization 0.9 \
|
| 170 |
+
--max-num-batched-tokens 256
|
| 171 |
+
END
|
quantize_qwen2.5_fp8.sh
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
source ~/environments/clearml/bin/activate
|
| 3 |
+
|
| 4 |
+
recipe_template=$(cat <<'EOF'
|
| 5 |
+
quant_stage:
|
| 6 |
+
quant_modifiers:
|
| 7 |
+
QuantizationModifier:
|
| 8 |
+
ignore: ["lm_head"]
|
| 9 |
+
scheme: FP8
|
| 10 |
+
targets: ["Linear"]
|
| 11 |
+
observer: "mse"
|
| 12 |
+
EOF
|
| 13 |
+
)
|
| 14 |
+
|
| 15 |
+
for size in 0.5B 1.5B 3B 7B 32B 72B
|
| 16 |
+
do
|
| 17 |
+
for version in base instruct
|
| 18 |
+
do
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
if [ $version = "base" ]; then
|
| 22 |
+
model="Qwen2.5-${size}"
|
| 23 |
+
else
|
| 24 |
+
model="Qwen2.5-${size}-Instruct"
|
| 25 |
+
fi
|
| 26 |
+
|
| 27 |
+
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196__damp01"
|
| 28 |
+
|
| 29 |
+
python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
|
| 30 |
+
--model-id "Qwen/"$model \
|
| 31 |
+
--project-name "LLM quantization - FP8/llmcompressor/Qwen2.5" \
|
| 32 |
+
--task-prefix $prefix \
|
| 33 |
+
--recipe "${recipe}" \
|
| 34 |
+
--num-samples 512 \
|
| 35 |
+
--max-seq-len 8196 \
|
| 36 |
+
--tags "Qwen2.5" "W4A16" "calibration" $size "MSE" $version
|
| 37 |
+
|
| 38 |
+
done
|
| 39 |
+
done
|
quantize_qwen2.5_w4a16.sh
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
source ~/environments/clearml/bin/activate
|
| 3 |
+
|
| 4 |
+
recipe_template=$(cat <<'EOF'
|
| 5 |
+
quant_stage:
|
| 6 |
+
quant_modifiers:
|
| 7 |
+
GPTQModifier:
|
| 8 |
+
sequential_update: true
|
| 9 |
+
dampening_frac: 0.1
|
| 10 |
+
ignore: ["lm_head"]
|
| 11 |
+
config_groups:
|
| 12 |
+
group_0:
|
| 13 |
+
weights:
|
| 14 |
+
num_bits: 4
|
| 15 |
+
type: "int"
|
| 16 |
+
symmetric: true
|
| 17 |
+
strategy: "group"
|
| 18 |
+
group_size: 128
|
| 19 |
+
actorder: "group"
|
| 20 |
+
targets: ["Linear"]
|
| 21 |
+
observer: "mse"
|
| 22 |
+
EOF
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
for size in 0.5B 1.5B 3B 7B 32B 72B
|
| 26 |
+
do
|
| 27 |
+
for version in base instruct
|
| 28 |
+
do
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
if [ $version = "base" ]; then
|
| 32 |
+
model="Qwen2.5-${size}"
|
| 33 |
+
else
|
| 34 |
+
model="Qwen2.5-${size}-Instruct"
|
| 35 |
+
fi
|
| 36 |
+
|
| 37 |
+
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196__damp01"
|
| 38 |
+
|
| 39 |
+
python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
|
| 40 |
+
--model-id "Qwen/"$model \
|
| 41 |
+
--project-name "LLM quantization - W4A16/llmcompressor/Qwen2.5" \
|
| 42 |
+
--task-prefix $prefix \
|
| 43 |
+
--recipe "${recipe}" \
|
| 44 |
+
--num-samples 512 \
|
| 45 |
+
--max-seq-len 8196 \
|
| 46 |
+
--tags "Qwen2.5" "W4A16" "calibration" $size "MSE" $version
|
| 47 |
+
|
| 48 |
+
done
|
| 49 |
+
done
|
quantize_qwen2.5_w8a16.sh
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Queue W8A16 GPTQ quantization jobs for Qwen2.5 checkpoints via the ClearML
# oneshot pipeline. Currently active: 3B instruct only; earlier runs are kept
# below in an inert heredoc for reference.
source ~/environments/clearml/bin/activate

# W8A16 GPTQ recipe with the MSE observer; lm_head is left unquantized.
# BUGFIX: the original assigned this heredoc to `recipe_template` but then
# passed --recipe "${recipe}", which expanded to an empty string. This script
# does no per-run templating, so assign directly to `recipe`.
recipe=$(cat <<'EOF'
quant_stage:
  quant_modifiers:
    GPTQModifier:
      sequential_update: true
      dampening_frac: 0.1
      ignore: ["lm_head"]
      scheme: "W8A16"
      targets: "Linear"
      observer: "mse"
EOF
)

for size in 3B
do
for version in instruct
do

    if [ $version = "base" ]; then
        model="Qwen2.5-${size}"
    else
        model="Qwen2.5-${size}-Instruct"
    fi

    prefix="${model//./_}""__llm_compressor__calibration__mse__1024__8196_damp01"

    # NOTE(review): project name says "W8A8" although this is a W8A16 run
    # (tags and the historical runs below use W8A16) — confirm whether this
    # destination is intentional.
    python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
        --model-id "Qwen/"$model \
        --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
        --task-prefix $prefix \
        --recipe "${recipe}" \
        --num-samples 1024 \
        --max-seq-len 8196 \
        --tags "Qwen2.5" "W8A16" "calibration" $size "MSE"

done
done

# Historical runs, never executed. ROBUSTNESS: the original used a bare,
# unquoted `<<END` heredoc as a block comment; an unquoted delimiter keeps
# expansions live in the heredoc body, so any $(...) there would execute.
# Feeding it to `:` with a quoted delimiter makes it fully inert.
: <<'END'
size="72B"
model="Qwen2.5-${size}-Instruct"
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196"

python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
    --model-id "Qwen/"$model \
    --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
    --oneshot-queue oneshot-a100x4 \
    --evaluation-queue oneshot-a100x4 \
    --task-prefix $prefix \
    --recipe "/network/alexandre/quantization/recipe_w8a16_damp01_mse_sequential.yaml" \
    --num-samples 512 \
    --max-seq-len 8196 \
    --tags "Qwen2.5" "W8A16" "calibration" $size "MSE" "Instruct"

size="32B"
model="Qwen2.5-${size}"
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196"

python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
    --model-id "Qwen/"$model \
    --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
    --oneshot-queue oneshot-a100x2 \
    --evaluation-queue oneshot-a100x2 \
    --task-prefix $prefix \
    --recipe "/network/alexandre/quantization/recipe_w8a16_damp01_mse_sequential.yaml" \
    --num-samples 512 \
    --max-seq-len 8196 \
    --tags "Qwen2.5" "W8A16" "calibration" $size "MSE"

model="Qwen2.5-${size}-Instruct"
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196"

python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
    --model-id "Qwen/"$model \
    --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
    --oneshot-queue oneshot-a100x2 \
    --evaluation-queue oneshot-a100x2 \
    --task-prefix $prefix \
    --recipe "/network/alexandre/quantization/recipe_w8a16_damp01_mse_sequential.yaml" \
    --num-samples 512 \
    --max-seq-len 8196 \
    --tags "Qwen2.5" "W8A16" "calibration" $size "MSE" "Instruct"

size="72B"
model="Qwen2.5-${size}"
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196"

python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
    --model-id "Qwen/"$model \
    --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
    --oneshot-queue oneshot-a100x4 \
    --evaluation-queue oneshot-a100x2 \
    --task-prefix $prefix \
    --recipe "/network/alexandre/quantization/recipe_w8a16_damp01_mse_sequential.yaml" \
    --num-samples 512 \
    --max-seq-len 8196 \
    --tags "Qwen2.5" "W8A16" "calibration" $size "MSE"

model="Qwen2.5-${size}-Instruct"
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196"

python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
    --model-id "Qwen/"$model \
    --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
    --oneshot-queue oneshot-a100x4 \
    --evaluation-queue oneshot-a100x2 \
    --task-prefix $prefix \
    --recipe "/network/alexandre/quantization/recipe_w8a16_mse_damp01.yaml" \
    --num-samples 512 \
    --max-seq-len 8196 \
    --tags "Qwen2.5" "W8A16" "calibration" $size "MSE" "Instruct"

size="7B"
model="Qwen2.5-${size}"
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196"

python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
    --model-id "Qwen/"$model \
    --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
    --task-prefix $prefix \
    --recipe "/network/alexandre/quantization/recipe_w8a16_damp01_mse_sequential.yaml" \
    --num-samples 512 \
    --max-seq-len 8196 \
    --tags "Qwen2.5" "W8A16" "calibration" $size "MSE"

for size in 0.5B 1.5B 3B 7B
do

model="Qwen2.5-${size}"
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196"

python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
    --model-id "Qwen/"$model \
    --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
    --task-prefix $prefix \
    --recipe "/network/alexandre/quantization/recipe_w8a16_mse.yaml" \
    --num-samples 512 \
    --max-seq-len 8196 \
    --tags "Qwen2.5" "w8a16" "calibration" $size "MSE"

model="Qwen2.5-${size}-Instruct"
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196"

python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
    --model-id "Qwen/"$model \
    --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
    --task-prefix $prefix \
    --recipe "/network/alexandre/quantization/recipe_w8a16_mse.yaml" \
    --num-samples 512 \
    --max-seq-len 8196 \
    --tags "Qwen2.5" "w8a16" "calibration" $size "MSE" "Instruct"

done

for size in 32B 72B
do

model="Qwen2.5-${size}"
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196"

python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
    --model-id "Qwen/"$model \
    --project-name "LLM quantization - w8a16/llmcompressor/Qwen2.5" \
    --task-prefix $prefix \
    --oneshot-queue oneshot-a100x2 \
    --evaluation-queue oneshot-a100x2 \
    --recipe "/network/alexandre/quantization/recipe_w8a16_mse.yaml" \
    --num-samples 512 \
    --max-seq-len 8196 \
    --tags "Qwen2.5" "w8a16" "calibration" $size "MSE"

model="Qwen2.5-${size}-Instruct"
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196"

python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
    --model-id "Qwen/"$model \
    --project-name "LLM quantization - w8a16/llmcompressor/Qwen2.5" \
    --task-prefix $prefix \
    --oneshot-queue oneshot-a100x2 \
    --evaluation-queue oneshot-a100x2 \
    --recipe "/network/alexandre/quantization/recipe_w8a16_mse.yaml" \
    --num-samples 512 \
    --max-seq-len 8196 \
    --tags "Qwen2.5" "w8a16" "calibration" $size "MSE" "Instruct"

done
END
|
quantize_qwen2.5_w8a8.sh
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Queue W8A8 GPTQ quantization jobs for selected Qwen2.5 checkpoints via the
# ClearML oneshot pipeline. Active: 7B base (default queue) and 72B base +
# instruct (a100x4 queues); earlier sweeps are kept inert below.
source ~/environments/clearml/bin/activate

size="7B"
model="Qwen2.5-${size}"
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196"

python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
    --model-id "Qwen/"$model \
    --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
    --task-prefix $prefix \
    --recipe "/network/alexandre/quantization/recipe_w8a8_mse_damp01.yaml" \
    --num-samples 512 \
    --max-seq-len 8196 \
    --tags "Qwen2.5" "W8A8" "calibration" $size "MSE"

# 72B needs multi-GPU queues for both the oneshot and the evaluation stage.
size="72B"
model="Qwen2.5-${size}"
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196"

python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
    --model-id "Qwen/"$model \
    --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
    --oneshot-queue oneshot-a100x4 \
    --evaluation-queue oneshot-a100x4 \
    --task-prefix $prefix \
    --recipe "/network/alexandre/quantization/recipe_w8a8_mse_damp01.yaml" \
    --num-samples 512 \
    --max-seq-len 8196 \
    --tags "Qwen2.5" "W8A8" "calibration" $size "MSE"

model="Qwen2.5-${size}-Instruct"
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196"

python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
    --model-id "Qwen/"$model \
    --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
    --oneshot-queue oneshot-a100x4 \
    --evaluation-queue oneshot-a100x4 \
    --task-prefix $prefix \
    --recipe "/network/alexandre/quantization/recipe_w8a8_mse_damp01.yaml" \
    --num-samples 512 \
    --max-seq-len 8196 \
    --tags "Qwen2.5" "W8A8" "calibration" $size "MSE" "Instruct"


# Historical runs, never executed. ROBUSTNESS: the original used a bare,
# unquoted `<<END` heredoc as a block comment; an unquoted delimiter keeps
# expansions live in the heredoc body, so any $(...) there would execute.
# Feeding it to `:` with a quoted delimiter makes it fully inert.
: <<'END'
for size in 0.5B 1.5B 3B 7B
do

model="Qwen2.5-${size}"
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196"

python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
    --model-id "Qwen/"$model \
    --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
    --task-prefix $prefix \
    --recipe "/network/alexandre/quantization/recipe_w8a8_mse.yaml" \
    --num-samples 512 \
    --max-seq-len 8196 \
    --tags "Qwen2.5" "W8A8" "calibration" $size "MSE"

model="Qwen2.5-${size}-Instruct"
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196"

python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
    --model-id "Qwen/"$model \
    --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
    --task-prefix $prefix \
    --recipe "/network/alexandre/quantization/recipe_w8a8_mse.yaml" \
    --num-samples 512 \
    --max-seq-len 8196 \
    --tags "Qwen2.5" "W8A8" "calibration" $size "MSE" "Instruct"

done

for size in 32B 72B
do

model="Qwen2.5-${size}"
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196"

python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
    --model-id "Qwen/"$model \
    --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
    --task-prefix $prefix \
    --oneshot-queue oneshot-a100x2 \
    --evaluation-queue oneshot-a100x2 \
    --recipe "/network/alexandre/quantization/recipe_w8a8_mse.yaml" \
    --num-samples 512 \
    --max-seq-len 8196 \
    --tags "Qwen2.5" "W8A8" "calibration" $size "MSE"

model="Qwen2.5-${size}-Instruct"
prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196"

python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
    --model-id "Qwen/"$model \
    --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
    --task-prefix $prefix \
    --oneshot-queue oneshot-a100x2 \
    --evaluation-queue oneshot-a100x2 \
    --recipe "/network/alexandre/quantization/recipe_w8a8_mse.yaml" \
    --num-samples 512 \
    --max-seq-len 8196 \
    --tags "Qwen2.5" "W8A8" "calibration" $size "MSE" "Instruct"

done
END
|
quantize_qwen2.5_w8a8_sq.sh
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Queue W8A8 SmoothQuant + GPTQ quantization jobs for Qwen2.5 checkpoints
# via the ClearML oneshot pipeline, sweeping the SmoothQuant strength.
source ~/environments/clearml/bin/activate


# Recipe template: SmoothQuant (strength filled in per run) followed by
# W8A8 GPTQ with the MSE observer; lm_head stays unquantized. The quoted
# delimiter keeps the YAML literal.
recipe_template=$(cat <<'EOF'
quant_stage:
  quant_modifiers:
    SmoothQuantModifier:
      smoothing_strength: 0.0
      mappings:
        - [["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"], "re:.*input_layernorm"]
        - [["re:.*gate_proj", "re:.*up_proj"], "re:.*post_attention_layernorm"]
        - [["re:.*down_proj"], "re:.*up_proj"]
    GPTQModifier:
      sequential_update: true
      dampening_frac: 0.05
      ignore: ["lm_head"]
      scheme: "W8A8"
      targets: "Linear"
      observer: "mse"
EOF
)

for size in 3B; do
  for version in base; do
    for sq in 0.9; do

      # Inject the sweep value with bash substitution (first match only —
      # same effect as the original sed, without spawning a process).
      recipe="${recipe_template/smoothing_strength: 0.0/smoothing_strength: ${sq}}"

      model="Qwen2.5-${size}"
      if [ $version != "base" ]; then
          model="Qwen2.5-${size}-Instruct"
      fi

      # Task prefix encodes model + calibration settings; "0.9" becomes
      # "09" so the name stays dot-free.
      prefix="${model//./_}""__llm_compressor__calibration__mse__512__8196_damp005_sq${sq//0./0}"

      python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
          --model-id "Qwen/"$model \
          --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
          --task-prefix $prefix \
          --recipe "${recipe}" \
          --num-samples 512 \
          --max-seq-len 8196 \
          --tags "Qwen2.5" "W8A8" "calibration" $size "MSE" "SQ"

    done
  done
done
|