alexmarques committed
Commit 1a552f6 · verified · 1 parent: 6da1c59

Upload folder using huggingface_hub
evaluate_qwen2.5_bf16.sh ADDED
@@ -0,0 +1,98 @@
+ source ~/environments/clearml/bin/activate
+
+ for size in 0.5B 1.5B 3B 7B
+ do
+
+ model="Qwen2.5-${size}"
+
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id "Qwen/${model}" \
+   --queue-name oneshot-a100x1 \
+   --project-name "LLM reference/vllm" \
+   --task-name "${model}/openllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
+
+ model="Qwen2.5-${size}-Instruct"
+
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id "Qwen/${model}" \
+   --queue-name oneshot-a100x1 \
+   --project-name "LLM reference/vllm" \
+   --task-name "${model}/openllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
+
+ done
+
+ size=32B
+ model="Qwen2.5-${size}"
+
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id "Qwen/${model}" \
+   --queue-name oneshot-a100x2 \
+   --project-name "LLM reference/vllm" \
+   --task-name "${model}/openllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
+
+ model="Qwen2.5-${size}-Instruct"
+
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id "Qwen/${model}" \
+   --queue-name oneshot-a100x2 \
+   --project-name "LLM reference/vllm" \
+   --task-name "${model}/openllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
+
+ size=72B
+ model="Qwen2.5-${size}"
+
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id "Qwen/${model}" \
+   --queue-name oneshot-a100x4 \
+   --project-name "LLM reference/vllm" \
+   --task-name "${model}/openllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
+
+ model="Qwen2.5-${size}-Instruct"
+
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id "Qwen/${model}" \
+   --queue-name oneshot-a100x4 \
+   --project-name "LLM reference/vllm" \
+   --task-name "${model}/openllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
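
Note: queue_lm_evaluation_harness_vllm.py is an internal queueing script and is not part of this commit. As a rough sketch of what each queued job runs, a direct lm-evaluation-harness invocation with the vLLM backend might look like the lines below; the openllm task group and the exact model_args names are assumptions based on lm-eval and vLLM conventions, not taken from the internal script:

    # Hedged sketch: run one BF16 evaluation locally instead of queueing it.
    lm_eval --model vllm \
      --model_args pretrained=Qwen/Qwen2.5-7B,add_bos_token=True,max_model_len=4096,gpu_memory_utilization=0.9,enable_chunked_prefill=True,max_num_batched_tokens=256 \
      --tasks openllm \
      --batch_size auto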
evaluate_qwen2.5_w4a16.sh ADDED
@@ -0,0 +1,141 @@
+ source ~/environments/clearml/bin/activate
+
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id ece15d6b3b574a9ba3062f8922dda9d9 \
+   --clearml-model \
+   --queue-name oneshot-a100x1 \
+   --project-name "LLM quantization - W4A16/llmcompressor/Qwen2.5" \
+   --task-name "Qwen2.5-0.5B/openllm/vllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
+
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id b3c075c7685a42c798ebe91bdb1bb140 \
+   --clearml-model \
+   --queue-name oneshot-a100x1 \
+   --project-name "LLM quantization - W4A16/llmcompressor/Qwen2.5" \
+   --task-name "Qwen2.5-0.5B-Instruct/openllm/vllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
+
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id 70d4376f576241eb9ca066ce2864c517 \
+   --clearml-model \
+   --queue-name oneshot-a100x1 \
+   --project-name "LLM quantization - W4A16/llmcompressor/Qwen2.5" \
+   --task-name "Qwen2.5-1.5B/openllm/vllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
+
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id 328b46a2fc3d431c95a8d8900f1b88ce \
+   --clearml-model \
+   --queue-name oneshot-a100x1 \
+   --project-name "LLM quantization - W4A16/llmcompressor/Qwen2.5" \
+   --task-name "Qwen2.5-1.5B-Instruct/openllm/vllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
+
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id 29785c1921db4680a1bcbb1c32b1bb0c \
+   --clearml-model \
+   --queue-name oneshot-a100x1 \
+   --project-name "LLM quantization - W4A16/llmcompressor/Qwen2.5" \
+   --task-name "Qwen2.5-3B/openllm/vllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
+
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id bb496bef644e4336afba44944aa8e915 \
+   --clearml-model \
+   --queue-name oneshot-a100x1 \
+   --project-name "LLM quantization - W4A16/llmcompressor/Qwen2.5" \
+   --task-name "Qwen2.5-3B-Instruct/openllm/vllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
+
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id 7ab087ff9ada4033a52350078a2a665a \
+   --clearml-model \
+   --queue-name oneshot-a100x1 \
+   --project-name "LLM quantization - W4A16/llmcompressor/Qwen2.5" \
+   --task-name "Qwen2.5-7B/openllm/vllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
+
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id 0a0cf6bf30c94a3ea96babf1c92f23a2 \
+   --clearml-model \
+   --queue-name oneshot-a100x1 \
+   --project-name "LLM quantization - W4A16/llmcompressor/Qwen2.5" \
+   --task-name "Qwen2.5-7B-Instruct/openllm/vllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
+
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id 7da4e5e0e2db49c984d3e195ab09ae09 \
+   --clearml-model \
+   --queue-name oneshot-a100x1 \
+   --project-name "LLM quantization - W4A16/llmcompressor/Qwen2.5" \
+   --task-name "Qwen2.5-32B/openllm/vllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
+
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id 79b0a7340f2247068940085e66a7e17b \
+   --clearml-model \
+   --queue-name oneshot-a100x1 \
+   --project-name "LLM quantization - W4A16/llmcompressor/Qwen2.5" \
+   --task-name "Qwen2.5-32B-Instruct/openllm/vllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
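
The hex strings passed with --clearml-model are ClearML model-registry IDs rather than Hugging Face model names. Since the ten blocks above differ only in model ID and task name, a hedged refactor using a bash 4+ associative array could queue them in a loop (only the first two entries are shown; fill in the rest from the blocks above):

    declare -A model_ids=(
      ["Qwen2.5-0.5B"]="ece15d6b3b574a9ba3062f8922dda9d9"
      ["Qwen2.5-0.5B-Instruct"]="b3c075c7685a42c798ebe91bdb1bb140"
      # ... remaining models as above
    )
    for model in "${!model_ids[@]}"; do
      python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
        --model-id "${model_ids[$model]}" \
        --clearml-model \
        --queue-name oneshot-a100x1 \
        --project-name "LLM quantization - W4A16/llmcompressor/Qwen2.5" \
        --task-name "${model}/openllm/vllm" \
        --benchmark-tasks openllm \
        --max-model-len 4096 \
        --add-bos-token \
        --batch-size auto \
        --enable-chunked-prefill \
        --gpu-memory-utilization 0.9 \
        --max-num-batched-tokens 256
    done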
evaluate_qwen2.5_w8a16.sh ADDED
@@ -0,0 +1,144 @@
+ source ~/environments/clearml/bin/activate
+
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id 0fe5857173ac484a89316214b14fcf96 \
+   --clearml-model \
+   --queue-name oneshot-a100x2 \
+   --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
+   --task-name "Qwen2.5-72B-Instruct/openllm/vllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
+
+
+ <<'END'
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id 6a4ecaa68a6e45ea80c62680b0a65aa0 \
+   --clearml-model \
+   --queue-name oneshot-a100x2 \
+   --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
+   --task-name "Qwen2.5-72B/openllm/vllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
+
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id c67be85bc77f462c93381280019dea1d \
+   --clearml-model \
+   --queue-name oneshot-a100x4 \
+   --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
+   --task-name "Qwen2.5-7B/openllm/vllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
+
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id effae214cc464181a92d5a57df10f3d6 \
+   --clearml-model \
+   --queue-name oneshot-a100x1 \
+   --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
+   --task-name "Qwen2.5-0.5B/openllm/vllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
+
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id 21dd39ef3013401d84b258410647e847 \
+   --clearml-model \
+   --queue-name oneshot-a100x1 \
+   --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
+   --task-name "Qwen2.5-0.5B-Instruct/openllm/vllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
+
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id 324acdd4c7c4426dbdfeb29667dc4b53 \
+   --clearml-model \
+   --queue-name oneshot-a100x1 \
+   --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
+   --task-name "Qwen2.5-1.5B/openllm/vllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
+
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id 82cbaa6e27c84f08ac10e9f115034b0b \
+   --clearml-model \
+   --queue-name oneshot-a100x1 \
+   --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
+   --task-name "Qwen2.5-1.5B-Instruct/openllm/vllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
+
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id 337f5b50610443c7ad2a380dce8e0be8 \
+   --clearml-model \
+   --queue-name oneshot-a100x1 \
+   --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
+   --task-name "Qwen2.5-3B/openllm/vllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
+
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id a8ebd9cae5324572906d50f95eeee5dd \
+   --clearml-model \
+   --queue-name oneshot-a100x1 \
+   --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
+   --task-name "Qwen2.5-3B-Instruct/openllm/vllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
+
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id 5190911e94a340988dac223c252e72a2 \
+   --clearml-model \
+   --queue-name oneshot-a100x1 \
+   --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
+   --task-name "Qwen2.5-7B-Instruct/openllm/vllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
+ END
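
The <<'END' ... END pair in this script is a heredoc with no consuming command, used as a block comment to disable the remaining jobs without deleting them; quoting the delimiter keeps the shell from expanding anything inside the disabled span. The same idiom in minimal form:

    : <<'END'
    commands that should not run
    END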
evaluate_qwen2.5_w8a8.sh ADDED
@@ -0,0 +1,171 @@
+ source ~/environments/clearml/bin/activate
+
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id 0909a970c53644f593c62b2e076ee763 \
+   --clearml-model \
+   --queue-name oneshot-a100x1 \
+   --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
+   --task-name "Qwen2.5-7B/openllm/vllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
+
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id 0323219f7a8b4e938c2583ab17851b91 \
+   --clearml-model \
+   --queue-name oneshot-a100x2 \
+   --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
+   --task-name "Qwen2.5-72B/openllm/vllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
+
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id 31c634f743fe458687556bc85a6aeb32 \
+   --clearml-model \
+   --queue-name oneshot-a100x2 \
+   --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
+   --task-name "Qwen2.5-72B-Instruct/openllm/vllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
+
+ <<'END'
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id ee78f00c9e924db19e8d763442f45209 \
+   --clearml-model \
+   --queue-name oneshot-a100x1 \
+   --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
+   --task-name "Qwen2.5-0.5B/openllm/vllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
+
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id 5baf53c17abf460d95a5d04bf4ec184d \
+   --clearml-model \
+   --queue-name oneshot-a100x1 \
+   --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
+   --task-name "Qwen2.5-0.5B-Instruct/openllm/vllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
+
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id 0dd5e7b10a97426b92d3f7278a1c47ad \
+   --clearml-model \
+   --queue-name oneshot-a100x1 \
+   --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
+   --task-name "Qwen2.5-1.5B/openllm/vllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
+
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id c63fe381f1e94c2880ce6f8c15465ed4 \
+   --clearml-model \
+   --queue-name oneshot-a100x1 \
+   --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
+   --task-name "Qwen2.5-1.5B-Instruct/openllm/vllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
+
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id a1ed1bec7c4540eba02cae98c03cada9 \
+   --clearml-model \
+   --queue-name oneshot-a100x1 \
+   --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
+   --task-name "Qwen2.5-3B/openllm/vllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
+
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id d7392721b76c4a299ac494c7b3ce1299 \
+   --clearml-model \
+   --queue-name oneshot-a100x1 \
+   --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
+   --task-name "Qwen2.5-3B-Instruct/openllm/vllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
+
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id 71606d92b022423591547683b81ebc91 \
+   --clearml-model \
+   --queue-name oneshot-a100x1 \
+   --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
+   --task-name "Qwen2.5-7B-Instruct/openllm/vllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
+
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id c9094d7778584311894b490d5c0d7e66 \
+   --clearml-model \
+   --queue-name oneshot-a100x1 \
+   --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
+   --task-name "Qwen2.5-32B/openllm/vllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
+
+ python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
+   --model-id d2164c0031f2445eb726fc94c94210bd \
+   --clearml-model \
+   --queue-name oneshot-a100x1 \
+   --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
+   --task-name "Qwen2.5-32B-Instruct/openllm/vllm" \
+   --benchmark-tasks openllm \
+   --max-model-len 4096 \
+   --add-bos-token \
+   --batch-size auto \
+   --enable-chunked-prefill \
+   --gpu-memory-utilization 0.9 \
+   --max-num-batched-tokens 256
+ END
quantize_qwen2.5_fp8.sh ADDED
@@ -0,0 +1,39 @@
+
+ source ~/environments/clearml/bin/activate
+
+ recipe_template=$(cat <<'EOF'
+ quant_stage:
+   quant_modifiers:
+     QuantizationModifier:
+       ignore: ["lm_head"]
+       scheme: FP8
+       targets: ["Linear"]
+       observer: "mse"
+ EOF
+ )
+
+ for size in 0.5B 1.5B 3B 7B 32B 72B
+ do
+ for version in base instruct
+ do
+
+ if [ "$version" = "base" ]; then
+   model="Qwen2.5-${size}"
+ else
+   model="Qwen2.5-${size}-Instruct"
+ fi
+
+ prefix="${model//./_}__llm_compressor__calibration__mse__512__8196__damp01"
+
+ python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
+   --model-id "Qwen/${model}" \
+   --project-name "LLM quantization - FP8/llmcompressor/Qwen2.5" \
+   --task-prefix "$prefix" \
+   --recipe "${recipe_template}" \
+   --num-samples 512 \
+   --max-seq-len 8196 \
+   --tags "Qwen2.5" "FP8" "calibration" $size "MSE" $version
+
+ done
+ done
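
This script inlines its recipe as a heredoc, while the W8A16/W8A8 scripts below pass file paths under /network/alexandre/quantization/. A small sketch of persisting the inline template the same way (the output path here is hypothetical):

    printf '%s\n' "$recipe_template" > /tmp/recipe_fp8_mse.yaml
    # then pass: --recipe /tmp/recipe_fp8_mse.yaml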
quantize_qwen2.5_w4a16.sh ADDED
@@ -0,0 +1,49 @@
+
+ source ~/environments/clearml/bin/activate
+
+ recipe_template=$(cat <<'EOF'
+ quant_stage:
+   quant_modifiers:
+     GPTQModifier:
+       sequential_update: true
+       dampening_frac: 0.1
+       ignore: ["lm_head"]
+       config_groups:
+         group_0:
+           weights:
+             num_bits: 4
+             type: "int"
+             symmetric: true
+             strategy: "group"
+             group_size: 128
+             actorder: "group"
+       targets: ["Linear"]
+       observer: "mse"
+ EOF
+ )
+
+ for size in 0.5B 1.5B 3B 7B 32B 72B
+ do
+ for version in base instruct
+ do
+
+ if [ "$version" = "base" ]; then
+   model="Qwen2.5-${size}"
+ else
+   model="Qwen2.5-${size}-Instruct"
+ fi
+
+ prefix="${model//./_}__llm_compressor__calibration__mse__512__8196__damp01"
+
+ python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
+   --model-id "Qwen/${model}" \
+   --project-name "LLM quantization - W4A16/llmcompressor/Qwen2.5" \
+   --task-prefix "$prefix" \
+   --recipe "${recipe_template}" \
+   --num-samples 512 \
+   --max-seq-len 8196 \
+   --tags "Qwen2.5" "W4A16" "calibration" $size "MSE" $version
+
+ done
+ done
quantize_qwen2.5_w8a16.sh ADDED
@@ -0,0 +1,190 @@
+
+ source ~/environments/clearml/bin/activate
+
+ recipe_template=$(cat <<'EOF'
+ quant_stage:
+   quant_modifiers:
+     GPTQModifier:
+       sequential_update: true
+       dampening_frac: 0.1
+       ignore: ["lm_head"]
+       scheme: "W8A16"
+       targets: "Linear"
+       observer: "mse"
+ EOF
+ )
+
+ for size in 3B
+ do
+ for version in instruct
+ do
+
+ if [ "$version" = "base" ]; then
+   model="Qwen2.5-${size}"
+ else
+   model="Qwen2.5-${size}-Instruct"
+ fi
+
+ prefix="${model//./_}__llm_compressor__calibration__mse__1024__8196_damp01"
+
+ python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
+   --model-id "Qwen/${model}" \
+   --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
+   --task-prefix "$prefix" \
+   --recipe "${recipe_template}" \
+   --num-samples 1024 \
+   --max-seq-len 8196 \
+   --tags "Qwen2.5" "W8A16" "calibration" $size "MSE"
+
+ done
+ done
+
+ <<'END'
+ size="72B"
+ model="Qwen2.5-${size}-Instruct"
+ prefix="${model//./_}__llm_compressor__calibration__mse__512__8196"
+
+ python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
+   --model-id "Qwen/${model}" \
+   --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
+   --oneshot-queue oneshot-a100x4 \
+   --evaluation-queue oneshot-a100x4 \
+   --task-prefix "$prefix" \
+   --recipe "/network/alexandre/quantization/recipe_w8a16_damp01_mse_sequential.yaml" \
+   --num-samples 512 \
+   --max-seq-len 8196 \
+   --tags "Qwen2.5" "W8A16" "calibration" $size "MSE" "Instruct"
+
+ size="32B"
+ model="Qwen2.5-${size}"
+ prefix="${model//./_}__llm_compressor__calibration__mse__512__8196"
+
+ python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
+   --model-id "Qwen/${model}" \
+   --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
+   --oneshot-queue oneshot-a100x2 \
+   --evaluation-queue oneshot-a100x2 \
+   --task-prefix "$prefix" \
+   --recipe "/network/alexandre/quantization/recipe_w8a16_damp01_mse_sequential.yaml" \
+   --num-samples 512 \
+   --max-seq-len 8196 \
+   --tags "Qwen2.5" "W8A16" "calibration" $size "MSE"
+
+ model="Qwen2.5-${size}-Instruct"
+ prefix="${model//./_}__llm_compressor__calibration__mse__512__8196"
+
+ python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
+   --model-id "Qwen/${model}" \
+   --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
+   --oneshot-queue oneshot-a100x2 \
+   --evaluation-queue oneshot-a100x2 \
+   --task-prefix "$prefix" \
+   --recipe "/network/alexandre/quantization/recipe_w8a16_damp01_mse_sequential.yaml" \
+   --num-samples 512 \
+   --max-seq-len 8196 \
+   --tags "Qwen2.5" "W8A16" "calibration" $size "MSE" "Instruct"
+
+ size="72B"
+ model="Qwen2.5-${size}"
+ prefix="${model//./_}__llm_compressor__calibration__mse__512__8196"
+
+ python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
+   --model-id "Qwen/${model}" \
+   --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
+   --oneshot-queue oneshot-a100x4 \
+   --evaluation-queue oneshot-a100x2 \
+   --task-prefix "$prefix" \
+   --recipe "/network/alexandre/quantization/recipe_w8a16_damp01_mse_sequential.yaml" \
+   --num-samples 512 \
+   --max-seq-len 8196 \
+   --tags "Qwen2.5" "W8A16" "calibration" $size "MSE"
+
+ model="Qwen2.5-${size}-Instruct"
+ prefix="${model//./_}__llm_compressor__calibration__mse__512__8196"
+
+ python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
+   --model-id "Qwen/${model}" \
+   --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
+   --oneshot-queue oneshot-a100x4 \
+   --evaluation-queue oneshot-a100x2 \
+   --task-prefix "$prefix" \
+   --recipe "/network/alexandre/quantization/recipe_w8a16_mse_damp01.yaml" \
+   --num-samples 512 \
+   --max-seq-len 8196 \
+   --tags "Qwen2.5" "W8A16" "calibration" $size "MSE" "Instruct"
+
+ size="7B"
+ model="Qwen2.5-${size}"
+ prefix="${model//./_}__llm_compressor__calibration__mse__512__8196"
+
+ python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
+   --model-id "Qwen/${model}" \
+   --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
+   --task-prefix "$prefix" \
+   --recipe "/network/alexandre/quantization/recipe_w8a16_damp01_mse_sequential.yaml" \
+   --num-samples 512 \
+   --max-seq-len 8196 \
+   --tags "Qwen2.5" "W8A16" "calibration" $size "MSE"
+
+ for size in 0.5B 1.5B 3B 7B
+ do
+
+ model="Qwen2.5-${size}"
+ prefix="${model//./_}__llm_compressor__calibration__mse__512__8196"
+
+ python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
+   --model-id "Qwen/${model}" \
+   --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
+   --task-prefix "$prefix" \
+   --recipe "/network/alexandre/quantization/recipe_w8a16_mse.yaml" \
+   --num-samples 512 \
+   --max-seq-len 8196 \
+   --tags "Qwen2.5" "w8a16" "calibration" $size "MSE"
+
+ model="Qwen2.5-${size}-Instruct"
+ prefix="${model//./_}__llm_compressor__calibration__mse__512__8196"
+
+ python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
+   --model-id "Qwen/${model}" \
+   --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
+   --task-prefix "$prefix" \
+   --recipe "/network/alexandre/quantization/recipe_w8a16_mse.yaml" \
+   --num-samples 512 \
+   --max-seq-len 8196 \
+   --tags "Qwen2.5" "w8a16" "calibration" $size "MSE" "Instruct"
+
+ done
+
+ for size in 32B 72B
+ do
+
+ model="Qwen2.5-${size}"
+ prefix="${model//./_}__llm_compressor__calibration__mse__512__8196"
+
+ python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
+   --model-id "Qwen/${model}" \
+   --project-name "LLM quantization - w8a16/llmcompressor/Qwen2.5" \
+   --task-prefix "$prefix" \
+   --oneshot-queue oneshot-a100x2 \
+   --evaluation-queue oneshot-a100x2 \
+   --recipe "/network/alexandre/quantization/recipe_w8a16_mse.yaml" \
+   --num-samples 512 \
+   --max-seq-len 8196 \
+   --tags "Qwen2.5" "w8a16" "calibration" $size "MSE"
+
+ model="Qwen2.5-${size}-Instruct"
+ prefix="${model//./_}__llm_compressor__calibration__mse__512__8196"
+
+ python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
+   --model-id "Qwen/${model}" \
+   --project-name "LLM quantization - w8a16/llmcompressor/Qwen2.5" \
+   --task-prefix "$prefix" \
+   --oneshot-queue oneshot-a100x2 \
+   --evaluation-queue oneshot-a100x2 \
+   --recipe "/network/alexandre/quantization/recipe_w8a16_mse.yaml" \
+   --num-samples 512 \
+   --max-seq-len 8196 \
+   --tags "Qwen2.5" "w8a16" "calibration" $size "MSE" "Instruct"
+
+ done
+ END
quantize_qwen2.5_w8a8.sh ADDED
@@ -0,0 +1,109 @@
+
+ source ~/environments/clearml/bin/activate
+
+ size="7B"
+ model="Qwen2.5-${size}"
+ prefix="${model//./_}__llm_compressor__calibration__mse__512__8196"
+
+ python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
+   --model-id "Qwen/${model}" \
+   --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
+   --task-prefix "$prefix" \
+   --recipe "/network/alexandre/quantization/recipe_w8a8_mse_damp01.yaml" \
+   --num-samples 512 \
+   --max-seq-len 8196 \
+   --tags "Qwen2.5" "W8A8" "calibration" $size "MSE"
+
+ size="72B"
+ model="Qwen2.5-${size}"
+ prefix="${model//./_}__llm_compressor__calibration__mse__512__8196"
+
+ python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
+   --model-id "Qwen/${model}" \
+   --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
+   --oneshot-queue oneshot-a100x4 \
+   --evaluation-queue oneshot-a100x4 \
+   --task-prefix "$prefix" \
+   --recipe "/network/alexandre/quantization/recipe_w8a8_mse_damp01.yaml" \
+   --num-samples 512 \
+   --max-seq-len 8196 \
+   --tags "Qwen2.5" "W8A8" "calibration" $size "MSE"
+
+ model="Qwen2.5-${size}-Instruct"
+ prefix="${model//./_}__llm_compressor__calibration__mse__512__8196"
+
+ python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
+   --model-id "Qwen/${model}" \
+   --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
+   --oneshot-queue oneshot-a100x4 \
+   --evaluation-queue oneshot-a100x4 \
+   --task-prefix "$prefix" \
+   --recipe "/network/alexandre/quantization/recipe_w8a8_mse_damp01.yaml" \
+   --num-samples 512 \
+   --max-seq-len 8196 \
+   --tags "Qwen2.5" "W8A8" "calibration" $size "MSE" "Instruct"
+
+
+ <<'END'
+ for size in 0.5B 1.5B 3B 7B
+ do
+
+ model="Qwen2.5-${size}"
+ prefix="${model//./_}__llm_compressor__calibration__mse__512__8196"
+
+ python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
+   --model-id "Qwen/${model}" \
+   --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
+   --task-prefix "$prefix" \
+   --recipe "/network/alexandre/quantization/recipe_w8a8_mse.yaml" \
+   --num-samples 512 \
+   --max-seq-len 8196 \
+   --tags "Qwen2.5" "W8A8" "calibration" $size "MSE"
+
+ model="Qwen2.5-${size}-Instruct"
+ prefix="${model//./_}__llm_compressor__calibration__mse__512__8196"
+
+ python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
+   --model-id "Qwen/${model}" \
+   --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
+   --task-prefix "$prefix" \
+   --recipe "/network/alexandre/quantization/recipe_w8a8_mse.yaml" \
+   --num-samples 512 \
+   --max-seq-len 8196 \
+   --tags "Qwen2.5" "W8A8" "calibration" $size "MSE" "Instruct"
+
+ done
+
+ for size in 32B 72B
+ do
+
+ model="Qwen2.5-${size}"
+ prefix="${model//./_}__llm_compressor__calibration__mse__512__8196"
+
+ python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
+   --model-id "Qwen/${model}" \
+   --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
+   --task-prefix "$prefix" \
+   --oneshot-queue oneshot-a100x2 \
+   --evaluation-queue oneshot-a100x2 \
+   --recipe "/network/alexandre/quantization/recipe_w8a8_mse.yaml" \
+   --num-samples 512 \
+   --max-seq-len 8196 \
+   --tags "Qwen2.5" "W8A8" "calibration" $size "MSE"
+
+ model="Qwen2.5-${size}-Instruct"
+ prefix="${model//./_}__llm_compressor__calibration__mse__512__8196"
+
+ python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
+   --model-id "Qwen/${model}" \
+   --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
+   --task-prefix "$prefix" \
+   --oneshot-queue oneshot-a100x2 \
+   --evaluation-queue oneshot-a100x2 \
+   --recipe "/network/alexandre/quantization/recipe_w8a8_mse.yaml" \
+   --num-samples 512 \
+   --max-seq-len 8196 \
+   --tags "Qwen2.5" "W8A8" "calibration" $size "MSE" "Instruct"
+
+ done
+ END
quantize_qwen2.5_w8a8_sq.sh ADDED
@@ -0,0 +1,52 @@
+
+ source ~/environments/clearml/bin/activate
+
+
+ recipe_template=$(cat <<'EOF'
+ quant_stage:
+   quant_modifiers:
+     SmoothQuantModifier:
+       smoothing_strength: 0.0
+       mappings:
+         - [["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"], "re:.*input_layernorm"]
+         - [["re:.*gate_proj", "re:.*up_proj"], "re:.*post_attention_layernorm"]
+         - [["re:.*down_proj"], "re:.*up_proj"]
+     GPTQModifier:
+       sequential_update: true
+       dampening_frac: 0.05
+       ignore: ["lm_head"]
+       scheme: "W8A8"
+       targets: "Linear"
+       observer: "mse"
+ EOF
+ )
+
+ for size in 3B
+ do
+ for version in base
+ do
+ for sq in 0.9
+ do
+
+ recipe=$(echo "$recipe_template" | sed "s/smoothing_strength: 0.0/smoothing_strength: ${sq}/")
+
+ if [ "$version" = "base" ]; then
+   model="Qwen2.5-${size}"
+ else
+   model="Qwen2.5-${size}-Instruct"
+ fi
+
+ prefix="${model//./_}__llm_compressor__calibration__mse__512__8196_damp005_sq${sq//0./0}"
+
+ python /cache/git/research/automation/pipelines/pipeline_llmcompressor_oneshot.py \
+   --model-id "Qwen/${model}" \
+   --project-name "LLM quantization - W8A8/llmcompressor/Qwen2.5" \
+   --task-prefix "$prefix" \
+   --recipe "${recipe}" \
+   --num-samples 512 \
+   --max-seq-len 8196 \
+   --tags "Qwen2.5" "W8A8" "calibration" $size "MSE" "SQ"
+
+ done
+ done
+ done
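
To sanity-check the sed templating before queueing anything, the rendered recipe for a given smoothing strength can be previewed directly (output goes to stdout only; nothing is submitted):

    sq=0.9
    echo "$recipe_template" | sed "s/smoothing_strength: 0.0/smoothing_strength: ${sq}/"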