# evaluate_qwen2.5_w4a16.sh — Qwen2.5 W4A16 evaluation queueing script
# Uploaded via huggingface_hub by alexmarques (commit 1a552f6, verified)
#!/usr/bin/env bash
# Queue lm-evaluation-harness (vLLM backend) ClearML tasks for the
# Qwen2.5 W4A16 model family on the openllm benchmark suite.
#
# Each entry below pairs a ClearML model ID with the model name that
# becomes the task-name prefix. Jobs are submitted in the same order as
# the original script (0.5B → 32B, base then Instruct).
set -euo pipefail

# Virtualenv providing the ClearML queueing script's dependencies.
source ~/environments/clearml/bin/activate

readonly QUEUE_SCRIPT=/cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py
readonly QUEUE_NAME=oneshot-a100x1
readonly PROJECT_NAME="LLM quantization - W4A16/llmcompressor/Qwen2.5"

# "<clearml-model-id>:<model-name>" pairs.
readonly -a MODELS=(
  "ece15d6b3b574a9ba3062f8922dda9d9:Qwen2.5-0.5B"
  "b3c075c7685a42c798ebe91bdb1bb140:Qwen2.5-0.5B-Instruct"
  "70d4376f576241eb9ca066ce2864c517:Qwen2.5-1.5B"
  "328b46a2fc3d431c95a8d8900f1b88ce:Qwen2.5-1.5B-Instruct"
  "29785c1921db4680a1bcbb1c32b1bb0c:Qwen2.5-3B"
  "bb496bef644e4336afba44944aa8e915:Qwen2.5-3B-Instruct"
  "7ab087ff9ada4033a52350078a2a665a:Qwen2.5-7B"
  "0a0cf6bf30c94a3ea96babf1c92f23a2:Qwen2.5-7B-Instruct"
  "7da4e5e0e2db49c984d3e195ab09ae09:Qwen2.5-32B"
  "79b0a7340f2247068940085e66a7e17b:Qwen2.5-32B-Instruct"
)

for entry in "${MODELS[@]}"; do
  model_id=${entry%%:*}    # part before the first ':'
  model_name=${entry#*:}   # part after the first ':'
  python "$QUEUE_SCRIPT" \
    --model-id "$model_id" \
    --clearml-model \
    --queue-name "$QUEUE_NAME" \
    --project-name "$PROJECT_NAME" \
    --task-name "${model_name}/openllm/vllm" \
    --benchmark-tasks openllm \
    --max-model-len 4096 \
    --add-bos-token \
    --batch-size auto \
    --enable-chunked-prefill \
    --gpu-memory-utilization 0.9 \
    --max-num-batched-tokens 256
done