amd
/

Llama-3.1-8B-Instruct-FP8-KV

Model card Files Files and versions

bowenbaoamd committed on Oct 7, 2024

Commit

870f255

·

verified ·

1 Parent(s): d4f9cb6

Update README.md

Remove trailing spaces.

Files changed (1) hide show

README.md +4 -4

README.md CHANGED Viewed

@@ -18,9 +18,9 @@ license_link: https://github.com/meta-llama/llama-models/blob/main/models/llama3
 ```sh
 export MODEL_DIR = [local model checkpoint folder] or meta-llama/Meta-Llama-3.1-8B-Instruct
 # single GPU
-python3 quantize_quark.py \
         --model_dir $MODEL_DIR \
-        --output_dir Meta-Llama-3.1-8B-Instruct-FP8-KV \
         --quant_scheme w_fp8_a_fp8 \
         --kv_cache_dtype fp8 \
         --num_calib_data 128 \
@@ -28,9 +28,9 @@ python3 quantize_quark.py \
         --no_weight_matrix_merge
 # If model size is too large for single GPU, please use multi GPU instead.
-python3 quantize_quark.py \
         --model_dir $MODEL_DIR \
-        --output_dir Meta-Llama-3.1-8B-Instruct-FP8-KV \
         --quant_scheme w_fp8_a_fp8 \
         --kv_cache_dtype fp8 \
         --num_calib_data 128 \

 ```sh
 export MODEL_DIR = [local model checkpoint folder] or meta-llama/Meta-Llama-3.1-8B-Instruct
 # single GPU
+python3 quantize_quark.py \
         --model_dir $MODEL_DIR \
+        --output_dir Meta-Llama-3.1-8B-Instruct-FP8-KV \
         --quant_scheme w_fp8_a_fp8 \
         --kv_cache_dtype fp8 \
         --num_calib_data 128 \
         --no_weight_matrix_merge
 # If model size is too large for single GPU, please use multi GPU instead.
+python3 quantize_quark.py \
         --model_dir $MODEL_DIR \
+        --output_dir Meta-Llama-3.1-8B-Instruct-FP8-KV \
         --quant_scheme w_fp8_a_fp8 \
         --kv_cache_dtype fp8 \
         --num_calib_data 128 \