import argparse

from datasets import load_dataset
from transformers import AutoTokenizer

from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot

# Command-line arguments for the one-shot GPTQ quantization run.
parser = argparse.ArgumentParser()
parser.add_argument('--model_path', type=str, default="/network/eldar/llama31_8b_gsm8k/sparse_transfer/sp2of4_2ep_lr7e-6_bs32_GradClip2_warmup20ba_noKD")
parser.add_argument('--quant_path', type=str, default="output_dir/quant_test")
parser.add_argument('--calib_size', type=int, default=256)
parser.add_argument('--dampening_frac', type=float, default=0.01)
parser.add_argument('--observer', type=str, default="minmax")
args = parser.parse_args()

print(f"[DEBUGGING ARGS] {args}")
# Load the sparse model and its tokenizer.
model = SparseAutoModelForCausalLM.from_pretrained(
    args.model_path,
    device_map="auto",
    torch_dtype="auto",
    use_cache=False,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(args.model_path)
# Calibration data: a shuffled subset of the Open-Platypus instruction dataset.
NUM_CALIBRATION_SAMPLES = args.calib_size
DATASET_ID = "garage-bAInd/Open-Platypus"
DATASET_SPLIT = "train"
ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
def preprocess(example):
    """Concatenate instruction and output into a single calibration string."""
    concat_txt = example["instruction"] + "\n" + example["output"]
    return {"text": concat_txt}


ds = ds.map(preprocess)
print("================================================================================")
print(f"[For debugging] Calibration data sample is:\n{repr(ds[0]['text'])}")
print("================================================================================")
def tokenize(sample):
    """Tokenize a full calibration sample without padding or truncation."""
    return tokenizer(
        sample["text"],
        padding=False,
        truncation=False,
        add_special_tokens=True,
    )


ds = ds.map(tokenize, remove_columns=ds.column_names)
print("================================================================================")
print(f"[For debugging] Tokenized data sample is:\n{tokenizer.decode(ds[0]['input_ids'])}")
print("================================================================================")
# GPTQ recipe: quantize every Linear layer to 8-bit weights and activations
# (W8A8), leaving lm_head in full precision.
recipe = [
    GPTQModifier(
        targets=["Linear"],
        ignore=["lm_head"],
        scheme="W8A8",
        dampening_frac=args.dampening_frac,
        observer=args.observer,
    )
]
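# The scheme="W8A8" preset resolves to int8 weights and activations. For finer
# control, GPTQModifier also accepts explicit config_groups built from the
# compressed_tensors primitives this script originally imported. A rough,
# commented-out sketch (the QuantizationArgs fields below mirror W8A8 only
# approximately; verify against your llmcompressor version before relying on it):
#
# from compressed_tensors.quantization import (
#     QuantizationArgs, QuantizationScheme, QuantizationStrategy, QuantizationType,
# )
#
# w8a8_scheme = QuantizationScheme(
#     targets=["Linear"],
#     weights=QuantizationArgs(
#         num_bits=8,
#         type=QuantizationType.INT,
#         strategy=QuantizationStrategy.CHANNEL,
#         symmetric=True,
#     ),
#     input_activations=QuantizationArgs(
#         num_bits=8,
#         type=QuantizationType.INT,
#         strategy=QuantizationStrategy.TOKEN,
#         dynamic=True,
#         symmetric=True,
#     ),
# )
# recipe = [GPTQModifier(config_groups={"group_0": w8a8_scheme}, ignore=["lm_head"],
#                        dampening_frac=args.dampening_frac, observer=args.observer)]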
# Run one-shot calibration: GPTQ passes the calibration set through the model
# and quantizes it layer by layer.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    num_calibration_samples=args.calib_size,
    max_seq_length=8192,
)
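# Quick smoke test (an addition beyond the original flow): generate a short
# completion with the freshly quantized model to catch gross regressions
# before saving. The prompt is arbitrary.
sample_input = tokenizer("What is 7 * 8?", return_tensors="pt").input_ids.to(model.device)
sample_output = model.generate(sample_input, max_new_tokens=32)
print(f"[For debugging] Post-quantization sample:\n{tokenizer.decode(sample_output[0])}")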
# Save the quantized model and tokenizer in compressed safetensors format.
SAVE_DIR = args.quant_path
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
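# The compressed checkpoint can then be served by any runtime that understands
# the compressed-tensors format. A minimal commented-out sketch, assuming a
# vLLM installation with compressed-tensors support:
#
# from vllm import LLM, SamplingParams
#
# llm = LLM(model=SAVE_DIR)
# outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=32))
# print(outputs[0].outputs[0].text)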