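# One-shot GPTQ quantization of a (sparse) causal LM with llmcompressor.
# Calibrates on samples from Open-Platypus, applies a W8A8 GPTQ recipe to all Linear
# layers except lm_head, and saves the compressed checkpoint to disk. CLI flags expose
# the calibration set size, GPTQ dampening fraction, and observer type.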
import argparse

from compressed_tensors.quantization import QuantizationScheme, QuantizationArgs, QuantizationType, QuantizationStrategy
from datasets import load_dataset
from transformers import AutoTokenizer

from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot, apply
parser = argparse.ArgumentParser()
parser.add_argument('--model_path', type=str, default="/network/eldar/llama31_8b_gsm8k/sparse_transfer/sp2of4_2ep_lr7e-6_bs32_GradClip2_warmup20ba_noKD")
parser.add_argument('--quant_path', type=str, default="output_dir/quant_test")
parser.add_argument('--calib_size', type=int, default=256) # 32 64 128 256 512 1024 2048
parser.add_argument('--dampening_frac', type=float, default=0.01) # 0.001 0.003 0.005 0.008 0.01 0.03 0.05 0.08 0.1 0.3
parser.add_argument('--observer', type=str, default="minmax") # mse or minmax
# TODO: add args for SmoothQuant if needed
args = parser.parse_args()
print(f"[DEBUGGING ARGS] {args}")
# TODO: to ablate on additionally is whether we want to add EOS token to calib data
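# Load the model to be quantized (and its tokenizer) from the given checkpoint.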
model = SparseAutoModelForCausalLM.from_pretrained(
    args.model_path,
    device_map="auto",
    torch_dtype="auto",
    use_cache=False,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(args.model_path)
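# Build the calibration set: shuffle Open-Platypus and keep NUM_CALIBRATION_SAMPLES examples.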
NUM_CALIBRATION_SAMPLES = args.calib_size
DATASET_ID = "garage-bAInd/Open-Platypus"
DATASET_SPLIT = "train"
ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
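# Concatenate each instruction and its output into a single "text" field used for calibration.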
def preprocess(example):
    concat_txt = example["instruction"] + "\n" + example["output"]
    return {"text": concat_txt}
ds = ds.map(preprocess)
print(f"================================================================================")
print(f"[For debugging] Calibration data sample is:\n{repr(ds[0]['text'])}")
print(f"================================================================================")
def tokenize(sample):
    return tokenizer(
        sample["text"],
        padding=False,
        truncation=False,
        add_special_tokens=True,
    )
ds = ds.map(tokenize, remove_columns=ds.column_names)
print(f"================================================================================")
print(f"[For debugging] Tokenized data sample is:\n{tokenizer.decode(ds[0]['input_ids'])}")
print(f"================================================================================")
# TODO: why do we need to re-process the calibration data here when it is already fully prepared?
# TODO: are recipe_args used to override params?
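# Alternative kept for reference: an explicit config group instead of the predefined "W8A8"
# scheme. Note that args.num_bits and args.actorder below have no matching argparse options yet.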
# quant_scheme = QuantizationScheme(
#     targets=["Linear"],
#     weights=QuantizationArgs(
#         num_bits=args.num_bits,
#         type=QuantizationType.INT,
#         symmetric=True,
#         group_size=128,
#         strategy=QuantizationStrategy.GROUP,
#         observer=args.observer,
#         actorder=args.actorder
#     ),
#     input_activations=None,
#     output_activations=None,
# )
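# GPTQ recipe: quantize every Linear layer except lm_head with the predefined W8A8 scheme
# (8-bit weights and 8-bit activations).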
recipe = [
    GPTQModifier(
        targets=["Linear"],
        ignore=["lm_head"],
        scheme="W8A8",
        dampening_frac=args.dampening_frac,
        observer=args.observer,
        # config_groups={"group_0": quant_scheme},
    )
]
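# Apply the recipe in one-shot (post-training) mode using the calibration dataset.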
oneshot(
    model=model,
    dataset=ds,
    # recipe="w4a16_recipe.yaml",
    recipe=recipe,
    # recipe_args={
    #     "quant_stage.quant_modifiers.GPTQModifier.dampening_frac": 100,
    #     "quant_stage.quant_modifiers.GPTQModifier.sequential_update": args.sequential_update,
    #     "quant_stage.quant_modifiers.GPTQModifier.config_groups.group_0.weights.observer": args.observer,
    #     "quant_stage.quant_modifiers.GPTQModifier.config_groups.group_0.weights.actorder": args.actorder,
    # },
    num_calibration_samples=args.calib_size,
    max_seq_length=8192,
)
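# Kept for reference: driving the lower-level `apply` entrypoint (imported above) directly
# instead of going through `oneshot`.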
# apply(
#     recipe=self.recipe,
#     # recipe_stage=None,
#     recipe_args=self.recipe_args,
#     model=self.model,
#     calib_data=calibration_data,  # dataloader
#     start=-1,
#     copy_data=False,
#     accelerator=self.accelerator,  # some accelerator object
#     min_tokens_per_module=None,
# )
# Save the compressed model and tokenizer to disk.
SAVE_DIR = args.quant_path
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
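# A minimal sketch (assuming the same llmcompressor version used above) of reloading the
# compressed checkpoint for a quick sanity check:
#   model = SparseAutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto", torch_dtype="auto")
#   tokenizer = AutoTokenizer.from_pretrained(SAVE_DIR)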