from datasets import load_dataset
from transformers import AutoTokenizer
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot, apply
import argparse
from compressed_tensors.quantization import QuantizationScheme, QuantizationArgs, QuantizationType, QuantizationStrategy

parser = argparse.ArgumentParser()
parser.add_argument('--model_path', type=str, default="/network/eldar/llama31_8b_gsm8k/sparse_transfer/sp2of4_2ep_lr7e-6_bs32_GradClip2_warmup20ba_noKD")
parser.add_argument('--quant_path', type=str, default="output_dir/quant_test")
parser.add_argument('--calib_size', type=int, default=256)  # 32 64 128 256 512 1024 2048
parser.add_argument('--dampening_frac', type=float, default=0.01)  # 0.001 0.003 0.005 0.008 0.01 0.03 0.05 0.08 0.1 0.3
parser.add_argument('--observer', type=str, default="minmax")  # mse or minmax
# TODO: add args for SmoothQuant if needed
args = parser.parse_args()
print(f"[DEBUGGING ARGS] {args}")

# TODO: additionally ablate whether to append the EOS token to the calibration data

model = SparseAutoModelForCausalLM.from_pretrained(
    args.model_path,
    device_map="auto",
    torch_dtype="auto",
    use_cache=False,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(args.model_path)

NUM_CALIBRATION_SAMPLES = args.calib_size
DATASET_ID = "garage-bAInd/Open-Platypus"
DATASET_SPLIT = "train"

# Load and subsample the calibration dataset.
ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))


def preprocess(example):
    # Concatenate instruction and output into a single calibration string.
    concat_txt = example["instruction"] + "\n" + example["output"]
    return {"text": concat_txt}


ds = ds.map(preprocess)
print("================================================================================")
print(f"[For debugging] Calibration data sample is:\n{repr(ds[0]['text'])}")
print("================================================================================")


def tokenize(sample):
    return tokenizer(
        sample["text"],
        padding=False,
        truncation=False,
        add_special_tokens=True,
    )


ds = ds.map(tokenize, remove_columns=ds.column_names)
print("================================================================================")
print(f"[For debugging] Tokenized data sample is:\n{tokenizer.decode(ds[0]['input_ids'])}")
print("================================================================================")

# TODO: why do we need to re-process the calibration data here when it is already fully prepared?
# TODO: are recipe_args used to override recipe params?
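# Note on the recipe below: it applies GPTQ with the preset "W8A8" scheme
# (8-bit weights and activations) to all Linear layers except lm_head. The
# commented-out QuantizationScheme sketches how a custom config_groups entry
# could be wired in instead of the preset (it references args.num_bits and
# args.actorder, which are not currently exposed via argparse).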
# quant_scheme = QuantizationScheme(
#     targets=["Linear"],
#     weights=QuantizationArgs(
#         num_bits=args.num_bits,
#         type=QuantizationType.INT,
#         symmetric=True,
#         group_size=128,
#         strategy=QuantizationStrategy.GROUP,
#         observer=args.observer,
#         actorder=args.actorder,
#     ),
#     input_activations=None,
#     output_activations=None,
# )

recipe = [
    GPTQModifier(
        targets=["Linear"],
        ignore=["lm_head"],
        scheme="W8A8",
        dampening_frac=args.dampening_frac,
        observer=args.observer,
        # config_groups={"group_0": quant_scheme},
    )
]

oneshot(
    model=model,
    dataset=ds,
    # recipe="w4a16_recipe.yaml",
    recipe=recipe,
    # recipe_args={
    #     "quant_stage.quant_modifiers.GPTQModifier.dampening_frac": 100,
    #     "quant_stage.quant_modifiers.GPTQModifier.sequential_update": args.sequential_update,
    #     "quant_stage.quant_modifiers.GPTQModifier.config_groups.group_0.weights.observer": args.observer,
    #     "quant_stage.quant_modifiers.GPTQModifier.config_groups.group_0.weights.actorder": args.actorder,
    # },
    num_calibration_samples=args.calib_size,
    max_seq_length=8192,
)

# apply(
#     recipe=self.recipe,
#     # recipe_stage=None,
#     recipe_args=self.recipe_args,
#     model=self.model,
#     calib_data=calibration_data,  # dataloader
#     start=-1,
#     copy_data=False,
#     accelerator=self.accelerator,  # some accelerator object
#     min_tokens_per_module=None,
# )

# Save to disk compressed.
SAVE_DIR = args.quant_path
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
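
# Optional sanity check (a sketch, not part of the ablation itself): reload the
# compressed checkpoint through the same SparseAutoModelForCausalLM API used
# above and generate a few tokens to confirm the saved artifact is usable.
# The prompt below is an arbitrary placeholder.
# reloaded = SparseAutoModelForCausalLM.from_pretrained(
#     SAVE_DIR, device_map="auto", torch_dtype="auto", trust_remote_code=True
# )
# sample_inputs = tokenizer("What is 12 * 7?", return_tensors="pt").to(reloaded.device)
# print(tokenizer.decode(reloaded.generate(**sample_inputs, max_new_tokens=32)[0]))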