Can you please share the quantization code?
Dear Cpatonn,
Could you share the code you used to quantize this model?
My apologies for the late reply. I have already deleted the quantization script, but it was something like this:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from llmcompressor import oneshot
# Select model and load it.
MODEL_ID = "nvidia/NVIDIA-Nemotron-Nano-12B-v2"
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
# Select calibration dataset.
DATASET_ID = "nvidia/Llama-Nemotron-Post-Training-Dataset"
SUBSET = "default"
split = "train"
# Select number of samples. 256 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 1024
# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, SUBSET, split=split, revision="refs/convert/parquet") \
    .shuffle(seed=42) \
    .remove_columns(['category', 'license', 'reasoning', 'generator', 'used_in_training', 'version', 'system_prompt']) \
    .select(range(NUM_CALIBRATION_SAMPLES))
def preprocess(example):
    conversations = example["input"]
    conversations.append({"role": "assistant", "content": example["output"]})
    return {"text": tokenizer.apply_chat_template(conversations, tokenize=False)}

ds = ds.map(preprocess).select(range(NUM_CALIBRATION_SAMPLES))
# Tokenize inputs.
def tokenize(sample):
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )

ds = ds.map(tokenize, remove_columns=ds.column_names)
recipe = """
quant_stage:
  quant_modifiers:
    AWQModifier:
      config_groups:
        group_0:
          targets: [Linear]
          weights:
            num_bits: 4
            type: int
            symmetric: true
            group_size: 32
            strategy: group
            block_structure: null
            dynamic: false
            actorder: null
            observer: mse
            observer_kwargs: {}
          input_activations: null
          output_activations: null
          format: null
      targets: [Linear]
      ignore: [backbone.embeddings, 're:.*mixer.A_log', 're:.*conv1d.*', 're:.*mixer.D', 're:.*mixer.norm.*',
        're:.*mixer.dt_bias', 're:backbone.norm_f', lm_head]
      mappings:
        - smooth_layer: re:backbone\.layers\.(7|16|25|34|43|52)\.norm$
          balance_layers: ['re:backbone\.layers\.(7|16|25|34|43|52)\.mixer\.q_proj$', 're:backbone\.layers\.(7|16|25|34|43|52)\.mixer\.k_proj$',
            're:backbone\.layers\.(7|16|25|34|43|52)\.mixer\.v_proj$']
        - smooth_layer: re:backbone\.layers\.(7|16|25|34|43|52)\.mixer\.v_proj$
          balance_layers: ['re:backbone\.layers\.(7|16|25|34|43|52)\.mixer\.o_proj$']
        - smooth_layer: re:backbone\.layers\.(1|3|5|8|10|12|14|17|19|21|23|26|28|30|32|35|37|39|41|44|46|48|50|53|55|57|59|61)\.norm$
          balance_layers: ['re:backbone\.layers\.(1|3|5|8|10|12|14|17|19|21|23|26|28|30|32|35|37|39|41|44|46|48|50|53|55|57|59|61)\.mixer\.up_proj$']
        - smooth_layer: re:backbone\.layers\.(1|3|5|8|10|12|14|17|19|21|23|26|28|30|32|35|37|39|41|44|46|48|50|53|55|57|59|61)\.mixer\.up_proj$
          balance_layers: ['re:backbone\.layers\.(1|3|5|8|10|12|14|17|19|21|23|26|28|30|32|35|37|39|41|44|46|48|50|53|55|57|59|61)\.mixer\.down_proj$']
      duo_scaling: true
"""
# Apply algorithms.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    trust_remote_code_model=True,
)
# Save to disk compressed.
SAVE_DIR = "./NVIDIA-Nemotron-Nano-12B-v2-AWQ-4bit"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
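As a quick sanity check you can try loading the compressed checkpoint back, for example with vLLM. This is only a minimal sketch (not part of the original script) and assumes your vLLM build supports the NemotronH architecture and compressed-tensors AWQ checkpoints:

from vllm import LLM, SamplingParams

# Smoke test: load the freshly saved compressed checkpoint and generate a few tokens.
llm = LLM(model=SAVE_DIR, trust_remote_code=True)
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)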
In addition, please replace the modeling_nemotron_h.py file in this model repo with the modeling_nemotron_h.py file in the original model repo.
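If it helps, that file swap can also be done programmatically before uploading, e.g. with huggingface_hub. This is a minimal sketch assuming the file keeps the name modeling_nemotron_h.py in the original repo:

import shutil
from huggingface_hub import hf_hub_download

# Download the original modeling file and copy it over the one written to the quantized output directory.
original_modeling = hf_hub_download(repo_id=MODEL_ID, filename="modeling_nemotron_h.py")
shutil.copy(original_modeling, f"{SAVE_DIR}/modeling_nemotron_h.py")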
Thanks so much for your reply, your work is amazing!