Can you please share the quantization code?

#1
by HanG - opened

Dear Cpatonn,
Can you share any code to quantize this model?

Owner

My apologies for the late reply. I already deleted the quantization script, but it was something like this:

from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from llmcompressor import oneshot

# Select model and load it.
MODEL_ID = "nvidia/NVIDIA-Nemotron-Nano-12B-v2"

model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Select calibration dataset.
DATASET_ID = "nvidia/Llama-Nemotron-Post-Training-Dataset"
SUBSET = "default"
split = "train"

# Select number of samples. 256 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 1024

# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, SUBSET, split=split, revision="refs/convert/parquet") \
    .shuffle(seed=42) \
    .remove_columns(['category', 'license', 'reasoning', 'generator', 'used_in_training', 'version', 'system_prompt']) \
    .select(range(NUM_CALIBRATION_SAMPLES))

def preprocess(example):
    conversations = example["input"]
    conversations.append({"role": "assistant", "content": example["output"]})
    return {"text": tokenizer.apply_chat_template(conversations, tokenize=False,)}

ds = ds.map(preprocess).select(range(NUM_CALIBRATION_SAMPLES))

# Tokenize inputs.
def tokenize(sample):
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


ds = ds.map(tokenize, remove_columns=ds.column_names)

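# AWQ recipe: quantize all Linear layers to 4-bit symmetric int weights (group size 32,
# MSE observer), while leaving the embeddings, lm_head, and the Mamba mixer internals
# (A_log, conv1d, D, mixer norms, dt_bias) unquantized. The mappings list the
# norm/projection pairs used for AWQ smoothing.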
recipe = """
quant_stage:
    quant_modifiers:
        AWQModifier:
        config_groups:
            group_0:
            targets: [Linear]
            weights:
                num_bits: 4
                type: int
                symmetric: true
                group_size: 32
                strategy: group
                block_structure: null
                dynamic: false
                actorder: null
                observer: mse
                observer_kwargs: {}
            input_activations: null
            output_activations: null
            format: null
        targets: [Linear]
        ignore: [backbone.embeddings, 're:.*mixer.A_log', 're:.*conv1d.*', 're:.*mixer.D', 're:.*mixer.norm.*',
            're:.*mixer.dt_bias', 're:backbone.norm_f', lm_head]
        mappings:
        - smooth_layer: re:backbone\.layers\.(7|16|25|34|43|52)\.norm$
            balance_layers: ['re:backbone\.layers\.(7|16|25|34|43|52)\.mixer\.q_proj$', 're:backbone\.layers\.(7|16|25|34|43|52)\.mixer\.k_proj$',
            're:backbone\.layers\.(7|16|25|34|43|52)\.mixer\.v_proj$']
        - smooth_layer: re:backbone\.layers\.(7|16|25|34|43|52)\.mixer\.v_proj$
            balance_layers: ['re:backbone\.layers\.(7|16|25|34|43|52)\.mixer\.o_proj$']
        - smooth_layer: re:backbone\.layers\.(1|3|5|8|10|12|14|17|19|21|23|26|28|30|32|35|37|39|41|44|46|48|50|53|55|57|59|61)\.norm$
            balance_layers: ['re:backbone\.layers\.(1|3|5|8|10|12|14|17|19|21|23|26|28|30|32|35|37|39|41|44|46|48|50|53|55|57|59|61)\.mixer\.up_proj$']
        - smooth_layer: re:backbone\.layers\.(1|3|5|8|10|12|14|17|19|21|23|26|28|30|32|35|37|39|41|44|46|48|50|53|55|57|59|61)\.mixer\.up_proj$
            balance_layers: ['re:backbone\.layers\.(1|3|5|8|10|12|14|17|19|21|23|26|28|30|32|35|37|39|41|44|46|48|50|53|55|57|59|61)\.mixer\.down_proj$']
        duo_scaling: true
"""

# Apply algorithms.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    trust_remote_code_model=True
)

# Save to disk compressed.
SAVE_DIR = "./NVIDIA-Nemotron-Nano-12B-v2-AWQ-4bit"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
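
If it helps, here is a rough, optional sanity check of the saved checkpoint (not part of the original script). It assumes your vLLM build supports this checkpoint and that you have already swapped modeling_nemotron_h.py as described below:

# Optional: quick generation test on the compressed model (a sketch; vLLM support assumed).
from vllm import LLM, SamplingParams

llm = LLM(model="./NVIDIA-Nemotron-Nano-12B-v2-AWQ-4bit", trust_remote_code=True)
outputs = llm.generate(["Hello, who are you?"], SamplingParams(max_tokens=64))
print(outputs[0].outputs[0].text)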

In addition, please replace the modeling_nemotron_h.py file in this model repo with the modeling_nemotron_h.py file in the original model repo.
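
For reference, one way to do that replacement programmatically (a small sketch using huggingface_hub; the output directory matches the script above):

# Sketch: copy modeling_nemotron_h.py from the original repo into the quantized output folder.
import shutil
from huggingface_hub import hf_hub_download

original_file = hf_hub_download(
    repo_id="nvidia/NVIDIA-Nemotron-Nano-12B-v2",  # original model repo
    filename="modeling_nemotron_h.py",
)
shutil.copy(original_file, "./NVIDIA-Nemotron-Nano-12B-v2-AWQ-4bit/modeling_nemotron_h.py")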

Thanks so much for your reply, your work is amazing!
