Can you please share the quantization code?
Dear Cpatonn,
Could you share the code you used to quantize this model?
My apologies for the late reply. I have already deleted the quantization script, but it was something like this:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from llmcompressor import oneshot
# Select model and load it.
MODEL_ID = "nvidia/NVIDIA-Nemotron-Nano-12B-v2"
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
# Select calibration dataset.
DATASET_ID = "nvidia/Llama-Nemotron-Post-Training-Dataset"
SUBSET = "default"
split = "train"
# Select number of samples. 256 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 1024
# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, SUBSET, split=split, revision="refs/convert/parquet") \
    .shuffle(seed=42) \
    .remove_columns(['category', 'license', 'reasoning', 'generator', 'used_in_training', 'version', 'system_prompt']) \
    .select(range(NUM_CALIBRATION_SAMPLES))
def preprocess(example):
    conversations = example["input"]
    conversations.append({"role": "assistant", "content": example["output"]})
    return {"text": tokenizer.apply_chat_template(conversations, tokenize=False)}

ds = ds.map(preprocess).select(range(NUM_CALIBRATION_SAMPLES))
# Tokenize inputs.
def tokenize(sample):
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )

ds = ds.map(tokenize, remove_columns=ds.column_names)
recipe = """
quant_stage:
  quant_modifiers:
    AWQModifier:
      config_groups:
        group_0:
          targets: [Linear]
          weights:
            num_bits: 4
            type: int
            symmetric: true
            group_size: 32
            strategy: group
            block_structure: null
            dynamic: false
            actorder: null
            observer: mse
            observer_kwargs: {}
          input_activations: null
          output_activations: null
          format: null
      targets: [Linear]
      ignore: [backbone.embeddings, 're:.*mixer.A_log', 're:.*conv1d.*', 're:.*mixer.D', 're:.*mixer.norm.*',
        're:.*mixer.dt_bias', 're:backbone.norm_f', lm_head]
      mappings:
        - smooth_layer: re:backbone\.layers\.(7|16|25|34|43|52)\.norm$
          balance_layers: ['re:backbone\.layers\.(7|16|25|34|43|52)\.mixer\.q_proj$', 're:backbone\.layers\.(7|16|25|34|43|52)\.mixer\.k_proj$',
            're:backbone\.layers\.(7|16|25|34|43|52)\.mixer\.v_proj$']
        - smooth_layer: re:backbone\.layers\.(7|16|25|34|43|52)\.mixer\.v_proj$
          balance_layers: ['re:backbone\.layers\.(7|16|25|34|43|52)\.mixer\.o_proj$']
        - smooth_layer: re:backbone\.layers\.(1|3|5|8|10|12|14|17|19|21|23|26|28|30|32|35|37|39|41|44|46|48|50|53|55|57|59|61)\.norm$
          balance_layers: ['re:backbone\.layers\.(1|3|5|8|10|12|14|17|19|21|23|26|28|30|32|35|37|39|41|44|46|48|50|53|55|57|59|61)\.mixer\.up_proj$']
        - smooth_layer: re:backbone\.layers\.(1|3|5|8|10|12|14|17|19|21|23|26|28|30|32|35|37|39|41|44|46|48|50|53|55|57|59|61)\.mixer\.up_proj$
          balance_layers: ['re:backbone\.layers\.(1|3|5|8|10|12|14|17|19|21|23|26|28|30|32|35|37|39|41|44|46|48|50|53|55|57|59|61)\.mixer\.down_proj$']
      duo_scaling: true
"""
# Apply algorithms.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    trust_remote_code_model=True,
)
# Save to disk compressed.
SAVE_DIR = "./NVIDIA-Nemotron-Nano-12B-v2-AWQ-4bit"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
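As a quick sanity check you can try loading the compressed checkpoint back, for example with vLLM. This is only a minimal sketch (not part of the original script) and assumes your vLLM build supports the NemotronH architecture and compressed-tensors AWQ checkpoints:

from vllm import LLM, SamplingParams

# Smoke test: load the freshly saved compressed checkpoint and generate a few tokens.
llm = LLM(model=SAVE_DIR, trust_remote_code=True)
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)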
In addition, please replace the modeling_nemotron_h.py file in this model repo with the modeling_nemotron_h.py file in the original model repo.
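If it helps, that file swap can also be done programmatically before uploading, e.g. with huggingface_hub. This is a minimal sketch assuming the file keeps the name modeling_nemotron_h.py in the original repo:

import shutil
from huggingface_hub import hf_hub_download

# Download the original modeling file and copy it over the one written to the quantized output directory.
original_modeling = hf_hub_download(repo_id=MODEL_ID, filename="modeling_nemotron_h.py")
shutil.copy(original_modeling, f"{SAVE_DIR}/modeling_nemotron_h.py")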
Thanks so much for your reply, your work is amazing!