chutesai
/

Ling-1T-FP8

@@ -4,115 +4,6 @@ pipeline_tag: text-generation
 library_name: transformers
 ---
-### FP8 quant info
-This is an FP8 quantization of [Ling-1T](https://huggingface.co/inclusionAI/Ling-1T) produced with [llm-compressor](https://github.com/vllm-project/llm-compressor).
-Script used:
-```python
-from __future__ import annotations
-import torch
-from datasets import load_dataset, Dataset
-from transformers import AutoModelForCausalLM, AutoTokenizer
-from llmcompressor import oneshot
-from llmcompressor.modeling import replace_modules_for_calibration
-from llmcompressor.modifiers.quantization import QuantizationModifier
-from llmcompressor.utils import dispatch_for_generation
-# Hub id passed to both the tokenizer and the model loader.
-MODEL_ID = "inclusionAI/Ling-1T"
-# Used as the tokenizer truncation length and as oneshot's max_seq_length.
-MAX_SEQUENCE_LENGTH = 4096
-# Used to slice the dataset split and as oneshot's num_calibration_samples.
-NUM_CALIBRATION_SAMPLES = 256
-# Dataset id and split handed to load_dataset for calibration data.
-DATASET_ID = "HuggingFaceH4/ultrachat_200k"
-DATASET_SPLIT = "train_sft"
-def _find_decoder_layer_name(model) -> str | None:
-    """
-    Return the decoder layer class name so we can ask llmcompressor to process
-    one layer at a time. This keeps peak memory within the 8×B200 envelope.
-
-    Walks ``model.modules()`` and returns the class name of the first module
-    whose class name ends in "Layer" and which has both ``self_attn`` and
-    ``mlp`` attributes. Returns ``None`` when no module matches.
-    """
-    for module in model.modules():
-        cls_name = module.__class__.__name__
-        # NOTE(review): the "DecoderLayer" test is subsumed by the "Layer"
-        # test, so any class name ending in "Layer" passes this check.
-        if cls_name.endswith("DecoderLayer") or cls_name.endswith("Layer"):
-            # NOTE(review): matching depends on attributes literally named
-            # `self_attn` and `mlp`; verify the model's remote code uses these
-            # names, otherwise this function falls through to None.
-            if hasattr(module, "self_attn") and hasattr(module, "mlp"):
-                return cls_name
-    # No module satisfied both the name and the attribute checks.
-    return None
-def _build_calibration_set(tokenizer):
-    """
-    Build the tokenized calibration dataset.
-
-    Loads the first NUM_CALIBRATION_SAMPLES rows of DATASET_ID/DATASET_SPLIT,
-    renders each row's "messages" through the tokenizer's chat template, then
-    tokenizes the rendered text (truncated to MAX_SEQUENCE_LENGTH) and returns
-    a ``Dataset`` with ``input_ids`` and ``attention_mask`` columns.
-    """
-    # NOTE(review): the slice is applied before the shuffle, so the shuffle
-    # only reorders the first NUM_CALIBRATION_SAMPLES rows; it does not sample
-    # from the whole split.
-    dataset = load_dataset(
-        DATASET_ID,
-        split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]",
-        streaming=False,
-    ).shuffle(seed=42)
-    def apply_template(example):
-        # tokenize=False makes the template return a string, not token ids.
-        text = tokenizer.apply_chat_template(
-            example["messages"],
-            tokenize=False,
-        )
-        return {"text": text}
-    dataset = dataset.map(apply_template)
-    def tokenize_fn(example):
-        # presumably add_special_tokens=False because the chat template has
-        # already inserted the special tokens — TODO confirm
-        encoded = tokenizer(
-            example["text"],
-            padding=False,
-            max_length=MAX_SEQUENCE_LENGTH,
-            truncation=True,
-            add_special_tokens=False,
-            return_tensors="pt",
-        )
-        # squeeze(0) drops the leading dimension added by return_tensors="pt"
-        # (assumes one text per call, so that dimension has size 1).
-        return dict(
-            input_ids=encoded.input_ids.squeeze(0),
-            attention_mask=encoded.attention_mask.squeeze(0),
-        )
-    # Tokenization runs eagerly in a Python loop rather than via dataset.map.
-    return Dataset.from_list([tokenize_fn(row) for row in dataset])
-# Load tokenizer and model; trust_remote_code=True executes the model
-# repository's own Python code.
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID,
-    torch_dtype=torch.bfloat16,
-    device_map=None,
-    low_cpu_mem_usage=True,
-    trust_remote_code=True,
-)
-# NOTE(review): presumably swaps some modules for calibration-friendly
-# equivalents — confirm against the llmcompressor documentation.
-model = replace_modules_for_calibration(model)
-calibration_samples = _build_calibration_set(tokenizer)
-# Module names / "re:"-prefixed regexes handed to the modifier's `ignore`
-# argument, i.e. excluded from quantization.
-ignore_patterns = [
-    "lm_head",
-    "re:.*embed_tokens",
-    "re:.*router",
-    "re:.*gate$",
-    "re:.*shared_expert_gate$",
-]
-# Quantization recipe: target "Linear" modules with the "FP8_DYNAMIC" scheme,
-# skipping everything matched by ignore_patterns.
-recipe = QuantizationModifier(
-    targets="Linear",
-    scheme="FP8_DYNAMIC",
-    ignore=ignore_patterns,
-)
-# Pass the detected decoder layer class name as the only sequential target;
-# if detection returned None, pass None instead.
-decoder_layer = _find_decoder_layer_name(model)
-sequential_targets = [decoder_layer] if decoder_layer is not None else None
-# Run one-shot compression with the recipe and calibration data; output_dir
-# is the destination path given to oneshot.
-oneshot(
-    model=model,
-    dataset=calibration_samples,
-    recipe=recipe,
-    max_seq_length=MAX_SEQUENCE_LENGTH,
-    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
-    sequential_targets=sequential_targets,
-    trust_remote_code_model=True,
-    output_dir="/cache/Ling-1T-FP8",
-)
-```
 <p align="center">
     <img src="https://mdn.alipayobjects.com/huamei_qa8qxu/afts/img/A*4QxcQrBlTiAAAAAAQXAAAAgAemJ7AQ/original" width="100"/>
 </p>

 library_name: transformers
 ---
 <p align="center">
     <img src="https://mdn.alipayobjects.com/huamei_qa8qxu/afts/img/A*4QxcQrBlTiAAAAAAQXAAAAgAemJ7AQ/original" width="100"/>
 </p>