jondurbin committed on
Commit
797172c
·
verified ·
1 Parent(s): dd4a87c

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +0 -109
README.md CHANGED
@@ -4,115 +4,6 @@ pipeline_tag: text-generation
4
  library_name: transformers
5
  ---
6
 
7
- ### FP8 quant info
8
-
9
- This is an FP8 quantization of [Ling-1T](https://huggingface.co/inclusionAI/Ling-1T) via [llm-compressor](https://github.com/vllm-project/llm-compressor)
10
-
11
- Script used:
12
- ```python
13
- from __future__ import annotations
14
- import torch
15
- from datasets import load_dataset, Dataset
16
- from transformers import AutoModelForCausalLM, AutoTokenizer
17
-
18
- from llmcompressor import oneshot
19
- from llmcompressor.modeling import replace_modules_for_calibration
20
- from llmcompressor.modifiers.quantization import QuantizationModifier
21
- from llmcompressor.utils import dispatch_for_generation
22
-
23
- MODEL_ID = "inclusionAI/Ling-1T"
24
- MAX_SEQUENCE_LENGTH = 4096
25
- NUM_CALIBRATION_SAMPLES = 256
26
- DATASET_ID = "HuggingFaceH4/ultrachat_200k"
27
- DATASET_SPLIT = "train_sft"
28
-
29
- def _find_decoder_layer_name(model) -> str | None:
30
- """
31
- Return the decoder layer class name so we can ask llmcompressor to process
32
- one layer at a time. This keeps peak memory within the 8×B200 envelope.
33
- """
34
- for module in model.modules():
35
- cls_name = module.__class__.__name__
36
- if cls_name.endswith("DecoderLayer") or cls_name.endswith("Layer"):
37
- if hasattr(module, "self_attn") and hasattr(module, "mlp"):
38
- return cls_name
39
- return None
40
-
41
-
42
- def _build_calibration_set(tokenizer):
43
- dataset = load_dataset(
44
- DATASET_ID,
45
- split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]",
46
- streaming=False,
47
- ).shuffle(seed=42)
48
-
49
- def apply_template(example):
50
- text = tokenizer.apply_chat_template(
51
- example["messages"],
52
- tokenize=False,
53
- )
54
- return {"text": text}
55
-
56
- dataset = dataset.map(apply_template)
57
-
58
- def tokenize_fn(example):
59
- encoded = tokenizer(
60
- example["text"],
61
- padding=False,
62
- max_length=MAX_SEQUENCE_LENGTH,
63
- truncation=True,
64
- add_special_tokens=False,
65
-
66
- return_tensors="pt",
67
- )
68
- return dict(
69
- input_ids=encoded.input_ids.squeeze(0),
70
- attention_mask=encoded.attention_mask.squeeze(0),
71
- )
72
-
73
- return Dataset.from_list([tokenize_fn(row) for row in dataset])
74
-
75
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
76
- model = AutoModelForCausalLM.from_pretrained(
77
- MODEL_ID,
78
- torch_dtype=torch.bfloat16,
79
- device_map=None,
80
- low_cpu_mem_usage=True,
81
- trust_remote_code=True,
82
- )
83
- model = replace_modules_for_calibration(model)
84
-
85
- calibration_samples = _build_calibration_set(tokenizer)
86
-
87
- ignore_patterns = [
88
- "lm_head",
89
- "re:.*embed_tokens",
90
- "re:.*router",
91
- "re:.*gate$",
92
- "re:.*shared_expert_gate$",
93
- ]
94
-
95
- recipe = QuantizationModifier(
96
- targets="Linear",
97
- scheme="FP8_DYNAMIC",
98
- ignore=ignore_patterns,
99
- )
100
-
101
- decoder_layer = _find_decoder_layer_name(model)
102
- sequential_targets = [decoder_layer] if decoder_layer is not None else None
103
-
104
- oneshot(
105
- model=model,
106
- dataset=calibration_samples,
107
- recipe=recipe,
108
- max_seq_length=MAX_SEQUENCE_LENGTH,
109
- num_calibration_samples=NUM_CALIBRATION_SAMPLES,
110
- sequential_targets=sequential_targets,
111
- trust_remote_code_model=True,
112
- output_dir="/cache/Ling-1T-FP8",
113
- )
114
- ```
115
-
116
  <p align="center">
117
  <img src="https://mdn.alipayobjects.com/huamei_qa8qxu/afts/img/A*4QxcQrBlTiAAAAAAQXAAAAgAemJ7AQ/original" width="100"/>
118
  </p>
 
4
  library_name: transformers
5
  ---
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  <p align="center">
8
  <img src="https://mdn.alipayobjects.com/huamei_qa8qxu/afts/img/A*4QxcQrBlTiAAAAAAQXAAAAgAemJ7AQ/original" width="100"/>
9
  </p>