Luigi committed on
Commit 1f5b86c · 1 Parent(s): a95ded1

Re-apply unsloth following official guidance

Files changed (1)
  1. train_with_unsloth.py +88 -161
train_with_unsloth.py CHANGED
@@ -1,204 +1,131 @@
#! /usr/bin/env python3
- import unsloth
- from transformers import AutoModelForCausalLM
- from transformers import AutoTokenizer
from unsloth import FastLanguageModel
from transformers import DataCollatorForLanguageModeling
- from transformers import TrainingArguments, Trainer
- from transformers import pipeline
from datasets import load_dataset
- import torch
import os
- import wandb
from transformers.integrations import WandbCallback
- import math
- from transformers import EvalPrediction

PROJECT_NAME='SmolLM2-135M-Instruct-TaiwanChat'
- BASE_MODEL_ID="HuggingFaceTB/SmolLM2-135M-Instruct"
DATASET_ID="yentinglin/TaiwanChat"
N_SAMPLES=80000
- MAX_LEN=512

# Tell wandb which project to use, and that you want to log your model
os.environ["WANDB_PROJECT"] = f"{PROJECT_NAME}_CLOUD"
os.environ["WANDB_LOG_MODEL"] = "end"

- # Detect GPU Type
- device_str='cpu'
- if torch.xpu.is_available():
-     device_str='xpu'
- elif torch.cuda.is_available():
-     device_str='cuda'
- print(f'Device is {device_str}')
-
## Load with Unsloth’s optimized API
# 1) Load quantized model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = BASE_MODEL_ID,
    max_seq_length = MAX_LEN,
-     dtype = torch.float16,
    load_in_4bit = True,
    full_finetuning= False, # we will add LoRA adapters next
)

# 2) Prepare it for k‑bit training (sets up layer norms, disables caching, etc.)
- from peft import prepare_model_for_kbit_training
- model = prepare_model_for_kbit_training(model)
-
- # 3) Attach LoRA adapters on top of the quantized weights
- from peft import LoraConfig, get_peft_model, TaskType
-
- lora_config = LoraConfig(
-     r = 8,                                  # low‑rank dimension
-     lora_alpha = 16,                        # scaling
-     target_modules = ["q_proj", "v_proj"],  # apply to attention
-     bias = "none",
-     task_type = TaskType.CAUSAL_LM,
-     inference_mode = False,
)
- model = get_peft_model(model, lora_config)
-
- # Now `model` has ~1–2% trainable parameters (the LoRA adapters),
- # and Trainer will no longer throw the “purely quantized” error.

# Prepare the TaiwanChat Dataset
# 1) Load & split
- full_ds = load_dataset(DATASET_ID, split=f"train[:{N_SAMPLES}]")
- splits = full_ds.train_test_split(test_size=0.1, seed=42)
- train_ds = splits["train"]
- val_ds = splits["test"]
-
- # Preprocessing Function
- def preprocess_examples(examples):
-     chats = examples["messages"]
-     # 1) Render ChatML
-     text = tokenizer.apply_chat_template(
-         chats, tokenize=False, add_generation_prompt=True
-     )
-
-     # 2) Tokenize _and_ pad/truncate to MAX_LEN
-     toks = tokenizer(
-         text,
-         truncation=True,
-         padding="max_length",
-         max_length=MAX_LEN,
)
-     input_ids = toks["input_ids"]
-     attention_mask= toks["attention_mask"]
-
-     # 3) Find where the assistant reply starts
-     role_id = tokenizer.convert_tokens_to_ids("<|im_start|>assistant")
-     if role_id in input_ids:
-         idx = input_ids.index(role_id)
-         start_of_reply = idx + 2
-     else:
-         start_of_reply = 0
-
-     # 4) Build labels: -100 before reply, then copy the rest
-     labels = [-100] * start_of_reply + input_ids[start_of_reply:]
-
-     # 5) Pad or truncate labels to EXACTLY len(input_ids)
-     if len(labels) < len(input_ids):
-         labels += [-100] * (len(input_ids) - len(labels))
-     else:
-         labels = labels[: len(input_ids)]
-
-     return {
-         "input_ids": input_ids,
-         "attention_mask": attention_mask,
-         "labels": labels,
-     }
- # Tokenization & Data Collator
- tokenized_train = train_ds.map(
-     preprocess_examples, batched=True, remove_columns=train_ds.column_names
- )
- tokenized_val = val_ds.map(
-     preprocess_examples, batched=True, remove_columns=val_ds.column_names
- )
-
- data_collator = DataCollatorForLanguageModeling(
-     tokenizer=tokenizer, mlm=False
- )
-
- training_args = TrainingArguments(
-     output_dir=PROJECT_NAME,
-     per_device_train_batch_size=2,
-     gradient_accumulation_steps = 16,
-     learning_rate=5e-5,
-     num_train_epochs=3,
-     fp16=False if device_str == 'xpu' else True,
-     bf16=True if device_str == 'xpu' else False,
-     #evaluation_strategy = "steps", # run validation every eval_steps
-     #eval_steps = 1000,
-     #load_best_model_at_end = True,
-     #metric_for_best_model = "perplexity",
-     greater_is_better = False,
-     logging_steps=1000,
-     save_steps=5000,
-
-     # ─── W&B integration ───
-     logging_dir=f"{PROJECT_NAME}/logs", # where to store TensorBoard/W&B logs
-     report_to=["wandb"], # enable W&B reporting
-     run_name=f"{PROJECT_NAME}_CLOUD", # name this run in your W&B project
-
-     push_to_hub=True,
-     gradient_checkpointing=True,
- )
-
- # Enable gradient checkpointing on the model
- model.gradient_checkpointing_enable()
-
- # Define a metrics function
- def compute_metrics(p: EvalPrediction):
-     # p.predictions are logits: (batch, seq_len, vocab_size)
-     # p.label_ids are (batch, seq_len)
-     # The Trainer will automatically compute loss on eval_dataset
-     # We can pull that from p.metrics if available,
-     # but simplest is to use returned "eval_loss" in Trainer.evaluate()
-     # Here we compute perplexity manually:
-     eval_loss = p.metrics["eval_loss"] if "eval_loss" in p.metrics else None
-     if eval_loss is None:
-         raise ValueError("eval_loss not found in metrics; ensure compute_metrics is called after evaluation.")
-     return {"perplexity": math.exp(eval_loss)}
-

# Training with Trainer
- trainer = Trainer(
    model=model,
    args=training_args,
-     train_dataset=tokenized_train,
-     #eval_dataset=tokenized_val,
-     compute_metrics=compute_metrics,
-     data_collator=data_collator,
    callbacks=[WandbCallback], # ensure the W&B callback is attached
)
- trainer.train(resume_from_checkpoint=True)

- # Save Model & Tokenizer Locally
- trainer.save_model(PROJECT_NAME)
- trainer.push_to_hub(f'Luigi/{PROJECT_NAME}')
- tokenizer.save_pretrained(PROJECT_NAME)

- # 1) Load from local folder
- model_dir = PROJECT_NAME
- tokenizer = AutoTokenizer.from_pretrained(model_dir)
- model = AutoModelForCausalLM.from_pretrained(model_dir) # loads your fine‑tuned weights

- # ─── patch for Unsloth’s fast-forward ───
- # Unsloth expects a `max_seq_length` attribute on the model
- setattr(model, "max_seq_length", MAX_LEN)

- # Test Fine-tuned Model
- hf_device = 0 if device_str in ("cuda","xpu") else -1
gen = pipeline(
-     "text-generation",
-     model=model,
-     tokenizer=tokenizer,
-     device=hf_device, # or device=0 for GPU
-     max_new_tokens=512, # customize as desired
)
-
prompt = "請問台北今天的天氣如何?"
- output = gen(prompt, do_sample=True, temperature=0.8)
- print(output[0]["generated_text"])
 
#! /usr/bin/env python3
+ """
+ Fine-tune “SmolLM2-135M-Instruct” on the TaiwanChat dataset using Unsloth’s 4-bit quantization
+ + LoRA adapters, with evaluation on a 1% hold-out every step, and push the merged model to Hugging Face.
+
+ Steps:
+ 1. Load a 4-bit quantized base model via Unsloth’s FastLanguageModel.
+ 2. Attach LoRA adapters (r=16) and enable gradient checkpointing for memory savings.
+ 3. Load TaiwanChat, render ChatML, and split 99/1 train/validation.
+ 4. Configure SFTTrainer to mask user prompts (train_on_responses_only), run eval every step, log to W&B.
+ 5. Train for up to 60 steps.
+ 6. Merge base+LoRA weights into 16-bit safetensors and push to Hugging Face with `push_to_hub_merged`.
+ """
+
from unsloth import FastLanguageModel
+ from trl import SFTTrainer, SFTConfig
from transformers import DataCollatorForLanguageModeling
+ from unsloth.chat_templates import train_on_responses_only
from datasets import load_dataset
import os
from transformers.integrations import WandbCallback
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

PROJECT_NAME='SmolLM2-135M-Instruct-TaiwanChat'
+ BASE_MODEL_ID="unsloth/SmolLM2-135M-Instruct"
DATASET_ID="yentinglin/TaiwanChat"
N_SAMPLES=80000
+ MAX_LEN=2048

# Tell wandb which project to use, and that you want to log your model
os.environ["WANDB_PROJECT"] = f"{PROJECT_NAME}_CLOUD"
os.environ["WANDB_LOG_MODEL"] = "end"

## Load with Unsloth’s optimized API
# 1) Load quantized model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = BASE_MODEL_ID,
    max_seq_length = MAX_LEN,
    load_in_4bit = True,
    full_finetuning= False, # we will add LoRA adapters next
)

# 2) Prepare it for k‑bit training (sets up layer norms, disables caching, etc.)
+ model = FastLanguageModel.get_peft_model(
+     model,
+     r = 16,
+     target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
+                       "gate_proj", "up_proj", "down_proj",],
+     lora_alpha = 16,
+     lora_dropout = 0, # Supports any, but = 0 is optimized
+     bias = "none",    # Supports any, but = "none" is optimized
+     # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
+     use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
+     random_state = 3407,
+     max_seq_length = MAX_LEN,
+     use_rslora = False,  # We support rank stabilized LoRA
+     loftq_config = None, # And LoftQ
)

# Prepare the TaiwanChat Dataset
# 1) Load & split
+ dataset = load_dataset(DATASET_ID, split=f"train[:{N_SAMPLES}]")
+
+ # turn list-of-messages → a single “text” string per example, using Unsloth’s ChatML template
+ def fmt(examples):
+     texts = [
+         tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
+         for msgs in examples["messages"]
+     ]
+     return {"text": texts}
+
+ dataset = dataset.map(fmt, batched=True, remove_columns=["messages"])
+ new_dataset = dataset.train_test_split(test_size = 0.01)
+
+ training_args = SFTConfig(
+     fp16_full_eval = True,
+     per_device_eval_batch_size = 2,
+     eval_accumulation_steps = 4,
+     eval_strategy = "steps",
+     eval_steps = 1,
+     dataset_text_field="text",
+     output_dir=PROJECT_NAME,
+     max_seq_length = MAX_LEN,
+     per_device_train_batch_size = 2,
+     gradient_accumulation_steps = 4,
+     warmup_steps = 10,
+     max_steps = 60,
+     logging_steps = 1,
+     optim = "adamw_8bit",
+     seed = 3407,
+     # ─── W&B integration ───
+     logging_dir=f"{PROJECT_NAME}/logs", # where to store TensorBoard/W&B logs
+     report_to=["wandb"], # enable W&B reporting
+     run_name=f"{PROJECT_NAME}_CLOUD", # name this run in your W&B project
+     push_to_hub=True,
+     gradient_checkpointing=True
)

# Training with Trainer
+ trainer = SFTTrainer(
    model=model,
    args=training_args,
+     data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer, mlm=False),
+     tokenizer=tokenizer,
    callbacks=[WandbCallback], # ensure the W&B callback is attached
+     train_dataset = new_dataset["train"],
+     eval_dataset = new_dataset["test"],
+ )
+ trainer = train_on_responses_only(trainer)
+ trainer.train()
+
+ model.push_to_hub_merged(
+     f'Luigi/{PROJECT_NAME}',
+     tokenizer,
+     save_method="merged_16bit",
+     safe_serialization=None
)


+ # 1. load merged model + tokenizer from your HF repo
+ tokenizer = AutoTokenizer.from_pretrained(f'Luigi/{PROJECT_NAME}')
+ model = AutoModelForCausalLM.from_pretrained(f'Luigi/{PROJECT_NAME}')

+ # 2. run text-generation
gen = pipeline(
+     "text-generation", model=model, tokenizer=tokenizer,
+     device_map="auto", # or device=0 for a single GPU
)
prompt = "請問台北今天的天氣如何?"
+ print(gen(prompt, max_new_tokens=MAX_LEN)[0]["generated_text"])
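
Note on the final smoke test: the new script feeds a raw prompt string to the pipeline, while the model was fine-tuned on conversations rendered through the tokenizer's chat template by fmt(). Below is a minimal inference sketch that renders the prompt through the same template first. It assumes the merged repo Luigi/SmolLM2-135M-Instruct-TaiwanChat has already been pushed by push_to_hub_merged and that its tokenizer carries the ChatML chat template; the sampling settings are only illustrative.

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

repo_id = "Luigi/SmolLM2-135M-Instruct-TaiwanChat"  # target of push_to_hub_merged in the script
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id)

# Render the user turn with the same chat template used during fine-tuning,
# so the prompt matches the ChatML formatting the trainer saw.
messages = [{"role": "user", "content": "請問台北今天的天氣如何?"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

gen = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto")
out = gen(prompt, max_new_tokens=256, do_sample=True, temperature=0.8)  # illustrative sampling settings
print(out[0]["generated_text"])

Rendering through apply_chat_template keeps the inference-time prompt format consistent with the ChatML text produced by fmt() during training.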