#! /usr/bin/env python3
"""
Fine-tune "SmolLM2-135M-Instruct" on the TaiwanChat dataset using Unsloth's 4-bit quantization
+ LoRA adapters, with a 1% hold-out split for evaluation, and push the merged model to Hugging Face.
Steps:
1. Load a 4-bit quantized base model via Unsloth’s FastLanguageModel.
2. Attach LoRA adapters (r=16) and enable gradient checkpointing for memory savings.
3. Load TaiwanChat, render ChatML, and split 99/1 train/validation.
4. Configure SFTTrainer to mask user prompts (train_on_responses_only) and log metrics to W&B.
5. Train for up to 60 steps.
6. Merge base+LoRA weights into 16-bit safetensors and push to Hugging Face with `push_to_hub_merged`.
"""
from unsloth import FastLanguageModel
from trl import SFTTrainer, SFTConfig
from transformers import DataCollatorForLanguageModeling
from unsloth.chat_templates import train_on_responses_only
from datasets import load_dataset
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
PROJECT_NAME = "SmolLM2-135M-Instruct-TaiwanChat"
BASE_MODEL_ID = "unsloth/SmolLM2-135M-Instruct"
DATASET_ID = "yentinglin/TaiwanChat"
N_SAMPLES = 80000
MAX_LEN = 2048
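# Allocator tuning to reduce CUDA memory fragmentation: expandable_segments lets the caching
# allocator grow segments on demand, and max_split_size_mb=128 stops it from splitting blocks
# larger than 128 MB, which helps avoid fragmentation-driven OOMs.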
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:128"
# Tell wandb which project to use, and that you want to log your model
os.environ["WANDB_PROJECT"] = f"{PROJECT_NAME}_CLOUD"
os.environ["WANDB_LOG_MODEL"] = "end"
## Load with Unsloth’s optimized API
# 1) Load quantized model
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = BASE_MODEL_ID,
max_seq_length = MAX_LEN,
load_in_4bit = True,
full_finetuning= False, # we will add LoRA adapters next
)
# 2) Attach LoRA adapters and prepare for k-bit training (layer norms set up, caching disabled, etc.)
model = FastLanguageModel.get_peft_model(
model,
r = 16,
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj",],
lora_alpha = 16,
lora_dropout = 0, # Supports any, but = 0 is optimized
bias = "none", # Supports any, but = "none" is optimized
# [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
random_state = 3407,
max_seq_length = MAX_LEN,
use_rslora = False, # We support rank stabilized LoRA
loftq_config = None, # And LoftQ
)
# Prepare the TaiwanChat Dataset
# 1) Load & split
dataset = load_dataset(DATASET_ID, split=f"train[:{N_SAMPLES}]")
# Turn each list of messages into a single "text" string per example, using the tokenizer's ChatML template
def fmt(examples):
    texts = [
        # add_generation_prompt=False: the assistant replies are already present in `messages`,
        # so no trailing empty assistant header should be appended to the training text
        tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=False)
        for msgs in examples["messages"]
    ]
    return {"text": texts}
dataset = dataset.map(fmt, batched=True, remove_columns=["messages"])
new_dataset = dataset.train_test_split(test_size = 0.01)
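# Effective train batch size = per_device_train_batch_size (1) x gradient_accumulation_steps (4) = 4,
# kept small to limit peak GPU memory.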
training_args = SFTConfig(
    fp16_full_eval = False,
    per_device_eval_batch_size = 1,
    eval_accumulation_steps = 4,   # offload accumulated eval predictions to CPU every 4 steps
    eval_strategy = "epoch",
    eval_steps = 1,                # only used if eval_strategy is switched to "steps"
dataset_text_field="text",
output_dir=PROJECT_NAME,
max_seq_length = MAX_LEN,
per_device_train_batch_size = 1,
gradient_accumulation_steps = 4,
warmup_steps = 10,
max_steps = 60,
logging_steps = 1,
optim = "adamw_8bit",
seed = 3407,
# ─── W&B integration ───
logging_dir=f"{PROJECT_NAME}/logs", # where to store TensorBoard/W&B logs
report_to=["wandb"], # enable W&B reporting
run_name=f"{PROJECT_NAME}_CLOUD", # name this run in your W&B project
push_to_hub=True,
gradient_checkpointing=True
)
# Build the SFTTrainer
trainer = SFTTrainer(
model=model,
args=training_args,
data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer, mlm=False),
tokenizer=tokenizer,
    # W&B logging is attached automatically via report_to=["wandb"]; adding WandbCallback again would duplicate it
train_dataset = new_dataset["train"],
eval_dataset = new_dataset["test"],
)
torch.cuda.empty_cache()
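# Mask user turns so the loss is computed only on assistant responses; the marker strings below
# must match the ChatML tags produced by the chat template in fmt().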
trainer = train_on_responses_only(
trainer,
instruction_part = "<|im_start|>user\n",
response_part = "<|im_start|>assistant\n",
)
trainer.train()
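# Merge the LoRA adapters into the base weights and upload the result as 16-bit safetensors to the Hub.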
model.push_to_hub_merged(
f'Luigi/{PROJECT_NAME}',
tokenizer,
save_method="merged_16bit",
safe_serialization=None
)
# 1. load merged model + tokenizer from your HF repo
tokenizer = AutoTokenizer.from_pretrained(f'Luigi/{PROJECT_NAME}')
model = AutoModelForCausalLM.from_pretrained(f'Luigi/{PROJECT_NAME}')
# 2. run text-generation
gen = pipeline(
"text-generation", model=model, tokenizer=tokenizer,
device_map="auto", # or device=0 for a single GPU
)
prompt = "請問台北今天的天氣如何?"
print(gen(prompt, max_new_tokens=MAX_LEN)[0]["generated_text"])
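# Optional sanity check (a minimal sketch, not part of the original run): because the model was
# fine-tuned on ChatML-rendered conversations, wrapping the prompt with the chat template usually
# matches the training format better than a raw string. max_new_tokens=256 is an arbitrary choice
# for a quick check.
chat_prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": prompt}],
    tokenize=False,
    add_generation_prompt=True,  # append the assistant header so the model starts its reply
)
print(gen(chat_prompt, max_new_tokens=256)[0]["generated_text"])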