|
|
|
|
|
""" |
|
|
Fine-tune “SmolLM2-135M-Instruct” on the TaiwanChat dataset using Unsloth’s 4-bit quantization |
|
|
+ LoRA adapters, with evaluation on a 1% hold-out every step, and push the merged model to Hugging Face. |
|
|
|
|
|
Steps: |
|
|
1. Load a 4-bit quantized base model via Unsloth’s FastLanguageModel. |
|
|
2. Attach LoRA adapters (r=16) and enable gradient checkpointing for memory savings. |
|
|
3. Load TaiwanChat, render ChatML, and split 99/1 train/validation. |
|
|
4. Configure SFTTrainer to mask user prompts (train_on_responses_only), run eval every step, log to W&B. |
|
|
5. Train for up to 60 steps. |
|
|
6. Merge base+LoRA weights into 16-bit safetensors and push to Hugging Face with `push_to_hub_merged`. |
|
|
""" |
|
|
|
|
|
from unsloth import FastLanguageModel
from unsloth.chat_templates import train_on_responses_only
from trl import SFTTrainer, SFTConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling, pipeline
from transformers.integrations import WandbCallback
from datasets import load_dataset
import os
import torch
|
|
|
|
|
PROJECT_NAME = "SmolLM2-135M-Instruct-TaiwanChat"
BASE_MODEL_ID = "unsloth/SmolLM2-135M-Instruct"
DATASET_ID = "yentinglin/TaiwanChat"
N_SAMPLES = 80000
MAX_LEN = 2048
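
# Reduce CUDA memory fragmentation so the quantized model, activations, and optimizer state
# fit on a small GPU.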
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:128" |
os.environ["WANDB_PROJECT"] = f"{PROJECT_NAME}_CLOUD"
os.environ["WANDB_LOG_MODEL"] = "end"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = BASE_MODEL_ID,
    max_seq_length = MAX_LEN,
    load_in_4bit = True,
    full_finetuning = False,
)
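
# Step 2: attach LoRA adapters (r=16) to the attention and MLP projections and enable
# Unsloth gradient checkpointing for memory savings.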
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    max_seq_length = MAX_LEN,
    use_rslora = False,
    loftq_config = None,
)
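
# Step 3: load the first N_SAMPLES conversations of TaiwanChat.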
dataset = load_dataset(DATASET_ID, split=f"train[:{N_SAMPLES}]") |
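
# Render each conversation as ChatML text with the tokenizer's chat template.
# add_generation_prompt is False because the assistant replies are already in the data.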
def fmt(examples):
    texts = [
        tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=False)
        for msgs in examples["messages"]
    ]
    return {"text": texts}
dataset = dataset.map(fmt, batched=True, remove_columns=["messages"])
new_dataset = dataset.train_test_split(test_size = 0.01)
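
# Step 4: training configuration. The 1% hold-out is evaluated every optimizer step
# (eval_strategy="steps", eval_steps=1) and metrics are logged to W&B every step.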
training_args = SFTConfig(
    fp16_full_eval = False,
    per_device_eval_batch_size = 1,
    eval_accumulation_steps = 4,
    eval_strategy = "steps",  # evaluate every eval_steps steps, as described in the docstring
    eval_steps = 1,
    dataset_text_field = "text",
    output_dir = PROJECT_NAME,
    max_seq_length = MAX_LEN,
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 4,
    warmup_steps = 10,
    max_steps = 60,
    logging_steps = 1,
    optim = "adamw_8bit",
    seed = 3407,
    logging_dir = f"{PROJECT_NAME}/logs",
    report_to = ["wandb"],
    run_name = f"{PROJECT_NAME}_CLOUD",
    push_to_hub = True,
    gradient_checkpointing = True,
)
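
# Build the supervised fine-tuning trainer on the ChatML-formatted train/validation splits.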
trainer = SFTTrainer(
    model = model,
    args = training_args,
    data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer, mlm = False),
    tokenizer = tokenizer,
    callbacks = [WandbCallback],
    train_dataset = new_dataset["train"],
    eval_dataset = new_dataset["test"],
)
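
# Release any cached CUDA memory before training starts.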
torch.cuda.empty_cache() |
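
# Mask the user turns (ChatML markers) so the loss is computed only on assistant responses.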
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|im_start|>user\n",
    response_part = "<|im_start|>assistant\n",
)
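
# Step 5: train for up to max_steps (60) optimizer steps.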
trainer.train() |
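
# Step 6: merge the LoRA adapters into the base weights and push 16-bit safetensors to the Hub.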
model.push_to_hub_merged(
    f'Luigi/{PROJECT_NAME}',
    tokenizer,
    save_method = "merged_16bit",
    safe_serialization = True,  # write safetensors shards, as described in the docstring
)
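
# Reload the merged model from the Hub and run a quick generation smoke test.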
tokenizer = AutoTokenizer.from_pretrained(f'Luigi/{PROJECT_NAME}')
model = AutoModelForCausalLM.from_pretrained(f'Luigi/{PROJECT_NAME}')
|
gen = pipeline(
    "text-generation", model=model, tokenizer=tokenizer,
    device_map="auto",
)

# Format the test prompt with the same ChatML template used during training.
# Prompt: "How is the weather in Taipei today?"
messages = [{"role": "user", "content": "請問台北今天的天氣如何?"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(gen(prompt, max_new_tokens=MAX_LEN)[0]["generated_text"])