#! /usr/bin/env python3
"""
Fine-tune “SmolLM2-135M-Instruct” on the TaiwanChat dataset using Unsloth’s 4-bit quantization
+ LoRA adapters, with evaluation on a 1% hold-out every step, and push the merged model to Hugging Face.

Steps:
1. Load a 4-bit quantized base model via Unsloth’s FastLanguageModel.
2. Attach LoRA adapters (r=16) and enable gradient checkpointing for memory savings.
3. Load TaiwanChat, render ChatML, and split 99/1 train/validation.
4. Configure SFTTrainer to mask user prompts (train_on_responses_only), run eval every step, log to W&B.
5. Train for up to 60 steps.
6. Merge base+LoRA weights into 16-bit safetensors and push to Hugging Face with `push_to_hub_merged`.
"""

from unsloth import FastLanguageModel
from trl import SFTTrainer, SFTConfig
from transformers import DataCollatorForLanguageModeling
from unsloth.chat_templates import train_on_responses_only
from datasets import load_dataset
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch

PROJECT_NAME='SmolLM2-135M-Instruct-TaiwanChat'
BASE_MODEL_ID="unsloth/SmolLM2-135M-Instruct"
DATASET_ID="yentinglin/TaiwanChat"
N_SAMPLES=80000
MAX_LEN=2048

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:128"

# Tell wandb which project to use, and that you want to log your model
os.environ["WANDB_PROJECT"]    = f"{PROJECT_NAME}_CLOUD"
os.environ["WANDB_LOG_MODEL"]  = "end"

## Load with Unsloth’s optimized API
# 1) Load quantized model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name     = BASE_MODEL_ID,
    max_seq_length = MAX_LEN,
    load_in_4bit   = True,
    full_finetuning= False,  # we will add LoRA adapters next
)
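# Note: load_in_4bit relies on bitsandbytes 4-bit quantization, so this script assumes a CUDA GPU.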

# 2) Attach LoRA adapters (r=16) and enable Unsloth's gradient checkpointing for memory savings
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    max_seq_length = MAX_LEN,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)
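# Optional sanity check, a minimal sketch: with LoRA attached, only a small fraction of the
# weights should be trainable. (Assumes the returned object exposes PEFT's
# print_trainable_parameters(); drop this line if your Unsloth version differs.)
model.print_trainable_parameters()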

# Prepare the TaiwanChat Dataset
# 1) Load & split
dataset = load_dataset(DATASET_ID, split=f"train[:{N_SAMPLES}]")

# Turn each list of messages into a single "text" string using the tokenizer's ChatML template
def fmt(examples):
    texts = [
        # Render the full conversation; no trailing assistant header is needed for training data
        tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=False)
        for msgs in examples["messages"]
    ]
    return {"text": texts}

dataset = dataset.map(fmt, batched=True, remove_columns=["messages"])
new_dataset = dataset.train_test_split(test_size = 0.01)
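# Each formatted example is now a ChatML transcript of the form
#   <|im_start|>user\n ... <|im_end|>\n<|im_start|>assistant\n ... <|im_end|>\n
# which is what train_on_responses_only keys on below. With N_SAMPLES = 80,000 and
# test_size = 0.01, the split is roughly 79,200 training / 800 validation examples.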

training_args = SFTConfig(
        fp16_full_eval = False,
        per_device_eval_batch_size = 1,
        eval_accumulation_steps = 4,
        eval_strategy = "steps",          # evaluate the 1% hold-out every `eval_steps` steps
        eval_steps = 1,
        dataset_text_field="text",
        output_dir=PROJECT_NAME,
        max_seq_length = MAX_LEN,
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 4,
        warmup_steps = 10,
        max_steps = 60,
        logging_steps = 1,
        optim = "adamw_8bit",
        seed = 3407,
        # ─── W&B integration ───
        logging_dir=f"{PROJECT_NAME}/logs",    # where to store TensorBoard/W&B logs
        report_to=["wandb"],                   # enable W&B reporting
        run_name=f"{PROJECT_NAME}_CLOUD",                 # name this run in your W&B project
        push_to_hub=True,
        gradient_checkpointing=True
    )
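
# With per_device_train_batch_size * gradient_accumulation_steps = 4 sequences per optimizer
# step, max_steps = 60 touches roughly 240 training examples. Evaluating the ~800-example
# hold-out after every step (eval_steps = 1) is intentionally aggressive; raise eval_steps
# if evaluation dominates wall-clock time.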

# Training with Trainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer, mlm=False),
    tokenizer=tokenizer,
    # report_to=["wandb"] in SFTConfig already attaches the W&B callback
    train_dataset = new_dataset["train"],
    eval_dataset = new_dataset["test"],
)
torch.cuda.empty_cache()
trainer = train_on_responses_only(
  trainer,
  instruction_part = "<|im_start|>user\n",
  response_part    = "<|im_start|>assistant\n",
)
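# Optional sanity check, a sketch (assumes the processed train split exposes "input_ids"):
# after masking, label positions covering the user prompt should be -100, i.e. excluded
# from the loss, e.g.:
#   batch = trainer.data_collator([trainer.train_dataset[0]])
#   print((batch["labels"] == -100).float().mean())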
trainer.train()

model.push_to_hub_merged(
    f'Luigi/{PROJECT_NAME}', 
    tokenizer, 
    save_method="merged_16bit", 
    safe_serialization=None
)
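# Pushing assumes you are authenticated with the Hub (e.g. `huggingface-cli login` or the
# HF_TOKEN environment variable) and have write access to the target namespace.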

# 1. Load merged model + tokenizer from your HF repo
tokenizer = AutoTokenizer.from_pretrained(f'Luigi/{PROJECT_NAME}')
model     = AutoModelForCausalLM.from_pretrained(
    f'Luigi/{PROJECT_NAME}',
    device_map="auto",   # place the model on GPU automatically (or pass device=0 to the pipeline)
)

# 2. Run text-generation
gen = pipeline("text-generation", model=model, tokenizer=tokenizer)
prompt = "請問台北今天的天氣如何?"
print(gen(prompt, max_new_tokens=MAX_LEN)[0]["generated_text"])
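
# The model was fine-tuned on ChatML-formatted conversations, so wrapping the prompt in the
# chat template generally matches the training distribution better than raw text does.
# A minimal sketch using the same pipeline as above (the 256-token cap is an arbitrary choice):
chat_prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": prompt}],
    tokenize=False,
    add_generation_prompt=True,  # append the assistant header so generation starts there
)
print(gen(chat_prompt, max_new_tokens=256)[0]["generated_text"])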