#!/usr/bin/env python3
"""
Fine-tune "SmolLM2-135M-Instruct" on the TaiwanChat dataset using Unsloth's
4-bit quantization + LoRA adapters, evaluate on a 1% hold-out every step, and
push the merged model to Hugging Face.

Steps:
1. Load a 4-bit quantized base model via Unsloth's FastLanguageModel.
2. Attach LoRA adapters (r=16) and enable gradient checkpointing for memory savings.
3. Load TaiwanChat, render ChatML, and split 99/1 train/validation.
4. Configure SFTTrainer to mask user prompts (train_on_responses_only),
   run evaluation every step, and log to W&B.
5. Train for up to 60 steps.
6. Merge base + LoRA weights into 16-bit safetensors and push to Hugging Face
   with `push_to_hub_merged`.
"""

from unsloth import FastLanguageModel
from unsloth.chat_templates import train_on_responses_only

import os

import torch
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling
from transformers.integrations import WandbCallback
from trl import SFTTrainer, SFTConfig

PROJECT_NAME = "SmolLM2-135M-Instruct-TaiwanChat"
BASE_MODEL_ID = "unsloth/SmolLM2-135M-Instruct"
DATASET_ID = "yentinglin/TaiwanChat"
N_SAMPLES = 80000
MAX_LEN = 2048

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:128"

# Tell W&B which project to use and to log the model at the end of training.
os.environ["WANDB_PROJECT"] = f"{PROJECT_NAME}_CLOUD"
os.environ["WANDB_LOG_MODEL"] = "end"

## Load with Unsloth's optimized API
# 1) Load the 4-bit quantized base model.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = BASE_MODEL_ID,
    max_seq_length = MAX_LEN,
    load_in_4bit = True,
    full_finetuning = False,  # we will add LoRA adapters next
)

# 2) Attach LoRA adapters and prepare the model for k-bit training
#    (sets up layer norms, disables caching, etc.).
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,
    lora_dropout = 0,  # supports any value, but 0 is optimized
    bias = "none",     # supports any value, but "none" is optimized
    # "unsloth" uses ~30% less VRAM and fits ~2x larger batch sizes.
    use_gradient_checkpointing = "unsloth",  # True or "unsloth" for very long context
    random_state = 3407,
    max_seq_length = MAX_LEN,
    use_rslora = False,   # rank-stabilized LoRA is also supported
    loftq_config = None,  # as is LoftQ
)

# Prepare the TaiwanChat dataset
# 1) Load & split.
dataset = load_dataset(DATASET_ID, split=f"train[:{N_SAMPLES}]")

# Turn each list of messages into a single "text" string per example, using the
# tokenizer's ChatML template.
def fmt(examples):
    texts = [
        # add_generation_prompt=False: the assistant replies are already present
        # in each conversation, so no trailing generation header is needed for
        # training data.
        tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=False)
        for msgs in examples["messages"]
    ]
    return {"text": texts}

dataset = dataset.map(fmt, batched=True, remove_columns=["messages"])
new_dataset = dataset.train_test_split(test_size = 0.01)

training_args = SFTConfig(
    # Evaluation: score the 1% hold-out every step.
    fp16_full_eval = False,
    per_device_eval_batch_size = 1,
    eval_accumulation_steps = 4,
    eval_strategy = "steps",
    eval_steps = 1,
    # Data handling.
    dataset_text_field = "text",
    max_seq_length = MAX_LEN,
    # Optimization.
    output_dir = PROJECT_NAME,
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 4,
    warmup_steps = 10,
    max_steps = 60,
    logging_steps = 1,
    optim = "adamw_8bit",
    seed = 3407,
    gradient_checkpointing = True,
    # ─── W&B integration ───
    logging_dir = f"{PROJECT_NAME}/logs",  # where to store TensorBoard/W&B logs
    report_to = ["wandb"],                 # enable W&B reporting
    run_name = f"{PROJECT_NAME}_CLOUD",    # name of this run in the W&B project
    push_to_hub = False,
)

# Training with Trainer
trainer = SFTTrainer(
    model = model,
    args = training_args,
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    tokenizer = tokenizer,
    callbacks = [WandbCallback],  # ensure the W&B callback is attached
    train_dataset = new_dataset["train"],
    eval_dataset = new_dataset["test"],
)

torch.cuda.empty_cache()

# Mask everything except the assistant turns so the loss is computed only on
# model responses, not on user prompts.
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|im_start|>user\n",
    response_part = "<|im_start|>assistant\n",
)

trainer.train()

# Merge the LoRA weights into the base model, unload the adapters, and push the
# merged 16-bit safetensors checkpoint to the Hugging Face Hub.
model.push_to_hub_merged(
    f"Luigi/{PROJECT_NAME}",
    tokenizer,
    save_method = "merged_16bit",
    safe_serialization = None,
)

# Quick sanity check: "What is the weather like in Taipei today?"
prompt = "請問台北今天的天氣如何?"

# Tokenize and move to the model's device.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Call HF generate with explicit sampling parameters.
outputs = model.generate(
    **inputs,
    max_new_tokens = 100,
    do_sample = True,
    temperature = 0.8,
    pad_token_id = tokenizer.eos_token_id,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
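
# ── Optional: verify the pushed checkpoint ──────────────────────────────────
# A minimal sketch (not part of the training run above) that reloads the merged
# model straight from the Hub with plain transformers and generates from a
# ChatML-formatted prompt, as the model saw during training. It assumes the
# `push_to_hub_merged` call above succeeded and that the `Luigi/{PROJECT_NAME}`
# repo is reachable from this machine; the helper name below is illustrative.

def _sanity_check_merged_checkpoint(repo_id: str = f"Luigi/{PROJECT_NAME}") -> None:
    from transformers import AutoModelForCausalLM, AutoTokenizer

    hub_tokenizer = AutoTokenizer.from_pretrained(repo_id)
    hub_model = AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype=torch.float16)
    hub_model.to("cuda" if torch.cuda.is_available() else "cpu")

    # Render the prompt with the chat template instead of feeding raw text.
    messages = [{"role": "user", "content": "請問台北今天的天氣如何?"}]
    input_ids = hub_tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,  # append the assistant header so the model replies
        return_tensors="pt",
    ).to(hub_model.device)

    generated = hub_model.generate(
        input_ids,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.8,
        pad_token_id=hub_tokenizer.eos_token_id,
    )
    print(hub_tokenizer.decode(generated[0], skip_special_tokens=True))

# Uncomment to run the check once the push has completed:
# _sanity_check_merged_checkpoint()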