Luigi
/

SmolLM2-135M-Instruct-TaiwanChat

+#! /usr/bin/env python3
+import unsloth
+from transformers import AutoTokenizer
+from unsloth import FastLanguageModel
+from transformers import DataCollatorForLanguageModeling
+from transformers import TrainingArguments, Trainer
+from transformers import pipeline
+from datasets import load_dataset
+import torch
+import os
+import wandb
+from transformers.integrations import WandbCallback
+PROJECT_NAME='SmolLM2-135M-Instruct-TaiwanChat'
+BASE_MODEL_ID="HuggingFaceTB/SmolLM2-135M-Instruct"
+DATASET_ID="yentinglin/TaiwanChat"
+N_SAMPLES=-1
+MAX_LEN=256
+# Tell wandb which project to use, and that you want to log your model
+os.environ["WANDB_PROJECT"]    = PROJECT_NAME
+os.environ["WANDB_LOG_MODEL"]  = "end"
+# Detect GPU Type
+device_str='cpu'
+if torch.xpu.is_available():
+    device_str='xpu'
+elif torch.cuda.is_available():
+    device_str='cuda'
+print(f'Device is {device_str}')
+## Load with Unsloth’s optimized API
+# 1) Load quantized model
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name     = BASE_MODEL_ID,
+    max_seq_length = MAX_LEN,
+    dtype          = torch.float16,
+    load_in_4bit   = True,
+    full_finetuning= False,  # we will add LoRA adapters next
+)
+# 2) Prepare it for k‑bit training (sets up layer norms, disables caching, etc.)
+from peft import prepare_model_for_kbit_training
+model = prepare_model_for_kbit_training(model)  # :contentReference[oaicite:0]{index=0}
+# 3) Attach LoRA adapters on top of the quantized weights
+from peft import LoraConfig, get_peft_model, TaskType
+lora_config = LoraConfig(
+    r               = 8,                     # low‑rank dimension
+    lora_alpha      = 16,                    # scaling
+    target_modules  = ["q_proj", "v_proj"],  # apply to attention
+    bias            = "none",
+    task_type       = TaskType.CAUSAL_LM,
+    inference_mode  = False,
+)
+model = get_peft_model(model, lora_config)  # :contentReference[oaicite:1]{index=1}
+# Now `model` has ~1–2% trainable parameters (the LoRA adapters),
+# and Trainer will no longer throw the “purely quantized” error.
+# Prepare the TaiwanChat Dataset
+dataset = load_dataset(DATASET_ID, split=f"train[:{N_SAMPLES}]")
+# Preprocessing Function
+def preprocess_examples(examples):
+    # Each 'messages' entry is a list of {"role","content"} dicts
+    chats = examples["messages"]
+    # Render into a single string via ChatML template
+    text = tokenizer.apply_chat_template(chats, tokenize=False, add_generation_prompt=True)
+    # Tokenize with truncation
+    tokens = tokenizer(text, truncation=True, max_length=MAX_LEN)
+    return {"input_ids": tokens["input_ids"],
+            "attention_mask": tokens["attention_mask"]}
+# Tokenization & Data Collator
+tokenized_ds = dataset.map(
+    preprocess_examples,
+    batched=True,
+    remove_columns=dataset.column_names,
+)
+data_collator = DataCollatorForLanguageModeling(
+    tokenizer=tokenizer, mlm=False
+)
+training_args = TrainingArguments(
+    output_dir=PROJECT_NAME,
+    per_device_train_batch_size=1,
+    gradient_accumulation_steps    = 16,
+    learning_rate=5e-5,
+    num_train_epochs=3,
+    fp16=False if device_str == 'xpu' else True,
+    bf16=True if device_str == 'xpu' else False,
+    logging_steps=1000,
+    save_steps=5000,
+    # ─── W&B integration ───
+    logging_dir=f"{PROJECT_NAME}/logs",    # where to store TensorBoard/W&B logs
+    report_to=["wandb"],                   # enable W&B reporting
+    run_name=PROJECT_NAME,                 # name this run in your W&B project
+    push_to_hub=True,
+    gradient_checkpointing=True,
+)
+# Enable gradient checkpointing on the model
+model.gradient_checkpointing_enable()
+# Training with Trainer
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=tokenized_ds,
+    data_collator=data_collator,
+    callbacks=[WandbCallback],  # ensure the W&B callback is attached
+)
+trainer.train(resume_from_checkpoint=False)
+# Save Model & Tokenizer Locally
+trainer.save_model(PROJECT_NAME)
+trainer.push_to_hub(f'Luigi/{PROJECT_NAME}')
+tokenizer.save_pretrained(PROJECT_NAME)
+# 1) Load from local folder
+model_dir = PROJECT_NAME
+tokenizer = AutoTokenizer.from_pretrained(model_dir)
+model     = AutoModelForCausalLM.from_pretrained(model_dir)  # loads your fine‑tuned weights :contentReference[oaicite:2]{index=2}
+# Test Fine-tuned Model
+hf_device = 0 if device_str in ("cuda","xpu") else -1
+gen = pipeline(
+    "text-generation",
+    model=model,
+    tokenizer=tokenizer,
+    device=hf_device,                # or device=0 for GPU
+    max_new_tokens=512,        # customize as desired
+)
+prompt = "請問台北今天的天氣如何？"
+output = gen(prompt, do_sample=True, temperature=0.8)
+print(output[0]["generated_text"])