#! /usr/bin/env python3 from transformers import AutoTokenizer, AutoModelForCausalLM from transformers import DataCollatorForLanguageModeling from transformers import TrainingArguments, Trainer from transformers import pipeline from datasets import load_dataset import torch import os import wandb from transformers.integrations import WandbCallback PROJECT_NAME='SmolLM2-135M-Instruct-TaiwanChat' BASE_MODEL_ID="HuggingFaceTB/SmolLM2-135M-Instruct" DATASET_ID="yentinglin/TaiwanChat" N_SAMPLES=40000 MAX_LEN=512 # Tell wandb which project to use, and that you want to log your model os.environ["WANDB_PROJECT"] = PROJECT_NAME os.environ["WANDB_LOG_MODEL"] = "end" # Detect GPU Type device_str='cpu' if torch.xpu.is_available(): device_str='xpu' elif torch.cuda.is_available(): device_str='cuda' print(f'Device is {device_str}') # Load Model & Tokenizer model_name = BASE_MODEL_ID tokenizer = AutoTokenizer.from_pretrained(model_name) model = AutoModelForCausalLM.from_pretrained(model_name) model.to(device_str) # Prepare the TaiwanChat Dataset dataset = load_dataset(DATASET_ID, split=f"train[:{N_SAMPLES}]") # Preprocessing Function def preprocess_examples(examples): # Each 'messages' entry is a list of {"role","content"} dicts chats = examples["messages"] # Render into a single string via ChatML template text = tokenizer.apply_chat_template(chats, tokenize=False, add_generation_prompt=True) # Tokenize with truncation tokens = tokenizer(text, truncation=True, max_length=MAX_LEN) return {"input_ids": tokens["input_ids"], "attention_mask": tokens["attention_mask"]} # Tokenization & Data Collator tokenized_ds = dataset.map( preprocess_examples, batched=True, remove_columns=dataset.column_names, ) data_collator = DataCollatorForLanguageModeling( tokenizer=tokenizer, mlm=False ) training_args = TrainingArguments( output_dir=PROJECT_NAME, per_device_train_batch_size=4, learning_rate=5e-5, num_train_epochs=3, fp16=False if device_str == 'xpu' else True, bf16=True if device_str == 'xpu' else False, logging_steps=1000, save_steps=5000, # ─── W&B integration ─── logging_dir=f"{PROJECT_NAME}/logs", # where to store TensorBoard/W&B logs report_to=["wandb"], # enable W&B reporting run_name=PROJECT_NAME, # name this run in your W&B project push_to_hub=True, gradient_checkpointing=True, ) # Training with Trainer trainer = Trainer( model=model, args=training_args, train_dataset=tokenized_ds, data_collator=data_collator, callbacks=[WandbCallback], # ensure the W&B callback is attached ) trainer.train(resume_from_checkpoint=False) # Save Model & Tokenizer Locally trainer.save_model(PROJECT_NAME) trainer.push_to_hub(f'Luigi/{PROJECT_NAME}') tokenizer.save_pretrained(PROJECT_NAME) # 1) Load from local folder model_dir = PROJECT_NAME tokenizer = AutoTokenizer.from_pretrained(model_dir) model = AutoModelForCausalLM.from_pretrained(model_dir) # loads your fine‑tuned weights :contentReference[oaicite:2]{index=2} # Test Fine-tuned Model hf_device = 0 if device_str in ("cuda","xpu") else -1 gen = pipeline( "text-generation", model=model, tokenizer=tokenizer, device=hf_device, # or device=0 for GPU max_new_tokens=512, # customize as desired ) prompt = "請問台北今天的天氣如何?" output = gen(prompt, do_sample=True, temperature=0.8) print(output[0]["generated_text"])