#!/usr/bin/env python3
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import default_data_collator
from transformers import TrainingArguments, Trainer
from transformers import pipeline
from datasets import load_dataset
import torch
import os
import math

PROJECT_NAME = 'SmolLM2-135M-Instruct-TaiwanChat'
BASE_MODEL_ID = "HuggingFaceTB/SmolLM2-135M-Instruct"
DATASET_ID = "yentinglin/TaiwanChat"
N_SAMPLES = 3000
MAX_LEN = 512
VAL_FRACTION = 0.1
PER_DEVICE_TRAIN_BATCH_SIZE = 8
NUM_TRAIN_EPOCHS = 3

# Tell wandb which project to use, and that you want to log your model
os.environ["WANDB_PROJECT"] = f'{PROJECT_NAME}_LOCAL'
os.environ["WANDB_LOG_MODEL"] = "end"

# Detect accelerator type (Intel XPU, NVIDIA CUDA, or CPU fallback)
device_str = 'cpu'
if hasattr(torch, "xpu") and torch.xpu.is_available():
    device_str = 'xpu'
elif torch.cuda.is_available():
    device_str = 'cuda'
print(f'Device is {device_str}')

# Load Model & Tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, low_cpu_mem_usage=True)
# Keep the weights in fp32: mixed precision (bf16 on XPU, fp16 on CUDA) is
# requested via TrainingArguments below, and fp16 master weights would break
# the Trainer's gradient scaler.
model.to(device_str)

# Prepare the TaiwanChat Dataset
# Load and split into train/validation
# 1) Load the raw train split as a stream
raw_stream = load_dataset(
    DATASET_ID,
    split="train",   # no slicing here
    streaming=True,
)

# 2) (Optional) Shuffle the stream with a buffer
shuffled = raw_stream.shuffle(buffer_size=100, seed=42)

# 3) Take exactly N_SAMPLES examples
limited = shuffled.take(N_SAMPLES)

# 4) Split into train / validation
n_val = int(N_SAMPLES * VAL_FRACTION)
n_train = N_SAMPLES - n_val
train_stream = limited.take(n_train)
val_stream = limited.skip(n_train).take(n_val)

# Preprocessing function (receives a batch of examples, since map() is called
# with batched=True below)
def preprocess_examples(examples):
    chats = examples["messages"]  # batch of conversations (lists of messages)

    # 1) Render each conversation as ChatML text. No generation prompt here:
    #    the assistant replies we train on are already in the conversation.
    texts = [
        tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=False)
        for chat in chats
    ]

    # 2) Tokenize, pad/truncate to MAX_LEN
    toks = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN,
    )

    # 3) Find where the first assistant reply starts: right after the
    #    "<|im_start|>assistant\n" header. "<|im_start|>" is a single special
    #    token, while "assistant" may span several tokens, so match it as a
    #    subsequence.
    im_start_id = tokenizer.convert_tokens_to_ids("<|im_start|>")
    assistant_ids = tokenizer.encode("assistant", add_special_tokens=False)

    all_labels = []
    for input_ids, attention_mask in zip(toks["input_ids"], toks["attention_mask"]):
        start_of_reply = 0  # fallback: train on the whole sequence
        for i, tok in enumerate(input_ids):
            if tok == im_start_id and input_ids[i + 1:i + 1 + len(assistant_ids)] == assistant_ids:
                # the extra +1 is a heuristic skip over the newline after the header
                start_of_reply = i + 1 + len(assistant_ids) + 1
                break

        # 4) Build labels: -100 before the reply, copy the token ids after it,
        #    and mask padding positions so they do not contribute to the loss
        labels = [-100] * start_of_reply + input_ids[start_of_reply:]
        labels = [lab if mask == 1 else -100 for lab, mask in zip(labels, attention_mask)]
        all_labels.append(labels)

    return {
        "input_ids": toks["input_ids"],
        "attention_mask": toks["attention_mask"],
        "labels": all_labels,
    }

# 5) Tokenize on the fly with a small batch
tokenized_train = train_stream.map(
    preprocess_examples,
    batched=True,
    batch_size=32,               # controls RAM for each map() call
    remove_columns=["messages"]  # or whatever your raw column names are
)
tokenized_val = val_stream.map(
    preprocess_examples,
    batched=True,
    batch_size=32,
    remove_columns=["messages"]
)

# Every example is already padded to MAX_LEN, so the default collator just
# stacks the features into tensors. DataCollatorForLanguageModeling(mlm=False)
# would rebuild labels from input_ids and discard the assistant-only mask.
data_collator = default_data_collator

# Compute the total number of optimizer steps from the constants above (the
# Trainer cannot infer an epoch length from a streaming dataset):
steps_per_epoch = math.ceil(n_train / PER_DEVICE_TRAIN_BATCH_SIZE)
total_steps = steps_per_epoch * NUM_TRAIN_EPOCHS

# Define training arguments with evaluation
training_args = TrainingArguments(
    output_dir=PROJECT_NAME,
    max_steps=total_steps,  # required when training from a streaming dataset
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    learning_rate=5e-5,
    num_train_epochs=NUM_TRAIN_EPOCHS,  # informational; max_steps takes precedence
    fp16=(device_str == 'cuda'),
    bf16=(device_str == 'xpu'),
    eval_strategy="steps",   # called evaluation_strategy on older transformers releases
    eval_steps=1000,
    logging_steps=1000,
    save_steps=5000,
    greater_is_better=False,
    # W&B integration
    logging_dir=f"{PROJECT_NAME}/logs",
    report_to=["wandb"],
    run_name=f'{PROJECT_NAME}_LOCAL',
    # Hub integration: hub_model_id is the repository the Trainer pushes to
    push_to_hub=True,
    hub_model_id=f'Luigi/{PROJECT_NAME}',
    gradient_checkpointing=True,
)

# Enable gradient checkpointing on the model (redundant with the
# gradient_checkpointing=True flag above, but harmless)
model.gradient_checkpointing_enable()

# Trainer setup. report_to=["wandb"] already attaches the W&B callback, so it
# is not added again here. Perplexity is computed from trainer.evaluate()
# after training, because the EvalPrediction object passed to a
# compute_metrics callback does not carry the evaluation loss.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
)

# Start training
trainer.train(resume_from_checkpoint=False)

# Report perplexity = exp(cross-entropy loss) on the validation split
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

# Save and push model and tokenizer
trainer.save_model(PROJECT_NAME)
tokenizer.save_pretrained(PROJECT_NAME)
# The target repo comes from hub_model_id above; push_to_hub()'s first
# positional argument is a commit message, not a repository name.
trainer.push_to_hub()

# Test the fine-tuned model
gen = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=model.device,  # reuse whatever device the model already sits on
    max_new_tokens=512,
)
prompt = "請問台北今天的天氣如何?"  # "How is the weather in Taipei today?"
output = gen(prompt, do_sample=True, temperature=0.8)
print(output[0]["generated_text"])
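
# To reload the fine-tuned checkpoint later for inference, a minimal sketch
# (assumes the local output directory written by trainer.save_model() above):
#
#   from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
#   tok = AutoTokenizer.from_pretrained("SmolLM2-135M-Instruct-TaiwanChat")
#   mdl = AutoModelForCausalLM.from_pretrained("SmolLM2-135M-Instruct-TaiwanChat")
#   chat = pipeline("text-generation", model=mdl, tokenizer=tok, max_new_tokens=256)
#   print(chat("請推薦台北的美食。")[0]["generated_text"])  # "Please recommend food in Taipei."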