#! /usr/bin/env python3
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import default_data_collator
from transformers import TrainingArguments, Trainer, EvalPrediction
from transformers import pipeline
from datasets import load_dataset
import torch
import os
import math
from transformers.integrations import WandbCallback
PROJECT_NAME = 'SmolLM2-135M-Instruct-TaiwanChat'
BASE_MODEL_ID = "HuggingFaceTB/SmolLM2-135M-Instruct"
DATASET_ID = "yentinglin/TaiwanChat"
N_SAMPLES = 3000
MAX_LEN = 512
VAL_FRACTION = 0.1
PER_DEVICE_TRAIN_BATCH_SIZE=8
NUM_TRAIN_EPOCHS=3
# Tell wandb which project to use, and that you want to log your model
os.environ["WANDB_PROJECT"] = f'{PROJECT_NAME}_LOCAL'
os.environ["WANDB_LOG_MODEL"] = "end"
# Detect GPU Type
device_str = 'cpu'
if torch.xpu.is_available():
    device_str = 'xpu'
elif torch.cuda.is_available():
    device_str = 'cuda'
print(f'Device is {device_str}')
# Load Model & Tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, low_cpu_mem_usage=True)
# bf16 on XPU, fp16 on CUDA, full precision on the CPU fallback
model_dtype = {'xpu': torch.bfloat16, 'cuda': torch.float16}.get(device_str, torch.float32)
model.to(device_str, dtype=model_dtype)
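# padding="max_length" in the preprocessing below needs a pad token; fall back to
# the EOS token if the tokenizer does not define one (assumption: EOS is an
# acceptable padding token for this fine-tune).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token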
# Prepare the TaiwanChat Dataset
# Load and split into train/validation
# 1) Load the raw train split as a stream
raw_stream = load_dataset(
    DATASET_ID,
    split="train",   # no slicing here
    streaming=True,
)
# 2) (Optional) Shuffle the stream with a buffer
shuffled = raw_stream.shuffle(buffer_size=100, seed=42)
# 3) Take exactly N_SAMPLES examples
limited = shuffled.take(N_SAMPLES)
# 4) Split into train / validation
n_val = int(N_SAMPLES * VAL_FRACTION)
n_train = N_SAMPLES - n_val
train_stream = limited.take(n_train)
val_stream = limited.skip(n_train).take(n_val)
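# e.g. with N_SAMPLES=3000 and VAL_FRACTION=0.1 this gives n_val=300 and n_train=2700.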
# Preprocessing function
# Preprocessing function (map() is called with batched=True, so this receives
# a batch of examples at once)
def preprocess_examples(examples):
    all_input_ids, all_attention_mask, all_labels = [], [], []
    for chat in examples["messages"]:  # one chat (list of messages) per example
        # 1) Render the chat with the ChatML template
        text = tokenizer.apply_chat_template(
            chat, tokenize=False, add_generation_prompt=True
        )
        # 2) Tokenize, pad/truncate to MAX_LEN
        toks = tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=MAX_LEN,
        )
        input_ids = toks["input_ids"]
        attention_mask = toks["attention_mask"]
        # 3) Find where the assistant reply starts
        role_id = tokenizer.convert_tokens_to_ids("<|im_start|>assistant")
        if role_id in input_ids:
            idx = input_ids.index(role_id)
            start_of_reply = idx + 2
        else:
            # marker not found: fall back to computing the loss on the whole sequence
            start_of_reply = 0
        # 4) Build labels: -100 before the reply, then copy the rest
        labels = [-100] * start_of_reply + input_ids[start_of_reply:]
        # 5) Mask out padding positions so no loss is computed on them
        labels = [tok if mask == 1 else -100
                  for tok, mask in zip(labels, attention_mask)]
        all_input_ids.append(input_ids)
        all_attention_mask.append(attention_mask)
        all_labels.append(labels)
    return {"input_ids": all_input_ids,
            "attention_mask": all_attention_mask,
            "labels": all_labels}
# 5) Tokenize on the fly with a small batch
tokenized_train = train_stream.map(
    preprocess_examples,
    batched=True,
    batch_size=32,                 # controls RAM for each map() call
    remove_columns=["messages"],   # or whatever your raw column names are
)
tokenized_val = val_stream.map(
    preprocess_examples,
    batched=True,
    batch_size=32,
    remove_columns=["messages"],
)
# Examples are already padded to MAX_LEN and carry their own masked labels, so a
# plain collator that just stacks tensors is enough; DataCollatorForLanguageModeling
# with mlm=False would rebuild labels from input_ids and discard the -100 masking.
data_collator = default_data_collator
# 1) Compute steps_per_epoch from the training split size:
steps_per_epoch = math.ceil(n_train / PER_DEVICE_TRAIN_BATCH_SIZE)
total_steps = steps_per_epoch * NUM_TRAIN_EPOCHS
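# e.g. n_train=2700 with a per-device batch size of 8 gives 338 steps per epoch,
# i.e. 1014 optimizer steps over 3 epochs (single device, no gradient accumulation).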
# Define training arguments
training_args = TrainingArguments(
    max_steps=total_steps,
    output_dir=PROJECT_NAME,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    learning_rate=5e-5,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    fp16=(device_str == 'cuda'),
    bf16=(device_str == 'xpu'),
    logging_steps=1000,
    save_steps=5000,
    greater_is_better=False,
    # W&B integration
    logging_dir=f"{PROJECT_NAME}/logs",
    report_to=["wandb"],
    run_name=f'{PROJECT_NAME}_LOCAL',
    push_to_hub=True,
    hub_model_id=f'Luigi/{PROJECT_NAME}',
    gradient_checkpointing=True,
)
# Enable gradient checkpointing on the model
model.gradient_checkpointing_enable()
# Metrics function for perplexity
def compute_metrics(p: EvalPrediction):
    eval_loss = p.metrics.get("eval_loss")
    if eval_loss is None:
        raise ValueError("eval_loss not found. Ensure evaluation_strategy is set.")
    return {"perplexity": math.exp(eval_loss)}
# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    callbacks=[WandbCallback],
)
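# Note: tokenized_val is prepared above but not passed as eval_dataset, so no
# evaluation (and no perplexity metric) is computed during this run.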
# Start training
trainer.train(resume_from_checkpoint=False)
# Save and push model and tokenizer
trainer.save_model(PROJECT_NAME)
trainer.push_to_hub()  # pushes to the hub_model_id set in TrainingArguments
tokenizer.save_pretrained(PROJECT_NAME)
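# To reuse the fine-tuned checkpoint later, it can be reloaded from the local
# output directory (or from the Hub repo it was pushed to), e.g. (illustrative names):
#   reloaded_model = AutoModelForCausalLM.from_pretrained(PROJECT_NAME)
#   reloaded_tokenizer = AutoTokenizer.from_pretrained(PROJECT_NAME)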
# Test the fine-tuned model
hf_device = 0 if device_str in ("cuda","xpu") else -1
gen = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=hf_device,
    max_new_tokens=512,
)
prompt = "請問台北今天的天氣如何?"
output = gen(prompt, do_sample=True, temperature=0.8)
print(output[0]["generated_text"])