from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    default_data_collator,
    pipeline,
)
from datasets import load_dataset
import torch
import os
import math

PROJECT_NAME = 'SmolLM2-135M-Instruct-TaiwanChat'
BASE_MODEL_ID = "HuggingFaceTB/SmolLM2-135M-Instruct"
DATASET_ID = "yentinglin/TaiwanChat"
N_SAMPLES = 3000
MAX_LEN = 512
VAL_FRACTION = 0.1
PER_DEVICE_TRAIN_BATCH_SIZE = 8
NUM_TRAIN_EPOCHS = 3

# Log runs to a dedicated W&B project; WANDB_LOG_MODEL="end" uploads the
# final model as a W&B artifact once training finishes.
os.environ["WANDB_PROJECT"] = f'{PROJECT_NAME}_LOCAL'
os.environ["WANDB_LOG_MODEL"] = "end"

# Prefer Intel XPU, then CUDA, falling back to CPU. The hasattr guard keeps
# this working on torch builds that predate the torch.xpu namespace.
device_str = 'cpu'
if hasattr(torch, "xpu") and torch.xpu.is_available():
    device_str = 'xpu'
elif torch.cuda.is_available():
    device_str = 'cuda'
print(f'Device is {device_str}')

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # defensive: padding needs a pad token

model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, low_cpu_mem_usage=True)
# bf16 weights work with bf16 mixed precision on XPU; elsewhere keep fp32
# weights and let the Trainer's fp16 autocast handle half precision
# (pure-fp16 weights break gradient unscaling inside the Trainer).
model.to(device_str, dtype=torch.bfloat16 if device_str == 'xpu' else torch.float32)
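# Quick sanity check: the 135M checkpoint should report roughly 135M parameters.
n_params = sum(p.numel() for p in model.parameters())
print(f"Model parameters: {n_params / 1e6:.0f}M")
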
# Stream the dataset so the full corpus is never downloaded or materialized.
raw_stream = load_dataset(
    DATASET_ID,
    split="train",
    streaming=True,
)
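# Optional: peek at one raw record to confirm the expected schema — rows
# should carry a "messages" list of {"role": ..., "content": ...} dicts.
print(next(iter(raw_stream)))
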
# Approximate shuffle over a small buffer, then cap the stream at N_SAMPLES.
shuffled = raw_stream.shuffle(buffer_size=100, seed=42)
limited = shuffled.take(N_SAMPLES)

# Deterministic train/validation split of the capped stream. skip() re-reads
# and discards the first n_train examples, so the two streams never overlap.
n_val = int(N_SAMPLES * VAL_FRACTION)
n_train = N_SAMPLES - n_val

train_stream = limited.take(n_train)
val_stream = limited.skip(n_train).take(n_val)

def preprocess_examples(examples):
    chats = examples["messages"]

    # Render each conversation with the model's chat template. No generation
    # prompt is appended: the assistant replies are already in the data.
    texts = tokenizer.apply_chat_template(
        chats, tokenize=False, add_generation_prompt=False
    )

    toks = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN,
    )

    # "<|im_start|>assistant" is not a single vocabulary item, so search for
    # the token sequence of the assistant header rather than one special id.
    header = tokenizer.encode("<|im_start|>assistant\n", add_special_tokens=False)

    # With batched=True every field is a list of sequences, so the label mask
    # must be built per example, not once for the whole batch.
    all_labels = []
    for input_ids, attention_mask in zip(toks["input_ids"], toks["attention_mask"]):
        # Locate the first assistant turn; if it was truncated away, fall back
        # to supervising the whole sequence.
        start_of_reply = 0
        for i in range(len(input_ids) - len(header) + 1):
            if input_ids[i:i + len(header)] == header:
                start_of_reply = i + len(header)
                break

        # Supervise only the assistant's reply; mask the prompt and padding.
        labels = [
            tok if mask == 1 and j >= start_of_reply else -100
            for j, (tok, mask) in enumerate(zip(input_ids, attention_mask))
        ]
        all_labels.append(labels)

    return {
        "input_ids": toks["input_ids"],
        "attention_mask": toks["attention_mask"],
        "labels": all_labels,
    }
tokenized_train = train_stream.map(
    preprocess_examples,
    batched=True,
    batch_size=32,
    remove_columns=["messages"],
)

tokenized_val = val_stream.map(
    preprocess_examples,
    batched=True,
    batch_size=32,
    remove_columns=["messages"],
)
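# Quick sanity check on the masking: pull one preprocessed example and count
# the supervised positions (labels != -100). Consuming one item here does not
# affect training, which builds its own iterator.
sample = next(iter(tokenized_train))
n_supervised = sum(1 for t in sample["labels"] if t != -100)
print(f"{n_supervised}/{len(sample['labels'])} positions supervised")
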
# DataCollatorForLanguageModeling(mlm=False) would rebuild labels from
# input_ids and silently discard the prompt mask built above, so use the
# default collator: features are already padded to MAX_LEN with labels attached.
data_collator = default_data_collator

# The streams are IterableDatasets with no len(), so the Trainer needs an
# explicit step budget. Only the training split (n_train examples) counts.
steps_per_epoch = math.ceil(n_train / PER_DEVICE_TRAIN_BATCH_SIZE)
total_steps = steps_per_epoch * NUM_TRAIN_EPOCHS
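# With the defaults above this works out to n_train = 2700 examples,
# ceil(2700 / 8) = 338 steps per epoch, and 338 * 3 = 1014 total steps.
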
training_args = TrainingArguments(
    output_dir=PROJECT_NAME,
    max_steps=total_steps,  # required with streaming datasets; overrides epochs
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    learning_rate=5e-5,
    fp16=(device_str == 'cuda'),  # fp16 autocast needs a GPU; never on CPU
    bf16=(device_str == 'xpu'),
    eval_strategy="steps",
    eval_steps=250,
    logging_steps=100,  # the whole run is ~1k steps, so log and save within it
    save_steps=250,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    logging_dir=f"{PROJECT_NAME}/logs",
    report_to=["wandb"],
    run_name=f'{PROJECT_NAME}_LOCAL',
    push_to_hub=True,
    hub_model_id=f'Luigi/{PROJECT_NAME}',
    gradient_checkpointing=True,
)
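# Note: with push_to_hub=True the Trainer creates (or reuses) the
# hub_model_id repo, so make sure you are authenticated first
# (e.g. `huggingface-cli login`).
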
# gradient_checkpointing=True above already enables checkpointing inside the
# Trainer; disable the KV cache to avoid the incompatibility warning.
model.config.use_cache = False

# EvalPrediction carries only logits and labels, never the loss, so a
# compute_metrics hook cannot read eval_loss. Perplexity is instead derived
# from trainer.evaluate() after training (see below).

# report_to=["wandb"] already registers the W&B callback, so it is not passed
# again here; a duplicate callback would only trigger a warning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    processing_class=tokenizer,  # lets push_to_hub upload the tokenizer too
)
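# Optional smoke test: pull one collated batch and confirm each tensor is
# (PER_DEVICE_TRAIN_BATCH_SIZE, MAX_LEN) before committing to the full run.
batch = next(iter(trainer.get_train_dataloader()))
print({k: tuple(v.shape) for k, v in batch.items()})
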
trainer.train(resume_from_checkpoint=False)

# Perplexity is exp(cross-entropy loss) on the validation split.
eval_metrics = trainer.evaluate()
print(f"Validation perplexity: {math.exp(eval_metrics['eval_loss']):.2f}")

trainer.save_model(PROJECT_NAME)
tokenizer.save_pretrained(PROJECT_NAME)
# The hub repo comes from hub_model_id above; push_to_hub's first positional
# argument is a commit message, not a repo id.
trainer.push_to_hub()
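# The checkpoint can later be reloaded for inference from the local directory
# (or from the hub id) with the standard Auto classes:
#   tok = AutoTokenizer.from_pretrained(PROJECT_NAME)
#   mdl = AutoModelForCausalLM.from_pretrained(PROJECT_NAME)
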
# pipeline() accepts a device string, which also covers 'xpu' on recent
# transformers; an integer index would be interpreted as a CUDA ordinal.
gen = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device_str,
    max_new_tokens=512,
)
# Chat-format the test prompt ("What's the weather like in Taipei today?")
# so generation matches the template used during fine-tuning; recent
# text-generation pipelines accept a list of messages directly.
messages = [{"role": "user", "content": "請問台北今天的天氣如何?"}]
output = gen(messages, do_sample=True, temperature=0.8)
print(output[0]["generated_text"][-1]["content"])