#! /usr/bin/env python3
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import default_data_collator
from transformers import TrainingArguments, Trainer, EvalPrediction
from transformers import pipeline
from datasets import load_dataset
import torch
import os
import math
from transformers.integrations import WandbCallback
PROJECT_NAME = 'SmolLM2-135M-Instruct-TaiwanChat'
BASE_MODEL_ID = "HuggingFaceTB/SmolLM2-135M-Instruct"
DATASET_ID = "yentinglin/TaiwanChat"
N_SAMPLES = 3000
MAX_LEN = 512
VAL_FRACTION = 0.1
PER_DEVICE_TRAIN_BATCH_SIZE=8
NUM_TRAIN_EPOCHS=3
# Tell wandb which project to use, and that you want to log your model
os.environ["WANDB_PROJECT"] = f'{PROJECT_NAME}_LOCAL'
os.environ["WANDB_LOG_MODEL"] = "end"
# Detect GPU Type
device_str = 'cpu'
if torch.xpu.is_available():
    device_str = 'xpu'
elif torch.cuda.is_available():
    device_str = 'cuda'
print(f'Device is {device_str}')
# Load Model & Tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, low_cpu_mem_usage=True)
# bf16 on XPU, fp16 on CUDA, full precision on the CPU fallback
model_dtype = {'xpu': torch.bfloat16, 'cuda': torch.float16}.get(device_str, torch.float32)
model.to(device_str, dtype=model_dtype)
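# padding="max_length" in the preprocessing below needs a pad token; fall back to
# the EOS token if the tokenizer does not define one (assumption: EOS is an
# acceptable padding token for this fine-tune).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token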
# Prepare the TaiwanChat Dataset
# Load and split into train/validation
# 1) Load the raw train split as a stream
raw_stream = load_dataset(
    DATASET_ID,
    split="train",   # no slicing here
    streaming=True,
)
# 2) (Optional) Shuffle the stream with a buffer
shuffled = raw_stream.shuffle(buffer_size=100, seed=42)
# 3) Take exactly N_SAMPLES examples
limited = shuffled.take(N_SAMPLES)
# 4) Split into train / validation
n_val = int(N_SAMPLES * VAL_FRACTION)
n_train = N_SAMPLES - n_val
train_stream = limited.take(n_train)
val_stream = limited.skip(n_train).take(n_val)
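# e.g. with N_SAMPLES=3000 and VAL_FRACTION=0.1 this gives n_val=300 and n_train=2700.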
# Preprocessing function
# Preprocessing function (map() is called with batched=True, so this receives
# a batch of examples at once)
def preprocess_examples(examples):
    all_input_ids, all_attention_mask, all_labels = [], [], []
    for chat in examples["messages"]:  # one chat (list of messages) per example
        # 1) Render the chat with the ChatML template
        text = tokenizer.apply_chat_template(
            chat, tokenize=False, add_generation_prompt=True
        )
        # 2) Tokenize, pad/truncate to MAX_LEN
        toks = tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=MAX_LEN,
        )
        input_ids = toks["input_ids"]
        attention_mask = toks["attention_mask"]
        # 3) Find where the assistant reply starts
        role_id = tokenizer.convert_tokens_to_ids("<|im_start|>assistant")
        if role_id in input_ids:
            idx = input_ids.index(role_id)
            start_of_reply = idx + 2
        else:
            # marker not found: fall back to computing the loss on the whole sequence
            start_of_reply = 0
        # 4) Build labels: -100 before the reply, then copy the rest
        labels = [-100] * start_of_reply + input_ids[start_of_reply:]
        # 5) Mask out padding positions so no loss is computed on them
        labels = [tok if mask == 1 else -100
                  for tok, mask in zip(labels, attention_mask)]
        all_input_ids.append(input_ids)
        all_attention_mask.append(attention_mask)
        all_labels.append(labels)
    return {"input_ids": all_input_ids,
            "attention_mask": all_attention_mask,
            "labels": all_labels}
# 5) Tokenize on the fly with a small batch
tokenized_train = train_stream.map(
    preprocess_examples,
    batched=True,
    batch_size=32,                 # controls RAM for each map() call
    remove_columns=["messages"],   # or whatever your raw column names are
)
tokenized_val = val_stream.map(
    preprocess_examples,
    batched=True,
    batch_size=32,
    remove_columns=["messages"],
)
# Examples are already padded to MAX_LEN and carry their own masked labels, so a
# plain collator that just stacks tensors is enough; DataCollatorForLanguageModeling
# with mlm=False would rebuild labels from input_ids and discard the -100 masking.
data_collator = default_data_collator
# 1) Compute steps_per_epoch from the training split size:
steps_per_epoch = math.ceil(n_train / PER_DEVICE_TRAIN_BATCH_SIZE)
total_steps = steps_per_epoch * NUM_TRAIN_EPOCHS
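# e.g. n_train=2700 with a per-device batch size of 8 gives 338 steps per epoch,
# i.e. 1014 optimizer steps over 3 epochs (single device, no gradient accumulation).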
# Define training arguments
training_args = TrainingArguments(
    max_steps=total_steps,
    output_dir=PROJECT_NAME,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    learning_rate=5e-5,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    fp16=(device_str == 'cuda'),
    bf16=(device_str == 'xpu'),
    logging_steps=1000,
    save_steps=5000,
    greater_is_better=False,
    # W&B integration
    logging_dir=f"{PROJECT_NAME}/logs",
    report_to=["wandb"],
    run_name=f'{PROJECT_NAME}_LOCAL',
    push_to_hub=True,
    hub_model_id=f'Luigi/{PROJECT_NAME}',
    gradient_checkpointing=True,
)
# Enable gradient checkpointing on the model
model.gradient_checkpointing_enable()
# Metrics function for perplexity
def compute_metrics(p: EvalPrediction):
    eval_loss = p.metrics.get("eval_loss")
    if eval_loss is None:
        raise ValueError("eval_loss not found. Ensure evaluation_strategy is set.")
    return {"perplexity": math.exp(eval_loss)}
# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    callbacks=[WandbCallback],
)
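# Note: tokenized_val is prepared above but not passed as eval_dataset, so no
# evaluation (and no perplexity metric) is computed during this run.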
# Start training
trainer.train(resume_from_checkpoint=False)
# Save and push model and tokenizer
trainer.save_model(PROJECT_NAME)
trainer.push_to_hub()  # pushes to the hub_model_id set in TrainingArguments
tokenizer.save_pretrained(PROJECT_NAME)
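# To reuse the fine-tuned checkpoint later, it can be reloaded from the local
# output directory (or from the Hub repo it was pushed to), e.g. (illustrative names):
#   reloaded_model = AutoModelForCausalLM.from_pretrained(PROJECT_NAME)
#   reloaded_tokenizer = AutoTokenizer.from_pretrained(PROJECT_NAME)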
# Test the fine-tuned model
hf_device = 0 if device_str in ("cuda","xpu") else -1
gen = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=hf_device,
    max_new_tokens=512,
)
prompt = "請問台北今天的天氣如何?"
output = gen(prompt, do_sample=True, temperature=0.8)
print(output[0]["generated_text"])