#! /usr/bin/env python3
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import DataCollatorForLanguageModeling
from transformers import TrainingArguments, Trainer
from transformers import pipeline
from datasets import load_dataset
import torch
import os
import wandb
from transformers.integrations import WandbCallback

PROJECT_NAME='SmolLM2-135M-Instruct-TaiwanChat'
BASE_MODEL_ID="HuggingFaceTB/SmolLM2-135M-Instruct"
DATASET_ID="yentinglin/TaiwanChat"
N_SAMPLES=40000
MAX_LEN=512

# Tell wandb which project to use, and that you want to log your model
os.environ["WANDB_PROJECT"]    = PROJECT_NAME
os.environ["WANDB_LOG_MODEL"]  = "end"

# Detect GPU Type
device_str='cpu'
if torch.xpu.is_available():
    device_str='xpu'
elif torch.cuda.is_available():
    device_str='cuda'
print(f'Device is {device_str}')

# Load Model & Tokenizer
model_name = BASE_MODEL_ID
tokenizer = AutoTokenizer.from_pretrained(model_name)
model     = AutoModelForCausalLM.from_pretrained(model_name)
model.to(device_str)

# Prepare the TaiwanChat Dataset
dataset = load_dataset(DATASET_ID, split=f"train[:{N_SAMPLES}]")

# Preprocessing Function
def preprocess_examples(examples):
    # Each 'messages' entry is a list of {"role","content"} dicts
    chats = examples["messages"]
    # Render into a single string via ChatML template
    text = tokenizer.apply_chat_template(chats, tokenize=False, add_generation_prompt=True)
    # Tokenize with truncation
    tokens = tokenizer(text, truncation=True, max_length=MAX_LEN)
    return {"input_ids": tokens["input_ids"], 
            "attention_mask": tokens["attention_mask"]}

# Tokenization & Data Collator
tokenized_ds = dataset.map(
    preprocess_examples,
    batched=True,
    remove_columns=dataset.column_names,
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

training_args = TrainingArguments(
    output_dir=PROJECT_NAME,
    per_device_train_batch_size=4,
    learning_rate=5e-5,
    num_train_epochs=3,
    fp16=False if device_str == 'xpu' else True,
    bf16=True if device_str == 'xpu' else False,
    logging_steps=1000,
    save_steps=5000,

    # ─── W&B integration ───
    logging_dir=f"{PROJECT_NAME}/logs",    # where to store TensorBoard/W&B logs
    report_to=["wandb"],                   # enable W&B reporting
    run_name=PROJECT_NAME,                 # name this run in your W&B project

    push_to_hub=True,
    gradient_checkpointing=True,
)

# Training with Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds,
    data_collator=data_collator,
    callbacks=[WandbCallback],  # ensure the W&B callback is attached
)
trainer.train(resume_from_checkpoint=False)

# Save Model & Tokenizer Locally
trainer.save_model(PROJECT_NAME)
trainer.push_to_hub(f'Luigi/{PROJECT_NAME}')
tokenizer.save_pretrained(PROJECT_NAME)

# 1) Load from local folder
model_dir = PROJECT_NAME
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model     = AutoModelForCausalLM.from_pretrained(model_dir)  # loads your fine‑tuned weights :contentReference[oaicite:2]{index=2}

# Test Fine-tuned Model
hf_device = 0 if device_str in ("cuda","xpu") else -1
gen = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=hf_device,                # or device=0 for GPU
    max_new_tokens=512,        # customize as desired
)

prompt = "請問台北今天的天氣如何？"
output = gen(prompt, do_sample=True, temperature=0.8)
print(output[0]["generated_text"])