#!/usr/bin/env python3
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import default_data_collator
from transformers import TrainingArguments, Trainer
from transformers import pipeline
from datasets import load_dataset
import torch
import os
import math

PROJECT_NAME = 'SmolLM2-135M-Instruct-TaiwanChat'
BASE_MODEL_ID = "HuggingFaceTB/SmolLM2-135M-Instruct"
DATASET_ID = "yentinglin/TaiwanChat"
N_SAMPLES = 3000
MAX_LEN = 512
VAL_FRACTION = 0.1
PER_DEVICE_TRAIN_BATCH_SIZE = 8
NUM_TRAIN_EPOCHS = 3

# Tell wandb which project to use, and that you want to log your model
os.environ["WANDB_PROJECT"] = f'{PROJECT_NAME}_LOCAL'
os.environ["WANDB_LOG_MODEL"] = "end"

# Detect accelerator type (Intel XPU, NVIDIA CUDA, or CPU fallback)
device_str = 'cpu'
if hasattr(torch, "xpu") and torch.xpu.is_available():
    device_str = 'xpu'
elif torch.cuda.is_available():
    device_str = 'cuda'
print(f'Device is {device_str}')

# Load Model & Tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, low_cpu_mem_usage=True)
# Keep the weights in fp32: mixed precision (bf16 on XPU, fp16 on CUDA) is
# requested via TrainingArguments below, and fp16 master weights would break
# the Trainer's gradient scaler.
model.to(device_str)

# Prepare the TaiwanChat Dataset
# Load and split into train/validation
# 1) Load the raw train split as a stream
raw_stream = load_dataset(
    DATASET_ID,
    split="train",   # no slicing here
    streaming=True,
)

# 2) (Optional) Shuffle the stream with a buffer
shuffled = raw_stream.shuffle(buffer_size=100, seed=42)

# 3) Take exactly N_SAMPLES examples
limited = shuffled.take(N_SAMPLES)

# 4) Split into train / validation
n_val = int(N_SAMPLES * VAL_FRACTION)
n_train = N_SAMPLES - n_val
train_stream = limited.take(n_train)
val_stream = limited.skip(n_train).take(n_val)

# Preprocessing function (receives a batch of examples, since map() is called
# with batched=True below)
def preprocess_examples(examples):
    chats = examples["messages"]  # batch of conversations (lists of messages)

    # 1) Render each conversation as ChatML text. No generation prompt here:
    #    the assistant replies we train on are already in the conversation.
    texts = [
        tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=False)
        for chat in chats
    ]

    # 2) Tokenize, pad/truncate to MAX_LEN
    toks = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN,
    )

    # 3) Find where the first assistant reply starts: right after the
    #    "<|im_start|>assistant\n" header. "<|im_start|>" is a single special
    #    token, while "assistant" may span several tokens, so match it as a
    #    subsequence.
    im_start_id = tokenizer.convert_tokens_to_ids("<|im_start|>")
    assistant_ids = tokenizer.encode("assistant", add_special_tokens=False)

    all_labels = []
    for input_ids, attention_mask in zip(toks["input_ids"], toks["attention_mask"]):
        start_of_reply = 0  # fallback: train on the whole sequence
        for i, tok in enumerate(input_ids):
            if tok == im_start_id and input_ids[i + 1:i + 1 + len(assistant_ids)] == assistant_ids:
                # the extra +1 is a heuristic skip over the newline after the header
                start_of_reply = i + 1 + len(assistant_ids) + 1
                break

        # 4) Build labels: -100 before the reply, copy the token ids after it,
        #    and mask padding positions so they do not contribute to the loss
        labels = [-100] * start_of_reply + input_ids[start_of_reply:]
        labels = [lab if mask == 1 else -100 for lab, mask in zip(labels, attention_mask)]
        all_labels.append(labels)

    return {
        "input_ids": toks["input_ids"],
        "attention_mask": toks["attention_mask"],
        "labels": all_labels,
    }

# 5) Tokenize on the fly with a small batch
tokenized_train = train_stream.map(
    preprocess_examples,
    batched=True,
    batch_size=32,               # controls RAM for each map() call
    remove_columns=["messages"]  # or whatever your raw column names are
)
tokenized_val = val_stream.map(
    preprocess_examples,
    batched=True,
    batch_size=32,
    remove_columns=["messages"]
)

# Every example is already padded to MAX_LEN, so the default collator just
# stacks the features into tensors. DataCollatorForLanguageModeling(mlm=False)
# would rebuild labels from input_ids and discard the assistant-only mask.
data_collator = default_data_collator

# Compute the total number of optimizer steps from the constants above (the
# Trainer cannot infer an epoch length from a streaming dataset):
steps_per_epoch = math.ceil(n_train / PER_DEVICE_TRAIN_BATCH_SIZE)
total_steps = steps_per_epoch * NUM_TRAIN_EPOCHS

# Define training arguments with evaluation
training_args = TrainingArguments(
    output_dir=PROJECT_NAME,
    max_steps=total_steps,  # required when training from a streaming dataset
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    learning_rate=5e-5,
    num_train_epochs=NUM_TRAIN_EPOCHS,  # informational; max_steps takes precedence
    fp16=(device_str == 'cuda'),
    bf16=(device_str == 'xpu'),
    eval_strategy="steps",   # called evaluation_strategy on older transformers releases
    eval_steps=1000,
    logging_steps=1000,
    save_steps=5000,
    greater_is_better=False,
    # W&B integration
    logging_dir=f"{PROJECT_NAME}/logs",
    report_to=["wandb"],
    run_name=f'{PROJECT_NAME}_LOCAL',
    # Hub integration: hub_model_id is the repository the Trainer pushes to
    push_to_hub=True,
    hub_model_id=f'Luigi/{PROJECT_NAME}',
    gradient_checkpointing=True,
)

# Enable gradient checkpointing on the model (redundant with the
# gradient_checkpointing=True flag above, but harmless)
model.gradient_checkpointing_enable()

# Trainer setup. report_to=["wandb"] already attaches the W&B callback, so it
# is not added again here. Perplexity is computed from trainer.evaluate()
# after training, because the EvalPrediction object passed to a
# compute_metrics callback does not carry the evaluation loss.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
)

# Start training
trainer.train(resume_from_checkpoint=False)

# Report perplexity = exp(cross-entropy loss) on the validation split
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

# Save and push model and tokenizer
trainer.save_model(PROJECT_NAME)
tokenizer.save_pretrained(PROJECT_NAME)
# The target repo comes from hub_model_id above; push_to_hub()'s first
# positional argument is a commit message, not a repository name.
trainer.push_to_hub()

# Test the fine-tuned model
gen = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=model.device,  # reuse whatever device the model already sits on
    max_new_tokens=512,
)
prompt = "請問台北今天的天氣如何?"  # "How is the weather in Taipei today?"
output = gen(prompt, do_sample=True, temperature=0.8)
print(output[0]["generated_text"])
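
# To reload the fine-tuned checkpoint later for inference, a minimal sketch
# (assumes the local output directory written by trainer.save_model() above):
#
#   from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
#   tok = AutoTokenizer.from_pretrained("SmolLM2-135M-Instruct-TaiwanChat")
#   mdl = AutoModelForCausalLM.from_pretrained("SmolLM2-135M-Instruct-TaiwanChat")
#   chat = pipeline("text-generation", model=mdl, tokenizer=tok, max_new_tokens=256)
#   print(chat("請推薦台北的美食。")[0]["generated_text"])  # "Please recommend food in Taipei."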