Luigi committed on
Commit 8249056 · 1 Parent(s): bc9cad6

Add padding and truncation on examples to fix max length

Files changed (2)
  1. train.py +28 -22
  2. train_with_unsloth.py +27 -22
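
Background for the fix (an illustration, not part of the commit itself): before this change, each example was tokenized with truncation only, so sequences came out at different lengths and could not be stacked into one batch tensor. A toy sketch of that failure mode and of why padding every example to a fixed max length resolves it:

import torch

# Hypothetical toy batch; before the fix, examples had ragged lengths.
ragged = [[1, 2, 3], [4, 5]]
try:
    torch.tensor(ragged)                  # stacking ragged lists fails
except ValueError as e:
    print("collation error:", e)

# After padding every example to the same max length, stacking works.
padded = [ids + [0] * (3 - len(ids)) for ids in ragged]
print(torch.tensor(padded).shape)         # torch.Size([2, 3])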
train.py CHANGED
@@ -12,7 +12,7 @@ from transformers.integrations import WandbCallback
 PROJECT_NAME='SmolLM2-135M-Instruct-TaiwanChat'
 BASE_MODEL_ID="HuggingFaceTB/SmolLM2-135M-Instruct"
 DATASET_ID="yentinglin/TaiwanChat"
-N_SAMPLES=40000
+N_SAMPLES=100
 MAX_LEN=512
 
 # Tell wandb which project to use, and that you want to log your model
@@ -38,38 +38,44 @@ dataset = load_dataset(DATASET_ID, split=f"train[:{N_SAMPLES}]")
 
 def preprocess_examples(examples):
     chats = examples["messages"]
-    # 1) Render as ChatML with the “assistant:” generation prompt
+    # 1) Render ChatML
     text = tokenizer.apply_chat_template(
-        chats,
-        tokenize=False,
-        add_generation_prompt=True
+        chats, tokenize=False, add_generation_prompt=True
     )
-    # 2) Tokenize
-    toks = tokenizer(text, truncation=True, max_length=MAX_LEN)
-    input_ids = toks["input_ids"]
-    attention_mask = toks["attention_mask"]
-
-    # 3) Build labels that mask all tokens _before_ the assistant turn
-    #    so we only compute loss on the assistant’s response
-    #    Find the index where the assistant prompt token <|im_start|>assistant occurs:
-    role_token_id = tokenizer.convert_tokens_to_ids("<|im_start|>assistant")
-    # find first occurrence
-    try:
-        idx = input_ids.index(role_token_id)
-    except ValueError:
-        idx = 0
-    # +2 to skip the role token and the following newline
-    start_of_reply = idx + 2
 
+    # 2) Tokenize _and_ pad/truncate to MAX_LEN
+    toks = tokenizer(
+        text,
+        truncation=True,
+        padding="max_length",
+        max_length=MAX_LEN,
+    )
+    input_ids = toks["input_ids"]
+    attention_mask= toks["attention_mask"]
+
+    # 3) Find where the assistant reply starts
+    role_id = tokenizer.convert_tokens_to_ids("<|im_start|>assistant")
+    if role_id in input_ids:
+        idx = input_ids.index(role_id)
+        start_of_reply = idx + 2
+    else:
+        start_of_reply = 0
+
+    # 4) Build labels: -100 before reply, then copy the rest
     labels = [-100] * start_of_reply + input_ids[start_of_reply:]
 
+    # 5) Pad or truncate labels to EXACTLY len(input_ids)
+    if len(labels) < len(input_ids):
+        labels += [-100] * (len(input_ids) - len(labels))
+    else:
+        labels = labels[: len(input_ids)]
+
     return {
         "input_ids": input_ids,
         "attention_mask": attention_mask,
         "labels": labels,
     }
 
-
 # Tokenization & Data Collator
 tokenized_ds = dataset.map(
     preprocess_examples,
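
In the patched preprocess_examples, steps 4 and 5 exist to keep labels exactly as long as input_ids once padding="max_length" is applied. A minimal, dependency-free sketch of that alignment logic, using toy token ids and a stand-in role id rather than the real SmolLM2 vocabulary:

MAX_LEN = 8
PAD_ID = 0
ROLE_ID = 42  # stand-in for the "<|im_start|>assistant" token id

def align_labels(input_ids):
    # Step 3: find where the assistant reply starts (+2 skips the role token
    # and the following newline), falling back to 0 if the token is absent.
    start_of_reply = input_ids.index(ROLE_ID) + 2 if ROLE_ID in input_ids else 0
    # Step 4: -100 before the reply, copy the rest (pad ids included, as committed).
    labels = [-100] * start_of_reply + input_ids[start_of_reply:]
    # Step 5: pad or truncate labels to exactly len(input_ids).
    if len(labels) < len(input_ids):
        labels += [-100] * (len(input_ids) - len(labels))
    else:
        labels = labels[: len(input_ids)]
    return labels

# A sequence already padded to MAX_LEN, as padding="max_length" would produce.
input_ids = [7, ROLE_ID, 9, 11, 13, PAD_ID, PAD_ID, PAD_ID]
labels = align_labels(input_ids)
assert len(labels) == len(input_ids) == MAX_LEN
print(labels)  # [-100, -100, -100, 11, 13, 0, 0, 0]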
train_with_unsloth.py CHANGED
@@ -71,38 +71,43 @@ val_ds = splits["test"]
 # Preprocessing Function
 def preprocess_examples(examples):
     chats = examples["messages"]
-    # 1) Render as ChatML with the “assistant:” generation prompt
+    # 1) Render ChatML
     text = tokenizer.apply_chat_template(
-        chats,
-        tokenize=False,
-        add_generation_prompt=True
+        chats, tokenize=False, add_generation_prompt=True
     )
-    # 2) Tokenize
-    toks = tokenizer(text, truncation=True, max_length=MAX_LEN)
-    input_ids = toks["input_ids"]
-    attention_mask = toks["attention_mask"]
-
-    # 3) Build labels that mask all tokens _before_ the assistant turn
-    #    so we only compute loss on the assistant’s response
-    #    Find the index where the assistant prompt token <|im_start|>assistant occurs:
-    role_token_id = tokenizer.convert_tokens_to_ids("<|im_start|>assistant")
-    # find first occurrence
-    try:
-        idx = input_ids.index(role_token_id)
-    except ValueError:
-        idx = 0
-    # +2 to skip the role token and the following newline
-    start_of_reply = idx + 2
 
+    # 2) Tokenize _and_ pad/truncate to MAX_LEN
+    toks = tokenizer(
+        text,
+        truncation=True,
+        padding="max_length",
+        max_length=MAX_LEN,
+    )
+    input_ids = toks["input_ids"]
+    attention_mask= toks["attention_mask"]
+
+    # 3) Find where the assistant reply starts
+    role_id = tokenizer.convert_tokens_to_ids("<|im_start|>assistant")
+    if role_id in input_ids:
+        idx = input_ids.index(role_id)
+        start_of_reply = idx + 2
+    else:
+        start_of_reply = 0
+
+    # 4) Build labels: -100 before reply, then copy the rest
     labels = [-100] * start_of_reply + input_ids[start_of_reply:]
 
+    # 5) Pad or truncate labels to EXACTLY len(input_ids)
+    if len(labels) < len(input_ids):
+        labels += [-100] * (len(input_ids) - len(labels))
+    else:
+        labels = labels[: len(input_ids)]
+
     return {
         "input_ids": input_ids,
         "attention_mask": attention_mask,
         "labels": labels,
     }
-
-
 # Tokenization & Data Collator
 tokenized_train = train_ds.map(
     preprocess_examples, batched=True, remove_columns=train_ds.column_names
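
train_with_unsloth.py receives the same preprocessing change. A hypothetical spot check (not part of either file) that runs the patched tokenization path on a single chat and confirms the fixed-length output; it assumes the SmolLM2 tokenizer can be downloaded and falls back to the eos token if no pad token is configured:

from transformers import AutoTokenizer

MAX_LEN = 512
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M-Instruct")
if tokenizer.pad_token is None:  # defensive: padding="max_length" needs a pad token
    tokenizer.pad_token = tokenizer.eos_token

chat = [{"role": "user", "content": "Hello! Please introduce yourself."}]
text = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
toks = tokenizer(text, truncation=True, padding="max_length", max_length=MAX_LEN)

# Every example now comes back at exactly MAX_LEN, so batching cannot fail on length.
assert len(toks["input_ids"]) == len(toks["attention_mask"]) == MAX_LEN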