Luigi
/

SmolLM2-135M-Instruct-TaiwanChat

@@ -7,14 +7,16 @@ from datasets import load_dataset
 import torch
 import os
 import math
-import wandb
 from transformers.integrations import WandbCallback
 PROJECT_NAME = 'SmolLM2-135M-Instruct-TaiwanChat'
 BASE_MODEL_ID = "HuggingFaceTB/SmolLM2-135M-Instruct"
 DATASET_ID = "yentinglin/TaiwanChat"
-N_SAMPLES = 1000
-MAX_LEN = 256
 # Tell wandb which project to use, and that you want to log your model
 os.environ["WANDB_PROJECT"] = f'{PROJECT_NAME}_LOCAL'
@@ -30,15 +32,31 @@ print(f'Device is {device_str}')
 # Load Model & Tokenizer
 tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
-model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID)
-model.to(device_str)
 # Prepare the TaiwanChat Dataset
 # Load and split into train/validation
-full_ds = load_dataset(DATASET_ID, split=f"train[:{N_SAMPLES}]")
-splits = full_ds.train_test_split(test_size=0.1, seed=42)
-train_ds = splits['train']
-val_ds   = splits['test']
 # Preprocessing function
 def preprocess_examples(examples):
@@ -78,24 +96,37 @@ def preprocess_examples(examples):
             "attention_mask": attention_mask,
             "labels": labels}
-# Tokenize and collate
-tokenized_train = train_ds.map(
-    preprocess_examples, batched=True, remove_columns=train_ds.column_names
 )
-tokenized_val = val_ds.map(
-    preprocess_examples, batched=True, remove_columns=val_ds.column_names
 )
 data_collator = DataCollatorForLanguageModeling(
     tokenizer=tokenizer, mlm=False
 )
 # Define training arguments with evaluation
 training_args = TrainingArguments(
     output_dir=PROJECT_NAME,
-    per_device_train_batch_size=4,
     learning_rate=5e-5,
-    num_train_epochs=3,
     fp16=False if device_str == 'xpu' else True,
     bf16=True if device_str == 'xpu' else False,
     logging_steps=1000,

 import torch
 import os
 import math
 from transformers.integrations import WandbCallback
 PROJECT_NAME = 'SmolLM2-135M-Instruct-TaiwanChat'
 BASE_MODEL_ID = "HuggingFaceTB/SmolLM2-135M-Instruct"
 DATASET_ID = "yentinglin/TaiwanChat"
+N_SAMPLES = 9000  # number of examples taken from the shuffled stream
+MAX_LEN = 512  # NOTE(review): presumably the token limit applied in preprocess_examples - confirm
+VAL_FRACTION = 0.1  # share of N_SAMPLES held out as the validation stream
+PER_DEVICE_TRAIN_BATCH_SIZE=1  # passed to TrainingArguments and used to derive steps_per_epoch
+NUM_TRAIN_EPOCHS=3  # passed to TrainingArguments and used to derive total_steps
 # Tell wandb which project to use, and that you want to log your model
 os.environ["WANDB_PROJECT"] = f'{PROJECT_NAME}_LOCAL'  # project name gets a "_LOCAL" suffix
 # Load Model & Tokenizer
 tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
+model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, low_cpu_mem_usage=True )
+model.to(device_str, dtype=torch.bfloat16 if device_str == 'xpu' else torch.float16)
 # Prepare the TaiwanChat Dataset
 # Load and split into train/validation
+# 1) Load the raw train split as a stream
+raw_stream = load_dataset(
+    DATASET_ID,
+    split="train",         # no slicing here
+    streaming=True
+)
+# 2) (Optional) Shuffle the stream with a buffer
+shuffled = raw_stream.shuffle(buffer_size=5_000, seed=42)
+# 3) Take exactly N_SAMPLES examples
+limited = shuffled.take(N_SAMPLES)
+# 4) Split into train / validation
+n_val   = int(N_SAMPLES * VAL_FRACTION)
+n_train = N_SAMPLES - n_val
+train_stream = limited.take(n_train)
+val_stream   = limited.skip(n_train).take(n_val)
 # Preprocessing function
 def preprocess_examples(examples):
             "attention_mask": attention_mask,
             "labels": labels}
+# 5) Tokenize on the fly with a small batch
+tokenized_train = train_stream.map(
+    preprocess_examples,
+    batched=True,
+    batch_size=32,               # controls RAM for each map() call
+    remove_columns=["messages"]  # or whatever your raw column names are
 )
+tokenized_val = val_stream.map(
+    preprocess_examples,
+    batched=True,
+    batch_size=32,
+    remove_columns=["messages"]
 )
 # Collator for causal language modelling (masked-LM mode switched off).
 data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+# 1) Compute steps_per_epoch from your constants:
+steps_per_epoch = math.ceil(N_SAMPLES / PER_DEVICE_TRAIN_BATCH_SIZE)
+total_steps     = steps_per_epoch * NUM_TRAIN_EPOCHS
 # Define training arguments with evaluation
 training_args = TrainingArguments(
+    max_steps=total_steps,
     output_dir=PROJECT_NAME,
+    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
     learning_rate=5e-5,
+    num_train_epochs=NUM_TRAIN_EPOCHS,
     fp16=False if device_str == 'xpu' else True,
     bf16=True if device_str == 'xpu' else False,
     logging_steps=1000,