Luigi committed on
Commit
2340301
·
1 Parent(s): 22d1cb2

Adjustment to avoid OOM error

Browse files
Files changed (1) hide show
  1. train_with_unsloth.py +8 -4
train_with_unsloth.py CHANGED
@@ -20,6 +20,7 @@ from datasets import load_dataset
20
  import os
21
  from transformers.integrations import WandbCallback
22
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 
23
 
24
  PROJECT_NAME='SmolLM2-135M-Instruct-TaiwanChat'
25
  BASE_MODEL_ID="unsloth/SmolLM2-135M-Instruct"
@@ -27,6 +28,8 @@ DATASET_ID="yentinglin/TaiwanChat"
27
  N_SAMPLES=80000
28
  MAX_LEN=2048
29
 
 
 
30
  # Tell wandb which project to use, and that you want to log your model
31
  os.environ["WANDB_PROJECT"] = f"{PROJECT_NAME}_CLOUD"
32
  os.environ["WANDB_LOG_MODEL"] = "end"
@@ -73,15 +76,15 @@ dataset = dataset.map(fmt, batched=True, remove_columns=["messages"])
73
  new_dataset = dataset.train_test_split(test_size = 0.01)
74
 
75
  training_args = SFTConfig(
76
- fp16_full_eval = True,
77
- per_device_eval_batch_size = 2,
78
  eval_accumulation_steps = 4,
79
- eval_strategy = "steps",
80
  eval_steps = 1,
81
  dataset_text_field="text",
82
  output_dir=PROJECT_NAME,
83
  max_seq_length = MAX_LEN,
84
- per_device_train_batch_size = 2,
85
  gradient_accumulation_steps = 4,
86
  warmup_steps = 10,
87
  max_steps = 60,
@@ -106,6 +109,7 @@ trainer = SFTTrainer(
106
  train_dataset = new_dataset["train"],
107
  eval_dataset = new_dataset["test"],
108
  )
 
109
  trainer = train_on_responses_only(
110
  trainer,
111
  instruction_part = "<|im_start|>user\n",
 
20
  import os
21
  from transformers.integrations import WandbCallback
22
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
23
+ import torch
24
 
25
  PROJECT_NAME='SmolLM2-135M-Instruct-TaiwanChat'
26
  BASE_MODEL_ID="unsloth/SmolLM2-135M-Instruct"
 
28
  N_SAMPLES=80000
29
  MAX_LEN=2048
30
 
31
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:128"
32
+
33
  # Tell wandb which project to use, and that you want to log your model
34
  os.environ["WANDB_PROJECT"] = f"{PROJECT_NAME}_CLOUD"
35
  os.environ["WANDB_LOG_MODEL"] = "end"
 
76
  new_dataset = dataset.train_test_split(test_size = 0.01)
77
 
78
  training_args = SFTConfig(
79
+ fp16_full_eval = False,
80
+ per_device_eval_batch_size = 1,
81
  eval_accumulation_steps = 4,
82
+ eval_strategy = "epoch",
83
  eval_steps = 1,
84
  dataset_text_field="text",
85
  output_dir=PROJECT_NAME,
86
  max_seq_length = MAX_LEN,
87
+ per_device_train_batch_size = 1,
88
  gradient_accumulation_steps = 4,
89
  warmup_steps = 10,
90
  max_steps = 60,
 
109
  train_dataset = new_dataset["train"],
110
  eval_dataset = new_dataset["test"],
111
  )
112
+ torch.cuda.empty_cache()
113
  trainer = train_on_responses_only(
114
  trainer,
115
  instruction_part = "<|im_start|>user\n",