# Original training configuration (superseded by the revised config below).
# NOTE(review): the six leading positional arguments are assumed to map to
# (output_dir, overwrite_output_dir, do_train, do_eval, do_predict,
# evaluation_strategy) in transformers' TrainingArguments — confirm against
# the installed transformers version; they are spelled out as keywords here
# for readability.
training_config = TrainingArguments(
    model_dir_str,                   # output_dir
    overwrite_output_dir=False,
    do_train=True,
    do_eval=True,
    do_predict=False,
    evaluation_strategy="steps",
    per_device_train_batch_size=64,  # ~76% GPU util @ 24 and @ 32; trying 64
    per_device_eval_batch_size=64,   # was 24, then 32
    gradient_accumulation_steps=3,   # TODO: try 4
    eval_accumulation_steps=None,
    eval_steps=2000,
    learning_rate=1e-4,
    weight_decay=0.01,
    max_grad_norm=1.0,
    max_steps=50000,
    lr_scheduler_type="cosine",
    warmup_ratio=0.08,
    log_level="debug",
    logging_strategy="steps",
    logging_steps=20,
    save_strategy="steps",
    # Was 1000: load_best_model_at_end=True requires save_steps to be a
    # round multiple of eval_steps (HF raises otherwise), so align with
    # eval_steps=2000.
    save_steps=2000,
    save_total_limit=5,
    no_cuda=not USE_CUDA,
    seed=444,
    fp16=FP16,
    fp16_full_eval=FP16_EVAL,
    bf16=BF16,
    bf16_full_eval=BF16_EVAL,
    load_best_model_at_end=True,
    label_smoothing_factor=0.0,
    optim="adamw_torch",
    report_to=["tensorboard"],
    gradient_checkpointing=True,     # trade compute for memory
    dataloader_num_workers=8,        # keep the GPU fed (avoid input starvation)
    dataloader_pin_memory=True,      # faster host->device transfers
    torch_compile=True,              # speed up via torch.compile
)
# Better configuration suggested by AI (revision of the config above):
# Revised training configuration — changes vs. the original above:
# fewer total steps (30k vs 50k), eval/save every 3000 steps, first eval
# delayed to step 6000, coarser logging, label smoothing enabled, and
# gradient checkpointing disabled (speed over memory).
# NOTE(review): positional arguments spelled out as keywords; assumed
# mapping is (output_dir, overwrite_output_dir, do_train, do_eval,
# do_predict, evaluation_strategy) — confirm against the installed
# transformers version.
training_config = TrainingArguments(
    model_dir_str,                   # output_dir
    overwrite_output_dir=False,
    do_train=True,
    do_eval=True,
    do_predict=False,
    evaluation_strategy="steps",
    per_device_train_batch_size=64,  # ~76% GPU util @ 24 and @ 32; trying 64
    per_device_eval_batch_size=64,   # was 24, then 32
    gradient_accumulation_steps=3,   # TODO: try 4
    eval_accumulation_steps=None,
    eval_steps=3000,
    eval_delay=6000,                 # skip evals before step 6000
    learning_rate=1e-4,
    weight_decay=0.01,
    max_grad_norm=1.0,
    max_steps=30000,
    lr_scheduler_type="cosine",
    warmup_ratio=0.08,
    log_level="debug",
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=3000,                 # multiple of eval_steps, as required by
                                     # load_best_model_at_end
    save_total_limit=5,
    no_cuda=not USE_CUDA,
    seed=444,
    fp16=FP16,
    fp16_full_eval=FP16_EVAL,
    bf16=BF16,
    bf16_full_eval=BF16_EVAL,
    load_best_model_at_end=True,
    label_smoothing_factor=0.05,
    optim="adamw_torch",
    report_to=["tensorboard"],
    gradient_checkpointing=False,    # disabled for throughput
    dataloader_num_workers=8,        # keep the GPU fed (avoid input starvation)
    dataloader_pin_memory=True,      # faster host->device transfers
    torch_compile=True,              # speed up via torch.compile
)