{
  "model_name_or_path": "DeepXR/Helion-2.5-Rnd",
  "output_dir": "./checkpoints/helion-2.5-rnd",
  "overwrite_output_dir": true,
  "do_train": true,
  "do_eval": true,
  "evaluation_strategy": "steps",
  "eval_steps": 500,
  "per_device_train_batch_size": 4,
  "per_device_eval_batch_size": 4,
  "gradient_accumulation_steps": 8,
  "learning_rate": 2e-05,
  "weight_decay": 0.01,
  "adam_beta1": 0.9,
  "adam_beta2": 0.999,
  "adam_epsilon": 1e-08,
  "max_grad_norm": 1.0,
  "num_train_epochs": 3,
  "max_steps": 150000,
  "lr_scheduler_type": "linear",
  "warmup_steps": 2000,
  "logging_dir": "./logs",
  "logging_strategy": "steps",
  "logging_steps": 10,
  "save_strategy": "steps",
  "save_steps": 1000,
  "save_total_limit": 5,
  "fp16": false,
  "bf16": true,
  "dataloader_num_workers": 8,
  "dataloader_pin_memory": true,
  "gradient_checkpointing": true,
  "gradient_checkpointing_kwargs": {
    "use_reentrant": false
  },
  "deepspeed": {
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": 1.0,
    "zero_optimization": {
      "stage": 2,
      "offload_optimizer": {
        "device": "cpu",
        "pin_memory": true
      },
      "overlap_comm": true,
      "contiguous_gradients": true,
      "reduce_bucket_size": 50000000
    },
    "fp16": {
      "enabled": false
    },
    "bf16": {
      "enabled": true
    },
    "optimizer": {
      "type": "AdamW",
      "params": {
        "lr": "auto",
        "betas": "auto",
        "eps": "auto",
        "weight_decay": "auto"
      }
    },
    "scheduler": {
      "type": "WarmupDecayLR",
      "params": {
        "warmup_min_lr": "auto",
        "warmup_max_lr": "auto",
        "warmup_num_steps": "auto",
        "total_num_steps": "auto"
      }
    },
    "zero_allow_untested_optimizer": true,
    "wall_clock_breakdown": false
  },
  "fsdp": "",
  "fsdp_config": {},
  "report_to": ["tensorboard", "wandb"],
  "run_name": "helion-2.5-rnd",
  "disable_tqdm": false,
  "remove_unused_columns": false,
  "label_names": ["labels"],
  "load_best_model_at_end": true,
  "metric_for_best_model": "eval_loss",
  "greater_is_better": false,
  "ignore_data_skip": false,
  "ddp_timeout": 1800,
  "torch_compile": false,
  "torch_compile_backend": "inductor",
  "torch_compile_mode": null,
  "optim": "adamw_torch_fused",
  "group_by_length": false,
  "length_column_name": "length",
  "ddp_find_unused_parameters": false,
  "ddp_bucket_cap_mb": null,
  "ddp_broadcast_buffers": null,
  "data_preprocessing": {
    "max_seq_length": 131072,
    "truncation": true,
    "padding": "max_length",
    "return_tensors": "pt"
  },
  "model_config_updates": {
    "use_cache": false,
    "attention_dropout": 0.0,
    "hidden_dropout": 0.0
  }
}