See axolotl config

axolotl version: 0.13.0.dev0

# Base checkpoint that this run continues training from.
base_model: Goader/gemma-3-12b-pt-transtokenizers
# Tokenizer is loaded from a separate repository.
# NOTE(review): presumably this vocabulary matches the base model's embeddings — confirm.
tokenizer_config: lapa-llm/tokenizer

# ChatGPT recommendation: keep this false unless the model is known to have
# unused parameters; enabling it adds overhead during distributed training.
ddp_find_unused_parameters: false
shuffle_merged_datasets: false
shuffle_before_merging_datasets: false # Whether to shuffle each dataset individually before merging them (disabled)
# TODO: discuss how to load the data (datasets vs. pretraining_dataset)
pretraining_dataset:
  - path: Goader/kobza-2m-jsonl
    type: pretrain
#   - path: le-llm/high-estimated-pretraining-data
#     type:
pretrain_multipack_buffer_size: 10000
dataset_processes: 64
# dataset_keep_in_memory: true
dataloader_num_workers: 8
dataloader_prefetch_factor: 9

# Output location for this run and the path used for the prepared dataset.
output_dir: ./outputs/gemma-3-12b-transtokenizers-pt
dataset_prepared_path: last_run_prepared_embeddings

# Sequences are packed and padded to a fixed length of 1024 tokens.
sequence_len: 1024
sample_packing: true
pad_to_sequence_len: true
train_on_inputs: true # possibly unnecessary for pretraining — TODO confirm

# Number of GPUs for Tensor Parallelism.

# deepspeed: # TODO: discuss the DeepSpeed configuration; I recommend DeepSpeed ZeRO-2 for faster training
# My DeepSpeed ZeRO-2 config:
# {
#   "zero_optimization": {
#     "stage": 2,
#     "overlap_comm": true,
#     "contiguous_gradients": true,
#     "sub_group_size": 0,
#     "reduce_scatter": true,
#     "allgather_bucket_size": 500000000,
#     "reduce_bucket_size": 500000000
#   },
#   "bf16": { "enabled": true },
#   "gradient_accumulation_steps": "auto",
#   "gradient_clipping": "auto",
#   "train_batch_size": "auto",
#   "train_micro_batch_size_per_gpu": "auto",
#   "wall_clock_breakdown": false
# }
# deepcompile: true # TODO: let's try? should speed up training

# Parameter-name patterns selected for training: the output head and the
# input token embeddings.
# NOTE(review): presumably every parameter not matched here stays frozen —
# confirm against the axolotl documentation.
unfrozen_parameters:
  - ^lm_head.weight$
  - ^model.language_model.embed_tokens.weight$

# Liger integration: register the plugin, then enable each Liger option
# listed below.
plugins:
  - axolotl.integrations.liger.LigerPlugin
liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_layer_norm: true
liger_fused_linear_cross_entropy: true

# Weights & Biases settings. Only the project name is set; the remaining keys
# are left empty, which YAML parses as null.
wandb_project: matt
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 2
micro_batch_size: 4
# num_epochs: 1
# Prefer num_epochs over max_steps once a non-streaming dataset can be used.
max_steps: 15000
# Resuming training from checkpoints is currently problematic, so checkpoints
# are used to track the model at different training stages instead.
# NOTE(review): the original note expected around 15-20 checkpoints, but
# max_steps 15000 / save_steps 5000 yields only 3 — confirm the intended interval.
save_steps: 5000
save_total_limit: 30

ddp_timeout: 7200 # TODO: timeout of 2 hours for waiting for nodes

# Gradient checkpointing is currently disabled; the config author estimates
# that turning it off speeds up training by 20-30%.
# NOTE(review): the original note said no way was found to avoid it because of
# GPU memory limitations — re-enable it (with the kwargs below) if the run
# goes out of memory.
gradient_checkpointing: false
#gradient_checkpointing_kwargs:
#  use_reentrant: false
logging_steps: 10
flash_attention: true

# Some calculations for our case by ChatGPT, based on https://arxiv.org/pdf/2507.07101 - https://chatgpt.com/s/t_68c57258c87c81918b9da47a017888e8
optimizer: adamw_torch_fused  # alternative: adamw_bnb_8bit
warmup_ratio: 0.1
lr_scheduler: warmup_stable_decay
lr_scheduler_kwargs:
  # FIXME: num_decay_steps still needs to be settled!
  # The Kimi K2 paper uses 30% of training for decay; the value should be
  # calculated from the size of the good dataset. ChatGPT recommended decay
  # (~32%) around 45k steps for our case.
  # NOTE(review): 10000 is about 67% of the max_steps (15000) configured in
  # this file, which does not match the ~30% guidance above.
  num_decay_steps: 10000
  min_lr_ratio: 0.05

# Exponent-form floats are written with an explicit decimal point: YAML 1.1
# loaders (e.g. PyYAML) resolve `5e-5` as a string, whereas `5.0e-5` is a float
# under both YAML 1.1 and 1.2.
learning_rate: 5.0e-5
max_grad_norm: 1.0

# AdamW hyperparameters
adam_epsilon: 1.0e-6  # chosen for bf16 edge cases
adam_beta1: 0.9
# adam_beta2: 0.977543
adam_beta2: 0.98
# ChatGPT recommended excluding LayerNorm and bias parameters from weight
# decay when using 0.01.
weight_decay: 0.01

outputs/gemma-3-12b-transtokenizers-pt

This model is a fine-tuned version of Goader/gemma-3-12b-pt-transtokenizers on the Goader/kobza-2m-jsonl dataset (the `pretraining_dataset` listed in the axolotl config above).

Model description

More information needed

Intended uses & limitations

More information needed

Training and evaluation data

More information needed

Training procedure

Training hyperparameters

The following hyperparameters were used during training:

learning_rate: 5e-05
train_batch_size: 1
eval_batch_size: 4
seed: 42
gradient_accumulation_steps: 2
total_train_batch_size: 2
optimizer: Use OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9,0.98) and epsilon=1e-06 and optimizer_args=No additional optimizer arguments
lr_scheduler_type: warmup_stable_decay
lr_scheduler_warmup_steps: 1500
training_steps: 15000