diff --git "a/output.log" "b/output.log" new file mode 100644--- /dev/null +++ "b/output.log" @@ -0,0 +1,10303 @@ +[2025-05-27 14:06:18,510] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-05-27 14:06:32,750] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-05-27 14:06:32,873] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-05-27 14:06:32,921] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-05-27 14:06:32,987] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-05-27 14:06:33,612] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-05-27 14:06:33,711] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-05-27 14:06:33,780] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-05-27 14:06:34,111] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-05-27 14:06:38,047] [INFO] [comm.py:637:init_distributed] cdb=None +2025-05-27 14:06:38 - WARNING - __main__ - Process rank: 1, device: cuda:1, n_gpu: 1 distributed training: True, 16-bits training: False +[2025-05-27 14:06:38,313] [INFO] [comm.py:637:init_distributed] cdb=None +2025-05-27 14:06:38 - WARNING - __main__ - Process rank: 3, device: cuda:3, n_gpu: 1 distributed training: True, 16-bits training: False +[2025-05-27 14:06:38,388] [INFO] [comm.py:637:init_distributed] cdb=None +2025-05-27 14:06:38 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1 distributed training: True, 16-bits training: False +2025-05-27 14:06:38 - INFO - __main__ - Model parameters ModelArguments(base_model_revision=None, model_name_or_path='/workspace/amd-mla/llama3.1_8b-instruct', model_revision='main', model_code_revision=None, torch_dtype='bfloat16', tokenizer_name_or_path=None, trust_remote_code=False, use_flash_attention_2=True, use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False, bnb_4bit_quant_storage='uint8') +2025-05-27 14:06:38 - INFO - __main__ - Data parameters DataArguments(chat_template="{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", dataset_mixer={'JunxiongWang/sftdatasetv3': 1.0}, text_column='text', dataset_splits=['train', 'test'], dataset_configs=None, preprocessing_num_workers=128, truncation_side=None, auto_insert_empty_system_msg=True) +2025-05-27 14:06:38 - INFO - __main__ - Training/evaluation parameters SFTDistillConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +batch_eval_metrics=False, +bf16=True, 
+bf16_full_eval=False,
+ce_weight=0.0,
+data_ratio=0,
+data_seed=None,
+dataloader_drop_last=False,
+dataloader_num_workers=0,
+dataloader_persistent_workers=False,
+dataloader_pin_memory=True,
+dataloader_prefetch_factor=None,
+dataset_kwargs=None,
+ddp_backend=None,
+ddp_broadcast_buffers=None,
+ddp_bucket_cap_mb=None,
+ddp_find_unused_parameters=None,
+ddp_timeout=1800,
+debug=[],
+decontaminate=True,
+deepspeed=None,
+disable_tqdm=False,
+dispatch_batches=None,
+do_eval=True,
+do_predict=False,
+do_train=False,
+eval_accumulation_steps=None,
+eval_delay=0,
+eval_do_concat_batches=True,
+eval_on_start=False,
+eval_steps=None,
+eval_strategy=epoch,
+eval_use_gather_object=False,
+evaluation_strategy=epoch,
+first_mha=False,
+fp16=False,
+fp16_backend=auto,
+fp16_full_eval=False,
+fp16_opt_level=O1,
+freeze_non_mla=False,
+fsdp=[],
+fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
+fsdp_min_num_params=0,
+fsdp_transformer_layer_cls_to_wrap=None,
+full_determinism=False,
+gradient_accumulation_steps=2,
+gradient_checkpointing=False,
+gradient_checkpointing_kwargs={'use_reentrant': False},
+greater_is_better=None,
+group_by_length=False,
+half_precision_backend=auto,
+hub_always_push=False,
+hub_model_id=None,
+hub_private_repo=False,
+hub_strategy=every_save,
+hub_token=,
+ignore_data_skip=False,
+include_inputs_for_metrics=False,
+include_num_input_tokens_seen=False,
+include_tokens_per_second=False,
+init_with_kqvo=False,
+init_with_svd=False,
+jit_mode_eval=False,
+kl_hidden_weight=0.0,
+kl_weight=1.0,
+kv_energy_ratio=None,
+kv_lora_rank=160,
+label_names=None,
+label_smoothing_factor=0.0,
+layer_rank_list=None,
+learning_rate=2e-05,
+length_column_name=length,
+load_best_model_at_end=False,
+local_rank=0,
+log_level=info,
+log_level_replica=warning,
+log_on_each_node=True,
+logging_dir=hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/runs/May27_14-06-38_pdfc-aig-000004,
+logging_first_step=True,
+logging_nan_inf_filter=True,
+logging_steps=10,
+logging_strategy=steps,
+lr_scheduler_kwargs={},
+lr_scheduler_type=cosine,
+mamba_model_path=/workspace/amd-mla/HybridInLlama_mla0_mamba100_8B_stage1,
+mamba_to_mha=False,
+max_grad_norm=1.0,
+max_seq_length=2048,
+max_steps=-1,
+metric_for_best_model=None,
+mla_layers=[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30],
+mla_model_path=/workspace/amd-mla/HybridInLlama_mla100_mamba0_8B_kv160_q2048_np64_rp64_stage1,
+mla_to_mha=False,
+mp_parameters=,
+neftune_noise_alpha=None,
+no_cuda=False,
+num_train_epochs=1.4,
+optim=adamw_torch,
+optim_args=None,
+optim_target_modules=None,
+output_dir=hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B,
+overwrite_output_dir=True,
+past_index=-1,
+per_device_eval_batch_size=2,
+per_device_train_batch_size=4,
+prediction_loss_only=False,
+prev_checkpoint_path=None,
+push_to_hub=False,
+push_to_hub_model_id=None,
+push_to_hub_organization=None,
+push_to_hub_token=,
+q_energy_ratio=None,
+q_lora_rank=2048,
+qk_nope_head_dim=64,
+qk_rope_head_dim=64,
+ray_scope=last,
+remove_unused_columns=True,
+report_to=['wandb'],
+restore_callback_states_from_checkpoint=False,
+resume_from_checkpoint=None,
+run_name=hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B,
+save_on_each_node=False,
+save_only_model=False,
+save_safetensors=True,
+save_steps=2000,
+save_strategy=steps,
+save_total_limit=1,
+seed=42,
+skip_memory_metrics=True,
+split_batches=None,
+ssm_layers=None,
+stage=1,
+start_checkpoint_path=None,
+teacher_load_in_8bit=False,
+teacher_model_name_or_path=/workspace/amd-mla/llama3.1_8b-instruct,
+tf32=None,
+torch_compile=False,
+torch_compile_backend=None,
+torch_compile_mode=None,
+torch_empty_cache_steps=None,
+torchdynamo=None,
+tpu_metrics_debug=False,
+tpu_num_cores=None,
+use_cpu=False,
+use_fixed_rank_for_first_and_last_block=True,
+use_full_kv_head=False,
+use_ipex=False,
+use_legacy_prediction_loop=False,
+use_lora_layer_norm=False,
+use_mps_device=False,
+v_head_dim=128,
+warmup_ratio=0.01,
+warmup_steps=0,
+weight_decay=0.0,
+with_distill=False,
+)
+[2025-05-27 14:06:38,436] [INFO] [comm.py:637:init_distributed] cdb=None
+2025-05-27 14:06:38 - WARNING - __main__ - Process rank: 7, device: cuda:7, n_gpu: 1 distributed training: True, 16-bits training: False
+[2025-05-27 14:06:38,815] [INFO] [comm.py:637:init_distributed] cdb=None
+2025-05-27 14:06:38 - WARNING - __main__ - Process rank: 6, device: cuda:6, n_gpu: 1 distributed training: True, 16-bits training: False
+[2025-05-27 14:06:39,027] [INFO] [comm.py:637:init_distributed] cdb=None
+2025-05-27 14:06:39 - WARNING - __main__ - Process rank: 5, device: cuda:5, n_gpu: 1 distributed training: True, 16-bits training: False
+[2025-05-27 14:06:39,185] [INFO] [comm.py:637:init_distributed] cdb=None
+2025-05-27 14:06:39 - WARNING - __main__ - Process rank: 2, device: cuda:2, n_gpu: 1 distributed training: True, 16-bits training: False
+[2025-05-27 14:06:39,512] [INFO] [comm.py:637:init_distributed] cdb=None
+2025-05-27 14:06:39 - WARNING - __main__ - Process rank: 4, device: cuda:4, n_gpu: 1 distributed training: True, 16-bits training: False
+2025-05-27 14:06:40 - INFO - datasets.builder - No config specified, defaulting to the single config: sftdatasetv3/default
+2025-05-27 14:06:40 - INFO - datasets.info - Loading Dataset Infos from /opt/conda/envs/py_3.10/lib/python3.10/site-packages/datasets/packaged_modules/json
+2025-05-27 14:06:40 - INFO - datasets.builder - Overwrite dataset info from restored data version if exists.
+2025-05-27 14:06:40 - INFO - datasets.info - Loading Dataset info from /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3
+2025-05-27 14:06:40 - INFO - datasets.builder - Found cached dataset sftdatasetv3 (/root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3)
+2025-05-27 14:06:40 - INFO - datasets.info - Loading Dataset info from /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3
+2025-05-27 14:06:43 - INFO - datasets.builder - No config specified, defaulting to the single config: sftdatasetv3/default
+2025-05-27 14:06:43 - INFO - datasets.info - Loading Dataset Infos from /opt/conda/envs/py_3.10/lib/python3.10/site-packages/datasets/packaged_modules/json
+2025-05-27 14:06:43 - INFO - datasets.builder - Overwrite dataset info from restored data version if exists.
+2025-05-27 14:06:43 - INFO - datasets.info - Loading Dataset info from /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3
+2025-05-27 14:06:43 - INFO - datasets.builder - Found cached dataset sftdatasetv3 (/root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3)
+2025-05-27 14:06:43 - INFO - datasets.info - Loading Dataset info from /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3
+2025-05-27 14:06:44 - INFO - datasets.arrow_dataset - Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-3c0997594398e8d2.arrow
+2025-05-27 14:06:44 - INFO - datasets.arrow_dataset - Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-7bbe4f49b31cd388.arrow
+2025-05-27 14:06:44 - INFO - __main__ - Training on the following datasets and their proportions: ['train : 19473367', 'test : 4096']
+2025-05-27 14:06:44 - INFO - __main__ - *** Load pretrained model ***
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #0 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00000_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #1 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00001_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #2 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00002_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #3 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00003_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #4 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00004_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #5 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00005_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #6 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00006_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #7 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00007_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #8 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00008_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #9 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00009_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #10 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00010_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #11 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00011_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #12 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00012_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #13 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00013_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #14 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00014_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #15 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00015_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #16 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00016_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #17 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00017_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #18 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00018_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #19 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00019_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #20 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00020_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #21 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00021_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #22 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00022_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #23 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00023_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #24 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00024_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #25 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00025_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #26 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00026_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #27 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00027_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #28 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00028_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #29 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00029_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #30 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00030_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #31 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00031_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #32 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00032_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #33 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00033_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #34 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00034_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #35 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00035_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #36 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00036_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #37 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00037_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #38 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00038_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #39 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00039_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #40 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00040_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #41 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00041_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #42 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00042_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #43 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00043_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #44 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00044_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #45 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00045_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #46 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00046_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #47 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00047_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #48 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00048_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #49 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00049_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #50 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00050_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #51 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00051_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #52 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00052_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #53 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00053_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #54 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00054_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #55 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00055_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #56 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00056_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #57 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00057_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #58 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00058_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #59 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00059_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #60 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00060_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #61 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00061_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #62 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00062_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #63 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00063_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #64 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00064_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #65 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00065_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #66 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00066_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #67 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00067_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #68 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00068_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #69 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00069_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #70 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00070_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #71 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00071_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #72 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00072_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #73 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00073_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #74 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00074_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #75 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00075_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #76 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00076_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #77 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00077_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #78 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00078_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #79 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00079_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #80 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00080_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #81 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00081_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #82 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00082_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #83 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00083_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #84 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00084_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #85 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00085_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #86 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00086_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #87 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00087_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #88 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00088_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #89 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00089_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #90 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00090_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #91 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00091_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #92 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00092_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #93 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00093_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #94 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00094_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #95 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00095_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #96 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00096_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #97 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00097_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #98 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00098_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #99 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00099_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #100 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00100_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #101 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00101_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #102 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00102_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #103 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00103_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #104 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00104_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #105 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00105_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #106 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00106_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #107 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00107_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #108 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00108_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #109 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00109_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #110 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00110_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #111 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00111_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #112 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00112_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #113 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00113_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #114 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00114_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #115 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00115_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #116 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00116_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #117 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00117_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #118 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00118_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #119 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00119_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #120 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00120_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #121 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00121_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #122 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00122_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #123 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00123_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #124 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00124_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #125 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00125_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #126 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00126_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Process #127 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_00127_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-53367a923630a99b_*_of_00128.arrow
+2025-05-27 14:06:53 - INFO - datasets.arrow_dataset - Concatenating 128 shards
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #0 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00000_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #1 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00001_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #2 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00002_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #3 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00003_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #4 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00004_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #5 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00005_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #6 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00006_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #7 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00007_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #8 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00008_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #9 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00009_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #10 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00010_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #11 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00011_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #12 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00012_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #13 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00013_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #14 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00014_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #15 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00015_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #16 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00016_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #17 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00017_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #18 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00018_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #19 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00019_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #20 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00020_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #21 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00021_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #22 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00022_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #23 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00023_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #24 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00024_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #25 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00025_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #26 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00026_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #27 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00027_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #28 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00028_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #29 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00029_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #30 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00030_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #31 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00031_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #32 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00032_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #33 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00033_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #34 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00034_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #35 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00035_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #36 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00036_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #37 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00037_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #38 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00038_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #39 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00039_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #40 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00040_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #41 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00041_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #42 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00042_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #43 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00043_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #44 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00044_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #45 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00045_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #46 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00046_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #47 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00047_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #48 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00048_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #49 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00049_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #50 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00050_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #51 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00051_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #52 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00052_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #53 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00053_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #54 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00054_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #55 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00055_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #56 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00056_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #57 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00057_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #58 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00058_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #59 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00059_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #60 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00060_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #61 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00061_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #62 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00062_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #63 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00063_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #64 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00064_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #65 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00065_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #66 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00066_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #67 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00067_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #68 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00068_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #69 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00069_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #70 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00070_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #71 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00071_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #72 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00072_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #73 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00073_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #74 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00074_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #75 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00075_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #76 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00076_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #77 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00077_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #78 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00078_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #79 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00079_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #80 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00080_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #81 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00081_of_00128.arrow
+2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #82 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00082_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #83 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00083_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #84 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00084_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #85 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00085_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #86 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00086_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #87 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00087_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #88 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00088_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #89 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00089_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #90 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00090_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #91 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00091_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #92 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00092_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #93 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00093_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #94 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00094_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #95 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00095_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #96 will write at 
/root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00096_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #97 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00097_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #98 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00098_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #99 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00099_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #100 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00100_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #101 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00101_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #102 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00102_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #103 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00103_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #104 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00104_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #105 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00105_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #106 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00106_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #107 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00107_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #108 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00108_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #109 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00109_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #110 will write at 
/root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00110_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #111 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00111_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #112 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00112_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #113 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00113_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #114 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00114_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #115 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00115_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #116 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00116_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #117 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00117_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #118 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00118_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #119 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00119_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #120 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00120_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #121 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00121_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #122 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00122_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #123 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00123_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #124 will write at 
/root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00124_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #125 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00125_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #126 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00126_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Process #127 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_00127_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-06657d3d9d8fe92c_*_of_00128.arrow +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Concatenating 128 shards +2025-05-27 14:06:54 - INFO - datasets.arrow_dataset - Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-28b0dd13c85ad0f2.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #0 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00000_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #1 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00001_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #2 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00002_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #3 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00003_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #4 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00004_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #5 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00005_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #6 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00006_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #7 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00007_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #8 will write at 
/root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00008_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #9 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00009_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #10 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00010_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #11 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00011_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #12 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00012_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #13 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00013_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #14 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00014_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #15 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00015_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #16 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00016_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #17 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00017_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #18 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00018_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #19 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00019_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #20 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00020_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #21 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00021_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #22 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00022_of_00128.arrow 
+2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #23 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00023_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #24 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00024_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #25 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00025_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #26 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00026_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #27 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00027_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #28 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00028_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #29 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00029_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #30 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00030_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #31 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00031_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #32 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00032_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #33 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00033_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #34 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00034_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #35 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00035_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #36 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00036_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #37 will write at 
/root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00037_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #38 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00038_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #39 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00039_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #40 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00040_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #41 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00041_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #42 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00042_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #43 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00043_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #44 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00044_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #45 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00045_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #46 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00046_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #47 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00047_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #48 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00048_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #49 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00049_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #50 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00050_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #51 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00051_of_00128.arrow 
+2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #52 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00052_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #53 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00053_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #54 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00054_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #55 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00055_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #56 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00056_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #57 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00057_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #58 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00058_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #59 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00059_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #60 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00060_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #61 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00061_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #62 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00062_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #63 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00063_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #64 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00064_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #65 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00065_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #66 will write at 
/root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00066_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #67 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00067_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #68 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00068_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #69 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00069_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #70 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00070_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #71 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00071_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #72 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00072_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #73 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00073_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #74 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00074_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #75 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00075_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #76 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00076_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #77 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00077_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #78 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00078_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #79 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00079_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #80 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00080_of_00128.arrow 
+2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #81 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00081_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #82 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00082_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #83 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00083_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #84 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00084_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #85 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00085_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #86 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00086_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #87 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00087_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #88 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00088_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #89 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00089_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #90 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00090_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #91 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00091_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #92 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00092_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #93 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00093_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #94 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00094_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #95 will write at 
/root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00095_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #96 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00096_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #97 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00097_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #98 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00098_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #99 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00099_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #100 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00100_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #101 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00101_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #102 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00102_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #103 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00103_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #104 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00104_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #105 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00105_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #106 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00106_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #107 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00107_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #108 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00108_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #109 will write at 
/root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00109_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #110 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00110_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #111 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00111_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #112 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00112_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #113 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00113_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #114 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00114_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #115 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00115_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #116 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00116_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #117 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00117_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #118 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00118_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #119 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00119_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #120 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00120_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #121 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00121_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #122 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00122_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #123 will write at 
/root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00123_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #124 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00124_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #125 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00125_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #126 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00126_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Process #127 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_00127_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-b3a168c06d172ae5_*_of_00128.arrow +2025-05-27 14:07:23 - INFO - datasets.arrow_dataset - Concatenating 128 shards +2025-05-27 14:07:24 - INFO - datasets.arrow_dataset - Process #0 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-4eff8dc05779ba5f_00000_of_00128.arrow +2025-05-27 14:07:24 - INFO - datasets.arrow_dataset - Process #1 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-4eff8dc05779ba5f_00001_of_00128.arrow +2025-05-27 14:07:24 - INFO - datasets.arrow_dataset - Process #2 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-4eff8dc05779ba5f_00002_of_00128.arrow +2025-05-27 14:07:24 - INFO - datasets.arrow_dataset - Process #3 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-4eff8dc05779ba5f_00003_of_00128.arrow +2025-05-27 14:07:24 - INFO - datasets.arrow_dataset - Process #4 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-4eff8dc05779ba5f_00004_of_00128.arrow +2025-05-27 14:07:24 - INFO - datasets.arrow_dataset - Process #5 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-4eff8dc05779ba5f_00005_of_00128.arrow +2025-05-27 14:07:24 - INFO - datasets.arrow_dataset - Process #6 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-4eff8dc05779ba5f_00006_of_00128.arrow +2025-05-27 14:07:24 - INFO - datasets.arrow_dataset - Process #7 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-4eff8dc05779ba5f_00007_of_00128.arrow +2025-05-27 14:07:24 - INFO - datasets.arrow_dataset - Process #8 will write at 
/root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-4eff8dc05779ba5f_00008_of_00128.arrow +2025-05-27 14:07:24 - INFO - datasets.arrow_dataset - Process #9 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-4eff8dc05779ba5f_00009_of_00128.arrow +2025-05-27 14:07:24 - INFO - datasets.arrow_dataset - Process #10 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-4eff8dc05779ba5f_00010_of_00128.arrow +2025-05-27 14:07:24 - INFO - datasets.arrow_dataset - Process #11 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-4eff8dc05779ba5f_00011_of_00128.arrow +2025-05-27 14:07:24 - INFO - datasets.arrow_dataset - Process #12 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-4eff8dc05779ba5f_00012_of_00128.arrow +2025-05-27 14:07:24 - INFO - datasets.arrow_dataset - Process #13 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-4eff8dc05779ba5f_00013_of_00128.arrow +2025-05-27 14:07:24 - INFO - datasets.arrow_dataset - Process #14 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-4eff8dc05779ba5f_00014_of_00128.arrow +2025-05-27 14:07:24 - INFO - datasets.arrow_dataset - Process #15 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-4eff8dc05779ba5f_00015_of_00128.arrow +2025-05-27 14:07:24 - INFO - datasets.arrow_dataset - Process #16 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-4eff8dc05779ba5f_00016_of_00128.arrow +2025-05-27 14:07:24 - INFO - datasets.arrow_dataset - Process #17 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-4eff8dc05779ba5f_00017_of_00128.arrow +2025-05-27 14:07:24 - INFO - datasets.arrow_dataset - Process #18 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-4eff8dc05779ba5f_00018_of_00128.arrow +2025-05-27 14:07:24 - INFO - datasets.arrow_dataset - Process #19 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-4eff8dc05779ba5f_00019_of_00128.arrow +2025-05-27 14:07:24 - INFO - datasets.arrow_dataset - Process #20 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-4eff8dc05779ba5f_00020_of_00128.arrow +2025-05-27 14:07:24 - INFO - datasets.arrow_dataset - Process #21 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-4eff8dc05779ba5f_00021_of_00128.arrow +2025-05-27 14:07:24 - INFO - datasets.arrow_dataset - Process #22 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-4eff8dc05779ba5f_00022_of_00128.arrow 
+2025-05-27 14:07:24 - INFO - datasets.arrow_dataset - Process #23 will write at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-4eff8dc05779ba5f_00023_of_00128.arrow
[... identical "Process #N will write at .../cache-4eff8dc05779ba5f_000NN_of_00128.arrow" lines for processes #24 through #127 elided ...]
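The run of "Process #N will write at ..." lines above is the per-worker cache of a multiprocess `datasets.map`. Below is a minimal sketch of the kind of call that produces this 128-way sharded cache; the mapping function here is a hypothetical placeholder, not the script's actual preprocessing step:

```python
from datasets import load_dataset

# Hypothetical sketch: with num_proc=128, `datasets` splits the table into
# 128 shards; each worker process writes its own cache file named
# cache-<fingerprint>_000NN_of_00128.arrow, and the shards are
# concatenated once every worker finishes.
ds = load_dataset("JunxiongWang/sftdatasetv3", split="train")

def format_example(example):
    # Placeholder for the real chat-template formatting step.
    example["text"] = example["text"].strip()
    return example

ds = ds.map(format_example, num_proc=128)
```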
+2025-05-27 14:07:24 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /root/.cache/huggingface/datasets/JunxiongWang___sftdatasetv3/default/0.0.0/f47b5668d12df9db8284d9f98d9e384374495ab3/cache-4eff8dc05779ba5f_*_of_00128.arrow
+2025-05-27 14:07:24 - INFO - datasets.arrow_dataset - Concatenating 128 shards
+2025-05-27 14:07:24 - INFO - __main__ - Decontaminated 286 (0.00%) samples from the training set.
+2025-05-27 14:07:24 - INFO - __main__ - Sample 3735650 of the processed training set:
+
+<|begin_of_text|><|start_header_id|>user<|end_header_id|>
+
+How can I make predictions using a model trained on the XGBoost library in Python?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+
+To make predictions using an XGBoost model in Python, you can follow these steps:
+
+1. Load the trained model from a file. Here's an example:
+
+   ```python
+   import xgboost as xgb
+
+   # Load the model from a file
+   model = xgb.Booster()
+   model.load_model('xgboost-model.bin')
+   ```
+
+2. Prepare your test data:
+
+   ```python
+   test_data = pd.DataFrame(...) # Your test data
+   ```
+
+3. Make predictions on your test data:
+
+   ```python
+   predictions = model.predict(xgb.DMatrix(test_data))
+   ```
+
+4. Optionally, you can save the predictions to a file:
+
+   ```python
+   import pickle
+
+   with open('predictions.pkl', 'wb') as f:
+       pickle.dump(predictions, f)
+   ```
+
+Here's an example of how to use the `predict` method with a specific dataset:
+
+```python
+import xgboost as xgb
+import pandas as pd
+
+# Load the model
+model = xgb.Booster()
+model.load_model('xgboost-model.bin')
+
+# Load the dataset
+test_data = pd.read_csv('test_data.csv')
+
+# Make predictions
+predictions = model.predict(xgb.DMatrix(test_data))
+
+# Print the predictions
+print(predictions)
+```
+
+This code loads the trained model from a file, loads the test data from a CSV file, makes predictions on the test data, and prints the predictions.<|eot_id|>
+2025-05-27 14:07:24 - INFO - __main__ - Sample 839221 of the processed training set:
+
+<|begin_of_text|><|start_header_id|>user<|end_header_id|>
+
+Premise: three girls tug a rope.
+
+Hypothesis: Some people pull.
+
+.Multi-select problem: Can we conclude that the hypothesis is true if the premise is true?
+
+Possible answers: (I) yes. (II) it is not possible to tell. (III) no.<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+
+(I)<|eot_id|>
+2025-05-27 14:07:24 - INFO - __main__ - Sample 9228452 of the processed training set:
+
+<|begin_of_text|><|start_header_id|>user<|end_header_id|>
+
+Please provide a set of at least 10 wildlife safety tips specifically for camping in bear country. The tips should include what to do in case of a bear encounter, how to store food and trash properly, and any additional precautions campers should take to prevent bear interactions. The tips should be written in a clear and concise manner, and actionable for campers of all experience levels. Additionally, please consider including any relevant regulations or laws regarding camping in bear country if applicable.<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+
+1. Store food and trash in bear-resistant containers or hung at least 10 feet off the ground and 4 feet away from the trunk of the tree.
+2. Cook away from sleeping areas and store food smells away from your tent.
+3. Never store food or scented items in your tent or nearby.
+4. Never approach a bear, especially if it is with cubs.
+5. Make noise while hiking to avoid surprise encounters.
+6. Avoid hiking alone; always travel in groups.
+7. Carry bear spray and know how to use it.
+8. Do not run or climb a tree if you encounter a bear.
+9. Keep campsites clean and free of food debris.
+10. Check with park officials about any specific regulations or restrictions regarding camping in bear country.<|eot_id|><|start_header_id|>user<|end_header_id|>
+
+They're definitely helpful. Can you provide more information about how to properly use bear spray and any safety precautions to keep in mind when carrying it?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+
+Sure, here are some additional tips for using bear spray in case of a bear encounter:
+
+1. Carry the bear spray in an easily accessible location, such as a holster on your hip or chest. 2. When encountering a bear, make sure the bear is within range of the spray (typically 20-30 feet). 3. Remove the safety clip from the spray canister, aim slightly downwards, and aim at the bear's face when it is within range. 4. Spray the bear with a steady stream of bear spray and continue spraying for as long as possible (usually 5-7 seconds). 5. Use bear spray only as a last resort, after trying to scare the bear away by shouting, clapping, or making noise. In terms of safety precautions to keep in mind when carrying bear spray, here are a few tips:
+
+1. Read the instructions on the canister before using it. 2. Choose a windless day and avoid using the spray in strong winds or high heat. 3. Store the bear spray canister at room temperature, and avoid exposing it to extreme heat or cold. 4. Make sure the canister is not expired and has not been used before. 5. Practice using the spray canister before going camping, so you know how to use it if the need arises.<|eot_id|><|start_header_id|>user<|end_header_id|>
+
+Could you also provide some tips on how to properly dispose of trash in bear country?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+
+Sure, here are some tips for properly disposing of trash in bear country:
+
+1. Use bear-resistant trash cans or dumpsters when available. 2. If bear-resistant options are not available, pack out all trash and dispose of it properly in designated receptacles outside of bear habitat. 3. Double bag food waste and tie the bags tightly to prevent smells from escaping. 4. Avoid burning trash or leaving it in an open fire pit, as this can attract bears to your campsite. 5. Rinse all food containers and cooking utensils thoroughly with hot water after use to remove any remaining food smells. 6. Keep your campsite clean and free of food debris and trash. Remember, leaving trash in bear country can be dangerous for both humans and bears. It can habituate bears to human food and lead to them becoming more comfortable around people, putting both humans and bears in danger. Proper disposal of trash is crucial to protecting the wildlife and maintaining a safe camping experience.<|eot_id|>
+[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]
+*****mamba_model_path /workspace/amd-mla/HybridInLlama_mla0_mamba100_8B_stage1
+*****mla_model_path /workspace/amd-mla/HybridInLlama_mla100_mamba0_8B_kv160_q2048_np64_rp64_stage1
+*****model_args.model_name_or_path /workspace/amd-mla/llama3.1_8b-instruct
[... the same layer list and model-path prints repeat once per process; the interleaved, garbled duplicates from concurrent ranks are elided ...]
+****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
+****** Model Name ****** LlamaModel
+****** my_repeat_kv_before_conv ****** = False
[... this three-line config banner repeats verbatim for every model instantiation on every rank; the remaining repetitions are elided ...]
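The printed `[0, 2, 4, ..., 30]` list and the Mamba2 config's full `[0, ..., 31]` enumeration describe the hybrid's layer assignment. Assuming, as the surrounding prints suggest, that the even-indexed layers carry MLA attention and the remaining layers carry Mamba2, the SSM layer set is just the complement, as this small sketch shows:

```python
# Sketch under the assumption that even-indexed layers use MLA and the
# rest use Mamba2 in this 32-layer hybrid.
num_layers = 32
mla_layers = list(range(0, num_layers, 2))                  # [0, 2, ..., 30]
ssm_layers = [i for i in range(num_layers) if i not in mla_layers]
assert len(mla_layers) == len(ssm_layers) == 16             # 16 MLA + 16 Mamba2
```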
+****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** ****** Mamba2 Config layers ******[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 
21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 
9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 
Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** 
my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] 
+****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] +****** Model Name ****** LlamaModel +****** my_repeat_kv_before_conv ****** = False +****** Mamba2 Config layers ****** [0, 1, 2, 3, 4, 5, 6, 7, 
+layerid: 0, MLA initialization from MHA/GQA
+****** my_repeat_kv_before_conv ****** = False
+layerid: 1, SSM initialization from MHA/GQA
+layerid: 2, MLA initialization from MHA/GQA
+****** my_repeat_kv_before_conv ****** = False
+layerid: 3, SSM initialization from MHA/GQA
+layerid: 4, MLA initialization from MHA/GQA
+****** my_repeat_kv_before_conv ****** = False
+layerid: 5, SSM initialization from MHA/GQA
+layerid: 6, MLA initialization from MHA/GQA
+****** my_repeat_kv_before_conv ****** = False
+layerid: 7, SSM initialization from MHA/GQA
+layerid: 8, MLA initialization from MHA/GQA
+****** my_repeat_kv_before_conv ****** = False
+layerid: 9, SSM initialization from MHA/GQA
+layerid: 10, MLA initialization from MHA/GQA
+****** my_repeat_kv_before_conv ****** = False
+layerid: 11, SSM initialization from MHA/GQA
+layerid: 12, MLA initialization from MHA/GQA
+****** my_repeat_kv_before_conv ****** = False
+layerid: 13, SSM initialization from MHA/GQA
+layerid: 14, MLA initialization from MHA/GQA
+****** my_repeat_kv_before_conv ****** = False
+layerid: 15, SSM initialization from MHA/GQA
+layerid: 16, MLA initialization from MHA/GQA
+****** my_repeat_kv_before_conv ****** = False
+layerid: 17, SSM initialization from MHA/GQA
+layerid: 18, MLA initialization from MHA/GQA
+****** my_repeat_kv_before_conv ****** = False
+layerid: 19, SSM initialization from MHA/GQA
+layerid: 20, MLA initialization from MHA/GQA
+****** my_repeat_kv_before_conv ****** = False
+layerid: 21, SSM initialization from MHA/GQA
+layerid: 22, MLA initialization from MHA/GQA
+****** my_repeat_kv_before_conv ****** = False
+layerid: 23, SSM initialization from MHA/GQA
+layerid: 24, MLA initialization from MHA/GQA
+****** my_repeat_kv_before_conv ****** = False
+layerid: 25, SSM initialization from MHA/GQA
+layerid: 26, MLA initialization from MHA/GQA
+****** my_repeat_kv_before_conv ****** = False
+layerid: 27, SSM initialization from MHA/GQA
+layerid: 28, MLA initialization from MHA/GQA
+****** my_repeat_kv_before_conv ****** = False
+layerid: 29, SSM initialization from MHA/GQA
+layerid: 30, MLA initialization from MHA/GQA
+****** my_repeat_kv_before_conv ****** = False
+layerid: 31, SSM initialization from MHA/GQA
+#Params: 8185746944
+2025-05-27 14:09:52 - INFO - datasets.builder - Using custom data configuration default-929e6124881cf23e
+2025-05-27 14:09:52 - INFO - datasets.info - Loading Dataset Infos from /opt/conda/envs/py_3.10/lib/python3.10/site-packages/datasets/packaged_modules/generator
+2025-05-27 14:09:52 - INFO - datasets.builder - Overwrite dataset info from restored data version if exists.
+2025-05-27 14:09:52 - INFO - datasets.info - Loading Dataset info from /root/.cache/huggingface/datasets/generator/default-929e6124881cf23e/0.0.0
+2025-05-27 14:09:52 - INFO - datasets.builder - Found cached dataset generator (/root/.cache/huggingface/datasets/generator/default-929e6124881cf23e/0.0.0)
+2025-05-27 14:09:52 - INFO - datasets.info - Loading Dataset info from /root/.cache/huggingface/datasets/generator/default-929e6124881cf23e/0.0.0
+2025-05-27 14:09:53 - INFO - datasets.builder - Using custom data configuration default-e36fd01e029398dd
+2025-05-27 14:09:53 - INFO - datasets.info - Loading Dataset Infos from /opt/conda/envs/py_3.10/lib/python3.10/site-packages/datasets/packaged_modules/generator
+2025-05-27 14:09:53 - INFO - datasets.builder - Overwrite dataset info from restored data version if exists.
+2025-05-27 14:09:53 - INFO - datasets.info - Loading Dataset info from /root/.cache/huggingface/datasets/generator/default-e36fd01e029398dd/0.0.0
+2025-05-27 14:09:53 - INFO - datasets.builder - Found cached dataset generator (/root/.cache/huggingface/datasets/generator/default-e36fd01e029398dd/0.0.0)
+2025-05-27 14:09:53 - INFO - datasets.info - Loading Dataset info from /root/.cache/huggingface/datasets/generator/default-e36fd01e029398dd/0.0.0
+teacher_model_init_kwargs: {'revision': 'main', 'trust_remote_code': False, 'use_flash_attention_2': True, 'torch_dtype': torch.bfloat16, 'use_cache': True, 'device_map': None, 'quantization_config': None}
+[2025-05-27 14:09:57,890] [INFO] [partition_parameters.py:347:__exit__] finished initializing model - num_params = 291, num_elems = 8.03B
+[2025-05-27 14:10:07,079] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.12.2, git-hash=unknown, git-branch=unknown
+[2025-05-27 14:10:07,087] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
+[2025-05-27 14:10:07,088] [INFO] [logging.py:96:log_dist] [Rank 0] Creating ZeRO Offload
+[2025-05-27 14:10:07,248] [INFO] [utils.py:802:see_memory_usage] DeepSpeedZeRoOffload initialize [begin]
+[2025-05-27 14:10:07,248] [INFO] [utils.py:803:see_memory_usage] MA 17.62 GB Max_MA 47.24 GB CA 47.29 GB Max_CA 47 GB
+[2025-05-27 14:10:07,249] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 95.36 GB, percent = 5.2%
+Parameter Offload: Total persistent parameters: 266240 in 65 params
+[2025-05-27 14:10:07,383] [INFO] [utils.py:802:see_memory_usage] DeepSpeedZeRoOffload initialize [end]
+[2025-05-27 14:10:07,383] [INFO] [utils.py:803:see_memory_usage] MA 17.62 GB Max_MA 17.62 GB CA 47.29 GB Max_CA 47 GB
+[2025-05-27 14:10:07,383] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 95.36 GB, percent = 5.2%
+[2025-05-27 14:10:07,384] [INFO] [config.py:972:print] DeepSpeedEngine configuration:
+[2025-05-27 14:10:07,384] [INFO] [config.py:976:print] activation_checkpointing_config {
+ "partition_activations": false,
+ "contiguous_memory_optimization": false,
+ "cpu_checkpointing": false,
+ "number_checkpoints": null,
+ "synchronize_checkpoint_boundary": false,
+ "profile": false
+}
+[2025-05-27 14:10:07,384] [INFO] [config.py:976:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
+[2025-05-27 14:10:07,384] [INFO] [config.py:976:print] amp_enabled .................. False
+[2025-05-27 14:10:07,384] [INFO] [config.py:976:print] amp_params ................... False
+[2025-05-27 14:10:07,384] [INFO] [config.py:976:print] autotuning_config ............ {
+ "enabled": false,
+ "start_step": null,
+ "end_step": null,
+ "metric_path": null,
+ "arg_mappings": null,
+ "metric": "throughput",
+ "model_info": null,
+ "results_dir": "autotuning_results",
+ "exps_dir": "autotuning_exps",
+ "overwrite": true,
+ "fast": true,
+ "start_profile_step": 3,
+ "end_profile_step": 5,
+ "tuner_type": "gridsearch",
+ "tuner_early_stopping": 5,
+ "tuner_num_trials": 50,
+ "model_info_path": null,
+ "mp_size": 1,
+ "max_train_batch_size": null,
+ "min_train_batch_size": 1,
+ "max_train_micro_batch_size_per_gpu": 1.024000e+03,
+ "min_train_micro_batch_size_per_gpu": 1,
+ "num_tuning_micro_batch_sizes": 3
+}
+[2025-05-27 14:10:07,384] [INFO] [config.py:976:print] bfloat16_enabled ............. True
+[2025-05-27 14:10:07,384] [INFO] [config.py:976:print] checkpoint_parallel_write_pipeline False
+[2025-05-27 14:10:07,384] [INFO] [config.py:976:print] checkpoint_tag_validation_enabled True
+[2025-05-27 14:10:07,384] [INFO] [config.py:976:print] checkpoint_tag_validation_fail False
+[2025-05-27 14:10:07,384] [INFO] [config.py:976:print] comms_config .................
+[2025-05-27 14:10:07,384] [INFO] [config.py:976:print] communication_data_type ...... None
+[2025-05-27 14:10:07,384] [INFO] [config.py:976:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}}
+[2025-05-27 14:10:07,384] [INFO] [config.py:976:print] curriculum_enabled_legacy .... False
+[2025-05-27 14:10:07,384] [INFO] [config.py:976:print] curriculum_params_legacy ..... False
+[2025-05-27 14:10:07,384] [INFO] [config.py:976:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}}
+[2025-05-27 14:10:07,384] [INFO] [config.py:976:print] data_efficiency_enabled ...... False
+[2025-05-27 14:10:07,384] [INFO] [config.py:976:print] dataloader_drop_last ......... False
+[2025-05-27 14:10:07,384] [INFO] [config.py:976:print] disable_allgather ............ False
+[2025-05-27 14:10:07,384] [INFO] [config.py:976:print] dump_state ................... False
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] dynamic_loss_scale_args ...... None
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] eigenvalue_enabled ........... False
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] eigenvalue_gas_boundary_resolution 1
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] eigenvalue_layer_name ........ bert.encoder.layer
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] eigenvalue_layer_num ......... 0
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] eigenvalue_max_iter .......... 100
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] eigenvalue_stability ......... 1e-06
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] eigenvalue_tol ............... 0.01
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] eigenvalue_verbose ........... False
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] elasticity_enabled ........... False
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] flops_profiler_config ........ {
+ "enabled": false,
+ "recompute_fwd_factor": 0.0,
+ "profile_step": 1,
+ "module_depth": -1,
+ "top_modules": 1,
+ "detailed": true,
+ "output_file": null
+}
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] fp16_auto_cast ............... None
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] fp16_enabled ................. False
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] fp16_master_weights_and_gradients False
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] global_rank .................. 0
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] grad_accum_dtype ............. None
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] gradient_accumulation_steps .. 2
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] gradient_clipping ............ 1.0
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] gradient_predivide_factor .... 1.0
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] initial_dynamic_scale ........ 1
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] load_universal_checkpoint .... False
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] loss_scale ................... 1.0
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] memory_breakdown ............. False
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] mics_hierarchial_params_gather False
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] mics_shard_size .............. -1
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] nebula_config ................ {
+ "enabled": false,
+ "persistent_storage_path": null,
+ "persistent_time_interval": 100,
+ "num_of_version_in_retention": 2,
+ "enable_nebula_load": true,
+ "load_path": null
+}
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] optimizer_legacy_fusion ...... False
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] optimizer_name ............... None
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] optimizer_params ............. None
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0}
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] pld_enabled .................. False
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] pld_params ................... False
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] prescale_gradients ........... False
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] scheduler_name ............... None
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] scheduler_params ............. None
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] seq_parallel_communication_data_type torch.float32
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] sparse_attention ............. None
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] sparse_gradients_enabled ..... False
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] steps_per_print .............. inf
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] train_batch_size ............. 64
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] train_micro_batch_size_per_gpu 4
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] use_node_local_storage ....... False
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] wall_clock_breakdown ......... False
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] weight_quantization_config ... None
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] world_size ................... 8
+[2025-05-27 14:10:07,385] [INFO] [config.py:976:print] zero_allow_untested_optimizer False
+[2025-05-27 14:10:07,386] [INFO] [config.py:976:print] zero_config .................. stage=3 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500,000,000 allgather_partitions=True allgather_bucket_size=500,000,000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='none', nvme_path=None, buffer_count=5, buffer_size=100,000,000, max_in_cpu=1,000,000,000, pin_memory=False) offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='none', nvme_path=None, buffer_count=4, pin_memory=False, pipeline=False, pipeline_read=False, pipeline_write=False, fast_init=False) sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50,000,000 param_persistence_threshold=100,000 model_persistence_threshold=sys.maxsize max_live_parameters=1,000,000,000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True
+[2025-05-27 14:10:07,386] [INFO] [config.py:976:print] zero_enabled ................. True
+[2025-05-27 14:10:07,386] [INFO] [config.py:976:print] zero_force_ds_cpu_optimizer .. True
+[2025-05-27 14:10:07,386] [INFO] [config.py:976:print] zero_optimization_stage ...... 3
+[2025-05-27 14:10:07,386] [INFO] [config.py:962:print_user_config] json = {
+ "train_batch_size": 64,
+ "train_micro_batch_size_per_gpu": 4,
+ "gradient_accumulation_steps": 2,
+ "zero_optimization": {
+ "stage": 3,
+ "offload_optimizer": {
+ "device": "none",
+ "nvme_path": null
+ },
+ "offload_param": {
+ "device": "none",
+ "nvme_path": null
+ },
+ "stage3_gather_16bit_weights_on_model_save": false
+ },
+ "gradient_clipping": 1.0,
+ "steps_per_print": inf,
+ "bf16": {
+ "enabled": true
+ },
+ "fp16": {
+ "enabled": false
+ },
+ "zero_optimization.reduce_bucket_size": 1.677722e+07,
+ "zero_optimization.stage3_param_persistence_threshold": 4.096000e+04,
+ "zero_optimization.stage3_prefetch_bucket_size": 1.509949e+07
+}
+2025-05-27 14:10:07 - INFO - __main__ - *** Train ***
+[2025-05-27 14:10:07,864] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.12.2, git-hash=unknown, git-branch=unknown
+[2025-05-27 14:10:08,599] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
+[2025-05-27 14:10:08,600] [INFO] [logging.py:96:log_dist] [Rank 0] Using client Optimizer as basic optimizer
+[2025-05-27 14:10:08,600] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer
+[2025-05-27 14:10:08,611] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = AdamW
+[2025-05-27 14:10:08,611] [INFO] [utils.py:56:is_zero_supported_optimizer] Checking ZeRO support for optimizer=AdamW type=
+[2025-05-27 14:10:08,611] [INFO] [logging.py:96:log_dist] [Rank 0] Creating fp16 ZeRO stage 3 optimizer, MiCS is enabled False, Hierarchical params gather False
+[2025-05-27 14:10:08,611] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 3 optimizer
+[2025-05-27 14:10:08,829] [INFO] [utils.py:802:see_memory_usage] Stage 3 initialize beginning
+[2025-05-27 14:10:08,829] [INFO] [utils.py:803:see_memory_usage] MA 17.62 GB Max_MA 17.62 GB CA 18.54 GB Max_CA 47 GB
+[2025-05-27 14:10:08,830] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 96.73 GB, percent = 5.3% +[2025-05-27 14:10:08,831] [INFO] [stage3.py:126:__init__] Reduce bucket size 500,000,000 +[2025-05-27 14:10:08,831] [INFO] [stage3.py:127:__init__] Prefetch bucket size 50,000,000 +[2025-05-27 14:10:08,964] [INFO] [utils.py:802:see_memory_usage] DeepSpeedZeRoOffload initialize [begin] +[2025-05-27 14:10:08,964] [INFO] [utils.py:803:see_memory_usage] MA 17.62 GB Max_MA 17.62 GB CA 18.54 GB Max_CA 19 GB +[2025-05-27 14:10:08,964] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 96.73 GB, percent = 5.3% +Parameter Offload: Total persistent parameters: 824832 in 161 params +[2025-05-27 14:10:09,193] [INFO] [utils.py:802:see_memory_usage] DeepSpeedZeRoOffload initialize [end] +[2025-05-27 14:10:09,193] [INFO] [utils.py:803:see_memory_usage] MA 4.28 GB Max_MA 17.74 GB CA 18.54 GB Max_CA 19 GB +[2025-05-27 14:10:09,194] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 96.73 GB, percent = 5.3% +[2025-05-27 14:10:09,331] [INFO] [utils.py:802:see_memory_usage] Before creating fp16 partitions +[2025-05-27 14:10:09,332] [INFO] [utils.py:803:see_memory_usage] MA 4.28 GB Max_MA 4.28 GB CA 18.54 GB Max_CA 19 GB +[2025-05-27 14:10:09,332] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 96.73 GB, percent = 5.3% +[2025-05-27 14:10:10,567] [INFO] [utils.py:802:see_memory_usage] After creating fp16 partitions: 2 +[2025-05-27 14:10:10,568] [INFO] [utils.py:803:see_memory_usage] MA 4.28 GB Max_MA 4.28 GB CA 5.21 GB Max_CA 19 GB +[2025-05-27 14:10:10,568] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 96.38 GB, percent = 5.3% +[2025-05-27 14:10:10,713] [INFO] [utils.py:802:see_memory_usage] Before creating fp32 partitions +[2025-05-27 14:10:10,713] [INFO] [utils.py:803:see_memory_usage] MA 4.28 GB Max_MA 4.28 GB CA 5.21 GB Max_CA 5 GB +[2025-05-27 14:10:10,714] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 96.38 GB, percent = 5.3% +[2025-05-27 14:10:10,845] [INFO] [utils.py:802:see_memory_usage] After creating fp32 partitions +[2025-05-27 14:10:10,846] [INFO] [utils.py:803:see_memory_usage] MA 8.09 GB Max_MA 9.99 GB CA 10.93 GB Max_CA 11 GB +[2025-05-27 14:10:10,846] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 96.38 GB, percent = 5.3% +[2025-05-27 14:10:11,006] [INFO] [utils.py:802:see_memory_usage] Before initializing optimizer states +[2025-05-27 14:10:11,006] [INFO] [utils.py:803:see_memory_usage] MA 8.09 GB Max_MA 8.09 GB CA 10.93 GB Max_CA 11 GB +[2025-05-27 14:10:11,006] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 96.4 GB, percent = 5.3% +[2025-05-27 14:10:11,203] [INFO] [utils.py:802:see_memory_usage] After initializing optimizer states +[2025-05-27 14:10:11,204] [INFO] [utils.py:803:see_memory_usage] MA 15.71 GB Max_MA 23.34 GB CA 26.18 GB Max_CA 26 GB +[2025-05-27 14:10:11,204] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 96.4 GB, percent = 5.3% +[2025-05-27 14:10:11,204] [INFO] [stage3.py:460:_setup_for_real_optimizer] optimizer state initialized +[2025-05-27 14:10:11,575] [INFO] [utils.py:802:see_memory_usage] After initializing ZeRO optimizer +[2025-05-27 14:10:11,575] [INFO] [utils.py:803:see_memory_usage] MA 18.55 GB Max_MA 20.51 GB CA 26.18 GB Max_CA 26 GB +[2025-05-27 14:10:11,575] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 95.36 GB, percent = 5.2% +[2025-05-27 14:10:11,576] [INFO] 
[logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = AdamW +[2025-05-27 14:10:11,576] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client LR scheduler +[2025-05-27 14:10:11,576] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2025-05-27 14:10:11,576] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0], mom=[(0.9, 0.999), (0.9, 0.999)] +[2025-05-27 14:10:11,576] [INFO] [config.py:972:print] DeepSpeedEngine configuration: +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] amp_enabled .................. False +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] amp_params ................... False +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] autotuning_config ............ { + "enabled": false, + "start_step": null, + "end_step": null, + "metric_path": null, + "arg_mappings": null, + "metric": "throughput", + "model_info": null, + "results_dir": "autotuning_results", + "exps_dir": "autotuning_exps", + "overwrite": true, + "fast": true, + "start_profile_step": 3, + "end_profile_step": 5, + "tuner_type": "gridsearch", + "tuner_early_stopping": 5, + "tuner_num_trials": 50, + "model_info_path": null, + "mp_size": 1, + "max_train_batch_size": null, + "min_train_batch_size": 1, + "max_train_micro_batch_size_per_gpu": 1.024000e+03, + "min_train_micro_batch_size_per_gpu": 1, + "num_tuning_micro_batch_sizes": 3 +} +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] bfloat16_enabled ............. True +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] checkpoint_parallel_write_pipeline False +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] checkpoint_tag_validation_enabled True +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] checkpoint_tag_validation_fail False +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] comms_config ................. +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] communication_data_type ...... None +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] compression_config ........... 
{'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] curriculum_enabled_legacy .... False +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] curriculum_params_legacy ..... False +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] data_efficiency_enabled ...... False +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] dataloader_drop_last ......... False +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] disable_allgather ............ False +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] dump_state ................... False +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] dynamic_loss_scale_args ...... None +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] eigenvalue_enabled ........... False +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] eigenvalue_gas_boundary_resolution 1 +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] eigenvalue_layer_name ........ bert.encoder.layer +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] eigenvalue_layer_num ......... 0 +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] eigenvalue_max_iter .......... 100 +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] eigenvalue_stability ......... 1e-06 +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] eigenvalue_tol ............... 0.01 +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] eigenvalue_verbose ........... False +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] elasticity_enabled ........... False +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] flops_profiler_config ........ { + "enabled": false, + "recompute_fwd_factor": 0.0, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] fp16_auto_cast ............... None +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] fp16_enabled ................. False +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] fp16_master_weights_and_gradients False +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] global_rank .................. 
0 +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] grad_accum_dtype ............. None +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] gradient_accumulation_steps .. 2 +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] gradient_clipping ............ 1.0 +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] gradient_predivide_factor .... 1.0 +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] initial_dynamic_scale ........ 1 +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] load_universal_checkpoint .... False +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] loss_scale ................... 1.0 +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] memory_breakdown ............. False +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] mics_hierarchial_params_gather False +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] mics_shard_size .............. -1 +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] nebula_config ................ { + "enabled": false, + "persistent_storage_path": null, + "persistent_time_interval": 100, + "num_of_version_in_retention": 2, + "enable_nebula_load": true, + "load_path": null +} +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] optimizer_legacy_fusion ...... False +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] optimizer_name ............... None +[2025-05-27 14:10:11,577] [INFO] [config.py:976:print] optimizer_params ............. None +[2025-05-27 14:10:11,578] [INFO] [config.py:976:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2025-05-27 14:10:11,578] [INFO] [config.py:976:print] pld_enabled .................. False +[2025-05-27 14:10:11,578] [INFO] [config.py:976:print] pld_params ................... False +[2025-05-27 14:10:11,578] [INFO] [config.py:976:print] prescale_gradients ........... False +[2025-05-27 14:10:11,578] [INFO] [config.py:976:print] scheduler_name ............... None +[2025-05-27 14:10:11,578] [INFO] [config.py:976:print] scheduler_params ............. None +[2025-05-27 14:10:11,578] [INFO] [config.py:976:print] seq_parallel_communication_data_type torch.float32 +[2025-05-27 14:10:11,578] [INFO] [config.py:976:print] sparse_attention ............. None +[2025-05-27 14:10:11,578] [INFO] [config.py:976:print] sparse_gradients_enabled ..... False +[2025-05-27 14:10:11,578] [INFO] [config.py:976:print] steps_per_print .............. inf +[2025-05-27 14:10:11,578] [INFO] [config.py:976:print] train_batch_size ............. 64 +[2025-05-27 14:10:11,578] [INFO] [config.py:976:print] train_micro_batch_size_per_gpu 4 +[2025-05-27 14:10:11,578] [INFO] [config.py:976:print] use_node_local_storage ....... False +[2025-05-27 14:10:11,578] [INFO] [config.py:976:print] wall_clock_breakdown ......... False +[2025-05-27 14:10:11,578] [INFO] [config.py:976:print] weight_quantization_config ... 
None +[2025-05-27 14:10:11,578] [INFO] [config.py:976:print] world_size ................... 8 +[2025-05-27 14:10:11,578] [INFO] [config.py:976:print] zero_allow_untested_optimizer True +[2025-05-27 14:10:11,578] [INFO] [config.py:976:print] zero_config .................. stage=3 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500,000,000 allgather_partitions=True allgather_bucket_size=500,000,000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='none', nvme_path=None, buffer_count=5, buffer_size=100,000,000, max_in_cpu=1,000,000,000, pin_memory=False) offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='none', nvme_path=None, buffer_count=4, pin_memory=False, pipeline=False, pipeline_read=False, pipeline_write=False, fast_init=False) sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50,000,000 param_persistence_threshold=100,000 model_persistence_threshold=sys.maxsize max_live_parameters=1,000,000,000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True +[2025-05-27 14:10:11,578] [INFO] [config.py:976:print] zero_enabled ................. True +[2025-05-27 14:10:11,578] [INFO] [config.py:976:print] zero_force_ds_cpu_optimizer .. True +[2025-05-27 14:10:11,578] [INFO] [config.py:976:print] zero_optimization_stage ...... 3 +[2025-05-27 14:10:11,578] [INFO] [config.py:962:print_user_config] json = { + "train_batch_size": 64, + "train_micro_batch_size_per_gpu": 4, + "gradient_accumulation_steps": 2, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "nvme_path": null + }, + "offload_param": { + "device": "none", + "nvme_path": null + }, + "stage3_gather_16bit_weights_on_model_save": false + }, + "gradient_clipping": 1.0, + "steps_per_print": inf, + "bf16": { + "enabled": true + }, + "fp16": { + "enabled": false + }, + "zero_allow_untested_optimizer": true +} +2025-05-27 14:10:11 - ERROR - wandb.sdk.lib.gitlib - git root error: Cmd('git') failed due to: exit code(128) + cmdline: git rev-parse --show-toplevel + stderr: 'fatal: detected dubious ownership in repository at '/workspace/amd-mla' +To add an exception for this directory, call: + + git config --global --add safe.directory /workspace/amd-mla' +[2025-05-27 14:10:15,928] [WARNING] [parameter_offload.py:86:_apply_to_tensors_only] A module has unknown inputs or outputs type () and the tensors embedded in it cannot be detected. The ZeRO-3 hooks designed to trigger before or after backward pass of the module relies on knowing the input and output tensors and therefore may not get triggered properly. 
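The engine configuration printed above reports train_micro_batch_size_per_gpu=4, gradient_accumulation_steps=2, and world_size=8 alongside train_batch_size=64. These values are consistent under DeepSpeed's documented relation train_batch_size = micro_batch_per_gpu * gradient_accumulation_steps * world_size. A minimal sketch of that consistency check, assuming only this standard identity (the variable names below are illustrative, not taken from the training script):

    # Sanity check of the batch-size relation from the config dump above.
    # Assumes DeepSpeed's documented identity:
    #   train_batch_size = micro_batch_per_gpu * grad_accum_steps * world_size
    train_micro_batch_size_per_gpu = 4   # from "train_micro_batch_size_per_gpu 4"
    gradient_accumulation_steps = 2      # from "gradient_accumulation_steps .. 2"
    world_size = 8                       # from "world_size ................... 8"

    effective = (
        train_micro_batch_size_per_gpu * gradient_accumulation_steps * world_size
    )
    assert effective == 64               # matches "train_batch_size ............. 64"
    print(f"effective train_batch_size = {effective}")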
+{'loss': 818.6298, 'grad_norm': 136417.81977980322, 'learning_rate': 2.7472527472527476e-08, 'epoch': 0.0} +{'loss': 837.2885, 'grad_norm': 120668.22224730837, 'learning_rate': 2.7472527472527475e-07, 'epoch': 0.0} +{'loss': 773.1191, 'grad_norm': 88983.92887431842, 'learning_rate': 5.494505494505495e-07, 'epoch': 0.0} +{'loss': 656.2198, 'grad_norm': 67107.94587251423, 'learning_rate': 8.241758241758242e-07, 'epoch': 0.0} +{'loss': 546.7615, 'grad_norm': 30928.56112827486, 'learning_rate': 1.098901098901099e-06, 'epoch': 0.0} +{'loss': 479.398, 'grad_norm': 27922.582874317824, 'learning_rate': 1.3736263736263738e-06, 'epoch': 0.0} +{'loss': 441.4027, 'grad_norm': 31822.92220762882, 'learning_rate': 1.6483516483516484e-06, 'epoch': 0.0} +{'loss': 429.7585, 'grad_norm': 18681.621525587878, 'learning_rate': 1.9230769230769234e-06, 'epoch': 0.0} +{'loss': 393.1341, 'grad_norm': 20813.99236614337, 'learning_rate': 2.197802197802198e-06, 'epoch': 0.0} +{'loss': 381.5655, 'grad_norm': 22460.303390185272, 'learning_rate': 2.472527472527473e-06, 'epoch': 0.0} +{'loss': 373.3451, 'grad_norm': 18182.300992149096, 'learning_rate': 2.7472527472527476e-06, 'epoch': 0.0} +{'loss': 375.9083, 'grad_norm': 27477.180510770395, 'learning_rate': 3.021978021978022e-06, 'epoch': 0.0} +{'loss': 360.9797, 'grad_norm': 25459.148998027773, 'learning_rate': 3.2967032967032968e-06, 'epoch': 0.0} +{'loss': 355.4623, 'grad_norm': 25249.059988303765, 'learning_rate': 3.5714285714285718e-06, 'epoch': 0.0} +{'loss': 355.2119, 'grad_norm': 26828.055155769885, 'learning_rate': 3.846153846153847e-06, 'epoch': 0.0} +{'loss': 348.5976, 'grad_norm': 20181.911282894685, 'learning_rate': 4.120879120879121e-06, 'epoch': 0.0} +{'loss': 344.9646, 'grad_norm': 24227.229845046535, 'learning_rate': 4.395604395604396e-06, 'epoch': 0.0} +{'loss': 342.0517, 'grad_norm': 19728.564355380186, 'learning_rate': 4.6703296703296706e-06, 'epoch': 0.0} +{'loss': 354.6103, 'grad_norm': 22679.771198200084, 'learning_rate': 4.945054945054946e-06, 'epoch': 0.0} +{'loss': 337.4747, 'grad_norm': 24194.09607148186, 'learning_rate': 5.219780219780221e-06, 'epoch': 0.0} +{'loss': 334.0955, 'grad_norm': 26534.461648198932, 'learning_rate': 5.494505494505495e-06, 'epoch': 0.0} +{'loss': 321.2979, 'grad_norm': 17874.616995334804, 'learning_rate': 5.769230769230769e-06, 'epoch': 0.0} +{'loss': 326.8981, 'grad_norm': 19752.240545328583, 'learning_rate': 6.043956043956044e-06, 'epoch': 0.0} +{'loss': 326.3264, 'grad_norm': 19600.86447733362, 'learning_rate': 6.318681318681319e-06, 'epoch': 0.0} +{'loss': 315.748, 'grad_norm': 15331.087305953717, 'learning_rate': 6.5934065934065935e-06, 'epoch': 0.0} +{'loss': 324.4048, 'grad_norm': 18832.237336898797, 'learning_rate': 6.868131868131869e-06, 'epoch': 0.0} +{'loss': 322.3686, 'grad_norm': 27643.35451434385, 'learning_rate': 7.1428571428571436e-06, 'epoch': 0.01} +{'loss': 318.9111, 'grad_norm': 21769.686433771363, 'learning_rate': 7.417582417582418e-06, 'epoch': 0.01} +{'loss': 324.5833, 'grad_norm': 16552.75634514519, 'learning_rate': 7.692307692307694e-06, 'epoch': 0.01} +{'loss': 312.6277, 'grad_norm': 21099.825397937897, 'learning_rate': 7.967032967032966e-06, 'epoch': 0.01} +{'loss': 321.9004, 'grad_norm': 23993.837660034256, 'learning_rate': 8.241758241758243e-06, 'epoch': 0.01} +{'loss': 322.0042, 'grad_norm': 24458.030984856992, 'learning_rate': 8.516483516483517e-06, 'epoch': 0.01} +{'loss': 316.7802, 'grad_norm': 13321.589147328381, 'learning_rate': 8.791208791208792e-06, 'epoch': 0.01} +{'loss': 
319.6825, 'grad_norm': 25258.22006338411, 'learning_rate': 9.065934065934067e-06, 'epoch': 0.01} +{'loss': 309.0048, 'grad_norm': 15329.208412648835, 'learning_rate': 9.340659340659341e-06, 'epoch': 0.01} +{'loss': 336.7008, 'grad_norm': 25984.92917270844, 'learning_rate': 9.615384615384616e-06, 'epoch': 0.01} +{'loss': 327.048, 'grad_norm': 14818.124003812418, 'learning_rate': 9.890109890109892e-06, 'epoch': 0.01} +{'loss': 317.3185, 'grad_norm': 17742.756487097635, 'learning_rate': 1.0164835164835165e-05, 'epoch': 0.01} +{'loss': 312.6258, 'grad_norm': 15577.172764200057, 'learning_rate': 1.0439560439560441e-05, 'epoch': 0.01} +{'loss': 314.1413, 'grad_norm': 14986.068853428693, 'learning_rate': 1.0714285714285714e-05, 'epoch': 0.01} +{'loss': 310.179, 'grad_norm': 17451.589500785518, 'learning_rate': 1.098901098901099e-05, 'epoch': 0.01} +{'loss': 315.8287, 'grad_norm': 24250.825904885387, 'learning_rate': 1.1263736263736265e-05, 'epoch': 0.01} +{'loss': 325.393, 'grad_norm': 22888.13257608094, 'learning_rate': 1.1538461538461538e-05, 'epoch': 0.01} +{'loss': 311.063, 'grad_norm': 16960.92845662646, 'learning_rate': 1.1813186813186814e-05, 'epoch': 0.01} +{'loss': 305.959, 'grad_norm': 20606.445970332505, 'learning_rate': 1.2087912087912089e-05, 'epoch': 0.01} +{'loss': 305.6962, 'grad_norm': 14654.807089737682, 'learning_rate': 1.2362637362637363e-05, 'epoch': 0.01} +{'loss': 313.825, 'grad_norm': 20279.7815551087, 'learning_rate': 1.2637362637362638e-05, 'epoch': 0.01} +{'loss': 316.8674, 'grad_norm': 21846.819342544248, 'learning_rate': 1.2912087912087914e-05, 'epoch': 0.01} +{'loss': 325.6178, 'grad_norm': 19327.405140085993, 'learning_rate': 1.3186813186813187e-05, 'epoch': 0.01} +{'loss': 329.3903, 'grad_norm': 13740.064533389821, 'learning_rate': 1.3461538461538463e-05, 'epoch': 0.01} +{'loss': 325.966, 'grad_norm': 22482.96306643711, 'learning_rate': 1.3736263736263738e-05, 'epoch': 0.01} +{'loss': 314.0896, 'grad_norm': 13988.232585457443, 'learning_rate': 1.401098901098901e-05, 'epoch': 0.01} +{'loss': 309.5052, 'grad_norm': 15360.770564331913, 'learning_rate': 1.4285714285714287e-05, 'epoch': 0.01} +{'loss': 321.6392, 'grad_norm': 15862.515766457655, 'learning_rate': 1.4560439560439562e-05, 'epoch': 0.01} +{'loss': 315.0158, 'grad_norm': 13070.093000810497, 'learning_rate': 1.4835164835164836e-05, 'epoch': 0.01} +{'loss': 327.0227, 'grad_norm': 15114.129373912214, 'learning_rate': 1.5109890109890111e-05, 'epoch': 0.01} +{'loss': 320.4659, 'grad_norm': 14973.220473204598, 'learning_rate': 1.5384615384615387e-05, 'epoch': 0.01} +{'loss': 319.2006, 'grad_norm': 22248.842276212144, 'learning_rate': 1.5659340659340662e-05, 'epoch': 0.01} +{'loss': 330.9218, 'grad_norm': 28579.92984045344, 'learning_rate': 1.5934065934065933e-05, 'epoch': 0.01} +{'loss': 344.1444, 'grad_norm': 18006.570921290644, 'learning_rate': 1.620879120879121e-05, 'epoch': 0.01} +{'loss': 322.1274, 'grad_norm': 16581.045562230665, 'learning_rate': 1.6483516483516486e-05, 'epoch': 0.01} +{'loss': 320.5329, 'grad_norm': 15452.81605897286, 'learning_rate': 1.675824175824176e-05, 'epoch': 0.01} +{'loss': 339.0284, 'grad_norm': 16044.461382071544, 'learning_rate': 1.7032967032967035e-05, 'epoch': 0.01} +{'loss': 329.1375, 'grad_norm': 15671.1564271664, 'learning_rate': 1.730769230769231e-05, 'epoch': 0.01} +{'loss': 321.1222, 'grad_norm': 15312.647168694502, 'learning_rate': 1.7582417582417584e-05, 'epoch': 0.01} +{'loss': 343.4097, 'grad_norm': 16663.99158295195, 'learning_rate': 1.785714285714286e-05, 'epoch': 
0.01} +{'loss': 343.345, 'grad_norm': 16075.089291452457, 'learning_rate': 1.8131868131868133e-05, 'epoch': 0.01} +{'loss': 335.2219, 'grad_norm': 13295.4182705059, 'learning_rate': 1.8406593406593408e-05, 'epoch': 0.01} +{'loss': 330.8803, 'grad_norm': 17430.89504739878, 'learning_rate': 1.8681318681318682e-05, 'epoch': 0.01} +{'loss': 344.9438, 'grad_norm': 11945.880240729177, 'learning_rate': 1.8956043956043957e-05, 'epoch': 0.01} +{'loss': 335.3231, 'grad_norm': 14632.134335396006, 'learning_rate': 1.923076923076923e-05, 'epoch': 0.01} +{'loss': 332.3656, 'grad_norm': 11267.806398212442, 'learning_rate': 1.9505494505494506e-05, 'epoch': 0.01} +{'loss': 343.9449, 'grad_norm': 16949.972615671526, 'learning_rate': 1.9780219780219784e-05, 'epoch': 0.01} +{'loss': 365.7662, 'grad_norm': 49735.037788750626, 'learning_rate': 1.9999999961991482e-05, 'epoch': 0.01} +{'loss': 362.0244, 'grad_norm': 11678.671877249266, 'learning_rate': 1.9999998631693312e-05, 'epoch': 0.01} +{'loss': 355.17, 'grad_norm': 14646.759693486993, 'learning_rate': 1.9999995400969434e-05, 'epoch': 0.01} +{'loss': 338.5812, 'grad_norm': 13683.146608061634, 'learning_rate': 1.999999026982046e-05, 'epoch': 0.01} +{'loss': 339.8364, 'grad_norm': 13404.346093632792, 'learning_rate': 1.999998323824736e-05, 'epoch': 0.01} +{'loss': 348.7413, 'grad_norm': 15290.176132269145, 'learning_rate': 1.999997430625148e-05, 'epoch': 0.02} +{'loss': 335.1151, 'grad_norm': 16593.50472094032, 'learning_rate': 1.9999963473834517e-05, 'epoch': 0.02} +{'loss': 344.8227, 'grad_norm': 16974.001683839306, 'learning_rate': 1.999995074099852e-05, 'epoch': 0.02} +{'loss': 338.5061, 'grad_norm': 12875.012659690097, 'learning_rate': 1.9999936107745918e-05, 'epoch': 0.02} +{'loss': 322.0944, 'grad_norm': 12475.458888027448, 'learning_rate': 1.9999919574079484e-05, 'epoch': 0.02} +{'loss': 381.8763, 'grad_norm': 22846.652856745077, 'learning_rate': 1.9999901140002363e-05, 'epoch': 0.02} +{'loss': 365.5293, 'grad_norm': 18609.029661661654, 'learning_rate': 1.9999880805518067e-05, 'epoch': 0.02} +{'loss': 356.1942, 'grad_norm': 14660.3101809287, 'learning_rate': 1.9999858570630447e-05, 'epoch': 0.02} +{'loss': 348.7349, 'grad_norm': 12363.466083334279, 'learning_rate': 1.9999834435343738e-05, 'epoch': 0.02} +{'loss': 363.4696, 'grad_norm': 21081.024403193205, 'learning_rate': 1.999980839966252e-05, 'epoch': 0.02} +{'loss': 337.5097, 'grad_norm': 14844.087181217039, 'learning_rate': 1.9999780463591747e-05, 'epoch': 0.02} +{'loss': 338.2398, 'grad_norm': 12057.682050621554, 'learning_rate': 1.9999750627136724e-05, 'epoch': 0.02} +{'loss': 323.7438, 'grad_norm': 10085.380841979848, 'learning_rate': 1.9999718890303124e-05, 'epoch': 0.02} +{'loss': 322.3696, 'grad_norm': 11947.270051033747, 'learning_rate': 1.9999685253096975e-05, 'epoch': 0.02} +{'loss': 340.7957, 'grad_norm': 14634.162804718793, 'learning_rate': 1.9999649715524673e-05, 'epoch': 0.02} +{'loss': 335.5583, 'grad_norm': 11557.933495426141, 'learning_rate': 1.9999612277592968e-05, 'epoch': 0.02} +{'loss': 352.6666, 'grad_norm': 16404.296440501024, 'learning_rate': 1.9999572939308978e-05, 'epoch': 0.02} +{'loss': 334.3783, 'grad_norm': 14803.37332774061, 'learning_rate': 1.9999531700680178e-05, 'epoch': 0.02} +{'loss': 338.0354, 'grad_norm': 10533.738935138472, 'learning_rate': 1.9999488561714403e-05, 'epoch': 0.02} +{'loss': 338.9414, 'grad_norm': 13173.858682775655, 'learning_rate': 1.9999443522419857e-05, 'epoch': 0.02} +{'loss': 340.9611, 'grad_norm': 10163.810735409734, 'learning_rate': 
1.999939658280509e-05, 'epoch': 0.02} +{'loss': 329.5386, 'grad_norm': 12677.339937618472, 'learning_rate': 1.9999347742879034e-05, 'epoch': 0.02} +{'loss': 338.6758, 'grad_norm': 16913.839875744237, 'learning_rate': 1.999929700265096e-05, 'epoch': 0.02} +{'loss': 332.4711, 'grad_norm': 19076.01006720437, 'learning_rate': 1.9999244362130516e-05, 'epoch': 0.02} +{'loss': 332.826, 'grad_norm': 11946.913748978383, 'learning_rate': 1.9999189821327706e-05, 'epoch': 0.02} +{'loss': 341.2245, 'grad_norm': 13138.743646294819, 'learning_rate': 1.9999133380252897e-05, 'epoch': 0.02} +{'loss': 326.9117, 'grad_norm': 9897.963109324704, 'learning_rate': 1.9999075038916808e-05, 'epoch': 0.02} +{'loss': 327.3149, 'grad_norm': 11910.755841012866, 'learning_rate': 1.9999014797330536e-05, 'epoch': 0.02} +{'loss': 328.7623, 'grad_norm': 9942.456390613484, 'learning_rate': 1.999895265550552e-05, 'epoch': 0.02} +{'loss': 340.0259, 'grad_norm': 8983.008397572004, 'learning_rate': 1.9998888613453576e-05, 'epoch': 0.02} +{'loss': 332.3015, 'grad_norm': 12546.552240517029, 'learning_rate': 1.9998822671186874e-05, 'epoch': 0.02} +{'loss': 337.8467, 'grad_norm': 11535.063402166315, 'learning_rate': 1.9998754828717947e-05, 'epoch': 0.02} +{'loss': 343.274, 'grad_norm': 19422.26480041368, 'learning_rate': 1.999868508605968e-05, 'epoch': 0.02} +{'loss': 335.4328, 'grad_norm': 9757.024159933153, 'learning_rate': 1.9998613443225335e-05, 'epoch': 0.02} +{'loss': 329.1482, 'grad_norm': 8475.21310648536, 'learning_rate': 1.9998539900228526e-05, 'epoch': 0.02} +{'loss': 333.281, 'grad_norm': 8137.8377928540585, 'learning_rate': 1.9998464457083227e-05, 'epoch': 0.02} +{'loss': 326.9676, 'grad_norm': 9511.241043074086, 'learning_rate': 1.9998387113803775e-05, 'epoch': 0.02} +{'loss': 317.2063, 'grad_norm': 8529.568979961487, 'learning_rate': 1.999830787040487e-05, 'epoch': 0.02} +{'loss': 333.4298, 'grad_norm': 10905.495538584408, 'learning_rate': 1.9998226726901573e-05, 'epoch': 0.02} +{'loss': 323.4469, 'grad_norm': 11713.142728908673, 'learning_rate': 1.9998143683309307e-05, 'epoch': 0.02} +{'loss': 326.6084, 'grad_norm': 9896.161356198923, 'learning_rate': 1.9998058739643843e-05, 'epoch': 0.02} +{'loss': 325.9905, 'grad_norm': 8019.394985662202, 'learning_rate': 1.999797189592134e-05, 'epoch': 0.02} +{'loss': 315.7593, 'grad_norm': 10108.745493221706, 'learning_rate': 1.9997883152158285e-05, 'epoch': 0.02} +{'loss': 319.7391, 'grad_norm': 9879.931237280154, 'learning_rate': 1.9997792508371555e-05, 'epoch': 0.02} +{'loss': 319.8348, 'grad_norm': 10207.179172644162, 'learning_rate': 1.9997699964578373e-05, 'epoch': 0.02} +{'loss': 328.8729, 'grad_norm': 10370.858909613617, 'learning_rate': 1.9997605520796324e-05, 'epoch': 0.02} +{'loss': 326.7514, 'grad_norm': 10871.414232099665, 'learning_rate': 1.9997509177043358e-05, 'epoch': 0.02} +{'loss': 324.2874, 'grad_norm': 8091.526586729309, 'learning_rate': 1.9997410933337785e-05, 'epoch': 0.02} +{'loss': 308.1851, 'grad_norm': 14173.806048862818, 'learning_rate': 1.9997310789698276e-05, 'epoch': 0.02} +{'loss': 320.8488, 'grad_norm': 12458.092170368747, 'learning_rate': 1.999720874614386e-05, 'epoch': 0.02} +{'loss': 318.4716, 'grad_norm': 13259.535532609338, 'learning_rate': 1.9997104802693933e-05, 'epoch': 0.02} +{'loss': 335.2393, 'grad_norm': 11937.446547371328, 'learning_rate': 1.9996998959368245e-05, 'epoch': 0.02} +{'loss': 323.6082, 'grad_norm': 11491.491728009423, 'learning_rate': 1.9996891216186915e-05, 'epoch': 0.03} +{'loss': 329.0695, 'grad_norm': 8903.056643924841, 
'learning_rate': 1.9996781573170413e-05, 'epoch': 0.03} +{'loss': 311.5873, 'grad_norm': 7581.48535749003, 'learning_rate': 1.999667003033958e-05, 'epoch': 0.03} +{'loss': 308.5724, 'grad_norm': 9059.086565247, 'learning_rate': 1.9996556587715617e-05, 'epoch': 0.03} +{'loss': 312.5809, 'grad_norm': 8852.355996724367, 'learning_rate': 1.9996441245320074e-05, 'epoch': 0.03} +{'loss': 311.3036, 'grad_norm': 8407.51150796641, 'learning_rate': 1.999632400317488e-05, 'epoch': 0.03} +{'loss': 323.9354, 'grad_norm': 30992.519877220104, 'learning_rate': 1.999620486130231e-05, 'epoch': 0.03} +{'loss': 329.8395, 'grad_norm': 10570.792659759041, 'learning_rate': 1.9996083819725008e-05, 'epoch': 0.03} +{'loss': 314.332, 'grad_norm': 9178.799266956437, 'learning_rate': 1.9995960878465978e-05, 'epoch': 0.03} +{'loss': 309.1129, 'grad_norm': 8728.604141587562, 'learning_rate': 1.9995836037548582e-05, 'epoch': 0.03} +{'loss': 301.0527, 'grad_norm': 9487.155348620168, 'learning_rate': 1.9995709296996545e-05, 'epoch': 0.03} +{'loss': 318.0904, 'grad_norm': 8772.007148777431, 'learning_rate': 1.9995580656833956e-05, 'epoch': 0.03} +{'loss': 302.8864, 'grad_norm': 8078.466869972233, 'learning_rate': 1.999545011708526e-05, 'epoch': 0.03} +{'loss': 311.6851, 'grad_norm': 8675.06126678313, 'learning_rate': 1.9995317677775268e-05, 'epoch': 0.03} +{'loss': 311.001, 'grad_norm': 7711.116137808248, 'learning_rate': 1.9995183338929143e-05, 'epoch': 0.03} +{'loss': 315.2583, 'grad_norm': 8936.73318007066, 'learning_rate': 1.999504710057242e-05, 'epoch': 0.03} +{'loss': 311.5757, 'grad_norm': 18297.21979573903, 'learning_rate': 1.999490896273099e-05, 'epoch': 0.03} +{'loss': 318.4717, 'grad_norm': 8756.869951089293, 'learning_rate': 1.9994768925431104e-05, 'epoch': 0.03} +{'loss': 314.0639, 'grad_norm': 9263.889505736917, 'learning_rate': 1.9994626988699375e-05, 'epoch': 0.03} +{'loss': 303.8643, 'grad_norm': 10785.68420649618, 'learning_rate': 1.9994483152562773e-05, 'epoch': 0.03} +{'loss': 316.4601, 'grad_norm': 8217.444478816655, 'learning_rate': 1.9994337417048642e-05, 'epoch': 0.03} +{'loss': 317.6428, 'grad_norm': 8722.043161948419, 'learning_rate': 1.9994189782184673e-05, 'epoch': 0.03} +{'loss': 309.4591, 'grad_norm': 9510.343397115566, 'learning_rate': 1.999404024799892e-05, 'epoch': 0.03} +{'loss': 312.8805, 'grad_norm': 8075.891316114537, 'learning_rate': 1.9993888814519807e-05, 'epoch': 0.03} +{'loss': 302.8839, 'grad_norm': 8904.736274773124, 'learning_rate': 1.9993735481776105e-05, 'epoch': 0.03} +{'loss': 312.9997, 'grad_norm': 8346.771876663846, 'learning_rate': 1.999358024979696e-05, 'epoch': 0.03} +{'loss': 314.2531, 'grad_norm': 7020.217904264529, 'learning_rate': 1.9993423118611874e-05, 'epoch': 0.03} +{'loss': 295.2384, 'grad_norm': 7831.894593052273, 'learning_rate': 1.9993264088250705e-05, 'epoch': 0.03} +{'loss': 308.9717, 'grad_norm': 10259.254731601914, 'learning_rate': 1.9993103158743673e-05, 'epoch': 0.03} +{'loss': 319.4825, 'grad_norm': 9705.011418267473, 'learning_rate': 1.9992940330121366e-05, 'epoch': 0.03} +{'loss': 319.5823, 'grad_norm': 8065.938649434976, 'learning_rate': 1.9992775602414726e-05, 'epoch': 0.03} +{'loss': 297.2817, 'grad_norm': 6046.3954999090765, 'learning_rate': 1.999260897565506e-05, 'epoch': 0.03} +{'loss': 303.0433, 'grad_norm': 7086.278227460245, 'learning_rate': 1.9992440449874036e-05, 'epoch': 0.03} +{'loss': 306.5158, 'grad_norm': 8546.855454676544, 'learning_rate': 1.9992270025103678e-05, 'epoch': 0.03} +{'loss': 308.5524, 'grad_norm': 17706.623180554332, 
'learning_rate': 1.9992097701376372e-05, 'epoch': 0.03} +{'loss': 303.7569, 'grad_norm': 7290.997343746077, 'learning_rate': 1.9991923478724872e-05, 'epoch': 0.03} +{'loss': 318.5758, 'grad_norm': 39434.404374059864, 'learning_rate': 1.9991747357182283e-05, 'epoch': 0.03} +{'loss': 309.0659, 'grad_norm': 7488.878661285784, 'learning_rate': 1.9991569336782077e-05, 'epoch': 0.03} +{'loss': 299.0802, 'grad_norm': 6906.356844054174, 'learning_rate': 1.9991389417558088e-05, 'epoch': 0.03} +{'loss': 300.6366, 'grad_norm': 8328.189557181175, 'learning_rate': 1.9991207599544508e-05, 'epoch': 0.03} +{'loss': 304.1396, 'grad_norm': 8573.007591566256, 'learning_rate': 1.9991023882775886e-05, 'epoch': 0.03} +{'loss': 300.8392, 'grad_norm': 6697.022515218526, 'learning_rate': 1.9990838267287137e-05, 'epoch': 0.03} +{'loss': 304.6042, 'grad_norm': 8623.55148115435, 'learning_rate': 1.999065075311354e-05, 'epoch': 0.03} +{'loss': 300.6198, 'grad_norm': 8343.953586686624, 'learning_rate': 1.9990461340290727e-05, 'epoch': 0.03} +{'loss': 295.4774, 'grad_norm': 8188.178449364588, 'learning_rate': 1.9990270028854698e-05, 'epoch': 0.03} +{'loss': 297.3747, 'grad_norm': 10497.529456181954, 'learning_rate': 1.9990076818841805e-05, 'epoch': 0.03} +{'loss': 297.9815, 'grad_norm': 9378.943947068658, 'learning_rate': 1.9989881710288773e-05, 'epoch': 0.03} +{'loss': 284.0368, 'grad_norm': 8220.735861258154, 'learning_rate': 1.9989684703232674e-05, 'epoch': 0.03} +{'loss': 307.4841, 'grad_norm': 10002.49544921718, 'learning_rate': 1.9989485797710952e-05, 'epoch': 0.03} +{'loss': 315.2706, 'grad_norm': 11837.756442698284, 'learning_rate': 1.9989284993761407e-05, 'epoch': 0.03} +{'loss': 303.1186, 'grad_norm': 10983.535601474632, 'learning_rate': 1.9989082291422196e-05, 'epoch': 0.03} +{'loss': 299.5825, 'grad_norm': 8425.768140261758, 'learning_rate': 1.998887769073185e-05, 'epoch': 0.03} +{'loss': 303.6638, 'grad_norm': 7194.835142373059, 'learning_rate': 1.9988671191729243e-05, 'epoch': 0.04} +{'loss': 285.5324, 'grad_norm': 6045.819165257687, 'learning_rate': 1.9988462794453624e-05, 'epoch': 0.04} +{'loss': 289.9272, 'grad_norm': 7186.349795388911, 'learning_rate': 1.9988252498944593e-05, 'epoch': 0.04} +{'loss': 309.4213, 'grad_norm': 12979.911346288549, 'learning_rate': 1.998804030524212e-05, 'epoch': 0.04} +{'loss': 299.7609, 'grad_norm': 7597.809690238993, 'learning_rate': 1.9987826213386525e-05, 'epoch': 0.04} +{'loss': 294.8166, 'grad_norm': 7162.816245532447, 'learning_rate': 1.9987610223418505e-05, 'epoch': 0.04} +{'loss': 298.4477, 'grad_norm': 6908.402940576547, 'learning_rate': 1.9987392335379093e-05, 'epoch': 0.04} +{'loss': 301.6677, 'grad_norm': 7392.280479957445, 'learning_rate': 1.9987172549309707e-05, 'epoch': 0.04} +{'loss': 301.9867, 'grad_norm': 6391.628583530934, 'learning_rate': 1.998695086525211e-05, 'epoch': 0.04} +{'loss': 299.6, 'grad_norm': 7781.133145627778, 'learning_rate': 1.998672728324844e-05, 'epoch': 0.04} +{'loss': 307.5614, 'grad_norm': 8203.392177147709, 'learning_rate': 1.998650180334118e-05, 'epoch': 0.04} +{'loss': 307.7181, 'grad_norm': 7177.271009478491, 'learning_rate': 1.9986274425573186e-05, 'epoch': 0.04} +{'loss': 293.7922, 'grad_norm': 7021.8411242235925, 'learning_rate': 1.998604514998766e-05, 'epoch': 0.04} +{'loss': 290.2468, 'grad_norm': 6866.033149007145, 'learning_rate': 1.998581397662818e-05, 'epoch': 0.04} +{'loss': 292.1691, 'grad_norm': 7165.52361086593, 'learning_rate': 1.998558090553868e-05, 'epoch': 0.04} +{'loss': 294.108, 'grad_norm': 6322.599055270106, 
'learning_rate': 1.9985345936763453e-05, 'epoch': 0.04} +{'loss': 295.1796, 'grad_norm': 6539.617441444488, 'learning_rate': 1.998510907034715e-05, 'epoch': 0.04} +{'loss': 298.9973, 'grad_norm': 6929.031396288793, 'learning_rate': 1.998487030633479e-05, 'epoch': 0.04} +{'loss': 289.9841, 'grad_norm': 6014.826309967397, 'learning_rate': 1.9984629644771747e-05, 'epoch': 0.04} +[2025-05-27 15:59:32,338] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step2000 is about to be saved! +[2025-05-27 15:59:32,432] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-2000/global_step2000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-27 15:59:32,432] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-2000/global_step2000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-27 15:59:32,574] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-2000/global_step2000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-27 15:59:32,725] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-2000/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-27 16:05:11,894] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-2000/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2025-05-27 16:05:11,980] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-2000/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-27 16:05:26,062] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step2000 is ready now! +[2025-05-27 16:05:26,139] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step2000 is about to be saved! +[2025-05-27 16:05:26,163] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-2000/global_step2000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-27 16:05:26,163] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-2000/global_step2000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-27 16:05:26,242] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-2000/global_step2000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-27 16:05:26,311] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-2000/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-27 16:11:10,395] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-2000/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 
+[2025-05-27 16:11:10,559] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-2000/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-27 16:11:26,670] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step2000 is ready now! +{'loss': 286.4723, 'grad_norm': 5982.647745115934, 'learning_rate': 1.9984387085703755e-05, 'epoch': 0.04} +{'loss': 288.3169, 'grad_norm': 9428.632466861487, 'learning_rate': 1.9984142629176912e-05, 'epoch': 0.04} +{'loss': 293.3998, 'grad_norm': 6310.641923888585, 'learning_rate': 1.9983896275237677e-05, 'epoch': 0.04} +{'loss': 289.9672, 'grad_norm': 9399.954087609174, 'learning_rate': 1.9983648023932863e-05, 'epoch': 0.04} +{'loss': 304.6729, 'grad_norm': 7827.844661393074, 'learning_rate': 1.9983397875309652e-05, 'epoch': 0.04} +{'loss': 289.1203, 'grad_norm': 6554.300373685561, 'learning_rate': 1.998314582941558e-05, 'epoch': 0.04} +{'loss': 295.4327, 'grad_norm': 7715.332049316107, 'learning_rate': 1.998289188629855e-05, 'epoch': 0.04} +{'loss': 283.08, 'grad_norm': 7474.765285902324, 'learning_rate': 1.9982636046006817e-05, 'epoch': 0.04} +{'loss': 291.963, 'grad_norm': 7268.024674430168, 'learning_rate': 1.998237830858901e-05, 'epoch': 0.04} +{'loss': 287.0604, 'grad_norm': 5852.543486727241, 'learning_rate': 1.9982118674094104e-05, 'epoch': 0.04} +{'loss': 288.1686, 'grad_norm': 6143.24425255447, 'learning_rate': 1.9981857142571442e-05, 'epoch': 0.04} +{'loss': 289.2145, 'grad_norm': 9914.927466994573, 'learning_rate': 1.9981593714070722e-05, 'epoch': 0.04} +{'loss': 284.7304, 'grad_norm': 6101.13280151461, 'learning_rate': 1.9981328388642014e-05, 'epoch': 0.04} +{'loss': 283.2496, 'grad_norm': 5364.5175972670795, 'learning_rate': 1.9981061166335737e-05, 'epoch': 0.04} +{'loss': 281.6831, 'grad_norm': 5410.542943917702, 'learning_rate': 1.9980792047202674e-05, 'epoch': 0.04} +{'loss': 295.4832, 'grad_norm': 6865.695953872166, 'learning_rate': 1.998052103129397e-05, 'epoch': 0.04} +{'loss': 298.1781, 'grad_norm': 10470.417228384138, 'learning_rate': 1.998024811866113e-05, 'epoch': 0.04} +{'loss': 308.4186, 'grad_norm': 8587.382658111013, 'learning_rate': 1.997997330935602e-05, 'epoch': 0.04} +{'loss': 284.6719, 'grad_norm': 7285.973393755306, 'learning_rate': 1.9979696603430866e-05, 'epoch': 0.04} +{'loss': 293.63, 'grad_norm': 48694.17587525751, 'learning_rate': 1.997941800093825e-05, 'epoch': 0.04} +{'loss': 293.9157, 'grad_norm': 8798.799416168658, 'learning_rate': 1.997913750193112e-05, 'epoch': 0.04} +{'loss': 292.6706, 'grad_norm': 6794.731983323082, 'learning_rate': 1.9978855106462785e-05, 'epoch': 0.04} +{'loss': 290.7245, 'grad_norm': 7193.379511733667, 'learning_rate': 1.997857081458691e-05, 'epoch': 0.04} +{'loss': 281.6432, 'grad_norm': 5884.456188199507, 'learning_rate': 1.997828462635752e-05, 'epoch': 0.04} +{'loss': 288.6545, 'grad_norm': 6378.771864557847, 'learning_rate': 1.997799654182901e-05, 'epoch': 0.04} +{'loss': 290.8712, 'grad_norm': 7062.809644703008, 'learning_rate': 1.997770656105612e-05, 'epoch': 0.04} +{'loss': 291.469, 'grad_norm': 7291.971106254749, 'learning_rate': 1.9977414684093967e-05, 'epoch': 0.04} +{'loss': 294.8518, 'grad_norm': 10411.404819356301, 'learning_rate': 1.9977120910998016e-05, 'epoch': 0.04} +{'loss': 299.95, 'grad_norm': 10861.16209227371, 'learning_rate': 1.9976825241824095e-05, 'epoch': 0.04} +{'loss': 305.2853, 'grad_norm': 10075.78808665597, 'learning_rate': 
1.9976527676628395e-05, 'epoch': 0.04}
+{'loss': 297.2157, 'grad_norm': 6330.59076787373, 'learning_rate': 1.997622821546747e-05, 'epoch': 0.04}
+{'loss': 286.2085, 'grad_norm': 8249.74724637323, 'learning_rate': 1.9975926858398223e-05, 'epoch': 0.04}
+{'loss': 293.209, 'grad_norm': 6013.300086158216, 'learning_rate': 1.997562360547793e-05, 'epoch': 0.04}
+{'loss': 285.3205, 'grad_norm': 5982.7199749710935, 'learning_rate': 1.997531845676422e-05, 'epoch': 0.05}
+{'loss': 285.4995, 'grad_norm': 5949.257247060425, 'learning_rate': 1.997501141231508e-05, 'epoch': 0.05}
+{'loss': 280.3362, 'grad_norm': 6769.245908405149, 'learning_rate': 1.9974702472188873e-05, 'epoch': 0.05}
+{'loss': 286.7402, 'grad_norm': 6378.108362725461, 'learning_rate': 1.9974391636444302e-05, 'epoch': 0.05}
+{'loss': 286.533, 'grad_norm': 6281.260552219933, 'learning_rate': 1.997407890514044e-05, 'epoch': 0.05}
+{'loss': 281.3701, 'grad_norm': 6246.306750191569, 'learning_rate': 1.9973764278336717e-05, 'epoch': 0.05}
+{'loss': 285.8197, 'grad_norm': 6233.060188188513, 'learning_rate': 1.9973447756092935e-05, 'epoch': 0.05}
+{'loss': 283.5802, 'grad_norm': 5920.456561239844, 'learning_rate': 1.9973129338469236e-05, 'epoch': 0.05}
+{'loss': 284.6309, 'grad_norm': 5731.660143312194, 'learning_rate': 1.997280902552614e-05, 'epoch': 0.05}
+{'loss': 294.2882, 'grad_norm': 6447.770986182184, 'learning_rate': 1.9972486817324514e-05, 'epoch': 0.05}
+{'loss': 278.9252, 'grad_norm': 5590.641044317201, 'learning_rate': 1.9972162713925597e-05, 'epoch': 0.05}
+{'loss': 278.3477, 'grad_norm': 5700.409573041655, 'learning_rate': 1.997183671539098e-05, 'epoch': 0.05}
+{'loss': 280.8005, 'grad_norm': 7204.245974124153, 'learning_rate': 1.9971508821782616e-05, 'epoch': 0.05}
+{'loss': 278.2131, 'grad_norm': 8035.619299418274, 'learning_rate': 1.9971179033162823e-05, 'epoch': 0.05}
+{'loss': 284.9582, 'grad_norm': 6378.799538017985, 'learning_rate': 1.9970847349594266e-05, 'epoch': 0.05}
+{'loss': 282.7758, 'grad_norm': 5150.231862611728, 'learning_rate': 1.9970513771139988e-05, 'epoch': 0.05}
+{'loss': 279.2287, 'grad_norm': 5635.543522483492, 'learning_rate': 1.9970178297863375e-05, 'epoch': 0.05}
+{'loss': 281.6046, 'grad_norm': 11646.735089381922, 'learning_rate': 1.996984092982819e-05, 'epoch': 0.05}
+{'loss': 288.0467, 'grad_norm': 8668.215258192251, 'learning_rate': 1.9969501667098547e-05, 'epoch': 0.05}
+{'loss': 273.982, 'grad_norm': 5377.348151147404, 'learning_rate': 1.9969160509738907e-05, 'epoch': 0.05}
+{'loss': 269.7868, 'grad_norm': 6763.959028623108, 'learning_rate': 1.996881745781412e-05, 'epoch': 0.05}
+{'loss': 271.8189, 'grad_norm': 5276.858835848995, 'learning_rate': 1.9968472511389373e-05, 'epoch': 0.05}
+{'loss': 271.7521, 'grad_norm': 5647.256758377208, 'learning_rate': 1.9968125670530222e-05, 'epoch': 0.05}
+{'loss': 281.3188, 'grad_norm': 6576.640822962687, 'learning_rate': 1.996777693530258e-05, 'epoch': 0.05}
+{'loss': 278.299, 'grad_norm': 5457.355258652536, 'learning_rate': 1.9967426305772727e-05, 'epoch': 0.05}
+{'loss': 273.6992, 'grad_norm': 5856.925892811377, 'learning_rate': 1.996707378200729e-05, 'epoch': 0.05}
+{'loss': 279.6693, 'grad_norm': 5700.853400923447, 'learning_rate': 1.9966719364073265e-05, 'epoch': 0.05}
+{'loss': 281.579, 'grad_norm': 5465.842719823011, 'learning_rate': 1.9966363052038012e-05, 'epoch': 0.05}
+{'loss': 279.4551, 'grad_norm': 5259.499552282461, 'learning_rate': 1.996600484596924e-05, 'epoch': 0.05}
+{'loss': 281.9862, 'grad_norm': 5309.308606893805, 'learning_rate': 1.9965644745935026e-05, 'epoch': 0.05}
+{'loss': 286.8579, 'grad_norm': 5801.2829885533465, 'learning_rate': 1.9965282752003803e-05, 'epoch': 0.05}
+{'loss': 280.9132, 'grad_norm': 6380.461077991013, 'learning_rate': 1.996491886424437e-05, 'epoch': 0.05}
+{'loss': 275.2541, 'grad_norm': 6443.429871349155, 'learning_rate': 1.9964553082725873e-05, 'epoch': 0.05}
+{'loss': 275.9884, 'grad_norm': 7557.310983646175, 'learning_rate': 1.9964185407517827e-05, 'epoch': 0.05}
+{'loss': 289.4439, 'grad_norm': 5730.358055521764, 'learning_rate': 1.9963815838690118e-05, 'epoch': 0.05}
+{'loss': 279.3496, 'grad_norm': 6419.329784841913, 'learning_rate': 1.9963444376312964e-05, 'epoch': 0.05}
+{'loss': 272.6331, 'grad_norm': 5024.9741367468405, 'learning_rate': 1.9963071020456966e-05, 'epoch': 0.05}
+{'loss': 272.6989, 'grad_norm': 5888.730417022496, 'learning_rate': 1.9962695771193084e-05, 'epoch': 0.05}
+{'loss': 265.9378, 'grad_norm': 5737.871119383757, 'learning_rate': 1.996231862859262e-05, 'epoch': 0.05}
+{'loss': 278.9142, 'grad_norm': 5358.465104109923, 'learning_rate': 1.996193959272725e-05, 'epoch': 0.05}
+{'loss': 274.9221, 'grad_norm': 5339.291252721916, 'learning_rate': 1.996155866366901e-05, 'epoch': 0.05}
+{'loss': 276.3468, 'grad_norm': 6512.097901092639, 'learning_rate': 1.9961175841490292e-05, 'epoch': 0.05}
+{'loss': 270.7878, 'grad_norm': 6262.250426841913, 'learning_rate': 1.9960791126263846e-05, 'epoch': 0.05}
+{'loss': 274.8866, 'grad_norm': 6795.028306687616, 'learning_rate': 1.996040451806279e-05, 'epoch': 0.05}
+{'loss': 283.0813, 'grad_norm': 5741.749606020706, 'learning_rate': 1.9960016016960587e-05, 'epoch': 0.05}
+{'loss': 273.4389, 'grad_norm': 5114.93059556537, 'learning_rate': 1.995962562303108e-05, 'epoch': 0.05}
+{'loss': 273.5774, 'grad_norm': 5773.4006694103855, 'learning_rate': 1.9959233336348452e-05, 'epoch': 0.05}
+{'loss': 277.3649, 'grad_norm': 5728.427944064841, 'learning_rate': 1.9958839156987256e-05, 'epoch': 0.05}
+{'loss': 273.6311, 'grad_norm': 5784.237232289836, 'learning_rate': 1.9958443085022402e-05, 'epoch': 0.05}
+{'loss': 273.8848, 'grad_norm': 6643.392393004442, 'learning_rate': 1.9958045120529164e-05, 'epoch': 0.05}
+{'loss': 270.0869, 'grad_norm': 5902.956523136116, 'learning_rate': 1.995764526358317e-05, 'epoch': 0.05}
+{'loss': 278.4286, 'grad_norm': 5514.355170218259, 'learning_rate': 1.9957243514260408e-05, 'epoch': 0.05}
+{'loss': 273.4903, 'grad_norm': 4925.399147202112, 'learning_rate': 1.9956839872637234e-05, 'epoch': 0.06}
+{'loss': 268.1364, 'grad_norm': 5672.570477346067, 'learning_rate': 1.995643433879035e-05, 'epoch': 0.06}
+{'loss': 264.4196, 'grad_norm': 4815.403712705149, 'learning_rate': 1.9956026912796827e-05, 'epoch': 0.06}
+{'loss': 292.8321, 'grad_norm': 13956.139638974697, 'learning_rate': 1.9955617594734096e-05, 'epoch': 0.06}
+{'loss': 267.1038, 'grad_norm': 4533.280613311907, 'learning_rate': 1.995520638467994e-05, 'epoch': 0.06}
+{'loss': 277.9058, 'grad_norm': 7150.525078328858, 'learning_rate': 1.9954793282712508e-05, 'epoch': 0.06}
+{'loss': 267.3103, 'grad_norm': 5168.448824817495, 'learning_rate': 1.995437828891031e-05, 'epoch': 0.06}
+{'loss': 268.5873, 'grad_norm': 7772.247999449252, 'learning_rate': 1.9953961403352213e-05, 'epoch': 0.06}
+{'loss': 270.5858, 'grad_norm': 6762.01607543658, 'learning_rate': 1.9953542626117437e-05, 'epoch': 0.06}
+{'loss': 267.8625, 'grad_norm': 6366.9799840890155, 'learning_rate': 1.9953121957285572e-05, 'epoch': 0.06}
+{'loss': 281.5777, 'grad_norm': 4697.719077052751, 'learning_rate': 1.995269939693656e-05, 'epoch': 0.06}
+{'loss': 270.3846, 'grad_norm': 5380.617911061849, 'learning_rate': 1.995227494515071e-05, 'epoch': 0.06}
+{'loss': 262.2327, 'grad_norm': 5133.07022029338, 'learning_rate': 1.9951848602008686e-05, 'epoch': 0.06}
+{'loss': 276.6385, 'grad_norm': 5791.521373806686, 'learning_rate': 1.9951420367591504e-05, 'epoch': 0.06}
+{'loss': 266.9408, 'grad_norm': 6135.777825406228, 'learning_rate': 1.9950990241980554e-05, 'epoch': 0.06}
+{'loss': 268.3585, 'grad_norm': 5887.052879461476, 'learning_rate': 1.9950558225257574e-05, 'epoch': 0.06}
+{'loss': 271.4122, 'grad_norm': 4551.05257608361, 'learning_rate': 1.9950124317504665e-05, 'epoch': 0.06}
+{'loss': 270.2365, 'grad_norm': 6651.294209123826, 'learning_rate': 1.9949688518804296e-05, 'epoch': 0.06}
+{'loss': 269.732, 'grad_norm': 5363.351193429454, 'learning_rate': 1.9949250829239277e-05, 'epoch': 0.06}
+{'loss': 270.7998, 'grad_norm': 5697.862263104024, 'learning_rate': 1.9948811248892793e-05, 'epoch': 0.06}
+{'loss': 262.858, 'grad_norm': 5203.33951741443, 'learning_rate': 1.9948369777848386e-05, 'epoch': 0.06}
+{'loss': 270.8703, 'grad_norm': 5146.89116503673, 'learning_rate': 1.9947926416189947e-05, 'epoch': 0.06}
+{'loss': 272.2091, 'grad_norm': 6351.329571270656, 'learning_rate': 1.994748116400174e-05, 'epoch': 0.06}
+{'loss': 266.1269, 'grad_norm': 5180.6218110544705, 'learning_rate': 1.9947034021368375e-05, 'epoch': 0.06}
+{'loss': 274.5471, 'grad_norm': 5062.265137489766, 'learning_rate': 1.9946584988374834e-05, 'epoch': 0.06}
+{'loss': 266.222, 'grad_norm': 5866.067753173707, 'learning_rate': 1.994613406510645e-05, 'epoch': 0.06}
+{'loss': 266.0699, 'grad_norm': 5328.254656311031, 'learning_rate': 1.994568125164892e-05, 'epoch': 0.06}
+{'loss': 269.0337, 'grad_norm': 5620.463808089424, 'learning_rate': 1.9945226548088296e-05, 'epoch': 0.06}
+{'loss': 289.306, 'grad_norm': 10168.946935027823, 'learning_rate': 1.9944769954510987e-05, 'epoch': 0.06}
+{'loss': 273.5809, 'grad_norm': 5991.265238304619, 'learning_rate': 1.9944311471003775e-05, 'epoch': 0.06}
+{'loss': 270.3863, 'grad_norm': 5876.567650826719, 'learning_rate': 1.994385109765378e-05, 'epoch': 0.06}
+{'loss': 268.4849, 'grad_norm': 6402.824028927786, 'learning_rate': 1.99433888345485e-05, 'epoch': 0.06}
+{'loss': 264.1833, 'grad_norm': 7466.321501880956, 'learning_rate': 1.9942924681775787e-05, 'epoch': 0.06}
+{'loss': 264.1096, 'grad_norm': 5714.458263516542, 'learning_rate': 1.994245863942384e-05, 'epoch': 0.06}
+{'loss': 275.9551, 'grad_norm': 16438.13244421413, 'learning_rate': 1.9941990707581237e-05, 'epoch': 0.06}
+{'loss': 272.1054, 'grad_norm': 9168.134644507567, 'learning_rate': 1.99415208863369e-05, 'epoch': 0.06}
+{'loss': 262.7327, 'grad_norm': 5906.922151048145, 'learning_rate': 1.994104917578011e-05, 'epoch': 0.06}
+{'loss': 251.5728, 'grad_norm': 4709.182427340126, 'learning_rate': 1.9940575576000526e-05, 'epoch': 0.06}
+{'loss': 253.3597, 'grad_norm': 5341.033354991707, 'learning_rate': 1.994010008708814e-05, 'epoch': 0.06}
+{'loss': 262.9364, 'grad_norm': 6299.820592348834, 'learning_rate': 1.993962270913332e-05, 'epoch': 0.06}
+{'loss': 262.7119, 'grad_norm': 10825.015206924485, 'learning_rate': 1.9939143442226786e-05, 'epoch': 0.06}
+{'loss': 270.0059, 'grad_norm': 5945.339621954083, 'learning_rate': 1.993866228645962e-05, 'epoch': 0.06}
+{'loss': 260.5726, 'grad_norm': 4736.838439480678, 'learning_rate': 1.9938179241923263e-05, 'epoch': 0.06}
+{'loss': 261.1622, 'grad_norm': 4348.389360722324, 'learning_rate': 1.9937694308709514e-05, 'epoch': 0.06}
+{'loss': 263.5107, 'grad_norm': 5121.920834356014, 'learning_rate': 1.993720748691053e-05, 'epoch': 0.06}
+{'loss': 262.0286, 'grad_norm': 5695.853652506345, 'learning_rate': 1.9936718776618825e-05, 'epoch': 0.06}
+{'loss': 254.6154, 'grad_norm': 6069.945493909362, 'learning_rate': 1.993622817792728e-05, 'epoch': 0.06}
+{'loss': 268.8489, 'grad_norm': 5719.122678679285, 'learning_rate': 1.993573569092913e-05, 'epoch': 0.06}
+{'loss': 260.1831, 'grad_norm': 5410.305569184699, 'learning_rate': 1.9935241315717966e-05, 'epoch': 0.06}
+{'loss': 264.7921, 'grad_norm': 5313.469372062846, 'learning_rate': 1.993474505238774e-05, 'epoch': 0.06}
+{'loss': 266.0815, 'grad_norm': 6251.920700761831, 'learning_rate': 1.9934246901032764e-05, 'epoch': 0.06}
+{'loss': 253.164, 'grad_norm': 5871.60766268327, 'learning_rate': 1.9933746861747706e-05, 'epoch': 0.06}
+{'loss': 263.0707, 'grad_norm': 5234.030451692944, 'learning_rate': 1.9933244934627597e-05, 'epoch': 0.07}
+{'loss': 263.9035, 'grad_norm': 5737.125796997924, 'learning_rate': 1.9932741119767828e-05, 'epoch': 0.07}
+{'loss': 262.4721, 'grad_norm': 6079.835121753672, 'learning_rate': 1.9932235417264135e-05, 'epoch': 0.07}
+{'loss': 256.7705, 'grad_norm': 5783.558111384768, 'learning_rate': 1.9931727827212632e-05, 'epoch': 0.07}
+{'loss': 256.6023, 'grad_norm': 6040.82744477673, 'learning_rate': 1.9931218349709783e-05, 'epoch': 0.07}
+{'loss': 255.495, 'grad_norm': 4919.899999935508, 'learning_rate': 1.9930706984852403e-05, 'epoch': 0.07}
+{'loss': 260.1925, 'grad_norm': 5425.473689453248, 'learning_rate': 1.993019373273768e-05, 'epoch': 0.07}
+{'loss': 266.7468, 'grad_norm': 5210.843873993726, 'learning_rate': 1.9929678593463148e-05, 'epoch': 0.07}
+{'loss': 258.438, 'grad_norm': 5744.544906443449, 'learning_rate': 1.992916156712671e-05, 'epoch': 0.07}
+{'loss': 260.7026, 'grad_norm': 5367.803710624718, 'learning_rate': 1.9928642653826625e-05, 'epoch': 0.07}
+{'loss': 278.2777, 'grad_norm': 5070.764747260182, 'learning_rate': 1.9928121853661502e-05, 'epoch': 0.07}
+{'loss': 259.4603, 'grad_norm': 5204.283130272653, 'learning_rate': 1.9927599166730324e-05, 'epoch': 0.07}
+{'loss': 258.7573, 'grad_norm': 5723.907478593047, 'learning_rate': 1.9927074593132412e-05, 'epoch': 0.07}
+{'loss': 255.7584, 'grad_norm': 5809.482319351213, 'learning_rate': 1.9926548132967463e-05, 'epoch': 0.07}
+{'loss': 257.5015, 'grad_norm': 5823.568720567016, 'learning_rate': 1.9926019786335532e-05, 'epoch': 0.07}
+{'loss': 265.0275, 'grad_norm': 6775.589762108202, 'learning_rate': 1.9925489553337023e-05, 'epoch': 0.07}
+{'loss': 261.5918, 'grad_norm': 4546.859836919356, 'learning_rate': 1.9924957434072702e-05, 'epoch': 0.07}
+{'loss': 262.2006, 'grad_norm': 5673.117813138109, 'learning_rate': 1.9924423428643694e-05, 'epoch': 0.07}
+{'loss': 259.4596, 'grad_norm': 6020.142209122516, 'learning_rate': 1.9923887537151485e-05, 'epoch': 0.07}
+{'loss': 258.443, 'grad_norm': 5376.260326270022, 'learning_rate': 1.9923349759697915e-05, 'epoch': 0.07}
+{'loss': 258.8044, 'grad_norm': 8659.846597933332, 'learning_rate': 1.992281009638519e-05, 'epoch': 0.07}
+{'loss': 269.3783, 'grad_norm': 5212.576609887003, 'learning_rate': 1.9922268547315858e-05, 'epoch': 0.07}
+{'loss': 268.4919, 'grad_norm': 5682.821563566315, 'learning_rate': 1.9921725112592845e-05, 'epoch': 0.07}
+{'loss': 261.1105, 'grad_norm': 4863.309115219586, 'learning_rate': 1.9921179792319427e-05, 'epoch': 0.07}
+{'loss': 255.9614, 'grad_norm': 4879.449174677589, 'learning_rate': 1.9920632586599235e-05, 'epoch': 0.07}
+{'loss': 253.981, 'grad_norm': 5398.793303094365, 'learning_rate': 1.9920083495536263e-05, 'epoch': 0.07}
+{'loss': 257.9243, 'grad_norm': 4927.287078655291, 'learning_rate': 1.9919532519234858e-05, 'epoch': 0.07}
+{'loss': 265.3031, 'grad_norm': 5223.159529891428, 'learning_rate': 1.991897965779974e-05, 'epoch': 0.07}
+{'loss': 258.1597, 'grad_norm': 5807.7377099004925, 'learning_rate': 1.991842491133596e-05, 'epoch': 0.07}
+{'loss': 256.1671, 'grad_norm': 5767.879616083101, 'learning_rate': 1.9917868279948953e-05, 'epoch': 0.07}
+{'loss': 261.8735, 'grad_norm': 4383.422346556441, 'learning_rate': 1.9917309763744504e-05, 'epoch': 0.07}
+{'loss': 263.2288, 'grad_norm': 5049.17072780954, 'learning_rate': 1.991674936282875e-05, 'epoch': 0.07}
+{'loss': 264.6083, 'grad_norm': 5283.129991198693, 'learning_rate': 1.9916187077308194e-05, 'epoch': 0.07}
+{'loss': 253.4765, 'grad_norm': 5754.0527092974635, 'learning_rate': 1.9915622907289695e-05, 'epoch': 0.07}
+{'loss': 269.1185, 'grad_norm': 5928.186312478771, 'learning_rate': 1.9915056852880468e-05, 'epoch': 0.07}
+{'loss': 259.6534, 'grad_norm': 6237.221379817354, 'learning_rate': 1.9914488914188085e-05, 'epoch': 0.07}
+{'loss': 256.5458, 'grad_norm': 4984.02090165369, 'learning_rate': 1.991391909132048e-05, 'epoch': 0.07}
+{'loss': 263.3731, 'grad_norm': 6139.1822511606615, 'learning_rate': 1.991334738438595e-05, 'epoch': 0.07}
+{'loss': 266.7272, 'grad_norm': 4881.287931792868, 'learning_rate': 1.9912773793493133e-05, 'epoch': 0.07}
+{'loss': 252.0567, 'grad_norm': 6334.846094972156, 'learning_rate': 1.991219831875104e-05, 'epoch': 0.07}
+{'loss': 248.2841, 'grad_norm': 4984.046969444316, 'learning_rate': 1.991162096026904e-05, 'epoch': 0.07}
+{'loss': 252.9763, 'grad_norm': 6900.840196357435, 'learning_rate': 1.9911041718156852e-05, 'epoch': 0.07}
+{'loss': 262.7292, 'grad_norm': 5731.201225324468, 'learning_rate': 1.991046059252456e-05, 'epoch': 0.07}
+{'loss': 256.2999, 'grad_norm': 5096.047330895367, 'learning_rate': 1.9909877583482594e-05, 'epoch': 0.07}
+{'loss': 247.7265, 'grad_norm': 4567.754141781349, 'learning_rate': 1.9909292691141755e-05, 'epoch': 0.07}
+{'loss': 247.3756, 'grad_norm': 4774.586713932822, 'learning_rate': 1.9908705915613202e-05, 'epoch': 0.07}
+{'loss': 250.9869, 'grad_norm': 5218.423620492083, 'learning_rate': 1.9908117257008444e-05, 'epoch': 0.07}
+{'loss': 244.8083, 'grad_norm': 4437.989518842503, 'learning_rate': 1.990752671543935e-05, 'epoch': 0.07}
+{'loss': 253.0361, 'grad_norm': 4442.934138987595, 'learning_rate': 1.990693429101815e-05, 'epoch': 0.07}
+{'loss': 246.371, 'grad_norm': 4753.148397931764, 'learning_rate': 1.9906339983857428e-05, 'epoch': 0.07}
+{'loss': 253.7837, 'grad_norm': 4367.921106583825, 'learning_rate': 1.9905743794070128e-05, 'epoch': 0.07}
+{'loss': 256.9134, 'grad_norm': 5083.186378583804, 'learning_rate': 1.9905145721769554e-05, 'epoch': 0.07}
+{'loss': 249.6476, 'grad_norm': 5483.092287993849, 'learning_rate': 1.9904545767069363e-05, 'epoch': 0.08}
+{'loss': 254.399, 'grad_norm': 4905.350751052706, 'learning_rate': 1.9903943930083574e-05, 'epoch': 0.08}
+{'loss': 245.5006, 'grad_norm': 5144.135979909222, 'learning_rate': 1.9903340210926555e-05, 'epoch': 0.08}
+{'loss': 254.655, 'grad_norm': 5744.334081959852, 'learning_rate': 1.9902734609713046e-05, 'epoch': 0.08}
+{'loss': 247.7445, 'grad_norm': 4771.149799233891, 'learning_rate': 1.9902127126558138e-05, 'epoch': 0.08}
+{'loss': 253.9628, 'grad_norm': 5194.276428929531, 'learning_rate': 1.9901517761577273e-05, 'epoch': 0.08}
+{'loss': 259.5312, 'grad_norm': 6233.642247253174, 'learning_rate': 1.9900906514886254e-05, 'epoch': 0.08}
+{'loss': 257.5294, 'grad_norm': 4796.226419321303, 'learning_rate': 1.9900293386601254e-05, 'epoch': 0.08}
+{'loss': 245.9384, 'grad_norm': 5564.731062353742, 'learning_rate': 1.9899678376838783e-05, 'epoch': 0.08}
+{'loss': 253.6868, 'grad_norm': 4833.158042020205, 'learning_rate': 1.9899061485715726e-05, 'epoch': 0.08}
+{'loss': 256.8946, 'grad_norm': 5088.625918211801, 'learning_rate': 1.989844271334932e-05, 'epoch': 0.08}
+[2025-05-27 18:00:29,189] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step4000 is about to be saved!
+[2025-05-27 18:00:29,272] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-4000/global_step4000/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2025-05-27 18:00:29,272] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-4000/global_step4000/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2025-05-27 18:00:29,377] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-4000/global_step4000/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2025-05-27 18:00:29,526] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-4000/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2025-05-27 18:06:18,443] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-4000/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2025-05-27 18:06:18,489] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-4000/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-05-27 18:06:26,173] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step4000 is ready now!
+[2025-05-27 18:06:26,205] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step4000 is about to be saved!
+[2025-05-27 18:06:26,225] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-4000/global_step4000/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2025-05-27 18:06:26,226] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-4000/global_step4000/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2025-05-27 18:06:26,284] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-4000/global_step4000/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2025-05-27 18:06:26,439] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-4000/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2025-05-27 18:12:08,105] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-4000/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2025-05-27 18:12:08,266] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-4000/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-05-27 18:12:17,513] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step4000 is ready now!
+{'loss': 253.002, 'grad_norm': 5706.643980985074, 'learning_rate': 1.9897822059857146e-05, 'epoch': 0.08}
+{'loss': 256.4881, 'grad_norm': 5089.499169395007, 'learning_rate': 1.989719952535717e-05, 'epoch': 0.08}
+{'loss': 269.2233, 'grad_norm': 4973.148311100811, 'learning_rate': 1.9896575109967692e-05, 'epoch': 0.08}
+{'loss': 250.9371, 'grad_norm': 4307.317197943101, 'learning_rate': 1.9895948813807377e-05, 'epoch': 0.08}
+{'loss': 253.2884, 'grad_norm': 5544.796805299328, 'learning_rate': 1.989532063699525e-05, 'epoch': 0.08}
+{'loss': 250.9568, 'grad_norm': 5072.004069184222, 'learning_rate': 1.9894690579650694e-05, 'epoch': 0.08}
+{'loss': 252.2962, 'grad_norm': 4640.639271231568, 'learning_rate': 1.989405864189344e-05, 'epoch': 0.08}
+{'loss': 248.6559, 'grad_norm': 4203.039660707991, 'learning_rate': 1.989342482384359e-05, 'epoch': 0.08}
+{'loss': 251.5532, 'grad_norm': 4632.308093723779, 'learning_rate': 1.989278912562159e-05, 'epoch': 0.08}
+{'loss': 253.2678, 'grad_norm': 4048.9913422520403, 'learning_rate': 1.9892151547348254e-05, 'epoch': 0.08}
+{'loss': 248.0539, 'grad_norm': 4655.257936880383, 'learning_rate': 1.989151208914475e-05, 'epoch': 0.08}
+{'loss': 249.3407, 'grad_norm': 17404.204885920968, 'learning_rate': 1.98908707511326e-05, 'epoch': 0.08}
+{'loss': 255.0875, 'grad_norm': 4758.893828701143, 'learning_rate': 1.9890227533433685e-05, 'epoch': 0.08}
+{'loss': 255.0094, 'grad_norm': 5185.827486010931, 'learning_rate': 1.9889582436170242e-05, 'epoch': 0.08}
+{'loss': 244.9269, 'grad_norm': 5244.096952844905, 'learning_rate': 1.9888935459464875e-05, 'epoch': 0.08}
+{'loss': 248.9609, 'grad_norm': 5342.55817131766, 'learning_rate': 1.9888286603440528e-05, 'epoch': 0.08}
+{'loss': 252.4532, 'grad_norm': 5414.813978623954, 'learning_rate': 1.9887635868220516e-05, 'epoch': 0.08}
+{'loss': 240.757, 'grad_norm': 4465.388001770276, 'learning_rate': 1.9886983253928505e-05, 'epoch': 0.08}
+{'loss': 258.9011, 'grad_norm': 5942.779887554109, 'learning_rate': 1.988632876068852e-05, 'epoch': 0.08}
+{'loss': 247.0266, 'grad_norm': 6011.141843098709, 'learning_rate': 1.9885672388624942e-05, 'epoch': 0.08}
+{'loss': 250.1597, 'grad_norm': 4721.889389489868, 'learning_rate': 1.9885014137862512e-05, 'epoch': 0.08}
+{'loss': 242.6988, 'grad_norm': 4299.2211278486775, 'learning_rate': 1.988435400852632e-05, 'epoch': 0.08}
+{'loss': 248.6468, 'grad_norm': 7911.986835788672, 'learning_rate': 1.9883692000741827e-05, 'epoch': 0.08}
+{'loss': 254.14, 'grad_norm': 5123.415952960608, 'learning_rate': 1.9883028114634836e-05, 'epoch': 0.08}
+{'loss': 256.3147, 'grad_norm': 4862.578008841945, 'learning_rate': 1.9882362350331516e-05, 'epoch': 0.08}
+{'loss': 248.0838, 'grad_norm': 4024.658404701674, 'learning_rate': 1.988169470795839e-05, 'epoch': 0.08}
+{'loss': 259.7167, 'grad_norm': 4436.400693413799, 'learning_rate': 1.988102518764234e-05, 'epoch': 0.08}
+{'loss': 241.0959, 'grad_norm': 4462.668234408074, 'learning_rate': 1.9880353789510606e-05, 'epoch': 0.08}
+{'loss': 245.6783, 'grad_norm': 3819.9457355046748, 'learning_rate': 1.9879680513690772e-05, 'epoch': 0.08}
+{'loss': 241.236, 'grad_norm': 5244.117002608137, 'learning_rate': 1.9879005360310798e-05, 'epoch': 0.08}
+{'loss': 247.3927, 'grad_norm': 5756.222269179554, 'learning_rate': 1.987832832949899e-05, 'epoch': 0.08}
+{'loss': 246.5972, 'grad_norm': 4042.3527026143843, 'learning_rate': 1.9877649421384012e-05, 'epoch': 0.08}
+{'loss': 249.6531, 'grad_norm': 4392.961573138766, 'learning_rate': 1.9876968636094888e-05, 'epoch': 0.08}
+{'loss': 248.326, 'grad_norm': 4485.6042367239515, 'learning_rate': 1.9876285973760993e-05, 'epoch': 0.08}
+{'loss': 246.7857, 'grad_norm': 4849.705769595763, 'learning_rate': 1.987560143451206e-05, 'epoch': 0.08}
+{'loss': 256.0474, 'grad_norm': 5244.154544592652, 'learning_rate': 1.9874915018478186e-05, 'epoch': 0.08}
+{'loss': 251.6139, 'grad_norm': 4432.962289193002, 'learning_rate': 1.9874226725789815e-05, 'epoch': 0.08}
+{'loss': 241.3346, 'grad_norm': 3723.98180499243, 'learning_rate': 1.9873536556577754e-05, 'epoch': 0.08}
+{'loss': 248.6425, 'grad_norm': 4499.161635430944, 'learning_rate': 1.9872844510973165e-05, 'epoch': 0.08}
+{'loss': 247.1508, 'grad_norm': 5492.590430541617, 'learning_rate': 1.9872150589107565e-05, 'epoch': 0.08}
+{'loss': 248.3343, 'grad_norm': 4468.567383427872, 'learning_rate': 1.987145479111283e-05, 'epoch': 0.08}
+{'loss': 245.6465, 'grad_norm': 6268.830889694204, 'learning_rate': 1.9870757117121192e-05, 'epoch': 0.09}
+{'loss': 243.2504, 'grad_norm': 4101.740857307009, 'learning_rate': 1.9870057567265233e-05, 'epoch': 0.09}
+{'loss': 243.7554, 'grad_norm': 4886.8253255056115, 'learning_rate': 1.98693561416779e-05, 'epoch': 0.09}
+{'loss': 242.4127, 'grad_norm': 4372.448975950228, 'learning_rate': 1.98686528404925e-05, 'epoch': 0.09}
+{'loss': 245.8554, 'grad_norm': 4355.234814509353, 'learning_rate': 1.9867947663842684e-05, 'epoch': 0.09}
+{'loss': 250.5643, 'grad_norm': 5427.321675459001, 'learning_rate': 1.9867240611862467e-05, 'epoch': 0.09}
+{'loss': 250.6592, 'grad_norm': 4877.532950830604, 'learning_rate': 1.986653168468622e-05, 'epoch': 0.09}
+{'loss': 252.4883, 'grad_norm': 6675.180467329186, 'learning_rate': 1.9865820882448665e-05, 'epoch': 0.09}
+{'loss': 248.3097, 'grad_norm': 4416.376757922113, 'learning_rate': 1.9865108205284888e-05, 'epoch': 0.09}
+{'loss': 253.0402, 'grad_norm': 4267.3277505224005, 'learning_rate': 1.9864393653330332e-05, 'epoch': 0.09}
+{'loss': 250.4248, 'grad_norm': 4513.404691140366, 'learning_rate': 1.9863677226720787e-05, 'epoch': 0.09}
+{'loss': 243.3575, 'grad_norm': 5794.227652194739, 'learning_rate': 1.9862958925592405e-05, 'epoch': 0.09}
+{'loss': 244.2448, 'grad_norm': 6651.603624297834, 'learning_rate': 1.9862238750081693e-05, 'epoch': 0.09}
+{'loss': 247.0087, 'grad_norm': 3988.024950728662, 'learning_rate': 1.986151670032552e-05, 'epoch': 0.09}
+{'loss': 246.2982, 'grad_norm': 4862.574770304205, 'learning_rate': 1.98607927764611e-05, 'epoch': 0.09}
+{'loss': 250.683, 'grad_norm': 5040.887924066737, 'learning_rate': 1.9860066978626014e-05, 'epoch': 0.09}
+{'loss': 244.6816, 'grad_norm': 4866.819115466417, 'learning_rate': 1.9859339306958192e-05, 'epoch': 0.09}
+{'loss': 242.4684, 'grad_norm': 4071.941595799817, 'learning_rate': 1.9858609761595923e-05, 'epoch': 0.09}
+{'loss': 246.4316, 'grad_norm': 4265.624932517257, 'learning_rate': 1.9857878342677853e-05, 'epoch': 0.09}
+{'loss': 243.4982, 'grad_norm': 3949.1680129873635, 'learning_rate': 1.9857145050342983e-05, 'epoch': 0.09}
+{'loss': 243.1369, 'grad_norm': 4891.30775763911, 'learning_rate': 1.9856409884730667e-05, 'epoch': 0.09}
+{'loss': 245.3821, 'grad_norm': 5545.376666757302, 'learning_rate': 1.9855672845980618e-05, 'epoch': 0.09}
+{'loss': 241.922, 'grad_norm': 3917.2075715495102, 'learning_rate': 1.985493393423291e-05, 'epoch': 0.09}
+{'loss': 247.0883, 'grad_norm': 4523.391066588496, 'learning_rate': 1.985419314962796e-05, 'epoch': 0.09}
+{'loss': 260.7994, 'grad_norm': 6256.708541230452, 'learning_rate': 1.9853450492306556e-05, 'epoch': 0.09}
+{'loss': 245.0001, 'grad_norm': 4246.673181598854, 'learning_rate': 1.985270596240983e-05, 'epoch': 0.09}
+{'loss': 252.8711, 'grad_norm': 5825.725349448773, 'learning_rate': 1.9851959560079273e-05, 'epoch': 0.09}
+{'loss': 243.9565, 'grad_norm': 4576.246966929074, 'learning_rate': 1.9851211285456738e-05, 'epoch': 0.09}
+{'loss': 243.4543, 'grad_norm': 4365.146442552849, 'learning_rate': 1.9850461138684426e-05, 'epoch': 0.09}
+{'loss': 249.9271, 'grad_norm': 4419.951436780358, 'learning_rate': 1.9849709119904895e-05, 'epoch': 0.09}
+{'loss': 245.3832, 'grad_norm': 4289.0133044041895, 'learning_rate': 1.9848955229261068e-05, 'epoch': 0.09}
+{'loss': 243.6839, 'grad_norm': 4454.609873022917, 'learning_rate': 1.984819946689621e-05, 'epoch': 0.09}
+{'loss': 249.3783, 'grad_norm': 4755.25773075336, 'learning_rate': 1.984744183295395e-05, 'epoch': 0.09}
+{'loss': 247.8827, 'grad_norm': 4315.698717441482, 'learning_rate': 1.9846682327578268e-05, 'epoch': 0.09}
+{'loss': 244.4402, 'grad_norm': 3716.6158771997398, 'learning_rate': 1.9845920950913506e-05, 'epoch': 0.09}
+{'loss': 240.6693, 'grad_norm': 4290.829759410978, 'learning_rate': 1.9845157703104357e-05, 'epoch': 0.09}
+{'loss': 245.3078, 'grad_norm': 5734.870190500958, 'learning_rate': 1.984439258429587e-05, 'epoch': 0.09}
+{'loss': 245.16, 'grad_norm': 4718.737853809374, 'learning_rate': 1.9843625594633452e-05, 'epoch': 0.09}
+{'loss': 251.6174, 'grad_norm': 4441.89004299403, 'learning_rate': 1.984285673426286e-05, 'epoch': 0.09}
+{'loss': 236.0407, 'grad_norm': 3837.630267386255, 'learning_rate': 1.984208600333021e-05, 'epoch': 0.09}
+{'loss': 240.2411, 'grad_norm': 3737.5361350755575, 'learning_rate': 1.984131340198198e-05, 'epoch': 0.09}
+{'loss': 255.6628, 'grad_norm': 4855.512504845424, 'learning_rate': 1.9840538930364992e-05, 'epoch': 0.09}
+{'loss': 249.3929, 'grad_norm': 8954.851657678224, 'learning_rate': 1.983976258862643e-05, 'epoch': 0.09}
+{'loss': 242.0065, 'grad_norm': 4564.152629877211, 'learning_rate': 1.9838984376913833e-05, 'epoch': 0.09}
+{'loss': 244.1546, 'grad_norm': 4666.958927823942, 'learning_rate': 1.983820429537509e-05, 'epoch': 0.09}
+{'loss': 241.1909, 'grad_norm': 3779.4670330006406, 'learning_rate': 1.9837422344158456e-05, 'epoch': 0.09}
+{'loss': 236.918, 'grad_norm': 3944.336098650434, 'learning_rate': 1.983663852341253e-05, 'epoch': 0.09}
+{'loss': 242.4354, 'grad_norm': 4249.78713370144, 'learning_rate': 1.9835852833286273e-05, 'epoch': 0.09}
+{'loss': 239.8427, 'grad_norm': 4270.9046171272985, 'learning_rate': 1.9835065273929002e-05, 'epoch': 0.09}
+{'loss': 243.2954, 'grad_norm': 4560.272108152037, 'learning_rate': 1.9834275845490384e-05, 'epoch': 0.09}
+{'loss': 245.042, 'grad_norm': 4016.9401570121727, 'learning_rate': 1.9833484548120443e-05, 'epoch': 0.09}
+{'loss': 242.8463, 'grad_norm': 3923.2673423662054, 'learning_rate': 1.983269138196956e-05, 'epoch': 0.09}
+{'loss': 239.3249, 'grad_norm': 4210.923398568457, 'learning_rate': 1.9831896347188476e-05, 'epoch': 0.1}
+{'loss': 247.1364, 'grad_norm': 6733.742619158002, 'learning_rate': 1.9831099443928275e-05, 'epoch': 0.1}
+{'loss': 241.1064, 'grad_norm': 5604.667057945613, 'learning_rate': 1.98303006723404e-05, 'epoch': 0.1}
+{'loss': 238.7894, 'grad_norm': 3757.309602127364, 'learning_rate': 1.982950003257666e-05, 'epoch': 0.1}
+{'loss': 236.5079, 'grad_norm': 4530.819179517332, 'learning_rate': 1.9828697524789207e-05, 'epoch': 0.1}
+{'loss': 236.4103, 'grad_norm': 4271.910767015976, 'learning_rate': 1.982789314913055e-05, 'epoch': 0.1}
+{'loss': 245.8392, 'grad_norm': 6182.597547053084, 'learning_rate': 1.9827086905753557e-05, 'epoch': 0.1}
+{'loss': 242.2586, 'grad_norm': 4297.492633431516, 'learning_rate': 1.9826278794811445e-05, 'epoch': 0.1}
+{'loss': 242.1635, 'grad_norm': 4126.965231783521, 'learning_rate': 1.9825468816457796e-05, 'epoch': 0.1}
+{'loss': 241.1167, 'grad_norm': 4632.438113029274, 'learning_rate': 1.9824656970846535e-05, 'epoch': 0.1}
+{'loss': 236.9617, 'grad_norm': 4221.365537290329, 'learning_rate': 1.9823843258131945e-05, 'epoch': 0.1}
+{'loss': 233.9433, 'grad_norm': 4472.852604931069, 'learning_rate': 1.9823027678468675e-05, 'epoch': 0.1}
+{'loss': 241.5538, 'grad_norm': 4056.882603927285, 'learning_rate': 1.9822210232011708e-05, 'epoch': 0.1}
+{'loss': 234.1987, 'grad_norm': 3606.878866280406, 'learning_rate': 1.9821390918916407e-05, 'epoch': 0.1}
+{'loss': 235.8393, 'grad_norm': 3803.9970425877186, 'learning_rate': 1.9820569739338468e-05, 'epoch': 0.1}
+{'loss': 239.4918, 'grad_norm': 4473.188888884076, 'learning_rate': 1.981974669343395e-05, 'epoch': 0.1}
+{'loss': 243.6425, 'grad_norm': 4092.821022784284, 'learning_rate': 1.981892178135927e-05, 'epoch': 0.1}
+{'loss': 239.9658, 'grad_norm': 4331.663615721003, 'learning_rate': 1.9818095003271193e-05, 'epoch': 0.1}
+{'loss': 243.3735, 'grad_norm': 4110.71742918651, 'learning_rate': 1.9817266359326844e-05, 'epoch': 0.1}
+{'loss': 237.6298, 'grad_norm': 4589.041371134349, 'learning_rate': 1.9816435849683704e-05, 'epoch': 0.1}
+{'loss': 244.4074, 'grad_norm': 3656.3659998471367, 'learning_rate': 1.98156034744996e-05, 'epoch': 0.1}
+{'loss': 238.6049, 'grad_norm': 4479.79762082503, 'learning_rate': 1.981476923393272e-05, 'epoch': 0.1}
+{'loss': 239.666, 'grad_norm': 4250.128051981735, 'learning_rate': 1.9813933128141604e-05, 'epoch': 0.1}
+{'loss': 241.403, 'grad_norm': 4591.681986964152, 'learning_rate': 1.9813095157285153e-05, 'epoch': 0.1}
+{'loss': 234.5168, 'grad_norm': 4461.33211327001, 'learning_rate': 1.9812255321522614e-05, 'epoch': 0.1}
+{'loss': 233.5084, 'grad_norm': 4039.4717538278123, 'learning_rate': 1.981141362101359e-05, 'epoch': 0.1}
+{'loss': 237.2544, 'grad_norm': 4044.5741387399844, 'learning_rate': 1.981057005591804e-05, 'epoch': 0.1}
+{'loss': 233.4442, 'grad_norm': 4697.5113479380725, 'learning_rate': 1.9809724626396277e-05, 'epoch': 0.1}
+{'loss': 238.6111, 'grad_norm': 4271.8495266778955, 'learning_rate': 1.9808877332608974e-05, 'epoch': 0.1}
+{'loss': 241.5604, 'grad_norm': 4596.74165721394, 'learning_rate': 1.9808028174717148e-05, 'epoch': 0.1}
+{'loss': 246.961, 'grad_norm': 4543.893820466488, 'learning_rate': 1.9807177152882175e-05, 'epoch': 0.1}
+{'loss': 245.0709, 'grad_norm': 4353.247796227846, 'learning_rate': 1.9806324267265786e-05, 'epoch': 0.1}
+{'loss': 238.8592, 'grad_norm': 3939.767288918237, 'learning_rate': 1.980546951803007e-05, 'epoch': 0.1}
+{'loss': 241.1343, 'grad_norm': 5464.256981895994, 'learning_rate': 1.980461290533746e-05, 'epoch': 0.1}
+{'loss': 238.8958, 'grad_norm': 4033.526449619854, 'learning_rate': 1.980375442935075e-05, 'epoch': 0.1}
+{'loss': 236.2125, 'grad_norm': 4928.411905803434, 'learning_rate': 1.980289409023309e-05, 'epoch': 0.1}
+{'loss': 246.2393, 'grad_norm': 4124.069027386746, 'learning_rate': 1.9802031888147977e-05, 'epoch': 0.1}
+{'loss': 236.456, 'grad_norm': 4200.939175120763, 'learning_rate': 1.9801167823259266e-05, 'epoch': 0.1}
+{'loss': 242.3691, 'grad_norm': 4003.963198560152, 'learning_rate': 1.9800301895731172e-05, 'epoch': 0.1}
+{'loss': 236.6607, 'grad_norm': 4440.996299902466, 'learning_rate': 1.9799434105728254e-05, 'epoch': 0.1}
+{'loss': 236.8661, 'grad_norm': 4188.026897236485, 'learning_rate': 1.9798564453415427e-05, 'epoch': 0.1}
+{'loss': 242.1368, 'grad_norm': 4956.984085191027, 'learning_rate': 1.9797692938957967e-05, 'epoch': 0.1}
+{'loss': 234.3837, 'grad_norm': 5823.223802911593, 'learning_rate': 1.9796819562521494e-05, 'epoch': 0.1}
+{'loss': 235.3643, 'grad_norm': 5100.253929607074, 'learning_rate': 1.9795944324271993e-05, 'epoch': 0.1}
+{'loss': 237.2125, 'grad_norm': 5137.266055403268, 'learning_rate': 1.979506722437579e-05, 'epoch': 0.1}
+{'loss': 235.1038, 'grad_norm': 4309.706099353632, 'learning_rate': 1.9794188262999574e-05, 'epoch': 0.1}
+{'loss': 240.5707, 'grad_norm': 5103.083317617584, 'learning_rate': 1.9793307440310384e-05, 'epoch': 0.1}
+{'loss': 241.1925, 'grad_norm': 5145.082796115051, 'learning_rate': 1.9792424756475618e-05, 'epoch': 0.1}
+{'loss': 236.1822, 'grad_norm': 4392.541766165352, 'learning_rate': 1.9791540211663015e-05, 'epoch': 0.1}
+{'loss': 235.378, 'grad_norm': 4613.3895247918845, 'learning_rate': 1.9790653806040684e-05, 'epoch': 0.1}
+{'loss': 237.6301, 'grad_norm': 4814.3520136455945, 'learning_rate': 1.978976553977708e-05, 'epoch': 0.1}
+{'loss': 245.5414, 'grad_norm': 3774.8039786102004, 'learning_rate': 1.9788875413041005e-05, 'epoch': 0.1}
+{'loss': 241.164, 'grad_norm': 3772.8286312641044, 'learning_rate': 1.978798342600163e-05, 'epoch': 0.11}
+{'loss': 234.1787, 'grad_norm': 3993.6463291693517, 'learning_rate': 1.978708957882846e-05, 'epoch': 0.11}
+{'loss': 243.3888, 'grad_norm': 4195.896530537725, 'learning_rate': 1.9786193871691372e-05, 'epoch': 0.11}
+{'loss': 246.5732, 'grad_norm': 5737.547311340583, 'learning_rate': 1.9785296304760585e-05, 'epoch': 0.11}
+{'loss': 236.9801, 'grad_norm': 3586.5126442323462, 'learning_rate': 1.9784396878206676e-05, 'epoch': 0.11}
+{'loss': 229.8124, 'grad_norm': 4353.997440689372, 'learning_rate': 1.9783495592200575e-05, 'epoch': 0.11}
+{'loss': 231.6629, 'grad_norm': 3994.426279612782, 'learning_rate': 1.978259244691356e-05, 'epoch': 0.11}
+{'loss': 235.5763, 'grad_norm': 4037.2333326379935, 'learning_rate': 1.9781687442517278e-05, 'epoch': 0.11}
+{'loss': 243.3963, 'grad_norm': 3865.1364952191916, 'learning_rate': 1.9780780579183708e-05, 'epoch': 0.11}
+{'loss': 229.6211, 'grad_norm': 4943.909307222911, 'learning_rate': 1.9779871857085196e-05, 'epoch': 0.11}
+{'loss': 233.0496, 'grad_norm': 4698.533960053896, 'learning_rate': 1.9778961276394435e-05, 'epoch': 0.11}
+{'loss': 235.7497, 'grad_norm': 3983.248239589999, 'learning_rate': 1.977804883728448e-05, 'epoch': 0.11}
+{'loss': 238.851, 'grad_norm': 3948.584673641454, 'learning_rate': 1.9777134539928732e-05, 'epoch': 0.11}
+{'loss': 239.611, 'grad_norm': 4974.020270322019, 'learning_rate': 1.977621838450094e-05, 'epoch': 0.11}
+{'loss': 236.9981, 'grad_norm': 4807.659737058589, 'learning_rate': 1.9775300371175225e-05, 'epoch': 0.11}
+{'loss': 239.143, 'grad_norm': 4254.778482380804, 'learning_rate': 1.9774380500126033e-05, 'epoch': 0.11}
+{'loss': 240.3748, 'grad_norm': 5752.451472966959, 'learning_rate': 1.977345877152819e-05, 'epoch': 0.11}
+{'loss': 240.0248, 'grad_norm': 5011.268337614059, 'learning_rate': 1.977253518555686e-05, 'epoch': 0.11}
+{'loss': 233.4049, 'grad_norm': 3978.59802502632, 'learning_rate': 1.9771609742387566e-05, 'epoch': 0.11}
+{'loss': 236.0489, 'grad_norm': 4174.94092239946, 'learning_rate': 1.9770682442196177e-05, 'epoch': 0.11}
+{'loss': 228.4655, 'grad_norm': 5005.4546976508445, 'learning_rate': 1.9769753285158924e-05, 'epoch': 0.11}
+{'loss': 230.9795, 'grad_norm': 4059.834904597643, 'learning_rate': 1.9768822271452385e-05, 'epoch': 0.11}
+{'loss': 233.8306, 'grad_norm': 3989.091446910414, 'learning_rate': 1.976788940125349e-05, 'epoch': 0.11}
+{'loss': 234.0382, 'grad_norm': 4738.667786816561, 'learning_rate': 1.976695467473953e-05, 'epoch': 0.11}
+{'loss': 234.0249, 'grad_norm': 4272.095851327929, 'learning_rate': 1.9766018092088134e-05, 'epoch': 0.11}
+{'loss': 233.3675, 'grad_norm': 3905.958797360386, 'learning_rate': 1.97650796534773e-05, 'epoch': 0.11}
+{'loss': 234.7384, 'grad_norm': 4402.228788337219, 'learning_rate': 1.9764139359085367e-05, 'epoch': 0.11}
+{'loss': 234.7795, 'grad_norm': 3667.725706553399, 'learning_rate': 1.9763197209091034e-05, 'epoch': 0.11}
+{'loss': 231.2038, 'grad_norm': 4072.3188813241936, 'learning_rate': 1.9762253203673348e-05, 'epoch': 0.11}
+{'loss': 239.6609, 'grad_norm': 20860.877539205212, 'learning_rate': 1.976130734301171e-05, 'epoch': 0.11}
+{'loss': 242.183, 'grad_norm': 5920.401126340357, 'learning_rate': 1.9760359627285878e-05, 'epoch': 0.11}
+{'loss': 239.4126, 'grad_norm': 4267.027964466358, 'learning_rate': 1.975941005667595e-05, 'epoch': 0.11}
+{'loss': 228.4836, 'grad_norm': 3674.1855836112886, 'learning_rate': 1.9758458631362393e-05, 'epoch': 0.11}
+{'loss': 235.669, 'grad_norm': 4622.756791032972, 'learning_rate': 1.975750535152601e-05, 'epoch': 0.11}
+{'loss': 239.0651, 'grad_norm': 4880.449943901646, 'learning_rate': 1.9756550217347976e-05, 'epoch': 0.11}
+{'loss': 227.8156, 'grad_norm': 3976.002559415804, 'learning_rate': 1.97555932290098e-05, 'epoch': 0.11}
+{'loss': 230.0932, 'grad_norm': 4206.327554409135, 'learning_rate': 1.9754634386693348e-05, 'epoch': 0.11}
+{'loss': 224.2146, 'grad_norm': 3860.989711256281, 'learning_rate': 1.9753673690580846e-05, 'epoch': 0.11}
+{'loss': 226.3115, 'grad_norm': 3364.880378368636, 'learning_rate': 1.975271114085487e-05, 'epoch': 0.11}
+{'loss': 237.426, 'grad_norm': 4316.542446469439, 'learning_rate': 1.9751746737698333e-05, 'epoch': 0.11}
+{'loss': 230.4055, 'grad_norm': 3622.870014352444, 'learning_rate': 1.9750780481294525e-05, 'epoch': 0.11}
+{'loss': 236.0719, 'grad_norm': 3755.625316188872, 'learning_rate': 1.9749812371827072e-05, 'epoch': 0.11}
+{'loss': 231.5175, 'grad_norm': 3868.5423029759313, 'learning_rate': 1.9748842409479953e-05, 'epoch': 0.11}
+{'loss': 242.0985, 'grad_norm': 3722.512350241998, 'learning_rate': 1.974787059443751e-05, 'epoch': 0.11}
+{'loss': 234.2322, 'grad_norm': 8700.418395915209, 'learning_rate': 1.9746896926884418e-05, 'epoch': 0.11}
+{'loss': 231.3199, 'grad_norm': 4262.977133828634, 'learning_rate': 1.9745921407005725e-05, 'epoch': 0.11}
+{'loss': 233.4491, 'grad_norm': 4913.240937362059, 'learning_rate': 1.9744944034986816e-05, 'epoch': 0.11}
+{'loss': 226.3123, 'grad_norm': 3850.2507523769395, 'learning_rate': 1.9743964811013434e-05, 'epoch': 0.11}
+{'loss': 228.9207, 'grad_norm': 5601.026149592807, 'learning_rate': 1.9742983735271677e-05, 'epoch': 0.11}
+{'loss': 231.8282, 'grad_norm': 6083.158893121459, 'learning_rate': 1.9742000807947986e-05, 'epoch': 0.11}
+{'loss': 227.2637, 'grad_norm': 4331.901192397819, 'learning_rate': 1.9741016029229162e-05, 'epoch': 0.11}
+{'loss': 235.7743, 'grad_norm': 4420.364901603495, 'learning_rate': 1.9740029399302356e-05, 'epoch': 0.11}
+{'loss': 232.4978, 'grad_norm': 4088.306834962465, 'learning_rate': 1.973904091835507e-05, 'epoch': 0.12}
+{'loss': 231.4061, 'grad_norm': 4355.01700166913, 'learning_rate': 1.973805058657515e-05, 'epoch': 0.12}
+{'loss': 224.7072, 'grad_norm': 3988.826614827014, 'learning_rate': 1.973705840415081e-05, 'epoch': 0.12}
+[2025-05-27 20:01:17,747] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step6000 is about to be saved!
+[2025-05-27 20:01:17,760] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-6000/global_step6000/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2025-05-27 20:01:17,760] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-6000/global_step6000/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2025-05-27 20:01:17,833] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-6000/global_step6000/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2025-05-27 20:01:17,984] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-6000/global_step6000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2025-05-27 20:07:06,343] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-6000/global_step6000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2025-05-27 20:07:06,371] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-6000/global_step6000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-05-27 20:07:18,645] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step6000 is ready now!
+[2025-05-27 20:07:18,678] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step6000 is about to be saved!
+[2025-05-27 20:07:18,705] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-6000/global_step6000/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2025-05-27 20:07:18,705] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-6000/global_step6000/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2025-05-27 20:07:18,790] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-6000/global_step6000/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2025-05-27 20:07:18,961] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-6000/global_step6000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2025-05-27 20:13:09,388] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-6000/global_step6000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2025-05-27 20:13:09,646] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-6000/global_step6000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-05-27 20:13:24,453] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step6000 is ready now!
+{'loss': 227.1693, 'grad_norm': 3825.900671763928, 'learning_rate': 1.9736064371270604e-05, 'epoch': 0.12}
+{'loss': 242.3855, 'grad_norm': 4475.076413092029, 'learning_rate': 1.973506848812344e-05, 'epoch': 0.12}
+{'loss': 227.6184, 'grad_norm': 3871.9626433612825, 'learning_rate': 1.973407075489858e-05, 'epoch': 0.12}
+{'loss': 233.5849, 'grad_norm': 3887.45351610296, 'learning_rate': 1.973307117178563e-05, 'epoch': 0.12}
+{'loss': 234.2199, 'grad_norm': 3588.7352734457095, 'learning_rate': 1.973206973897456e-05, 'epoch': 0.12}
+{'loss': 235.4953, 'grad_norm': 3961.569762910722, 'learning_rate': 1.9731066456655684e-05, 'epoch': 0.12}
+{'loss': 228.7454, 'grad_norm': 4051.3147854164595, 'learning_rate': 1.9730061325019667e-05, 'epoch': 0.12}
+{'loss': 245.2929, 'grad_norm': 4316.308540635024, 'learning_rate': 1.9729054344257525e-05, 'epoch': 0.12}
+{'loss': 233.433, 'grad_norm': 3958.790938590505, 'learning_rate': 1.972804551456063e-05, 'epoch': 0.12}
+{'loss': 236.3697, 'grad_norm': 3341.66774469772, 'learning_rate': 1.9727034836120702e-05, 'epoch': 0.12}
+{'loss': 244.0113, 'grad_norm': 3797.894259295683, 'learning_rate': 1.9726022309129815e-05, 'epoch': 0.12}
+{'loss': 232.702, 'grad_norm': 9767.439326668722, 'learning_rate': 1.9725007933780387e-05, 'epoch': 0.12}
+{'loss': 248.5511, 'grad_norm': 7243.33203846477, 'learning_rate': 1.9723991710265192e-05, 'epoch': 0.12}
+{'loss': 236.7743, 'grad_norm': 4756.249941610511, 'learning_rate': 1.9722973638777365e-05, 'epoch': 0.12}
+{'loss': 227.0356, 'grad_norm': 4631.435835192376, 'learning_rate': 1.9721953719510375e-05, 'epoch': 0.12}
+{'loss': 229.3144, 'grad_norm': 3508.678684688832, 'learning_rate': 1.972093195265805e-05, 'epoch': 0.12}
+{'loss': 230.6651, 'grad_norm': 3631.374833903945, 'learning_rate': 1.9719908338414578e-05, 'epoch': 0.12}
+{'loss': 228.8755, 'grad_norm': 4150.734621771545, 'learning_rate': 1.9718882876974477e-05, 'epoch': 0.12}
+{'loss': 233.6016, 'grad_norm': 3854.3088422761075, 'learning_rate': 1.9717855568532634e-05, 'epoch': 0.12}
+{'loss': 230.2552, 'grad_norm': 4132.306476313473, 'learning_rate': 1.9716826413284284e-05, 'epoch': 0.12}
+{'loss': 235.6086, 'grad_norm': 4290.72821864375, 'learning_rate': 1.971579541142501e-05, 'epoch': 0.12}
+{'loss': 237.3908, 'grad_norm': 4839.159141138833, 'learning_rate': 1.971476256315074e-05, 'epoch': 0.12}
+{'loss': 232.2001, 'grad_norm': 4181.812956718318, 'learning_rate': 1.9713727868657764e-05, 'epoch': 0.12}
+{'loss': 226.37, 'grad_norm': 3265.0861002507354, 'learning_rate': 1.9712691328142716e-05, 'epoch': 0.12}
+{'loss': 234.0197, 'grad_norm': 4062.075966032118, 'learning_rate': 1.9711652941802587e-05, 'epoch': 0.12}
+{'loss': 234.6748, 'grad_norm': 4962.535033885716, 'learning_rate': 1.971061270983471e-05, 'epoch': 0.12}
+{'loss': 234.4601, 'grad_norm': 3783.942224978895, 'learning_rate': 1.9709570632436774e-05, 'epoch': 0.12}
+{'loss': 231.8086, 'grad_norm': 3402.7265923484774, 'learning_rate': 1.970852670980682e-05, 'epoch': 0.12}
+{'loss': 220.8079, 'grad_norm': 3505.4933707792393, 'learning_rate': 1.970748094214324e-05, 'epoch': 0.12}
+{'loss': 232.6243, 'grad_norm': 4164.719050860138, 'learning_rate': 1.970643332964477e-05, 'epoch': 0.12}
+{'loss': 234.1037, 'grad_norm': 3781.5414161441518, 'learning_rate': 1.9705383872510502e-05, 'epoch': 0.12}
+{'loss': 229.4185, 'grad_norm': 3677.6035849593227, 'learning_rate': 1.970433257093988e-05, 'epoch': 0.12}
+{'loss': 233.1826, 'grad_norm': 4309.619950132339, 'learning_rate': 1.970327942513269e-05, 'epoch': 0.12}
+{'loss': 234.9402, 'grad_norm': 3628.4259075464133, 'learning_rate': 1.970222443528908e-05, 'epoch': 0.12}
+{'loss': 235.239, 'grad_norm': 3936.8170580664632, 'learning_rate': 1.9701167601609542e-05, 'epoch': 0.12}
+{'loss': 230.7288, 'grad_norm': 3923.973993016758, 'learning_rate': 1.9700108924294922e-05, 'epoch': 0.12}
+{'loss': 236.6208, 'grad_norm': 4044.120642407316, 'learning_rate': 1.969904840354641e-05, 'epoch': 0.12}
+{'loss': 228.4249, 'grad_norm': 3896.0473418915194, 'learning_rate': 1.9697986039565548e-05, 'epoch': 0.12}
+{'loss': 229.5128, 'grad_norm': 4530.881085814152, 'learning_rate': 1.9696921832554232e-05, 'epoch': 0.12}
+{'loss': 240.9811, 'grad_norm': 3953.398251924933, 'learning_rate': 1.9695855782714713e-05, 'epoch': 0.12}
+{'loss': 230.2889, 'grad_norm': 3769.544187956116, 'learning_rate': 1.9694787890249576e-05, 'epoch': 0.12}
+{'loss': 229.1027, 'grad_norm': 4029.732189492648, 'learning_rate': 1.9693718155361774e-05, 'epoch': 0.12}
+{'loss': 229.1691, 'grad_norm': 3641.163497861091, 'learning_rate': 1.9692646578254598e-05, 'epoch': 0.12}
+{'loss': 228.1512, 'grad_norm': 4369.810747659375, 'learning_rate': 1.9691573159131696e-05, 'epoch': 0.12}
+{'loss': 232.163, 'grad_norm': 3645.301268644583, 'learning_rate': 1.9690497898197063e-05, 'epoch': 0.12}
+{'loss': 226.2349, 'grad_norm': 3982.3862635188493, 'learning_rate': 1.968942079565504e-05, 'epoch': 0.12}
+{'loss': 232.6838, 'grad_norm': 4328.258276986516, 'learning_rate': 1.9688341851710326e-05, 'epoch': 0.12}
+{'loss': 233.8396, 'grad_norm': 4251.339791334689, 'learning_rate': 1.9687261066567967e-05, 'epoch': 0.12}
+{'loss': 228.7018, 'grad_norm': 3854.161317831453, 'learning_rate': 1.9686178440433356e-05, 'epoch': 0.12}
+{'loss': 223.4705, 'grad_norm': 3722.3441611988233, 'learning_rate': 1.9685093973512242e-05, 'epoch': 0.13}
+{'loss': 225.8169, 'grad_norm': 3929.738269389267, 'learning_rate': 1.9684007666010716e-05, 'epoch': 0.13}
+{'loss': 233.2047, 'grad_norm': 4284.044281945465, 'learning_rate': 1.968291951813522e-05, 'epoch': 0.13}
+{'loss': 234.2742, 'grad_norm': 3976.596232625252, 'learning_rate': 1.968182953009256e-05, 'epoch': 0.13}
+{'loss': 224.5545, 'grad_norm': 3724.631855378026, 'learning_rate': 1.9680737702089866e-05, 'epoch': 0.13}
+{'loss': 240.2829, 'grad_norm': 4167.136632278434, 'learning_rate': 1.967964403433464e-05, 'epoch': 0.13}
+{'loss': 229.7199, 'grad_norm': 5006.318878599629, 'learning_rate': 1.967854852703472e-05, 'epoch': 0.13}
+{'loss': 231.8057, 'grad_norm': 3948.6080023368095, 'learning_rate': 1.9677451180398308e-05, 'epoch': 0.13}
+{'loss': 226.5908, 'grad_norm': 4332.776957747412, 'learning_rate': 1.967635199463394e-05, 'epoch': 0.13}
+{'loss': 227.398, 'grad_norm': 3087.4220813239467, 'learning_rate': 1.967525096995051e-05, 'epoch': 0.13}
+{'loss': 230.7968, 'grad_norm': 3837.689738020861, 'learning_rate': 1.9674148106557256e-05, 'epoch': 0.13}
+{'loss': 226.4845, 'grad_norm': 3832.468166320759, 'learning_rate': 1.9673043404663774e-05, 'epoch': 0.13}
+{'loss': 227.762, 'grad_norm': 3738.5051600752413, 'learning_rate': 1.9671936864480002e-05, 'epoch': 0.13}
+{'loss': 231.3077, 'grad_norm': 4164.501407715758, 'learning_rate': 1.9670828486216232e-05, 'epoch': 0.13}
+{'loss': 235.3686, 'grad_norm': 4313.056598253102, 'learning_rate': 1.96697182700831e-05, 'epoch': 0.13}
+{'loss': 229.2954, 'grad_norm': 3121.8248215834265, 'learning_rate': 1.9668606216291598e-05, 'epoch': 0.13}
+{'loss': 238.4093, 'grad_norm': 3916.2234023841575, 'learning_rate': 1.966749232505306e-05, 'epoch': 0.13}
+{'loss': 229.0415, 'grad_norm': 3740.2782347037196, 'learning_rate': 1.9666376596579172e-05, 'epoch': 0.13}
+{'loss': 235.3642, 'grad_norm': 4172.655213108164, 'learning_rate': 1.9665259031081973e-05, 'epoch': 0.13}
+{'loss': 227.4674, 'grad_norm': 4009.357912681913, 'learning_rate': 1.9664139628773848e-05, 'epoch': 0.13}
+{'loss': 243.3932, 'grad_norm': 4394.695176798987, 'learning_rate': 1.966301838986753e-05, 'epoch': 0.13}
+{'loss': 229.7356, 'grad_norm': 4763.312024461898, 'learning_rate': 1.96618953145761e-05, 'epoch': 0.13}
+{'loss': 228.8869, 'grad_norm': 4085.1604358072263, 'learning_rate': 1.9660770403112996e-05, 'epoch': 0.13}
+{'loss': 229.3135, 'grad_norm': 4083.2756046285886, 'learning_rate': 1.9659643655691994e-05, 'epoch': 0.13}
+{'loss': 237.3602, 'grad_norm': 3790.1741727427366, 'learning_rate': 1.9658515072527226e-05, 'epoch': 0.13}
+{'loss': 225.0696, 'grad_norm': 3491.210691458377, 'learning_rate': 1.965738465383317e-05, 'epoch': 0.13}
+{'loss': 228.0561, 'grad_norm': 3509.68601718679, 'learning_rate': 1.9656252399824653e-05, 'epoch': 0.13}
+{'loss': 229.7564, 'grad_norm': 3766.070280167219, 'learning_rate': 1.9655118310716856e-05, 'epoch': 0.13}
+{'loss': 249.1565, 'grad_norm': 4050.8368632501783, 'learning_rate': 1.96539823867253e-05, 'epoch': 0.13}
+{'loss': 231.0891, 'grad_norm': 3852.8270394145115, 'learning_rate': 1.9652844628065857e-05, 'epoch': 0.13}
+{'loss': 223.5467, 'grad_norm': 3374.144554863213, 'learning_rate': 1.9651705034954757e-05, 'epoch': 0.13}
+{'loss': 234.3419, 'grad_norm': 4325.300059407102, 'learning_rate': 1.965056360760856e-05, 'epoch': 0.13}
+{'loss': 224.836, 'grad_norm': 3378.0262132290077, 'learning_rate': 1.96494203462442e-05, 'epoch': 0.13}
+{'loss': 230.9135, 'grad_norm': 3501.8621372975417, 'learning_rate': 1.9648275251078934e-05, 'epoch': 0.13}
+{'loss': 229.1785, 'grad_norm': 3747.7489890297193, 'learning_rate': 1.9647128322330382e-05, 'epoch': 0.13}
+{'loss': 228.3591, 'grad_norm': 6328.579046134902, 'learning_rate': 1.9645979560216513e-05, 'epoch': 0.13}
+{'loss': 235.2192, 'grad_norm': 3594.443191438248, 'learning_rate': 1.9644828964955633e-05, 'epoch': 0.13}
+{'loss': 228.8907, 'grad_norm': 3684.898174760944, 'learning_rate': 1.9643676536766412e-05, 'epoch': 0.13}
+{'loss': 232.7077, 'grad_norm': 3793.838231495944, 'learning_rate': 1.9642522275867857e-05, 'epoch': 0.13}
+{'loss': 219.6077, 'grad_norm': 3541.1244134197436, 'learning_rate': 1.964136618247933e-05, 'epoch': 0.13}
+{'loss': 227.0523, 'grad_norm': 3372.486353689238, 'learning_rate': 1.964020825682053e-05, 'epoch': 0.13}
+{'loss': 225.5828, 'grad_norm': 5803.7237351971135, 'learning_rate': 1.963904849911152e-05, 'epoch': 0.13}
+{'loss': 236.0501, 'grad_norm': 4912.364831503486, 'learning_rate': 1.96378869095727e-05, 'epoch': 0.13}
+{'loss': 226.9487, 'grad_norm': 4701.066578875505, 'learning_rate': 1.9636723488424823e-05, 'epoch': 0.13}
+{'loss': 228.3129, 'grad_norm': 3838.4459162380663, 'learning_rate': 1.9635558235888986e-05, 'epoch': 0.13}
+{'loss': 223.776, 'grad_norm': 4092.3172128459737, 'learning_rate': 1.9634391152186638e-05, 'epoch': 0.13}
+{'loss': 217.5706, 'grad_norm': 3295.2631865303083, 'learning_rate': 1.963322223753958e-05, 'epoch': 0.13}
+{'loss': 229.436, 'grad_norm': 3687.0910090162715, 'learning_rate': 1.9632051492169945e-05, 'epoch': 0.13}
+{'loss': 223.6056, 'grad_norm': 3509.1809682462012, 'learning_rate': 1.963087891630023e-05, 'epoch': 0.13}
+{'loss': 222.2059, 'grad_norm': 3852.9530106813404, 'learning_rate': 1.9629704510153277e-05, 'epoch': 0.13}
+{'loss': 226.1808, 'grad_norm': 3325.195251566031, 'learning_rate': 1.962852827395227e-05, 'epoch': 0.13}
+{'loss': 219.2153, 'grad_norm': 3335.264475099688, 'learning_rate': 1.9627350207920745e-05, 'epoch': 0.13}
+{'loss': 223.3656, 'grad_norm': 3160.7330513665006, 'learning_rate': 1.9626170312282582e-05, 'epoch': 0.14}
+{'loss': 233.7219, 'grad_norm': 4111.724557101625, 'learning_rate': 1.9624988587262016e-05, 'epoch': 0.14}
+{'loss': 226.6877, 'grad_norm': 3904.5701032063425, 'learning_rate': 1.9623805033083623e-05, 'epoch': 0.14}
+{'loss': 285.1381, 'grad_norm': 4019.8373620405996, 'learning_rate': 1.9622619649972327e-05, 'epoch': 0.14}
+{'loss': 229.0732, 'grad_norm': 3413.762515836846, 'learning_rate': 1.9621432438153402e-05, 'epoch': 0.14}
+{'loss': 232.5685, 'grad_norm': 3113.5742266697953, 'learning_rate': 1.9620243397852473e-05, 'epoch': 0.14}
+{'loss': 234.6527, 'grad_norm': 3688.7966179937307, 'learning_rate': 1.9619052529295502e-05, 'epoch': 0.14}
+{'loss': 225.182, 'grad_norm': 3807.197315517141, 'learning_rate': 1.961785983270881e-05, 'epoch': 0.14}
+{'loss': 228.5137, 'grad_norm': 3466.6988396282836, 'learning_rate': 1.9616665308319053e-05, 'epoch': 0.14}
+{'loss': 226.7076, 'grad_norm': 4452.46865892179, 'learning_rate': 1.961546895635325e-05, 'epoch': 0.14}
+{'loss': 228.8863, 'grad_norm': 3815.130530440585, 'learning_rate': 1.9614270777038757e-05, 'epoch': 0.14}
+{'loss': 227.074, 'grad_norm': 3871.3547027867917, 'learning_rate': 1.9613070770603272e-05, 'epoch': 0.14}
+{'loss': 226.0283, 'grad_norm': 3076.369861630737, 'learning_rate': 1.961186893727486e-05, 'epoch': 0.14}
+{'loss': 229.999, 'grad_norm': 4494.775702809275, 'learning_rate': 1.9610665277281906e-05, 'epoch': 0.14}
+{'loss': 226.0952, 'grad_norm': 4018.07925409279, 'learning_rate': 1.9609459790853166e-05, 'epoch': 0.14}
+{'loss': 234.6918, 'grad_norm': 5393.03292984181, 'learning_rate': 1.9608252478217735e-05, 'epoch': 0.14}
+{'loss': 230.3861, 'grad_norm': 4379.632301906825, 'learning_rate': 1.9607043339605044e-05, 'epoch': 0.14}
+{'loss': 222.3585, 'grad_norm': 3084.8813819283114, 'learning_rate': 1.9605832375244895e-05, 'epoch': 0.14}
+{'loss': 226.2325, 'grad_norm': 3287.0392355240056, 'learning_rate': 1.9604619585367408e-05, 'epoch': 0.14}
+{'loss': 235.7677, 'grad_norm': 5417.937469291236, 'learning_rate': 1.9603404970203078e-05, 'epoch': 0.14}
+{'loss': 227.7762, 'grad_norm': 3281.877218518662, 'learning_rate': 1.9602188529982725e-05, 'epoch': 0.14}
+{'loss': 218.958, 'grad_norm': 3815.7189222523793, 'learning_rate': 1.9600970264937525e-05, 'epoch': 0.14}
+{'loss': 219.9474, 'grad_norm': 3578.9015738661933, 'learning_rate': 1.9599750175299003e-05, 'epoch': 0.14}
+{'loss': 223.8055, 'grad_norm': 3961.358301189282, 'learning_rate': 1.9598528261299024e-05, 'epoch': 0.14}
+{'loss': 229.0683, 'grad_norm': 4808.305853968544, 'learning_rate': 1.9597304523169812e-05, 'epoch': 0.14}
+{'loss': 218.7515, 'grad_norm': 3945.543823413686, 'learning_rate': 1.959607896114392e-05, 'epoch': 0.14}
+{'loss': 222.7782, 'grad_norm': 3587.6607488937684, 'learning_rate': 1.9594851575454266e-05, 'epoch': 0.14}
+{'loss': 227.1742, 'grad_norm': 4442.574771463477, 'learning_rate': 1.95936223663341e-05, 'epoch': 0.14}
+{'loss': 220.4112, 'grad_norm': 3557.5565782392787, 'learning_rate': 1.9592391334017025e-05, 'epoch': 0.14}
+{'loss': 273.4297, 'grad_norm': 4172.655155673642, 'learning_rate': 1.9591158478736986e-05, 'epoch': 0.14}
+{'loss': 225.2915, 'grad_norm': 3842.762296306671, 'learning_rate': 1.958992380072828e-05, 'epoch': 0.14}
+{'loss': 253.4766, 'grad_norm': 4365.147554947403, 'learning_rate': 1.958868730022556e-05, 'epoch': 0.14}
+{'loss': 226.8476, 'grad_norm': 4778.789790236881, 'learning_rate': 1.9587448977463793e-05, 'epoch': 0.14}
+{'loss': 218.1223, 'grad_norm': 3063.0610848737924, 'learning_rate': 1.9586208832678328e-05, 'epoch': 0.14}
+{'loss': 227.8918, 'grad_norm': 3768.8029001627337, 'learning_rate': 1.9584966866104843e-05, 'epoch': 0.14}
+{'loss': 223.9729, 'grad_norm': 3738.1952584008836, 'learning_rate': 1.958372307797936e-05, 'epoch': 0.14}
+{'loss': 222.1909, 'grad_norm': 3830.6048633832315, 'learning_rate': 1.958247746853825e-05, 'epoch': 0.14}
+{'loss': 224.3266, 'grad_norm': 4012.997111149356, 'learning_rate': 1.9581230038018244e-05, 'epoch': 0.14}
+{'loss': 229.2094, 'grad_norm': 4394.639814810512, 'learning_rate': 1.9579980786656396e-05, 'epoch': 0.14}
+{'loss': 217.9667, 'grad_norm': 3585.041292018165, 'learning_rate': 1.957872971469012e-05, 'epoch': 0.14}
+{'loss': 223.7828, 'grad_norm': 3971.0636068052945, 'learning_rate': 1.9577476822357174e-05, 'epoch': 0.14}
+{'loss': 225.0194, 'grad_norm': 4747.987157242357, 'learning_rate': 1.9576222109895658e-05, 'epoch': 0.14}
+{'loss': 220.7331, 'grad_norm': 4631.660638148578, 'learning_rate': 1.9574965577544024e-05, 'epoch': 0.14}
+{'loss': 223.4653, 'grad_norm': 3758.775413155789, 'learning_rate': 1.9573707225541067e-05, 'epoch': 0.14}
+{'loss': 223.8915, 'grad_norm': 4725.811664433298, 'learning_rate': 1.957244705412592e-05, 'epoch': 0.14}
+{'loss': 220.5884, 'grad_norm': 3218.386031530638, 'learning_rate': 1.957118506353808e-05, 'epoch': 0.14}
+{'loss': 217.8646, 'grad_norm': 3488.688168491564, 'learning_rate': 1.9569921254017376e-05, 'epoch': 0.14}
+{'loss': 217.1623, 'grad_norm': 3360.3982686958125, 'learning_rate': 1.9568655625803982e-05, 'epoch': 0.14}
+{'loss': 221.8964, 'grad_norm': 3851.162754832756, 'learning_rate': 1.956738817913842e-05, 'epoch': 0.14}
+{'loss': 222.037, 'grad_norm': 4087.497308361899, 'learning_rate': 1.9566118914261568e-05, 'epoch': 0.14}
+{'loss': 219.6614, 'grad_norm': 3564.667780356889, 'learning_rate': 1.956484783141463e-05, 'epoch': 0.14}
+{'loss': 216.4149, 'grad_norm': 3733.935203416586, 'learning_rate': 1.9563574930839172e-05, 'epoch': 0.14}
+{'loss': 222.4937, 'grad_norm': 3298.007121709243, 'learning_rate': 1.9562300212777096e-05, 'epoch': 0.15}
+{'loss': 220.8107, 'grad_norm': 4117.178996763251, 'learning_rate': 1.9561023677470657e-05, 'epoch': 0.15}
+{'loss': 230.546, 'grad_norm': 4002.415918754354, 'learning_rate': 1.9559745325162445e-05, 'epoch': 0.15}
+{'loss': 223.6588, 'grad_norm': 4998.032778200501, 'learning_rate': 1.955846515609541e-05, 'epoch': 0.15}
+{'loss': 225.7979, 'grad_norm': 4413.198390068832, 'learning_rate': 1.9557183170512828e-05, 'epoch': 0.15}
+{'loss': 217.1356, 'grad_norm': 3484.2590825741995, 'learning_rate': 1.9555899368658338e-05, 'epoch': 0.15}
+{'loss': 228.1621, 'grad_norm': 3756.518060951499, 'learning_rate': 1.9554613750775916e-05, 'epoch': 0.15}
+{'loss': 218.3505, 'grad_norm': 3418.0416988780507, 'learning_rate': 1.955332631710988e-05, 'epoch': 0.15}
+{'loss': 225.379, 'grad_norm': 3653.8103825325857, 'learning_rate': 1.9552037067904903e-05, 'epoch': 0.15}
+{'loss': 220.6961, 'grad_norm': 3656.384502420465, 'learning_rate': 1.9550746003405996e-05, 'epoch': 0.15}
+{'loss': 216.379, 'grad_norm': 3971.348214345209, 'learning_rate': 1.9549453123858515e-05, 'epoch': 0.15}
+{'loss': 226.594, 'grad_norm': 3481.176205496006, 'learning_rate': 1.9548158429508163e-05, 'epoch': 0.15}
+{'loss': 221.0556, 'grad_norm': 3532.7287956351984, 'learning_rate': 1.9546861920600985e-05, 'epoch': 0.15}
+{'loss': 220.5608, 'grad_norm': 10725.544524804905, 'learning_rate': 1.9545563597383376e-05, 'epoch': 0.15}
+{'loss': 224.55, 'grad_norm': 3607.3297455033335, 'learning_rate': 1.954426346010207e-05, 'epoch': 0.15}
+{'loss': 217.4797, 'grad_norm': 3315.5394771693905, 'learning_rate': 1.9542961509004147e-05, 'epoch': 0.15}
+{'loss': 215.4487, 'grad_norm': 3303.048117216035, 'learning_rate': 1.9541657744337038e-05, 'epoch': 0.15}
+{'loss': 220.1507, 'grad_norm': 3410.626477621558, 'learning_rate': 1.954035216634851e-05, 'epoch': 0.15}
+{'loss': 219.9554, 'grad_norm': 3488.442337655571, 'learning_rate': 1.9539044775286682e-05, 'epoch': 0.15}
+{'loss': 219.0863, 'grad_norm': 3436.9308118915687, 'learning_rate': 1.9537735571400017e-05, 'epoch': 0.15}
+{'loss': 220.2753, 'grad_norm': 3662.8338259870898, 'learning_rate': 1.953642455493731e-05, 'epoch': 0.15}
+{'loss': 222.0348, 'grad_norm': 3256.6027229332612, 'learning_rate': 1.9535111726147714e-05, 'epoch': 0.15}
+{'loss': 222.9547, 'grad_norm': 3340.4565329366196, 'learning_rate': 1.953379708528072e-05, 'epoch': 0.15}
+{'loss': 220.0742, 'grad_norm': 3497.572955551128, 'learning_rate': 1.9532480632586175e-05, 'epoch': 0.15}
+{'loss': 227.6341, 'grad_norm': 4933.339205498537, 'learning_rate': 1.953116236831425e-05, 'epoch': 0.15}
+{'loss': 222.3896, 'grad_norm': 3374.906268396155, 'learning_rate': 1.9529842292715476e-05, 'epoch': 0.15}
+{'loss': 220.4039, 'grad_norm': 2938.38143996361, 'learning_rate': 1.9528520406040726e-05, 'epoch': 0.15}
+{'loss': 216.0359, 'grad_norm': 3390.174559362138, 'learning_rate': 1.9527196708541215e-05, 'epoch': 0.15}
+{'loss': 226.3494, 'grad_norm': 3776.751431276937, 'learning_rate': 1.9525871200468493e-05, 'epoch': 0.15}
+{'loss': 227.0997, 'grad_norm': 56030.20681574775, 'learning_rate': 1.9524543882074473e-05, 'epoch': 0.15}
+{'loss': 221.7106, 'grad_norm': 4145.441998937796, 'learning_rate': 1.9523214753611398e-05, 'epoch': 0.15}
+{'loss': 221.9125, 'grad_norm': 3226.93651372226, 'learning_rate': 1.952188381533186e-05, 'epoch': 0.15}
+{'loss': 221.8524, 'grad_norm': 3646.4619305039732, 'learning_rate': 1.9520551067488793e-05, 'epoch': 0.15}
+{'loss': 219.7521, 'grad_norm': 3522.3298444822844, 'learning_rate': 1.951921651033548e-05, 'epoch': 0.15}
+{'loss': 221.0182, 'grad_norm': 3361.8103938873796, 'learning_rate': 1.9517880144125533e-05, 'epoch': 0.15}
+{'loss': 221.1536, 'grad_norm': 3665.6741655279675, 'learning_rate': 1.9516541969112927e-05, 'epoch': 0.15}
+{'loss': 226.7894, 'grad_norm': 3922.771760318678, 'learning_rate': 1.9515201985551974e-05, 'epoch': 0.15}
+{'loss': 217.7256, 'grad_norm': 3521.076946035968, 'learning_rate': 1.951386019369732e-05, 'epoch': 0.15}
+{'loss': 218.7851, 'grad_norm': 3343.9511799929496, 'learning_rate': 1.951251659380397e-05, 'epoch': 0.15}
+{'loss': 223.5983, 'grad_norm': 3928.244571148715, 'learning_rate': 1.9511171186127265e-05, 'epoch': 0.15}
+{'loss': 225.4505, 'grad_norm': 3396.9384016425633, 'learning_rate': 1.9509823970922883e-05, 'epoch': 0.15}
+{'loss': 221.2968, 'grad_norm': 4070.84346315347, 'learning_rate': 1.950847494844686e-05, 'epoch': 0.15} +{'loss': 215.7083, 'grad_norm': 3962.7574036083515, 'learning_rate': 1.9507124118955563e-05, 'epoch': 0.15} +{'loss': 225.97, 'grad_norm': 3582.3221536817196, 'learning_rate': 1.950577148270571e-05, 'epoch': 0.15} +{'loss': 223.7311, 'grad_norm': 3283.5139789864484, 'learning_rate': 1.9504417039954357e-05, 'epoch': 0.15} +{'loss': 218.7463, 'grad_norm': 3576.0663981971034, 'learning_rate': 1.9503060790958902e-05, 'epoch': 0.15} +{'loss': 222.2488, 'grad_norm': 3834.4045122637244, 'learning_rate': 1.95017027359771e-05, 'epoch': 0.15} +[2025-05-27 22:02:06,598] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step8000 is about to be saved! +[2025-05-27 22:02:06,632] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-8000/global_step8000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-27 22:02:06,632] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-8000/global_step8000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-27 22:02:06,756] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-8000/global_step8000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-27 22:02:06,868] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-8000/global_step8000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-27 22:08:15,387] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-8000/global_step8000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2025-05-27 22:08:15,478] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-8000/global_step8000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-27 22:08:20,778] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step8000 is ready now! +[2025-05-27 22:08:20,966] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step8000 is about to be saved! +[2025-05-27 22:08:20,991] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-8000/global_step8000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-27 22:08:20,991] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-8000/global_step8000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-27 22:08:21,102] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-8000/global_step8000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-27 22:08:21,437] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-8000/global_step8000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-27 22:14:07,814] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-8000/global_step8000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 
+[2025-05-27 22:14:07,890] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-8000/global_step8000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-27 22:14:20,453] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step8000 is ready now! +{'loss': 220.6299, 'grad_norm': 3706.7982701287433, 'learning_rate': 1.9500342875267034e-05, 'epoch': 0.15} +{'loss': 229.1426, 'grad_norm': 4070.699779244305, 'learning_rate': 1.9498981209087134e-05, 'epoch': 0.15} +{'loss': 218.1549, 'grad_norm': 3680.121661405437, 'learning_rate': 1.949761773769618e-05, 'epoch': 0.15} +{'loss': 218.2519, 'grad_norm': 3760.5449790838816, 'learning_rate': 1.949625246135328e-05, 'epoch': 0.15} +{'loss': 215.0864, 'grad_norm': 4394.722739018977, 'learning_rate': 1.9494885380317906e-05, 'epoch': 0.15} +{'loss': 220.1045, 'grad_norm': 3018.453245703441, 'learning_rate': 1.949351649484985e-05, 'epoch': 0.16} +{'loss': 217.4647, 'grad_norm': 3684.498997221069, 'learning_rate': 1.949214580520927e-05, 'epoch': 0.16} +{'loss': 221.6467, 'grad_norm': 4148.622192084587, 'learning_rate': 1.9490773311656648e-05, 'epoch': 0.16} +{'loss': 223.3958, 'grad_norm': 3808.1024454853246, 'learning_rate': 1.948939901445282e-05, 'epoch': 0.16} +{'loss': 224.1006, 'grad_norm': 3468.9150183314646, 'learning_rate': 1.948802291385896e-05, 'epoch': 0.16} +{'loss': 217.8923, 'grad_norm': 3321.1318859093417, 'learning_rate': 1.948664501013658e-05, 'epoch': 0.16} +{'loss': 217.896, 'grad_norm': 3528.2002157885586, 'learning_rate': 1.9485265303547547e-05, 'epoch': 0.16} +{'loss': 222.2764, 'grad_norm': 3683.6278885523293, 'learning_rate': 1.9483883794354063e-05, 'epoch': 0.16} +{'loss': 223.8616, 'grad_norm': 4906.537767605333, 'learning_rate': 1.9482500482818675e-05, 'epoch': 0.16} +{'loss': 213.7224, 'grad_norm': 4230.379644130982, 'learning_rate': 1.948111536920427e-05, 'epoch': 0.16} +{'loss': 227.5846, 'grad_norm': 3155.8822625363055, 'learning_rate': 1.9479728453774075e-05, 'epoch': 0.16} +{'loss': 217.495, 'grad_norm': 3033.9693087390647, 'learning_rate': 1.9478339736791666e-05, 'epoch': 0.16} +{'loss': 214.6403, 'grad_norm': 2981.11006582672, 'learning_rate': 1.947694921852096e-05, 'epoch': 0.16} +{'loss': 216.5648, 'grad_norm': 3801.8641113328576, 'learning_rate': 1.9475556899226213e-05, 'epoch': 0.16} +{'loss': 226.7511, 'grad_norm': 2912.0762109950547, 'learning_rate': 1.9474162779172022e-05, 'epoch': 0.16} +{'loss': 216.2252, 'grad_norm': 3191.5289322993676, 'learning_rate': 1.9472766858623334e-05, 'epoch': 0.16} +{'loss': 227.8032, 'grad_norm': 3155.0396947013514, 'learning_rate': 1.947136913784543e-05, 'epoch': 0.16} +{'loss': 221.7712, 'grad_norm': 3485.2286034575877, 'learning_rate': 1.946996961710394e-05, 'epoch': 0.16} +{'loss': 219.734, 'grad_norm': 3301.173503327511, 'learning_rate': 1.946856829666483e-05, 'epoch': 0.16} +{'loss': 217.1554, 'grad_norm': 3435.2952130806216, 'learning_rate': 1.946716517679441e-05, 'epoch': 0.16} +{'loss': 216.7158, 'grad_norm': 3858.962274524545, 'learning_rate': 1.9465760257759336e-05, 'epoch': 0.16} +{'loss': 220.05, 'grad_norm': 3456.3607607214144, 'learning_rate': 1.9464353539826594e-05, 'epoch': 0.16} +{'loss': 216.1435, 'grad_norm': 3599.2335476945323, 'learning_rate': 1.9462945023263532e-05, 'epoch': 0.16} +{'loss': 224.1727, 'grad_norm': 3264.1911107475985, 'learning_rate': 1.946153470833782e-05, 'epoch': 0.16} +{'loss': 212.3, 'grad_norm': 3608.935650855788, 'learning_rate': 
1.9460122595317483e-05, 'epoch': 0.16} +{'loss': 219.9446, 'grad_norm': 3351.3140198971914, 'learning_rate': 1.9458708684470878e-05, 'epoch': 0.16} +{'loss': 218.87, 'grad_norm': 3606.9326838067614, 'learning_rate': 1.945729297606671e-05, 'epoch': 0.16} +{'loss': 213.9779, 'grad_norm': 3131.800834699377, 'learning_rate': 1.9455875470374027e-05, 'epoch': 0.16} +{'loss': 230.5371, 'grad_norm': 3107.280633494168, 'learning_rate': 1.945445616766221e-05, 'epoch': 0.16} +{'loss': 215.9833, 'grad_norm': 4126.410410779152, 'learning_rate': 1.9453035068200993e-05, 'epoch': 0.16} +{'loss': 221.1171, 'grad_norm': 3756.4406845599974, 'learning_rate': 1.9451612172260438e-05, 'epoch': 0.16} +{'loss': 222.3041, 'grad_norm': 4984.680827203614, 'learning_rate': 1.9450187480110963e-05, 'epoch': 0.16} +{'loss': 223.3037, 'grad_norm': 4805.204168057738, 'learning_rate': 1.9448760992023314e-05, 'epoch': 0.16} +{'loss': 214.0621, 'grad_norm': 3244.2328946746525, 'learning_rate': 1.944733270826859e-05, 'epoch': 0.16} +{'loss': 211.6483, 'grad_norm': 3723.7698616709263, 'learning_rate': 1.9445902629118223e-05, 'epoch': 0.16} +{'loss': 224.13, 'grad_norm': 3564.2860759273894, 'learning_rate': 1.944447075484399e-05, 'epoch': 0.16} +{'loss': 207.2065, 'grad_norm': 3519.3884804494683, 'learning_rate': 1.9443037085718008e-05, 'epoch': 0.16} +{'loss': 220.095, 'grad_norm': 3194.28345762565, 'learning_rate': 1.9441601622012732e-05, 'epoch': 0.16} +{'loss': 215.6691, 'grad_norm': 3556.3882750573443, 'learning_rate': 1.9440164364000966e-05, 'epoch': 0.16} +{'loss': 211.3236, 'grad_norm': 3594.1141357745623, 'learning_rate': 1.9438725311955847e-05, 'epoch': 0.16} +{'loss': 219.8847, 'grad_norm': 3513.2107763512927, 'learning_rate': 1.9437284466150858e-05, 'epoch': 0.16} +{'loss': 217.9009, 'grad_norm': 3139.1885851285156, 'learning_rate': 1.943584182685982e-05, 'epoch': 0.16} +{'loss': 210.9826, 'grad_norm': 3315.8945692508014, 'learning_rate': 1.9434397394356897e-05, 'epoch': 0.16} +{'loss': 217.2167, 'grad_norm': 3116.633900013245, 'learning_rate': 1.9432951168916592e-05, 'epoch': 0.16} +{'loss': 214.8646, 'grad_norm': 3093.774352782947, 'learning_rate': 1.943150315081375e-05, 'epoch': 0.16} +{'loss': 216.1692, 'grad_norm': 3182.356387731097, 'learning_rate': 1.9430053340323554e-05, 'epoch': 0.16} +{'loss': 215.7676, 'grad_norm': 3573.3396916000174, 'learning_rate': 1.9428601737721534e-05, 'epoch': 0.16} +{'loss': 216.9169, 'grad_norm': 3681.3025207395312, 'learning_rate': 1.942714834328355e-05, 'epoch': 0.16} +{'loss': 216.0915, 'grad_norm': 3121.6383183578005, 'learning_rate': 1.9425693157285816e-05, 'epoch': 0.16} +{'loss': 214.0073, 'grad_norm': 3120.9027826837996, 'learning_rate': 1.942423618000487e-05, 'epoch': 0.16} +{'loss': 215.0297, 'grad_norm': 3171.8933142643727, 'learning_rate': 1.9422777411717613e-05, 'epoch': 0.16} +{'loss': 213.6791, 'grad_norm': 3173.8433323782465, 'learning_rate': 1.942131685270126e-05, 'epoch': 0.16} +{'loss': 214.7695, 'grad_norm': 3317.211457974693, 'learning_rate': 1.941985450323339e-05, 'epoch': 0.17} +{'loss': 217.0942, 'grad_norm': 3018.6555586837385, 'learning_rate': 1.9418390363591904e-05, 'epoch': 0.17} +{'loss': 217.5898, 'grad_norm': 3476.222770748071, 'learning_rate': 1.9416924434055057e-05, 'epoch': 0.17} +{'loss': 216.8884, 'grad_norm': 3685.2851837214225, 'learning_rate': 1.9415456714901432e-05, 'epoch': 0.17} +{'loss': 215.2664, 'grad_norm': 2955.6497273681966, 'learning_rate': 1.9413987206409958e-05, 'epoch': 0.17} +{'loss': 214.6376, 'grad_norm': 3196.980875525264, 
'learning_rate': 1.9412515908859914e-05, 'epoch': 0.17} +{'loss': 216.6864, 'grad_norm': 4002.32695088722, 'learning_rate': 1.9411042822530896e-05, 'epoch': 0.17} +{'loss': 218.1302, 'grad_norm': 3197.87794742829, 'learning_rate': 1.940956794770286e-05, 'epoch': 0.17} +{'loss': 221.3189, 'grad_norm': 3709.133386114234, 'learning_rate': 1.94080912846561e-05, 'epoch': 0.17} +{'loss': 224.8487, 'grad_norm': 4438.110622790272, 'learning_rate': 1.9406612833671237e-05, 'epoch': 0.17} +{'loss': 220.8947, 'grad_norm': 3297.897100711684, 'learning_rate': 1.940513259502924e-05, 'epoch': 0.17} +{'loss': 216.0911, 'grad_norm': 3325.673123359973, 'learning_rate': 1.9403650569011423e-05, 'epoch': 0.17} +{'loss': 210.6098, 'grad_norm': 4133.049124757076, 'learning_rate': 1.9402166755899428e-05, 'epoch': 0.17} +{'loss': 219.2792, 'grad_norm': 3217.0507059488104, 'learning_rate': 1.9400681155975246e-05, 'epoch': 0.17} +{'loss': 211.77, 'grad_norm': 3128.818685617956, 'learning_rate': 1.9399193769521206e-05, 'epoch': 0.17} +{'loss': 213.7742, 'grad_norm': 3139.110281820052, 'learning_rate': 1.9397704596819967e-05, 'epoch': 0.17} +{'loss': 214.1365, 'grad_norm': 2887.22881734517, 'learning_rate': 1.9396213638154542e-05, 'epoch': 0.17} +{'loss': 217.7659, 'grad_norm': 3150.389479014825, 'learning_rate': 1.939472089380828e-05, 'epoch': 0.17} +{'loss': 213.8746, 'grad_norm': 3898.7261450979054, 'learning_rate': 1.9393226364064858e-05, 'epoch': 0.17} +{'loss': 217.1316, 'grad_norm': 3361.5759829599046, 'learning_rate': 1.9391730049208303e-05, 'epoch': 0.17} +{'loss': 214.4414, 'grad_norm': 3445.599335474568, 'learning_rate': 1.939023194952298e-05, 'epoch': 0.17} +{'loss': 220.8973, 'grad_norm': 4311.569621706203, 'learning_rate': 1.938873206529359e-05, 'epoch': 0.17} +{'loss': 214.5978, 'grad_norm': 4636.618118676885, 'learning_rate': 1.9387230396805176e-05, 'epoch': 0.17} +{'loss': 214.3214, 'grad_norm': 3346.4544823265924, 'learning_rate': 1.938572694434312e-05, 'epoch': 0.17} +{'loss': 216.2788, 'grad_norm': 3771.3956803346905, 'learning_rate': 1.938422170819314e-05, 'epoch': 0.17} +{'loss': 212.9706, 'grad_norm': 3276.482177928791, 'learning_rate': 1.9382714688641294e-05, 'epoch': 0.17} +{'loss': 207.6655, 'grad_norm': 3064.342382243743, 'learning_rate': 1.9381205885973986e-05, 'epoch': 0.17} +{'loss': 218.3643, 'grad_norm': 3311.9829438471975, 'learning_rate': 1.9379695300477945e-05, 'epoch': 0.17} +{'loss': 208.3625, 'grad_norm': 2963.622459730811, 'learning_rate': 1.937818293244025e-05, 'epoch': 0.17} +{'loss': 216.3914, 'grad_norm': 3396.5888768953696, 'learning_rate': 1.937666878214832e-05, 'epoch': 0.17} +{'loss': 216.1933, 'grad_norm': 3970.940659847001, 'learning_rate': 1.9375152849889897e-05, 'epoch': 0.17} +{'loss': 210.1771, 'grad_norm': 3708.084486860672, 'learning_rate': 1.937363513595308e-05, 'epoch': 0.17} +{'loss': 208.1693, 'grad_norm': 3215.645964766315, 'learning_rate': 1.93721156406263e-05, 'epoch': 0.17} +{'loss': 218.2069, 'grad_norm': 2951.13451086513, 'learning_rate': 1.9370594364198327e-05, 'epoch': 0.17} +{'loss': 214.884, 'grad_norm': 2821.1852862323144, 'learning_rate': 1.9369071306958263e-05, 'epoch': 0.17} +{'loss': 215.6531, 'grad_norm': 3724.6154165489884, 'learning_rate': 1.9367546469195554e-05, 'epoch': 0.17} +{'loss': 217.2322, 'grad_norm': 3690.6987339775023, 'learning_rate': 1.936601985119999e-05, 'epoch': 0.17} +{'loss': 217.1961, 'grad_norm': 3886.057184660892, 'learning_rate': 1.9364491453261692e-05, 'epoch': 0.17} +{'loss': 212.7987, 'grad_norm': 3323.505683698704, 
'learning_rate': 1.9362961275671112e-05, 'epoch': 0.17} +{'loss': 215.6141, 'grad_norm': 3205.205196020728, 'learning_rate': 1.9361429318719064e-05, 'epoch': 0.17} +{'loss': 210.8998, 'grad_norm': 2793.2048393362525, 'learning_rate': 1.935989558269667e-05, 'epoch': 0.17} +{'loss': 212.4875, 'grad_norm': 2812.5799990373375, 'learning_rate': 1.935836006789541e-05, 'epoch': 0.17} +{'loss': 215.3151, 'grad_norm': 3798.3981347479835, 'learning_rate': 1.9356822774607105e-05, 'epoch': 0.17} +{'loss': 208.5641, 'grad_norm': 3157.6712568683465, 'learning_rate': 1.93552837031239e-05, 'epoch': 0.17} +{'loss': 213.2163, 'grad_norm': 3920.6258756611815, 'learning_rate': 1.9353742853738283e-05, 'epoch': 0.17} +{'loss': 209.9862, 'grad_norm': 3300.3533475767026, 'learning_rate': 1.935220022674308e-05, 'epoch': 0.17} +{'loss': 211.0856, 'grad_norm': 2638.5607218700125, 'learning_rate': 1.9350655822431463e-05, 'epoch': 0.17} +{'loss': 212.0048, 'grad_norm': 3569.9963751207824, 'learning_rate': 1.9349109641096922e-05, 'epoch': 0.17} +{'loss': 215.6922, 'grad_norm': 3160.9462240297444, 'learning_rate': 1.9347561683033312e-05, 'epoch': 0.17} +{'loss': 211.5211, 'grad_norm': 3095.374204525049, 'learning_rate': 1.9346011948534804e-05, 'epoch': 0.17} +{'loss': 206.0811, 'grad_norm': 3655.6972522205715, 'learning_rate': 1.934446043789591e-05, 'epoch': 0.17} +{'loss': 221.0348, 'grad_norm': 3706.409582705749, 'learning_rate': 1.934290715141149e-05, 'epoch': 0.17} +{'loss': 221.9713, 'grad_norm': 3115.0456152196252, 'learning_rate': 1.934135208937673e-05, 'epoch': 0.18} +{'loss': 209.7632, 'grad_norm': 3408.956581025094, 'learning_rate': 1.933979525208716e-05, 'epoch': 0.18} +{'loss': 207.1215, 'grad_norm': 3435.814934436475, 'learning_rate': 1.9338236639838645e-05, 'epoch': 0.18} +{'loss': 211.2703, 'grad_norm': 2956.963628001214, 'learning_rate': 1.9336676252927384e-05, 'epoch': 0.18} +{'loss': 207.9188, 'grad_norm': 3322.605953997914, 'learning_rate': 1.9335114091649926e-05, 'epoch': 0.18} +{'loss': 211.511, 'grad_norm': 3274.940035956919, 'learning_rate': 1.933355015630314e-05, 'epoch': 0.18} +{'loss': 219.0684, 'grad_norm': 4623.450349923703, 'learning_rate': 1.9331984447184245e-05, 'epoch': 0.18} +{'loss': 220.0017, 'grad_norm': 3138.64686893907, 'learning_rate': 1.9330416964590787e-05, 'epoch': 0.18} +{'loss': 210.4459, 'grad_norm': 3800.0049338012795, 'learning_rate': 1.932884770882066e-05, 'epoch': 0.18} +{'loss': 206.5503, 'grad_norm': 3697.763728144201, 'learning_rate': 1.9327276680172086e-05, 'epoch': 0.18} +{'loss': 210.4369, 'grad_norm': 2964.3781566670077, 'learning_rate': 1.932570387894363e-05, 'epoch': 0.18} +{'loss': 207.4543, 'grad_norm': 2728.679914532463, 'learning_rate': 1.9324129305434192e-05, 'epoch': 0.18} +{'loss': 212.0695, 'grad_norm': 2644.887293790684, 'learning_rate': 1.9322552959943004e-05, 'epoch': 0.18} +{'loss': 209.9681, 'grad_norm': 3046.9892616205966, 'learning_rate': 1.9320974842769643e-05, 'epoch': 0.18} +{'loss': 217.8736, 'grad_norm': 3474.179707376472, 'learning_rate': 1.9319394954214013e-05, 'epoch': 0.18} +{'loss': 214.5244, 'grad_norm': 2961.747432022854, 'learning_rate': 1.9317813294576364e-05, 'epoch': 0.18} +{'loss': 214.806, 'grad_norm': 3359.6694121836176, 'learning_rate': 1.931622986415728e-05, 'epoch': 0.18} +{'loss': 215.3126, 'grad_norm': 3586.0677189784, 'learning_rate': 1.9314644663257677e-05, 'epoch': 0.18} +{'loss': 210.9496, 'grad_norm': 3584.7491708666166, 'learning_rate': 1.9313057692178813e-05, 'epoch': 0.18} +{'loss': 218.4053, 'grad_norm': 
3951.8405166866, 'learning_rate': 1.9311468951222278e-05, 'epoch': 0.18} +{'loss': 216.2146, 'grad_norm': 3723.150768776975, 'learning_rate': 1.930987844069e-05, 'epoch': 0.18} +{'loss': 219.9992, 'grad_norm': 3329.3186393602177, 'learning_rate': 1.930828616088425e-05, 'epoch': 0.18} +{'loss': 209.025, 'grad_norm': 2814.804774845549, 'learning_rate': 1.9306692112107624e-05, 'epoch': 0.18} +{'loss': 214.4577, 'grad_norm': 4225.0893268524105, 'learning_rate': 1.930509629466306e-05, 'epoch': 0.18} +{'loss': 215.467, 'grad_norm': 4994.266649905392, 'learning_rate': 1.9303498708853824e-05, 'epoch': 0.18} +{'loss': 213.311, 'grad_norm': 3282.6926247939755, 'learning_rate': 1.930189935498354e-05, 'epoch': 0.18} +{'loss': 208.9501, 'grad_norm': 3159.1126909268382, 'learning_rate': 1.9300298233356137e-05, 'epoch': 0.18} +{'loss': 214.4785, 'grad_norm': 3297.7195607445483, 'learning_rate': 1.9298695344275913e-05, 'epoch': 0.18} +{'loss': 234.1151, 'grad_norm': 3567.098626042358, 'learning_rate': 1.9297090688047473e-05, 'epoch': 0.18} +{'loss': 206.9089, 'grad_norm': 3315.770092405832, 'learning_rate': 1.9295484264975773e-05, 'epoch': 0.18} +{'loss': 210.3064, 'grad_norm': 3155.5237007531155, 'learning_rate': 1.9293876075366108e-05, 'epoch': 0.18} +{'loss': 211.415, 'grad_norm': 3026.1572102285786, 'learning_rate': 1.9292266119524096e-05, 'epoch': 0.18} +{'loss': 216.4655, 'grad_norm': 3252.69467420624, 'learning_rate': 1.9290654397755693e-05, 'epoch': 0.18} +{'loss': 208.675, 'grad_norm': 2645.7792458861627, 'learning_rate': 1.9289040910367207e-05, 'epoch': 0.18} +{'loss': 209.4873, 'grad_norm': 3005.7841561124796, 'learning_rate': 1.928742565766526e-05, 'epoch': 0.18} +{'loss': 212.895, 'grad_norm': 5835.72939746647, 'learning_rate': 1.9285808639956823e-05, 'epoch': 0.18} +{'loss': 215.9535, 'grad_norm': 3207.981969518022, 'learning_rate': 1.9284189857549197e-05, 'epoch': 0.18} +{'loss': 209.6967, 'grad_norm': 2968.4462086497842, 'learning_rate': 1.9282569310750018e-05, 'epoch': 0.18} +{'loss': 208.6284, 'grad_norm': 3317.1930852319892, 'learning_rate': 1.9280946999867262e-05, 'epoch': 0.18} +{'loss': 212.039, 'grad_norm': 3127.164089640527, 'learning_rate': 1.9279322925209233e-05, 'epoch': 0.18} +{'loss': 217.0168, 'grad_norm': 3410.3680280219514, 'learning_rate': 1.927769708708458e-05, 'epoch': 0.18} +{'loss': 216.878, 'grad_norm': 3124.541497629778, 'learning_rate': 1.9276069485802274e-05, 'epoch': 0.18} +{'loss': 215.2664, 'grad_norm': 6251.175094941335, 'learning_rate': 1.9274440121671637e-05, 'epoch': 0.18} +{'loss': 233.1537, 'grad_norm': 5079.6997502356935, 'learning_rate': 1.927280899500231e-05, 'epoch': 0.18} +{'loss': 213.0091, 'grad_norm': 3446.813747282093, 'learning_rate': 1.9271176106104284e-05, 'epoch': 0.18} +{'loss': 214.7863, 'grad_norm': 3676.661687881309, 'learning_rate': 1.926954145528787e-05, 'epoch': 0.18} +{'loss': 220.8003, 'grad_norm': 3488.644652383466, 'learning_rate': 1.926790504286373e-05, 'epoch': 0.18} +{'loss': 215.326, 'grad_norm': 4815.202807310507, 'learning_rate': 1.926626686914284e-05, 'epoch': 0.18} +{'loss': 210.7869, 'grad_norm': 4407.439194128647, 'learning_rate': 1.9264626934436535e-05, 'epoch': 0.18} +{'loss': 216.0669, 'grad_norm': 6073.879529402843, 'learning_rate': 1.9262985239056463e-05, 'epoch': 0.18} +{'loss': 210.1223, 'grad_norm': 4034.7380943130906, 'learning_rate': 1.9261341783314623e-05, 'epoch': 0.18} +{'loss': 211.8261, 'grad_norm': 4386.591822881566, 'learning_rate': 1.9259696567523337e-05, 'epoch': 0.18} +{'loss': 203.9493, 'grad_norm': 
3146.510254682276, 'learning_rate': 1.9258049591995272e-05, 'epoch': 0.19} +{'loss': 212.0474, 'grad_norm': 3443.200643034272, 'learning_rate': 1.9256400857043417e-05, 'epoch': 0.19} +{'loss': 218.5846, 'grad_norm': 3664.410396400268, 'learning_rate': 1.9254750362981106e-05, 'epoch': 0.19} +{'loss': 208.403, 'grad_norm': 2765.4924945128623, 'learning_rate': 1.9253098110121996e-05, 'epoch': 0.19} +{'loss': 212.0068, 'grad_norm': 2971.2141496686313, 'learning_rate': 1.9251444098780095e-05, 'epoch': 0.19} +{'loss': 209.7742, 'grad_norm': 3657.4224504902195, 'learning_rate': 1.924978832926973e-05, 'epoch': 0.19} +{'loss': 211.6122, 'grad_norm': 2970.5051887284767, 'learning_rate': 1.924813080190557e-05, 'epoch': 0.19} +{'loss': 205.1908, 'grad_norm': 3207.2775687647136, 'learning_rate': 1.9246471517002616e-05, 'epoch': 0.19} +{'loss': 211.5368, 'grad_norm': 3044.139237958847, 'learning_rate': 1.92448104748762e-05, 'epoch': 0.19} +{'loss': 205.8543, 'grad_norm': 3255.3694023899116, 'learning_rate': 1.9243147675841992e-05, 'epoch': 0.19} +{'loss': 210.2673, 'grad_norm': 3226.6040070036133, 'learning_rate': 1.9241483120216e-05, 'epoch': 0.19} +{'loss': 213.3646, 'grad_norm': 3120.6392417402685, 'learning_rate': 1.923981680831455e-05, 'epoch': 0.19} +{'loss': 216.7053, 'grad_norm': 3047.0681306872443, 'learning_rate': 1.9238148740454325e-05, 'epoch': 0.19} +{'loss': 208.2915, 'grad_norm': 3097.493539048096, 'learning_rate': 1.9236478916952316e-05, 'epoch': 0.19} +{'loss': 210.4774, 'grad_norm': 2971.456133116101, 'learning_rate': 1.9234807338125866e-05, 'epoch': 0.19} +{'loss': 210.7702, 'grad_norm': 2790.1093911708244, 'learning_rate': 1.9233134004292648e-05, 'epoch': 0.19} +{'loss': 204.0078, 'grad_norm': 3730.2129139190483, 'learning_rate': 1.9231458915770666e-05, 'epoch': 0.19} +{'loss': 210.4337, 'grad_norm': 2808.4827671776543, 'learning_rate': 1.9229782072878258e-05, 'epoch': 0.19} +{'loss': 204.9517, 'grad_norm': 2870.0484569848622, 'learning_rate': 1.9228103475934096e-05, 'epoch': 0.19} +{'loss': 211.8062, 'grad_norm': 2669.933899586942, 'learning_rate': 1.922642312525718e-05, 'epoch': 0.19} +{'loss': 208.5717, 'grad_norm': 2711.7352030645397, 'learning_rate': 1.9224741021166857e-05, 'epoch': 0.19} +{'loss': 208.6384, 'grad_norm': 2941.144346804086, 'learning_rate': 1.922305716398279e-05, 'epoch': 0.19} +{'loss': 210.9995, 'grad_norm': 2981.1279600213606, 'learning_rate': 1.922137155402499e-05, 'epoch': 0.19} +{'loss': 203.7964, 'grad_norm': 2851.368790202186, 'learning_rate': 1.921968419161379e-05, 'epoch': 0.19} +{'loss': 219.3734, 'grad_norm': 3269.9825401743433, 'learning_rate': 1.9217995077069865e-05, 'epoch': 0.19} +{'loss': 211.7333, 'grad_norm': 3291.8670912664165, 'learning_rate': 1.9216304210714213e-05, 'epoch': 0.19} +{'loss': 220.3333, 'grad_norm': 2970.502507185408, 'learning_rate': 1.9214611592868176e-05, 'epoch': 0.19} +{'loss': 208.3308, 'grad_norm': 3215.771551892869, 'learning_rate': 1.9212917223853424e-05, 'epoch': 0.19} +{'loss': 210.9635, 'grad_norm': 2974.6349060131624, 'learning_rate': 1.9211221103991953e-05, 'epoch': 0.19} +{'loss': 209.7562, 'grad_norm': 2766.0635816357185, 'learning_rate': 1.9209523233606102e-05, 'epoch': 0.19} +{'loss': 211.8707, 'grad_norm': 2866.9035910342473, 'learning_rate': 1.9207823613018543e-05, 'epoch': 0.19} +{'loss': 212.219, 'grad_norm': 3138.6578008882775, 'learning_rate': 1.9206122242552268e-05, 'epoch': 0.19} +{'loss': 210.8612, 'grad_norm': 3286.697299974783, 'learning_rate': 1.9204419122530614e-05, 'epoch': 0.19} +{'loss': 
243.9861, 'grad_norm': 3450.893189268755, 'learning_rate': 1.9202714253277247e-05, 'epoch': 0.19} +{'loss': 217.6377, 'grad_norm': 3436.1672283694606, 'learning_rate': 1.9201007635116163e-05, 'epoch': 0.19} +{'loss': 210.8959, 'grad_norm': 3363.882752215118, 'learning_rate': 1.919929926837169e-05, 'epoch': 0.19} +{'loss': 209.3545, 'grad_norm': 3107.566270745, 'learning_rate': 1.91975891533685e-05, 'epoch': 0.19} +{'loss': 213.5087, 'grad_norm': 3540.9278811573736, 'learning_rate': 1.919587729043158e-05, 'epoch': 0.19} +{'loss': 216.3508, 'grad_norm': 5079.234308830727, 'learning_rate': 1.9194163679886254e-05, 'epoch': 0.19} +[2025-05-28 00:02:58,621] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step10000 is about to be saved! +[2025-05-28 00:02:58,634] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-10000/global_step10000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-28 00:02:58,634] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-10000/global_step10000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-28 00:02:58,698] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-10000/global_step10000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-28 00:02:58,895] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-10000/global_step10000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-28 00:08:52,577] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-10000/global_step10000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2025-05-28 00:08:52,623] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-10000/global_step10000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-28 00:09:00,741] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step10000 is ready now! +[2025-05-28 00:09:00,773] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step10000 is about to be saved! +[2025-05-28 00:09:00,795] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-10000/global_step10000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-28 00:09:00,795] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-10000/global_step10000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-28 00:09:00,877] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-10000/global_step10000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-28 00:09:00,996] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-10000/global_step10000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-28 00:14:59,719] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-10000/global_step10000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 
+[2025-05-28 00:14:59,749] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-10000/global_step10000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-28 00:14:59,759] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step10000 is ready now! +{'loss': 214.4858, 'grad_norm': 3577.705357084135, 'learning_rate': 1.9192448322058187e-05, 'epoch': 0.19} +{'loss': 213.0751, 'grad_norm': 2994.6064818563136, 'learning_rate': 1.9190731217273367e-05, 'epoch': 0.19} +{'loss': 209.6479, 'grad_norm': 2710.869841204205, 'learning_rate': 1.9189012365858123e-05, 'epoch': 0.19} +{'loss': 216.1309, 'grad_norm': 3598.5157609357707, 'learning_rate': 1.91872917681391e-05, 'epoch': 0.19} +{'loss': 212.0023, 'grad_norm': 3043.7220706663948, 'learning_rate': 1.918556942444329e-05, 'epoch': 0.19} +{'loss': 204.9666, 'grad_norm': 3000.311188942744, 'learning_rate': 1.9183845335098015e-05, 'epoch': 0.19} +{'loss': 211.219, 'grad_norm': 3487.385631830371, 'learning_rate': 1.918211950043092e-05, 'epoch': 0.19} +{'loss': 205.8094, 'grad_norm': 2968.0522731250685, 'learning_rate': 1.9180391920769993e-05, 'epoch': 0.19} +{'loss': 204.775, 'grad_norm': 2726.366233861564, 'learning_rate': 1.9178662596443538e-05, 'epoch': 0.19} +{'loss': 207.2328, 'grad_norm': 2783.3111024476443, 'learning_rate': 1.917693152778021e-05, 'epoch': 0.19} +{'loss': 209.1291, 'grad_norm': 2987.0561011715513, 'learning_rate': 1.917519871510898e-05, 'epoch': 0.19} +{'loss': 210.7403, 'grad_norm': 3887.54888140859, 'learning_rate': 1.917346415875916e-05, 'epoch': 0.19} +{'loss': 210.6292, 'grad_norm': 3463.734321449105, 'learning_rate': 1.9171727859060383e-05, 'epoch': 0.19} +{'loss': 209.4062, 'grad_norm': 4289.658214396677, 'learning_rate': 1.916998981634263e-05, 'epoch': 0.2} +{'loss': 204.2838, 'grad_norm': 2925.392282456496, 'learning_rate': 1.9168250030936195e-05, 'epoch': 0.2} +{'loss': 207.7687, 'grad_norm': 3242.1895164669336, 'learning_rate': 1.916650850317172e-05, 'epoch': 0.2} +{'loss': 214.9401, 'grad_norm': 3273.6881380049467, 'learning_rate': 1.9164765233380156e-05, 'epoch': 0.2} +{'loss': 210.0387, 'grad_norm': 3050.579384730209, 'learning_rate': 1.9163020221892807e-05, 'epoch': 0.2} +{'loss': 209.6089, 'grad_norm': 2961.556990163932, 'learning_rate': 1.9161273469041305e-05, 'epoch': 0.2} +{'loss': 206.9452, 'grad_norm': 3373.5152744802376, 'learning_rate': 1.9159524975157594e-05, 'epoch': 0.2} +{'loss': 207.4028, 'grad_norm': 3077.309267835709, 'learning_rate': 1.9157774740573974e-05, 'epoch': 0.2} +{'loss': 212.8778, 'grad_norm': 2791.7703777116494, 'learning_rate': 1.9156022765623057e-05, 'epoch': 0.2} +{'loss': 212.1535, 'grad_norm': 2638.6071993741402, 'learning_rate': 1.9154269050637797e-05, 'epoch': 0.2} +{'loss': 207.5585, 'grad_norm': 3214.349712753819, 'learning_rate': 1.9152513595951472e-05, 'epoch': 0.2} +{'loss': 201.1981, 'grad_norm': 2739.8443792912863, 'learning_rate': 1.9150756401897695e-05, 'epoch': 0.2} +{'loss': 207.2977, 'grad_norm': 2774.419294280353, 'learning_rate': 1.9148997468810406e-05, 'epoch': 0.2} +{'loss': 215.2148, 'grad_norm': 2925.561130294651, 'learning_rate': 1.914723679702388e-05, 'epoch': 0.2} +{'loss': 205.7435, 'grad_norm': 3242.8966325860993, 'learning_rate': 1.9145474386872714e-05, 'epoch': 0.2} +{'loss': 206.1994, 'grad_norm': 3401.44344507642, 'learning_rate': 1.9143710238691847e-05, 'epoch': 0.2} +{'loss': 210.215, 'grad_norm': 3080.1733980874656, 'learning_rate': 
1.9141944352816542e-05, 'epoch': 0.2} +{'loss': 207.5571, 'grad_norm': 3291.4272467555775, 'learning_rate': 1.9140176729582388e-05, 'epoch': 0.2} +{'loss': 206.1642, 'grad_norm': 3303.8728042676457, 'learning_rate': 1.913840736932531e-05, 'epoch': 0.2} +{'loss': 211.707, 'grad_norm': 3242.1450323267545, 'learning_rate': 1.9136636272381568e-05, 'epoch': 0.2} +{'loss': 206.3439, 'grad_norm': 2846.0009686536378, 'learning_rate': 1.913486343908774e-05, 'epoch': 0.2} +{'loss': 215.949, 'grad_norm': 2895.040405697638, 'learning_rate': 1.9133088869780737e-05, 'epoch': 0.2} +{'loss': 206.7912, 'grad_norm': 3114.93310997067, 'learning_rate': 1.9131312564797805e-05, 'epoch': 0.2} +{'loss': 208.0032, 'grad_norm': 2936.3603048096893, 'learning_rate': 1.9129534524476526e-05, 'epoch': 0.2} +{'loss': 220.6171, 'grad_norm': 2959.9702205383333, 'learning_rate': 1.912775474915479e-05, 'epoch': 0.2} +{'loss': 206.5639, 'grad_norm': 2885.2868225127527, 'learning_rate': 1.9125973239170844e-05, 'epoch': 0.2} +{'loss': 211.0418, 'grad_norm': 2704.7863554392798, 'learning_rate': 1.9124189994863238e-05, 'epoch': 0.2} +{'loss': 205.8001, 'grad_norm': 2944.669024933193, 'learning_rate': 1.912240501657087e-05, 'epoch': 0.2} +{'loss': 213.1391, 'grad_norm': 3385.951072256468, 'learning_rate': 1.9120618304632964e-05, 'epoch': 0.2} +{'loss': 202.7192, 'grad_norm': 3439.438304862569, 'learning_rate': 1.9118829859389067e-05, 'epoch': 0.2} +{'loss': 205.2696, 'grad_norm': 2883.125630605956, 'learning_rate': 1.9117039681179065e-05, 'epoch': 0.2} +{'loss': 205.3947, 'grad_norm': 3095.529099524836, 'learning_rate': 1.911524777034316e-05, 'epoch': 0.2} +{'loss': 203.6201, 'grad_norm': 2712.6862932957793, 'learning_rate': 1.9113454127221902e-05, 'epoch': 0.2} +{'loss': 206.0224, 'grad_norm': 3122.281787197002, 'learning_rate': 1.9111658752156154e-05, 'epoch': 0.2} +{'loss': 210.0275, 'grad_norm': 3312.088545966391, 'learning_rate': 1.910986164548711e-05, 'epoch': 0.2} +{'loss': 209.9945, 'grad_norm': 3418.8251372800237, 'learning_rate': 1.91080628075563e-05, 'epoch': 0.2} +{'loss': 205.8996, 'grad_norm': 2865.049942718114, 'learning_rate': 1.9106262238705583e-05, 'epoch': 0.2} +{'loss': 203.8597, 'grad_norm': 2614.9858164758284, 'learning_rate': 1.910445993927714e-05, 'epoch': 0.2} +{'loss': 206.4704, 'grad_norm': 2889.547060969425, 'learning_rate': 1.910265590961349e-05, 'epoch': 0.2} +{'loss': 208.3956, 'grad_norm': 3654.4918216130814, 'learning_rate': 1.9100850150057465e-05, 'epoch': 0.2} +{'loss': 208.3627, 'grad_norm': 4136.363110090676, 'learning_rate': 1.9099042660952246e-05, 'epoch': 0.2} +{'loss': 211.0996, 'grad_norm': 3469.186653469185, 'learning_rate': 1.909723344264133e-05, 'epoch': 0.2} +{'loss': 204.5411, 'grad_norm': 3273.8624599960926, 'learning_rate': 1.909542249546854e-05, 'epoch': 0.2} +{'loss': 203.4677, 'grad_norm': 3141.565189854574, 'learning_rate': 1.9093609819778044e-05, 'epoch': 0.2} +{'loss': 204.3571, 'grad_norm': 2636.419455481282, 'learning_rate': 1.909179541591432e-05, 'epoch': 0.2} +{'loss': 200.08, 'grad_norm': 3081.4269188801636, 'learning_rate': 1.9089979284222184e-05, 'epoch': 0.2} +{'loss': 200.4468, 'grad_norm': 3228.167507396976, 'learning_rate': 1.908816142504678e-05, 'epoch': 0.2} +{'loss': 217.2847, 'grad_norm': 2877.5045974660347, 'learning_rate': 1.9086341838733572e-05, 'epoch': 0.2} +{'loss': 207.7909, 'grad_norm': 2895.4891062719284, 'learning_rate': 1.9084520525628367e-05, 'epoch': 0.2} +{'loss': 205.8746, 'grad_norm': 3391.9474439592204, 'learning_rate': 1.9082697486077288e-05, 
'epoch': 0.2} +{'loss': 202.7255, 'grad_norm': 2798.033311006501, 'learning_rate': 1.9080872720426793e-05, 'epoch': 0.2} +{'loss': 205.1083, 'grad_norm': 2809.5444637326427, 'learning_rate': 1.9079046229023662e-05, 'epoch': 0.2} +{'loss': 202.2626, 'grad_norm': 2592.904356259964, 'learning_rate': 1.9077218012215005e-05, 'epoch': 0.21} +{'loss': 203.3281, 'grad_norm': 2770.031254469229, 'learning_rate': 1.9075388070348266e-05, 'epoch': 0.21} +{'loss': 205.8022, 'grad_norm': 2741.285420156615, 'learning_rate': 1.9073556403771207e-05, 'epoch': 0.21} +{'loss': 204.4499, 'grad_norm': 3327.11887521683, 'learning_rate': 1.9071723012831928e-05, 'epoch': 0.21} +{'loss': 205.4007, 'grad_norm': 2999.3074659273116, 'learning_rate': 1.9069887897878847e-05, 'epoch': 0.21} +{'loss': 204.2333, 'grad_norm': 3394.0076287474926, 'learning_rate': 1.9068051059260716e-05, 'epoch': 0.21} +{'loss': 203.858, 'grad_norm': 2710.564898622385, 'learning_rate': 1.906621249732661e-05, 'epoch': 0.21} +{'loss': 204.3325, 'grad_norm': 3679.41420824008, 'learning_rate': 1.9064372212425936e-05, 'epoch': 0.21} +{'loss': 203.1635, 'grad_norm': 2826.273597831231, 'learning_rate': 1.906253020490843e-05, 'epoch': 0.21} +{'loss': 206.0885, 'grad_norm': 5249.989548166724, 'learning_rate': 1.906068647512415e-05, 'epoch': 0.21} +{'loss': 209.315, 'grad_norm': 3946.335262558395, 'learning_rate': 1.9058841023423476e-05, 'epoch': 0.21} +{'loss': 202.0749, 'grad_norm': 3172.427079689883, 'learning_rate': 1.905699385015713e-05, 'epoch': 0.21} +{'loss': 198.9818, 'grad_norm': 2788.6193646357356, 'learning_rate': 1.905514495567615e-05, 'epoch': 0.21} +{'loss': 201.1612, 'grad_norm': 4341.497694522555, 'learning_rate': 1.905329434033191e-05, 'epoch': 0.21} +{'loss': 210.9994, 'grad_norm': 4673.97392735956, 'learning_rate': 1.9051442004476102e-05, 'epoch': 0.21} +{'loss': 202.0415, 'grad_norm': 3533.27923479497, 'learning_rate': 1.9049587948460747e-05, 'epoch': 0.21} +{'loss': 204.0941, 'grad_norm': 2598.4900520831884, 'learning_rate': 1.9047732172638194e-05, 'epoch': 0.21} +{'loss': 206.0082, 'grad_norm': 2900.1619440028667, 'learning_rate': 1.9045874677361127e-05, 'epoch': 0.21} +{'loss': 209.1542, 'grad_norm': 3733.281251716953, 'learning_rate': 1.9044015462982544e-05, 'epoch': 0.21} +{'loss': 206.9515, 'grad_norm': 3162.60847737124, 'learning_rate': 1.904215452985577e-05, 'epoch': 0.21} +{'loss': 206.1037, 'grad_norm': 3672.7979549316096, 'learning_rate': 1.904029187833447e-05, 'epoch': 0.21} +{'loss': 204.9674, 'grad_norm': 3425.3811970669376, 'learning_rate': 1.9038427508772624e-05, 'epoch': 0.21} +{'loss': 203.4358, 'grad_norm': 3243.671637127308, 'learning_rate': 1.9036561421524543e-05, 'epoch': 0.21} +{'loss': 196.2052, 'grad_norm': 2964.1467382141977, 'learning_rate': 1.903469361694486e-05, 'epoch': 0.21} +{'loss': 201.3291, 'grad_norm': 3061.8533868212835, 'learning_rate': 1.9032824095388538e-05, 'epoch': 0.21} +{'loss': 204.1863, 'grad_norm': 3057.649638132211, 'learning_rate': 1.9030952857210868e-05, 'epoch': 0.21} +{'loss': 208.1499, 'grad_norm': 3483.8434536254867, 'learning_rate': 1.902907990276746e-05, 'epoch': 0.21} +{'loss': 196.9422, 'grad_norm': 3076.603143249958, 'learning_rate': 1.9027205232414264e-05, 'epoch': 0.21} +{'loss': 201.319, 'grad_norm': 2913.3811760855365, 'learning_rate': 1.9025328846507542e-05, 'epoch': 0.21} +{'loss': 201.0039, 'grad_norm': 2699.72415823641, 'learning_rate': 1.9023450745403883e-05, 'epoch': 0.21} +{'loss': 206.2109, 'grad_norm': 3408.526711427518, 'learning_rate': 1.9021570929460213e-05, 
'epoch': 0.21} +{'loss': 201.9555, 'grad_norm': 3105.8178513435196, 'learning_rate': 1.9019689399033772e-05, 'epoch': 0.21} +{'loss': 207.8628, 'grad_norm': 3157.3918435745295, 'learning_rate': 1.9017806154482136e-05, 'epoch': 0.21} +{'loss': 198.4115, 'grad_norm': 4063.5131404577596, 'learning_rate': 1.90159211961632e-05, 'epoch': 0.21} +{'loss': 203.3332, 'grad_norm': 3327.1866853697234, 'learning_rate': 1.9014034524435185e-05, 'epoch': 0.21} +{'loss': 205.5538, 'grad_norm': 2889.735340929076, 'learning_rate': 1.9012146139656637e-05, 'epoch': 0.21} +{'loss': 201.3261, 'grad_norm': 2798.9271814531185, 'learning_rate': 1.9010256042186435e-05, 'epoch': 0.21} +{'loss': 199.7871, 'grad_norm': 3246.67996979804, 'learning_rate': 1.9008364232383774e-05, 'epoch': 0.21} +{'loss': 204.0156, 'grad_norm': 3071.928354969819, 'learning_rate': 1.900647071060818e-05, 'epoch': 0.21} +{'loss': 207.9242, 'grad_norm': 3118.6477936586407, 'learning_rate': 1.9004575477219504e-05, 'epoch': 0.21} +{'loss': 207.0485, 'grad_norm': 2934.6154858013533, 'learning_rate': 1.9002678532577915e-05, 'epoch': 0.21} +{'loss': 205.4483, 'grad_norm': 2840.7604340418156, 'learning_rate': 1.900077987704392e-05, 'epoch': 0.21} +{'loss': 203.1049, 'grad_norm': 3171.9944822328875, 'learning_rate': 1.8998879510978342e-05, 'epoch': 0.21} +{'loss': 199.9649, 'grad_norm': 3058.119327480242, 'learning_rate': 1.8996977434742332e-05, 'epoch': 0.21} +{'loss': 198.7068, 'grad_norm': 2688.4176188143033, 'learning_rate': 1.8995073648697367e-05, 'epoch': 0.21} +{'loss': 208.6459, 'grad_norm': 2687.0319689973203, 'learning_rate': 1.8993168153205243e-05, 'epoch': 0.21} +{'loss': 206.7298, 'grad_norm': 2968.055174854683, 'learning_rate': 1.8991260948628088e-05, 'epoch': 0.21} +{'loss': 200.4709, 'grad_norm': 2944.079370070142, 'learning_rate': 1.8989352035328352e-05, 'epoch': 0.21} +{'loss': 207.3578, 'grad_norm': 3346.4628362136255, 'learning_rate': 1.8987441413668814e-05, 'epoch': 0.21} +{'loss': 208.0874, 'grad_norm': 2406.4329557506603, 'learning_rate': 1.8985529084012562e-05, 'epoch': 0.21} +{'loss': 208.0449, 'grad_norm': 2946.1022956199945, 'learning_rate': 1.8983615046723032e-05, 'epoch': 0.21} +{'loss': 201.7798, 'grad_norm': 2999.262141776227, 'learning_rate': 1.8981699302163966e-05, 'epoch': 0.21} +{'loss': 207.0367, 'grad_norm': 3071.9899587901164, 'learning_rate': 1.897978185069944e-05, 'epoch': 0.22} +{'loss': 202.8816, 'grad_norm': 3035.7290252529683, 'learning_rate': 1.8977862692693854e-05, 'epoch': 0.22} +{'loss': 200.6476, 'grad_norm': 2467.5198419386084, 'learning_rate': 1.8975941828511923e-05, 'epoch': 0.22} +{'loss': 209.1696, 'grad_norm': 2779.303675573989, 'learning_rate': 1.8974019258518698e-05, 'epoch': 0.22} +{'loss': 203.7821, 'grad_norm': 3000.9566078092544, 'learning_rate': 1.897209498307955e-05, 'epoch': 0.22} +{'loss': 202.4104, 'grad_norm': 3212.1129719819287, 'learning_rate': 1.8970169002560162e-05, 'epoch': 0.22} +{'loss': 202.3748, 'grad_norm': 3021.7452127917, 'learning_rate': 1.8968241317326567e-05, 'epoch': 0.22} +{'loss': 207.2808, 'grad_norm': 3525.583188423247, 'learning_rate': 1.8966311927745102e-05, 'epoch': 0.22} +{'loss': 219.6228, 'grad_norm': 3351.6562614308577, 'learning_rate': 1.8964380834182434e-05, 'epoch': 0.22} +{'loss': 204.1775, 'grad_norm': 3098.036148304903, 'learning_rate': 1.896244803700555e-05, 'epoch': 0.22} +{'loss': 207.8244, 'grad_norm': 3595.7652663251383, 'learning_rate': 1.8960513536581763e-05, 'epoch': 0.22} +{'loss': 204.2542, 'grad_norm': 3771.1283461039397, 'learning_rate': 
1.8958577333278715e-05, 'epoch': 0.22} +{'loss': 203.6065, 'grad_norm': 5197.101271376194, 'learning_rate': 1.8956639427464366e-05, 'epoch': 0.22} +{'loss': 207.0274, 'grad_norm': 3292.1505633620736, 'learning_rate': 1.8954699819507e-05, 'epoch': 0.22} +{'loss': 199.131, 'grad_norm': 2711.3976556162816, 'learning_rate': 1.8952758509775227e-05, 'epoch': 0.22} +{'loss': 200.451, 'grad_norm': 2835.4334010925822, 'learning_rate': 1.895081549863797e-05, 'epoch': 0.22} +{'loss': 205.0763, 'grad_norm': 3266.341579799147, 'learning_rate': 1.8948870786464496e-05, 'epoch': 0.22} +{'loss': 205.4304, 'grad_norm': 2900.676675077509, 'learning_rate': 1.8946924373624375e-05, 'epoch': 0.22} +{'loss': 205.6756, 'grad_norm': 3191.2203718061755, 'learning_rate': 1.894497626048751e-05, 'epoch': 0.22} +{'loss': 206.0852, 'grad_norm': 2889.9077495948495, 'learning_rate': 1.894302644742413e-05, 'epoch': 0.22} +{'loss': 205.6657, 'grad_norm': 3893.419514763722, 'learning_rate': 1.8941074934804778e-05, 'epoch': 0.22} +{'loss': 204.1437, 'grad_norm': 2884.275221386927, 'learning_rate': 1.8939121723000324e-05, 'epoch': 0.22} +{'loss': 201.099, 'grad_norm': 2616.769538309673, 'learning_rate': 1.8937166812381964e-05, 'epoch': 0.22} +{'loss': 200.4789, 'grad_norm': 3218.166224472989, 'learning_rate': 1.893521020332121e-05, 'epoch': 0.22} +{'loss': 210.0728, 'grad_norm': 2913.311005713319, 'learning_rate': 1.8933251896189908e-05, 'epoch': 0.22} +{'loss': 197.6233, 'grad_norm': 2946.3804588230964, 'learning_rate': 1.8931291891360216e-05, 'epoch': 0.22} +{'loss': 198.5118, 'grad_norm': 3264.8125065760423, 'learning_rate': 1.8929330189204616e-05, 'epoch': 0.22} +{'loss': 202.188, 'grad_norm': 2790.6472761983596, 'learning_rate': 1.892736679009592e-05, 'epoch': 0.22} +{'loss': 204.4376, 'grad_norm': 3659.254597875111, 'learning_rate': 1.8925401694407252e-05, 'epoch': 0.22} +{'loss': 211.3125, 'grad_norm': 3373.286072766464, 'learning_rate': 1.892343490251207e-05, 'epoch': 0.22} +{'loss': 200.7769, 'grad_norm': 2618.053973775699, 'learning_rate': 1.892146641478414e-05, 'epoch': 0.22} +{'loss': 203.8845, 'grad_norm': 3047.016909659467, 'learning_rate': 1.8919496231597568e-05, 'epoch': 0.22} +{'loss': 201.9397, 'grad_norm': 2970.998281149502, 'learning_rate': 1.8917524353326765e-05, 'epoch': 0.22} +{'loss': 201.7112, 'grad_norm': 2638.310980707981, 'learning_rate': 1.891555078034648e-05, 'epoch': 0.22} +{'loss': 212.1056, 'grad_norm': 2547.4726678066777, 'learning_rate': 1.891357551303177e-05, 'epoch': 0.22} +{'loss': 201.62, 'grad_norm': 3469.397553644401, 'learning_rate': 1.8911598551758016e-05, 'epoch': 0.22} +{'loss': 202.2212, 'grad_norm': 3057.679743541162, 'learning_rate': 1.8909619896900933e-05, 'epoch': 0.22} +{'loss': 210.8733, 'grad_norm': 3320.514441680626, 'learning_rate': 1.8907639548836548e-05, 'epoch': 0.22} +{'loss': 202.0173, 'grad_norm': 3159.5847494383015, 'learning_rate': 1.8905657507941206e-05, 'epoch': 0.22} +{'loss': 212.8544, 'grad_norm': 4916.759528235161, 'learning_rate': 1.8903673774591585e-05, 'epoch': 0.22} +{'loss': 198.508, 'grad_norm': 3125.0343128211384, 'learning_rate': 1.8901688349164677e-05, 'epoch': 0.22} +{'loss': 198.0856, 'grad_norm': 2743.078366078883, 'learning_rate': 1.88997012320378e-05, 'epoch': 0.22} +{'loss': 193.1044, 'grad_norm': 3089.846356494397, 'learning_rate': 1.8897712423588586e-05, 'epoch': 0.22} +{'loss': 201.7153, 'grad_norm': 2812.4900417725576, 'learning_rate': 1.8895721924194993e-05, 'epoch': 0.22} +{'loss': 196.6622, 'grad_norm': 3263.3653313475006, 
'learning_rate': 1.889372973423531e-05, 'epoch': 0.22} +{'loss': 205.3196, 'grad_norm': 3004.036239718872, 'learning_rate': 1.889173585408813e-05, 'epoch': 0.22} +{'loss': 198.9438, 'grad_norm': 2620.487038092604, 'learning_rate': 1.888974028413237e-05, 'epoch': 0.22} +{'loss': 202.5917, 'grad_norm': 2994.878123525883, 'learning_rate': 1.8887743024747284e-05, 'epoch': 0.22} +{'loss': 203.6372, 'grad_norm': 3341.281861927326, 'learning_rate': 1.8885744076312432e-05, 'epoch': 0.22} +{'loss': 198.4956, 'grad_norm': 3092.5345755674844, 'learning_rate': 1.88837434392077e-05, 'epoch': 0.22} +{'loss': 205.0858, 'grad_norm': 2988.8305411225974, 'learning_rate': 1.888174111381329e-05, 'epoch': 0.22} +{'loss': 206.724, 'grad_norm': 2639.986103752825, 'learning_rate': 1.887973710050974e-05, 'epoch': 0.22} +{'loss': 202.3443, 'grad_norm': 3345.925463673157, 'learning_rate': 1.8877731399677885e-05, 'epoch': 0.23} +{'loss': 199.7393, 'grad_norm': 2822.852637587545, 'learning_rate': 1.8875724011698902e-05, 'epoch': 0.23} +{'loss': 200.8981, 'grad_norm': 2812.1066029705357, 'learning_rate': 1.8873714936954277e-05, 'epoch': 0.23} +{'loss': 202.3013, 'grad_norm': 2845.004958278338, 'learning_rate': 1.887170417582582e-05, 'epoch': 0.23} +{'loss': 207.8749, 'grad_norm': 2920.081805140274, 'learning_rate': 1.886969172869566e-05, 'epoch': 0.23} +{'loss': 200.2214, 'grad_norm': 2817.3321973885572, 'learning_rate': 1.8867677595946252e-05, 'epoch': 0.23} +{'loss': 202.5422, 'grad_norm': 2860.7734331119696, 'learning_rate': 1.8865661777960366e-05, 'epoch': 0.23} +{'loss': 195.9865, 'grad_norm': 3489.7568918190573, 'learning_rate': 1.8863644275121086e-05, 'epoch': 0.23} +{'loss': 198.3804, 'grad_norm': 3302.2513953555867, 'learning_rate': 1.8861625087811832e-05, 'epoch': 0.23} +{'loss': 197.9905, 'grad_norm': 3044.6180640974794, 'learning_rate': 1.8859604216416334e-05, 'epoch': 0.23} +{'loss': 201.9442, 'grad_norm': 3765.9357102870413, 'learning_rate': 1.885758166131864e-05, 'epoch': 0.23} +{'loss': 199.0741, 'grad_norm': 3358.2688029620203, 'learning_rate': 1.8855557422903125e-05, 'epoch': 0.23} +{'loss': 196.5332, 'grad_norm': 2784.1718766912168, 'learning_rate': 1.885353150155448e-05, 'epoch': 0.23} +{'loss': 198.1407, 'grad_norm': 2754.1041304632754, 'learning_rate': 1.8851503897657717e-05, 'epoch': 0.23} +{'loss': 200.4979, 'grad_norm': 2915.923071744834, 'learning_rate': 1.8849474611598163e-05, 'epoch': 0.23} +{'loss': 201.4767, 'grad_norm': 2576.238260303284, 'learning_rate': 1.8847443643761472e-05, 'epoch': 0.23} +{'loss': 205.1224, 'grad_norm': 3003.8235999037315, 'learning_rate': 1.8845410994533616e-05, 'epoch': 0.23} +{'loss': 203.1785, 'grad_norm': 2674.9643326295677, 'learning_rate': 1.884337666430088e-05, 'epoch': 0.23} +{'loss': 201.7552, 'grad_norm': 3068.7510705133614, 'learning_rate': 1.884134065344988e-05, 'epoch': 0.23} +{'loss': 207.4371, 'grad_norm': 2886.621206809655, 'learning_rate': 1.8839302962367542e-05, 'epoch': 0.23} +{'loss': 199.9525, 'grad_norm': 4031.918746584625, 'learning_rate': 1.883726359144111e-05, 'epoch': 0.23} +{'loss': 195.7967, 'grad_norm': 3155.4399055344993, 'learning_rate': 1.8835222541058157e-05, 'epoch': 0.23} +{'loss': 199.1451, 'grad_norm': 2509.675649436432, 'learning_rate': 1.8833179811606567e-05, 'epoch': 0.23} +{'loss': 195.9385, 'grad_norm': 2575.3401035005927, 'learning_rate': 1.883113540347454e-05, 'epoch': 0.23} +{'loss': 202.9713, 'grad_norm': 2772.540037900462, 'learning_rate': 1.8829089317050614e-05, 'epoch': 0.23} +{'loss': 206.7538, 'grad_norm': 
3007.845631402789, 'learning_rate': 1.8827041552723625e-05, 'epoch': 0.23} +{'loss': 199.6452, 'grad_norm': 2801.6215461957026, 'learning_rate': 1.882499211088273e-05, 'epoch': 0.23} +{'loss': 204.4441, 'grad_norm': 3120.5563510836114, 'learning_rate': 1.882294099191742e-05, 'epoch': 0.23} +{'loss': 200.8987, 'grad_norm': 2630.152062363779, 'learning_rate': 1.882088819621749e-05, 'epoch': 0.23} +{'loss': 202.276, 'grad_norm': 2878.8440924372217, 'learning_rate': 1.881883372417306e-05, 'epoch': 0.23} +{'loss': 200.2544, 'grad_norm': 2819.994853141427, 'learning_rate': 1.8816777576174565e-05, 'epoch': 0.23} +[2025-05-28 02:03:24,931] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step12000 is about to be saved! +[2025-05-28 02:03:25,004] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-12000/global_step12000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-28 02:03:25,004] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-12000/global_step12000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-28 02:03:25,102] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-12000/global_step12000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-28 02:03:25,262] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-12000/global_step12000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-28 02:09:25,441] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-12000/global_step12000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2025-05-28 02:09:25,504] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-12000/global_step12000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-28 02:09:28,489] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step12000 is ready now! +[2025-05-28 02:09:28,565] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step12000 is about to be saved! +[2025-05-28 02:09:28,589] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-12000/global_step12000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-28 02:09:28,589] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-12000/global_step12000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-28 02:09:28,702] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-12000/global_step12000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-28 02:09:28,772] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-12000/global_step12000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-28 02:15:22,174] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-12000/global_step12000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 
+[2025-05-28 02:15:22,263] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-12000/global_step12000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-28 02:15:27,877] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step12000 is ready now! +{'loss': 197.2415, 'grad_norm': 2867.0218474928197, 'learning_rate': 1.881471975261276e-05, 'epoch': 0.23} +{'loss': 197.8221, 'grad_norm': 3141.616495547885, 'learning_rate': 1.8812660253878726e-05, 'epoch': 0.23} +{'loss': 200.0165, 'grad_norm': 2577.8208111820823, 'learning_rate': 1.881059908036385e-05, 'epoch': 0.23} +{'loss': 203.8648, 'grad_norm': 2876.262876732549, 'learning_rate': 1.8808536232459844e-05, 'epoch': 0.23} +{'loss': 199.017, 'grad_norm': 3103.089524117418, 'learning_rate': 1.8806471710558732e-05, 'epoch': 0.23} +{'loss': 205.218, 'grad_norm': 3598.121100157306, 'learning_rate': 1.880440551505287e-05, 'epoch': 0.23} +{'loss': 199.3676, 'grad_norm': 4228.4753805446135, 'learning_rate': 1.8802337646334917e-05, 'epoch': 0.23} +{'loss': 195.4383, 'grad_norm': 4219.326536247027, 'learning_rate': 1.880026810479786e-05, 'epoch': 0.23} +{'loss': 196.1205, 'grad_norm': 2822.023232708542, 'learning_rate': 1.8798196890834998e-05, 'epoch': 0.23} +{'loss': 195.9588, 'grad_norm': 2769.1622011764703, 'learning_rate': 1.879612400483995e-05, 'epoch': 0.23} +{'loss': 197.6391, 'grad_norm': 2703.8549427097714, 'learning_rate': 1.879404944720665e-05, 'epoch': 0.23} +{'loss': 196.3496, 'grad_norm': 3207.1179544684346, 'learning_rate': 1.879197321832936e-05, 'epoch': 0.23} +{'loss': 197.9649, 'grad_norm': 2584.662088984943, 'learning_rate': 1.878989531860264e-05, 'epoch': 0.23} +{'loss': 204.9119, 'grad_norm': 3808.8756684428754, 'learning_rate': 1.8787815748421393e-05, 'epoch': 0.23} +{'loss': 204.3942, 'grad_norm': 2373.369832208379, 'learning_rate': 1.8785734508180817e-05, 'epoch': 0.23} +{'loss': 195.2785, 'grad_norm': 2686.0099289109403, 'learning_rate': 1.8783651598276437e-05, 'epoch': 0.23} +{'loss': 203.5755, 'grad_norm': 2631.6166057131095, 'learning_rate': 1.8781567019104096e-05, 'epoch': 0.23} +{'loss': 195.8937, 'grad_norm': 2961.4983659562563, 'learning_rate': 1.8779480771059954e-05, 'epoch': 0.23} +{'loss': 199.786, 'grad_norm': 2776.6860452358264, 'learning_rate': 1.8777392854540487e-05, 'epoch': 0.23} +{'loss': 195.6746, 'grad_norm': 2505.3296667578297, 'learning_rate': 1.8775303269942483e-05, 'epoch': 0.23} +{'loss': 203.6349, 'grad_norm': 2633.4978533494586, 'learning_rate': 1.8773212017663057e-05, 'epoch': 0.23} +{'loss': 200.0235, 'grad_norm': 2770.091318113591, 'learning_rate': 1.877111909809964e-05, 'epoch': 0.24} +{'loss': 201.7897, 'grad_norm': 2517.837566644653, 'learning_rate': 1.8769024511649966e-05, 'epoch': 0.24} +{'loss': 201.6039, 'grad_norm': 2649.939832629826, 'learning_rate': 1.8766928258712104e-05, 'epoch': 0.24} +{'loss': 200.2621, 'grad_norm': 3059.2162858079887, 'learning_rate': 1.8764830339684426e-05, 'epoch': 0.24} +{'loss': 201.2661, 'grad_norm': 3216.490168203285, 'learning_rate': 1.8762730754965627e-05, 'epoch': 0.24} +{'loss': 196.0187, 'grad_norm': 2787.550892952188, 'learning_rate': 1.876062950495472e-05, 'epoch': 0.24} +{'loss': 203.3932, 'grad_norm': 2530.162284721462, 'learning_rate': 1.8758526590051033e-05, 'epoch': 0.24} +{'loss': 206.4949, 'grad_norm': 2993.088314302782, 'learning_rate': 1.8756422010654206e-05, 'epoch': 0.24} +{'loss': 198.363, 'grad_norm': 2592.8801029458864, 
'learning_rate': 1.8754315767164197e-05, 'epoch': 0.24} +{'loss': 199.8695, 'grad_norm': 2790.9990687629775, 'learning_rate': 1.8752207859981285e-05, 'epoch': 0.24} +{'loss': 202.5815, 'grad_norm': 4623.600650518911, 'learning_rate': 1.8750098289506066e-05, 'epoch': 0.24} +{'loss': 199.6506, 'grad_norm': 3113.0230096828495, 'learning_rate': 1.8747987056139438e-05, 'epoch': 0.24} +{'loss': 200.4492, 'grad_norm': 3045.2977932363283, 'learning_rate': 1.8745874160282637e-05, 'epoch': 0.24} +{'loss': 203.7098, 'grad_norm': 3499.0270447053704, 'learning_rate': 1.874375960233719e-05, 'epoch': 0.24} +{'loss': 195.6741, 'grad_norm': 3070.545188644847, 'learning_rate': 1.8741643382704967e-05, 'epoch': 0.24} +{'loss': 196.6335, 'grad_norm': 2695.164534497405, 'learning_rate': 1.8739525501788133e-05, 'epoch': 0.24} +{'loss': 203.0123, 'grad_norm': 4028.7040459350983, 'learning_rate': 1.8737405959989173e-05, 'epoch': 0.24} +{'loss': 205.3297, 'grad_norm': 2499.5254144368428, 'learning_rate': 1.8735284757710897e-05, 'epoch': 0.24} +{'loss': 198.9295, 'grad_norm': 3173.161273058109, 'learning_rate': 1.8733161895356416e-05, 'epoch': 0.24} +{'loss': 195.9239, 'grad_norm': 4023.4115818785253, 'learning_rate': 1.8731037373329174e-05, 'epoch': 0.24} +{'loss': 199.6529, 'grad_norm': 2717.30020131024, 'learning_rate': 1.872891119203291e-05, 'epoch': 0.24} +{'loss': 215.6896, 'grad_norm': 3030.5360736036732, 'learning_rate': 1.8726783351871698e-05, 'epoch': 0.24} +{'loss': 199.2302, 'grad_norm': 2811.899725644973, 'learning_rate': 1.8724653853249914e-05, 'epoch': 0.24} +{'loss': 195.2654, 'grad_norm': 2566.7090848138746, 'learning_rate': 1.8722522696572248e-05, 'epoch': 0.24} +{'loss': 203.3263, 'grad_norm': 4356.069327184893, 'learning_rate': 1.8720389882243722e-05, 'epoch': 0.24} +{'loss': 197.7324, 'grad_norm': 3701.8970667695353, 'learning_rate': 1.8718255410669654e-05, 'epoch': 0.24} +{'loss': 198.2593, 'grad_norm': 2957.226399848449, 'learning_rate': 1.871611928225569e-05, 'epoch': 0.24} +{'loss': 197.223, 'grad_norm': 3033.7512019758587, 'learning_rate': 1.8713981497407774e-05, 'epoch': 0.24} +{'loss': 195.083, 'grad_norm': 2761.365119003199, 'learning_rate': 1.871184205653219e-05, 'epoch': 0.24} +{'loss': 197.0259, 'grad_norm': 2654.0327820146, 'learning_rate': 1.8709700960035516e-05, 'epoch': 0.24} +{'loss': 192.7186, 'grad_norm': 2614.829698484066, 'learning_rate': 1.8707558208324652e-05, 'epoch': 0.24} +{'loss': 200.7442, 'grad_norm': 2882.1048673757846, 'learning_rate': 1.8705413801806817e-05, 'epoch': 0.24} +{'loss': 198.2781, 'grad_norm': 2826.834348130051, 'learning_rate': 1.8703267740889528e-05, 'epoch': 0.24} +{'loss': 199.3475, 'grad_norm': 2821.5757380503223, 'learning_rate': 1.870112002598064e-05, 'epoch': 0.24} +{'loss': 199.0499, 'grad_norm': 2677.1595179871315, 'learning_rate': 1.86989706574883e-05, 'epoch': 0.24} +{'loss': 198.2077, 'grad_norm': 2639.772963976403, 'learning_rate': 1.869681963582099e-05, 'epoch': 0.24} +{'loss': 192.1602, 'grad_norm': 2587.8702454737677, 'learning_rate': 1.869466696138749e-05, 'epoch': 0.24} +{'loss': 200.6969, 'grad_norm': 2872.412181862871, 'learning_rate': 1.8692512634596898e-05, 'epoch': 0.24} +{'loss': 199.708, 'grad_norm': 3186.328780456926, 'learning_rate': 1.8690356655858634e-05, 'epoch': 0.24} +{'loss': 195.7914, 'grad_norm': 2667.8500257356322, 'learning_rate': 1.8688199025582423e-05, 'epoch': 0.24} +{'loss': 195.011, 'grad_norm': 2435.1375420105387, 'learning_rate': 1.8686039744178303e-05, 'epoch': 0.24} +{'loss': 199.426, 'grad_norm': 
2424.38986608268, 'learning_rate': 1.868387881205663e-05, 'epoch': 0.24} +{'loss': 196.3768, 'grad_norm': 2751.679664682657, 'learning_rate': 1.868171622962808e-05, 'epoch': 0.24} +{'loss': 196.1548, 'grad_norm': 3205.41607875406, 'learning_rate': 1.8679551997303632e-05, 'epoch': 0.24} +{'loss': 199.1848, 'grad_norm': 2994.973456729117, 'learning_rate': 1.867738611549458e-05, 'epoch': 0.24} +{'loss': 199.8381, 'grad_norm': 3705.3407200274296, 'learning_rate': 1.8675218584612534e-05, 'epoch': 0.24} +{'loss': 197.1379, 'grad_norm': 2635.8543193787, 'learning_rate': 1.867304940506942e-05, 'epoch': 0.24} +{'loss': 198.8998, 'grad_norm': 2857.8451182172034, 'learning_rate': 1.8670878577277474e-05, 'epoch': 0.24} +{'loss': 202.9255, 'grad_norm': 2683.5671400288943, 'learning_rate': 1.8668706101649242e-05, 'epoch': 0.24} +{'loss': 196.7737, 'grad_norm': 2773.018950302705, 'learning_rate': 1.8666531978597594e-05, 'epoch': 0.24} +{'loss': 202.0448, 'grad_norm': 2634.7011959421334, 'learning_rate': 1.8664356208535697e-05, 'epoch': 0.24} +{'loss': 195.6011, 'grad_norm': 2520.7940700690156, 'learning_rate': 1.8662178791877047e-05, 'epoch': 0.24} +{'loss': 196.5571, 'grad_norm': 2975.4041419573023, 'learning_rate': 1.8659999729035445e-05, 'epoch': 0.25} +{'loss': 197.8913, 'grad_norm': 2640.003077431483, 'learning_rate': 1.8657819020425e-05, 'epoch': 0.25} +{'loss': 197.3859, 'grad_norm': 2744.1003802064656, 'learning_rate': 1.865563666646015e-05, 'epoch': 0.25} +{'loss': 197.5345, 'grad_norm': 2596.423607582868, 'learning_rate': 1.8653452667555626e-05, 'epoch': 0.25} +{'loss': 194.8095, 'grad_norm': 3176.838926090116, 'learning_rate': 1.865126702412649e-05, 'epoch': 0.25} +{'loss': 199.6725, 'grad_norm': 3314.4402786821256, 'learning_rate': 1.864907973658809e-05, 'epoch': 0.25} +{'loss': 197.2271, 'grad_norm': 3404.305631300762, 'learning_rate': 1.8646890805356123e-05, 'epoch': 0.25} +{'loss': 197.8573, 'grad_norm': 3051.2437686167054, 'learning_rate': 1.864470023084657e-05, 'epoch': 0.25} +{'loss': 190.2398, 'grad_norm': 3323.2319320156494, 'learning_rate': 1.8642508013475737e-05, 'epoch': 0.25} +{'loss': 191.5371, 'grad_norm': 2575.7420467496354, 'learning_rate': 1.8640314153660233e-05, 'epoch': 0.25} +{'loss': 200.6827, 'grad_norm': 24496.71424553159, 'learning_rate': 1.863811865181699e-05, 'epoch': 0.25} +{'loss': 196.5572, 'grad_norm': 2923.4558933400235, 'learning_rate': 1.8635921508363245e-05, 'epoch': 0.25} +{'loss': 196.3397, 'grad_norm': 2701.17784091063, 'learning_rate': 1.863372272371655e-05, 'epoch': 0.25} +{'loss': 198.0278, 'grad_norm': 2734.270549157855, 'learning_rate': 1.8631522298294765e-05, 'epoch': 0.25} +{'loss': 197.3031, 'grad_norm': 3072.624333567278, 'learning_rate': 1.8629320232516063e-05, 'epoch': 0.25} +{'loss': 198.4759, 'grad_norm': 3578.75854745713, 'learning_rate': 1.862711652679894e-05, 'epoch': 0.25} +{'loss': 200.6616, 'grad_norm': 2633.8016738029214, 'learning_rate': 1.8624911181562185e-05, 'epoch': 0.25} +{'loss': 194.3017, 'grad_norm': 3235.0599791423206, 'learning_rate': 1.862270419722491e-05, 'epoch': 0.25} +{'loss': 195.7088, 'grad_norm': 3304.603077268586, 'learning_rate': 1.8620495574206537e-05, 'epoch': 0.25} +{'loss': 200.2426, 'grad_norm': 2669.6426299225177, 'learning_rate': 1.8618285312926798e-05, 'epoch': 0.25} +{'loss': 195.2821, 'grad_norm': 2607.430089880905, 'learning_rate': 1.8616073413805735e-05, 'epoch': 0.25} +{'loss': 190.7765, 'grad_norm': 2445.7111143402603, 'learning_rate': 1.8613859877263708e-05, 'epoch': 0.25} +{'loss': 197.0135, 
'grad_norm': 2803.096363128482, 'learning_rate': 1.8611644703721373e-05, 'epoch': 0.25} +{'loss': 199.0389, 'grad_norm': 2789.066292758469, 'learning_rate': 1.860942789359972e-05, 'epoch': 0.25} +{'loss': 200.9729, 'grad_norm': 3018.6675825720845, 'learning_rate': 1.860720944732003e-05, 'epoch': 0.25} +{'loss': 200.376, 'grad_norm': 2741.072515981895, 'learning_rate': 1.8604989365303906e-05, 'epoch': 0.25} +{'loss': 187.2448, 'grad_norm': 2775.1626428688223, 'learning_rate': 1.860276764797325e-05, 'epoch': 0.25} +{'loss': 196.9835, 'grad_norm': 3027.069090805715, 'learning_rate': 1.8600544295750294e-05, 'epoch': 0.25} +{'loss': 194.6743, 'grad_norm': 2825.973277327869, 'learning_rate': 1.859831930905756e-05, 'epoch': 0.25} +{'loss': 194.9006, 'grad_norm': 2760.814361728448, 'learning_rate': 1.8596092688317897e-05, 'epoch': 0.25} +{'loss': 198.2697, 'grad_norm': 2753.668385639541, 'learning_rate': 1.8593864433954457e-05, 'epoch': 0.25} +{'loss': 196.8737, 'grad_norm': 2690.9806786918803, 'learning_rate': 1.85916345463907e-05, 'epoch': 0.25} +{'loss': 199.7947, 'grad_norm': 2588.3875157895914, 'learning_rate': 1.8589403026050404e-05, 'epoch': 0.25} +{'loss': 199.8075, 'grad_norm': 3254.5784013235766, 'learning_rate': 1.8587169873357646e-05, 'epoch': 0.25} +{'loss': 206.0322, 'grad_norm': 3091.6786129957986, 'learning_rate': 1.8584935088736826e-05, 'epoch': 0.25} +{'loss': 204.6174, 'grad_norm': 2683.533829233025, 'learning_rate': 1.8582698672612646e-05, 'epoch': 0.25} +{'loss': 200.1196, 'grad_norm': 2593.1474077957505, 'learning_rate': 1.8580460625410124e-05, 'epoch': 0.25} +{'loss': 197.7165, 'grad_norm': 2826.005671938391, 'learning_rate': 1.8578220947554578e-05, 'epoch': 0.25} +{'loss': 199.5248, 'grad_norm': 3042.415037250144, 'learning_rate': 1.8575979639471646e-05, 'epoch': 0.25} +{'loss': 198.5227, 'grad_norm': 3318.810951455576, 'learning_rate': 1.8573736701587273e-05, 'epoch': 0.25} +{'loss': 194.2879, 'grad_norm': 3010.740749108021, 'learning_rate': 1.857149213432771e-05, 'epoch': 0.25} +{'loss': 202.698, 'grad_norm': 3134.7392281057782, 'learning_rate': 1.8569245938119524e-05, 'epoch': 0.25} +{'loss': 198.2978, 'grad_norm': 2300.488004365002, 'learning_rate': 1.856699811338958e-05, 'epoch': 0.25} +{'loss': 198.6461, 'grad_norm': 3086.877733095863, 'learning_rate': 1.8564748660565075e-05, 'epoch': 0.25} +{'loss': 201.8668, 'grad_norm': 3054.2999790984113, 'learning_rate': 1.8562497580073484e-05, 'epoch': 0.25} +{'loss': 199.0355, 'grad_norm': 2608.215295113456, 'learning_rate': 1.856024487234262e-05, 'epoch': 0.25} +{'loss': 193.7978, 'grad_norm': 2796.7253535173813, 'learning_rate': 1.855799053780059e-05, 'epoch': 0.25} +{'loss': 199.7239, 'grad_norm': 3794.6629630703515, 'learning_rate': 1.855573457687581e-05, 'epoch': 0.25} +{'loss': 197.1996, 'grad_norm': 2629.0872617268733, 'learning_rate': 1.8553476989997013e-05, 'epoch': 0.25} +{'loss': 205.5876, 'grad_norm': 3386.459820046872, 'learning_rate': 1.8551217777593233e-05, 'epoch': 0.25} +{'loss': 198.4955, 'grad_norm': 3677.630605928035, 'learning_rate': 1.8548956940093822e-05, 'epoch': 0.25} +{'loss': 191.5509, 'grad_norm': 2855.2494441465915, 'learning_rate': 1.8546694477928434e-05, 'epoch': 0.25} +{'loss': 198.0824, 'grad_norm': 2742.010852836235, 'learning_rate': 1.8544430391527028e-05, 'epoch': 0.26} +{'loss': 192.3243, 'grad_norm': 3707.3415837635266, 'learning_rate': 1.8542164681319884e-05, 'epoch': 0.26} +{'loss': 194.5632, 'grad_norm': 2410.9790974292787, 'learning_rate': 1.853989734773758e-05, 'epoch': 0.26} +{'loss': 
195.6375, 'grad_norm': 2775.13940777807, 'learning_rate': 1.8537628391211003e-05, 'epoch': 0.26} +{'loss': 198.5889, 'grad_norm': 2377.4920933959884, 'learning_rate': 1.8535357812171356e-05, 'epoch': 0.26} +{'loss': 199.3378, 'grad_norm': 2774.909435205762, 'learning_rate': 1.8533085611050144e-05, 'epoch': 0.26} +{'loss': 192.9949, 'grad_norm': 2723.1398535899807, 'learning_rate': 1.853081178827918e-05, 'epoch': 0.26} +{'loss': 197.0891, 'grad_norm': 2862.65616366289, 'learning_rate': 1.8528536344290594e-05, 'epoch': 0.26} +{'loss': 192.8098, 'grad_norm': 2589.4930442457853, 'learning_rate': 1.8526259279516807e-05, 'epoch': 0.26} +{'loss': 194.2844, 'grad_norm': 2906.53150830497, 'learning_rate': 1.8523980594390567e-05, 'epoch': 0.26} +{'loss': 194.8463, 'grad_norm': 3885.9658270209043, 'learning_rate': 1.8521700289344916e-05, 'epoch': 0.26} +{'loss': 196.9989, 'grad_norm': 2494.540244696346, 'learning_rate': 1.8519418364813215e-05, 'epoch': 0.26} +{'loss': 193.2712, 'grad_norm': 2559.4457852801092, 'learning_rate': 1.8517134821229117e-05, 'epoch': 0.26} +{'loss': 198.0588, 'grad_norm': 2721.7161395034786, 'learning_rate': 1.8514849659026604e-05, 'epoch': 0.26} +{'loss': 197.81, 'grad_norm': 2848.242446362365, 'learning_rate': 1.8512562878639947e-05, 'epoch': 0.26} +{'loss': 192.0356, 'grad_norm': 2798.3004296057898, 'learning_rate': 1.8510274480503732e-05, 'epoch': 0.26} +{'loss': 196.8967, 'grad_norm': 3239.680730777992, 'learning_rate': 1.8507984465052854e-05, 'epoch': 0.26} +{'loss': 199.4372, 'grad_norm': 3329.1245508161896, 'learning_rate': 1.8505692832722518e-05, 'epoch': 0.26} +{'loss': 194.3994, 'grad_norm': 2840.705541351813, 'learning_rate': 1.8503399583948224e-05, 'epoch': 0.26} +{'loss': 195.9915, 'grad_norm': 3167.7161549868474, 'learning_rate': 1.8501104719165788e-05, 'epoch': 0.26} +{'loss': 195.5838, 'grad_norm': 3198.9367510648217, 'learning_rate': 1.8498808238811332e-05, 'epoch': 0.26} +{'loss': 196.7853, 'grad_norm': 2368.5152786033473, 'learning_rate': 1.849651014332129e-05, 'epoch': 0.26} +{'loss': 196.9859, 'grad_norm': 3293.048855788782, 'learning_rate': 1.8494210433132392e-05, 'epoch': 0.26} +{'loss': 197.2224, 'grad_norm': 2793.399074957487, 'learning_rate': 1.8491909108681686e-05, 'epoch': 0.26} +{'loss': 194.6738, 'grad_norm': 2815.551099040874, 'learning_rate': 1.8489606170406522e-05, 'epoch': 0.26} +{'loss': 197.4708, 'grad_norm': 2550.1088635775236, 'learning_rate': 1.8487301618744552e-05, 'epoch': 0.26} +{'loss': 193.4046, 'grad_norm': 2929.8304527058795, 'learning_rate': 1.848499545413374e-05, 'epoch': 0.26} +{'loss': 190.7286, 'grad_norm': 2819.971579759605, 'learning_rate': 1.8482687677012357e-05, 'epoch': 0.26} +{'loss': 191.6828, 'grad_norm': 2322.9330793727404, 'learning_rate': 1.8480378287818976e-05, 'epoch': 0.26} +{'loss': 196.3838, 'grad_norm': 2936.7626449244567, 'learning_rate': 1.8478067286992484e-05, 'epoch': 0.26} +{'loss': 195.4884, 'grad_norm': 2773.3848260814857, 'learning_rate': 1.847575467497207e-05, 'epoch': 0.26} +{'loss': 194.0879, 'grad_norm': 2759.175039103258, 'learning_rate': 1.847344045219722e-05, 'epoch': 0.26} +{'loss': 195.9406, 'grad_norm': 2777.170827565264, 'learning_rate': 1.8471124619107744e-05, 'epoch': 0.26} +{'loss': 196.9926, 'grad_norm': 2720.8265511587024, 'learning_rate': 1.8468807176143745e-05, 'epoch': 0.26} +{'loss': 197.3696, 'grad_norm': 2676.960461680229, 'learning_rate': 1.846648812374564e-05, 'epoch': 0.26} +{'loss': 198.1645, 'grad_norm': 2695.441594710511, 'learning_rate': 1.8464167462354142e-05, 'epoch': 
0.26} +{'loss': 196.6516, 'grad_norm': 2317.2688954305218, 'learning_rate': 1.8461845192410276e-05, 'epoch': 0.26} +{'loss': 198.5337, 'grad_norm': 2672.188124756142, 'learning_rate': 1.8459521314355376e-05, 'epoch': 0.26} +{'loss': 197.8236, 'grad_norm': 3290.867249926339, 'learning_rate': 1.8457195828631078e-05, 'epoch': 0.26} +{'loss': 195.0326, 'grad_norm': 2412.23664570542, 'learning_rate': 1.845486873567932e-05, 'epoch': 0.26} +{'loss': 197.2305, 'grad_norm': 2320.2550643067975, 'learning_rate': 1.845254003594235e-05, 'epoch': 0.26} +{'loss': 190.5118, 'grad_norm': 3305.3075967311993, 'learning_rate': 1.845020972986272e-05, 'epoch': 0.26} +{'loss': 193.8748, 'grad_norm': 3043.7357620835855, 'learning_rate': 1.8447877817883288e-05, 'epoch': 0.26} +{'loss': 192.2677, 'grad_norm': 2362.2828464955087, 'learning_rate': 1.8445544300447216e-05, 'epoch': 0.26} +{'loss': 196.2126, 'grad_norm': 2428.6851633284505, 'learning_rate': 1.8443209177997975e-05, 'epoch': 0.26} +{'loss': 198.0133, 'grad_norm': 2337.42823458094, 'learning_rate': 1.844087245097933e-05, 'epoch': 0.26} +{'loss': 200.1224, 'grad_norm': 2496.399776899422, 'learning_rate': 1.8438534119835365e-05, 'epoch': 0.26} +{'loss': 199.5374, 'grad_norm': 2619.048630630395, 'learning_rate': 1.8436194185010463e-05, 'epoch': 0.26} +{'loss': 193.5847, 'grad_norm': 2419.6754063632607, 'learning_rate': 1.843385264694931e-05, 'epoch': 0.26} +{'loss': 193.764, 'grad_norm': 2560.9196010850233, 'learning_rate': 1.8431509506096893e-05, 'epoch': 0.26} +{'loss': 204.3199, 'grad_norm': 3160.9997486656853, 'learning_rate': 1.8429164762898514e-05, 'epoch': 0.26} +{'loss': 198.1282, 'grad_norm': 2605.248275726573, 'learning_rate': 1.8426818417799774e-05, 'epoch': 0.26} +{'loss': 193.9963, 'grad_norm': 2712.406046521759, 'learning_rate': 1.842447047124658e-05, 'epoch': 0.27} +{'loss': 199.1989, 'grad_norm': 3310.120418400729, 'learning_rate': 1.8422120923685135e-05, 'epoch': 0.27} +{'loss': 191.6137, 'grad_norm': 3195.327548112885, 'learning_rate': 1.841976977556196e-05, 'epoch': 0.27} +{'loss': 198.8193, 'grad_norm': 2960.3011402857237, 'learning_rate': 1.841741702732387e-05, 'epoch': 0.27} +{'loss': 206.0915, 'grad_norm': 2730.4446442575027, 'learning_rate': 1.841506267941799e-05, 'epoch': 0.27} +{'loss': 195.491, 'grad_norm': 2985.2206268111618, 'learning_rate': 1.841270673229174e-05, 'epoch': 0.27} +{'loss': 192.3428, 'grad_norm': 2941.322978074333, 'learning_rate': 1.841034918639286e-05, 'epoch': 0.27} +{'loss': 193.4647, 'grad_norm': 2717.8266918402055, 'learning_rate': 1.8407990042169376e-05, 'epoch': 0.27} +{'loss': 194.8432, 'grad_norm': 2897.2605079765112, 'learning_rate': 1.8405629300069626e-05, 'epoch': 0.27} +{'loss': 193.1593, 'grad_norm': 2591.0032931854857, 'learning_rate': 1.840326696054226e-05, 'epoch': 0.27} +{'loss': 197.8814, 'grad_norm': 2805.63249819931, 'learning_rate': 1.8400903024036213e-05, 'epoch': 0.27} +{'loss': 198.7023, 'grad_norm': 3298.2996518289024, 'learning_rate': 1.8398537491000742e-05, 'epoch': 0.27} +{'loss': 211.0646, 'grad_norm': 3328.496395452506, 'learning_rate': 1.8396170361885394e-05, 'epoch': 0.27} +{'loss': 191.8191, 'grad_norm': 2715.8048080228764, 'learning_rate': 1.8393801637140025e-05, 'epoch': 0.27} +{'loss': 192.6173, 'grad_norm': 2723.0463781542726, 'learning_rate': 1.8391431317214794e-05, 'epoch': 0.27} +{'loss': 193.7241, 'grad_norm': 2791.471938167689, 'learning_rate': 1.8389059402560165e-05, 'epoch': 0.27} +{'loss': 192.5408, 'grad_norm': 3241.3989090957375, 'learning_rate': 
1.8386685893626895e-05, 'epoch': 0.27} +{'loss': 198.5156, 'grad_norm': 3536.5746400349076, 'learning_rate': 1.8384310790866064e-05, 'epoch': 0.27} +{'loss': 192.7659, 'grad_norm': 2619.046057069509, 'learning_rate': 1.8381934094729035e-05, 'epoch': 0.27} +{'loss': 194.5464, 'grad_norm': 3149.854223588541, 'learning_rate': 1.8379555805667483e-05, 'epoch': 0.27} +{'loss': 195.2815, 'grad_norm': 2871.42790411334, 'learning_rate': 1.8377175924133382e-05, 'epoch': 0.27} +{'loss': 192.4083, 'grad_norm': 2679.4239780684916, 'learning_rate': 1.8374794450579013e-05, 'epoch': 0.27} +{'loss': 196.6174, 'grad_norm': 2812.547758511259, 'learning_rate': 1.8372411385456956e-05, 'epoch': 0.27} +[2025-05-28 04:04:02,350] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step14000 is about to be saved! +[2025-05-28 04:04:02,363] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-14000/global_step14000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-28 04:04:02,363] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-14000/global_step14000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-28 04:04:02,446] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-14000/global_step14000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-28 04:04:02,583] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-14000/global_step14000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-28 04:09:54,794] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-14000/global_step14000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2025-05-28 04:09:54,902] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-14000/global_step14000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-28 04:10:03,394] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step14000 is ready now! +[2025-05-28 04:10:03,469] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step14000 is about to be saved! +[2025-05-28 04:10:03,494] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-14000/global_step14000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-28 04:10:03,494] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-14000/global_step14000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-28 04:10:03,587] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-14000/global_step14000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-28 04:10:04,151] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-14000/global_step14000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-28 04:15:55,371] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-14000/global_step14000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 
+[2025-05-28 04:15:55,462] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-14000/global_step14000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-28 04:16:00,384] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step14000 is ready now! +{'loss': 191.1203, 'grad_norm': 2657.2549055405884, 'learning_rate': 1.83700267292201e-05, 'epoch': 0.27} +{'loss': 194.551, 'grad_norm': 2655.0701631845754, 'learning_rate': 1.8367640482321624e-05, 'epoch': 0.27} +{'loss': 194.5594, 'grad_norm': 2557.7566651149186, 'learning_rate': 1.8365252645215022e-05, 'epoch': 0.27} +{'loss': 190.7293, 'grad_norm': 2963.777102208379, 'learning_rate': 1.836286321835408e-05, 'epoch': 0.27} +{'loss': 192.707, 'grad_norm': 3481.79504046286, 'learning_rate': 1.83604722021929e-05, 'epoch': 0.27} +{'loss': 192.6202, 'grad_norm': 2859.140036986411, 'learning_rate': 1.8358079597185864e-05, 'epoch': 0.27} +{'loss': 194.9584, 'grad_norm': 2979.588466033939, 'learning_rate': 1.8355685403787677e-05, 'epoch': 0.27} +{'loss': 190.045, 'grad_norm': 2571.5136545877176, 'learning_rate': 1.8353289622453338e-05, 'epoch': 0.27} +{'loss': 186.4521, 'grad_norm': 2318.695088613577, 'learning_rate': 1.8350892253638143e-05, 'epoch': 0.27} +{'loss': 190.9744, 'grad_norm': 2251.2218764492445, 'learning_rate': 1.8348493297797702e-05, 'epoch': 0.27} +{'loss': 193.1644, 'grad_norm': 2935.7726717113546, 'learning_rate': 1.8346092755387908e-05, 'epoch': 0.27} +{'loss': 191.4378, 'grad_norm': 2888.1401050102554, 'learning_rate': 1.8343690626864976e-05, 'epoch': 0.27} +{'loss': 191.2072, 'grad_norm': 2419.8629693510475, 'learning_rate': 1.8341286912685406e-05, 'epoch': 0.27} +{'loss': 199.3241, 'grad_norm': 9136.874888406463, 'learning_rate': 1.833888161330601e-05, 'epoch': 0.27} +{'loss': 194.9958, 'grad_norm': 3727.235469798796, 'learning_rate': 1.8336474729183897e-05, 'epoch': 0.27} +{'loss': 197.8772, 'grad_norm': 3313.4259671636896, 'learning_rate': 1.8334066260776472e-05, 'epoch': 0.27} +{'loss': 193.3524, 'grad_norm': 2907.558824852695, 'learning_rate': 1.8331656208541455e-05, 'epoch': 0.27} +{'loss': 195.8363, 'grad_norm': 2898.5015659821215, 'learning_rate': 1.8329244572936854e-05, 'epoch': 0.27} +{'loss': 196.1036, 'grad_norm': 2267.4699738815525, 'learning_rate': 1.8326831354420984e-05, 'epoch': 0.27} +{'loss': 191.1895, 'grad_norm': 2417.792965735616, 'learning_rate': 1.832441655345246e-05, 'epoch': 0.27} +{'loss': 191.556, 'grad_norm': 2764.530711203181, 'learning_rate': 1.8322000170490194e-05, 'epoch': 0.27} +{'loss': 199.1667, 'grad_norm': 2913.8986607565953, 'learning_rate': 1.8319582205993406e-05, 'epoch': 0.27} +{'loss': 196.3436, 'grad_norm': 3600.267570991363, 'learning_rate': 1.8317162660421605e-05, 'epoch': 0.27} +{'loss': 194.0851, 'grad_norm': 3145.1518914890976, 'learning_rate': 1.831474153423461e-05, 'epoch': 0.27} +{'loss': 186.5138, 'grad_norm': 2755.644044609771, 'learning_rate': 1.8312318827892547e-05, 'epoch': 0.27} +{'loss': 197.1217, 'grad_norm': 2717.6368684363656, 'learning_rate': 1.8309894541855825e-05, 'epoch': 0.27} +{'loss': 197.2726, 'grad_norm': 3098.5290821097683, 'learning_rate': 1.8307468676585164e-05, 'epoch': 0.27} +{'loss': 188.4575, 'grad_norm': 2479.7924815222673, 'learning_rate': 1.830504123254158e-05, 'epoch': 0.27} +{'loss': 192.6899, 'grad_norm': 2678.483241821791, 'learning_rate': 1.8302612210186394e-05, 'epoch': 0.27} +{'loss': 192.3281, 'grad_norm': 2966.4202229953844, 
'learning_rate': 1.8300181609981222e-05, 'epoch': 0.28} +{'loss': 187.6619, 'grad_norm': 2924.7471692469203, 'learning_rate': 1.8297749432387978e-05, 'epoch': 0.28} +{'loss': 193.4919, 'grad_norm': 3101.6877752873006, 'learning_rate': 1.8295315677868882e-05, 'epoch': 0.28} +{'loss': 187.8344, 'grad_norm': 2350.389808221816, 'learning_rate': 1.8292880346886454e-05, 'epoch': 0.28} +{'loss': 188.8998, 'grad_norm': 2448.969990601666, 'learning_rate': 1.829044343990351e-05, 'epoch': 0.28} +{'loss': 191.6464, 'grad_norm': 3066.6636407308365, 'learning_rate': 1.8288004957383162e-05, 'epoch': 0.28} +{'loss': 185.8663, 'grad_norm': 2720.5988023368955, 'learning_rate': 1.8285564899788827e-05, 'epoch': 0.28} +{'loss': 190.1048, 'grad_norm': 2734.0070858580507, 'learning_rate': 1.8283123267584224e-05, 'epoch': 0.28} +{'loss': 188.6927, 'grad_norm': 2669.6209188322905, 'learning_rate': 1.828068006123336e-05, 'epoch': 0.28} +{'loss': 190.2155, 'grad_norm': 2511.947055996898, 'learning_rate': 1.8278235281200556e-05, 'epoch': 0.28} +{'loss': 194.4219, 'grad_norm': 2060.387814477827, 'learning_rate': 1.827578892795042e-05, 'epoch': 0.28} +{'loss': 189.8907, 'grad_norm': 2928.8307766971957, 'learning_rate': 1.8273341001947863e-05, 'epoch': 0.28} +{'loss': 189.7265, 'grad_norm': 3016.0148728715803, 'learning_rate': 1.8270891503658096e-05, 'epoch': 0.28} +{'loss': 189.3051, 'grad_norm': 3447.8658301677247, 'learning_rate': 1.8268440433546627e-05, 'epoch': 0.28} +{'loss': 198.2102, 'grad_norm': 3003.114666361565, 'learning_rate': 1.8265987792079268e-05, 'epoch': 0.28} +{'loss': 190.0198, 'grad_norm': 2769.179289294536, 'learning_rate': 1.8263533579722118e-05, 'epoch': 0.28} +{'loss': 191.4455, 'grad_norm': 3604.8012316822837, 'learning_rate': 1.826107779694159e-05, 'epoch': 0.28} +{'loss': 190.3155, 'grad_norm': 2717.233640531817, 'learning_rate': 1.825862044420438e-05, 'epoch': 0.28} +{'loss': 195.5646, 'grad_norm': 3541.3708283874234, 'learning_rate': 1.8256161521977497e-05, 'epoch': 0.28} +{'loss': 194.9782, 'grad_norm': 2661.0621023648687, 'learning_rate': 1.8253701030728235e-05, 'epoch': 0.28} +{'loss': 194.1278, 'grad_norm': 2824.6789873623297, 'learning_rate': 1.8251238970924194e-05, 'epoch': 0.28} +{'loss': 186.1412, 'grad_norm': 2444.028758359426, 'learning_rate': 1.8248775343033272e-05, 'epoch': 0.28} +{'loss': 193.297, 'grad_norm': 2524.569984995335, 'learning_rate': 1.8246310147523662e-05, 'epoch': 0.28} +{'loss': 193.9299, 'grad_norm': 2560.304375118915, 'learning_rate': 1.8243843384863854e-05, 'epoch': 0.28} +{'loss': 191.5795, 'grad_norm': 2429.1251614512025, 'learning_rate': 1.824137505552264e-05, 'epoch': 0.28} +{'loss': 205.9337, 'grad_norm': 2975.830327422916, 'learning_rate': 1.8238905159969107e-05, 'epoch': 0.28} +{'loss': 188.1643, 'grad_norm': 2786.2383657173095, 'learning_rate': 1.823643369867264e-05, 'epoch': 0.28} +{'loss': 191.6664, 'grad_norm': 2862.3201931745643, 'learning_rate': 1.823396067210293e-05, 'epoch': 0.28} +{'loss': 190.1199, 'grad_norm': 2650.1361846062982, 'learning_rate': 1.8231486080729946e-05, 'epoch': 0.28} +{'loss': 188.9616, 'grad_norm': 2644.5875020044737, 'learning_rate': 1.8229009925023966e-05, 'epoch': 0.28} +{'loss': 186.249, 'grad_norm': 2625.23876476338, 'learning_rate': 1.8226532205455573e-05, 'epoch': 0.28} +{'loss': 195.7395, 'grad_norm': 2804.976211663721, 'learning_rate': 1.8224052922495637e-05, 'epoch': 0.28} +{'loss': 190.8592, 'grad_norm': 2927.7679029592437, 'learning_rate': 1.8221572076615325e-05, 'epoch': 0.28} +{'loss': 195.3323, 'grad_norm': 
3365.7737582172713, 'learning_rate': 1.82190896682861e-05, 'epoch': 0.28} +{'loss': 192.1977, 'grad_norm': 3175.784198117862, 'learning_rate': 1.821660569797973e-05, 'epoch': 0.28} +{'loss': 187.0719, 'grad_norm': 2895.2319857767598, 'learning_rate': 1.8214120166168278e-05, 'epoch': 0.28} +{'loss': 187.1686, 'grad_norm': 2629.727533303596, 'learning_rate': 1.82116330733241e-05, 'epoch': 0.28} +{'loss': 182.092, 'grad_norm': 3155.030834122422, 'learning_rate': 1.820914441991984e-05, 'epoch': 0.28} +{'loss': 189.8677, 'grad_norm': 3076.2713264638955, 'learning_rate': 1.820665420642846e-05, 'epoch': 0.28} +{'loss': 188.4272, 'grad_norm': 2967.862083651326, 'learning_rate': 1.82041624333232e-05, 'epoch': 0.28} +{'loss': 195.3543, 'grad_norm': 2783.933494979468, 'learning_rate': 1.8201669101077608e-05, 'epoch': 0.28} +{'loss': 194.7776, 'grad_norm': 2790.509185991598, 'learning_rate': 1.819917421016552e-05, 'epoch': 0.28} +{'loss': 187.2539, 'grad_norm': 2603.911613979396, 'learning_rate': 1.8196677761061067e-05, 'epoch': 0.28} +{'loss': 192.1723, 'grad_norm': 2891.172226910165, 'learning_rate': 1.819417975423869e-05, 'epoch': 0.28} +{'loss': 185.7252, 'grad_norm': 2598.0830127552344, 'learning_rate': 1.819168019017311e-05, 'epoch': 0.28} +{'loss': 191.6377, 'grad_norm': 2564.9140284893697, 'learning_rate': 1.8189179069339355e-05, 'epoch': 0.28} +{'loss': 194.5966, 'grad_norm': 2828.1905611289717, 'learning_rate': 1.818667639221274e-05, 'epoch': 0.28} +{'loss': 192.0568, 'grad_norm': 2640.812562799989, 'learning_rate': 1.8184172159268884e-05, 'epoch': 0.28} +{'loss': 187.0304, 'grad_norm': 2389.0399092260172, 'learning_rate': 1.8181666370983696e-05, 'epoch': 0.28} +{'loss': 188.2371, 'grad_norm': 2694.149322193603, 'learning_rate': 1.8179159027833382e-05, 'epoch': 0.28} +{'loss': 191.6116, 'grad_norm': 2541.8895748765813, 'learning_rate': 1.8176650130294445e-05, 'epoch': 0.28} +{'loss': 189.2478, 'grad_norm': 2621.6752838534408, 'learning_rate': 1.8174139678843686e-05, 'epoch': 0.28} +{'loss': 189.4528, 'grad_norm': 2531.6923075836257, 'learning_rate': 1.817162767395819e-05, 'epoch': 0.29} +{'loss': 190.5089, 'grad_norm': 2886.0253524823033, 'learning_rate': 1.8169114116115353e-05, 'epoch': 0.29} +{'loss': 183.712, 'grad_norm': 2602.7062009178926, 'learning_rate': 1.8166599005792852e-05, 'epoch': 0.29} +{'loss': 190.4369, 'grad_norm': 2518.2970115594067, 'learning_rate': 1.816408234346867e-05, 'epoch': 0.29} +{'loss': 190.6799, 'grad_norm': 2779.980228887551, 'learning_rate': 1.8161564129621073e-05, 'epoch': 0.29} +{'loss': 190.5123, 'grad_norm': 2341.0086876866403, 'learning_rate': 1.8159044364728634e-05, 'epoch': 0.29} +{'loss': 187.7079, 'grad_norm': 2562.018141442297, 'learning_rate': 1.815652304927022e-05, 'epoch': 0.29} +{'loss': 189.6719, 'grad_norm': 2612.7696965460423, 'learning_rate': 1.815400018372498e-05, 'epoch': 0.29} +{'loss': 192.319, 'grad_norm': 2390.270597343232, 'learning_rate': 1.815147576857237e-05, 'epoch': 0.29} +{'loss': 188.0801, 'grad_norm': 2729.1508907123966, 'learning_rate': 1.814894980429213e-05, 'epoch': 0.29} +{'loss': 189.6899, 'grad_norm': 2718.6846337647294, 'learning_rate': 1.814642229136431e-05, 'epoch': 0.29} +{'loss': 188.3787, 'grad_norm': 2461.5281549640854, 'learning_rate': 1.8143893230269242e-05, 'epoch': 0.29} +{'loss': 190.2898, 'grad_norm': 2470.465693099604, 'learning_rate': 1.8141362621487557e-05, 'epoch': 0.29} +{'loss': 188.8999, 'grad_norm': 2651.795820541369, 'learning_rate': 1.8138830465500175e-05, 'epoch': 0.29} +{'loss': 192.6399, 
'grad_norm': 3296.8765192586206, 'learning_rate': 1.813629676278831e-05, 'epoch': 0.29} +{'loss': 186.8305, 'grad_norm': 2597.3150806544436, 'learning_rate': 1.8133761513833485e-05, 'epoch': 0.29} +{'loss': 191.8187, 'grad_norm': 2670.5063204441594, 'learning_rate': 1.8131224719117497e-05, 'epoch': 0.29} +{'loss': 188.7303, 'grad_norm': 2471.6313235230637, 'learning_rate': 1.8128686379122443e-05, 'epoch': 0.29} +{'loss': 188.087, 'grad_norm': 2166.026187522093, 'learning_rate': 1.8126146494330723e-05, 'epoch': 0.29} +{'loss': 197.6115, 'grad_norm': 5409.865903573331, 'learning_rate': 1.8123605065225017e-05, 'epoch': 0.29} +{'loss': 197.1894, 'grad_norm': 2540.3221126523445, 'learning_rate': 1.812106209228831e-05, 'epoch': 0.29} +{'loss': 188.6209, 'grad_norm': 2429.4552328062564, 'learning_rate': 1.811851757600387e-05, 'epoch': 0.29} +{'loss': 195.4762, 'grad_norm': 2666.4978676863916, 'learning_rate': 1.8115971516855267e-05, 'epoch': 0.29} +{'loss': 187.3838, 'grad_norm': 2382.527934120418, 'learning_rate': 1.8113423915326362e-05, 'epoch': 0.29} +{'loss': 188.5813, 'grad_norm': 2670.498917765263, 'learning_rate': 1.8110874771901303e-05, 'epoch': 0.29} +{'loss': 198.5718, 'grad_norm': 2401.4288672562343, 'learning_rate': 1.810832408706454e-05, 'epoch': 0.29} +{'loss': 194.9149, 'grad_norm': 3592.7221162566057, 'learning_rate': 1.810577186130081e-05, 'epoch': 0.29} +{'loss': 189.5776, 'grad_norm': 3348.165778322414, 'learning_rate': 1.8103218095095145e-05, 'epoch': 0.29} +{'loss': 186.0026, 'grad_norm': 2655.185340883045, 'learning_rate': 1.8100662788932867e-05, 'epoch': 0.29} +{'loss': 187.1161, 'grad_norm': 2772.8828840204947, 'learning_rate': 1.8098105943299596e-05, 'epoch': 0.29} +{'loss': 187.1197, 'grad_norm': 2198.41427516031, 'learning_rate': 1.8095547558681243e-05, 'epoch': 0.29} +{'loss': 189.0356, 'grad_norm': 2721.504811566473, 'learning_rate': 1.8092987635564005e-05, 'epoch': 0.29} +{'loss': 190.9812, 'grad_norm': 2492.716873168879, 'learning_rate': 1.809042617443438e-05, 'epoch': 0.29} +{'loss': 195.1882, 'grad_norm': 2812.184960881328, 'learning_rate': 1.8087863175779156e-05, 'epoch': 0.29} +{'loss': 190.6351, 'grad_norm': 2378.3676533395055, 'learning_rate': 1.808529864008541e-05, 'epoch': 0.29} +{'loss': 185.9829, 'grad_norm': 2739.7263502980863, 'learning_rate': 1.808273256784051e-05, 'epoch': 0.29} +{'loss': 194.7356, 'grad_norm': 2320.7650439535123, 'learning_rate': 1.808016495953212e-05, 'epoch': 0.29} +{'loss': 186.61, 'grad_norm': 2631.633898882975, 'learning_rate': 1.8077595815648202e-05, 'epoch': 0.29} +{'loss': 200.7277, 'grad_norm': 2510.55307665867, 'learning_rate': 1.8075025136676996e-05, 'epoch': 0.29} +{'loss': 188.3204, 'grad_norm': 2743.6241773808056, 'learning_rate': 1.8072452923107042e-05, 'epoch': 0.29} +{'loss': 187.201, 'grad_norm': 2647.60777670293, 'learning_rate': 1.8069879175427168e-05, 'epoch': 0.29} +{'loss': 187.4703, 'grad_norm': 22572.185981531864, 'learning_rate': 1.8067303894126502e-05, 'epoch': 0.29} +{'loss': 194.8281, 'grad_norm': 2840.3050007556208, 'learning_rate': 1.8064727079694454e-05, 'epoch': 0.29} +{'loss': 183.9738, 'grad_norm': 2140.9108648261426, 'learning_rate': 1.8062148732620723e-05, 'epoch': 0.29} +{'loss': 185.4414, 'grad_norm': 2602.2305293909567, 'learning_rate': 1.805956885339531e-05, 'epoch': 0.29} +{'loss': 195.5051, 'grad_norm': 2710.3563982626188, 'learning_rate': 1.8056987442508505e-05, 'epoch': 0.29} +{'loss': 197.6739, 'grad_norm': 2961.08601226502, 'learning_rate': 1.805440450045088e-05, 'epoch': 0.29} +{'loss': 
186.5205, 'grad_norm': 2803.9453286208995, 'learning_rate': 1.8051820027713306e-05, 'epoch': 0.29} +{'loss': 193.4212, 'grad_norm': 2429.2921859579087, 'learning_rate': 1.8049234024786945e-05, 'epoch': 0.29} +{'loss': 187.851, 'grad_norm': 2889.518312701177, 'learning_rate': 1.8046646492163246e-05, 'epoch': 0.29} +{'loss': 193.8648, 'grad_norm': 3037.493775257716, 'learning_rate': 1.804405743033395e-05, 'epoch': 0.29} +{'loss': 189.0074, 'grad_norm': 3416.0484476862434, 'learning_rate': 1.8041466839791087e-05, 'epoch': 0.29} +{'loss': 191.4167, 'grad_norm': 2507.036606576521, 'learning_rate': 1.8038874721026988e-05, 'epoch': 0.3} +{'loss': 184.6883, 'grad_norm': 2369.4215690584415, 'learning_rate': 1.8036281074534258e-05, 'epoch': 0.3} +{'loss': 182.9139, 'grad_norm': 2214.397558717107, 'learning_rate': 1.80336859008058e-05, 'epoch': 0.3} +{'loss': 188.7128, 'grad_norm': 2551.269166650469, 'learning_rate': 1.803108920033481e-05, 'epoch': 0.3} +{'loss': 184.9972, 'grad_norm': 2182.4609472801703, 'learning_rate': 1.8028490973614775e-05, 'epoch': 0.3} +{'loss': 192.155, 'grad_norm': 2683.7911218711088, 'learning_rate': 1.802589122113946e-05, 'epoch': 0.3} +{'loss': 190.281, 'grad_norm': 2280.1975701119495, 'learning_rate': 1.802328994340294e-05, 'epoch': 0.3} +{'loss': 184.0094, 'grad_norm': 2546.0080406663037, 'learning_rate': 1.802068714089956e-05, 'epoch': 0.3} +{'loss': 186.6751, 'grad_norm': 3622.8846492358784, 'learning_rate': 1.8018082814123966e-05, 'epoch': 0.3} +{'loss': 191.4334, 'grad_norm': 2569.96942643962, 'learning_rate': 1.801547696357109e-05, 'epoch': 0.3} +{'loss': 194.308, 'grad_norm': 2692.397331374536, 'learning_rate': 1.8012869589736156e-05, 'epoch': 0.3} +{'loss': 189.0513, 'grad_norm': 2443.6205588816606, 'learning_rate': 1.8010260693114677e-05, 'epoch': 0.3} +{'loss': 188.1388, 'grad_norm': 2433.8160123260263, 'learning_rate': 1.800765027420245e-05, 'epoch': 0.3} +{'loss': 187.3047, 'grad_norm': 2416.1690863152976, 'learning_rate': 1.8005038333495572e-05, 'epoch': 0.3} +{'loss': 189.21, 'grad_norm': 2836.1190793790915, 'learning_rate': 1.8002424871490422e-05, 'epoch': 0.3} +{'loss': 188.6414, 'grad_norm': 2860.381745676482, 'learning_rate': 1.7999809888683663e-05, 'epoch': 0.3} +{'loss': 193.2627, 'grad_norm': 2504.4754602066682, 'learning_rate': 1.799719338557226e-05, 'epoch': 0.3} +{'loss': 226.05, 'grad_norm': 3146.619337451722, 'learning_rate': 1.7994575362653454e-05, 'epoch': 0.3} +{'loss': 190.8974, 'grad_norm': 2619.5657791757094, 'learning_rate': 1.799195582042479e-05, 'epoch': 0.3} +{'loss': 184.9423, 'grad_norm': 2498.9009312413637, 'learning_rate': 1.798933475938408e-05, 'epoch': 0.3} +{'loss': 189.5222, 'grad_norm': 2698.8884731027533, 'learning_rate': 1.7986712180029448e-05, 'epoch': 0.3} +{'loss': 189.9199, 'grad_norm': 2746.314292763167, 'learning_rate': 1.7984088082859294e-05, 'epoch': 0.3} +{'loss': 194.1919, 'grad_norm': 2269.6366557137253, 'learning_rate': 1.79814624683723e-05, 'epoch': 0.3} +{'loss': 186.1645, 'grad_norm': 2446.732218316643, 'learning_rate': 1.797883533706745e-05, 'epoch': 0.3} +{'loss': 193.3566, 'grad_norm': 2479.6200980678204, 'learning_rate': 1.797620668944402e-05, 'epoch': 0.3} +{'loss': 185.8015, 'grad_norm': 2439.3446020031606, 'learning_rate': 1.7973576526001553e-05, 'epoch': 0.3} +{'loss': 188.5483, 'grad_norm': 3253.9695181425, 'learning_rate': 1.7970944847239896e-05, 'epoch': 0.3} +{'loss': 190.7488, 'grad_norm': 2730.3311284682345, 'learning_rate': 1.7968311653659177e-05, 'epoch': 0.3} +{'loss': 192.4487, 'grad_norm': 
3040.8691621756357, 'learning_rate': 1.796567694575982e-05, 'epoch': 0.3} +{'loss': 188.8439, 'grad_norm': 2172.529756458176, 'learning_rate': 1.7963040724042527e-05, 'epoch': 0.3} +{'loss': 187.3429, 'grad_norm': 2989.7030907650674, 'learning_rate': 1.7960402989008297e-05, 'epoch': 0.3} +{'loss': 186.7615, 'grad_norm': 2835.219194177346, 'learning_rate': 1.7957763741158408e-05, 'epoch': 0.3} +{'loss': 193.1897, 'grad_norm': 3189.1064865902026, 'learning_rate': 1.795512298099443e-05, 'epoch': 0.3} +{'loss': 192.9184, 'grad_norm': 2511.8354643380417, 'learning_rate': 1.7952480709018225e-05, 'epoch': 0.3} +{'loss': 192.0078, 'grad_norm': 2204.6644077056108, 'learning_rate': 1.7949836925731934e-05, 'epoch': 0.3} +{'loss': 188.6253, 'grad_norm': 2618.0759084461997, 'learning_rate': 1.7947191631637983e-05, 'epoch': 0.3} +{'loss': 188.7251, 'grad_norm': 2592.13357095536, 'learning_rate': 1.79445448272391e-05, 'epoch': 0.3} +{'loss': 190.5041, 'grad_norm': 2374.3071264346117, 'learning_rate': 1.7941896513038282e-05, 'epoch': 0.3} +{'loss': 203.539, 'grad_norm': 2855.9262331679056, 'learning_rate': 1.793924668953883e-05, 'epoch': 0.3} +{'loss': 185.7345, 'grad_norm': 2123.221545315018, 'learning_rate': 1.7936595357244314e-05, 'epoch': 0.3} +{'loss': 184.3439, 'grad_norm': 2220.0680690803338, 'learning_rate': 1.7933942516658608e-05, 'epoch': 0.3} +{'loss': 191.1307, 'grad_norm': 2477.246327152527, 'learning_rate': 1.7931288168285863e-05, 'epoch': 0.3} +{'loss': 187.527, 'grad_norm': 2735.519881341122, 'learning_rate': 1.7928632312630517e-05, 'epoch': 0.3} +{'loss': 184.4963, 'grad_norm': 2467.362064193397, 'learning_rate': 1.7925974950197294e-05, 'epoch': 0.3} +{'loss': 187.1809, 'grad_norm': 2747.7144561736477, 'learning_rate': 1.792331608149121e-05, 'epoch': 0.3} +{'loss': 188.3882, 'grad_norm': 2920.2473210935027, 'learning_rate': 1.7920655707017557e-05, 'epoch': 0.3} +{'loss': 187.7761, 'grad_norm': 2392.966119707642, 'learning_rate': 1.7917993827281926e-05, 'epoch': 0.3} +{'loss': 191.6476, 'grad_norm': 2306.7369812087295, 'learning_rate': 1.7915330442790183e-05, 'epoch': 0.3} +{'loss': 190.6121, 'grad_norm': 2281.8661449488222, 'learning_rate': 1.7912665554048486e-05, 'epoch': 0.3} +{'loss': 191.3505, 'grad_norm': 2429.9499037842825, 'learning_rate': 1.790999916156328e-05, 'epoch': 0.3} +{'loss': 189.9779, 'grad_norm': 2330.740682913808, 'learning_rate': 1.7907331265841296e-05, 'epoch': 0.3} +{'loss': 188.3296, 'grad_norm': 2464.9535619446665, 'learning_rate': 1.7904661867389536e-05, 'epoch': 0.3} +{'loss': 186.8177, 'grad_norm': 3038.2450725339468, 'learning_rate': 1.790199096671531e-05, 'epoch': 0.31} +{'loss': 187.526, 'grad_norm': 2295.311468006239, 'learning_rate': 1.7899318564326193e-05, 'epoch': 0.31} +{'loss': 184.2181, 'grad_norm': 2649.640455599717, 'learning_rate': 1.7896644660730068e-05, 'epoch': 0.31} +{'loss': 193.8646, 'grad_norm': 2498.8895710778443, 'learning_rate': 1.789396925643508e-05, 'epoch': 0.31} +{'loss': 184.0933, 'grad_norm': 2666.8841126310895, 'learning_rate': 1.7891292351949678e-05, 'epoch': 0.31} +{'loss': 184.0299, 'grad_norm': 2229.352838890506, 'learning_rate': 1.788861394778258e-05, 'epoch': 0.31} +{'loss': 190.7659, 'grad_norm': 2903.5960727309007, 'learning_rate': 1.7885934044442806e-05, 'epoch': 0.31} +{'loss': 189.3396, 'grad_norm': 2651.0838557064108, 'learning_rate': 1.788325264243964e-05, 'epoch': 0.31} +{'loss': 191.4417, 'grad_norm': 2326.929694336909, 'learning_rate': 1.788056974228267e-05, 'epoch': 0.31} +{'loss': 188.3492, 'grad_norm': 
2691.1733333214697, 'learning_rate': 1.7877885344481763e-05, 'epoch': 0.31} +{'loss': 188.7965, 'grad_norm': 2952.5164837877264, 'learning_rate': 1.787519944954707e-05, 'epoch': 0.31} +{'loss': 190.6774, 'grad_norm': 2458.1458865996824, 'learning_rate': 1.7872512057989016e-05, 'epoch': 0.31} +{'loss': 184.9071, 'grad_norm': 2842.2901838595203, 'learning_rate': 1.7869823170318325e-05, 'epoch': 0.31} +{'loss': 189.0938, 'grad_norm': 2400.438269123516, 'learning_rate': 1.7867132787046e-05, 'epoch': 0.31} +{'loss': 183.4448, 'grad_norm': 2471.7936413787365, 'learning_rate': 1.7864440908683333e-05, 'epoch': 0.31} +[2025-05-28 06:04:39,070] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step16000 is about to be saved! +[2025-05-28 06:04:39,100] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-16000/global_step16000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-28 06:04:39,101] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-16000/global_step16000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-28 06:04:39,250] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-16000/global_step16000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-28 06:04:39,320] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-16000/global_step16000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-28 06:10:19,455] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-16000/global_step16000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2025-05-28 06:10:19,482] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-16000/global_step16000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-28 06:10:24,825] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step16000 is ready now! +[2025-05-28 06:10:24,879] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step16000 is about to be saved! +[2025-05-28 06:10:24,907] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-16000/global_step16000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-28 06:10:24,907] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-16000/global_step16000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-28 06:10:25,021] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-16000/global_step16000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-28 06:10:25,237] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-16000/global_step16000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-28 06:16:28,627] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-16000/global_step16000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 
+[2025-05-28 06:16:28,754] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-16000/global_step16000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-28 06:16:46,230] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step16000 is ready now! +{'loss': 182.2201, 'grad_norm': 2644.937100618652, 'learning_rate': 1.7861747535741892e-05, 'epoch': 0.31} +{'loss': 185.2174, 'grad_norm': 2602.947198609924, 'learning_rate': 1.7859052668733534e-05, 'epoch': 0.31} +{'loss': 188.8603, 'grad_norm': 2569.1244062188603, 'learning_rate': 1.7856356308170394e-05, 'epoch': 0.31} +{'loss': 186.8469, 'grad_norm': 2244.784545841244, 'learning_rate': 1.7853658454564897e-05, 'epoch': 0.31} +{'loss': 186.0008, 'grad_norm': 2531.4275263992336, 'learning_rate': 1.785095910842975e-05, 'epoch': 0.31} +{'loss': 184.4329, 'grad_norm': 2502.2605596112503, 'learning_rate': 1.784825827027795e-05, 'epoch': 0.31} +{'loss': 196.4073, 'grad_norm': 2256.982173482112, 'learning_rate': 1.7845555940622764e-05, 'epoch': 0.31} +{'loss': 206.9052, 'grad_norm': 2567.829345635159, 'learning_rate': 1.784285211997775e-05, 'epoch': 0.31} +{'loss': 187.322, 'grad_norm': 2426.952053982401, 'learning_rate': 1.7840146808856757e-05, 'epoch': 0.31} +{'loss': 191.1122, 'grad_norm': 3658.8650564862814, 'learning_rate': 1.7837440007773895e-05, 'epoch': 0.31} +{'loss': 190.8477, 'grad_norm': 2676.4433632728487, 'learning_rate': 1.783473171724359e-05, 'epoch': 0.31} +{'loss': 189.7366, 'grad_norm': 2866.049182434945, 'learning_rate': 1.7832021937780515e-05, 'epoch': 0.31} +{'loss': 179.6613, 'grad_norm': 2498.3039762474436, 'learning_rate': 1.782931066989965e-05, 'epoch': 0.31} +{'loss': 187.2187, 'grad_norm': 2340.1450095419273, 'learning_rate': 1.782659791411626e-05, 'epoch': 0.31} +{'loss': 190.0375, 'grad_norm': 2883.731772552669, 'learning_rate': 1.7823883670945874e-05, 'epoch': 0.31} +{'loss': 186.2283, 'grad_norm': 3315.580070154653, 'learning_rate': 1.7821167940904314e-05, 'epoch': 0.31} +{'loss': 187.9388, 'grad_norm': 2700.682780511236, 'learning_rate': 1.781845072450769e-05, 'epoch': 0.31} +{'loss': 185.8149, 'grad_norm': 2643.577901882918, 'learning_rate': 1.7815732022272386e-05, 'epoch': 0.31} +{'loss': 183.1802, 'grad_norm': 2509.1034208567035, 'learning_rate': 1.781301183471507e-05, 'epoch': 0.31} +{'loss': 188.4076, 'grad_norm': 2713.9334214841642, 'learning_rate': 1.7810290162352693e-05, 'epoch': 0.31} +{'loss': 186.8735, 'grad_norm': 2549.6106881390056, 'learning_rate': 1.7807567005702492e-05, 'epoch': 0.31} +{'loss': 182.3956, 'grad_norm': 2448.3075281804377, 'learning_rate': 1.780484236528198e-05, 'epoch': 0.31} +{'loss': 191.6899, 'grad_norm': 2691.144765792385, 'learning_rate': 1.7802116241608956e-05, 'epoch': 0.31} +{'loss': 185.9337, 'grad_norm': 2796.190185973419, 'learning_rate': 1.7799388635201498e-05, 'epoch': 0.31} +{'loss': 191.9388, 'grad_norm': 2628.7660585043095, 'learning_rate': 1.779665954657797e-05, 'epoch': 0.31} +{'loss': 180.2556, 'grad_norm': 2762.397660128535, 'learning_rate': 1.7793928976257014e-05, 'epoch': 0.31} +{'loss': 183.2351, 'grad_norm': 2678.5577554644187, 'learning_rate': 1.7791196924757557e-05, 'epoch': 0.31} +{'loss': 184.3036, 'grad_norm': 2228.45048653189, 'learning_rate': 1.7788463392598798e-05, 'epoch': 0.31} +{'loss': 189.4302, 'grad_norm': 2455.7556251313126, 'learning_rate': 1.7785728380300234e-05, 'epoch': 0.31} +{'loss': 185.905, 'grad_norm': 2701.0925716466354, 
'learning_rate': 1.7782991888381624e-05, 'epoch': 0.31} +{'loss': 181.5441, 'grad_norm': 2448.8780932403897, 'learning_rate': 1.7780253917363026e-05, 'epoch': 0.31} +{'loss': 189.0164, 'grad_norm': 2536.032152143572, 'learning_rate': 1.777751446776477e-05, 'epoch': 0.31} +{'loss': 194.8302, 'grad_norm': 2379.873450200452, 'learning_rate': 1.7774773540107463e-05, 'epoch': 0.31} +{'loss': 185.4581, 'grad_norm': 2844.844457553255, 'learning_rate': 1.7772031134912003e-05, 'epoch': 0.31} +{'loss': 186.3044, 'grad_norm': 2773.505812333381, 'learning_rate': 1.776928725269956e-05, 'epoch': 0.31} +{'loss': 181.7097, 'grad_norm': 2750.661001021052, 'learning_rate': 1.7766541893991592e-05, 'epoch': 0.31} +{'loss': 188.5182, 'grad_norm': 2889.7491186316856, 'learning_rate': 1.7763795059309834e-05, 'epoch': 0.31} +{'loss': 180.1321, 'grad_norm': 2164.916023118076, 'learning_rate': 1.7761046749176302e-05, 'epoch': 0.32} +{'loss': 184.8365, 'grad_norm': 2215.702608690166, 'learning_rate': 1.7758296964113285e-05, 'epoch': 0.32} +{'loss': 183.847, 'grad_norm': 2120.0001718063513, 'learning_rate': 1.7755545704643368e-05, 'epoch': 0.32} +{'loss': 182.5282, 'grad_norm': 1853.0626899602223, 'learning_rate': 1.7752792971289406e-05, 'epoch': 0.32} +{'loss': 186.2965, 'grad_norm': 2361.507966994114, 'learning_rate': 1.7750038764574532e-05, 'epoch': 0.32} +{'loss': 184.3817, 'grad_norm': 2186.7400695631827, 'learning_rate': 1.7747283085022164e-05, 'epoch': 0.32} +{'loss': 184.7207, 'grad_norm': 2260.810389336417, 'learning_rate': 1.7744525933156e-05, 'epoch': 0.32} +{'loss': 181.9036, 'grad_norm': 2175.0921318457195, 'learning_rate': 1.774176730950001e-05, 'epoch': 0.32} +{'loss': 190.8229, 'grad_norm': 2227.965818664965, 'learning_rate': 1.773900721457846e-05, 'epoch': 0.32} +{'loss': 188.2476, 'grad_norm': 2469.6948401934737, 'learning_rate': 1.7736245648915882e-05, 'epoch': 0.32} +{'loss': 182.4349, 'grad_norm': 2124.519508137789, 'learning_rate': 1.7733482613037087e-05, 'epoch': 0.32} +{'loss': 183.9537, 'grad_norm': 2220.119635712506, 'learning_rate': 1.7730718107467172e-05, 'epoch': 0.32} +{'loss': 187.4891, 'grad_norm': 2625.3214546762647, 'learning_rate': 1.7727952132731512e-05, 'epoch': 0.32} +{'loss': 185.4357, 'grad_norm': 2285.662750457124, 'learning_rate': 1.7725184689355762e-05, 'epoch': 0.32} +{'loss': 187.4687, 'grad_norm': 2352.1583765412693, 'learning_rate': 1.7722415777865845e-05, 'epoch': 0.32} +{'loss': 187.1854, 'grad_norm': 2670.4967950347263, 'learning_rate': 1.7719645398787982e-05, 'epoch': 0.32} +{'loss': 191.0729, 'grad_norm': 2254.1960724521296, 'learning_rate': 1.771687355264866e-05, 'epoch': 0.32} +{'loss': 183.8866, 'grad_norm': 2219.028543959555, 'learning_rate': 1.771410023997465e-05, 'epoch': 0.32} +{'loss': 189.4827, 'grad_norm': 2483.175306831619, 'learning_rate': 1.7711325461292994e-05, 'epoch': 0.32} +{'loss': 182.4437, 'grad_norm': 2560.326237135939, 'learning_rate': 1.770854921713102e-05, 'epoch': 0.32} +{'loss': 191.8627, 'grad_norm': 2699.710669563929, 'learning_rate': 1.7705771508016335e-05, 'epoch': 0.32} +{'loss': 181.4887, 'grad_norm': 2687.8943432369024, 'learning_rate': 1.770299233447682e-05, 'epoch': 0.32} +{'loss': 185.7579, 'grad_norm': 2729.9297253922527, 'learning_rate': 1.7700211697040638e-05, 'epoch': 0.32} +{'loss': 189.6854, 'grad_norm': 2766.567914552029, 'learning_rate': 1.7697429596236226e-05, 'epoch': 0.32} +{'loss': 183.937, 'grad_norm': 2651.4068410330215, 'learning_rate': 1.7694646032592304e-05, 'epoch': 0.32} +{'loss': 184.8562, 'grad_norm': 
2339.6553361129063, 'learning_rate': 1.7691861006637868e-05, 'epoch': 0.32} +{'loss': 185.7243, 'grad_norm': 2917.573712856425, 'learning_rate': 1.768907451890219e-05, 'epoch': 0.32} +{'loss': 177.2135, 'grad_norm': 2384.534827066407, 'learning_rate': 1.7686286569914825e-05, 'epoch': 0.32} +{'loss': 185.88, 'grad_norm': 2644.9194417387566, 'learning_rate': 1.7683497160205595e-05, 'epoch': 0.32} +{'loss': 182.6401, 'grad_norm': 2431.250981208034, 'learning_rate': 1.768070629030461e-05, 'epoch': 0.32} +{'loss': 190.449, 'grad_norm': 2731.4606738593043, 'learning_rate': 1.7677913960742253e-05, 'epoch': 0.32} +{'loss': 179.3787, 'grad_norm': 2686.961899230602, 'learning_rate': 1.7675120172049194e-05, 'epoch': 0.32} +{'loss': 188.2818, 'grad_norm': 2388.1722745919487, 'learning_rate': 1.7672324924756364e-05, 'epoch': 0.32} +{'loss': 185.5715, 'grad_norm': 3124.620267938293, 'learning_rate': 1.7669528219394978e-05, 'epoch': 0.32} +{'loss': 184.3045, 'grad_norm': 2957.1942113949617, 'learning_rate': 1.766673005649653e-05, 'epoch': 0.32} +{'loss': 180.0612, 'grad_norm': 3196.76041797853, 'learning_rate': 1.7663930436592793e-05, 'epoch': 0.32} +{'loss': 182.2411, 'grad_norm': 2814.054216520108, 'learning_rate': 1.7661129360215813e-05, 'epoch': 0.32} +{'loss': 184.1505, 'grad_norm': 2453.9530945083147, 'learning_rate': 1.7658326827897915e-05, 'epoch': 0.32} +{'loss': 178.1123, 'grad_norm': 2133.501640799005, 'learning_rate': 1.7655522840171697e-05, 'epoch': 0.32} +{'loss': 180.9492, 'grad_norm': 2357.6706749043205, 'learning_rate': 1.7652717397570033e-05, 'epoch': 0.32} +{'loss': 183.0854, 'grad_norm': 2532.1124993298463, 'learning_rate': 1.7649910500626087e-05, 'epoch': 0.32} +{'loss': 180.6858, 'grad_norm': 2437.0314008466707, 'learning_rate': 1.7647102149873276e-05, 'epoch': 0.32} +{'loss': 186.6021, 'grad_norm': 2723.2023701509233, 'learning_rate': 1.764429234584532e-05, 'epoch': 0.32} +{'loss': 180.9445, 'grad_norm': 2379.4498653377996, 'learning_rate': 1.764148108907619e-05, 'epoch': 0.32} +{'loss': 182.2675, 'grad_norm': 2443.366745552899, 'learning_rate': 1.763866838010015e-05, 'epoch': 0.32} +{'loss': 177.8601, 'grad_norm': 2516.237846240041, 'learning_rate': 1.763585421945173e-05, 'epoch': 0.32} +{'loss': 186.6442, 'grad_norm': 2202.639948265219, 'learning_rate': 1.763303860766575e-05, 'epoch': 0.32} +{'loss': 181.5116, 'grad_norm': 2399.8782174705098, 'learning_rate': 1.7630221545277287e-05, 'epoch': 0.32} +{'loss': 185.5808, 'grad_norm': 2369.839572184225, 'learning_rate': 1.7627403032821705e-05, 'epoch': 0.32} +{'loss': 184.0884, 'grad_norm': 2423.2194526101957, 'learning_rate': 1.7624583070834646e-05, 'epoch': 0.32} +{'loss': 186.5604, 'grad_norm': 2438.833296116412, 'learning_rate': 1.7621761659852013e-05, 'epoch': 0.32} +{'loss': 182.8318, 'grad_norm': 2277.56168219972, 'learning_rate': 1.7618938800410003e-05, 'epoch': 0.32} +{'loss': 187.5371, 'grad_norm': 2825.5575954739816, 'learning_rate': 1.761611449304508e-05, 'epoch': 0.33} +{'loss': 186.4286, 'grad_norm': 2307.5791997427036, 'learning_rate': 1.7613288738293978e-05, 'epoch': 0.33} +{'loss': 185.9883, 'grad_norm': 2549.4375004478934, 'learning_rate': 1.7610461536693713e-05, 'epoch': 0.33} +{'loss': 178.3552, 'grad_norm': 2279.9182677038625, 'learning_rate': 1.7607632888781573e-05, 'epoch': 0.33} +{'loss': 178.4311, 'grad_norm': 2698.6623115714997, 'learning_rate': 1.760480279509512e-05, 'epoch': 0.33} +{'loss': 183.8252, 'grad_norm': 2628.756543000794, 'learning_rate': 1.7601971256172195e-05, 'epoch': 0.33} +{'loss': 189.6821, 
'grad_norm': 2554.4226067091136, 'learning_rate': 1.7599138272550912e-05, 'epoch': 0.33} +{'loss': 177.5857, 'grad_norm': 2357.640278704835, 'learning_rate': 1.7596303844769656e-05, 'epoch': 0.33} +{'loss': 180.1023, 'grad_norm': 2713.962288206036, 'learning_rate': 1.7593467973367088e-05, 'epoch': 0.33} +{'loss': 181.9139, 'grad_norm': 3445.7330410295936, 'learning_rate': 1.7590630658882145e-05, 'epoch': 0.33} +{'loss': 186.1206, 'grad_norm': 3090.565729752887, 'learning_rate': 1.758779190185404e-05, 'epoch': 0.33} +{'loss': 182.1295, 'grad_norm': 2276.8832882185607, 'learning_rate': 1.758495170282226e-05, 'epoch': 0.33} +{'loss': 177.3316, 'grad_norm': 2276.435471292693, 'learning_rate': 1.7582110062326558e-05, 'epoch': 0.33} +{'loss': 182.592, 'grad_norm': 2195.1027072553034, 'learning_rate': 1.757926698090697e-05, 'epoch': 0.33} +{'loss': 186.2865, 'grad_norm': 2477.03935305996, 'learning_rate': 1.7576422459103797e-05, 'epoch': 0.33} +{'loss': 179.7773, 'grad_norm': 2278.2280717111034, 'learning_rate': 1.757357649745763e-05, 'epoch': 0.33} +{'loss': 176.7281, 'grad_norm': 2341.2423095083414, 'learning_rate': 1.757072909650931e-05, 'epoch': 0.33} +{'loss': 185.4872, 'grad_norm': 2346.3625418207976, 'learning_rate': 1.7567880256799976e-05, 'epoch': 0.33} +{'loss': 185.2794, 'grad_norm': 2219.6668639555255, 'learning_rate': 1.7565029978871025e-05, 'epoch': 0.33} +{'loss': 180.2302, 'grad_norm': 2223.057836093783, 'learning_rate': 1.7562178263264128e-05, 'epoch': 0.33} +{'loss': 185.4914, 'grad_norm': 2459.938088939793, 'learning_rate': 1.7559325110521233e-05, 'epoch': 0.33} +{'loss': 182.7102, 'grad_norm': 2485.9492487781886, 'learning_rate': 1.755647052118457e-05, 'epoch': 0.33} +{'loss': 188.5882, 'grad_norm': 2584.0571140378565, 'learning_rate': 1.755361449579662e-05, 'epoch': 0.33} +{'loss': 183.5658, 'grad_norm': 2739.1158518916923, 'learning_rate': 1.7550757034900153e-05, 'epoch': 0.33} +{'loss': 192.2552, 'grad_norm': 2806.443267161164, 'learning_rate': 1.754789813903821e-05, 'epoch': 0.33} +{'loss': 183.6711, 'grad_norm': 2477.6459921735654, 'learning_rate': 1.7545037808754105e-05, 'epoch': 0.33} +{'loss': 185.084, 'grad_norm': 2277.8258026026965, 'learning_rate': 1.754217604459142e-05, 'epoch': 0.33} +{'loss': 182.7998, 'grad_norm': 2478.9097710551864, 'learning_rate': 1.753931284709401e-05, 'epoch': 0.33} +{'loss': 188.5018, 'grad_norm': 2326.6685224670264, 'learning_rate': 1.7536448216806013e-05, 'epoch': 0.33} +{'loss': 183.6114, 'grad_norm': 2665.7785877138467, 'learning_rate': 1.753358215427182e-05, 'epoch': 0.33} +{'loss': 187.4781, 'grad_norm': 2720.5711319550046, 'learning_rate': 1.7530714660036112e-05, 'epoch': 0.33} +{'loss': 179.8814, 'grad_norm': 3067.9517561772473, 'learning_rate': 1.752784573464383e-05, 'epoch': 0.33} +{'loss': 181.8988, 'grad_norm': 2873.907583640698, 'learning_rate': 1.7524975378640198e-05, 'epoch': 0.33} +{'loss': 179.4811, 'grad_norm': 2420.3537358977987, 'learning_rate': 1.75221035925707e-05, 'epoch': 0.33} +{'loss': 187.4108, 'grad_norm': 2251.24129521632, 'learning_rate': 1.75192303769811e-05, 'epoch': 0.33} +{'loss': 180.2269, 'grad_norm': 2396.725802497059, 'learning_rate': 1.7516355732417434e-05, 'epoch': 0.33} +{'loss': 184.0559, 'grad_norm': 2581.502567610728, 'learning_rate': 1.7513479659426002e-05, 'epoch': 0.33} +{'loss': 189.8593, 'grad_norm': 2245.4181172098197, 'learning_rate': 1.7510602158553384e-05, 'epoch': 0.33} +{'loss': 181.6053, 'grad_norm': 3086.641254666112, 'learning_rate': 1.7507723230346427e-05, 'epoch': 0.33} +{'loss': 
179.7241, 'grad_norm': 2406.0506925955137, 'learning_rate': 1.7504842875352254e-05, 'epoch': 0.33} +{'loss': 183.91, 'grad_norm': 2279.5165554248765, 'learning_rate': 1.7501961094118245e-05, 'epoch': 0.33} +{'loss': 184.3867, 'grad_norm': 2689.5997962929882, 'learning_rate': 1.7499077887192067e-05, 'epoch': 0.33} +{'loss': 202.2992, 'grad_norm': 116975.37367586068, 'learning_rate': 1.7496193255121656e-05, 'epoch': 0.33} +{'loss': 191.5012, 'grad_norm': 4367.971206548351, 'learning_rate': 1.7493307198455208e-05, 'epoch': 0.33} +{'loss': 185.3933, 'grad_norm': 2467.839619812558, 'learning_rate': 1.7490419717741204e-05, 'epoch': 0.33} +{'loss': 179.9714, 'grad_norm': 2390.940636359756, 'learning_rate': 1.7487530813528383e-05, 'epoch': 0.33} +{'loss': 184.0053, 'grad_norm': 2274.2953478790073, 'learning_rate': 1.7484640486365757e-05, 'epoch': 0.33} +{'loss': 188.7135, 'grad_norm': 2473.532806374735, 'learning_rate': 1.7481748736802618e-05, 'epoch': 0.33} +{'loss': 184.2777, 'grad_norm': 2160.9922729148766, 'learning_rate': 1.747885556538852e-05, 'epoch': 0.33} +{'loss': 187.0185, 'grad_norm': 2391.9731323829765, 'learning_rate': 1.7475960972673287e-05, 'epoch': 0.33} +{'loss': 185.397, 'grad_norm': 4279.182020759863, 'learning_rate': 1.7473064959207017e-05, 'epoch': 0.33} +{'loss': 185.6205, 'grad_norm': 2531.5882070954603, 'learning_rate': 1.747016752554007e-05, 'epoch': 0.33} +{'loss': 179.3988, 'grad_norm': 2115.113310034793, 'learning_rate': 1.746726867222309e-05, 'epoch': 0.34} +{'loss': 219.4404, 'grad_norm': 4780.6994950826565, 'learning_rate': 1.746436839980698e-05, 'epoch': 0.34} +{'loss': 180.4612, 'grad_norm': 2717.009639562965, 'learning_rate': 1.7461466708842913e-05, 'epoch': 0.34} +{'loss': 176.5844, 'grad_norm': 2372.9037573959176, 'learning_rate': 1.7458563599882335e-05, 'epoch': 0.34} +{'loss': 175.4402, 'grad_norm': 2500.5464191480432, 'learning_rate': 1.745565907347696e-05, 'epoch': 0.34} +{'loss': 181.13, 'grad_norm': 2505.5314218666063, 'learning_rate': 1.7452753130178774e-05, 'epoch': 0.34} +{'loss': 188.0479, 'grad_norm': 3150.177092910114, 'learning_rate': 1.7449845770540028e-05, 'epoch': 0.34} +{'loss': 180.9616, 'grad_norm': 2347.553230914256, 'learning_rate': 1.7446936995113245e-05, 'epoch': 0.34} +{'loss': 182.3693, 'grad_norm': 2482.700647531374, 'learning_rate': 1.7444026804451214e-05, 'epoch': 0.34} +{'loss': 181.9866, 'grad_norm': 2239.814684606311, 'learning_rate': 1.7441115199106996e-05, 'epoch': 0.34} +{'loss': 180.1771, 'grad_norm': 2607.5000999850818, 'learning_rate': 1.7438202179633923e-05, 'epoch': 0.34} +{'loss': 198.8215, 'grad_norm': 2323.7927427670434, 'learning_rate': 1.743528774658559e-05, 'epoch': 0.34} +{'loss': 183.7794, 'grad_norm': 2789.4689975397478, 'learning_rate': 1.7432371900515868e-05, 'epoch': 0.34} +{'loss': 184.0885, 'grad_norm': 2688.1744209236663, 'learning_rate': 1.7429454641978882e-05, 'epoch': 0.34} +{'loss': 186.9349, 'grad_norm': 2419.258008951273, 'learning_rate': 1.7426535971529042e-05, 'epoch': 0.34} +{'loss': 182.278, 'grad_norm': 2361.4454099113145, 'learning_rate': 1.7423615889721027e-05, 'epoch': 0.34} +{'loss': 186.1485, 'grad_norm': 2377.5615818651804, 'learning_rate': 1.7420694397109762e-05, 'epoch': 0.34} +{'loss': 190.3573, 'grad_norm': 2564.8010493013367, 'learning_rate': 1.7417771494250462e-05, 'epoch': 0.34} +{'loss': 185.6845, 'grad_norm': 2798.9049682226782, 'learning_rate': 1.741484718169861e-05, 'epoch': 0.34} +{'loss': 186.4314, 'grad_norm': 2478.9039865245795, 'learning_rate': 1.7411921460009935e-05, 
'epoch': 0.34} +{'loss': 183.9511, 'grad_norm': 2260.4276993948374, 'learning_rate': 1.7408994329740462e-05, 'epoch': 0.34} +{'loss': 182.0005, 'grad_norm': 2385.285171631718, 'learning_rate': 1.740606579144646e-05, 'epoch': 0.34} +{'loss': 180.7346, 'grad_norm': 2460.6908834085116, 'learning_rate': 1.740313584568449e-05, 'epoch': 0.34} +{'loss': 178.7739, 'grad_norm': 2126.523147881116, 'learning_rate': 1.7400204493011354e-05, 'epoch': 0.34} +{'loss': 182.863, 'grad_norm': 2813.5176616124495, 'learning_rate': 1.7397271733984136e-05, 'epoch': 0.34} +{'loss': 184.5925, 'grad_norm': 2640.7926676167162, 'learning_rate': 1.739433756916019e-05, 'epoch': 0.34} +{'loss': 183.39, 'grad_norm': 2757.3884489789284, 'learning_rate': 1.7391401999097127e-05, 'epoch': 0.34} +{'loss': 186.3019, 'grad_norm': 2403.464662551618, 'learning_rate': 1.7388465024352833e-05, 'epoch': 0.34} +{'loss': 185.5719, 'grad_norm': 2670.9209047421036, 'learning_rate': 1.7385526645485458e-05, 'epoch': 0.34} +{'loss': 184.1291, 'grad_norm': 2637.373903771353, 'learning_rate': 1.738258686305342e-05, 'epoch': 0.34} +{'loss': 183.8404, 'grad_norm': 2434.1036663914256, 'learning_rate': 1.73796456776154e-05, 'epoch': 0.34} +{'loss': 180.1615, 'grad_norm': 2223.758511600664, 'learning_rate': 1.737670308973035e-05, 'epoch': 0.34} +{'loss': 181.4694, 'grad_norm': 2563.7018057285395, 'learning_rate': 1.737375909995749e-05, 'epoch': 0.34} +{'loss': 182.0893, 'grad_norm': 2637.8404143161583, 'learning_rate': 1.7370813708856302e-05, 'epoch': 0.34} +{'loss': 181.6531, 'grad_norm': 2287.0699692319295, 'learning_rate': 1.7367866916986533e-05, 'epoch': 0.34} +{'loss': 182.5605, 'grad_norm': 2340.0998284409284, 'learning_rate': 1.73649187249082e-05, 'epoch': 0.34} +{'loss': 179.5792, 'grad_norm': 2631.0149391597242, 'learning_rate': 1.7361969133181585e-05, 'epoch': 0.34} +{'loss': 180.9929, 'grad_norm': 2178.386159538226, 'learning_rate': 1.735901814236724e-05, 'epoch': 0.34} +{'loss': 183.7955, 'grad_norm': 2269.5583238864892, 'learning_rate': 1.7356065753025974e-05, 'epoch': 0.34} +{'loss': 180.0829, 'grad_norm': 2377.340986305443, 'learning_rate': 1.735311196571887e-05, 'epoch': 0.34} +{'loss': 183.9701, 'grad_norm': 2755.9328793375726, 'learning_rate': 1.735015678100727e-05, 'epoch': 0.34} +{'loss': 173.6719, 'grad_norm': 2296.1209714692363, 'learning_rate': 1.734720019945279e-05, 'epoch': 0.34} +{'loss': 179.3739, 'grad_norm': 2708.411306350159, 'learning_rate': 1.73442422216173e-05, 'epoch': 0.34} +{'loss': 185.0142, 'grad_norm': 2153.283298526691, 'learning_rate': 1.734128284806295e-05, 'epoch': 0.34} +{'loss': 179.0872, 'grad_norm': 2336.7901979953367, 'learning_rate': 1.7338322079352137e-05, 'epoch': 0.34} +{'loss': 178.0037, 'grad_norm': 2155.897319164744, 'learning_rate': 1.733535991604754e-05, 'epoch': 0.34} +{'loss': 181.2076, 'grad_norm': 2130.2669529493082, 'learning_rate': 1.7332396358712093e-05, 'epoch': 0.34} +{'loss': 182.0629, 'grad_norm': 2160.5155591196835, 'learning_rate': 1.7329431407909002e-05, 'epoch': 0.34} +{'loss': 181.6685, 'grad_norm': 2289.198141646882, 'learning_rate': 1.732646506420173e-05, 'epoch': 0.34} +{'loss': 184.1671, 'grad_norm': 2483.379616417591, 'learning_rate': 1.7323497328154014e-05, 'epoch': 0.34} +{'loss': 182.4525, 'grad_norm': 2255.597836472529, 'learning_rate': 1.7320528200329846e-05, 'epoch': 0.34} +{'loss': 182.8761, 'grad_norm': 2407.028713128842, 'learning_rate': 1.7317557681293483e-05, 'epoch': 0.34} +{'loss': 178.3164, 'grad_norm': 2447.94386802855, 'learning_rate': 
1.731458577160946e-05, 'epoch': 0.35} +{'loss': 187.5281, 'grad_norm': 2284.8965594904375, 'learning_rate': 1.7311612471842558e-05, 'epoch': 0.35} +{'loss': 183.1191, 'grad_norm': 3229.1621702051125, 'learning_rate': 1.7308637782557833e-05, 'epoch': 0.35} +{'loss': 179.5906, 'grad_norm': 2308.6468068848426, 'learning_rate': 1.7305661704320607e-05, 'epoch': 0.35} +{'loss': 179.2182, 'grad_norm': 2039.437188116595, 'learning_rate': 1.7302684237696455e-05, 'epoch': 0.35} +{'loss': 182.6104, 'grad_norm': 3338.1580017504234, 'learning_rate': 1.729970538325122e-05, 'epoch': 0.35} +{'loss': 180.0191, 'grad_norm': 2753.0786796550724, 'learning_rate': 1.7296725141551025e-05, 'epoch': 0.35} +[2025-05-28 08:05:19,529] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step18000 is about to be saved! +[2025-05-28 08:05:19,551] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-18000/global_step18000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-28 08:05:19,551] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-18000/global_step18000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-28 08:05:19,603] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-18000/global_step18000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-28 08:05:19,734] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-18000/global_step18000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-28 08:11:28,026] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-18000/global_step18000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2025-05-28 08:11:28,118] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-18000/global_step18000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-28 08:11:31,341] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step18000 is ready now! +[2025-05-28 08:11:31,373] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step18000 is about to be saved! +[2025-05-28 08:11:31,395] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-18000/global_step18000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-28 08:11:31,396] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-18000/global_step18000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-28 08:11:31,429] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-18000/global_step18000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-28 08:11:31,539] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-18000/global_step18000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-28 08:17:42,931] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-18000/global_step18000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 
+[2025-05-28 08:17:43,042] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-18000/global_step18000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-28 08:17:43,053] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step18000 is ready now! +{'loss': 184.8583, 'grad_norm': 2482.620032503158, 'learning_rate': 1.7293743513162234e-05, 'epoch': 0.35} +{'loss': 181.1501, 'grad_norm': 2279.106975737152, 'learning_rate': 1.7290760498651482e-05, 'epoch': 0.35} +{'loss': 182.842, 'grad_norm': 2374.922845160889, 'learning_rate': 1.728777609858567e-05, 'epoch': 0.35} +{'loss': 179.914, 'grad_norm': 2245.2754960897037, 'learning_rate': 1.728479031353196e-05, 'epoch': 0.35} +{'loss': 178.3052, 'grad_norm': 1962.455110754646, 'learning_rate': 1.7281803144057786e-05, 'epoch': 0.35} +{'loss': 178.847, 'grad_norm': 2126.7656525103707, 'learning_rate': 1.7278814590730826e-05, 'epoch': 0.35} +{'loss': 181.0262, 'grad_norm': 2368.152310571178, 'learning_rate': 1.727582465411904e-05, 'epoch': 0.35} +{'loss': 178.6393, 'grad_norm': 2286.5348081824695, 'learning_rate': 1.7272833334790642e-05, 'epoch': 0.35} +{'loss': 174.8823, 'grad_norm': 2455.487345659528, 'learning_rate': 1.726984063331411e-05, 'epoch': 0.35} +{'loss': 180.5634, 'grad_norm': 2210.4775318463335, 'learning_rate': 1.7266846550258184e-05, 'epoch': 0.35} +{'loss': 183.3663, 'grad_norm': 2473.7236241005494, 'learning_rate': 1.7263851086191865e-05, 'epoch': 0.35} +{'loss': 183.6858, 'grad_norm': 2197.2737250545315, 'learning_rate': 1.7260854241684426e-05, 'epoch': 0.35} +{'loss': 178.9926, 'grad_norm': 2227.192077157188, 'learning_rate': 1.7257856017305387e-05, 'epoch': 0.35} +{'loss': 177.4925, 'grad_norm': 2223.616179996759, 'learning_rate': 1.7254856413624544e-05, 'epoch': 0.35} +{'loss': 185.8326, 'grad_norm': 2295.033756970397, 'learning_rate': 1.7251855431211944e-05, 'epoch': 0.35} +{'loss': 179.7263, 'grad_norm': 2489.5069997825776, 'learning_rate': 1.7248853070637904e-05, 'epoch': 0.35} +{'loss': 179.455, 'grad_norm': 2249.9670414415295, 'learning_rate': 1.7245849332473006e-05, 'epoch': 0.35} +{'loss': 175.8318, 'grad_norm': 2112.407655807533, 'learning_rate': 1.7242844217288085e-05, 'epoch': 0.35} +{'loss': 179.1778, 'grad_norm': 2453.20545329674, 'learning_rate': 1.7239837725654232e-05, 'epoch': 0.35} +{'loss': 186.0407, 'grad_norm': 3285.5906005564993, 'learning_rate': 1.723682985814282e-05, 'epoch': 0.35} +{'loss': 191.6061, 'grad_norm': 2993.193056698358, 'learning_rate': 1.7233820615325467e-05, 'epoch': 0.35} +{'loss': 183.2719, 'grad_norm': 2522.6414056522167, 'learning_rate': 1.7230809997774057e-05, 'epoch': 0.35} +{'loss': 176.7782, 'grad_norm': 2378.8701442656397, 'learning_rate': 1.722779800606074e-05, 'epoch': 0.35} +{'loss': 182.1581, 'grad_norm': 2395.9276070740834, 'learning_rate': 1.7224784640757918e-05, 'epoch': 0.35} +{'loss': 177.2259, 'grad_norm': 2625.177230662922, 'learning_rate': 1.722176990243826e-05, 'epoch': 0.35} +{'loss': 176.3747, 'grad_norm': 2185.2533459914407, 'learning_rate': 1.7218753791674692e-05, 'epoch': 0.35} +{'loss': 174.5608, 'grad_norm': 2341.399430745318, 'learning_rate': 1.7215736309040408e-05, 'epoch': 0.35} +{'loss': 181.642, 'grad_norm': 2162.9128472203, 'learning_rate': 1.721271745510886e-05, 'epoch': 0.35} +{'loss': 172.8615, 'grad_norm': 2154.1723097203944, 'learning_rate': 1.7209697230453752e-05, 'epoch': 0.35} +{'loss': 177.1023, 'grad_norm': 2258.8579167293656, 
'learning_rate': 1.720667563564906e-05, 'epoch': 0.35} +{'loss': 180.3888, 'grad_norm': 2367.1760665336697, 'learning_rate': 1.7203652671269014e-05, 'epoch': 0.35} +{'loss': 176.9683, 'grad_norm': 2157.6727141463734, 'learning_rate': 1.7200628337888106e-05, 'epoch': 0.35} +{'loss': 177.066, 'grad_norm': 2199.704129002943, 'learning_rate': 1.7197602636081087e-05, 'epoch': 0.35} +{'loss': 181.2795, 'grad_norm': 2994.1116116696535, 'learning_rate': 1.7194575566422975e-05, 'epoch': 0.35} +{'loss': 181.2846, 'grad_norm': 2505.156602715034, 'learning_rate': 1.7191547129489035e-05, 'epoch': 0.35} +{'loss': 183.8751, 'grad_norm': 2747.785283665254, 'learning_rate': 1.7188517325854804e-05, 'epoch': 0.35} +{'loss': 176.7603, 'grad_norm': 2429.2117209822363, 'learning_rate': 1.718548615609607e-05, 'epoch': 0.35} +{'loss': 179.5399, 'grad_norm': 2881.7826754175144, 'learning_rate': 1.7182453620788887e-05, 'epoch': 0.35} +{'loss': 175.0936, 'grad_norm': 3143.5441290609533, 'learning_rate': 1.7179419720509565e-05, 'epoch': 0.35} +{'loss': 178.3597, 'grad_norm': 2842.5608308299766, 'learning_rate': 1.7176384455834677e-05, 'epoch': 0.35} +{'loss': 179.0031, 'grad_norm': 2568.2714035706754, 'learning_rate': 1.7173347827341046e-05, 'epoch': 0.35} +{'loss': 171.6035, 'grad_norm': 2274.832194534198, 'learning_rate': 1.7170309835605767e-05, 'epoch': 0.35} +{'loss': 175.3457, 'grad_norm': 2133.7587830351017, 'learning_rate': 1.7167270481206182e-05, 'epoch': 0.35} +{'loss': 181.6189, 'grad_norm': 2385.45312731625, 'learning_rate': 1.7164229764719905e-05, 'epoch': 0.35} +{'loss': 178.8322, 'grad_norm': 2106.8448835470053, 'learning_rate': 1.7161187686724795e-05, 'epoch': 0.35} +{'loss': 189.27, 'grad_norm': 2440.4325970313967, 'learning_rate': 1.7158144247798985e-05, 'epoch': 0.36} +{'loss': 176.93, 'grad_norm': 2194.3500059642492, 'learning_rate': 1.7155099448520847e-05, 'epoch': 0.36} +{'loss': 177.8307, 'grad_norm': 2186.654242481445, 'learning_rate': 1.715205328946903e-05, 'epoch': 0.36} +{'loss': 179.2949, 'grad_norm': 2490.845645593411, 'learning_rate': 1.714900577122243e-05, 'epoch': 0.36} +{'loss': 180.3657, 'grad_norm': 2027.2782521926126, 'learning_rate': 1.714595689436021e-05, 'epoch': 0.36} +{'loss': 179.8458, 'grad_norm': 2771.5004342704774, 'learning_rate': 1.7142906659461785e-05, 'epoch': 0.36} +{'loss': 179.4278, 'grad_norm': 2254.6673686435047, 'learning_rate': 1.7139855067106824e-05, 'epoch': 0.36} +{'loss': 184.9289, 'grad_norm': 3002.4471502845545, 'learning_rate': 1.7136802117875267e-05, 'epoch': 0.36} +{'loss': 176.687, 'grad_norm': 2668.5228772205055, 'learning_rate': 1.71337478123473e-05, 'epoch': 0.36} +{'loss': 176.7254, 'grad_norm': 2560.988077981974, 'learning_rate': 1.7130692151103373e-05, 'epoch': 0.36} +{'loss': 178.3393, 'grad_norm': 1984.7168098250918, 'learning_rate': 1.7127635134724193e-05, 'epoch': 0.36} +{'loss': 178.3688, 'grad_norm': 2521.886949049647, 'learning_rate': 1.7124576763790716e-05, 'epoch': 0.36} +{'loss': 183.3593, 'grad_norm': 2291.2367909137843, 'learning_rate': 1.7121517038884174e-05, 'epoch': 0.36} +{'loss': 176.6394, 'grad_norm': 2445.4024561011706, 'learning_rate': 1.7118455960586038e-05, 'epoch': 0.36} +{'loss': 181.3887, 'grad_norm': 2569.5550487426144, 'learning_rate': 1.7115393529478042e-05, 'epoch': 0.36} +{'loss': 180.6883, 'grad_norm': 2085.440311800775, 'learning_rate': 1.7112329746142184e-05, 'epoch': 0.36} +{'loss': 178.4542, 'grad_norm': 2698.753369293674, 'learning_rate': 1.710926461116071e-05, 'epoch': 0.36} +{'loss': 176.0848, 'grad_norm': 
2335.2350766309332, 'learning_rate': 1.7106198125116124e-05, 'epoch': 0.36} +{'loss': 179.2946, 'grad_norm': 2369.8608750402914, 'learning_rate': 1.7103130288591194e-05, 'epoch': 0.36} +{'loss': 180.0899, 'grad_norm': 2034.166537141851, 'learning_rate': 1.7100061102168934e-05, 'epoch': 0.36} +{'loss': 180.8983, 'grad_norm': 3218.2419089457017, 'learning_rate': 1.7096990566432628e-05, 'epoch': 0.36} +{'loss': 182.7509, 'grad_norm': 2368.0412179411096, 'learning_rate': 1.70939186819658e-05, 'epoch': 0.36} +{'loss': 180.4605, 'grad_norm': 2182.0488096824115, 'learning_rate': 1.7090845449352245e-05, 'epoch': 0.36} +{'loss': 177.5212, 'grad_norm': 2254.3938700987146, 'learning_rate': 1.7087770869176005e-05, 'epoch': 0.36} +{'loss': 179.4254, 'grad_norm': 2785.1451661696306, 'learning_rate': 1.7084694942021383e-05, 'epoch': 0.36} +{'loss': 180.3198, 'grad_norm': 2237.88998054963, 'learning_rate': 1.7081617668472933e-05, 'epoch': 0.36} +{'loss': 173.964, 'grad_norm': 2286.942108571108, 'learning_rate': 1.7078539049115472e-05, 'epoch': 0.36} +{'loss': 176.0236, 'grad_norm': 2557.23216248867, 'learning_rate': 1.7075459084534068e-05, 'epoch': 0.36} +{'loss': 181.7515, 'grad_norm': 2106.5326613298425, 'learning_rate': 1.7072377775314044e-05, 'epoch': 0.36} +{'loss': 176.5289, 'grad_norm': 2434.4608697328067, 'learning_rate': 1.706929512204098e-05, 'epoch': 0.36} +{'loss': 177.8752, 'grad_norm': 2076.2119478807895, 'learning_rate': 1.7066211125300713e-05, 'epoch': 0.36} +{'loss': 180.9622, 'grad_norm': 2560.335969111318, 'learning_rate': 1.7063125785679333e-05, 'epoch': 0.36} +{'loss': 175.5294, 'grad_norm': 2221.9500436025505, 'learning_rate': 1.7060039103763185e-05, 'epoch': 0.36} +{'loss': 178.518, 'grad_norm': 2390.7014745655497, 'learning_rate': 1.7056951080138867e-05, 'epoch': 0.36} +{'loss': 179.6526, 'grad_norm': 2347.2071326846576, 'learning_rate': 1.7053861715393246e-05, 'epoch': 0.36} +{'loss': 176.9993, 'grad_norm': 2325.728330283408, 'learning_rate': 1.705077101011342e-05, 'epoch': 0.36} +{'loss': 177.9207, 'grad_norm': 2423.953508069884, 'learning_rate': 1.7047678964886763e-05, 'epoch': 0.36} +{'loss': 178.1561, 'grad_norm': 2573.7217293943236, 'learning_rate': 1.704458558030089e-05, 'epoch': 0.36} +{'loss': 184.4754, 'grad_norm': 2897.234479060767, 'learning_rate': 1.7041490856943682e-05, 'epoch': 0.36} +{'loss': 175.4248, 'grad_norm': 2386.317231600371, 'learning_rate': 1.7038394795403267e-05, 'epoch': 0.36} +{'loss': 173.0614, 'grad_norm': 2259.367549251842, 'learning_rate': 1.703529739626802e-05, 'epoch': 0.36} +{'loss': 176.7604, 'grad_norm': 2404.1668867911453, 'learning_rate': 1.703219866012659e-05, 'epoch': 0.36} +{'loss': 175.0992, 'grad_norm': 2387.5338982703615, 'learning_rate': 1.7029098587567864e-05, 'epoch': 0.36} +{'loss': 177.2503, 'grad_norm': 2330.40750045295, 'learning_rate': 1.7025997179180985e-05, 'epoch': 0.36} +{'loss': 178.8931, 'grad_norm': 2385.1651654525444, 'learning_rate': 1.7022894435555356e-05, 'epoch': 0.36} +{'loss': 174.8912, 'grad_norm': 2282.4605924205434, 'learning_rate': 1.701979035728063e-05, 'epoch': 0.36} +{'loss': 171.886, 'grad_norm': 2438.5593672594446, 'learning_rate': 1.7016684944946716e-05, 'epoch': 0.36} +{'loss': 184.4558, 'grad_norm': 2732.1498404976937, 'learning_rate': 1.701357819914377e-05, 'epoch': 0.36} +{'loss': 182.2216, 'grad_norm': 2411.824003506597, 'learning_rate': 1.701047012046221e-05, 'epoch': 0.36} +{'loss': 181.5982, 'grad_norm': 2152.4127930827713, 'learning_rate': 1.7007360709492703e-05, 'epoch': 0.36} +{'loss': 
178.5429, 'grad_norm': 2330.3320296667166, 'learning_rate': 1.700424996682617e-05, 'epoch': 0.36} +{'loss': 176.2641, 'grad_norm': 2186.7914829610104, 'learning_rate': 1.7001137893053782e-05, 'epoch': 0.36} +{'loss': 177.9809, 'grad_norm': 2088.0260043579287, 'learning_rate': 1.6998024488766963e-05, 'epoch': 0.37} +{'loss': 177.4387, 'grad_norm': 2079.6154222766154, 'learning_rate': 1.6994909754557404e-05, 'epoch': 0.37} +{'loss': 177.433, 'grad_norm': 2183.466559078124, 'learning_rate': 1.6991793691017023e-05, 'epoch': 0.37} +{'loss': 178.0145, 'grad_norm': 2295.782229462367, 'learning_rate': 1.698867629873801e-05, 'epoch': 0.37} +{'loss': 177.8338, 'grad_norm': 2892.2734320364043, 'learning_rate': 1.698555757831281e-05, 'epoch': 0.37} +{'loss': 184.1877, 'grad_norm': 2996.3771738430028, 'learning_rate': 1.6982437530334104e-05, 'epoch': 0.37} +{'loss': 182.9983, 'grad_norm': 3190.4399460097943, 'learning_rate': 1.6979316155394834e-05, 'epoch': 0.37} +{'loss': 173.3924, 'grad_norm': 2239.785246862199, 'learning_rate': 1.69761934540882e-05, 'epoch': 0.37} +{'loss': 175.2241, 'grad_norm': 2396.14526474857, 'learning_rate': 1.697306942700764e-05, 'epoch': 0.37} +{'loss': 178.0326, 'grad_norm': 2379.3689961026867, 'learning_rate': 1.696994407474686e-05, 'epoch': 0.37} +{'loss': 177.1986, 'grad_norm': 2410.8532166804907, 'learning_rate': 1.696681739789981e-05, 'epoch': 0.37} +{'loss': 177.919, 'grad_norm': 2222.5927543431767, 'learning_rate': 1.6963689397060682e-05, 'epoch': 0.37} +{'loss': 181.1941, 'grad_norm': 2176.1194463955, 'learning_rate': 1.6960560072823944e-05, 'epoch': 0.37} +{'loss': 179.2485, 'grad_norm': 2527.2452963191886, 'learning_rate': 1.695742942578429e-05, 'epoch': 0.37} +{'loss': 177.662, 'grad_norm': 2174.6958620870887, 'learning_rate': 1.6954297456536677e-05, 'epoch': 0.37} +{'loss': 173.1366, 'grad_norm': 2791.5798323812796, 'learning_rate': 1.6951164165676317e-05, 'epoch': 0.37} +{'loss': 173.7264, 'grad_norm': 2544.8977947321373, 'learning_rate': 1.6948029553798666e-05, 'epoch': 0.37} +{'loss': 177.3325, 'grad_norm': 3275.6504677163807, 'learning_rate': 1.694489362149944e-05, 'epoch': 0.37} +{'loss': 177.7088, 'grad_norm': 2068.6967771660875, 'learning_rate': 1.6941756369374586e-05, 'epoch': 0.37} +{'loss': 172.9436, 'grad_norm': 2461.277618018355, 'learning_rate': 1.6938617798020328e-05, 'epoch': 0.37} +{'loss': 196.3911, 'grad_norm': 2503.855219241769, 'learning_rate': 1.6935477908033124e-05, 'epoch': 0.37} +{'loss': 178.885, 'grad_norm': 2318.9894707097783, 'learning_rate': 1.6932336700009687e-05, 'epoch': 0.37} +{'loss': 181.0119, 'grad_norm': 3104.455229961397, 'learning_rate': 1.692919417454698e-05, 'epoch': 0.37} +{'loss': 177.9388, 'grad_norm': 2345.3092953386617, 'learning_rate': 1.6926050332242216e-05, 'epoch': 0.37} +{'loss': 178.5692, 'grad_norm': 2503.719933471244, 'learning_rate': 1.692290517369286e-05, 'epoch': 0.37} +{'loss': 177.9877, 'grad_norm': 2150.247292196147, 'learning_rate': 1.691975869949663e-05, 'epoch': 0.37} +{'loss': 171.3645, 'grad_norm': 2131.07798483423, 'learning_rate': 1.6916610910251482e-05, 'epoch': 0.37} +{'loss': 174.8747, 'grad_norm': 2189.767856327906, 'learning_rate': 1.6913461806555635e-05, 'epoch': 0.37} +{'loss': 174.6612, 'grad_norm': 2008.3700498291196, 'learning_rate': 1.691031138900755e-05, 'epoch': 0.37} +{'loss': 176.6956, 'grad_norm': 2033.159028563772, 'learning_rate': 1.6907159658205946e-05, 'epoch': 0.37} +{'loss': 172.1559, 'grad_norm': 2345.0200986389555, 'learning_rate': 1.690400661474978e-05, 'epoch': 0.37} 
+{'loss': 176.3835, 'grad_norm': 2077.167185888612, 'learning_rate': 1.690085225923827e-05, 'epoch': 0.37} +{'loss': 177.0809, 'grad_norm': 2116.7453037467844, 'learning_rate': 1.6897696592270873e-05, 'epoch': 0.37} +{'loss': 177.5036, 'grad_norm': 2087.7140297847814, 'learning_rate': 1.6894539614447304e-05, 'epoch': 0.37} +{'loss': 178.7069, 'grad_norm': 2435.907566349486, 'learning_rate': 1.689138132636752e-05, 'epoch': 0.37} +{'loss': 187.2689, 'grad_norm': 2186.735423801424, 'learning_rate': 1.6888221728631734e-05, 'epoch': 0.37} +{'loss': 173.6673, 'grad_norm': 2217.1478833872084, 'learning_rate': 1.6885060821840398e-05, 'epoch': 0.37} +{'loss': 184.1857, 'grad_norm': 2752.436698323492, 'learning_rate': 1.6881898606594225e-05, 'epoch': 0.37} +{'loss': 178.3533, 'grad_norm': 2565.1603525140054, 'learning_rate': 1.687873508349417e-05, 'epoch': 0.37} +{'loss': 172.8136, 'grad_norm': 2269.776302675692, 'learning_rate': 1.6875570253141434e-05, 'epoch': 0.37} +{'loss': 178.6375, 'grad_norm': 2003.1783147361548, 'learning_rate': 1.6872404116137472e-05, 'epoch': 0.37} +{'loss': 176.6674, 'grad_norm': 2061.7363260438287, 'learning_rate': 1.686923667308398e-05, 'epoch': 0.37} +{'loss': 178.7815, 'grad_norm': 2253.945229821322, 'learning_rate': 1.6866067924582917e-05, 'epoch': 0.37} +{'loss': 179.3106, 'grad_norm': 2344.785726649761, 'learning_rate': 1.6862897871236476e-05, 'epoch': 0.37} +{'loss': 176.989, 'grad_norm': 2260.3669684647425, 'learning_rate': 1.6859726513647093e-05, 'epoch': 0.37} +{'loss': 179.3294, 'grad_norm': 2107.0869211600498, 'learning_rate': 1.6856553852417477e-05, 'epoch': 0.37} +{'loss': 175.9617, 'grad_norm': 2179.026507203713, 'learning_rate': 1.6853379888150555e-05, 'epoch': 0.37} +{'loss': 174.0893, 'grad_norm': 2365.419783978548, 'learning_rate': 1.685020462144952e-05, 'epoch': 0.37} +{'loss': 176.7547, 'grad_norm': 2398.6275068783216, 'learning_rate': 1.6847028052917814e-05, 'epoch': 0.37} +{'loss': 175.9799, 'grad_norm': 2255.516873618414, 'learning_rate': 1.684385018315911e-05, 'epoch': 0.37} +{'loss': 181.3903, 'grad_norm': 2359.6028920041813, 'learning_rate': 1.6840671012777347e-05, 'epoch': 0.37} +{'loss': 177.241, 'grad_norm': 2670.8058793619016, 'learning_rate': 1.68374905423767e-05, 'epoch': 0.37} +{'loss': 176.5216, 'grad_norm': 2409.7172733700236, 'learning_rate': 1.6834308772561588e-05, 'epoch': 0.38} +{'loss': 172.1126, 'grad_norm': 2139.6205758901274, 'learning_rate': 1.6831125703936692e-05, 'epoch': 0.38} +{'loss': 173.1965, 'grad_norm': 2309.722073997241, 'learning_rate': 1.6827941337106927e-05, 'epoch': 0.38} +{'loss': 180.6558, 'grad_norm': 2565.1721619597324, 'learning_rate': 1.6824755672677458e-05, 'epoch': 0.38} +{'loss': 174.7729, 'grad_norm': 2377.288308975535, 'learning_rate': 1.68215687112537e-05, 'epoch': 0.38} +{'loss': 178.3177, 'grad_norm': 2039.9612915822318, 'learning_rate': 1.6818380453441304e-05, 'epoch': 0.38} +{'loss': 179.4268, 'grad_norm': 2334.3504238354885, 'learning_rate': 1.681519089984618e-05, 'epoch': 0.38} +{'loss': 176.0501, 'grad_norm': 2190.0760622830294, 'learning_rate': 1.681200005107448e-05, 'epoch': 0.38} +{'loss': 174.5067, 'grad_norm': 2416.5764764419855, 'learning_rate': 1.6808807907732595e-05, 'epoch': 0.38} +{'loss': 174.7078, 'grad_norm': 2350.9611005738243, 'learning_rate': 1.6805614470427176e-05, 'epoch': 0.38} +{'loss': 175.5328, 'grad_norm': 2056.7334152785425, 'learning_rate': 1.6802419739765114e-05, 'epoch': 0.38} +{'loss': 174.7458, 'grad_norm': 2141.141307136222, 'learning_rate': 1.679922371635353e-05, 
'epoch': 0.38} +{'loss': 175.3115, 'grad_norm': 1918.6805812422035, 'learning_rate': 1.6796026400799817e-05, 'epoch': 0.38} +{'loss': 180.1646, 'grad_norm': 2074.253365860128, 'learning_rate': 1.67928277937116e-05, 'epoch': 0.38} +{'loss': 175.6724, 'grad_norm': 3150.6915421957524, 'learning_rate': 1.6789627895696745e-05, 'epoch': 0.38} +{'loss': 171.1596, 'grad_norm': 3187.157245771054, 'learning_rate': 1.6786426707363374e-05, 'epoch': 0.38} +{'loss': 174.0127, 'grad_norm': 2592.423203309771, 'learning_rate': 1.6783224229319847e-05, 'epoch': 0.38} +{'loss': 171.1167, 'grad_norm': 2158.831185399281, 'learning_rate': 1.678002046217477e-05, 'epoch': 0.38} +{'loss': 174.4866, 'grad_norm': 2367.2126906262483, 'learning_rate': 1.6776815406536996e-05, 'epoch': 0.38} +{'loss': 180.5608, 'grad_norm': 2521.3287021187257, 'learning_rate': 1.6773609063015622e-05, 'epoch': 0.38} +{'loss': 173.8653, 'grad_norm': 2399.0266280029923, 'learning_rate': 1.6770401432219995e-05, 'epoch': 0.38} +{'loss': 174.4729, 'grad_norm': 2638.8074140377103, 'learning_rate': 1.6767192514759695e-05, 'epoch': 0.38} +{'loss': 177.1506, 'grad_norm': 2273.4517136564773, 'learning_rate': 1.6763982311244555e-05, 'epoch': 0.38} +{'loss': 172.871, 'grad_norm': 2320.2654331128556, 'learning_rate': 1.676077082228465e-05, 'epoch': 0.38} +{'loss': 176.3006, 'grad_norm': 2059.859294817196, 'learning_rate': 1.67575580484903e-05, 'epoch': 0.38} +{'loss': 172.5088, 'grad_norm': 2106.2098624182677, 'learning_rate': 1.6754343990472072e-05, 'epoch': 0.38} +{'loss': 173.6675, 'grad_norm': 2926.0217940727753, 'learning_rate': 1.6751128648840766e-05, 'epoch': 0.38} +{'loss': 170.6179, 'grad_norm': 2833.4811649942653, 'learning_rate': 1.674791202420744e-05, 'epoch': 0.38} +{'loss': 174.5535, 'grad_norm': 2310.88660188628, 'learning_rate': 1.6744694117183392e-05, 'epoch': 0.38} +{'loss': 169.0575, 'grad_norm': 2678.1252789881655, 'learning_rate': 1.6741474928380156e-05, 'epoch': 0.38} +{'loss': 174.0586, 'grad_norm': 2796.058254533567, 'learning_rate': 1.6738254458409518e-05, 'epoch': 0.38} +{'loss': 169.911, 'grad_norm': 1876.9451828072229, 'learning_rate': 1.6735032707883502e-05, 'epoch': 0.38} +{'loss': 172.8506, 'grad_norm': 2432.7146701931842, 'learning_rate': 1.6731809677414382e-05, 'epoch': 0.38} +{'loss': 174.3377, 'grad_norm': 2731.6124823935197, 'learning_rate': 1.6728585367614664e-05, 'epoch': 0.38} +{'loss': 174.3543, 'grad_norm': 2205.1574448448014, 'learning_rate': 1.672535977909711e-05, 'epoch': 0.38} +{'loss': 176.3505, 'grad_norm': 2213.147762089198, 'learning_rate': 1.672213291247472e-05, 'epoch': 0.38} +{'loss': 175.9528, 'grad_norm': 2304.0768710065013, 'learning_rate': 1.671890476836073e-05, 'epoch': 0.38} +{'loss': 179.1109, 'grad_norm': 2064.0555454301416, 'learning_rate': 1.6715675347368633e-05, 'epoch': 0.38} +{'loss': 181.0836, 'grad_norm': 2552.971026352866, 'learning_rate': 1.6712444650112152e-05, 'epoch': 0.38} +{'loss': 181.2831, 'grad_norm': 2222.0645086228637, 'learning_rate': 1.670921267720526e-05, 'epoch': 0.38} +{'loss': 186.8612, 'grad_norm': 2190.40270752582, 'learning_rate': 1.6705979429262164e-05, 'epoch': 0.38} +{'loss': 179.3468, 'grad_norm': 2370.8523768370424, 'learning_rate': 1.6702744906897318e-05, 'epoch': 0.38} +{'loss': 179.8203, 'grad_norm': 2541.083271881423, 'learning_rate': 1.6699509110725424e-05, 'epoch': 0.38} +{'loss': 177.4306, 'grad_norm': 2784.6659830328276, 'learning_rate': 1.6696272041361427e-05, 'epoch': 0.38} +{'loss': 171.2616, 'grad_norm': 2097.552344608413, 'learning_rate': 
1.6693033699420493e-05, 'epoch': 0.38} +{'loss': 179.5593, 'grad_norm': 2374.7716600622284, 'learning_rate': 1.6689794085518057e-05, 'epoch': 0.38} +{'loss': 172.8365, 'grad_norm': 2310.1625403856387, 'learning_rate': 1.668655320026978e-05, 'epoch': 0.38} +{'loss': 177.3793, 'grad_norm': 2369.1208729172417, 'learning_rate': 1.6683311044291563e-05, 'epoch': 0.38} +{'loss': 175.8258, 'grad_norm': 2201.8256465384598, 'learning_rate': 1.6680067618199566e-05, 'epoch': 0.38} +{'loss': 175.4538, 'grad_norm': 2120.3650064650265, 'learning_rate': 1.6676822922610166e-05, 'epoch': 0.38} +{'loss': 174.4195, 'grad_norm': 2511.458587184414, 'learning_rate': 1.6673576958139997e-05, 'epoch': 0.38} +[2025-05-28 10:06:15,049] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step20000 is about to be saved! +[2025-05-28 10:06:15,074] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-20000/global_step20000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-28 10:06:15,074] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-20000/global_step20000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-28 10:06:15,130] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-20000/global_step20000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-28 10:06:15,537] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-20000/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-28 10:12:14,160] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-20000/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2025-05-28 10:12:14,232] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-20000/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-28 10:12:14,330] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step20000 is ready now! +[2025-05-28 10:12:14,365] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step20000 is about to be saved! +[2025-05-28 10:12:14,395] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-20000/global_step20000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-28 10:12:14,395] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-20000/global_step20000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-28 10:12:14,451] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-20000/global_step20000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-28 10:12:14,535] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-20000/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-28 10:18:26,875] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-20000/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 
+[2025-05-28 10:18:26,994] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-20000/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-28 10:18:34,469] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step20000 is ready now! +{'loss': 174.3799, 'grad_norm': 2197.0308033051942, 'learning_rate': 1.6670329725405936e-05, 'epoch': 0.38} +{'loss': 172.5405, 'grad_norm': 2165.2480731541173, 'learning_rate': 1.6667081225025087e-05, 'epoch': 0.39} +{'loss': 169.3103, 'grad_norm': 2002.0055554898668, 'learning_rate': 1.666383145761481e-05, 'epoch': 0.39} +{'loss': 174.287, 'grad_norm': 2293.2406677050035, 'learning_rate': 1.6660580423792697e-05, 'epoch': 0.39} +{'loss': 174.7951, 'grad_norm': 2067.916511138974, 'learning_rate': 1.6657328124176582e-05, 'epoch': 0.39} +{'loss': 173.6627, 'grad_norm': 2090.5572534632624, 'learning_rate': 1.6654074559384542e-05, 'epoch': 0.39} +{'loss': 178.5275, 'grad_norm': 2776.098723347367, 'learning_rate': 1.665081973003489e-05, 'epoch': 0.39} +{'loss': 172.2911, 'grad_norm': 2361.328274217663, 'learning_rate': 1.6647563636746188e-05, 'epoch': 0.39} +{'loss': 172.674, 'grad_norm': 2357.548182739666, 'learning_rate': 1.6644306280137227e-05, 'epoch': 0.39} +{'loss': 177.4894, 'grad_norm': 2104.6920890226756, 'learning_rate': 1.6641047660827042e-05, 'epoch': 0.39} +{'loss': 177.2969, 'grad_norm': 1915.7871660758963, 'learning_rate': 1.6637787779434917e-05, 'epoch': 0.39} +{'loss': 172.9952, 'grad_norm': 2114.302302671334, 'learning_rate': 1.663452663658036e-05, 'epoch': 0.39} +{'loss': 179.34, 'grad_norm': 2487.322363091351, 'learning_rate': 1.6631264232883136e-05, 'epoch': 0.39} +{'loss': 176.2812, 'grad_norm': 2287.313349527756, 'learning_rate': 1.6628000568963233e-05, 'epoch': 0.39} +{'loss': 177.1346, 'grad_norm': 2374.6074185522657, 'learning_rate': 1.6624735645440887e-05, 'epoch': 0.39} +{'loss': 177.9803, 'grad_norm': 2178.3001099164694, 'learning_rate': 1.662146946293658e-05, 'epoch': 0.39} +{'loss': 178.1638, 'grad_norm': 2853.5871986715865, 'learning_rate': 1.6618202022071014e-05, 'epoch': 0.39} +{'loss': 170.663, 'grad_norm': 2383.77636987694, 'learning_rate': 1.6614933323465155e-05, 'epoch': 0.39} +{'loss': 174.8104, 'grad_norm': 2013.5309918567434, 'learning_rate': 1.6611663367740182e-05, 'epoch': 0.39} +{'loss': 172.2717, 'grad_norm': 2363.870127133821, 'learning_rate': 1.6608392155517536e-05, 'epoch': 0.39} +{'loss': 178.4544, 'grad_norm': 2032.3643013256876, 'learning_rate': 1.6605119687418882e-05, 'epoch': 0.39} +{'loss': 175.8825, 'grad_norm': 2563.477574688229, 'learning_rate': 1.6601845964066128e-05, 'epoch': 0.39} +{'loss': 172.412, 'grad_norm': 2625.5594886954073, 'learning_rate': 1.6598570986081424e-05, 'epoch': 0.39} +{'loss': 184.3773, 'grad_norm': 2332.5529133753344, 'learning_rate': 1.659529475408715e-05, 'epoch': 0.39} +{'loss': 173.463, 'grad_norm': 2192.008904232008, 'learning_rate': 1.6592017268705935e-05, 'epoch': 0.39} +{'loss': 173.372, 'grad_norm': 2196.6432148075323, 'learning_rate': 1.6588738530560638e-05, 'epoch': 0.39} +{'loss': 173.1969, 'grad_norm': 2538.4282756020866, 'learning_rate': 1.6585458540274362e-05, 'epoch': 0.39} +{'loss': 176.4724, 'grad_norm': 2463.690812552943, 'learning_rate': 1.6582177298470442e-05, 'epoch': 0.39} +{'loss': 171.9195, 'grad_norm': 1947.6353869879417, 'learning_rate': 1.6578894805772453e-05, 'epoch': 0.39} +{'loss': 173.8802, 'grad_norm': 2082.7142820281288, 
'learning_rate': 1.657561106280421e-05, 'epoch': 0.39} +{'loss': 177.5473, 'grad_norm': 2244.645552898031, 'learning_rate': 1.6572326070189766e-05, 'epoch': 0.39} +{'loss': 181.6909, 'grad_norm': 3285.091897740648, 'learning_rate': 1.6569039828553402e-05, 'epoch': 0.39} +{'loss': 178.8138, 'grad_norm': 2663.4032144153725, 'learning_rate': 1.6565752338519653e-05, 'epoch': 0.39} +{'loss': 175.5057, 'grad_norm': 2161.718216387654, 'learning_rate': 1.6562463600713277e-05, 'epoch': 0.39} +{'loss': 170.7948, 'grad_norm': 2089.4331367203, 'learning_rate': 1.655917361575928e-05, 'epoch': 0.39} +{'loss': 175.1157, 'grad_norm': 2314.27797400209, 'learning_rate': 1.655588238428289e-05, 'epoch': 0.39} +{'loss': 174.426, 'grad_norm': 2186.04542450256, 'learning_rate': 1.6552589906909586e-05, 'epoch': 0.39} +{'loss': 171.6056, 'grad_norm': 2283.2117692617526, 'learning_rate': 1.654929618426508e-05, 'epoch': 0.39} +{'loss': 170.4684, 'grad_norm': 2477.4510894303926, 'learning_rate': 1.654600121697532e-05, 'epoch': 0.39} +{'loss': 171.0848, 'grad_norm': 2305.2220603799055, 'learning_rate': 1.6542705005666485e-05, 'epoch': 0.39} +{'loss': 178.0409, 'grad_norm': 2307.4268270587895, 'learning_rate': 1.6539407550965007e-05, 'epoch': 0.39} +{'loss': 175.0824, 'grad_norm': 2166.2084340132137, 'learning_rate': 1.6536108853497527e-05, 'epoch': 0.39} +{'loss': 179.2245, 'grad_norm': 2772.961367950514, 'learning_rate': 1.6532808913890952e-05, 'epoch': 0.39} +{'loss': 169.3666, 'grad_norm': 2095.4458255444624, 'learning_rate': 1.65295077327724e-05, 'epoch': 0.39} +{'loss': 172.9335, 'grad_norm': 1991.8661087953037, 'learning_rate': 1.6526205310769244e-05, 'epoch': 0.39} +{'loss': 173.4275, 'grad_norm': 3063.9096868274846, 'learning_rate': 1.6522901648509085e-05, 'epoch': 0.39} +{'loss': 172.501, 'grad_norm': 2066.107583881089, 'learning_rate': 1.6519596746619752e-05, 'epoch': 0.39} +{'loss': 170.6929, 'grad_norm': 2186.6051909780704, 'learning_rate': 1.6516290605729323e-05, 'epoch': 0.39} +{'loss': 173.4086, 'grad_norm': 2179.268371703361, 'learning_rate': 1.651298322646611e-05, 'epoch': 0.39} +{'loss': 172.6752, 'grad_norm': 3487.9379851825606, 'learning_rate': 1.6509674609458645e-05, 'epoch': 0.39} +{'loss': 175.1382, 'grad_norm': 3097.152107031946, 'learning_rate': 1.650636475533571e-05, 'epoch': 0.39} +{'loss': 172.7493, 'grad_norm': 2680.610882800094, 'learning_rate': 1.6503053664726323e-05, 'epoch': 0.39} +{'loss': 174.6113, 'grad_norm': 2274.057391160151, 'learning_rate': 1.649974133825973e-05, 'epoch': 0.39} +{'loss': 172.8187, 'grad_norm': 2010.7049555911099, 'learning_rate': 1.649642777656541e-05, 'epoch': 0.4} +{'loss': 175.2653, 'grad_norm': 2073.905901785732, 'learning_rate': 1.6493112980273086e-05, 'epoch': 0.4} +{'loss': 171.6175, 'grad_norm': 2496.0401336022046, 'learning_rate': 1.6489796950012707e-05, 'epoch': 0.4} +{'loss': 173.4442, 'grad_norm': 2239.2130695782025, 'learning_rate': 1.6486479686414465e-05, 'epoch': 0.4} +{'loss': 166.1389, 'grad_norm': 2953.3930158115077, 'learning_rate': 1.6483161190108778e-05, 'epoch': 0.4} +{'loss': 176.0549, 'grad_norm': 2521.4369977791553, 'learning_rate': 1.64798414617263e-05, 'epoch': 0.4} +{'loss': 167.911, 'grad_norm': 2140.5820936047753, 'learning_rate': 1.647652050189792e-05, 'epoch': 0.4} +{'loss': 170.6763, 'grad_norm': 2315.403742106998, 'learning_rate': 1.6473198311254767e-05, 'epoch': 0.4} +{'loss': 171.7888, 'grad_norm': 2179.713892209417, 'learning_rate': 1.6469874890428195e-05, 'epoch': 0.4} +{'loss': 176.8984, 'grad_norm': 2144.3296022615573, 
'learning_rate': 1.64665502400498e-05, 'epoch': 0.4} +{'loss': 172.9721, 'grad_norm': 2125.698279767421, 'learning_rate': 1.6463224360751397e-05, 'epoch': 0.4} +{'loss': 172.4979, 'grad_norm': 2125.0138496355853, 'learning_rate': 1.645989725316506e-05, 'epoch': 0.4} +{'loss': 173.3186, 'grad_norm': 2221.3539191664686, 'learning_rate': 1.6456568917923066e-05, 'epoch': 0.4} +{'loss': 167.3925, 'grad_norm': 2354.0069978550314, 'learning_rate': 1.6453239355657947e-05, 'epoch': 0.4} +{'loss': 176.6859, 'grad_norm': 2353.8943748820757, 'learning_rate': 1.6449908567002464e-05, 'epoch': 0.4} +{'loss': 171.2473, 'grad_norm': 2357.301332931081, 'learning_rate': 1.6446576552589608e-05, 'epoch': 0.4} +{'loss': 183.3972, 'grad_norm': 2259.9305013856247, 'learning_rate': 1.6443243313052603e-05, 'epoch': 0.4} +{'loss': 173.3809, 'grad_norm': 1904.8820431486781, 'learning_rate': 1.64399088490249e-05, 'epoch': 0.4} +{'loss': 172.1442, 'grad_norm': 2461.3653626945966, 'learning_rate': 1.64365731611402e-05, 'epoch': 0.4} +{'loss': 174.7375, 'grad_norm': 2482.655689764607, 'learning_rate': 1.643323625003242e-05, 'epoch': 0.4} +{'loss': 182.6854, 'grad_norm': 2888.0401543482785, 'learning_rate': 1.6429898116335718e-05, 'epoch': 0.4} +{'loss': 176.7023, 'grad_norm': 2840.806825395549, 'learning_rate': 1.6426558760684472e-05, 'epoch': 0.4} +{'loss': 172.8153, 'grad_norm': 2310.2268960773863, 'learning_rate': 1.6423218183713313e-05, 'epoch': 0.4} +{'loss': 170.5116, 'grad_norm': 2929.280509274687, 'learning_rate': 1.641987638605709e-05, 'epoch': 0.4} +{'loss': 177.7323, 'grad_norm': 2199.802234992196, 'learning_rate': 1.641653336835089e-05, 'epoch': 0.4} +{'loss': 171.3734, 'grad_norm': 2054.4451691484837, 'learning_rate': 1.6413189131230022e-05, 'epoch': 0.4} +{'loss': 171.5077, 'grad_norm': 2040.359693913783, 'learning_rate': 1.6409843675330032e-05, 'epoch': 0.4} +{'loss': 174.7737, 'grad_norm': 2230.652100475331, 'learning_rate': 1.640649700128671e-05, 'epoch': 0.4} +{'loss': 174.2601, 'grad_norm': 2529.8337267001643, 'learning_rate': 1.6403149109736055e-05, 'epoch': 0.4} +{'loss': 171.581, 'grad_norm': 2439.956482879401, 'learning_rate': 1.639980000131432e-05, 'epoch': 0.4} +{'loss': 174.2593, 'grad_norm': 2251.5341278643123, 'learning_rate': 1.6396449676657967e-05, 'epoch': 0.4} +{'loss': 174.6037, 'grad_norm': 2421.921040395491, 'learning_rate': 1.6393098136403708e-05, 'epoch': 0.4} +{'loss': 173.4213, 'grad_norm': 2125.454791849175, 'learning_rate': 1.6389745381188475e-05, 'epoch': 0.4} +{'loss': 170.8362, 'grad_norm': 2503.2494052410066, 'learning_rate': 1.638639141164944e-05, 'epoch': 0.4} +{'loss': 175.6863, 'grad_norm': 2083.7760173162933, 'learning_rate': 1.638303622842399e-05, 'epoch': 0.4} +{'loss': 173.4057, 'grad_norm': 2135.753528167582, 'learning_rate': 1.6379679832149763e-05, 'epoch': 0.4} +{'loss': 168.6634, 'grad_norm': 1992.5786167732078, 'learning_rate': 1.637632222346461e-05, 'epoch': 0.4} +{'loss': 168.5115, 'grad_norm': 2312.476594283389, 'learning_rate': 1.6372963403006624e-05, 'epoch': 0.4} +{'loss': 170.4934, 'grad_norm': 1935.090526720446, 'learning_rate': 1.6369603371414123e-05, 'epoch': 0.4} +{'loss': 174.8277, 'grad_norm': 2398.93784906803, 'learning_rate': 1.6366242129325652e-05, 'epoch': 0.4} +{'loss': 171.7041, 'grad_norm': 2460.127542004893, 'learning_rate': 1.6362879677379995e-05, 'epoch': 0.4} +{'loss': 173.3167, 'grad_norm': 2308.3590950330686, 'learning_rate': 1.6359516016216163e-05, 'epoch': 0.4} +{'loss': 173.6453, 'grad_norm': 2120.3592671014544, 'learning_rate': 
1.6356151146473386e-05, 'epoch': 0.4} +{'loss': 172.9284, 'grad_norm': 2280.5628635923586, 'learning_rate': 1.6352785068791143e-05, 'epoch': 0.4} +{'loss': 173.2407, 'grad_norm': 1908.689485572536, 'learning_rate': 1.6349417783809123e-05, 'epoch': 0.4} +{'loss': 175.3489, 'grad_norm': 4943.1022885007915, 'learning_rate': 1.6346049292167264e-05, 'epoch': 0.4} +{'loss': 174.827, 'grad_norm': 2224.3401036768614, 'learning_rate': 1.634267959450571e-05, 'epoch': 0.4} +{'loss': 170.9078, 'grad_norm': 2219.7688328444215, 'learning_rate': 1.633930869146486e-05, 'epoch': 0.4} +{'loss': 165.0249, 'grad_norm': 2041.5189237537495, 'learning_rate': 1.6335936583685317e-05, 'epoch': 0.4} +{'loss': 170.4682, 'grad_norm': 2842.158131125609, 'learning_rate': 1.6332563271807935e-05, 'epoch': 0.4} +{'loss': 173.8394, 'grad_norm': 2340.252843728698, 'learning_rate': 1.6329188756473785e-05, 'epoch': 0.4} +{'loss': 171.2991, 'grad_norm': 2246.18480649987, 'learning_rate': 1.6325813038324163e-05, 'epoch': 0.4} +{'loss': 168.8408, 'grad_norm': 2282.5667499826914, 'learning_rate': 1.6322436118000604e-05, 'epoch': 0.41} +{'loss': 169.3136, 'grad_norm': 1999.0457763720995, 'learning_rate': 1.6319057996144868e-05, 'epoch': 0.41} +{'loss': 174.2958, 'grad_norm': 1885.2702099661788, 'learning_rate': 1.6315678673398935e-05, 'epoch': 0.41} +{'loss': 170.769, 'grad_norm': 2147.8962934708775, 'learning_rate': 1.631229815040503e-05, 'epoch': 0.41} +{'loss': 176.7423, 'grad_norm': 2028.1420421501932, 'learning_rate': 1.6308916427805587e-05, 'epoch': 0.41} +{'loss': 168.8739, 'grad_norm': 2534.523326049801, 'learning_rate': 1.6305533506243287e-05, 'epoch': 0.41} +{'loss': 166.7589, 'grad_norm': 2117.2943363096606, 'learning_rate': 1.6302149386361023e-05, 'epoch': 0.41} +{'loss': 172.0037, 'grad_norm': 2489.1800728224184, 'learning_rate': 1.629876406880192e-05, 'epoch': 0.41} +{'loss': 173.4496, 'grad_norm': 2663.5897929045923, 'learning_rate': 1.6295377554209338e-05, 'epoch': 0.41} +{'loss': 165.8455, 'grad_norm': 2196.2546092473003, 'learning_rate': 1.6291989843226855e-05, 'epoch': 0.41} +{'loss': 171.1656, 'grad_norm': 2308.783020464287, 'learning_rate': 1.628860093649828e-05, 'epoch': 0.41} +{'loss': 169.1301, 'grad_norm': 2218.3431893942475, 'learning_rate': 1.6285210834667657e-05, 'epoch': 0.41} +{'loss': 174.2514, 'grad_norm': 1900.4861769782995, 'learning_rate': 1.628181953837924e-05, 'epoch': 0.41} +{'loss': 171.8137, 'grad_norm': 2478.6617414968064, 'learning_rate': 1.6278427048277524e-05, 'epoch': 0.41} +{'loss': 173.7705, 'grad_norm': 2709.133949086513, 'learning_rate': 1.627503336500723e-05, 'epoch': 0.41} +{'loss': 184.3907, 'grad_norm': 2126.76471930929, 'learning_rate': 1.6271638489213297e-05, 'epoch': 0.41} +{'loss': 177.9338, 'grad_norm': 2212.819914584412, 'learning_rate': 1.6268242421540897e-05, 'epoch': 0.41} +{'loss': 169.0411, 'grad_norm': 2220.0884923042645, 'learning_rate': 1.626484516263543e-05, 'epoch': 0.41} +{'loss': 170.29, 'grad_norm': 2110.9940877809813, 'learning_rate': 1.626144671314252e-05, 'epoch': 0.41} +{'loss': 171.6301, 'grad_norm': 2021.1229338483954, 'learning_rate': 1.6258047073708012e-05, 'epoch': 0.41} +{'loss': 175.6044, 'grad_norm': 11387.521261578338, 'learning_rate': 1.6254646244977986e-05, 'epoch': 0.41} +{'loss': 177.9524, 'grad_norm': 2261.132970714459, 'learning_rate': 1.6251244227598747e-05, 'epoch': 0.41} +{'loss': 173.3971, 'grad_norm': 2102.183730314071, 'learning_rate': 1.624784102221682e-05, 'epoch': 0.41} +{'loss': 173.7868, 'grad_norm': 2351.0316417950503, 
'learning_rate': 1.624443662947896e-05, 'epoch': 0.41} +{'loss': 173.5521, 'grad_norm': 1816.2363838701744, 'learning_rate': 1.6241031050032146e-05, 'epoch': 0.41} +{'loss': 169.4277, 'grad_norm': 2028.167188326762, 'learning_rate': 1.6237624284523582e-05, 'epoch': 0.41} +{'loss': 165.7357, 'grad_norm': 2117.1050983124337, 'learning_rate': 1.62342163336007e-05, 'epoch': 0.41} +{'loss': 165.1966, 'grad_norm': 2240.9826447596283, 'learning_rate': 1.6230807197911155e-05, 'epoch': 0.41} +{'loss': 170.2427, 'grad_norm': 2021.1134661118945, 'learning_rate': 1.622739687810283e-05, 'epoch': 0.41} +{'loss': 168.7079, 'grad_norm': 2097.294761784478, 'learning_rate': 1.622398537482383e-05, 'epoch': 0.41} +{'loss': 171.2775, 'grad_norm': 2299.9936233525955, 'learning_rate': 1.6220572688722485e-05, 'epoch': 0.41} +{'loss': 169.077, 'grad_norm': 1887.400663224519, 'learning_rate': 1.621715882044735e-05, 'epoch': 0.41} +{'loss': 174.1272, 'grad_norm': 2433.761607428838, 'learning_rate': 1.621374377064721e-05, 'epoch': 0.41} +{'loss': 168.9051, 'grad_norm': 2469.8360606374576, 'learning_rate': 1.6210327539971066e-05, 'epoch': 0.41} +{'loss': 171.0639, 'grad_norm': 2109.415731593552, 'learning_rate': 1.6206910129068146e-05, 'epoch': 0.41} +{'loss': 170.2893, 'grad_norm': 2329.793384918122, 'learning_rate': 1.620349153858791e-05, 'epoch': 0.41} +{'loss': 174.552, 'grad_norm': 2784.07153617418, 'learning_rate': 1.6200071769180026e-05, 'epoch': 0.41} +{'loss': 172.4811, 'grad_norm': 2159.598881664766, 'learning_rate': 1.6196650821494403e-05, 'epoch': 0.41} +{'loss': 171.136, 'grad_norm': 2031.0053364924227, 'learning_rate': 1.6193228696181166e-05, 'epoch': 0.41} +{'loss': 173.2919, 'grad_norm': 2049.4029601151583, 'learning_rate': 1.6189805393890665e-05, 'epoch': 0.41} +{'loss': 174.456, 'grad_norm': 2111.6748981712303, 'learning_rate': 1.6186380915273472e-05, 'epoch': 0.41} +{'loss': 173.2525, 'grad_norm': 2200.3555490295726, 'learning_rate': 1.6182955260980383e-05, 'epoch': 0.41} +{'loss': 170.2072, 'grad_norm': 1940.0979562577945, 'learning_rate': 1.617952843166242e-05, 'epoch': 0.41} +{'loss': 174.2216, 'grad_norm': 2193.850952303983, 'learning_rate': 1.6176100427970826e-05, 'epoch': 0.41} +{'loss': 169.0639, 'grad_norm': 2370.037608038932, 'learning_rate': 1.6172671250557065e-05, 'epoch': 0.41} +{'loss': 166.2878, 'grad_norm': 2040.7488144146394, 'learning_rate': 1.6169240900072826e-05, 'epoch': 0.41} +{'loss': 169.5681, 'grad_norm': 2163.917610630364, 'learning_rate': 1.616580937717003e-05, 'epoch': 0.41} +{'loss': 167.1433, 'grad_norm': 2243.874720992856, 'learning_rate': 1.6162376682500806e-05, 'epoch': 0.41} +{'loss': 175.2752, 'grad_norm': 2158.1031100078826, 'learning_rate': 1.6158942816717516e-05, 'epoch': 0.41} +{'loss': 171.0518, 'grad_norm': 2464.7619498015615, 'learning_rate': 1.6155507780472733e-05, 'epoch': 0.41} +{'loss': 169.8312, 'grad_norm': 2229.495383590754, 'learning_rate': 1.615207157441927e-05, 'epoch': 0.41} +{'loss': 170.5005, 'grad_norm': 2076.9211069578078, 'learning_rate': 1.6148634199210148e-05, 'epoch': 0.41} +{'loss': 168.1831, 'grad_norm': 2096.4247582891608, 'learning_rate': 1.6145195655498614e-05, 'epoch': 0.42} +{'loss': 174.4393, 'grad_norm': 2247.843518104514, 'learning_rate': 1.6141755943938137e-05, 'epoch': 0.42} +{'loss': 170.2732, 'grad_norm': 2199.0563224974576, 'learning_rate': 1.6138315065182413e-05, 'epoch': 0.42} +{'loss': 172.9941, 'grad_norm': 2088.5273106127233, 'learning_rate': 1.613487301988535e-05, 'epoch': 0.42} +{'loss': 175.4467, 'grad_norm': 
2324.534122332003, 'learning_rate': 1.6131429808701087e-05, 'epoch': 0.42} +{'loss': 176.1813, 'grad_norm': 2249.8448184155986, 'learning_rate': 1.612798543228398e-05, 'epoch': 0.42} +{'loss': 171.6567, 'grad_norm': 2052.215853633076, 'learning_rate': 1.6124539891288607e-05, 'epoch': 0.42} +{'loss': 170.8886, 'grad_norm': 2198.398005127088, 'learning_rate': 1.6121093186369767e-05, 'epoch': 0.42} +{'loss': 167.5376, 'grad_norm': 2833.0687088063023, 'learning_rate': 1.611764531818248e-05, 'epoch': 0.42} +{'loss': 172.5218, 'grad_norm': 2416.457621347399, 'learning_rate': 1.6114196287381997e-05, 'epoch': 0.42} +{'loss': 171.8825, 'grad_norm': 2170.018944750378, 'learning_rate': 1.6110746094623766e-05, 'epoch': 0.42} +{'loss': 169.7613, 'grad_norm': 2378.2070786271847, 'learning_rate': 1.610729474056348e-05, 'epoch': 0.42} +{'loss': 169.5359, 'grad_norm': 2033.6242362289918, 'learning_rate': 1.610384222585704e-05, 'epoch': 0.42} +{'loss': 166.7337, 'grad_norm': 2364.725842725486, 'learning_rate': 1.610038855116057e-05, 'epoch': 0.42} +{'loss': 173.6409, 'grad_norm': 2136.9689755954228, 'learning_rate': 1.6096933717130418e-05, 'epoch': 0.42} +{'loss': 167.9116, 'grad_norm': 1976.781654395267, 'learning_rate': 1.6093477724423147e-05, 'epoch': 0.42} +{'loss': 166.7784, 'grad_norm': 2113.343444472536, 'learning_rate': 1.609002057369555e-05, 'epoch': 0.42} +{'loss': 172.7661, 'grad_norm': 2188.361274680432, 'learning_rate': 1.6086562265604625e-05, 'epoch': 0.42} +{'loss': 170.1698, 'grad_norm': 2145.7512651889992, 'learning_rate': 1.60831028008076e-05, 'epoch': 0.42} +{'loss': 173.4158, 'grad_norm': 1936.4297230402603, 'learning_rate': 1.6079642179961917e-05, 'epoch': 0.42} +{'loss': 172.9104, 'grad_norm': 2258.433256621759, 'learning_rate': 1.607618040372525e-05, 'epoch': 0.42} +{'loss': 175.9763, 'grad_norm': 1811.343182380123, 'learning_rate': 1.6072717472755477e-05, 'epoch': 0.42} +{'loss': 170.8112, 'grad_norm': 2987.6573327591623, 'learning_rate': 1.6069253387710707e-05, 'epoch': 0.42} +{'loss': 170.7448, 'grad_norm': 2605.144955246662, 'learning_rate': 1.606578814924926e-05, 'epoch': 0.42} +{'loss': 171.9434, 'grad_norm': 2441.502487927553, 'learning_rate': 1.606232175802968e-05, 'epoch': 0.42} +{'loss': 164.794, 'grad_norm': 1959.7621453828476, 'learning_rate': 1.605885421471073e-05, 'epoch': 0.42} +{'loss': 166.0441, 'grad_norm': 2179.7145155187504, 'learning_rate': 1.605538551995139e-05, 'epoch': 0.42} +{'loss': 167.8531, 'grad_norm': 2010.6755933935121, 'learning_rate': 1.6051915674410857e-05, 'epoch': 0.42} +{'loss': 169.6157, 'grad_norm': 1935.562425525348, 'learning_rate': 1.6048444678748555e-05, 'epoch': 0.42} +{'loss': 167.2482, 'grad_norm': 1970.2666982876544, 'learning_rate': 1.604497253362412e-05, 'epoch': 0.42} +{'loss': 171.1745, 'grad_norm': 2229.155950431185, 'learning_rate': 1.6041499239697405e-05, 'epoch': 0.42} +{'loss': 170.9094, 'grad_norm': 2088.4977883563597, 'learning_rate': 1.6038024797628484e-05, 'epoch': 0.42} +{'loss': 169.9755, 'grad_norm': 1966.1198960938657, 'learning_rate': 1.603454920807765e-05, 'epoch': 0.42} +{'loss': 168.8541, 'grad_norm': 2153.651256395168, 'learning_rate': 1.603107247170541e-05, 'epoch': 0.42} +{'loss': 169.4073, 'grad_norm': 2063.8300187805635, 'learning_rate': 1.60275945891725e-05, 'epoch': 0.42} +{'loss': 175.3959, 'grad_norm': 2020.643160422108, 'learning_rate': 1.602411556113986e-05, 'epoch': 0.42} +{'loss': 172.6945, 'grad_norm': 2207.032295358164, 'learning_rate': 1.6020635388268647e-05, 'epoch': 0.42} +{'loss': 167.4566, 
'grad_norm': 2219.445873781522, 'learning_rate': 1.601715407122026e-05, 'epoch': 0.42} +{'loss': 171.0413, 'grad_norm': 2868.078748371678, 'learning_rate': 1.601367161065628e-05, 'epoch': 0.42} +{'loss': 167.619, 'grad_norm': 2462.686059489864, 'learning_rate': 1.601018800723853e-05, 'epoch': 0.42} +{'loss': 163.908, 'grad_norm': 2722.2963172098985, 'learning_rate': 1.600670326162904e-05, 'epoch': 0.42} +{'loss': 168.5813, 'grad_norm': 2208.7913506019054, 'learning_rate': 1.600321737449007e-05, 'epoch': 0.42} +{'loss': 166.0159, 'grad_norm': 1932.935809822312, 'learning_rate': 1.5999730346484076e-05, 'epoch': 0.42} +[2025-05-28 12:07:01,122] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step22000 is about to be saved! +[2025-05-28 12:07:01,220] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-22000/global_step22000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-28 12:07:01,221] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-22000/global_step22000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-28 12:07:01,316] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-22000/global_step22000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-28 12:07:01,437] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-22000/global_step22000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-28 12:13:12,588] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-22000/global_step22000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2025-05-28 12:13:12,664] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-22000/global_step22000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-28 12:13:13,110] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step22000 is ready now! +[2025-05-28 12:13:13,190] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step22000 is about to be saved! +[2025-05-28 12:13:13,227] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-22000/global_step22000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-28 12:13:13,227] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-22000/global_step22000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-28 12:13:13,259] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-22000/global_step22000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-28 12:13:13,416] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-22000/global_step22000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-28 12:19:18,655] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-22000/global_step22000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 
+[2025-05-28 12:19:18,726] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-22000/global_step22000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-05-28 12:19:31,391] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step22000 is ready now!
+{'loss': 166.4574, 'grad_norm': 2258.5963052505845, 'learning_rate': 1.5996242178273743e-05, 'epoch': 0.42}
+{'loss': 172.0938, 'grad_norm': 2574.443149561418, 'learning_rate': 1.599275287052198e-05, 'epoch': 0.42}
+{'loss': 168.0814, 'grad_norm': 1925.8711153268093, 'learning_rate': 1.5989262423891894e-05, 'epoch': 0.42}
+{'loss': 170.4323, 'grad_norm': 2057.9559007545736, 'learning_rate': 1.5985770839046822e-05, 'epoch': 0.42}
+{'loss': 173.412, 'grad_norm': 2554.907420051963, 'learning_rate': 1.598227811665032e-05, 'epoch': 0.42}
+{'loss': 171.4978, 'grad_norm': 2034.2208750376724, 'learning_rate': 1.5978784257366143e-05, 'epoch': 0.42}
+{'loss': 172.6309, 'grad_norm': 2272.9526445697516, 'learning_rate': 1.5975289261858283e-05, 'epoch': 0.42}
+{'loss': 172.6906, 'grad_norm': 4927.097232560345, 'learning_rate': 1.5971793130790933e-05, 'epoch': 0.42}
+{'loss': 170.065, 'grad_norm': 3223.576776602122, 'learning_rate': 1.5968295864828504e-05, 'epoch': 0.42}
+{'loss': 169.707, 'grad_norm': 2232.8519811421384, 'learning_rate': 1.5964797464635632e-05, 'epoch': 0.43}
+{'loss': 170.8295, 'grad_norm': 2048.278792695853, 'learning_rate': 1.596129793087716e-05, 'epoch': 0.43}
+{'loss': 174.8247, 'grad_norm': 2180.668838320117, 'learning_rate': 1.5957797264218145e-05, 'epoch': 0.43}
+{'loss': 168.976, 'grad_norm': 2033.9063928282792, 'learning_rate': 1.595429546532387e-05, 'epoch': 0.43}
+{'loss': 165.8783, 'grad_norm': 2402.5474259434964, 'learning_rate': 1.5950792534859816e-05, 'epoch': 0.43}
+{'loss': 170.4144, 'grad_norm': 2015.2042023710358, 'learning_rate': 1.59472884734917e-05, 'epoch': 0.43}
+{'loss': 167.5595, 'grad_norm': 2033.7995324285905, 'learning_rate': 1.5943783281885434e-05, 'epoch': 0.43}
+{'loss': 170.1899, 'grad_norm': 2153.948977064517, 'learning_rate': 1.594027696070716e-05, 'epoch': 0.43}
+{'loss': 167.8063, 'grad_norm': 2296.374139745664, 'learning_rate': 1.5936769510623222e-05, 'epoch': 0.43}
+{'loss': 169.3381, 'grad_norm': 2242.1324136413705, 'learning_rate': 1.5933260932300192e-05, 'epoch': 0.43}
+{'loss': 164.1773, 'grad_norm': 2024.8243572455265, 'learning_rate': 1.5929751226404843e-05, 'epoch': 0.43}
+{'loss': 167.295, 'grad_norm': 2130.0572716963384, 'learning_rate': 1.5926240393604172e-05, 'epoch': 0.43}
+{'loss': 168.5081, 'grad_norm': 2854.483754322574, 'learning_rate': 1.5922728434565387e-05, 'epoch': 0.43}
+{'loss': 168.6534, 'grad_norm': 1884.9639205407461, 'learning_rate': 1.5919215349955907e-05, 'epoch': 0.43}
+{'loss': 166.555, 'grad_norm': 2821.292894083381, 'learning_rate': 1.5915701140443372e-05, 'epoch': 0.43}
+{'loss': 166.006, 'grad_norm': 2107.950954851539, 'learning_rate': 1.5912185806695628e-05, 'epoch': 0.43}
+{'loss': 171.5676, 'grad_norm': 2650.24088279195, 'learning_rate': 1.590866934938074e-05, 'epoch': 0.43}
+{'loss': 165.5204, 'grad_norm': 2488.1281184062464, 'learning_rate': 1.590515176916698e-05, 'epoch': 0.43}
+{'loss': 164.1832, 'grad_norm': 1820.5688298960868, 'learning_rate': 1.590163306672285e-05, 'epoch': 0.43}
+{'loss': 164.113, 'grad_norm': 2288.756174923136, 'learning_rate': 1.589811324271704e-05, 'epoch': 0.43}
+{'loss': 168.6961, 'grad_norm': 2467.0913710666814, 'learning_rate': 1.589459229781847e-05, 'epoch': 0.43}
+{'loss': 167.3534, 'grad_norm': 2689.2493706085775, 'learning_rate': 1.5891070232696274e-05, 'epoch': 0.43}
+{'loss': 167.2848, 'grad_norm': 2537.5524914705356, 'learning_rate': 1.5887547048019794e-05, 'epoch': 0.43}
+{'loss': 168.1937, 'grad_norm': 2509.337039085596, 'learning_rate': 1.588402274445858e-05, 'epoch': 0.43}
+{'loss': 162.9298, 'grad_norm': 1996.9889819190496, 'learning_rate': 1.5880497322682405e-05, 'epoch': 0.43}
+{'loss': 165.7726, 'grad_norm': 1919.606727406959, 'learning_rate': 1.587697078336125e-05, 'epoch': 0.43}
+{'loss': 163.6781, 'grad_norm': 2129.4370789366576, 'learning_rate': 1.58734431271653e-05, 'epoch': 0.43}
+{'loss': 166.3615, 'grad_norm': 2149.7349253601797, 'learning_rate': 1.5869914354764964e-05, 'epoch': 0.43}
+{'loss': 165.1894, 'grad_norm': 2101.5469047482125, 'learning_rate': 1.5866384466830863e-05, 'epoch': 0.43}
+{'loss': 168.9131, 'grad_norm': 2058.906391146034, 'learning_rate': 1.5862853464033822e-05, 'epoch': 0.43}
+{'loss': 170.143, 'grad_norm': 2266.1769911826664, 'learning_rate': 1.5859321347044882e-05, 'epoch': 0.43}
+{'loss': 174.3159, 'grad_norm': 2241.2820713774804, 'learning_rate': 1.5855788116535296e-05, 'epoch': 0.43}
+{'loss': 165.8356, 'grad_norm': 2002.9004030297945, 'learning_rate': 1.5852253773176535e-05, 'epoch': 0.43}
+{'loss': 173.8653, 'grad_norm': 2067.4235101292325, 'learning_rate': 1.5848718317640264e-05, 'epoch': 0.43}
+{'loss': 170.4829, 'grad_norm': 2150.539789342145, 'learning_rate': 1.584518175059837e-05, 'epoch': 0.43}
+{'loss': 166.6563, 'grad_norm': 2018.970056470453, 'learning_rate': 1.5841644072722964e-05, 'epoch': 0.43}
+{'loss': 169.1333, 'grad_norm': 2200.873160051114, 'learning_rate': 1.5838105284686344e-05, 'epoch': 0.43}
+{'loss': 169.3119, 'grad_norm': 2261.617855043718, 'learning_rate': 1.5834565387161034e-05, 'epoch': 0.43}
+{'loss': 168.6195, 'grad_norm': 2268.7556996315047, 'learning_rate': 1.583102438081977e-05, 'epoch': 0.43}
+{'loss': 166.3194, 'grad_norm': 2327.205117222702, 'learning_rate': 1.582748226633548e-05, 'epoch': 0.43}
+{'loss': 173.5646, 'grad_norm': 2335.142808774291, 'learning_rate': 1.5823939044381334e-05, 'epoch': 0.43}
+{'loss': 166.4086, 'grad_norm': 3020.4784623838477, 'learning_rate': 1.5820394715630685e-05, 'epoch': 0.43}
+{'loss': 165.0181, 'grad_norm': 2204.627879263952, 'learning_rate': 1.581684928075711e-05, 'epoch': 0.43}
+{'loss': 173.8692, 'grad_norm': 2306.666654345983, 'learning_rate': 1.581330274043439e-05, 'epoch': 0.43}
+{'loss': 164.9414, 'grad_norm': 2121.041129345257, 'learning_rate': 1.580975509533652e-05, 'epoch': 0.43}
+{'loss': 170.3385, 'grad_norm': 2268.0698675712547, 'learning_rate': 1.58062063461377e-05, 'epoch': 0.43}
+{'loss': 167.9048, 'grad_norm': 2277.624891138103, 'learning_rate': 1.5802656493512352e-05, 'epoch': 0.43}
+{'loss': 175.8037, 'grad_norm': 2173.534278202169, 'learning_rate': 1.5799105538135095e-05, 'epoch': 0.43}
+{'loss': 165.5313, 'grad_norm': 2200.398539534646, 'learning_rate': 1.5795553480680762e-05, 'epoch': 0.43}
+{'loss': 167.9138, 'grad_norm': 2152.033175031207, 'learning_rate': 1.5792000321824392e-05, 'epoch': 0.43}
+{'loss': 174.1643, 'grad_norm': 2158.201374313882, 'learning_rate': 1.578844606224124e-05, 'epoch': 0.43}
+{'loss': 165.7943, 'grad_norm': 2046.9143054830213, 'learning_rate': 1.5784890702606763e-05, 'epoch': 0.43}
+{'loss': 168.7993, 'grad_norm': 1951.4366219006934, 'learning_rate': 1.5781334243596637e-05, 'epoch': 0.44}
+{'loss': 168.4422, 'grad_norm': 1753.534278075212, 'learning_rate': 1.5777776685886736e-05, 'epoch': 0.44}
+{'loss': 165.2484, 'grad_norm': 2008.558235488974, 'learning_rate': 1.5774218030153148e-05, 'epoch': 0.44}
+{'loss': 169.9383, 'grad_norm': 2004.387479268091, 'learning_rate': 1.577065827707217e-05, 'epoch': 0.44}
+{'loss': 164.5938, 'grad_norm': 2482.722528965405, 'learning_rate': 1.5767097427320307e-05, 'epoch': 0.44}
+{'loss': 168.2969, 'grad_norm': 2167.6724546020437, 'learning_rate': 1.576353548157427e-05, 'epoch': 0.44}
+{'loss': 164.0226, 'grad_norm': 1888.0511497663501, 'learning_rate': 1.5759972440510985e-05, 'epoch': 0.44}
+{'loss': 166.8777, 'grad_norm': 2024.245766963027, 'learning_rate': 1.575640830480758e-05, 'epoch': 0.44}
+{'loss': 166.288, 'grad_norm': 2305.0596651661313, 'learning_rate': 1.5752843075141387e-05, 'epoch': 0.44}
+{'loss': 169.7916, 'grad_norm': 1949.7235039538402, 'learning_rate': 1.574927675218996e-05, 'epoch': 0.44}
+{'loss': 169.9108, 'grad_norm': 2650.27210949482, 'learning_rate': 1.5745709336631048e-05, 'epoch': 0.44}
+{'loss': 168.3702, 'grad_norm': 2252.9298969105153, 'learning_rate': 1.5742140829142608e-05, 'epoch': 0.44}
+{'loss': 165.755, 'grad_norm': 2495.084413926129, 'learning_rate': 1.5738571230402814e-05, 'epoch': 0.44}
+{'loss': 167.9823, 'grad_norm': 2239.803481849846, 'learning_rate': 1.573500054109004e-05, 'epoch': 0.44}
+{'loss': 172.3639, 'grad_norm': 2516.868277888633, 'learning_rate': 1.5731428761882873e-05, 'epoch': 0.44}
+{'loss': 166.2017, 'grad_norm': 2317.2028130468957, 'learning_rate': 1.5727855893460096e-05, 'epoch': 0.44}
+{'loss': 166.4592, 'grad_norm': 1874.180938962604, 'learning_rate': 1.572428193650071e-05, 'epoch': 0.44}
+{'loss': 170.3986, 'grad_norm': 2303.1792140542575, 'learning_rate': 1.572070689168392e-05, 'epoch': 0.44}
+{'loss': 168.5608, 'grad_norm': 2519.4977139125886, 'learning_rate': 1.5717130759689137e-05, 'epoch': 0.44}
+{'loss': 166.6021, 'grad_norm': 2238.1758471961502, 'learning_rate': 1.5713553541195973e-05, 'epoch': 0.44}
+{'loss': 172.687, 'grad_norm': 1933.4067855950352, 'learning_rate': 1.570997523688426e-05, 'epoch': 0.44}
+{'loss': 164.2812, 'grad_norm': 2269.4076490084094, 'learning_rate': 1.5706395847434023e-05, 'epoch': 0.44}
+{'loss': 170.327, 'grad_norm': 2354.7360380082937, 'learning_rate': 1.5702815373525496e-05, 'epoch': 0.44}
+{'loss': 163.1358, 'grad_norm': 2002.7653819193513, 'learning_rate': 1.5699233815839133e-05, 'epoch': 0.44}
+{'loss': 168.9424, 'grad_norm': 2434.614842748491, 'learning_rate': 1.569565117505557e-05, 'epoch': 0.44}
+{'loss': 169.7292, 'grad_norm': 2295.0209403060553, 'learning_rate': 1.5692067451855667e-05, 'epoch': 0.44}
+{'loss': 172.6948, 'grad_norm': 2060.5054281137855, 'learning_rate': 1.568848264692048e-05, 'epoch': 0.44}
+{'loss': 168.7833, 'grad_norm': 2090.3926445914512, 'learning_rate': 1.568489676093128e-05, 'epoch': 0.44}
+{'loss': 167.0336, 'grad_norm': 1856.9868145004568, 'learning_rate': 1.5681309794569535e-05, 'epoch': 0.44}
+{'loss': 165.0819, 'grad_norm': 1851.8340000454962, 'learning_rate': 1.5677721748516925e-05, 'epoch': 0.44}
+{'loss': 165.6263, 'grad_norm': 2358.680615148725, 'learning_rate': 1.567413262345533e-05, 'epoch': 0.44}
+{'loss': 166.8392, 'grad_norm': 2156.5017164001556, 'learning_rate': 1.567054242006683e-05, 'epoch': 0.44}
+{'loss': 162.0119, 'grad_norm': 2036.29348455704, 'learning_rate': 1.5666951139033723e-05, 'epoch': 0.44}
+{'loss': 164.0932, 'grad_norm': 2151.154586238911, 'learning_rate': 1.5663358781038508e-05, 'epoch': 0.44}
+{'loss': 163.9966, 'grad_norm': 1884.019116086453, 'learning_rate': 1.565976534676388e-05, 'epoch': 0.44}
+{'loss': 168.5161, 'grad_norm': 1918.2877337512268, 'learning_rate': 1.565617083689275e-05, 'epoch': 0.44}
+{'loss': 170.9488, 'grad_norm': 2422.2914695874183, 'learning_rate': 1.565257525210822e-05, 'epoch': 0.44}
+{'loss': 166.6161, 'grad_norm': 2097.520821600227, 'learning_rate': 1.5648978593093613e-05, 'epoch': 0.44}
+{'loss': 164.7565, 'grad_norm': 1926.5672281267941, 'learning_rate': 1.5645380860532443e-05, 'epoch': 0.44}
+{'loss': 164.0654, 'grad_norm': 2504.169188517526, 'learning_rate': 1.5641782055108435e-05, 'epoch': 0.44}
+{'loss': 165.7647, 'grad_norm': 1864.9813286398617, 'learning_rate': 1.563818217750551e-05, 'epoch': 0.44}
+{'loss': 164.5087, 'grad_norm': 1904.8049924393526, 'learning_rate': 1.5634581228407807e-05, 'epoch': 0.44}
+{'loss': 169.3068, 'grad_norm': 2396.253575701011, 'learning_rate': 1.563097920849965e-05, 'epoch': 0.44}
+{'loss': 164.335, 'grad_norm': 2170.5842858136643, 'learning_rate': 1.5627376118465584e-05, 'epoch': 0.44}
+{'loss': 171.5131, 'grad_norm': 1924.511846302921, 'learning_rate': 1.5623771958990342e-05, 'epoch': 0.44}
+{'loss': 171.8633, 'grad_norm': 2413.8602599127585, 'learning_rate': 1.562016673075887e-05, 'epoch': 0.44}
+{'loss': 170.0479, 'grad_norm': 2167.4292882701106, 'learning_rate': 1.5616560434456324e-05, 'epoch': 0.44}
+{'loss': 164.724, 'grad_norm': 2073.688775177645, 'learning_rate': 1.5612953070768043e-05, 'epoch': 0.44}
+{'loss': 169.5081, 'grad_norm': 2199.4505355418373, 'learning_rate': 1.5609344640379585e-05, 'epoch': 0.44}
+{'loss': 169.1582, 'grad_norm': 2260.0141095905196, 'learning_rate': 1.56057351439767e-05, 'epoch': 0.44}
+{'loss': 164.802, 'grad_norm': 2300.021938950231, 'learning_rate': 1.5602124582245355e-05, 'epoch': 0.44}
+{'loss': 166.6413, 'grad_norm': 2436.5735292727804, 'learning_rate': 1.5598512955871706e-05, 'epoch': 0.44}
+{'loss': 172.4023, 'grad_norm': 2108.7133242646155, 'learning_rate': 1.559490026554211e-05, 'epoch': 0.45}
+{'loss': 167.4143, 'grad_norm': 2165.340044582447, 'learning_rate': 1.559128651194314e-05, 'epoch': 0.45}
+{'loss': 170.2773, 'grad_norm': 2271.393673223469, 'learning_rate': 1.5587671695761564e-05, 'epoch': 0.45}
+{'loss': 169.0378, 'grad_norm': 2187.0779087881106, 'learning_rate': 1.5584055817684346e-05, 'epoch': 0.45}
+{'loss': 170.2202, 'grad_norm': 2570.959564937327, 'learning_rate': 1.5580438878398657e-05, 'epoch': 0.45}
+{'loss': 163.7518, 'grad_norm': 1953.3712216770928, 'learning_rate': 1.557682087859187e-05, 'epoch': 0.45}
+{'loss': 164.7977, 'grad_norm': 2197.1785085127785, 'learning_rate': 1.5573201818951563e-05, 'epoch': 0.45}
+{'loss': 165.4936, 'grad_norm': 2071.094352426886, 'learning_rate': 1.556958170016551e-05, 'epoch': 0.45}
+{'loss': 176.9665, 'grad_norm': 2341.9508525631595, 'learning_rate': 1.5565960522921682e-05, 'epoch': 0.45}
+{'loss': 170.7298, 'grad_norm': 2151.2460041804507, 'learning_rate': 1.5562338287908266e-05, 'epoch': 0.45}
+{'loss': 169.2945, 'grad_norm': 2376.532787130493, 'learning_rate': 1.5558714995813636e-05, 'epoch': 0.45}
+{'loss': 164.2785, 'grad_norm': 2111.6522326909017, 'learning_rate': 1.555509064732637e-05, 'epoch': 0.45}
+{'loss': 165.2998, 'grad_norm': 1819.589819712101, 'learning_rate': 1.5551465243135254e-05, 'epoch': 0.45}
+{'loss': 167.9492, 'grad_norm': 2182.7334021893575, 'learning_rate': 1.554783878392926e-05, 'epoch': 0.45}
+{'loss': 166.5315, 'grad_norm': 2207.110559402928, 'learning_rate': 1.554421127039758e-05, 'epoch': 0.45}
+{'loss': 164.9224, 'grad_norm': 1955.9981293490782, 'learning_rate': 1.554058270322959e-05, 'epoch': 0.45}
+{'loss': 165.5366, 'grad_norm': 2197.6943834718877, 'learning_rate': 1.553695308311488e-05, 'epoch': 0.45}
+{'loss': 163.7282, 'grad_norm': 2023.6785473161844, 'learning_rate': 1.5533322410743223e-05, 'epoch': 0.45}
+{'loss': 166.3262, 'grad_norm': 2221.5860033920067, 'learning_rate': 1.55296906868046e-05, 'epoch': 0.45}
+{'loss': 161.0696, 'grad_norm': 2060.593304642912, 'learning_rate': 1.5526057911989206e-05, 'epoch': 0.45}
+{'loss': 163.5131, 'grad_norm': 2455.0569343974726, 'learning_rate': 1.552242408698741e-05, 'epoch': 0.45}
+{'loss': 163.4793, 'grad_norm': 2182.282373137838, 'learning_rate': 1.55187892124898e-05, 'epoch': 0.45}
+{'loss': 164.393, 'grad_norm': 2080.1942665235647, 'learning_rate': 1.5515153289187157e-05, 'epoch': 0.45}
+{'loss': 169.4423, 'grad_norm': 2161.2360187853023, 'learning_rate': 1.5511516317770457e-05, 'epoch': 0.45}
+{'loss': 171.8609, 'grad_norm': 2377.766681457525, 'learning_rate': 1.5507878298930888e-05, 'epoch': 0.45}
+{'loss': 170.9781, 'grad_norm': 3964.7319001974347, 'learning_rate': 1.550423923335982e-05, 'epoch': 0.45}
+{'loss': 169.1312, 'grad_norm': 2143.097396362546, 'learning_rate': 1.5500599121748832e-05, 'epoch': 0.45}
+{'loss': 170.609, 'grad_norm': 1886.3353674739558, 'learning_rate': 1.5496957964789704e-05, 'epoch': 0.45}
+{'loss': 164.3724, 'grad_norm': 2004.8576068219854, 'learning_rate': 1.549331576317441e-05, 'epoch': 0.45}
+{'loss': 169.7558, 'grad_norm': 2675.9622925931644, 'learning_rate': 1.5489672517595117e-05, 'epoch': 0.45}
+{'loss': 165.2677, 'grad_norm': 2034.8111874489819, 'learning_rate': 1.5486028228744206e-05, 'epoch': 0.45}
+{'loss': 165.5242, 'grad_norm': 2279.55271199066, 'learning_rate': 1.5482382897314243e-05, 'epoch': 0.45}
+{'loss': 171.7162, 'grad_norm': 2063.956940155306, 'learning_rate': 1.5478736523997996e-05, 'epoch': 0.45}
+{'loss': 164.0665, 'grad_norm': 2175.071070227959, 'learning_rate': 1.547508910948843e-05, 'epoch': 0.45}
+{'loss': 165.9679, 'grad_norm': 2453.0933151571758, 'learning_rate': 1.5471440654478714e-05, 'epoch': 0.45}
+{'loss': 161.9281, 'grad_norm': 1975.2099234566128, 'learning_rate': 1.5467791159662207e-05, 'epoch': 0.45}
+{'loss': 168.1503, 'grad_norm': 2109.070477405324, 'learning_rate': 1.5464140625732467e-05, 'epoch': 0.45}
+{'loss': 160.3894, 'grad_norm': 1991.6887296591694, 'learning_rate': 1.546048905338325e-05, 'epoch': 0.45}
+{'loss': 167.3967, 'grad_norm': 2216.777951578083, 'learning_rate': 1.5456836443308512e-05, 'epoch': 0.45}
+{'loss': 164.055, 'grad_norm': 2191.7042073455395, 'learning_rate': 1.545318279620241e-05, 'epoch': 0.45}
+{'loss': 164.8362, 'grad_norm': 2008.1607242147084, 'learning_rate': 1.544952811275928e-05, 'epoch': 0.45}
+{'loss': 166.2684, 'grad_norm': 2198.1348711314636, 'learning_rate': 1.5445872393673684e-05, 'epoch': 0.45}
+{'loss': 163.7014, 'grad_norm': 1892.9607571213573, 'learning_rate': 1.5442215639640352e-05, 'epoch': 0.45}
+{'loss': 161.7995, 'grad_norm': 2023.3554663808557, 'learning_rate': 1.5438557851354222e-05, 'epoch': 0.45}
+{'loss': 165.5357, 'grad_norm': 2094.8042000539804, 'learning_rate': 1.543489902951044e-05, 'epoch': 0.45}
+{'loss': 167.1753, 'grad_norm': 2900.361099086172, 'learning_rate': 1.543123917480433e-05, 'epoch': 0.45}
+{'loss': 167.8375, 'grad_norm': 2215.2252741749553, 'learning_rate': 1.5427578287931423e-05, 'epoch': 0.45}
+{'loss': 162.9985, 'grad_norm': 2255.392299119775, 'learning_rate': 1.5423916369587438e-05, 'epoch': 0.45}
+{'loss': 163.2608, 'grad_norm': 2014.145251803009, 'learning_rate': 1.5420253420468304e-05, 'epoch': 0.45}
+{'loss': 165.1683, 'grad_norm': 2004.0885807657228, 'learning_rate': 1.5416589441270136e-05, 'epoch': 0.45}
+{'loss': 166.949, 'grad_norm': 1995.4621884856551, 'learning_rate': 1.541292443268924e-05, 'epoch': 0.45}
+{'loss': 167.0387, 'grad_norm': 3724.827598017535, 'learning_rate': 1.540925839542213e-05, 'epoch': 0.45}
+{'loss': 165.6599, 'grad_norm': 2378.808997373595, 'learning_rate': 1.5405591330165503e-05, 'epoch': 0.46}
+{'loss': 166.5445, 'grad_norm': 2228.6346728828116, 'learning_rate': 1.5401923237616264e-05, 'epoch': 0.46}
+{'loss': 166.2888, 'grad_norm': 2473.7406445956944, 'learning_rate': 1.5398254118471502e-05, 'epoch': 0.46}
+{'loss': 167.3927, 'grad_norm': 2075.4921905637298, 'learning_rate': 1.5394583973428508e-05, 'epoch': 0.46}
+{'loss': 168.4228, 'grad_norm': 1782.7795472242433, 'learning_rate': 1.539091280318477e-05, 'epoch': 0.46}
+{'loss': 163.1567, 'grad_norm': 2263.4588218311933, 'learning_rate': 1.5387240608437962e-05, 'epoch': 0.46}
+{'loss': 164.8548, 'grad_norm': 1979.0729583018472, 'learning_rate': 1.5383567389885955e-05, 'epoch': 0.46}
+{'loss': 162.1476, 'grad_norm': 2006.1679461697397, 'learning_rate': 1.537989314822682e-05, 'epoch': 0.46}
+{'loss': 165.4449, 'grad_norm': 1952.9281672807779, 'learning_rate': 1.537621788415882e-05, 'epoch': 0.46}
+{'loss': 167.4024, 'grad_norm': 2161.813516824751, 'learning_rate': 1.5372541598380416e-05, 'epoch': 0.46}
+{'loss': 169.9689, 'grad_norm': 2000.0605150802144, 'learning_rate': 1.5368864291590248e-05, 'epoch': 0.46}
+{'loss': 164.619, 'grad_norm': 2173.938078955209, 'learning_rate': 1.536518596448717e-05, 'epoch': 0.46}
+{'loss': 168.2557, 'grad_norm': 2149.3398150379044, 'learning_rate': 1.5361506617770217e-05, 'epoch': 0.46}
+{'loss': 160.839, 'grad_norm': 2062.7646891655945, 'learning_rate': 1.5357826252138623e-05, 'epoch': 0.46}
+{'loss': 169.2905, 'grad_norm': 1817.5572630388688, 'learning_rate': 1.5354144868291817e-05, 'epoch': 0.46}
+{'loss': 165.1941, 'grad_norm': 2295.674120483997, 'learning_rate': 1.5350462466929414e-05, 'epoch': 0.46}
+{'loss': 167.6489, 'grad_norm': 2110.0821671521576, 'learning_rate': 1.5346779048751225e-05, 'epoch': 0.46}
+{'loss': 166.0128, 'grad_norm': 2971.3313250084934, 'learning_rate': 1.5343094614457262e-05, 'epoch': 0.46}
+{'loss': 162.9047, 'grad_norm': 1793.6212327817077, 'learning_rate': 1.533940916474772e-05, 'epoch': 0.46}
+{'loss': 160.9458, 'grad_norm': 3258.9393386767038, 'learning_rate': 1.5335722700323e-05, 'epoch': 0.46}
+{'loss': 164.9443, 'grad_norm': 2356.377167950531, 'learning_rate': 1.5332035221883673e-05, 'epoch': 0.46}
+{'loss': 161.6157, 'grad_norm': 2004.986950024668, 'learning_rate': 1.532834673013053e-05, 'epoch': 0.46}
+{'loss': 160.2313, 'grad_norm': 2004.0670886233581, 'learning_rate': 1.5324657225764538e-05, 'epoch': 0.46}
+{'loss': 157.6185, 'grad_norm': 2509.627508289782, 'learning_rate': 1.5320966709486857e-05, 'epoch': 0.46}
+{'loss': 163.0602, 'grad_norm': 2003.4525612094062, 'learning_rate': 1.5317275181998844e-05, 'epoch': 0.46}
+{'loss': 163.6542, 'grad_norm': 2294.2153193146833, 'learning_rate': 1.5313582644002046e-05, 'epoch': 0.46}
+{'loss': 157.0924, 'grad_norm': 1824.9037055711326, 'learning_rate': 1.5309889096198203e-05, 'epoch': 0.46}
+{'loss': 161.2604, 'grad_norm': 1943.2635344302926, 'learning_rate': 1.5306194539289248e-05, 'epoch': 0.46}
+{'loss': 162.3094, 'grad_norm': 1953.6910679804905, 'learning_rate': 1.53024989739773e-05, 'epoch': 0.46}
+{'loss': 160.2414, 'grad_norm': 2123.222798802063, 'learning_rate': 1.5298802400964678e-05, 'epoch': 0.46}
+{'loss': 163.226, 'grad_norm': 2000.84287770549, 'learning_rate': 1.5295104820953887e-05, 'epoch': 0.46}
+{'loss': 165.9873, 'grad_norm': 1924.9430478201857, 'learning_rate': 1.5291406234647622e-05, 'epoch': 0.46}
+{'loss': 165.6112, 'grad_norm': 2130.665763116054, 'learning_rate': 1.528770664274878e-05, 'epoch': 0.46}
+{'loss': 162.7805, 'grad_norm': 2110.137253217719, 'learning_rate': 1.5284006045960432e-05, 'epoch': 0.46}
+{'loss': 165.0431, 'grad_norm': 1920.5089547104462, 'learning_rate': 1.528030444498585e-05, 'epoch': 0.46}
+[2025-05-28 14:07:52,247] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step24000 is about to be saved!
+[2025-05-28 14:07:52,334] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-24000/global_step24000/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2025-05-28 14:07:52,334] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-24000/global_step24000/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2025-05-28 14:07:52,433] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-24000/global_step24000/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2025-05-28 14:07:52,545] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-24000/global_step24000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2025-05-28 14:13:46,147] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-24000/global_step24000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2025-05-28 14:13:46,317] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-24000/global_step24000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-05-28 14:13:59,366] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step24000 is ready now!
+[2025-05-28 14:13:59,623] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step24000 is about to be saved!
+[2025-05-28 14:13:59,647] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-24000/global_step24000/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2025-05-28 14:13:59,647] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-24000/global_step24000/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2025-05-28 14:13:59,761] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-24000/global_step24000/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2025-05-28 14:13:59,830] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-24000/global_step24000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2025-05-28 14:20:05,839] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-24000/global_step24000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2025-05-28 14:20:05,929] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-24000/global_step24000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-05-28 14:20:13,456] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step24000 is ready now!
+{'loss': 164.5425, 'grad_norm': 1992.0547149681163, 'learning_rate': 1.52766018405285e-05, 'epoch': 0.46}
+{'loss': 170.6188, 'grad_norm': 3038.496792898915, 'learning_rate': 1.5272898233292032e-05, 'epoch': 0.46}
+{'loss': 165.3409, 'grad_norm': 2357.8198540949584, 'learning_rate': 1.5269193623980295e-05, 'epoch': 0.46}
+{'loss': 164.5263, 'grad_norm': 2556.37099358029, 'learning_rate': 1.526548801329731e-05, 'epoch': 0.46}
+{'loss': 164.464, 'grad_norm': 1982.0027199430917, 'learning_rate': 1.5261781401947316e-05, 'epoch': 0.46}
+{'loss': 167.1423, 'grad_norm': 2110.8857350094795, 'learning_rate': 1.5258073790634715e-05, 'epoch': 0.46}
+{'loss': 161.6352, 'grad_norm': 1802.511865540906, 'learning_rate': 1.5254365180064117e-05, 'epoch': 0.46}
+{'loss': 166.3117, 'grad_norm': 2244.737498706787, 'learning_rate': 1.5250655570940317e-05, 'epoch': 0.46}
+{'loss': 164.3183, 'grad_norm': 1882.726644018674, 'learning_rate': 1.5246944963968294e-05, 'epoch': 0.46}
+{'loss': 160.1953, 'grad_norm': 1780.6352592066273, 'learning_rate': 1.5243233359853222e-05, 'epoch': 0.46}
+{'loss': 164.2329, 'grad_norm': 2367.4001873831085, 'learning_rate': 1.5239520759300468e-05, 'epoch': 0.46}
+{'loss': 166.9219, 'grad_norm': 2312.456201752607, 'learning_rate': 1.5235807163015577e-05, 'epoch': 0.46}
+{'loss': 160.7471, 'grad_norm': 1971.1538147204933, 'learning_rate': 1.5232092571704302e-05, 'epoch': 0.46}
+{'loss': 164.5744, 'grad_norm': 2581.611145784115, 'learning_rate': 1.5228376986072564e-05, 'epoch': 0.46}
+{'loss': 163.5627, 'grad_norm': 2209.9805831550775, 'learning_rate': 1.5224660406826486e-05, 'epoch': 0.46}
+{'loss': 165.6153, 'grad_norm': 2116.514740477412, 'learning_rate': 1.5220942834672377e-05, 'epoch': 0.46}
+{'loss': 161.0525, 'grad_norm': 2060.210953828836, 'learning_rate': 1.5217224270316728e-05, 'epoch': 0.46}
+{'loss': 161.5686, 'grad_norm': 1768.9450835752207, 'learning_rate': 1.5213504714466237e-05, 'epoch': 0.47}
+{'loss': 167.332, 'grad_norm': 1784.5146163349602, 'learning_rate': 1.5209784167827763e-05, 'epoch': 0.47}
+{'loss': 164.7675, 'grad_norm': 2018.6023817417818, 'learning_rate': 1.520606263110838e-05, 'epoch': 0.47}
+{'loss': 165.5881, 'grad_norm': 2158.3366225487325, 'learning_rate': 1.5202340105015334e-05, 'epoch': 0.47}
+{'loss': 166.36, 'grad_norm': 2077.282444224324, 'learning_rate': 1.5198616590256064e-05, 'epoch': 0.47}
+{'loss': 160.7789, 'grad_norm': 2113.8449449969303, 'learning_rate': 1.5194892087538198e-05, 'epoch': 0.47}
+{'loss': 161.7234, 'grad_norm': 1938.1200546892549, 'learning_rate': 1.5191166597569545e-05, 'epoch': 0.47}
+{'loss': 165.2965, 'grad_norm': 2248.76349772093, 'learning_rate': 1.5187440121058112e-05, 'epoch': 0.47}
+{'loss': 165.4811, 'grad_norm': 2315.2339663037087, 'learning_rate': 1.5183712658712088e-05, 'epoch': 0.47}
+{'loss': 172.6108, 'grad_norm': 2524.379518293494, 'learning_rate': 1.5179984211239846e-05, 'epoch': 0.47}
+{'loss': 166.2964, 'grad_norm': 1900.853246766495, 'learning_rate': 1.5176254779349954e-05, 'epoch': 0.47}
+{'loss': 158.5597, 'grad_norm': 1836.4019957758746, 'learning_rate': 1.5172524363751162e-05, 'epoch': 0.47}
+{'loss': 166.2812, 'grad_norm': 2353.4959483512475, 'learning_rate': 1.5168792965152404e-05, 'epoch': 0.47}
+{'loss': 164.445, 'grad_norm': 2107.063232050992, 'learning_rate': 1.5165060584262809e-05, 'epoch': 0.47}
+{'loss': 162.5646, 'grad_norm': 2155.4710278670295, 'learning_rate': 1.5161327221791687e-05, 'epoch': 0.47}
+{'loss': 160.4773, 'grad_norm': 2111.1879946420445, 'learning_rate': 1.5157592878448535e-05, 'epoch': 0.47}
+{'loss': 164.8661, 'grad_norm': 2113.785968368115, 'learning_rate': 1.5153857554943039e-05, 'epoch': 0.47}
+{'loss': 167.2516, 'grad_norm': 2229.574903762318, 'learning_rate': 1.5150121251985068e-05, 'epoch': 0.47}
+{'loss': 169.5609, 'grad_norm': 1939.620927563793, 'learning_rate': 1.5146383970284679e-05, 'epoch': 0.47}
+{'loss': 163.0987, 'grad_norm': 2199.2387207271713, 'learning_rate': 1.5142645710552115e-05, 'epoch': 0.47}
+{'loss': 162.5068, 'grad_norm': 1828.413846213474, 'learning_rate': 1.5138906473497806e-05, 'epoch': 0.47}
+{'loss': 160.7173, 'grad_norm': 1913.433792696749, 'learning_rate': 1.5135166259832365e-05, 'epoch': 0.47}
+{'loss': 163.8702, 'grad_norm': 2321.4629130408816, 'learning_rate': 1.5131425070266593e-05, 'epoch': 0.47}
+{'loss': 164.4729, 'grad_norm': 2435.959673304361, 'learning_rate': 1.5127682905511472e-05, 'epoch': 0.47}
+{'loss': 163.2561, 'grad_norm': 2281.2066902640163, 'learning_rate': 1.5123939766278176e-05, 'epoch': 0.47}
+{'loss': 161.7254, 'grad_norm': 1887.0902115407862, 'learning_rate': 1.512019565327806e-05, 'epoch': 0.47}
+{'loss': 163.547, 'grad_norm': 2357.488437974906, 'learning_rate': 1.5116450567222665e-05, 'epoch': 0.47}
+{'loss': 167.0951, 'grad_norm': 2130.013462814637, 'learning_rate': 1.5112704508823714e-05, 'epoch': 0.47}
+{'loss': 160.8118, 'grad_norm': 1979.6305237527974, 'learning_rate': 1.5108957478793125e-05, 'epoch': 0.47}
+{'loss': 170.4964, 'grad_norm': 2222.2553827064967, 'learning_rate': 1.5105209477842985e-05, 'epoch': 0.47}
+{'loss': 165.0434, 'grad_norm': 2320.490723582596, 'learning_rate': 1.5101460506685582e-05, 'epoch': 0.47}
+{'loss': 168.7846, 'grad_norm': 3830.7957406213623, 'learning_rate': 1.5097710566033374e-05, 'epoch': 0.47}
+{'loss': 163.9036, 'grad_norm': 1998.3333364389346, 'learning_rate': 1.5093959656599008e-05, 'epoch': 0.47}
+{'loss': 167.2122, 'grad_norm': 2074.1264190969923, 'learning_rate': 1.5090207779095322e-05, 'epoch': 0.47}
+{'loss': 165.0361, 'grad_norm': 2008.0401672933476, 'learning_rate': 1.5086454934235333e-05, 'epoch': 0.47}
+{'loss': 164.3218, 'grad_norm': 2140.7109947756007, 'learning_rate': 1.5082701122732233e-05, 'epoch': 0.47}
+{'loss': 168.5087, 'grad_norm': 2096.4497071433566, 'learning_rate': 1.5078946345299418e-05, 'epoch': 0.47}
+{'loss': 164.8511, 'grad_norm': 2182.271672743173, 'learning_rate': 1.5075190602650447e-05, 'epoch': 0.47}
+{'loss': 166.5479, 'grad_norm': 2074.4676912185, 'learning_rate': 1.5071433895499073e-05, 'epoch': 0.47}
+{'loss': 167.8476, 'grad_norm': 2040.533273432971, 'learning_rate': 1.5067676224559231e-05, 'epoch': 0.47}
+{'loss': 165.8348, 'grad_norm': 2650.311031240398, 'learning_rate': 1.5063917590545038e-05, 'epoch': 0.47}
+{'loss': 162.2387, 'grad_norm': 1770.4614366799035, 'learning_rate': 1.50601579941708e-05, 'epoch': 0.47}
+{'loss': 159.8911, 'grad_norm': 1795.0763056873009, 'learning_rate': 1.5056397436150989e-05, 'epoch': 0.47}
+{'loss': 162.9147, 'grad_norm': 2186.5569481360044, 'learning_rate': 1.505263591720028e-05, 'epoch': 0.47}
+{'loss': 157.3199, 'grad_norm': 1868.2249674029306, 'learning_rate': 1.504887343803352e-05, 'epoch': 0.47}
+{'loss': 163.7307, 'grad_norm': 1779.6449746198693, 'learning_rate': 1.5045109999365743e-05, 'epoch': 0.47}
+{'loss': 166.9122, 'grad_norm': 2285.2365224203795, 'learning_rate': 1.5041345601912154e-05, 'epoch': 0.47}
+{'loss': 170.6052, 'grad_norm': 2725.467071087677, 'learning_rate': 1.5037580246388159e-05, 'epoch': 0.47}
+{'loss': 160.7212, 'grad_norm': 2406.12630876544, 'learning_rate': 1.503381393350933e-05, 'epoch': 0.47}
+{'loss': 166.3314, 'grad_norm': 2105.9331205245994, 'learning_rate': 1.5030046663991424e-05, 'epoch': 0.47}
+{'loss': 163.2775, 'grad_norm': 2127.631161301676, 'learning_rate': 1.5026278438550392e-05, 'epoch': 0.47}
+{'loss': 166.0097, 'grad_norm': 2061.616402166364, 'learning_rate': 1.5022509257902354e-05, 'epoch': 0.47}
+{'loss': 158.8994, 'grad_norm': 1987.0566855462566, 'learning_rate': 1.5018739122763608e-05, 'epoch': 0.48}
+{'loss': 161.0792, 'grad_norm': 1950.8638662652027, 'learning_rate': 1.5014968033850647e-05, 'epoch': 0.48}
+{'loss': 160.9339, 'grad_norm': 2115.425601983021, 'learning_rate': 1.5011195991880139e-05, 'epoch': 0.48}
+{'loss': 160.9849, 'grad_norm': 2439.507097758897, 'learning_rate': 1.500742299756893e-05, 'epoch': 0.48}
+{'loss': 162.8613, 'grad_norm': 2049.6083085306095, 'learning_rate': 1.5003649051634048e-05, 'epoch': 0.48}
+{'loss': 161.1437, 'grad_norm': 2474.814377390778, 'learning_rate': 1.4999874154792707e-05, 'epoch': 0.48}
+{'loss': 164.4776, 'grad_norm': 2219.9609434836275, 'learning_rate': 1.4996098307762296e-05, 'epoch': 0.48}
+{'loss': 161.5815, 'grad_norm': 2051.6376995890855, 'learning_rate': 1.4992321511260391e-05, 'epoch': 0.48}
+{'loss': 166.9031, 'grad_norm': 1875.3244878784271, 'learning_rate': 1.4988543766004737e-05, 'epoch': 0.48}
+{'loss': 166.3375, 'grad_norm': 3339.4807633028254, 'learning_rate': 1.4984765072713274e-05, 'epoch': 0.48}
+{'loss': 178.1476, 'grad_norm': 2422.0663628745715, 'learning_rate': 1.4980985432104107e-05, 'epoch': 0.48}
+{'loss': 160.7777, 'grad_norm': 1838.2808753640197, 'learning_rate': 1.4977204844895538e-05, 'epoch': 0.48}
+{'loss': 160.9507, 'grad_norm': 1903.3825957973145, 'learning_rate': 1.4973423311806032e-05, 'epoch': 0.48}
+{'loss': 164.7885, 'grad_norm': 2597.816369380155, 'learning_rate': 1.4969640833554242e-05, 'epoch': 0.48}
+{'loss': 163.8932, 'grad_norm': 3322.5072300682396, 'learning_rate': 1.4965857410859004e-05, 'epoch': 0.48}
+{'loss': 164.9154, 'grad_norm': 2109.513042853797, 'learning_rate': 1.4962073044439328e-05, 'epoch': 0.48}
+{'loss': 165.5989, 'grad_norm': 1994.108131174417, 'learning_rate': 1.4958287735014403e-05, 'epoch': 0.48}
+{'loss': 165.6615, 'grad_norm': 2194.003594475776, 'learning_rate': 1.4954501483303601e-05, 'epoch': 0.48}
+{'loss': 158.1679, 'grad_norm': 1981.9310344358184, 'learning_rate': 1.4950714290026467e-05, 'epoch': 0.48}
+{'loss': 168.3393, 'grad_norm': 2045.4426715670586, 'learning_rate': 1.4946926155902735e-05, 'epoch': 0.48}
+{'loss': 159.0788, 'grad_norm': 2063.3619134951336, 'learning_rate': 1.4943137081652308e-05, 'epoch': 0.48}
+{'loss': 162.56, 'grad_norm': 2415.9372426553005, 'learning_rate': 1.4939347067995276e-05, 'epoch': 0.48}
+{'loss': 160.3631, 'grad_norm': 2091.6889062453233, 'learning_rate': 1.4935556115651898e-05, 'epoch': 0.48}
+{'loss': 162.4684, 'grad_norm': 2933.2274375309735, 'learning_rate': 1.4931764225342616e-05, 'epoch': 0.48}
+{'loss': 160.3035, 'grad_norm': 2237.3892290048752, 'learning_rate': 1.4927971397788055e-05, 'epoch': 0.48}
+{'loss': 161.8379, 'grad_norm': 2420.6727760889407, 'learning_rate': 1.492417763370901e-05, 'epoch': 0.48}
+{'loss': 157.555, 'grad_norm': 1792.5448607770143, 'learning_rate': 1.4920382933826461e-05, 'epoch': 0.48}
+{'loss': 162.6885, 'grad_norm': 2152.640413992233, 'learning_rate': 1.491658729886156e-05, 'epoch': 0.48}
+{'loss': 159.7306, 'grad_norm': 1965.8329149082704, 'learning_rate': 1.491279072953564e-05, 'epoch': 0.48}
+{'loss': 156.6199, 'grad_norm': 2070.9579026589404, 'learning_rate': 1.4908993226570214e-05, 'epoch': 0.48}
+{'loss': 163.413, 'grad_norm': 1839.9975449854353, 'learning_rate': 1.4905194790686963e-05, 'epoch': 0.48}
+{'loss': 161.503, 'grad_norm': 1847.892709605653, 'learning_rate': 1.4901395422607759e-05, 'epoch': 0.48}
+{'loss': 159.7173, 'grad_norm': 2034.4501375517384, 'learning_rate': 1.4897595123054635e-05, 'epoch': 0.48}
+{'loss': 164.1825, 'grad_norm': 2010.799619653443, 'learning_rate': 1.4893793892749816e-05, 'epoch': 0.48}
+{'loss': 166.5922, 'grad_norm': 2055.126822295832, 'learning_rate': 1.4889991732415703e-05, 'epoch': 0.48}
+{'loss': 165.2691, 'grad_norm': 1900.8503890756822, 'learning_rate': 1.4886188642774852e-05, 'epoch': 0.48}
+{'loss': 164.8758, 'grad_norm': 2025.9411984176343, 'learning_rate': 1.4882384624550028e-05, 'epoch': 0.48}
+{'loss': 163.3146, 'grad_norm': 2099.8979480450425, 'learning_rate': 1.487857967846415e-05, 'epoch': 0.48}
+{'loss': 166.6895, 'grad_norm': 2226.559225350183, 'learning_rate': 1.487477380524032e-05, 'epoch': 0.48}
+{'loss': 163.3267, 'grad_norm': 2286.7542548119204, 'learning_rate': 1.4870967005601817e-05, 'epoch': 0.48}
+{'loss': 165.6622, 'grad_norm': 2023.7131459136624, 'learning_rate': 1.486715928027209e-05, 'epoch': 0.48}
+{'loss': 159.0382, 'grad_norm': 2374.840480534816, 'learning_rate': 1.4863350629974778e-05, 'epoch': 0.48}
+{'loss': 162.3879, 'grad_norm': 2197.2605032602382, 'learning_rate': 1.485954105543368e-05, 'epoch': 0.48}
+{'loss': 159.5893, 'grad_norm': 1969.4372639999265, 'learning_rate': 1.485573055737278e-05, 'epoch': 0.48}
+{'loss': 160.7498, 'grad_norm': 2120.9177648881155, 'learning_rate': 1.4851919136516235e-05, 'epoch': 0.48}
+{'loss': 167.4849, 'grad_norm': 1911.8363581079327, 'learning_rate': 1.4848106793588374e-05, 'epoch': 0.48}
+{'loss': 155.1723, 'grad_norm': 2199.1931563976655, 'learning_rate': 1.4844293529313713e-05, 'epoch': 0.48}
+{'loss': 165.4536, 'grad_norm': 2025.8492985005068, 'learning_rate': 1.4840479344416927e-05, 'epoch': 0.48}
+{'loss': 158.3601, 'grad_norm': 1803.8126854034006, 'learning_rate': 1.4836664239622871e-05, 'epoch': 0.48}
+{'loss': 161.2133, 'grad_norm': 2093.0078329317344, 'learning_rate': 1.483284821565659e-05, 'epoch': 0.48}
+{'loss': 158.2123, 'grad_norm': 1989.2090559667715, 'learning_rate': 1.4829031273243277e-05, 'epoch': 0.48}
+{'loss': 161.1262, 'grad_norm': 1990.9930756614726, 'learning_rate': 1.4825213413108321e-05, 'epoch': 0.48}
+{'loss': 156.2471, 'grad_norm': 2286.4134501111057, 'learning_rate': 1.482139463597728e-05, 'epoch': 0.49}
+{'loss': 160.4839, 'grad_norm': 2253.147182953046, 'learning_rate': 1.4817574942575876e-05, 'epoch': 0.49}
+{'loss': 164.3954, 'grad_norm': 2336.6967273686864, 'learning_rate': 1.4813754333630025e-05, 'epoch': 0.49}
+{'loss': 165.6601, 'grad_norm': 1835.1333313103908, 'learning_rate': 1.4809932809865795e-05, 'epoch': 0.49}
+{'loss': 159.6555, 'grad_norm': 1788.9913013450587, 'learning_rate': 1.4806110372009441e-05, 'epoch': 0.49}
+{'loss': 163.4408, 'grad_norm': 1972.4289804958466, 'learning_rate': 1.4802287020787396e-05, 'epoch': 0.49}
+{'loss': 164.3539, 'grad_norm': 2092.5283617410932, 'learning_rate': 1.4798462756926246e-05, 'epoch': 0.49}
+{'loss': 162.4577, 'grad_norm': 2299.777899497942, 'learning_rate': 1.479463758115278e-05, 'epoch': 0.49}
+{'loss': 158.8116, 'grad_norm': 2061.2079443136695, 'learning_rate': 1.4790811494193934e-05, 'epoch': 0.49}
+{'loss': 161.3942, 'grad_norm': 2269.312855168046, 'learning_rate': 1.4786984496776828e-05, 'epoch': 0.49}
+{'loss': 164.6935, 'grad_norm': 2184.6236049480817, 'learning_rate': 1.4783156589628758e-05, 'epoch': 0.49}
+{'loss': 159.1444, 'grad_norm': 1874.4425454956804, 'learning_rate': 1.4779327773477185e-05, 'epoch': 0.49}
+{'loss': 158.0263, 'grad_norm': 1989.348176623804, 'learning_rate': 1.4775498049049754e-05, 'epoch': 0.49}
+{'loss': 160.9018, 'grad_norm': 2090.5821897081496, 'learning_rate': 1.4771667417074269e-05, 'epoch': 0.49}
+{'loss': 157.8852, 'grad_norm': 1971.5441308063307, 'learning_rate': 1.4767835878278717e-05, 'epoch': 0.49}
+{'loss': 160.8877, 'grad_norm': 2056.9599149055575, 'learning_rate': 1.4764003433391256e-05, 'epoch': 0.49}
+{'loss': 163.8799, 'grad_norm': 2246.411310010003, 'learning_rate': 1.4760170083140207e-05, 'epoch': 0.49}
+{'loss': 161.1115, 'grad_norm': 1629.6649531736753, 'learning_rate': 1.4756335828254071e-05, 'epoch': 0.49}
+{'loss': 160.2814, 'grad_norm': 1842.5536363558508, 'learning_rate': 1.4752500669461524e-05, 'epoch': 0.49}
+{'loss': 159.2495, 'grad_norm': 2120.8706195803984, 'learning_rate': 1.4748664607491408e-05, 'epoch': 0.49}
+{'loss': 163.506, 'grad_norm': 2322.888834403863, 'learning_rate': 1.4744827643072742e-05, 'epoch': 0.49}
+{'loss': 158.2911, 'grad_norm': 1744.7705954466896, 'learning_rate': 1.4740989776934703e-05, 'epoch': 0.49}
+{'loss': 164.3418, 'grad_norm': 2206.9118698383686, 'learning_rate': 1.4737151009806654e-05, 'epoch': 0.49}
+{'loss': 165.2755, 'grad_norm': 2054.741547878139, 'learning_rate': 1.473331134241813e-05, 'epoch': 0.49}
+{'loss': 161.491, 'grad_norm': 2121.1676825486047, 'learning_rate': 1.4729470775498817e-05, 'epoch': 0.49}
+{'loss': 159.0398, 'grad_norm': 4267.035059099988, 'learning_rate': 1.4725629309778603e-05, 'epoch': 0.49}
+{'loss': 160.5125, 'grad_norm': 2056.39465076962, 'learning_rate': 1.4721786945987519e-05, 'epoch': 0.49}
+{'loss': 164.1094, 'grad_norm': 2464.4287055293617, 'learning_rate': 1.4717943684855782e-05, 'epoch': 0.49}
+{'loss': 157.4138, 'grad_norm': 1854.8259210103802, 'learning_rate': 1.471409952711378e-05, 'epoch': 0.49}
+{'loss': 163.2665, 'grad_norm': 2250.460563923116, 'learning_rate': 1.4710254473492054e-05, 'epoch': 0.49}
+{'loss': 157.7154, 'grad_norm': 2152.144263458062, 'learning_rate': 1.4706408524721338e-05, 'epoch': 0.49}
+{'loss': 155.6155, 'grad_norm': 2152.6854503940626, 'learning_rate': 1.4702561681532524e-05, 'epoch': 0.49}
+{'loss': 161.6712, 'grad_norm': 2643.7510366041424, 'learning_rate': 1.4698713944656673e-05, 'epoch': 0.49}
+{'loss': 166.5259, 'grad_norm': 2193.5633475992804, 'learning_rate': 1.4694865314825024e-05, 'epoch': 0.49}
+{'loss': 158.2025, 'grad_norm': 1852.0060838775964, 'learning_rate': 1.4691015792768979e-05, 'epoch': 0.49}
+{'loss': 156.414, 'grad_norm': 1978.5829612421358, 'learning_rate': 1.4687165379220108e-05, 'epoch': 0.49}
+{'loss': 168.9101, 'grad_norm': 2263.2934530181315, 'learning_rate': 1.4683314074910156e-05, 'epoch': 0.49}
+{'loss': 161.0688, 'grad_norm': 1832.2767791358701, 'learning_rate': 1.4679461880571033e-05, 'epoch': 0.49}
+{'loss': 161.5796, 'grad_norm': 1652.469399404148, 'learning_rate': 1.4675608796934825e-05, 'epoch': 0.49}
+{'loss': 158.4534, 'grad_norm': 1928.0483179781447, 'learning_rate': 1.4671754824733776e-05, 'epoch': 0.49}
+{'loss': 164.1343, 'grad_norm': 2023.6358478131397, 'learning_rate': 1.4667899964700309e-05, 'epoch': 0.49}
+{'loss': 160.6894, 'grad_norm': 1785.991552759312, 'learning_rate': 1.466404421756701e-05, 'epoch': 0.49}
+{'loss': 159.8115, 'grad_norm': 2126.997426720493, 'learning_rate': 1.4660187584066636e-05, 'epoch': 0.49}
+{'loss': 157.7769, 'grad_norm': 1827.3399546001124, 'learning_rate': 1.4656330064932112e-05, 'epoch': 0.49}
+{'loss': 159.4703, 'grad_norm': 1782.1699716347575, 'learning_rate': 1.4652471660896528e-05, 'epoch': 0.49}
+{'loss': 159.5507, 'grad_norm': 1921.9954719835935, 'learning_rate': 1.4648612372693147e-05, 'epoch': 0.49}
+{'loss': 165.4018, 'grad_norm': 1925.8347936380628, 'learning_rate': 1.4644752201055399e-05, 'epoch': 0.49}
+{'loss': 162.3951, 'grad_norm': 2076.3455316279865, 'learning_rate': 1.464089114671688e-05, 'epoch': 0.49}
+{'loss': 155.5354, 'grad_norm': 1750.33620222611, 'learning_rate': 1.4637029210411356e-05, 'epoch': 0.49}
+{'loss': 162.128, 'grad_norm': 1769.4241664479403, 'learning_rate': 1.4633166392872756e-05, 'epoch': 0.49}
+{'loss': 160.0706, 'grad_norm': 2080.706669041736, 'learning_rate': 1.4629302694835183e-05, 'epoch': 0.49}
+{'loss': 159.073, 'grad_norm': 1905.8388397267531, 'learning_rate': 1.4625438117032907e-05, 'epoch': 0.49}
+{'loss': 162.4987, 'grad_norm': 2160.949711116484, 'learning_rate': 1.4621572660200357e-05, 'epoch': 0.5}
+{'loss': 162.9751, 'grad_norm': 1848.3544178299207, 'learning_rate': 1.4617706325072135e-05, 'epoch': 0.5}
+{'loss': 156.8845, 'grad_norm': 1792.125844246818, 'learning_rate': 1.461383911238301e-05, 'epoch': 0.5}
+{'loss': 159.1929, 'grad_norm': 2008.816398609616, 'learning_rate': 1.4609971022867919e-05, 'epoch': 0.5}
+{'loss': 165.5804, 'grad_norm': 2233.643831559731, 'learning_rate': 1.4606102057261964e-05, 'epoch': 0.5}
+{'loss': 158.7805, 'grad_norm': 1861.1112741763532, 'learning_rate': 1.4602232216300413e-05, 'epoch': 0.5}
+{'loss': 160.1928, 'grad_norm': 1924.0121499472077, 'learning_rate': 1.4598361500718698e-05, 'epoch': 0.5}
+{'loss': 158.7009, 'grad_norm': 1846.3558414842475, 'learning_rate': 1.4594489911252421e-05, 'epoch': 0.5}
+{'loss': 162.213, 'grad_norm': 1992.9844482132362, 'learning_rate': 1.459061744863735e-05, 'epoch': 0.5}
+{'loss': 160.5973, 'grad_norm': 2039.4560457519146, 'learning_rate': 1.4586744113609416e-05, 'epoch': 0.5}
+{'loss': 159.0508, 'grad_norm': 1995.6007683250532, 'learning_rate': 1.4582869906904721e-05, 'epoch': 0.5}
+{'loss': 159.7291, 'grad_norm': 2076.8084277861835, 'learning_rate': 1.4578994829259527e-05, 'epoch': 0.5}
+{'loss': 158.9208, 'grad_norm': 1843.4778433302358, 'learning_rate': 1.4575118881410264e-05, 'epoch': 0.5}
+{'loss': 157.4292, 'grad_norm': 1722.284719590851, 'learning_rate': 1.4571242064093526e-05, 'epoch': 0.5}
+{'loss': 161.2186, 'grad_norm': 2342.2152985969824, 'learning_rate': 1.4567364378046073e-05, 'epoch': 0.5}
+{'loss': 159.7502, 'grad_norm': 1763.4915489155449, 'learning_rate': 1.4563485824004836e-05, 'epoch': 0.5}
+{'loss': 162.516, 'grad_norm': 1850.199995375087, 'learning_rate': 1.45596064027069e-05, 'epoch': 0.5}
+{'loss': 159.7162, 'grad_norm': 2099.8574214657465, 'learning_rate': 1.4555726114889521e-05, 'epoch': 0.5}
+{'loss': 160.0909, 'grad_norm': 2296.970348167029, 'learning_rate': 1.4551844961290123e-05, 'epoch': 0.5}
+{'loss': 163.2542, 'grad_norm': 2255.7045838942295, 'learning_rate': 1.4547962942646284e-05, 'epoch': 0.5}
+{'loss': 160.0629, 'grad_norm': 1952.9416943126769, 'learning_rate': 1.4544080059695757e-05, 'epoch': 0.5}
+{'loss': 163.069, 'grad_norm': 1813.8857958336828, 'learning_rate': 1.4540196313176455e-05, 'epoch': 0.5}
+{'loss': 161.8442, 'grad_norm': 1837.1236246074202, 'learning_rate': 1.4536311703826454e-05, 'epoch': 0.5}
+{'loss': 159.1223, 'grad_norm': 1686.2743428778663, 'learning_rate': 1.4532426232383998e-05, 'epoch': 0.5}
+{'loss': 156.7933, 'grad_norm': 2109.822902678445, 'learning_rate': 1.4528539899587486e-05, 'epoch': 0.5}
+{'loss': 164.7224, 'grad_norm': 1856.396477609461, 'learning_rate': 1.4524652706175495e-05, 'epoch': 0.5}
+{'loss': 159.8361, 'grad_norm': 1854.9407404628741, 'learning_rate': 1.4520764652886751e-05, 'epoch': 0.5}
+[2025-05-28 16:08:50,086] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step26000 is about to be saved!
+[2025-05-28 16:08:50,173] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-26000/global_step26000/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2025-05-28 16:08:50,173] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-26000/global_step26000/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2025-05-28 16:08:50,260] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-26000/global_step26000/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2025-05-28 16:08:50,359] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-26000/global_step26000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2025-05-28 16:14:53,923] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-26000/global_step26000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2025-05-28 16:14:54,150] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-26000/global_step26000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-05-28 16:14:54,633] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step26000 is ready now!
+[2025-05-28 16:14:54,688] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step26000 is about to be saved!
+[2025-05-28 16:14:54,711] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-26000/global_step26000/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2025-05-28 16:14:54,711] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-26000/global_step26000/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2025-05-28 16:14:54,834] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-26000/global_step26000/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2025-05-28 16:14:55,041] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-26000/global_step26000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2025-05-28 16:21:07,435] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-26000/global_step26000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2025-05-28 16:21:07,525] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-26000/global_step26000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-05-28 16:21:10,774] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step26000 is ready now!
+{'loss': 159.1416, 'grad_norm': 2080.7619713959853, 'learning_rate': 1.4516875740460154e-05, 'epoch': 0.5}
+{'loss': 159.2797, 'grad_norm': 2149.400621792423, 'learning_rate': 1.4512985969634761e-05, 'epoch': 0.5}
+{'loss': 167.2208, 'grad_norm': 1938.576527096125, 'learning_rate': 1.450909534114979e-05, 'epoch': 0.5}
+{'loss': 157.1129, 'grad_norm': 1762.4540182611986, 'learning_rate': 1.4505203855744637e-05, 'epoch': 0.5}
+{'loss': 166.4354, 'grad_norm': 2263.6265901980196, 'learning_rate': 1.4501311514158838e-05, 'epoch': 0.5}
+{'loss': 158.4447, 'grad_norm': 2189.243946754445, 'learning_rate': 1.4497418317132109e-05, 'epoch': 0.5}
+{'loss': 159.4529, 'grad_norm': 1828.7137262430456, 'learning_rate': 1.4493524265404327e-05, 'epoch': 0.5}
+{'loss': 157.8547, 'grad_norm': 2265.5992987129484, 'learning_rate': 1.448962935971552e-05, 'epoch': 0.5}
+{'loss': 159.28, 'grad_norm': 2005.0447571213265, 'learning_rate': 1.4485733600805892e-05, 'epoch': 0.5}
+{'loss': 157.7217, 'grad_norm': 2009.4301619088158, 'learning_rate': 1.44818369894158e-05, 'epoch': 0.5}
+{'loss': 159.3574, 'grad_norm': 1982.2269446399669, 'learning_rate': 1.4477939526285767e-05, 'epoch': 0.5}
+{'loss': 158.9974, 'grad_norm': 2876.9218401731314, 'learning_rate': 1.447404121215648e-05, 'epoch': 0.5}
+{'loss': 160.0804, 'grad_norm': 2082.8233889278604, 'learning_rate': 1.4470142047768778e-05, 'epoch': 0.5}
+{'loss': 159.7184, 'grad_norm': 1827.326221657844, 'learning_rate': 1.4466242033863673e-05, 'epoch': 0.5}
+{'loss': 155.8924, 'grad_norm': 2109.1978251872415, 'learning_rate': 1.4462341171182337e-05, 'epoch': 0.5}
+{'loss': 155.7179, 'grad_norm': 1992.47024248858, 'learning_rate': 1.445843946046609e-05, 'epoch': 0.5}
+{'loss': 161.3989, 'grad_norm': 2257.358658324817, 'learning_rate': 1.4454536902456436e-05, 'epoch': 0.5}
+{'loss': 152.1852, 'grad_norm': 1770.2100782201846, 'learning_rate': 1.4450633497895017e-05, 'epoch': 0.5}
+{'loss': 155.0584, 'grad_norm': 1919.0655012637037, 'learning_rate': 1.444672924752365e-05, 'epoch': 0.5}
+{'loss': 156.3712, 'grad_norm': 3073.358267372732, 'learning_rate': 1.4442824152084309e-05, 'epoch': 0.5}
+{'loss': 159.6416, 'grad_norm': 2028.2485322976795, 'learning_rate': 1.4438918212319126e-05, 'epoch': 0.5}
+{'loss': 153.327, 'grad_norm': 1867.1989896763955, 'learning_rate': 1.44350114289704e-05, 'epoch': 0.5}
+{'loss': 159.8762, 'grad_norm': 1749.5135377715355, 'learning_rate': 1.4431103802780588e-05, 'epoch': 0.5}
+{'loss': 157.9319, 'grad_norm': 1996.5088656925707, 'learning_rate': 1.4427195334492294e-05, 'epoch': 0.5}
+{'loss': 160.9472, 'grad_norm': 2814.5816248164747, 'learning_rate': 1.442328602484831e-05, 'epoch': 0.5}
+{'loss': 159.5302, 'grad_norm': 1851.591909017696, 'learning_rate': 1.4419375874591555e-05, 'epoch': 0.51}
+{'loss': 160.3706, 'grad_norm': 1780.2049880291738, 'learning_rate': 1.4415464884465133e-05, 'epoch': 0.51}
+{'loss': 156.9449, 'grad_norm': 2451.9939114165177, 'learning_rate': 1.4411553055212302e-05, 'epoch': 0.51}
+{'loss': 158.2039, 'grad_norm': 1820.089338701773, 'learning_rate': 1.4407640387576468e-05, 'epoch': 0.51}
+{'loss': 178.7239, 'grad_norm': 2304.4975449454, 'learning_rate': 1.4403726882301207e-05, 'epoch': 0.51}
+{'loss': 156.4732, 'grad_norm': 2242.019604958423, 'learning_rate': 1.4399812540130256e-05, 'epoch': 0.51}
+{'loss': 159.9616, 'grad_norm': 1961.467260464841, 'learning_rate': 1.43958973618075e-05, 'epoch': 0.51}
+{'loss': 160.1694, 'grad_norm': 2234.1885759607662, 'learning_rate': 1.4391981348076997e-05, 'epoch': 0.51}
+{'loss': 158.5768, 'grad_norm': 2076.43382996268, 'learning_rate': 1.4388064499682947e-05, 'epoch': 0.51}
+{'loss': 161.9895, 'grad_norm': 2445.373398258324, 'learning_rate': 1.438414681736973e-05, 'epoch': 0.51}
+{'loss': 160.1673, 'grad_norm': 1792.4173787586985, 'learning_rate': 1.4380228301881865e-05, 'epoch': 0.51}
+{'loss': 157.9969, 'grad_norm': 2125.453625923411, 'learning_rate': 1.4376308953964035e-05, 'epoch': 0.51}
+{'loss': 159.7751, 'grad_norm': 2017.732437948363, 'learning_rate': 1.437238877436109e-05, 'epoch': 0.51}
+{'loss': 157.784, 'grad_norm': 1914.8725957693043, 'learning_rate': 1.4368467763818026e-05, 'epoch': 0.51}
+{'loss': 160.0336, 'grad_norm': 2000.2812270497775, 'learning_rate': 1.436454592308e-05, 'epoch': 0.51}
+{'loss': 157.2784, 'grad_norm': 1842.5382050584415, 'learning_rate': 1.4360623252892336e-05, 'epoch': 0.51}
+{'loss': 157.0532, 'grad_norm': 2025.1209741549728, 'learning_rate': 1.43566997540005e-05, 'epoch': 0.51}
+{'loss': 157.8135, 'grad_norm': 3444.6239730486054, 'learning_rate': 1.4352775427150134e-05, 'epoch': 0.51}
+{'loss': 160.2476, 'grad_norm': 2644.8428116210825, 'learning_rate': 1.4348850273087021e-05, 'epoch': 0.51}
+{'loss': 156.5574, 'grad_norm': 2249.682459730705, 'learning_rate': 1.4344924292557104e-05, 'epoch': 0.51}
+{'loss': 156.2809, 'grad_norm': 1677.9599820411865, 'learning_rate': 1.4340997486306491e-05, 'epoch': 0.51}
+{'loss': 161.791, 'grad_norm': 2189.47711028166, 'learning_rate': 1.4337069855081445e-05, 'epoch': 0.51}
+{'loss': 156.5839, 'grad_norm': 1974.8116104924104, 'learning_rate': 1.4333141399628382e-05, 'epoch': 0.51}
+{'loss': 160.3868, 'grad_norm': 2122.133633346067, 'learning_rate': 1.432921212069387e-05, 'epoch': 0.51}
+{'loss': 153.7721, 'grad_norm': 2012.3737604685714, 'learning_rate': 1.4325282019024645e-05, 'epoch': 0.51}
+{'loss': 154.5888, 'grad_norm': 2021.4352453107701, 'learning_rate': 1.4321351095367593e-05, 'epoch': 0.51}
+{'loss': 160.0612, 'grad_norm': 2222.875832482164, 'learning_rate': 1.4317419350469755e-05, 'epoch': 0.51}
+{'loss': 162.8701, 'grad_norm': 2263.6407300896144, 'learning_rate': 1.4313486785078335e-05, 'epoch': 0.51}
+{'loss': 154.4089, 'grad_norm': 1761.0508991648073, 'learning_rate': 1.430955339994068e-05, 'epoch': 0.51}
+{'loss': 155.5226, 'grad_norm': 1984.0797317880103, 'learning_rate': 1.4305619195804306e-05, 'epoch': 0.51}
+{'loss': 159.0471, 'grad_norm': 2094.8961685307086, 'learning_rate': 1.4301684173416877e-05, 'epoch': 0.51}
+{'loss': 159.1014, 'grad_norm': 2136.80551950759, 'learning_rate': 1.429774833352622e-05, 'epoch': 0.51}
+{'loss': 158.5359, 'grad_norm': 2305.987326952596, 'learning_rate': 1.4293811676880307e-05, 'epoch': 0.51}
+{'loss': 160.3783, 'grad_norm': 1761.320721099218, 'learning_rate': 1.428987420422727e-05, 'epoch': 0.51}
+{'loss': 159.8766, 'grad_norm': 2254.7186699965737, 'learning_rate': 1.4285935916315401e-05, 'epoch': 0.51}
+{'loss': 156.5602, 'grad_norm': 2054.8294527819776, 'learning_rate': 1.4281996813893138e-05, 'epoch': 0.51}
+{'loss': 159.6888, 'grad_norm': 1969.307561286822, 'learning_rate': 1.4278056897709081e-05, 'epoch': 0.51}
+{'loss': 156.9893, 'grad_norm': 1725.465480116109, 'learning_rate': 1.427411616851198e-05, 'epoch': 0.51}
+{'loss': 152.983, 'grad_norm': 1716.0274347880238, 'learning_rate': 1.4270174627050744e-05, 'epoch': 0.51}
+{'loss': 161.7962, 'grad_norm': 3172.092692897106, 'learning_rate': 1.426623227407443e-05, 'epoch': 0.51}
+{'loss': 158.6033, 'grad_norm': 1881.9834972134527, 'learning_rate': 1.4262289110332257e-05, 'epoch': 0.51}
+{'loss': 157.9655, 'grad_norm': 1856.0180593894245, 'learning_rate': 1.4258345136573595e-05, 'epoch': 0.51}
+{'loss': 156.6702, 'grad_norm': 1747.8093347596712, 'learning_rate': 1.425440035354796e-05, 'epoch': 0.51}
+{'loss': 159.0581, 'grad_norm': 1751.2544895065166, 'learning_rate': 1.4250454762005035e-05, 'epoch': 0.51}
+{'loss': 158.2602, 'grad_norm': 1914.2478230977183, 'learning_rate': 1.4246508362694646e-05, 'epoch': 0.51}
+{'loss': 162.3339, 'grad_norm': 1910.6064032709476, 'learning_rate': 1.4242561156366783e-05, 'epoch': 0.51}
+{'loss': 154.7191, 'grad_norm': 2038.0582382058367, 'learning_rate': 1.423861314377158e-05, 'epoch': 0.51}
+{'loss': 159.2121, 'grad_norm': 2003.599069655081, 'learning_rate': 1.4234664325659324e-05, 'epoch': 0.51}
+{'loss': 158.2582, 'grad_norm': 2054.4685704927424, 'learning_rate': 1.4230714702780466e-05, 'epoch': 0.51}
+{'loss': 159.489, 'grad_norm': 1927.031739002766, 'learning_rate': 1.42267642758856e-05, 'epoch': 0.51}
+{'loss': 159.1386, 'grad_norm': 2018.4958560044322, 'learning_rate': 1.4222813045725467e-05, 'epoch': 0.51}
+{'loss': 154.3344, 'grad_norm': 1687.1508491159868, 'learning_rate': 1.4218861013050982e-05, 'epoch': 0.51}
+{'loss': 158.532, 'grad_norm': 2082.8089423942993, 'learning_rate': 1.421490817861319e-05, 'epoch': 0.52}
+{'loss': 159.1496, 'grad_norm': 1935.9067113321673, 'learning_rate': 1.4210954543163304e-05, 'epoch': 0.52}
+{'loss': 154.6141, 'grad_norm': 1968.285034428921, 'learning_rate': 1.4207000107452682e-05, 'epoch': 0.52}
+{'loss': 157.397, 'grad_norm': 1958.083150026154, 'learning_rate': 1.420304487223283e-05, 'epoch': 0.52}
+{'loss': 158.3256, 'grad_norm': 1781.7758231676626, 'learning_rate': 1.4199088838255419e-05, 'epoch': 0.52}
+{'loss': 161.6044, 'grad_norm': 1713.0977942323493, 'learning_rate': 1.4195132006272259e-05, 'epoch': 0.52}
+{'loss': 156.2626, 'grad_norm': 2112.7822598030007, 'learning_rate': 1.4191174377035316e-05, 'epoch': 0.52}
+{'loss': 157.9784, 'grad_norm': 1789.9255148317443, 'learning_rate': 1.4187215951296714e-05, 'epoch': 0.52}
+{'loss': 159.2513, 'grad_norm': 2052.861221225635, 'learning_rate': 1.4183256729808713e-05, 'epoch': 0.52}
+{'loss': 158.9098, 'grad_norm': 1941.5317083293198, 'learning_rate': 1.4179296713323742e-05, 'epoch': 0.52}
+{'loss': 158.2632, 'grad_norm': 1848.5535759175978, 'learning_rate': 1.4175335902594372e-05, 'epoch': 0.52}
+{'loss': 157.2112, 'grad_norm': 2278.1648725413306, 'learning_rate': 1.4171374298373319e-05, 'epoch': 0.52}
+{'loss': 152.51, 'grad_norm': 2008.6374567010268, 'learning_rate': 1.4167411901413468e-05, 'epoch': 0.52}
+{'loss': 161.9547, 'grad_norm': 1917.659103402022, 'learning_rate': 1.4163448712467828e-05, 'epoch': 0.52}
+{'loss': 156.0664, 'grad_norm': 1972.6859108913477, 'learning_rate': 1.415948473228959e-05, 'epoch': 0.52}
+{'loss': 161.4639, 'grad_norm': 7356.338513218958, 'learning_rate': 1.4155519961632069e-05, 'epoch': 0.52}
+{'loss': 157.1969, 'grad_norm': 1817.4304321607883, 'learning_rate': 1.4151554401248739e-05, 'epoch': 0.52}
+{'loss': 154.0262, 'grad_norm': 1891.6655178271424, 'learning_rate': 1.4147588051893233e-05, 'epoch': 0.52}
+{'loss': 156.2108, 'grad_norm': 2076.4826069595683, 'learning_rate': 1.4143620914319321e-05, 'epoch': 0.52}
+{'loss': 152.7988, 'grad_norm': 2086.9033538062095, 'learning_rate': 1.4139652989280928e-05, 'epoch': 0.52}
+{'loss': 157.5128, 'grad_norm': 1688.7039984943299, 'learning_rate': 1.4135684277532133e-05, 'epoch': 0.52}
+{'loss': 154.8171, 'grad_norm': 2173.7197164648187, 'learning_rate': 1.4131714779827155e-05, 'epoch': 0.52}
+{'loss': 159.5276, 'grad_norm': 1862.4734279795805, 'learning_rate': 1.4127744496920373e-05, 'epoch': 0.52}
+{'loss': 154.733, 'grad_norm': 1903.123501580214, 'learning_rate': 1.4123773429566306e-05, 'epoch': 0.52}
+{'loss': 155.1439, 'grad_norm': 1893.8506812087667, 'learning_rate': 1.4119801578519625e-05, 'epoch': 0.52}
+{'loss': 159.7112, 'grad_norm': 1790.0565467204992, 'learning_rate': 1.4115828944535155e-05, 'epoch': 0.52}
+{'loss': 156.6149, 'grad_norm': 2891.256006998661, 'learning_rate': 1.4111855528367861e-05, 'epoch': 0.52}
+{'loss': 159.3322, 'grad_norm': 2149.8864082096406, 'learning_rate': 1.4107881330772868e-05, 'epoch': 0.52}
+{'loss': 157.7613, 'grad_norm': 1729.1712170712096, 'learning_rate': 1.4103906352505435e-05, 'epoch': 0.52}
+{'loss': 157.2642, 'grad_norm': 1870.9627717669428, 'learning_rate': 1.4099930594320983e-05, 'epoch': 0.52}
+{'loss': 155.4628, 'grad_norm': 1973.2299579928917, 'learning_rate': 1.4095954056975074e-05, 'epoch': 0.52}
+{'loss': 155.4425, 'grad_norm': 2082.913949519445, 'learning_rate': 1.4091976741223414e-05, 'epoch': 0.52}
+{'loss': 162.1295, 'grad_norm': 2104.2269893628304, 'learning_rate': 1.4087998647821875e-05, 'epoch': 0.52}
+{'loss': 162.1022, 'grad_norm': 2507.9836049758096, 'learning_rate': 1.4084019777526452e-05, 'epoch': 0.52}
+{'loss': 153.7844, 'grad_norm': 1857.304623810958, 'learning_rate': 1.4080040131093302e-05, 'epoch': 0.52}
+{'loss': 150.0759, 'grad_norm': 1870.1151844076257, 'learning_rate': 1.4076059709278736e-05, 'epoch': 0.52}
+{'loss': 153.4065, 'grad_norm': 2006.223081351989, 'learning_rate': 1.4072078512839191e-05, 'epoch': 0.52}
+{'loss': 159.2288, 'grad_norm': 1932.305211411251, 'learning_rate': 1.4068096542531273e-05, 'epoch': 0.52}
+{'loss': 152.0883, 'grad_norm': 2039.503567194097, 'learning_rate': 1.4064113799111725e-05, 'epoch': 0.52}
+{'loss': 152.8591, 'grad_norm': 1743.9496566046246, 'learning_rate': 1.4060130283337432e-05, 'epoch': 0.52}
+{'loss': 156.834, 'grad_norm': 2167.6555371619343, 'learning_rate': 1.405614599596544e-05, 'epoch': 0.52}
+{'loss': 160.1533, 'grad_norm': 2039.1360989668544, 'learning_rate': 1.4052160937752925e-05, 'epoch': 0.52}
+{'loss': 159.2275, 'grad_norm': 1929.8554543745138, 'learning_rate': 1.4048175109457225e-05, 'epoch': 0.52}
+{'loss': 157.7422, 'grad_norm': 2256.312847792475, 'learning_rate': 1.4044188511835815e-05, 'epoch': 0.52}
+{'loss': 155.6913, 'grad_norm': 1872.9386177593776, 'learning_rate': 1.4040201145646313e-05, 'epoch': 0.52}
+{'loss': 156.6117, 'grad_norm': 2014.5550214129912, 'learning_rate': 1.4036213011646496e-05, 'epoch': 0.52}
+{'loss': 155.8016, 'grad_norm': 1842.9592013083309, 'learning_rate': 1.4032224110594275e-05, 'epoch': 0.52}
+{'loss': 162.0273, 'grad_norm': 1771.022605568496, 'learning_rate': 1.4028234443247712e-05, 'epoch': 0.52}
+{'loss': 152.3885, 'grad_norm': 1866.70231072701, 'learning_rate': 1.4024244010365018e-05, 'epoch': 0.52}
+{'loss': 151.312, 'grad_norm': 2317.00204890682, 'learning_rate': 1.4020252812704537e-05, 'epoch': 0.52} +{'loss': 159.0778, 'grad_norm': 2395.978600141705, 'learning_rate': 1.4016260851024772e-05, 'epoch': 0.52} +{'loss': 156.1584, 'grad_norm': 2055.8128672666835, 'learning_rate': 1.4012268126084365e-05, 'epoch': 0.52} +{'loss': 155.0622, 'grad_norm': 2390.1937002322798, 'learning_rate': 1.4008274638642103e-05, 'epoch': 0.53} +{'loss': 154.4021, 'grad_norm': 2672.267431327345, 'learning_rate': 1.400428038945692e-05, 'epoch': 0.53} +{'loss': 152.0438, 'grad_norm': 1873.8650307112887, 'learning_rate': 1.4000285379287892e-05, 'epoch': 0.53} +{'loss': 152.1684, 'grad_norm': 1750.2975920205315, 'learning_rate': 1.3996289608894242e-05, 'epoch': 0.53} +{'loss': 156.2587, 'grad_norm': 1858.1191899960966, 'learning_rate': 1.3992293079035337e-05, 'epoch': 0.53} +{'loss': 154.793, 'grad_norm': 1812.0512914723474, 'learning_rate': 1.3988295790470687e-05, 'epoch': 0.53} +{'loss': 154.939, 'grad_norm': 2183.102045876125, 'learning_rate': 1.3984297743959947e-05, 'epoch': 0.53} +{'loss': 157.8435, 'grad_norm': 1817.4875834435425, 'learning_rate': 1.3980298940262918e-05, 'epoch': 0.53} +{'loss': 157.9414, 'grad_norm': 1770.131899244717, 'learning_rate': 1.397629938013954e-05, 'epoch': 0.53} +{'loss': 156.6996, 'grad_norm': 2162.662400588475, 'learning_rate': 1.3972299064349898e-05, 'epoch': 0.53} +{'loss': 154.5824, 'grad_norm': 2052.6743972585914, 'learning_rate': 1.3968297993654232e-05, 'epoch': 0.53} +{'loss': 153.6714, 'grad_norm': 1737.409258807058, 'learning_rate': 1.3964296168812908e-05, 'epoch': 0.53} +{'loss': 155.8615, 'grad_norm': 2144.66287790067, 'learning_rate': 1.3960293590586443e-05, 'epoch': 0.53} +{'loss': 156.8506, 'grad_norm': 1930.943810800277, 'learning_rate': 1.3956290259735499e-05, 'epoch': 0.53} +{'loss': 154.9202, 'grad_norm': 1884.7213638463195, 'learning_rate': 1.3952286177020879e-05, 'epoch': 0.53} +{'loss': 154.3849, 'grad_norm': 1797.595373613977, 'learning_rate': 1.3948281343203533e-05, 'epoch': 0.53} +{'loss': 158.9282, 'grad_norm': 1791.3580618804347, 'learning_rate': 1.3944275759044545e-05, 'epoch': 0.53} +{'loss': 160.5654, 'grad_norm': 1786.9728876179101, 'learning_rate': 1.394026942530515e-05, 'epoch': 0.53} +{'loss': 156.0668, 'grad_norm': 1994.7465631089162, 'learning_rate': 1.3936262342746719e-05, 'epoch': 0.53} +{'loss': 149.9021, 'grad_norm': 1992.8446810435705, 'learning_rate': 1.3932254512130768e-05, 'epoch': 0.53} +{'loss': 157.4511, 'grad_norm': 2217.648195775919, 'learning_rate': 1.3928245934218963e-05, 'epoch': 0.53} +{'loss': 170.781, 'grad_norm': 2505.6771653991173, 'learning_rate': 1.3924236609773094e-05, 'epoch': 0.53} +{'loss': 156.4806, 'grad_norm': 2487.9107099224375, 'learning_rate': 1.3920226539555109e-05, 'epoch': 0.53} +{'loss': 151.8503, 'grad_norm': 2307.661465288968, 'learning_rate': 1.391621572432709e-05, 'epoch': 0.53} +{'loss': 156.0474, 'grad_norm': 2010.1695342217554, 'learning_rate': 1.3912204164851268e-05, 'epoch': 0.53} +{'loss': 153.3225, 'grad_norm': 1808.7892252349318, 'learning_rate': 1.3908191861890007e-05, 'epoch': 0.53} +{'loss': 152.6163, 'grad_norm': 2205.2443933350964, 'learning_rate': 1.3904178816205813e-05, 'epoch': 0.53} +{'loss': 162.1908, 'grad_norm': 2207.9785277095534, 'learning_rate': 1.3900165028561338e-05, 'epoch': 0.53} +{'loss': 156.1196, 'grad_norm': 1833.4775203374263, 'learning_rate': 1.3896150499719372e-05, 'epoch': 0.53} +{'loss': 154.4108, 'grad_norm': 2244.4374169892317, 'learning_rate': 
1.3892135230442846e-05, 'epoch': 0.53} +{'loss': 155.5766, 'grad_norm': 1905.851473884484, 'learning_rate': 1.3888119221494835e-05, 'epoch': 0.53} +{'loss': 151.6438, 'grad_norm': 1853.7268127303614, 'learning_rate': 1.3884102473638549e-05, 'epoch': 0.53} +{'loss': 159.6549, 'grad_norm': 2077.8228094494825, 'learning_rate': 1.388008498763734e-05, 'epoch': 0.53} +{'loss': 152.6171, 'grad_norm': 1863.98932706872, 'learning_rate': 1.3876066764254707e-05, 'epoch': 0.53} +{'loss': 158.4037, 'grad_norm': 1742.0385127308907, 'learning_rate': 1.3872047804254276e-05, 'epoch': 0.53} +{'loss': 149.8496, 'grad_norm': 1536.0773895956831, 'learning_rate': 1.3868028108399829e-05, 'epoch': 0.53} +{'loss': 154.7025, 'grad_norm': 1921.1218937830026, 'learning_rate': 1.3864007677455271e-05, 'epoch': 0.53} +{'loss': 157.2209, 'grad_norm': 1991.6982639734074, 'learning_rate': 1.3859986512184663e-05, 'epoch': 0.53} +{'loss': 153.4409, 'grad_norm': 1867.9278677517086, 'learning_rate': 1.385596461335219e-05, 'epoch': 0.53} +{'loss': 155.614, 'grad_norm': 1822.1113329084237, 'learning_rate': 1.3851941981722188e-05, 'epoch': 0.53} +{'loss': 153.3759, 'grad_norm': 2223.238311526777, 'learning_rate': 1.3847918618059133e-05, 'epoch': 0.53} +{'loss': 153.0081, 'grad_norm': 1890.4268173378898, 'learning_rate': 1.3843894523127626e-05, 'epoch': 0.53} +{'loss': 156.6433, 'grad_norm': 2243.09771156773, 'learning_rate': 1.383986969769242e-05, 'epoch': 0.53} +{'loss': 158.6426, 'grad_norm': 1992.2673636824577, 'learning_rate': 1.383584414251841e-05, 'epoch': 0.53} +{'loss': 157.4137, 'grad_norm': 2136.4350333731304, 'learning_rate': 1.3831817858370614e-05, 'epoch': 0.53} +{'loss': 159.8879, 'grad_norm': 1933.162917336748, 'learning_rate': 1.3827790846014205e-05, 'epoch': 0.53} +{'loss': 154.337, 'grad_norm': 2869.4775056515687, 'learning_rate': 1.3823763106214476e-05, 'epoch': 0.53} +{'loss': 156.9305, 'grad_norm': 2451.5419547962815, 'learning_rate': 1.3819734639736879e-05, 'epoch': 0.53} +{'loss': 158.1551, 'grad_norm': 1742.43174360451, 'learning_rate': 1.3815705447346994e-05, 'epoch': 0.53} +{'loss': 155.5401, 'grad_norm': 2121.8334013638837, 'learning_rate': 1.3811675529810535e-05, 'epoch': 0.53} +{'loss': 152.1289, 'grad_norm': 1997.8014899567386, 'learning_rate': 1.380764488789336e-05, 'epoch': 0.53} +{'loss': 153.0646, 'grad_norm': 2379.625262527689, 'learning_rate': 1.380361352236146e-05, 'epoch': 0.53} +{'loss': 157.8924, 'grad_norm': 2796.2861773442432, 'learning_rate': 1.3799581433980967e-05, 'epoch': 0.54} +{'loss': 158.048, 'grad_norm': 1989.8503101118886, 'learning_rate': 1.3795548623518154e-05, 'epoch': 0.54} +{'loss': 154.4454, 'grad_norm': 1849.8118808549111, 'learning_rate': 1.379151509173942e-05, 'epoch': 0.54} +{'loss': 151.9303, 'grad_norm': 2131.2026712315087, 'learning_rate': 1.3787480839411314e-05, 'epoch': 0.54} +{'loss': 153.1317, 'grad_norm': 2076.9686697959314, 'learning_rate': 1.3783445867300515e-05, 'epoch': 0.54} +{'loss': 152.5163, 'grad_norm': 2277.2967575245057, 'learning_rate': 1.3779410176173833e-05, 'epoch': 0.54} +{'loss': 154.1103, 'grad_norm': 1820.9416701550556, 'learning_rate': 1.377537376679823e-05, 'epoch': 0.54} +{'loss': 152.7276, 'grad_norm': 1994.7118268293784, 'learning_rate': 1.3771336639940791e-05, 'epoch': 0.54} +{'loss': 156.0116, 'grad_norm': 1827.0867143206963, 'learning_rate': 1.3767298796368743e-05, 'epoch': 0.54} +{'loss': 152.3035, 'grad_norm': 2049.450488361458, 'learning_rate': 1.3763260236849448e-05, 'epoch': 0.54} +{'loss': 151.0595, 'grad_norm': 
1725.076311124163, 'learning_rate': 1.37592209621504e-05, 'epoch': 0.54} +{'loss': 153.3466, 'grad_norm': 2034.958485307947, 'learning_rate': 1.3755180973039241e-05, 'epoch': 0.54} +{'loss': 154.0317, 'grad_norm': 1959.3708243052954, 'learning_rate': 1.3751140270283738e-05, 'epoch': 0.54} +{'loss': 154.8337, 'grad_norm': 1876.201809746128, 'learning_rate': 1.3747098854651797e-05, 'epoch': 0.54} +{'loss': 156.9593, 'grad_norm': 1900.490918464178, 'learning_rate': 1.374305672691146e-05, 'epoch': 0.54} +{'loss': 154.3234, 'grad_norm': 1942.3729034478906, 'learning_rate': 1.3739013887830896e-05, 'epoch': 0.54} +{'loss': 153.3052, 'grad_norm': 1955.2415508256433, 'learning_rate': 1.3734970338178429e-05, 'epoch': 0.54} +{'loss': 153.5988, 'grad_norm': 1625.8741185319, 'learning_rate': 1.3730926078722497e-05, 'epoch': 0.54} +{'loss': 154.7425, 'grad_norm': 1784.7946611715342, 'learning_rate': 1.3726881110231682e-05, 'epoch': 0.54} +[2025-05-28 18:09:46,599] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step28000 is about to be saved! +[2025-05-28 18:09:46,675] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-28000/global_step28000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-28 18:09:46,675] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-28000/global_step28000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-28 18:09:46,726] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-28000/global_step28000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-28 18:09:46,883] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-28000/global_step28000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-28 18:16:01,819] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-28000/global_step28000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2025-05-28 18:16:01,950] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-28000/global_step28000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-28 18:16:01,962] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step28000 is ready now! +[2025-05-28 18:16:01,992] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step28000 is about to be saved! +[2025-05-28 18:16:02,017] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-28000/global_step28000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-28 18:16:02,017] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-28000/global_step28000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-28 18:16:02,090] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-28000/global_step28000/zero_pp_rank_0_mp_rank_00_model_states.pt. 
+[2025-05-28 18:16:02,113] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-28000/global_step28000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-28 18:22:07,646] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-28000/global_step28000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2025-05-28 18:22:07,778] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-28000/global_step28000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-28 18:22:10,567] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step28000 is ready now! +{'loss': 156.6985, 'grad_norm': 2023.1429303994955, 'learning_rate': 1.3722835433474704e-05, 'epoch': 0.54} +{'loss': 153.3842, 'grad_norm': 1891.2213541148258, 'learning_rate': 1.3718789049220412e-05, 'epoch': 0.54} +{'loss': 154.8435, 'grad_norm': 1729.7893704345936, 'learning_rate': 1.371474195823779e-05, 'epoch': 0.54} +{'loss': 153.3253, 'grad_norm': 1598.853219960935, 'learning_rate': 1.371069416129596e-05, 'epoch': 0.54} +{'loss': 154.9979, 'grad_norm': 2484.107696763548, 'learning_rate': 1.3706645659164174e-05, 'epoch': 0.54} +{'loss': 155.2877, 'grad_norm': 1940.4623286796223, 'learning_rate': 1.3702596452611822e-05, 'epoch': 0.54} +{'loss': 158.8657, 'grad_norm': 1774.5850856346387, 'learning_rate': 1.3698546542408424e-05, 'epoch': 0.54} +{'loss': 154.4207, 'grad_norm': 1981.077257518629, 'learning_rate': 1.3694495929323635e-05, 'epoch': 0.54} +{'loss': 156.174, 'grad_norm': 2585.8884219711226, 'learning_rate': 1.3690444614127248e-05, 'epoch': 0.54} +{'loss': 155.7946, 'grad_norm': 2194.865763662328, 'learning_rate': 1.3686392597589178e-05, 'epoch': 0.54} +{'loss': 152.5329, 'grad_norm': 2078.880112070112, 'learning_rate': 1.368233988047949e-05, 'epoch': 0.54} +{'loss': 152.755, 'grad_norm': 2009.467160459642, 'learning_rate': 1.3678286463568365e-05, 'epoch': 0.54} +{'loss': 155.0055, 'grad_norm': 1907.4544414982922, 'learning_rate': 1.3674232347626126e-05, 'epoch': 0.54} +{'loss': 153.9251, 'grad_norm': 1809.195635315569, 'learning_rate': 1.3670177533423234e-05, 'epoch': 0.54} +{'loss': 152.5125, 'grad_norm': 1753.0095994295405, 'learning_rate': 1.3666122021730272e-05, 'epoch': 0.54} +{'loss': 151.2744, 'grad_norm': 1927.1689016627613, 'learning_rate': 1.3662065813317958e-05, 'epoch': 0.54} +{'loss': 151.9349, 'grad_norm': 1825.5690367998188, 'learning_rate': 1.3658008908957145e-05, 'epoch': 0.54} +{'loss': 155.3791, 'grad_norm': 1756.4849896286366, 'learning_rate': 1.3653951309418823e-05, 'epoch': 0.54} +{'loss': 162.5215, 'grad_norm': 4211.1255598708085, 'learning_rate': 1.3649893015474103e-05, 'epoch': 0.54} +{'loss': 150.7312, 'grad_norm': 1776.2186588593133, 'learning_rate': 1.3645834027894235e-05, 'epoch': 0.54} +{'loss': 155.063, 'grad_norm': 1741.5043148707748, 'learning_rate': 1.36417743474506e-05, 'epoch': 0.54} +{'loss': 159.6844, 'grad_norm': 2208.024557646125, 'learning_rate': 1.3637713974914711e-05, 'epoch': 0.54} +{'loss': 156.2505, 'grad_norm': 1955.4342262269936, 'learning_rate': 1.3633652911058211e-05, 'epoch': 0.54} +{'loss': 155.1519, 'grad_norm': 1725.7336890580423, 'learning_rate': 1.3629591156652876e-05, 'epoch': 0.54} +{'loss': 159.2656, 'grad_norm': 1808.0891329560372, 'learning_rate': 1.362552871247061e-05, 'epoch': 0.54} +{'loss': 149.5297, 'grad_norm': 
2007.1289751075544, 'learning_rate': 1.3621465579283453e-05, 'epoch': 0.54} +{'loss': 154.1513, 'grad_norm': 1893.9658548156353, 'learning_rate': 1.361740175786357e-05, 'epoch': 0.54} +{'loss': 155.0116, 'grad_norm': 2682.04142377919, 'learning_rate': 1.3613337248983265e-05, 'epoch': 0.54} +{'loss': 154.76, 'grad_norm': 1670.2789347665553, 'learning_rate': 1.3609272053414965e-05, 'epoch': 0.54} +{'loss': 150.0861, 'grad_norm': 1733.9447852608812, 'learning_rate': 1.3605206171931232e-05, 'epoch': 0.54} +{'loss': 153.0796, 'grad_norm': 1816.8685003409814, 'learning_rate': 1.3601139605304754e-05, 'epoch': 0.54} +{'loss': 149.4288, 'grad_norm': 1788.6251264189755, 'learning_rate': 1.3597072354308351e-05, 'epoch': 0.54} +{'loss': 154.5149, 'grad_norm': 2253.7702433341183, 'learning_rate': 1.359300441971498e-05, 'epoch': 0.54} +{'loss': 153.8381, 'grad_norm': 1977.040386371444, 'learning_rate': 1.3588935802297717e-05, 'epoch': 0.55} +{'loss': 156.8142, 'grad_norm': 2421.4203005193694, 'learning_rate': 1.3584866502829774e-05, 'epoch': 0.55} +{'loss': 153.7235, 'grad_norm': 2293.547774034466, 'learning_rate': 1.358079652208449e-05, 'epoch': 0.55} +{'loss': 152.0078, 'grad_norm': 1763.033525351545, 'learning_rate': 1.3576725860835336e-05, 'epoch': 0.55} +{'loss': 156.2052, 'grad_norm': 1836.6547019145876, 'learning_rate': 1.3572654519855913e-05, 'epoch': 0.55} +{'loss': 151.4086, 'grad_norm': 1946.6943913945704, 'learning_rate': 1.3568582499919941e-05, 'epoch': 0.55} +{'loss': 150.0806, 'grad_norm': 2096.0354436154253, 'learning_rate': 1.3564509801801286e-05, 'epoch': 0.55} +{'loss': 152.3024, 'grad_norm': 1915.8091598978062, 'learning_rate': 1.3560436426273933e-05, 'epoch': 0.55} +{'loss': 153.6322, 'grad_norm': 1759.4203921143937, 'learning_rate': 1.3556362374111993e-05, 'epoch': 0.55} +{'loss': 160.2011, 'grad_norm': 1972.4164733725552, 'learning_rate': 1.3552287646089712e-05, 'epoch': 0.55} +{'loss': 151.4458, 'grad_norm': 2082.095172814072, 'learning_rate': 1.3548212242981462e-05, 'epoch': 0.55} +{'loss': 154.7203, 'grad_norm': 1903.7525394430327, 'learning_rate': 1.3544136165561742e-05, 'epoch': 0.55} +{'loss': 154.4966, 'grad_norm': 1878.579414325596, 'learning_rate': 1.3540059414605183e-05, 'epoch': 0.55} +{'loss': 151.7046, 'grad_norm': 2122.5106984011663, 'learning_rate': 1.3535981990886535e-05, 'epoch': 0.55} +{'loss': 163.3569, 'grad_norm': 2312.099926105437, 'learning_rate': 1.3531903895180692e-05, 'epoch': 0.55} +{'loss': 173.3694, 'grad_norm': 2607.1186548225564, 'learning_rate': 1.3527825128262656e-05, 'epoch': 0.55} +{'loss': 152.5111, 'grad_norm': 1676.3016299508815, 'learning_rate': 1.3523745690907572e-05, 'epoch': 0.55} +{'loss': 155.3354, 'grad_norm': 1763.139095732711, 'learning_rate': 1.3519665583890708e-05, 'epoch': 0.55} +{'loss': 156.1827, 'grad_norm': 1780.0822361600322, 'learning_rate': 1.3515584807987452e-05, 'epoch': 0.55} +{'loss': 155.7381, 'grad_norm': 3639.6545515804114, 'learning_rate': 1.3511503363973333e-05, 'epoch': 0.55} +{'loss': 155.9905, 'grad_norm': 1899.725356585475, 'learning_rate': 1.3507421252623993e-05, 'epoch': 0.55} +{'loss': 150.3074, 'grad_norm': 1925.8577201530586, 'learning_rate': 1.3503338474715211e-05, 'epoch': 0.55} +{'loss': 152.632, 'grad_norm': 1778.5120636234299, 'learning_rate': 1.3499255031022887e-05, 'epoch': 0.55} +{'loss': 147.1654, 'grad_norm': 1933.2856832482223, 'learning_rate': 1.3495170922323047e-05, 'epoch': 0.55} +{'loss': 155.0, 'grad_norm': 2023.2557781514438, 'learning_rate': 1.349108614939185e-05, 'epoch': 0.55} +{'loss': 
152.7049, 'grad_norm': 1835.6751567531683, 'learning_rate': 1.3487000713005574e-05, 'epoch': 0.55} +{'loss': 163.437, 'grad_norm': 1850.0381413175448, 'learning_rate': 1.3482914613940625e-05, 'epoch': 0.55} +{'loss': 157.356, 'grad_norm': 1900.72682053037, 'learning_rate': 1.3478827852973541e-05, 'epoch': 0.55} +{'loss': 156.4203, 'grad_norm': 1946.1076588461033, 'learning_rate': 1.3474740430880977e-05, 'epoch': 0.55} +{'loss': 152.3331, 'grad_norm': 1778.167980241622, 'learning_rate': 1.3470652348439715e-05, 'epoch': 0.55} +{'loss': 151.6968, 'grad_norm': 1804.7482034544087, 'learning_rate': 1.3466563606426669e-05, 'epoch': 0.55} +{'loss': 156.5375, 'grad_norm': 2763.3182738488986, 'learning_rate': 1.3462474205618872e-05, 'epoch': 0.55} +{'loss': 153.599, 'grad_norm': 1792.7575653810954, 'learning_rate': 1.3458384146793486e-05, 'epoch': 0.55} +{'loss': 160.187, 'grad_norm': 1860.312350204221, 'learning_rate': 1.3454293430727794e-05, 'epoch': 0.55} +{'loss': 152.6867, 'grad_norm': 1975.139247776751, 'learning_rate': 1.345020205819921e-05, 'epoch': 0.55} +{'loss': 154.2961, 'grad_norm': 2017.4016601591861, 'learning_rate': 1.3446110029985267e-05, 'epoch': 0.55} +{'loss': 154.078, 'grad_norm': 2044.4036195107992, 'learning_rate': 1.3442017346863618e-05, 'epoch': 0.55} +{'loss': 149.5239, 'grad_norm': 1821.8786892719834, 'learning_rate': 1.343792400961206e-05, 'epoch': 0.55} +{'loss': 152.4245, 'grad_norm': 1698.1387880407533, 'learning_rate': 1.3433830019008494e-05, 'epoch': 0.55} +{'loss': 154.0634, 'grad_norm': 1832.0727253778546, 'learning_rate': 1.342973537583095e-05, 'epoch': 0.55} +{'loss': 154.2211, 'grad_norm': 2100.171663735968, 'learning_rate': 1.3425640080857594e-05, 'epoch': 0.55} +{'loss': 176.3173, 'grad_norm': 2304.417429893227, 'learning_rate': 1.3421544134866694e-05, 'epoch': 0.55} +{'loss': 150.581, 'grad_norm': 1774.340210823349, 'learning_rate': 1.3417447538636665e-05, 'epoch': 0.55} +{'loss': 153.4829, 'grad_norm': 1783.8364857248598, 'learning_rate': 1.341335029294603e-05, 'epoch': 0.55} +{'loss': 155.5279, 'grad_norm': 2038.5792962012663, 'learning_rate': 1.340925239857344e-05, 'epoch': 0.55} +{'loss': 157.0888, 'grad_norm': 1821.5320540123778, 'learning_rate': 1.3405153856297673e-05, 'epoch': 0.55} +{'loss': 149.9089, 'grad_norm': 1677.2854044300357, 'learning_rate': 1.3401054666897619e-05, 'epoch': 0.55} +{'loss': 156.2744, 'grad_norm': 1809.7767359477257, 'learning_rate': 1.339695483115231e-05, 'epoch': 0.55} +{'loss': 152.6864, 'grad_norm': 2003.149681783896, 'learning_rate': 1.3392854349840878e-05, 'epoch': 0.55} +{'loss': 153.7977, 'grad_norm': 2033.8291694279615, 'learning_rate': 1.3388753223742594e-05, 'epoch': 0.55} +{'loss': 163.4245, 'grad_norm': 7272.873888956933, 'learning_rate': 1.338465145363685e-05, 'epoch': 0.55} +{'loss': 152.7847, 'grad_norm': 1897.5124556606556, 'learning_rate': 1.338054904030315e-05, 'epoch': 0.55} +{'loss': 169.8993, 'grad_norm': 2300.834984416165, 'learning_rate': 1.3376445984521134e-05, 'epoch': 0.56} +{'loss': 154.2397, 'grad_norm': 2091.3800941542945, 'learning_rate': 1.3372342287070556e-05, 'epoch': 0.56} +{'loss': 151.9059, 'grad_norm': 1855.5218965141048, 'learning_rate': 1.3368237948731287e-05, 'epoch': 0.56} +{'loss': 153.5619, 'grad_norm': 2067.099670768915, 'learning_rate': 1.3364132970283335e-05, 'epoch': 0.56} +{'loss': 151.1174, 'grad_norm': 1954.6868344627103, 'learning_rate': 1.3360027352506815e-05, 'epoch': 0.56} +{'loss': 153.002, 'grad_norm': 1870.615899734782, 'learning_rate': 1.335592109618197e-05, 'epoch': 
0.56} +{'loss': 155.5786, 'grad_norm': 2098.6547542095254, 'learning_rate': 1.3351814202089169e-05, 'epoch': 0.56} +{'loss': 151.6638, 'grad_norm': 1820.7583466799897, 'learning_rate': 1.3347706671008887e-05, 'epoch': 0.56} +{'loss': 150.5962, 'grad_norm': 1735.699768827637, 'learning_rate': 1.3343598503721737e-05, 'epoch': 0.56} +{'loss': 152.8969, 'grad_norm': 1911.4210194574375, 'learning_rate': 1.3339489701008443e-05, 'epoch': 0.56} +{'loss': 151.789, 'grad_norm': 1805.1879104622453, 'learning_rate': 1.3335380263649855e-05, 'epoch': 0.56} +{'loss': 154.0635, 'grad_norm': 2722.7553121699384, 'learning_rate': 1.3331270192426941e-05, 'epoch': 0.56} +{'loss': 150.9435, 'grad_norm': 1910.1229790669622, 'learning_rate': 1.3327159488120784e-05, 'epoch': 0.56} +{'loss': 159.3516, 'grad_norm': 1919.4385275373852, 'learning_rate': 1.3323048151512602e-05, 'epoch': 0.56} +{'loss': 150.4715, 'grad_norm': 1781.9662409919063, 'learning_rate': 1.3318936183383718e-05, 'epoch': 0.56} +{'loss': 153.4674, 'grad_norm': 1655.83919330606, 'learning_rate': 1.3314823584515581e-05, 'epoch': 0.56} +{'loss': 155.1532, 'grad_norm': 1778.967072827619, 'learning_rate': 1.3310710355689765e-05, 'epoch': 0.56} +{'loss': 154.7535, 'grad_norm': 1644.9296747783362, 'learning_rate': 1.3306596497687948e-05, 'epoch': 0.56} +{'loss': 151.4155, 'grad_norm': 2223.0237974697634, 'learning_rate': 1.3302482011291953e-05, 'epoch': 0.56} +{'loss': 156.1238, 'grad_norm': 1957.3535176544565, 'learning_rate': 1.3298366897283697e-05, 'epoch': 0.56} +{'loss': 154.019, 'grad_norm': 2061.0841743766123, 'learning_rate': 1.3294251156445229e-05, 'epoch': 0.56} +{'loss': 152.2918, 'grad_norm': 1917.8400695427629, 'learning_rate': 1.3290134789558717e-05, 'epoch': 0.56} +{'loss': 152.7692, 'grad_norm': 1823.4465945024156, 'learning_rate': 1.3286017797406445e-05, 'epoch': 0.56} +{'loss': 154.0685, 'grad_norm': 2048.519398471395, 'learning_rate': 1.3281900180770818e-05, 'epoch': 0.56} +{'loss': 150.7697, 'grad_norm': 7527.461064244913, 'learning_rate': 1.3277781940434355e-05, 'epoch': 0.56} +{'loss': 155.5683, 'grad_norm': 1643.1131115416179, 'learning_rate': 1.3273663077179701e-05, 'epoch': 0.56} +{'loss': 150.2276, 'grad_norm': 2061.989515460547, 'learning_rate': 1.3269543591789616e-05, 'epoch': 0.56} +{'loss': 155.5534, 'grad_norm': 1809.2709063104414, 'learning_rate': 1.3265423485046973e-05, 'epoch': 0.56} +{'loss': 152.1283, 'grad_norm': 1869.148371828223, 'learning_rate': 1.3261302757734772e-05, 'epoch': 0.56} +{'loss': 150.0901, 'grad_norm': 2022.405291954525, 'learning_rate': 1.3257181410636123e-05, 'epoch': 0.56} +{'loss': 155.38, 'grad_norm': 1877.1101355299218, 'learning_rate': 1.3253059444534264e-05, 'epoch': 0.56} +{'loss': 151.455, 'grad_norm': 2211.8212475050873, 'learning_rate': 1.3248936860212537e-05, 'epoch': 0.56} +{'loss': 153.3823, 'grad_norm': 2240.2383977677077, 'learning_rate': 1.3244813658454414e-05, 'epoch': 0.56} +{'loss': 156.6627, 'grad_norm': 2191.444992020077, 'learning_rate': 1.3240689840043475e-05, 'epoch': 0.56} +{'loss': 158.282, 'grad_norm': 2424.172059183592, 'learning_rate': 1.3236565405763423e-05, 'epoch': 0.56} +{'loss': 149.3802, 'grad_norm': 1999.8026748885507, 'learning_rate': 1.3232440356398076e-05, 'epoch': 0.56} +{'loss': 149.3701, 'grad_norm': 2086.801884207228, 'learning_rate': 1.3228314692731368e-05, 'epoch': 0.56} +{'loss': 153.1123, 'grad_norm': 1706.280928498723, 'learning_rate': 1.3224188415547352e-05, 'epoch': 0.56} +{'loss': 148.5478, 'grad_norm': 1976.118267454599, 'learning_rate': 
1.3220061525630198e-05, 'epoch': 0.56} +{'loss': 152.7536, 'grad_norm': 1946.2048378638428, 'learning_rate': 1.3215934023764185e-05, 'epoch': 0.56} +{'loss': 153.6158, 'grad_norm': 1999.6045896730514, 'learning_rate': 1.3211805910733724e-05, 'epoch': 0.56} +{'loss': 152.6488, 'grad_norm': 1810.7550480675916, 'learning_rate': 1.3207677187323324e-05, 'epoch': 0.56} +{'loss': 147.3653, 'grad_norm': 1838.711479485533, 'learning_rate': 1.320354785431762e-05, 'epoch': 0.56} +{'loss': 149.3934, 'grad_norm': 2051.9184783281135, 'learning_rate': 1.3199417912501363e-05, 'epoch': 0.56} +{'loss': 148.18, 'grad_norm': 1881.0093801634011, 'learning_rate': 1.3195287362659416e-05, 'epoch': 0.56} +{'loss': 149.1478, 'grad_norm': 1934.0682155215554, 'learning_rate': 1.3191156205576764e-05, 'epoch': 0.56} +{'loss': 153.6126, 'grad_norm': 1702.0656588401528, 'learning_rate': 1.3187024442038496e-05, 'epoch': 0.56} +{'loss': 151.6079, 'grad_norm': 1688.7942159023753, 'learning_rate': 1.3182892072829828e-05, 'epoch': 0.56} +{'loss': 150.7022, 'grad_norm': 2096.0126769686226, 'learning_rate': 1.3178759098736082e-05, 'epoch': 0.56} +{'loss': 153.6245, 'grad_norm': 1825.898616717465, 'learning_rate': 1.3174625520542706e-05, 'epoch': 0.56} +{'loss': 152.9053, 'grad_norm': 2136.897260593045, 'learning_rate': 1.3170491339035248e-05, 'epoch': 0.56} +{'loss': 157.6424, 'grad_norm': 2200.9952546009176, 'learning_rate': 1.3166356554999382e-05, 'epoch': 0.56} +{'loss': 156.4011, 'grad_norm': 2248.483615130026, 'learning_rate': 1.3162221169220894e-05, 'epoch': 0.57} +{'loss': 151.3407, 'grad_norm': 2342.102254086033, 'learning_rate': 1.3158085182485687e-05, 'epoch': 0.57} +{'loss': 149.7301, 'grad_norm': 2449.0913963574703, 'learning_rate': 1.3153948595579764e-05, 'epoch': 0.57} +{'loss': 155.5637, 'grad_norm': 1902.9062529461846, 'learning_rate': 1.3149811409289262e-05, 'epoch': 0.57} +{'loss': 152.7193, 'grad_norm': 2207.235030544086, 'learning_rate': 1.314567362440042e-05, 'epoch': 0.57} +{'loss': 150.7481, 'grad_norm': 1965.9204254116617, 'learning_rate': 1.3141535241699593e-05, 'epoch': 0.57} +{'loss': 148.8553, 'grad_norm': 2014.5876916817988, 'learning_rate': 1.313739626197325e-05, 'epoch': 0.57} +{'loss': 145.7437, 'grad_norm': 1768.2644895166588, 'learning_rate': 1.313325668600797e-05, 'epoch': 0.57} +{'loss': 151.4163, 'grad_norm': 2017.5354820264392, 'learning_rate': 1.3129116514590457e-05, 'epoch': 0.57} +{'loss': 152.7087, 'grad_norm': 2300.582602630359, 'learning_rate': 1.3124975748507514e-05, 'epoch': 0.57} +{'loss': 150.0725, 'grad_norm': 1885.4660679508188, 'learning_rate': 1.3120834388546062e-05, 'epoch': 0.57} +{'loss': 149.5732, 'grad_norm': 1876.497752358713, 'learning_rate': 1.3116692435493144e-05, 'epoch': 0.57} +{'loss': 151.2827, 'grad_norm': 1905.5000608711084, 'learning_rate': 1.3112549890135895e-05, 'epoch': 0.57} +{'loss': 151.0072, 'grad_norm': 1982.541808744184, 'learning_rate': 1.3108406753261585e-05, 'epoch': 0.57} +{'loss': 148.8464, 'grad_norm': 2226.7601354347794, 'learning_rate': 1.310426302565758e-05, 'epoch': 0.57} +{'loss': 152.3007, 'grad_norm': 1944.693658639311, 'learning_rate': 1.310011870811137e-05, 'epoch': 0.57} +{'loss': 147.8038, 'grad_norm': 2027.175618661353, 'learning_rate': 1.309597380141055e-05, 'epoch': 0.57} +{'loss': 157.2753, 'grad_norm': 1774.519970131625, 'learning_rate': 1.3091828306342829e-05, 'epoch': 0.57} +{'loss': 152.1321, 'grad_norm': 1726.3858520208, 'learning_rate': 1.3087682223696025e-05, 'epoch': 0.57} +{'loss': 148.9274, 'grad_norm': 
1907.1047699274868, 'learning_rate': 1.3083535554258076e-05, 'epoch': 0.57} +{'loss': 151.136, 'grad_norm': 2328.8880794697448, 'learning_rate': 1.3079388298817018e-05, 'epoch': 0.57} +{'loss': 158.0122, 'grad_norm': 1868.0998370765433, 'learning_rate': 1.3075240458161013e-05, 'epoch': 0.57} +{'loss': 150.2846, 'grad_norm': 1672.4955292227967, 'learning_rate': 1.3071092033078325e-05, 'epoch': 0.57} +{'loss': 152.7059, 'grad_norm': 1916.4849682820509, 'learning_rate': 1.3066943024357333e-05, 'epoch': 0.57} +{'loss': 155.0822, 'grad_norm': 1978.2494494946325, 'learning_rate': 1.3062793432786523e-05, 'epoch': 0.57} +{'loss': 149.6531, 'grad_norm': 1827.0441591283584, 'learning_rate': 1.305864325915449e-05, 'epoch': 0.57} +{'loss': 152.5685, 'grad_norm': 1919.1246509313705, 'learning_rate': 1.3054492504249954e-05, 'epoch': 0.57} +{'loss': 148.8607, 'grad_norm': 1944.6145863010772, 'learning_rate': 1.3050341168861729e-05, 'epoch': 0.57} +{'loss': 169.9089, 'grad_norm': 1888.2545993307583, 'learning_rate': 1.3046189253778746e-05, 'epoch': 0.57} +{'loss': 155.2137, 'grad_norm': 1820.2442958137326, 'learning_rate': 1.3042036759790046e-05, 'epoch': 0.57} +{'loss': 152.0015, 'grad_norm': 1894.3216519275984, 'learning_rate': 1.303788368768478e-05, 'epoch': 0.57} +{'loss': 149.6312, 'grad_norm': 1735.1707689924194, 'learning_rate': 1.303373003825221e-05, 'epoch': 0.57} +{'loss': 151.5483, 'grad_norm': 1858.7659851368292, 'learning_rate': 1.3029575812281702e-05, 'epoch': 0.57} +{'loss': 149.4518, 'grad_norm': 1841.220719540979, 'learning_rate': 1.3025421010562743e-05, 'epoch': 0.57} +{'loss': 150.058, 'grad_norm': 1840.3332601632167, 'learning_rate': 1.3021265633884917e-05, 'epoch': 0.57} +{'loss': 151.7682, 'grad_norm': 1929.5802705106823, 'learning_rate': 1.3017109683037918e-05, 'epoch': 0.57} +{'loss': 155.7634, 'grad_norm': 2501.878443649189, 'learning_rate': 1.3012953158811565e-05, 'epoch': 0.57} +{'loss': 149.993, 'grad_norm': 1944.2509840239334, 'learning_rate': 1.3008796061995772e-05, 'epoch': 0.57} +{'loss': 152.1675, 'grad_norm': 1836.0315371151094, 'learning_rate': 1.3004638393380554e-05, 'epoch': 0.57} +{'loss': 149.4947, 'grad_norm': 2032.4691629799836, 'learning_rate': 1.3000480153756059e-05, 'epoch': 0.57} +{'loss': 151.8749, 'grad_norm': 1781.4176307960358, 'learning_rate': 1.299632134391252e-05, 'epoch': 0.57} +{'loss': 151.0445, 'grad_norm': 1965.5405490787589, 'learning_rate': 1.2992161964640291e-05, 'epoch': 0.57} +{'loss': 150.7203, 'grad_norm': 2135.976844234808, 'learning_rate': 1.2988002016729835e-05, 'epoch': 0.57} +{'loss': 154.0748, 'grad_norm': 2003.8344277862655, 'learning_rate': 1.298384150097171e-05, 'epoch': 0.57} +{'loss': 153.8088, 'grad_norm': 1803.4474030056172, 'learning_rate': 1.2979680418156604e-05, 'epoch': 0.57} +{'loss': 151.8327, 'grad_norm': 1945.612767446076, 'learning_rate': 1.2975518769075288e-05, 'epoch': 0.57} +{'loss': 149.6928, 'grad_norm': 1749.2803842982391, 'learning_rate': 1.297135655451866e-05, 'epoch': 0.57} +{'loss': 154.0069, 'grad_norm': 1839.6599300743305, 'learning_rate': 1.2967193775277716e-05, 'epoch': 0.57} +{'loss': 158.5301, 'grad_norm': 4154.089776377118, 'learning_rate': 1.2963030432143558e-05, 'epoch': 0.57} +{'loss': 149.6689, 'grad_norm': 1774.6234246647612, 'learning_rate': 1.2958866525907405e-05, 'epoch': 0.57} +{'loss': 152.7125, 'grad_norm': 1923.7889008654668, 'learning_rate': 1.295470205736057e-05, 'epoch': 0.57} +{'loss': 153.9823, 'grad_norm': 1951.3720448505494, 'learning_rate': 1.2950537027294487e-05, 'epoch': 0.57} 
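
The checkpoint blocks interleaved with these metrics (checkpoint-28000/global_step28000/ above) show the ZeRO layout DeepSpeed writes: a small zero_pp_rank_0_mp_rank_00_model_states.pt file plus a bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt shard per data-parallel rank, and it is the optimizer-state shard that dominates the save time (about six minutes in the timestamps above, versus well under a second for the model states). To fold such a sharded checkpoint back into an ordinary single fp32 state dict, DeepSpeed provides a consolidation utility; a hedged sketch, assuming the checkpoint directory taken from this log and stock DeepSpeed (the output file name is made up):

import torch
from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint

# Directory and step tag copied from the save messages in this log.
ckpt_dir = "hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-28000"

# Gathers the per-rank ZeRO shards into one fp32 state dict; with tag=None
# DeepSpeed falls back to the 'latest' file inside ckpt_dir.
state_dict = get_fp32_state_dict_from_zero_checkpoint(ckpt_dir, tag="global_step28000")
torch.save(state_dict, "pytorch_model_fp32.pt")  # hypothetical output path

DeepSpeed also drops a standalone zero_to_fp32.py script into each checkpoint directory that performs the same conversion from the command line.
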
+{'loss': 152.051, 'grad_norm': 1556.367542039294, 'learning_rate': 1.2946371436500685e-05, 'epoch': 0.58} +{'loss': 151.8522, 'grad_norm': 1786.995755421777, 'learning_rate': 1.2942205285770797e-05, 'epoch': 0.58} +{'loss': 152.5518, 'grad_norm': 1815.848100478616, 'learning_rate': 1.293803857589658e-05, 'epoch': 0.58} +{'loss': 148.8817, 'grad_norm': 1716.5848523290326, 'learning_rate': 1.2933871307669884e-05, 'epoch': 0.58} +{'loss': 149.8089, 'grad_norm': 1766.938379437902, 'learning_rate': 1.2929703481882662e-05, 'epoch': 0.58} +{'loss': 147.8599, 'grad_norm': 2120.308741145674, 'learning_rate': 1.2925535099326984e-05, 'epoch': 0.58} +{'loss': 148.1729, 'grad_norm': 1945.2600861893884, 'learning_rate': 1.2921366160795017e-05, 'epoch': 0.58} +{'loss': 152.6922, 'grad_norm': 1950.9059218364932, 'learning_rate': 1.2917196667079037e-05, 'epoch': 0.58} +{'loss': 152.9861, 'grad_norm': 1806.8022253260551, 'learning_rate': 1.291302661897143e-05, 'epoch': 0.58} +{'loss': 153.8283, 'grad_norm': 1856.6852270213021, 'learning_rate': 1.2908856017264675e-05, 'epoch': 0.58} +{'loss': 151.6987, 'grad_norm': 2043.9430435642494, 'learning_rate': 1.2904684862751371e-05, 'epoch': 0.58} +[2025-05-28 20:10:52,931] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step30000 is about to be saved! +[2025-05-28 20:10:53,012] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-30000/global_step30000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-28 20:10:53,012] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-30000/global_step30000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-28 20:10:53,176] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-30000/global_step30000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-28 20:11:09,841] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-30000/global_step30000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-28 20:18:47,669] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-30000/global_step30000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2025-05-28 20:18:47,753] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-30000/global_step30000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-28 20:19:19,019] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step30000 is ready now! +[2025-05-28 20:19:19,066] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step30000 is about to be saved! +[2025-05-28 20:19:19,106] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-30000/global_step30000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-28 20:19:19,106] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-30000/global_step30000/zero_pp_rank_0_mp_rank_00_model_states.pt... 
+[2025-05-28 20:19:19,139] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-30000/global_step30000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-28 20:19:19,213] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-30000/global_step30000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-28 20:22:36,790] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-30000/global_step30000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2025-05-28 20:22:37,109] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-30000/global_step30000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-28 20:25:36,975] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step30000 is ready now! +{'loss': 150.5708, 'grad_norm': 2075.335935782009, 'learning_rate': 1.2900513156224212e-05, 'epoch': 0.58} +{'loss': 156.667, 'grad_norm': 1648.5114993952222, 'learning_rate': 1.2896340898475999e-05, 'epoch': 0.58} +{'loss': 149.6192, 'grad_norm': 1927.2534410675792, 'learning_rate': 1.2892168090299639e-05, 'epoch': 0.58} +{'loss': 149.1972, 'grad_norm': 2003.8092382433051, 'learning_rate': 1.2887994732488144e-05, 'epoch': 0.58} +{'loss': 152.7041, 'grad_norm': 1777.416018106656, 'learning_rate': 1.2883820825834635e-05, 'epoch': 0.58} +{'loss': 152.0771, 'grad_norm': 1759.907601782276, 'learning_rate': 1.2879646371132321e-05, 'epoch': 0.58} +{'loss': 148.1489, 'grad_norm': 1884.0078849241095, 'learning_rate': 1.2875471369174534e-05, 'epoch': 0.58} +{'loss': 152.9601, 'grad_norm': 1813.8643427927916, 'learning_rate': 1.2871295820754702e-05, 'epoch': 0.58} +{'loss': 149.8116, 'grad_norm': 1823.8875059733978, 'learning_rate': 1.2867119726666355e-05, 'epoch': 0.58} +{'loss': 150.0572, 'grad_norm': 1816.651126805582, 'learning_rate': 1.2862943087703127e-05, 'epoch': 0.58} +{'loss': 149.59, 'grad_norm': 2534.2532143612348, 'learning_rate': 1.2858765904658763e-05, 'epoch': 0.58} +{'loss': 152.0098, 'grad_norm': 1845.6830476450748, 'learning_rate': 1.28545881783271e-05, 'epoch': 0.58} +{'loss': 153.6318, 'grad_norm': 2176.3265639371766, 'learning_rate': 1.2850409909502089e-05, 'epoch': 0.58} +{'loss': 150.3857, 'grad_norm': 2019.2892465057178, 'learning_rate': 1.2846231098977772e-05, 'epoch': 0.58} +{'loss': 149.8571, 'grad_norm': 1723.820784926517, 'learning_rate': 1.2842051747548307e-05, 'epoch': 0.58} +{'loss': 150.2741, 'grad_norm': 1790.437196388549, 'learning_rate': 1.2837871856007946e-05, 'epoch': 0.58} +{'loss': 153.2656, 'grad_norm': 1861.4396432140527, 'learning_rate': 1.283369142515105e-05, 'epoch': 0.58} +{'loss': 151.883, 'grad_norm': 1774.7541824662987, 'learning_rate': 1.2829510455772077e-05, 'epoch': 0.58} +{'loss': 149.7157, 'grad_norm': 1896.5101964561563, 'learning_rate': 1.2825328948665585e-05, 'epoch': 0.58} +{'loss': 167.6569, 'grad_norm': 1820.6712795023866, 'learning_rate': 1.2821146904626245e-05, 'epoch': 0.58} +{'loss': 148.1728, 'grad_norm': 1937.0565934817912, 'learning_rate': 1.2816964324448819e-05, 'epoch': 0.58} +{'loss': 150.0438, 'grad_norm': 1820.9261463563942, 'learning_rate': 1.2812781208928178e-05, 'epoch': 0.58} +{'loss': 150.7559, 'grad_norm': 2018.4875525867772, 'learning_rate': 1.2808597558859292e-05, 'epoch': 0.58} +{'loss': 145.9942, 
'grad_norm': 1646.886532286034, 'learning_rate': 1.2804413375037232e-05, 'epoch': 0.58} +{'loss': 159.8831, 'grad_norm': 1885.9228601925533, 'learning_rate': 1.280022865825717e-05, 'epoch': 0.58} +{'loss': 149.9683, 'grad_norm': 1886.6923301658778, 'learning_rate': 1.2796043409314382e-05, 'epoch': 0.58} +{'loss': 155.6468, 'grad_norm': 2044.0962838792846, 'learning_rate': 1.2791857629004245e-05, 'epoch': 0.58} +{'loss': 152.9897, 'grad_norm': 2119.227286320981, 'learning_rate': 1.2787671318122233e-05, 'epoch': 0.58} +{'loss': 152.5943, 'grad_norm': 2231.9267424887717, 'learning_rate': 1.278348447746392e-05, 'epoch': 0.58} +{'loss': 147.1931, 'grad_norm': 1895.1464924957386, 'learning_rate': 1.2779297107824994e-05, 'epoch': 0.58} +{'loss': 153.7111, 'grad_norm': 1977.1606232341894, 'learning_rate': 1.2775109210001225e-05, 'epoch': 0.58} +{'loss': 153.0788, 'grad_norm': 1792.6914341903348, 'learning_rate': 1.2770920784788494e-05, 'epoch': 0.58} +{'loss': 154.4889, 'grad_norm': 1812.7308890034267, 'learning_rate': 1.2766731832982782e-05, 'epoch': 0.58} +{'loss': 147.5961, 'grad_norm': 1603.628462412485, 'learning_rate': 1.2762542355380168e-05, 'epoch': 0.58} +{'loss': 148.9396, 'grad_norm': 2270.388822183594, 'learning_rate': 1.2758352352776829e-05, 'epoch': 0.58} +{'loss': 151.4953, 'grad_norm': 2016.9529349448521, 'learning_rate': 1.2754161825969044e-05, 'epoch': 0.58} +{'loss': 147.1518, 'grad_norm': 1768.2450613988835, 'learning_rate': 1.2749970775753198e-05, 'epoch': 0.58} +{'loss': 150.8282, 'grad_norm': 1814.2764374630813, 'learning_rate': 1.274577920292576e-05, 'epoch': 0.58} +{'loss': 148.7065, 'grad_norm': 2018.2172529711693, 'learning_rate': 1.2741587108283313e-05, 'epoch': 0.58} +{'loss': 148.0266, 'grad_norm': 1606.122280997998, 'learning_rate': 1.2737394492622532e-05, 'epoch': 0.58} +{'loss': 148.3407, 'grad_norm': 1964.9886038043958, 'learning_rate': 1.2733201356740192e-05, 'epoch': 0.58} +{'loss': 152.7469, 'grad_norm': 1942.3451351174015, 'learning_rate': 1.272900770143317e-05, 'epoch': 0.59} +{'loss': 156.0165, 'grad_norm': 2156.9776576405884, 'learning_rate': 1.2724813527498436e-05, 'epoch': 0.59} +{'loss': 148.0905, 'grad_norm': 1785.633610211536, 'learning_rate': 1.2720618835733061e-05, 'epoch': 0.59} +{'loss': 148.5802, 'grad_norm': 2032.2609218891205, 'learning_rate': 1.271642362693422e-05, 'epoch': 0.59} +{'loss': 151.5905, 'grad_norm': 1996.0514235263608, 'learning_rate': 1.2712227901899175e-05, 'epoch': 0.59} +{'loss': 148.1868, 'grad_norm': 1808.9014579548195, 'learning_rate': 1.27080316614253e-05, 'epoch': 0.59} +{'loss': 146.6621, 'grad_norm': 1691.3557294615132, 'learning_rate': 1.2703834906310051e-05, 'epoch': 0.59} +{'loss': 147.6699, 'grad_norm': 1961.8171426822428, 'learning_rate': 1.2699637637350997e-05, 'epoch': 0.59} +{'loss': 147.7532, 'grad_norm': 1847.0112728069485, 'learning_rate': 1.2695439855345797e-05, 'epoch': 0.59} +{'loss': 147.4198, 'grad_norm': 2064.966840108016, 'learning_rate': 1.2691241561092202e-05, 'epoch': 0.59} +{'loss': 148.0543, 'grad_norm': 2296.503445756122, 'learning_rate': 1.2687042755388077e-05, 'epoch': 0.59} +{'loss': 152.1247, 'grad_norm': 1828.5786810022562, 'learning_rate': 1.2682843439031365e-05, 'epoch': 0.59} +{'loss': 147.9717, 'grad_norm': 1817.7949772793218, 'learning_rate': 1.2678643612820118e-05, 'epoch': 0.59} +{'loss': 144.013, 'grad_norm': 2188.6487632221324, 'learning_rate': 1.2674443277552486e-05, 'epoch': 0.59} +{'loss': 154.0235, 'grad_norm': 1826.3637584242176, 'learning_rate': 1.2670242434026705e-05, 'epoch': 
0.59} +{'loss': 151.2955, 'grad_norm': 1745.500321752881, 'learning_rate': 1.266604108304112e-05, 'epoch': 0.59} +{'loss': 151.1419, 'grad_norm': 1979.9236452847572, 'learning_rate': 1.266183922539416e-05, 'epoch': 0.59} +{'loss': 153.9657, 'grad_norm': 2284.5399902178915, 'learning_rate': 1.2657636861884363e-05, 'epoch': 0.59} +{'loss': 147.9815, 'grad_norm': 1644.1576954678096, 'learning_rate': 1.2653433993310355e-05, 'epoch': 0.59} +{'loss': 152.2053, 'grad_norm': 1836.3449260037917, 'learning_rate': 1.2649230620470858e-05, 'epoch': 0.59} +{'loss': 151.3765, 'grad_norm': 1883.1976644931547, 'learning_rate': 1.2645026744164698e-05, 'epoch': 0.59} +{'loss': 147.0184, 'grad_norm': 2183.517803451004, 'learning_rate': 1.2640822365190786e-05, 'epoch': 0.59} +{'loss': 150.3705, 'grad_norm': 1828.183352508643, 'learning_rate': 1.2636617484348129e-05, 'epoch': 0.59} +{'loss': 148.6179, 'grad_norm': 1903.723661994623, 'learning_rate': 1.2632412102435842e-05, 'epoch': 0.59} +{'loss': 145.6277, 'grad_norm': 1808.7441593889303, 'learning_rate': 1.262820622025312e-05, 'epoch': 0.59} +{'loss': 144.7577, 'grad_norm': 1877.6182414713928, 'learning_rate': 1.2623999838599264e-05, 'epoch': 0.59} +{'loss': 147.8478, 'grad_norm': 1956.5820278923898, 'learning_rate': 1.2619792958273664e-05, 'epoch': 0.59} +{'loss': 150.3155, 'grad_norm': 1825.4387152440274, 'learning_rate': 1.2615585580075805e-05, 'epoch': 0.59} +{'loss': 150.7384, 'grad_norm': 1875.4279842172675, 'learning_rate': 1.2611377704805271e-05, 'epoch': 0.59} +{'loss': 145.8442, 'grad_norm': 1840.7891906615102, 'learning_rate': 1.2607169333261737e-05, 'epoch': 0.59} +{'loss': 149.1697, 'grad_norm': 1859.4845578744053, 'learning_rate': 1.2602960466244971e-05, 'epoch': 0.59} +{'loss': 152.075, 'grad_norm': 2094.811120402427, 'learning_rate': 1.259875110455484e-05, 'epoch': 0.59} +{'loss': 147.2019, 'grad_norm': 1744.4488175794604, 'learning_rate': 1.2594541248991296e-05, 'epoch': 0.59} +{'loss': 150.1304, 'grad_norm': 1993.6481151086123, 'learning_rate': 1.2590330900354399e-05, 'epoch': 0.59} +{'loss': 149.7551, 'grad_norm': 1801.3427919378312, 'learning_rate': 1.2586120059444289e-05, 'epoch': 0.59} +{'loss': 152.8089, 'grad_norm': 1981.4964018183118, 'learning_rate': 1.2581908727061207e-05, 'epoch': 0.59} +{'loss': 146.9493, 'grad_norm': 1961.543304891207, 'learning_rate': 1.2577696904005487e-05, 'epoch': 0.59} +{'loss': 148.2913, 'grad_norm': 2256.399854602053, 'learning_rate': 1.257348459107755e-05, 'epoch': 0.59} +{'loss': 152.6642, 'grad_norm': 2377.6469719299103, 'learning_rate': 1.2569271789077919e-05, 'epoch': 0.59} +{'loss': 147.9606, 'grad_norm': 1880.6521006672272, 'learning_rate': 1.2565058498807206e-05, 'epoch': 0.59} +{'loss': 148.8851, 'grad_norm': 1909.3249332425737, 'learning_rate': 1.2560844721066113e-05, 'epoch': 0.59} +{'loss': 149.8325, 'grad_norm': 1885.2728819543656, 'learning_rate': 1.2556630456655441e-05, 'epoch': 0.59} +{'loss': 148.6573, 'grad_norm': 1866.9380193928826, 'learning_rate': 1.2552415706376075e-05, 'epoch': 0.59} +{'loss': 149.3619, 'grad_norm': 1726.3879369319045, 'learning_rate': 1.2548200471028999e-05, 'epoch': 0.59} +{'loss': 148.0642, 'grad_norm': 1755.7956130388036, 'learning_rate': 1.2543984751415291e-05, 'epoch': 0.59} +{'loss': 145.8149, 'grad_norm': 1782.668417046539, 'learning_rate': 1.2539768548336112e-05, 'epoch': 0.59} +{'loss': 156.5072, 'grad_norm': 1945.6738516263601, 'learning_rate': 1.2535551862592722e-05, 'epoch': 0.59} +{'loss': 146.6535, 'grad_norm': 2111.319408648266, 'learning_rate': 
1.2531334694986472e-05, 'epoch': 0.59} +{'loss': 148.113, 'grad_norm': 2150.524402399773, 'learning_rate': 1.2527117046318801e-05, 'epoch': 0.59} +{'loss': 145.1113, 'grad_norm': 2082.5031103063393, 'learning_rate': 1.2522898917391249e-05, 'epoch': 0.59} +{'loss': 152.1365, 'grad_norm': 1701.6683755696213, 'learning_rate': 1.2518680309005427e-05, 'epoch': 0.59} +{'loss': 149.1906, 'grad_norm': 2114.640818794694, 'learning_rate': 1.2514461221963063e-05, 'epoch': 0.59} +{'loss': 156.4579, 'grad_norm': 3091.7690590208827, 'learning_rate': 1.2510241657065958e-05, 'epoch': 0.6} +{'loss': 152.0387, 'grad_norm': 1957.0299207048563, 'learning_rate': 1.2506021615116008e-05, 'epoch': 0.6} +{'loss': 148.0953, 'grad_norm': 2830.116732017916, 'learning_rate': 1.2501801096915203e-05, 'epoch': 0.6} +{'loss': 143.4218, 'grad_norm': 1736.2490569741583, 'learning_rate': 1.2497580103265621e-05, 'epoch': 0.6} +{'loss': 151.6463, 'grad_norm': 1849.2981546761282, 'learning_rate': 1.2493358634969428e-05, 'epoch': 0.6} +{'loss': 148.2188, 'grad_norm': 2010.2744253604806, 'learning_rate': 1.2489136692828885e-05, 'epoch': 0.6} +{'loss': 143.4422, 'grad_norm': 1673.0781201726084, 'learning_rate': 1.2484914277646341e-05, 'epoch': 0.6} +{'loss': 150.4893, 'grad_norm': 1845.4008822817748, 'learning_rate': 1.2480691390224238e-05, 'epoch': 0.6} +{'loss': 148.2308, 'grad_norm': 1925.5961004244284, 'learning_rate': 1.2476468031365097e-05, 'epoch': 0.6} +{'loss': 150.2819, 'grad_norm': 1771.357989289696, 'learning_rate': 1.2472244201871539e-05, 'epoch': 0.6} +{'loss': 151.0535, 'grad_norm': 1878.6487070067142, 'learning_rate': 1.2468019902546274e-05, 'epoch': 0.6} +{'loss': 151.6333, 'grad_norm': 2094.4435532214115, 'learning_rate': 1.2463795134192096e-05, 'epoch': 0.6} +{'loss': 145.9211, 'grad_norm': 2025.5791416180848, 'learning_rate': 1.2459569897611895e-05, 'epoch': 0.6} +{'loss': 149.5986, 'grad_norm': 2007.3913875973237, 'learning_rate': 1.2455344193608643e-05, 'epoch': 0.6} +{'loss': 147.6504, 'grad_norm': 1904.0439604680435, 'learning_rate': 1.2451118022985402e-05, 'epoch': 0.6} +{'loss': 145.8536, 'grad_norm': 1772.3353610409595, 'learning_rate': 1.2446891386545325e-05, 'epoch': 0.6} +{'loss': 148.7014, 'grad_norm': 1848.7688207669123, 'learning_rate': 1.2442664285091656e-05, 'epoch': 0.6} +{'loss': 148.5996, 'grad_norm': 1732.0454110746052, 'learning_rate': 1.2438436719427725e-05, 'epoch': 0.6} +{'loss': 147.7403, 'grad_norm': 1750.1513894081313, 'learning_rate': 1.2434208690356945e-05, 'epoch': 0.6} +{'loss': 145.9952, 'grad_norm': 1797.383172512314, 'learning_rate': 1.2429980198682823e-05, 'epoch': 0.6} +{'loss': 153.0503, 'grad_norm': 2018.3840188729803, 'learning_rate': 1.2425751245208955e-05, 'epoch': 0.6} +{'loss': 149.2482, 'grad_norm': 1954.549546680455, 'learning_rate': 1.242152183073902e-05, 'epoch': 0.6} +{'loss': 147.3324, 'grad_norm': 1778.7447980792194, 'learning_rate': 1.2417291956076788e-05, 'epoch': 0.6} +{'loss': 154.1214, 'grad_norm': 2414.092223700188, 'learning_rate': 1.2413061622026115e-05, 'epoch': 0.6} +{'loss': 147.8171, 'grad_norm': 1712.8981498299415, 'learning_rate': 1.2408830829390943e-05, 'epoch': 0.6} +{'loss': 148.1603, 'grad_norm': 1736.7565125765116, 'learning_rate': 1.2404599578975306e-05, 'epoch': 0.6} +{'loss': 146.9328, 'grad_norm': 1656.794866717749, 'learning_rate': 1.240036787158332e-05, 'epoch': 0.6} +{'loss': 145.365, 'grad_norm': 2017.8326827147202, 'learning_rate': 1.239613570801919e-05, 'epoch': 0.6} +{'loss': 147.9817, 'grad_norm': 1587.149248823328, 'learning_rate': 
1.2391903089087208e-05, 'epoch': 0.6} +{'loss': 151.1766, 'grad_norm': 1979.5503872349093, 'learning_rate': 1.238767001559175e-05, 'epoch': 0.6} +{'loss': 143.8772, 'grad_norm': 2075.0658421969542, 'learning_rate': 1.2383436488337282e-05, 'epoch': 0.6} +{'loss': 148.6671, 'grad_norm': 1719.5058076694957, 'learning_rate': 1.2379202508128354e-05, 'epoch': 0.6} +{'loss': 152.2823, 'grad_norm': 2269.742813923152, 'learning_rate': 1.2374968075769603e-05, 'epoch': 0.6} +{'loss': 150.7325, 'grad_norm': 2060.9829788266316, 'learning_rate': 1.237073319206575e-05, 'epoch': 0.6} +{'loss': 147.9435, 'grad_norm': 1817.7167401453814, 'learning_rate': 1.2366497857821604e-05, 'epoch': 0.6} +{'loss': 143.7509, 'grad_norm': 2165.453357385711, 'learning_rate': 1.236226207384206e-05, 'epoch': 0.6} +{'loss': 149.8053, 'grad_norm': 1593.5427413912782, 'learning_rate': 1.23580258409321e-05, 'epoch': 0.6} +{'loss': 141.7856, 'grad_norm': 1621.1628313759222, 'learning_rate': 1.2353789159896781e-05, 'epoch': 0.6} +{'loss': 144.7995, 'grad_norm': 1873.4805713581452, 'learning_rate': 1.2349552031541259e-05, 'epoch': 0.6} +{'loss': 151.9999, 'grad_norm': 2122.1329749595725, 'learning_rate': 1.2345314456670768e-05, 'epoch': 0.6} +{'loss': 149.1159, 'grad_norm': 1800.3621125166126, 'learning_rate': 1.2341076436090622e-05, 'epoch': 0.6} +{'loss': 144.7406, 'grad_norm': 2054.7998188343545, 'learning_rate': 1.2336837970606235e-05, 'epoch': 0.6} +{'loss': 148.2105, 'grad_norm': 2184.263585385122, 'learning_rate': 1.233259906102309e-05, 'epoch': 0.6} +{'loss': 149.105, 'grad_norm': 2105.9103175986784, 'learning_rate': 1.2328359708146762e-05, 'epoch': 0.6} +{'loss': 148.1121, 'grad_norm': 2030.5504273166703, 'learning_rate': 1.232411991278291e-05, 'epoch': 0.6} +{'loss': 145.9055, 'grad_norm': 1991.6777938032358, 'learning_rate': 1.2319879675737269e-05, 'epoch': 0.6} +{'loss': 148.4465, 'grad_norm': 1857.4505995254315, 'learning_rate': 1.2315638997815676e-05, 'epoch': 0.6} +{'loss': 145.0088, 'grad_norm': 1692.9167386659713, 'learning_rate': 1.231139787982403e-05, 'epoch': 0.6} +{'loss': 149.0307, 'grad_norm': 1828.8524501132306, 'learning_rate': 1.2307156322568328e-05, 'epoch': 0.6} +{'loss': 152.6708, 'grad_norm': 1753.3685151580037, 'learning_rate': 1.2302914326854651e-05, 'epoch': 0.6} +{'loss': 148.2145, 'grad_norm': 1721.4934984445847, 'learning_rate': 1.2298671893489149e-05, 'epoch': 0.6} +{'loss': 146.3983, 'grad_norm': 1641.3169306652876, 'learning_rate': 1.2294429023278074e-05, 'epoch': 0.6} +{'loss': 153.2579, 'grad_norm': 1894.3350115479689, 'learning_rate': 1.2290185717027749e-05, 'epoch': 0.61} +{'loss': 144.8255, 'grad_norm': 1740.5112211365627, 'learning_rate': 1.228594197554458e-05, 'epoch': 0.61} +{'loss': 143.8417, 'grad_norm': 1984.8277338780158, 'learning_rate': 1.2281697799635065e-05, 'epoch': 0.61} +{'loss': 153.5073, 'grad_norm': 2035.039317291456, 'learning_rate': 1.2277453190105771e-05, 'epoch': 0.61} +{'loss': 146.107, 'grad_norm': 1737.4724485445017, 'learning_rate': 1.2273208147763363e-05, 'epoch': 0.61} +{'loss': 148.4891, 'grad_norm': 1901.1727183560433, 'learning_rate': 1.2268962673414573e-05, 'epoch': 0.61} +{'loss': 145.6923, 'grad_norm': 2094.340362114945, 'learning_rate': 1.226471676786622e-05, 'epoch': 0.61} +{'loss': 144.5762, 'grad_norm': 1697.9650522233283, 'learning_rate': 1.2260470431925216e-05, 'epoch': 0.61} +{'loss': 149.1995, 'grad_norm': 1752.521685534902, 'learning_rate': 1.2256223666398539e-05, 'epoch': 0.61} +{'loss': 145.6733, 'grad_norm': 2609.8585652227107, 'learning_rate': 
1.2251976472093256e-05, 'epoch': 0.61} +{'loss': 144.6799, 'grad_norm': 1803.8753879414662, 'learning_rate': 1.2247728849816517e-05, 'epoch': 0.61} +{'loss': 149.9096, 'grad_norm': 1779.3113534663535, 'learning_rate': 1.224348080037555e-05, 'epoch': 0.61} +{'loss': 147.1265, 'grad_norm': 1889.6453845564292, 'learning_rate': 1.2239232324577666e-05, 'epoch': 0.61} +{'loss': 144.9542, 'grad_norm': 1796.3593091968228, 'learning_rate': 1.2234983423230255e-05, 'epoch': 0.61} +{'loss': 145.5795, 'grad_norm': 1728.3876325231345, 'learning_rate': 1.223073409714079e-05, 'epoch': 0.61} +{'loss': 147.0243, 'grad_norm': 1882.858138582795, 'learning_rate': 1.2226484347116824e-05, 'epoch': 0.61} +{'loss': 148.3165, 'grad_norm': 1762.7627443463737, 'learning_rate': 1.2222234173965988e-05, 'epoch': 0.61} +{'loss': 147.1013, 'grad_norm': 2001.8111508186178, 'learning_rate': 1.2217983578496004e-05, 'epoch': 0.61} +{'loss': 142.1158, 'grad_norm': 1881.2368770460896, 'learning_rate': 1.2213732561514657e-05, 'epoch': 0.61} +{'loss': 148.8162, 'grad_norm': 1765.6741038916662, 'learning_rate': 1.2209481123829825e-05, 'epoch': 0.61} +{'loss': 146.0852, 'grad_norm': 1824.124368394865, 'learning_rate': 1.2205229266249466e-05, 'epoch': 0.61} +{'loss': 146.4353, 'grad_norm': 1765.108896423881, 'learning_rate': 1.2200976989581602e-05, 'epoch': 0.61} +{'loss': 144.5537, 'grad_norm': 1724.8195420108007, 'learning_rate': 1.2196724294634361e-05, 'epoch': 0.61} +{'loss': 147.232, 'grad_norm': 1879.1488655598687, 'learning_rate': 1.2192471182215933e-05, 'epoch': 0.61} +{'loss': 143.3116, 'grad_norm': 1732.1086084776973, 'learning_rate': 1.218821765313458e-05, 'epoch': 0.61} +{'loss': 147.0101, 'grad_norm': 2228.9136111943894, 'learning_rate': 1.2183963708198668e-05, 'epoch': 0.61} +{'loss': 148.1701, 'grad_norm': 1619.9984608458738, 'learning_rate': 1.2179709348216618e-05, 'epoch': 0.61} +{'loss': 148.3613, 'grad_norm': 1728.4199048625117, 'learning_rate': 1.2175454573996942e-05, 'epoch': 0.61} +{'loss': 147.2207, 'grad_norm': 1705.3551949064815, 'learning_rate': 1.2171199386348231e-05, 'epoch': 0.61} +{'loss': 148.4391, 'grad_norm': 2471.3236006167826, 'learning_rate': 1.2166943786079146e-05, 'epoch': 0.61} +{'loss': 149.8347, 'grad_norm': 1807.7452071938992, 'learning_rate': 1.2162687773998444e-05, 'epoch': 0.61} +{'loss': 146.9902, 'grad_norm': 1627.7608999434565, 'learning_rate': 1.2158431350914934e-05, 'epoch': 0.61} +{'loss': 148.6879, 'grad_norm': 2008.9721984931616, 'learning_rate': 1.2154174517637526e-05, 'epoch': 0.61} +{'loss': 146.2412, 'grad_norm': 1642.4080362928717, 'learning_rate': 1.2149917274975203e-05, 'epoch': 0.61} +{'loss': 149.022, 'grad_norm': 2684.64076319668, 'learning_rate': 1.2145659623737011e-05, 'epoch': 0.61} +{'loss': 142.5277, 'grad_norm': 1754.55404607605, 'learning_rate': 1.2141401564732095e-05, 'epoch': 0.61} +{'loss': 144.6566, 'grad_norm': 2190.832347649384, 'learning_rate': 1.2137143098769666e-05, 'epoch': 0.61} +{'loss': 143.9686, 'grad_norm': 1663.2898862272057, 'learning_rate': 1.213288422665901e-05, 'epoch': 0.61} +{'loss': 145.4147, 'grad_norm': 1835.967238677347, 'learning_rate': 1.2128624949209498e-05, 'epoch': 0.61} +{'loss': 147.1116, 'grad_norm': 1784.0804583408067, 'learning_rate': 1.2124365267230571e-05, 'epoch': 0.61} +{'loss': 143.0467, 'grad_norm': 1677.9496338070871, 'learning_rate': 1.2120105181531752e-05, 'epoch': 0.61} +{'loss': 145.7035, 'grad_norm': 1907.9460110721966, 'learning_rate': 1.2115844692922638e-05, 'epoch': 0.61} +{'loss': 144.0841, 'grad_norm': 
1588.0659353674278, 'learning_rate': 1.2111583802212905e-05, 'epoch': 0.61} +{'loss': 146.654, 'grad_norm': 1680.8707146919878, 'learning_rate': 1.2107322510212302e-05, 'epoch': 0.61} +{'loss': 148.3141, 'grad_norm': 1808.6889772557256, 'learning_rate': 1.2103060817730659e-05, 'epoch': 0.61} +{'loss': 147.222, 'grad_norm': 1584.5795708415856, 'learning_rate': 1.2098798725577872e-05, 'epoch': 0.61} +{'loss': 144.6888, 'grad_norm': 1791.9478711532774, 'learning_rate': 1.2094536234563927e-05, 'epoch': 0.61} +{'loss': 145.7324, 'grad_norm': 1792.2681292639431, 'learning_rate': 1.2090273345498876e-05, 'epoch': 0.61} +{'loss': 150.1752, 'grad_norm': 1898.7356961217693, 'learning_rate': 1.208601005919285e-05, 'epoch': 0.61} +{'loss': 143.7717, 'grad_norm': 1572.3835207768043, 'learning_rate': 1.2081746376456056e-05, 'epoch': 0.61} +{'loss': 145.4786, 'grad_norm': 1584.8991834237163, 'learning_rate': 1.2077482298098773e-05, 'epoch': 0.61} +{'loss': 149.7377, 'grad_norm': 1776.4244736929513, 'learning_rate': 1.2073217824931359e-05, 'epoch': 0.61} +{'loss': 148.2477, 'grad_norm': 1737.8404603692973, 'learning_rate': 1.2068952957764244e-05, 'epoch': 0.62} +{'loss': 149.2114, 'grad_norm': 1652.9821706065613, 'learning_rate': 1.2064687697407939e-05, 'epoch': 0.62} +{'loss': 146.9008, 'grad_norm': 1716.3180689750293, 'learning_rate': 1.2060422044673022e-05, 'epoch': 0.62} +[2025-05-28 22:14:09,978] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step32000 is about to be saved! +[2025-05-28 22:14:10,019] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-32000/global_step32000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-28 22:14:10,020] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-32000/global_step32000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-28 22:14:10,174] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-32000/global_step32000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-28 22:14:11,845] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-32000/global_step32000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-28 22:20:20,695] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-32000/global_step32000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2025-05-28 22:20:20,735] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-32000/global_step32000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-28 22:20:47,454] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step32000 is ready now! +[2025-05-28 22:20:47,577] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step32000 is about to be saved! 
+[2025-05-28 22:20:47,601] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-32000/global_step32000/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2025-05-28 22:20:47,601] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-32000/global_step32000/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2025-05-28 22:20:47,682] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-32000/global_step32000/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2025-05-28 22:20:47,795] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-32000/global_step32000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2025-05-28 22:26:39,611] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-32000/global_step32000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2025-05-28 22:26:39,638] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-32000/global_step32000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-05-28 22:26:53,573] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step32000 is ready now!
+{'loss': 144.9416, 'grad_norm': 1645.929432907004, 'learning_rate': 1.2056156000370144e-05, 'epoch': 0.62}
+{'loss': 147.0675, 'grad_norm': 1871.149231035086, 'learning_rate': 1.2051889565310044e-05, 'epoch': 0.62}
+{'loss': 146.7348, 'grad_norm': 1692.2062537905979, 'learning_rate': 1.2047622740303524e-05, 'epoch': 0.62}
+{'loss': 142.8703, 'grad_norm': 1925.1911283614697, 'learning_rate': 1.2043355526161458e-05, 'epoch': 0.62}
+{'loss': 146.3649, 'grad_norm': 1436.9724219113868, 'learning_rate': 1.2039087923694798e-05, 'epoch': 0.62}
+{'loss': 147.4956, 'grad_norm': 1777.6451266512659, 'learning_rate': 1.2034819933714576e-05, 'epoch': 0.62}
+{'loss': 145.696, 'grad_norm': 1736.5677850942136, 'learning_rate': 1.203055155703189e-05, 'epoch': 0.62}
+{'loss': 146.8481, 'grad_norm': 1737.8643291934734, 'learning_rate': 1.2026282794457915e-05, 'epoch': 0.62}
+{'loss': 146.7222, 'grad_norm': 1843.5900797396905, 'learning_rate': 1.2022013646803894e-05, 'epoch': 0.62}
+{'loss': 150.2865, 'grad_norm': 1924.5188280800874, 'learning_rate': 1.2017744114881146e-05, 'epoch': 0.62}
+{'loss': 143.9635, 'grad_norm': 1531.9779942631721, 'learning_rate': 1.201347419950107e-05, 'epoch': 0.62}
+{'loss': 143.1583, 'grad_norm': 1646.694031430515, 'learning_rate': 1.2009203901475124e-05, 'epoch': 0.62}
+{'loss': 147.119, 'grad_norm': 1786.455766162841, 'learning_rate': 1.2004933221614854e-05, 'epoch': 0.62}
+{'loss': 147.6758, 'grad_norm': 1852.8100002743831, 'learning_rate': 1.2000662160731867e-05, 'epoch': 0.62}
+{'loss': 150.7771, 'grad_norm': 1724.9477545510917, 'learning_rate': 1.1996390719637847e-05, 'epoch': 0.62}
+{'loss': 143.8758, 'grad_norm': 1663.3434069637922, 'learning_rate': 1.199211889914455e-05, 'epoch': 0.62}
+{'loss': 142.2223, 'grad_norm': 2050.412571822061, 'learning_rate': 1.1987846700063803e-05, 'epoch': 0.62}
+{'loss': 145.5133, 'grad_norm': 2300.797049513649, 'learning_rate': 1.1983574123207509e-05, 'epoch': 0.62}
+{'loss': 143.9509, 'grad_norm': 1599.8621370312007, 'learning_rate': 1.1979301169387636e-05, 'epoch': 0.62}
+{'loss': 142.0189, 'grad_norm': 1801.900096728544, 'learning_rate': 1.1975027839416227e-05, 'epoch': 0.62}
+{'loss': 143.2292, 'grad_norm': 1950.3084196095297, 'learning_rate': 1.19707541341054e-05, 'epoch': 0.62}
+{'loss': 143.9566, 'grad_norm': 1633.3023234933771, 'learning_rate': 1.1966480054267338e-05, 'epoch': 0.62}
+{'loss': 154.2405, 'grad_norm': 1809.6781366952055, 'learning_rate': 1.19622056007143e-05, 'epoch': 0.62}
+{'loss': 145.6521, 'grad_norm': 1651.7389815014296, 'learning_rate': 1.1957930774258616e-05, 'epoch': 0.62}
+{'loss': 143.3104, 'grad_norm': 1718.0435161845833, 'learning_rate': 1.1953655575712679e-05, 'epoch': 0.62}
+{'loss': 148.5567, 'grad_norm': 1747.856960280422, 'learning_rate': 1.1949380005888966e-05, 'epoch': 0.62}
+{'loss': 144.1211, 'grad_norm': 2247.781386288774, 'learning_rate': 1.1945104065600013e-05, 'epoch': 0.62}
+{'loss': 145.5268, 'grad_norm': 2113.855960765598, 'learning_rate': 1.1940827755658433e-05, 'epoch': 0.62}
+{'loss': 146.6277, 'grad_norm': 1956.076218414202, 'learning_rate': 1.1936551076876903e-05, 'epoch': 0.62}
+{'loss': 171.4124, 'grad_norm': 1926.6108861850473, 'learning_rate': 1.193227403006818e-05, 'epoch': 0.62}
+{'loss': 147.0056, 'grad_norm': 2144.461114260359, 'learning_rate': 1.1927996616045083e-05, 'epoch': 0.62}
+{'loss': 147.6409, 'grad_norm': 2092.0698551433825, 'learning_rate': 1.1923718835620496e-05, 'epoch': 0.62}
+{'loss': 146.0984, 'grad_norm': 1674.8704749687959, 'learning_rate': 1.1919440689607394e-05, 'epoch': 0.62}
+{'loss': 143.2096, 'grad_norm': 1799.8280291673875, 'learning_rate': 1.1915162178818793e-05, 'epoch': 0.62}
+{'loss': 142.5081, 'grad_norm': 2056.10332239445, 'learning_rate': 1.19108833040678e-05, 'epoch': 0.62}
+{'loss': 143.5046, 'grad_norm': 1588.5481385599958, 'learning_rate': 1.1906604066167583e-05, 'epoch': 0.62}
+{'loss': 148.3951, 'grad_norm': 1699.3493187736212, 'learning_rate': 1.1902324465931373e-05, 'epoch': 0.62}
+{'loss': 145.0956, 'grad_norm': 1829.9521099464532, 'learning_rate': 1.1898044504172483e-05, 'epoch': 0.62}
+{'loss': 146.7783, 'grad_norm': 2297.622594335151, 'learning_rate': 1.1893764181704287e-05, 'epoch': 0.62}
+{'loss': 145.2501, 'grad_norm': 1697.0625969464156, 'learning_rate': 1.1889483499340226e-05, 'epoch': 0.62}
+{'loss': 143.3228, 'grad_norm': 1746.989225035801, 'learning_rate': 1.1885202457893819e-05, 'epoch': 0.62}
+{'loss': 143.4317, 'grad_norm': 2039.3959468532903, 'learning_rate': 1.1880921058178637e-05, 'epoch': 0.62}
+{'loss': 148.3185, 'grad_norm': 2067.272660240029, 'learning_rate': 1.1876639301008332e-05, 'epoch': 0.62}
+{'loss': 147.3208, 'grad_norm': 1875.8855947142158, 'learning_rate': 1.1872357187196623e-05, 'epoch': 0.62}
+{'loss': 147.7215, 'grad_norm': 1782.540818378599, 'learning_rate': 1.1868074717557288e-05, 'epoch': 0.62}
+{'loss': 146.0668, 'grad_norm': 1695.5747817923038, 'learning_rate': 1.1863791892904186e-05, 'epoch': 0.62}
+{'loss': 145.262, 'grad_norm': 1737.248583860037, 'learning_rate': 1.1859508714051232e-05, 'epoch': 0.62}
+{'loss': 142.9154, 'grad_norm': 1904.535280651276, 'learning_rate': 1.1855225181812408e-05, 'epoch': 0.62}
+{'loss': 146.4246, 'grad_norm': 2143.3317664732176, 'learning_rate': 1.1850941297001778e-05, 'epoch': 0.62}
+{'loss': 144.3461, 'grad_norm': 1856.745642047669, 'learning_rate': 1.1846657060433453e-05, 'epoch': 0.63}
+{'loss': 142.1063, 'grad_norm': 1951.559166852411, 'learning_rate': 1.1842372472921629e-05, 'epoch': 0.63}
+{'loss': 149.9505, 'grad_norm': 1940.4585060086124, 'learning_rate': 1.1838087535280554e-05, 'epoch': 0.63}
+{'loss': 148.0363, 'grad_norm': 1812.8316092860111, 'learning_rate': 1.1833802248324548e-05, 'epoch': 0.63}
+{'loss': 146.0686, 'grad_norm': 1820.4144524850494, 'learning_rate': 1.1829516612868003e-05, 'epoch': 0.63}
+{'loss': 142.6679, 'grad_norm': 1913.5534737090807, 'learning_rate': 1.1825230629725366e-05, 'epoch': 0.63}
+{'loss': 142.2614, 'grad_norm': 2624.0538974298443, 'learning_rate': 1.1820944299711164e-05, 'epoch': 0.63}
+{'loss': 140.8133, 'grad_norm': 1782.7532350997535, 'learning_rate': 1.181665762363998e-05, 'epoch': 0.63}
+{'loss': 147.6169, 'grad_norm': 1731.362303651037, 'learning_rate': 1.1812370602326457e-05, 'epoch': 0.63}
+{'loss': 142.4157, 'grad_norm': 1875.717188740785, 'learning_rate': 1.1808083236585321e-05, 'epoch': 0.63}
+{'loss': 145.7213, 'grad_norm': 1717.5894415573293, 'learning_rate': 1.1803795527231352e-05, 'epoch': 0.63}
+{'loss': 145.6343, 'grad_norm': 1660.5211621616652, 'learning_rate': 1.1799507475079394e-05, 'epoch': 0.63}
+{'loss': 145.6157, 'grad_norm': 2096.646973994327, 'learning_rate': 1.1795219080944366e-05, 'epoch': 0.63}
+{'loss': 146.2508, 'grad_norm': 1871.6701304528906, 'learning_rate': 1.1790930345641238e-05, 'epoch': 0.63}
+{'loss': 138.4666, 'grad_norm': 1800.5811281132674, 'learning_rate': 1.1786641269985059e-05, 'epoch': 0.63}
+{'loss': 146.4855, 'grad_norm': 1905.5579451565204, 'learning_rate': 1.1782351854790932e-05, 'epoch': 0.63}
+{'loss': 142.5538, 'grad_norm': 1767.2908135467178, 'learning_rate': 1.1778062100874029e-05, 'epoch': 0.63}
+{'loss': 143.8543, 'grad_norm': 1877.2282867391025, 'learning_rate': 1.1773772009049589e-05, 'epoch': 0.63}
+{'loss': 141.8592, 'grad_norm': 2164.7462668073294, 'learning_rate': 1.1769481580132904e-05, 'epoch': 0.63}
+{'loss': 141.61, 'grad_norm': 1722.8211493109734, 'learning_rate': 1.1765190814939351e-05, 'epoch': 0.63}
+{'loss': 143.1513, 'grad_norm': 1750.3092235033405, 'learning_rate': 1.1760899714284348e-05, 'epoch': 0.63}
+{'loss': 144.1422, 'grad_norm': 1783.0949273328338, 'learning_rate': 1.175660827898339e-05, 'epoch': 0.63}
+{'loss': 141.6893, 'grad_norm': 1729.9846824993099, 'learning_rate': 1.1752316509852038e-05, 'epoch': 0.63}
+{'loss': 147.4438, 'grad_norm': 1739.090033829036, 'learning_rate': 1.1748024407705898e-05, 'epoch': 0.63}
+{'loss': 146.507, 'grad_norm': 1896.9984239624905, 'learning_rate': 1.1743731973360663e-05, 'epoch': 0.63}
+{'loss': 143.0489, 'grad_norm': 1671.0678230533929, 'learning_rate': 1.1739439207632077e-05, 'epoch': 0.63}
+{'loss': 147.387, 'grad_norm': 2097.267050294047, 'learning_rate': 1.1735146111335945e-05, 'epoch': 0.63}
+{'loss': 144.8551, 'grad_norm': 1910.7789702584193, 'learning_rate': 1.173085268528814e-05, 'epoch': 0.63}
+{'loss': 145.6084, 'grad_norm': 1824.3049922192151, 'learning_rate': 1.1726558930304596e-05, 'epoch': 0.63}
+{'loss': 146.4657, 'grad_norm': 1661.93673842392, 'learning_rate': 1.172226484720131e-05, 'epoch': 0.63}
+{'loss': 144.8737, 'grad_norm': 1817.746611924664, 'learning_rate': 1.1717970436794337e-05, 'epoch': 0.63}
+{'loss': 142.7387, 'grad_norm': 1697.6277868313375, 'learning_rate': 1.1713675699899802e-05, 'epoch': 0.63}
+{'loss': 148.4174, 'grad_norm': 1704.5479043495016, 'learning_rate': 1.1709380637333889e-05, 'epoch': 0.63}
+{'loss': 144.9464, 'grad_norm': 1748.201422762255, 'learning_rate': 1.1705085249912837e-05, 'epoch': 0.63}
+{'loss': 140.5737, 'grad_norm': 1753.955451454316, 'learning_rate': 1.1700789538452957e-05, 'epoch': 0.63}
+{'loss': 146.4615, 'grad_norm': 1898.9318365178815, 'learning_rate': 1.1696493503770616e-05, 'epoch': 0.63}
+{'loss': 145.6603, 'grad_norm': 1814.1110734013398, 'learning_rate': 1.1692197146682242e-05, 'epoch': 0.63}
+{'loss': 141.5993, 'grad_norm': 1852.7792040070694, 'learning_rate': 1.1687900468004333e-05, 'epoch': 0.63}
+{'loss': 144.5154, 'grad_norm': 1684.972314373867, 'learning_rate': 1.1683603468553433e-05, 'epoch': 0.63}
+{'loss': 149.6173, 'grad_norm': 2039.4441175051004, 'learning_rate': 1.1679306149146154e-05, 'epoch': 0.63}
+{'loss': 146.9079, 'grad_norm': 1895.9755576049765, 'learning_rate': 1.1675008510599176e-05, 'epoch': 0.63}
+{'loss': 147.401, 'grad_norm': 2125.1440082792874, 'learning_rate': 1.1670710553729231e-05, 'epoch': 0.63}
+{'loss': 143.5773, 'grad_norm': 1995.1864531039105, 'learning_rate': 1.1666412279353114e-05, 'epoch': 0.63}
+{'loss': 141.473, 'grad_norm': 1953.945533306282, 'learning_rate': 1.1662113688287679e-05, 'epoch': 0.63}
+{'loss': 144.3737, 'grad_norm': 1825.4535462022843, 'learning_rate': 1.1657814781349844e-05, 'epoch': 0.63}
+{'loss': 142.6539, 'grad_norm': 1862.941459786742, 'learning_rate': 1.1653515559356578e-05, 'epoch': 0.63}
+{'loss': 142.0663, 'grad_norm': 1655.0540950192928, 'learning_rate': 1.1649216023124925e-05, 'epoch': 0.63}
+{'loss': 146.5691, 'grad_norm': 1886.5437117342585, 'learning_rate': 1.1644916173471976e-05, 'epoch': 0.63}
+{'loss': 141.3288, 'grad_norm': 1914.5027093653046, 'learning_rate': 1.1640616011214883e-05, 'epoch': 0.63}
+{'loss': 144.2214, 'grad_norm': 1620.4582973820272, 'learning_rate': 1.1636315537170863e-05, 'epoch': 0.63}
+{'loss': 143.695, 'grad_norm': 1756.455125417768, 'learning_rate': 1.163201475215719e-05, 'epoch': 0.63}
+{'loss': 144.6936, 'grad_norm': 1730.7872432731951, 'learning_rate': 1.1627713656991195e-05, 'epoch': 0.63}
+{'loss': 151.0932, 'grad_norm': 2017.177674253007, 'learning_rate': 1.162341225249027e-05, 'epoch': 0.64}
+{'loss': 144.3904, 'grad_norm': 1956.7196500679045, 'learning_rate': 1.1619110539471863e-05, 'epoch': 0.64}
+{'loss': 141.1097, 'grad_norm': 1954.2562122514673, 'learning_rate': 1.1614808518753485e-05, 'epoch': 0.64}
+{'loss': 142.2898, 'grad_norm': 1715.0252795252711, 'learning_rate': 1.1610506191152702e-05, 'epoch': 0.64}
+{'loss': 139.9282, 'grad_norm': 1750.0279056170793, 'learning_rate': 1.160620355748714e-05, 'epoch': 0.64}
+{'loss': 144.2404, 'grad_norm': 1725.2128109052692, 'learning_rate': 1.1601900618574485e-05, 'epoch': 0.64}
+{'loss': 141.5964, 'grad_norm': 1681.254414373116, 'learning_rate': 1.1597597375232472e-05, 'epoch': 0.64}
+{'loss': 145.6237, 'grad_norm': 1836.0451227610195, 'learning_rate': 1.1593293828278908e-05, 'epoch': 0.64}
+{'loss': 144.7175, 'grad_norm': 1844.55408677192, 'learning_rate': 1.1588989978531646e-05, 'epoch': 0.64}
+{'loss': 142.803, 'grad_norm': 1661.838795099941, 'learning_rate': 1.1584685826808604e-05, 'epoch': 0.64}
+{'loss': 141.7017, 'grad_norm': 1547.1743072638542, 'learning_rate': 1.158038137392775e-05, 'epoch': 0.64}
+{'loss': 140.6119, 'grad_norm': 1853.0936798022365, 'learning_rate': 1.1576076620707114e-05, 'epoch': 0.64}
+{'loss': 143.5009, 'grad_norm': 1756.2140877823488, 'learning_rate': 1.1571771567964787e-05, 'epoch': 0.64}
+{'loss': 144.0576, 'grad_norm': 1700.9330853183467, 'learning_rate': 1.1567466216518905e-05, 'epoch': 0.64}
+{'loss': 145.131, 'grad_norm': 1635.3371363085753, 'learning_rate': 1.1563160567187678e-05, 'epoch': 0.64}
+{'loss': 142.9391, 'grad_norm': 2104.563125617848, 'learning_rate': 1.1558854620789356e-05, 'epoch': 0.64}
+{'loss': 147.9075, 'grad_norm': 1666.8285668544747, 'learning_rate': 1.1554548378142249e-05, 'epoch': 0.64}
+{'loss': 141.9854, 'grad_norm': 1491.295222651772, 'learning_rate': 1.1550241840064738e-05, 'epoch': 0.64}
+{'loss': 146.7997, 'grad_norm': 2566.0793984107017, 'learning_rate': 1.1545935007375238e-05, 'epoch': 0.64}
+{'loss': 145.366, 'grad_norm': 1869.7115500477294, 'learning_rate': 1.1541627880892232e-05, 'epoch': 0.64}
+{'loss': 145.3401, 'grad_norm': 1877.8856017551452, 'learning_rate': 1.1537320461434263e-05, 'epoch': 0.64}
+{'loss': 142.4762, 'grad_norm': 1934.7755144772132, 'learning_rate': 1.1533012749819919e-05, 'epoch': 0.64}
+{'loss': 141.5666, 'grad_norm': 2310.3653512061287, 'learning_rate': 1.1528704746867853e-05, 'epoch': 0.64}
+{'loss': 142.1457, 'grad_norm': 1798.7842878008169, 'learning_rate': 1.1524396453396767e-05, 'epoch': 0.64}
+{'loss': 144.3154, 'grad_norm': 1861.5438226907042, 'learning_rate': 1.1520087870225419e-05, 'epoch': 0.64}
+{'loss': 140.5509, 'grad_norm': 1780.9092947679558, 'learning_rate': 1.1515778998172626e-05, 'epoch': 0.64}
+{'loss': 144.5709, 'grad_norm': 1570.1231667194354, 'learning_rate': 1.1511469838057253e-05, 'epoch': 0.64}
+{'loss': 140.3308, 'grad_norm': 1816.0413765746255, 'learning_rate': 1.1507160390698229e-05, 'epoch': 0.64}
+{'loss': 141.6913, 'grad_norm': 1517.282366479945, 'learning_rate': 1.1502850656914528e-05, 'epoch': 0.64}
+{'loss': 142.2257, 'grad_norm': 1652.4047524000757, 'learning_rate': 1.1498540637525187e-05, 'epoch': 0.64}
+{'loss': 144.2247, 'grad_norm': 1751.3494213862177, 'learning_rate': 1.1494230333349292e-05, 'epoch': 0.64}
+{'loss': 144.7173, 'grad_norm': 1577.2447443050194, 'learning_rate': 1.1489919745205982e-05, 'epoch': 0.64}
+{'loss': 142.0884, 'grad_norm': 1976.9804017694876, 'learning_rate': 1.1485608873914455e-05, 'epoch': 0.64}
+{'loss': 143.2783, 'grad_norm': 1825.20991603427, 'learning_rate': 1.148129772029396e-05, 'epoch': 0.64}
+{'loss': 138.5364, 'grad_norm': 1573.7040898552352, 'learning_rate': 1.14769862851638e-05, 'epoch': 0.64}
+{'loss': 142.3449, 'grad_norm': 1789.488840359041, 'learning_rate': 1.1472674569343331e-05, 'epoch': 0.64}
+{'loss': 143.1418, 'grad_norm': 1758.4838648416533, 'learning_rate': 1.1468362573651959e-05, 'epoch': 0.64}
+{'loss': 140.5006, 'grad_norm': 1521.1124419474777, 'learning_rate': 1.1464050298909153e-05, 'epoch': 0.64}
+{'loss': 149.1841, 'grad_norm': 1889.6323118234034, 'learning_rate': 1.1459737745934424e-05, 'epoch': 0.64}
+{'loss': 141.296, 'grad_norm': 1771.0832222402053, 'learning_rate': 1.1455424915547345e-05, 'epoch': 0.64}
+{'loss': 138.5784, 'grad_norm': 1740.6629631829485, 'learning_rate': 1.1451111808567536e-05, 'epoch': 0.64}
+{'loss': 141.7769, 'grad_norm': 1588.1825303725109, 'learning_rate': 1.1446798425814667e-05, 'epoch': 0.64}
+{'loss': 141.7619, 'grad_norm': 1742.2405034569144, 'learning_rate': 1.1442484768108471e-05, 'epoch': 0.64}
+{'loss': 141.8579, 'grad_norm': 1614.1619103342316, 'learning_rate': 1.1438170836268723e-05, 'epoch': 0.64}
+{'loss': 144.1442, 'grad_norm': 1744.8729916651594, 'learning_rate': 1.1433856631115252e-05, 'epoch': 0.64}
+{'loss': 144.1384, 'grad_norm': 1779.4373505709343, 'learning_rate': 1.1429542153467945e-05, 'epoch': 0.64}
+{'loss': 143.7566, 'grad_norm': 1522.9153872324662, 'learning_rate': 1.1425227404146735e-05, 'epoch': 0.64}
+{'loss': 144.7794, 'grad_norm': 1787.4680978771366, 'learning_rate': 1.1420912383971606e-05, 'epoch': 0.64}
+{'loss': 142.7133, 'grad_norm': 1855.1148948735308, 'learning_rate': 1.14165970937626e-05, 'epoch': 0.64}
+{'loss': 139.7351, 'grad_norm': 1563.1349865009747, 'learning_rate': 1.14122815343398e-05, 'epoch': 0.64}
+{'loss': 146.6094, 'grad_norm': 2100.375014909321, 'learning_rate': 1.1407965706523352e-05, 'epoch': 0.64}
+{'loss': 146.1039, 'grad_norm': 1863.0250442701288, 'learning_rate': 1.1403649611133444e-05, 'epoch': 0.64}
+{'loss': 140.7871, 'grad_norm': 1657.9892373982811, 'learning_rate': 1.1399333248990318e-05, 'epoch': 0.65}
+{'loss': 146.9258, 'grad_norm': 1700.6980750088453, 'learning_rate': 1.139501662091427e-05, 'epoch': 0.65}
+{'loss': 144.6116, 'grad_norm': 1627.8877014418108, 'learning_rate': 1.1390699727725636e-05, 'epoch': 0.65}
+{'loss': 143.9261, 'grad_norm': 1792.801320324804, 'learning_rate': 1.1386382570244819e-05, 'epoch': 0.65}
+{'loss': 144.6518, 'grad_norm': 1934.4705743919762, 'learning_rate': 1.1382065149292254e-05, 'epoch': 0.65}
+{'loss': 142.19, 'grad_norm': 1877.7821412240326, 'learning_rate': 1.137774746568844e-05, 'epoch': 0.65}
+{'loss': 143.7156, 'grad_norm': 1765.7183331628057, 'learning_rate': 1.1373429520253922e-05, 'epoch': 0.65}
+{'loss': 140.3811, 'grad_norm': 1643.6147039900582, 'learning_rate': 1.1369111313809287e-05, 'epoch': 0.65}
+{'loss': 139.7648, 'grad_norm': 1616.3967572977078, 'learning_rate': 1.1364792847175186e-05, 'epoch': 0.65}
+{'loss': 140.693, 'grad_norm': 1772.690687515749, 'learning_rate': 1.1360474121172306e-05, 'epoch': 0.65}
+{'loss': 147.702, 'grad_norm': 1686.3988033163744, 'learning_rate': 1.1356155136621392e-05, 'epoch': 0.65}
+{'loss': 142.1896, 'grad_norm': 1756.1308976785588, 'learning_rate': 1.1351835894343235e-05, 'epoch': 0.65}
+{'loss': 142.4242, 'grad_norm': 3227.315493168273, 'learning_rate': 1.1347516395158671e-05, 'epoch': 0.65}
+{'loss': 142.1969, 'grad_norm': 1918.1872839085233, 'learning_rate': 1.1343196639888591e-05, 'epoch': 0.65}
+{'loss': 143.1854, 'grad_norm': 2102.3685253950302, 'learning_rate': 1.1338876629353937e-05, 'epoch': 0.65}
+{'loss': 139.8442, 'grad_norm': 1721.5448281179333, 'learning_rate': 1.1334556364375689e-05, 'epoch': 0.65}
+{'loss': 142.8338, 'grad_norm': 1796.9276514236535, 'learning_rate': 1.1330235845774884e-05, 'epoch': 0.65}
+{'loss': 139.2171, 'grad_norm': 1715.3713964422807, 'learning_rate': 1.1325915074372604e-05, 'epoch': 0.65}
+{'loss': 137.3916, 'grad_norm': 1784.8401163973672, 'learning_rate': 1.1321594050989981e-05, 'epoch': 0.65}
+{'loss': 145.974, 'grad_norm': 1699.6440132083312, 'learning_rate': 1.1317272776448192e-05, 'epoch': 0.65}
+{'loss': 139.7355, 'grad_norm': 1757.0954220115061, 'learning_rate': 1.1312951251568461e-05, 'epoch': 0.65}
+{'loss': 142.3091, 'grad_norm': 1619.3141413688995, 'learning_rate': 1.1308629477172069e-05, 'epoch': 0.65}
+{'loss': 143.8361, 'grad_norm': 1863.3624167313212, 'learning_rate': 1.130430745408033e-05, 'epoch': 0.65}
+{'loss': 141.9278, 'grad_norm': 1751.4407126009244, 'learning_rate': 1.1299985183114614e-05, 'epoch': 0.65}
+{'loss': 145.0178, 'grad_norm': 1778.6967032781727, 'learning_rate': 1.1295662665096339e-05, 'epoch': 0.65}
+{'loss': 140.7528, 'grad_norm': 3447.6519162453915, 'learning_rate': 1.129133990084697e-05, 'epoch': 0.65}
+{'loss': 144.5522, 'grad_norm': 1568.5909409649346, 'learning_rate': 1.1287016891188007e-05, 'epoch': 0.65}
+{'loss': 144.1236, 'grad_norm': 1686.2174880004272, 'learning_rate': 1.1282693636941013e-05, 'epoch': 0.65}
+{'loss': 143.1278, 'grad_norm': 1662.3124804796446, 'learning_rate': 1.127837013892759e-05, 'epoch': 0.65}
+{'loss': 143.9554, 'grad_norm': 1708.8480247093419, 'learning_rate': 1.1274046397969386e-05, 'epoch': 0.65}
+{'loss': 140.489, 'grad_norm': 1707.4818136101187, 'learning_rate': 1.1269722414888096e-05, 'epoch': 0.65}
+{'loss': 140.7335, 'grad_norm': 1640.6630895421035, 'learning_rate': 1.126539819050546e-05, 'epoch': 0.65}
+{'loss': 140.6028, 'grad_norm': 1520.9795886321638, 'learning_rate': 1.1261073725643266e-05, 'epoch': 0.65}
+{'loss': 142.2754, 'grad_norm': 1959.7861818394874, 'learning_rate': 1.1256749021123346e-05, 'epoch': 0.65}
+{'loss': 138.6577, 'grad_norm': 1915.7675108975443, 'learning_rate': 1.1252424077767577e-05, 'epoch': 0.65}
+{'loss': 144.0404, 'grad_norm': 2308.518954655509, 'learning_rate': 1.1248098896397888e-05, 'epoch': 0.65}
+{'loss': 140.0591, 'grad_norm': 1740.3246752611951, 'learning_rate': 1.124377347783624e-05, 'epoch': 0.65}
+{'loss': 141.4146, 'grad_norm': 1549.820695869398, 'learning_rate': 1.123944782290465e-05, 'epoch': 0.65}
+{'loss': 142.6025, 'grad_norm': 1741.4089418125695, 'learning_rate': 1.1235121932425177e-05, 'epoch': 0.65}
+{'loss': 138.8098, 'grad_norm': 1641.3906519822624, 'learning_rate': 1.1230795807219926e-05, 'epoch': 0.65}
+{'loss': 140.8309, 'grad_norm': 1741.6859067279424, 'learning_rate': 1.1226469448111041e-05, 'epoch': 0.65}
+{'loss': 143.4589, 'grad_norm': 1961.2236950691538, 'learning_rate': 1.1222142855920719e-05, 'epoch': 0.65}
+{'loss': 142.2959, 'grad_norm': 2137.081685026581, 'learning_rate': 1.1217816031471194e-05, 'epoch': 0.65}
+{'loss': 139.3393, 'grad_norm': 1788.2582192075658, 'learning_rate': 1.121348897558475e-05, 'epoch': 0.65}
+{'loss': 139.7923, 'grad_norm': 2256.283575810486, 'learning_rate': 1.1209161689083706e-05, 'epoch': 0.65}
+{'loss': 141.7611, 'grad_norm': 2092.174164891077, 'learning_rate': 1.1204834172790435e-05, 'epoch': 0.65}
+{'loss': 141.6088, 'grad_norm': 1678.5477442349315, 'learning_rate': 1.120050642752735e-05, 'epoch': 0.65}
+[2025-05-29 00:15:24,343] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step34000 is about to be saved!
+[2025-05-29 00:15:24,363] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-34000/global_step34000/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2025-05-29 00:15:24,363] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-34000/global_step34000/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2025-05-29 00:15:24,413] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-34000/global_step34000/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2025-05-29 00:15:24,548] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-34000/global_step34000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2025-05-29 00:21:27,899] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-34000/global_step34000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2025-05-29 00:21:27,944] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-34000/global_step34000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-05-29 00:21:30,517] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step34000 is ready now!
+[2025-05-29 00:21:30,548] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step34000 is about to be saved!
+[2025-05-29 00:21:30,566] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-34000/global_step34000/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2025-05-29 00:21:30,567] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-34000/global_step34000/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2025-05-29 00:21:30,794] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-34000/global_step34000/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2025-05-29 00:21:33,386] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-34000/global_step34000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2025-05-29 00:27:16,647] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-34000/global_step34000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2025-05-29 00:27:16,804] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-34000/global_step34000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-05-29 00:27:33,447] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step34000 is ready now!
+{'loss': 141.6344, 'grad_norm': 1995.7993385529944, 'learning_rate': 1.1196178454116904e-05, 'epoch': 0.65}
+{'loss': 141.5064, 'grad_norm': 1720.6329624987209, 'learning_rate': 1.1191850253381602e-05, 'epoch': 0.65}
+{'loss': 141.9736, 'grad_norm': 1876.2587856884443, 'learning_rate': 1.1187521826143981e-05, 'epoch': 0.65}
+{'loss': 139.8613, 'grad_norm': 1601.9812345152648, 'learning_rate': 1.1183193173226626e-05, 'epoch': 0.65}
+{'loss': 137.9973, 'grad_norm': 1596.468202801082, 'learning_rate': 1.1178864295452167e-05, 'epoch': 0.65}
+{'loss': 141.5575, 'grad_norm': 1792.2909994549268, 'learning_rate': 1.1174535193643279e-05, 'epoch': 0.66}
+{'loss': 144.9054, 'grad_norm': 1766.2840875086777, 'learning_rate': 1.1170205868622672e-05, 'epoch': 0.66}
+{'loss': 142.9061, 'grad_norm': 1720.5245721533768, 'learning_rate': 1.1165876321213103e-05, 'epoch': 0.66}
+{'loss': 140.4631, 'grad_norm': 1927.1212300404056, 'learning_rate': 1.1161546552237368e-05, 'epoch': 0.66}
+{'loss': 140.3039, 'grad_norm': 1600.722442033555, 'learning_rate': 1.115721656251831e-05, 'epoch': 0.66}
+{'loss': 140.1715, 'grad_norm': 1837.5301777450322, 'learning_rate': 1.1152886352878811e-05, 'epoch': 0.66}
+{'loss': 143.5902, 'grad_norm': 1836.3801298334436, 'learning_rate': 1.11485559241418e-05, 'epoch': 0.66}
+{'loss': 138.6493, 'grad_norm': 1628.6168828814627, 'learning_rate': 1.1144225277130235e-05, 'epoch': 0.66}
+{'loss': 140.5268, 'grad_norm': 1659.5190085842012, 'learning_rate': 1.1139894412667125e-05, 'epoch': 0.66}
+{'loss': 144.3165, 'grad_norm': 1642.5907981900036, 'learning_rate': 1.1135563331575522e-05, 'epoch': 0.66}
+{'loss': 144.0533, 'grad_norm': 1721.2653131009802, 'learning_rate': 1.1131232034678513e-05, 'epoch': 0.66}
+{'loss': 140.4599, 'grad_norm': 2060.5816383777824, 'learning_rate': 1.1126900522799233e-05, 'epoch': 0.66}
+{'loss': 142.0379, 'grad_norm': 1655.5365855612602, 'learning_rate': 1.112256879676085e-05, 'epoch': 0.66}
+{'loss': 140.5223, 'grad_norm': 1789.785611721754, 'learning_rate': 1.1118236857386578e-05, 'epoch': 0.66}
+{'loss': 141.4188, 'grad_norm': 1816.0503145975802, 'learning_rate': 1.1113904705499669e-05, 'epoch': 0.66}
+{'loss': 145.8719, 'grad_norm': 1893.900656176157, 'learning_rate': 1.1109572341923416e-05, 'epoch': 0.66}
+{'loss': 142.1017, 'grad_norm': 1932.5360440210409, 'learning_rate': 1.1105239767481154e-05, 'epoch': 0.66}
+{'loss': 138.6646, 'grad_norm': 1776.6829315703203, 'learning_rate': 1.1100906982996257e-05, 'epoch': 0.66}
+{'loss': 143.4271, 'grad_norm': 1755.1505301707214, 'learning_rate': 1.1096573989292138e-05, 'epoch': 0.66}
+{'loss': 139.5921, 'grad_norm': 1697.3670984732414, 'learning_rate': 1.1092240787192253e-05, 'epoch': 0.66}
+{'loss': 141.7671, 'grad_norm': 1863.042563074501, 'learning_rate': 1.1087907377520086e-05, 'epoch': 0.66}
+{'loss': 142.92, 'grad_norm': 1832.3662168441165, 'learning_rate': 1.1083573761099183e-05, 'epoch': 0.66}
+{'loss': 139.8148, 'grad_norm': 1641.3090858794092, 'learning_rate': 1.10792399387531e-05, 'epoch': 0.66}
+{'loss': 141.5903, 'grad_norm': 1724.15822938412, 'learning_rate': 1.1074905911305462e-05, 'epoch': 0.66}
+{'loss': 142.6532, 'grad_norm': 2000.4551946745771, 'learning_rate': 1.1070571679579913e-05, 'epoch': 0.66}
+{'loss': 142.139, 'grad_norm': 2163.3973104499896, 'learning_rate': 1.1066237244400138e-05, 'epoch': 0.66}
+{'loss': 140.6748, 'grad_norm': 2059.6554285898, 'learning_rate': 1.1061902606589874e-05, 'epoch': 0.66}
+{'loss': 141.5919, 'grad_norm': 1880.6018418348417, 'learning_rate': 1.105756776697288e-05, 'epoch': 0.66}
+{'loss': 139.4462, 'grad_norm': 1737.0447501216672, 'learning_rate': 1.1053232726372954e-05, 'epoch': 0.66}
+{'loss': 143.8326, 'grad_norm': 1724.2115539863557, 'learning_rate': 1.1048897485613952e-05, 'epoch': 0.66}
+{'loss': 140.6188, 'grad_norm': 1918.5383487300064, 'learning_rate': 1.1044562045519744e-05, 'epoch': 0.66}
+{'loss': 147.6212, 'grad_norm': 1806.1392546929378, 'learning_rate': 1.1040226406914254e-05, 'epoch': 0.66}
+{'loss': 138.6265, 'grad_norm': 1897.9394784326885, 'learning_rate': 1.103589057062144e-05, 'epoch': 0.66}
+{'loss': 141.968, 'grad_norm': 1746.276911353478, 'learning_rate': 1.1031554537465284e-05, 'epoch': 0.66}
+{'loss': 136.1845, 'grad_norm': 1514.003412005808, 'learning_rate': 1.1027218308269829e-05, 'epoch': 0.66}
+{'loss': 143.6463, 'grad_norm': 1734.9496563294315, 'learning_rate': 1.1022881883859138e-05, 'epoch': 0.66}
+{'loss': 142.1518, 'grad_norm': 1887.539523519787, 'learning_rate': 1.1018545265057319e-05, 'epoch': 0.66}
+{'loss': 137.8855, 'grad_norm': 1824.44661526484, 'learning_rate': 1.1014208452688513e-05, 'epoch': 0.66}
+{'loss': 140.8313, 'grad_norm': 1765.3970826084674, 'learning_rate': 1.1009871447576894e-05, 'epoch': 0.66}
+{'loss': 141.7638, 'grad_norm': 1785.3363269535243, 'learning_rate': 1.1005534250546687e-05, 'epoch': 0.66}
+{'loss': 139.4428, 'grad_norm': 1694.6164884218906, 'learning_rate': 1.1001196862422138e-05, 'epoch': 0.66}
+{'loss': 144.6555, 'grad_norm': 1652.007724883764, 'learning_rate': 1.099685928402754e-05, 'epoch': 0.66}
+{'loss': 142.9641, 'grad_norm': 1547.7620139792332, 'learning_rate': 1.0992521516187214e-05, 'epoch': 0.66}
+{'loss': 136.2173, 'grad_norm': 1678.4131303142924, 'learning_rate': 1.0988183559725515e-05, 'epoch': 0.66}
+{'loss': 142.9628, 'grad_norm': 1532.113265389966, 'learning_rate': 1.0983845415466853e-05, 'epoch': 0.66}
+{'loss': 138.838, 'grad_norm': 1600.119773828472, 'learning_rate': 1.0979507084235653e-05, 'epoch': 0.66}
+{'loss': 135.8113, 'grad_norm': 1577.5246431886294, 'learning_rate': 1.0975168566856378e-05, 'epoch': 0.66}
+{'loss': 140.4834, 'grad_norm': 1841.5620328874804, 'learning_rate': 1.0970829864153542e-05, 'epoch': 0.66}
+{'loss': 141.4258, 'grad_norm': 2145.1705327907016, 'learning_rate': 1.0966490976951674e-05, 'epoch': 0.66}
+{'loss': 142.7978, 'grad_norm': 1746.302430007947, 'learning_rate': 1.0962151906075351e-05, 'epoch': 0.66}
+{'loss': 142.1075, 'grad_norm': 2182.242805536289, 'learning_rate': 1.095781265234918e-05, 'epoch': 0.66}
+{'loss': 142.3702, 'grad_norm': 1800.1137197228677, 'learning_rate': 1.0953473216597808e-05, 'epoch': 0.66}
+{'loss': 141.9398, 'grad_norm': 1687.415416289487, 'learning_rate': 1.094913359964591e-05, 'epoch': 0.67}
+{'loss': 143.9371, 'grad_norm': 1880.0040926515408, 'learning_rate': 1.0944793802318195e-05, 'epoch': 0.67}
+{'loss': 142.0055, 'grad_norm': 1691.7375331967394, 'learning_rate': 1.0940453825439412e-05, 'epoch': 0.67}
+{'loss': 140.8486, 'grad_norm': 1604.2885557861734, 'learning_rate': 1.0936113669834343e-05, 'epoch': 0.67}
+{'loss': 139.4889, 'grad_norm': 1574.241884322103, 'learning_rate': 1.09317733363278e-05, 'epoch': 0.67}
+{'loss': 142.7334, 'grad_norm': 2003.3480997855868, 'learning_rate': 1.0927432825744635e-05, 'epoch': 0.67}
+{'loss': 138.5456, 'grad_norm': 1577.0053735082454, 'learning_rate': 1.0923092138909726e-05, 'epoch': 0.67}
+{'loss': 136.5724, 'grad_norm': 1989.4734501844068, 'learning_rate': 1.0918751276647988e-05, 'epoch': 0.67}
+{'loss': 139.5686, 'grad_norm': 1823.1454191166015, 'learning_rate': 1.0914410239784372e-05, 'epoch': 0.67}
+{'loss': 140.8682, 'grad_norm': 1682.830831272488, 'learning_rate': 1.0910069029143862e-05, 'epoch': 0.67}
+{'loss': 138.9207, 'grad_norm': 1782.518961621625, 'learning_rate': 1.0905727645551467e-05, 'epoch': 0.67}
+{'loss': 140.0886, 'grad_norm': 1833.922053281163, 'learning_rate': 1.090138608983224e-05, 'epoch': 0.67}
+{'loss': 136.9453, 'grad_norm': 1594.203000147675, 'learning_rate': 1.0897044362811258e-05, 'epoch': 0.67}
+{'loss': 139.8073, 'grad_norm': 1848.564114792692, 'learning_rate': 1.0892702465313637e-05, 'epoch': 0.67}
+{'loss': 140.3126, 'grad_norm': 1836.859618229129, 'learning_rate': 1.0888360398164521e-05, 'epoch': 0.67}
+{'loss': 139.4065, 'grad_norm': 1573.609789114124, 'learning_rate': 1.0884018162189086e-05, 'epoch': 0.67}
+{'loss': 139.785, 'grad_norm': 1735.1554806688466, 'learning_rate': 1.0879675758212547e-05, 'epoch': 0.67}
+{'loss': 139.1342, 'grad_norm': 2155.371594258881, 'learning_rate': 1.087533318706014e-05, 'epoch': 0.67}
+{'loss': 140.05, 'grad_norm': 1812.1524815800772, 'learning_rate': 1.0870990449557142e-05, 'epoch': 0.67}
+{'loss': 138.0236, 'grad_norm': 1743.322530347828, 'learning_rate': 1.0866647546528857e-05, 'epoch': 0.67}
+{'loss': 140.1814, 'grad_norm': 1839.87053797012, 'learning_rate': 1.086230447880062e-05, 'epoch': 0.67}
+{'loss': 139.8122, 'grad_norm': 1725.7237850840265, 'learning_rate': 1.08579612471978e-05, 'epoch': 0.67}
+{'loss': 138.7298, 'grad_norm': 1897.120072977913, 'learning_rate': 1.0853617852545799e-05, 'epoch': 0.67}
+{'loss': 140.0452, 'grad_norm': 1668.4200543621316, 'learning_rate': 1.0849274295670045e-05, 'epoch': 0.67}
+{'loss': 139.8707, 'grad_norm': 1516.477447488363, 'learning_rate': 1.0844930577395996e-05, 'epoch': 0.67}
+{'loss': 136.9464, 'grad_norm': 1671.600579526991, 'learning_rate': 1.0840586698549145e-05, 'epoch': 0.67}
+{'loss': 140.7229, 'grad_norm': 1907.4086484987288, 'learning_rate': 1.0836242659955016e-05, 'epoch': 0.67}
+{'loss': 135.1783, 'grad_norm': 1564.2888266992838, 'learning_rate': 1.0831898462439164e-05, 'epoch': 0.67}
+{'loss': 142.7739, 'grad_norm': 1848.5647109479578, 'learning_rate': 1.0827554106827162e-05, 'epoch': 0.67}
+{'loss': 140.9056, 'grad_norm': 1890.8407105502156, 'learning_rate': 1.082320959394463e-05, 'epoch': 0.67}
+{'loss': 140.2505, 'grad_norm': 1616.5358172279648, 'learning_rate': 1.081886492461721e-05, 'epoch': 0.67}
+{'loss': 138.8262, 'grad_norm': 1702.6793441290365, 'learning_rate': 1.0814520099670571e-05, 'epoch': 0.67}
+{'loss': 142.8912, 'grad_norm': 1732.4811139002582, 'learning_rate': 1.0810175119930421e-05, 'epoch': 0.67}
+{'loss': 141.309, 'grad_norm': 1563.0807687756055, 'learning_rate': 1.0805829986222485e-05, 'epoch': 0.67}
+{'loss': 140.7355, 'grad_norm': 1655.306271759963, 'learning_rate': 1.0801484699372524e-05, 'epoch': 0.67}
+{'loss': 142.3705, 'grad_norm': 1831.8173367457716, 'learning_rate': 1.0797139260206331e-05, 'epoch': 0.67}
+{'loss': 141.2607, 'grad_norm': 1830.0327534544267, 'learning_rate': 1.0792793669549721e-05, 'epoch': 0.67}
+{'loss': 138.7283, 'grad_norm': 1568.2344626693346, 'learning_rate': 1.0788447928228547e-05, 'epoch': 0.67}
+{'loss': 135.0217, 'grad_norm': 1669.2858376755487, 'learning_rate': 1.0784102037068679e-05, 'epoch': 0.67}
+{'loss': 139.9993, 'grad_norm': 1784.6374181863903, 'learning_rate': 1.0779755996896022e-05, 'epoch': 0.67}
+{'loss': 138.6689, 'grad_norm': 1700.166294546746, 'learning_rate': 1.0775409808536513e-05, 'epoch': 0.67}
+{'loss': 143.3536, 'grad_norm': 1870.9528695701752, 'learning_rate': 1.0771063472816108e-05, 'epoch': 0.67}
+{'loss': 140.4315, 'grad_norm': 1479.5740102716184, 'learning_rate': 1.07667169905608e-05, 'epoch': 0.67}
+{'loss': 142.7192, 'grad_norm': 1505.975696252214, 'learning_rate': 1.0762370362596602e-05, 'epoch': 0.67}
+{'loss': 142.6977, 'grad_norm': 2112.2076435280596, 'learning_rate': 1.0758023589749562e-05, 'epoch': 0.67}
+{'loss': 141.7265, 'grad_norm': 1784.383334640409, 'learning_rate': 1.0753676672845752e-05, 'epoch': 0.67}
+{'loss': 139.8931, 'grad_norm': 2008.9661130564325, 'learning_rate': 1.0749329612711264e-05, 'epoch': 0.67}
+{'loss': 138.7041, 'grad_norm': 1520.2857515023106, 'learning_rate': 1.0744982410172236e-05, 'epoch': 0.67}
+{'loss': 139.1984, 'grad_norm': 1681.621620340973, 'learning_rate': 1.0740635066054815e-05, 'epoch': 0.67}
+{'loss': 141.3304, 'grad_norm': 1564.7891357046651, 'learning_rate': 1.073628758118518e-05, 'epoch': 0.67}
+{'loss': 142.137, 'grad_norm': 1810.285726225181, 'learning_rate': 1.0731939956389546e-05, 'epoch': 0.67}
+{'loss': 141.5341, 'grad_norm': 1627.76837074684, 'learning_rate': 1.0727592192494135e-05, 'epoch': 0.67}
+{'loss': 137.4777, 'grad_norm': 1602.0096310245772, 'learning_rate': 1.0723244290325218e-05, 'epoch': 0.68}
+{'loss': 141.2286, 'grad_norm': 1773.9302674417424, 'learning_rate': 1.0718896250709078e-05, 'epoch': 0.68}
+{'loss': 143.237, 'grad_norm': 1750.0978278160392, 'learning_rate': 1.0714548074472026e-05, 'epoch': 0.68}
+{'loss': 140.3294, 'grad_norm': 2399.159587161989, 'learning_rate': 1.0710199762440406e-05, 'epoch': 0.68}
+{'loss': 148.7832, 'grad_norm': 5474.663900560582, 'learning_rate': 1.0705851315440575e-05, 'epoch': 0.68}
+{'loss': 140.4863, 'grad_norm': 1653.11147088756, 'learning_rate': 1.0701502734298928e-05, 'epoch': 0.68}
+{'loss': 137.0558, 'grad_norm': 1912.8022822497835, 'learning_rate': 1.069715401984188e-05, 'epoch': 0.68}
+{'loss': 139.9372, 'grad_norm': 1625.5739754486415, 'learning_rate': 1.0692805172895871e-05, 'epoch': 0.68}
+{'loss': 138.865, 'grad_norm': 1759.1434924726834, 'learning_rate': 1.0688456194287368e-05, 'epoch': 0.68}
+{'loss': 136.3386, 'grad_norm': 1828.8990332668368, 'learning_rate': 1.0684107084842858e-05, 'epoch': 0.68}
+{'loss': 138.2297, 'grad_norm': 2086.8079760858627, 'learning_rate': 1.0679757845388868e-05, 'epoch': 0.68}
+{'loss': 140.2652, 'grad_norm': 1604.0779412501806, 'learning_rate': 1.067540847675193e-05, 'epoch': 0.68}
+{'loss': 138.7289, 'grad_norm': 2028.7186249104464, 'learning_rate': 1.0671058979758606e-05, 'epoch': 0.68}
+{'loss': 137.132, 'grad_norm': 1940.9375802614895, 'learning_rate': 1.0666709355235496e-05, 'epoch': 0.68}
+{'loss': 135.4978, 'grad_norm': 1652.099252577259, 'learning_rate': 1.0662359604009207e-05, 'epoch': 0.68}
+{'loss': 140.1616, 'grad_norm': 1894.6870570910435, 'learning_rate': 1.0658009726906377e-05, 'epoch': 0.68}
+{'loss': 142.1042, 'grad_norm': 2065.4615149174497, 'learning_rate': 1.0653659724753676e-05, 'epoch': 0.68}
+{'loss': 139.6472, 'grad_norm': 1623.8570201012028, 'learning_rate': 1.0649309598377775e-05, 'epoch': 0.68}
+{'loss': 139.4377, 'grad_norm': 1743.327969973656, 'learning_rate': 1.0644959348605397e-05, 'epoch': 0.68}
+{'loss': 135.415, 'grad_norm': 1611.8944919362073, 'learning_rate': 1.0640608976263268e-05, 'epoch': 0.68}
+{'loss': 137.0322, 'grad_norm': 1779.2836696781496, 'learning_rate': 1.0636258482178144e-05, 'epoch': 0.68}
+{'loss': 135.8973, 'grad_norm': 1683.1869122352362, 'learning_rate': 1.063190786717681e-05, 'epoch': 0.68}
+{'loss': 140.9627, 'grad_norm': 2360.0271682744174, 'learning_rate': 1.0627557132086057e-05, 'epoch': 0.68}
+{'loss': 139.7222, 'grad_norm': 1842.749812035407, 'learning_rate': 1.0623206277732723e-05, 'epoch': 0.68}
+{'loss': 139.2545, 'grad_norm': 2140.457509278969, 'learning_rate': 1.0618855304943644e-05, 'epoch': 0.68}
+{'loss': 133.7326, 'grad_norm': 1641.401348540216, 'learning_rate': 1.06145042145457e-05, 'epoch': 0.68}
+{'loss': 144.6797, 'grad_norm': 1637.1773973264794, 'learning_rate': 1.061015300736578e-05, 'epoch': 0.68}
+{'loss': 136.9881, 'grad_norm': 1552.913748201145, 'learning_rate': 1.0605801684230794e-05, 'epoch': 0.68}
+{'loss': 138.5846, 'grad_norm': 1721.3681692263904, 'learning_rate': 1.0601450245967687e-05, 'epoch': 0.68}
+{'loss': 142.8047, 'grad_norm': 2167.7193300465033, 'learning_rate': 1.0597098693403409e-05, 'epoch': 0.68}
+{'loss': 140.7791, 'grad_norm': 1570.395197847316, 'learning_rate': 1.0592747027364947e-05, 'epoch': 0.68}
+{'loss': 138.9384, 'grad_norm': 2361.856027363085, 'learning_rate': 1.0588395248679302e-05, 'epoch': 0.68}
+{'loss': 135.6962, 'grad_norm': 1739.6978241168656, 'learning_rate': 1.0584043358173492e-05, 'epoch': 0.68}
+{'loss': 136.3888, 'grad_norm': 1637.1844160163503, 'learning_rate': 1.0579691356674568e-05, 'epoch': 0.68}
+{'loss': 139.9234, 'grad_norm': 1906.916413925407, 'learning_rate': 1.0575339245009593e-05, 'epoch': 0.68}
+{'loss': 139.2119, 'grad_norm': 2195.572507269272, 'learning_rate': 1.0570987024005653e-05, 'epoch': 0.68}
+{'loss': 140.7121, 'grad_norm': 1691.6451238749855, 'learning_rate': 1.0566634694489857e-05, 'epoch': 0.68}
+{'loss': 139.0595, 'grad_norm': 1708.8088777525086, 'learning_rate': 1.0562282257289332e-05, 'epoch': 0.68}
+{'loss': 140.3565, 'grad_norm': 1788.8098375097854, 'learning_rate': 1.0557929713231225e-05, 'epoch': 0.68}
+{'loss': 143.3808, 'grad_norm': 2063.054895127997, 'learning_rate': 1.0553577063142705e-05, 'epoch': 0.68}
+{'loss': 139.5627, 'grad_norm': 1804.109392783031, 'learning_rate': 1.0549224307850965e-05, 'epoch': 0.68}
+{'loss': 141.7945, 'grad_norm': 1642.0300928328709, 'learning_rate': 1.0544871448183213e-05, 'epoch': 0.68}
+{'loss': 137.2024, 'grad_norm': 1737.426532298861, 'learning_rate': 1.0540518484966674e-05, 'epoch': 0.68}
+{'loss': 136.8858, 'grad_norm': 1632.1649500426518, 'learning_rate': 1.0536165419028597e-05, 'epoch': 0.68}
+{'loss': 136.3291, 'grad_norm': 1626.8636584556327, 'learning_rate': 1.0531812251196256e-05, 'epoch': 0.68}
+{'loss': 137.3486, 'grad_norm': 1603.7499110731192, 'learning_rate': 1.052745898229693e-05, 'epoch': 0.68}
+{'loss': 136.8919, 'grad_norm': 2224.241892640922, 'learning_rate': 1.052310561315793e-05, 'epoch': 0.68}
+{'loss': 136.6906, 'grad_norm': 1738.143945508372, 'learning_rate': 1.0518752144606576e-05, 'epoch': 0.68}
+{'loss': 136.0863, 'grad_norm': 1743.9030966961404, 'learning_rate': 1.0514398577470221e-05, 'epoch': 0.68}
+{'loss': 141.1114, 'grad_norm': 1836.3185929303868, 'learning_rate': 1.051004491257623e-05, 'epoch': 0.68}
+{'loss': 136.4505, 'grad_norm': 1661.0579059791435, 'learning_rate': 1.0505691150751973e-05, 'epoch': 0.68}
+{'loss': 140.8398, 'grad_norm': 1588.0302002265234, 'learning_rate': 1.0501337292824855e-05, 'epoch': 0.68}
+{'loss': 139.0637, 'grad_norm': 2123.3723332889067, 'learning_rate': 1.04969833396223e-05, 'epoch': 0.69}
+{'loss': 142.859, 'grad_norm': 1784.6638562993298, 'learning_rate': 1.0492629291971738e-05, 'epoch': 0.69}
+{'loss': 145.1843, 'grad_norm': 1871.8703893063596, 'learning_rate': 1.0488275150700628e-05, 'epoch': 0.69}
+{'loss': 136.7752, 'grad_norm': 1600.328830016791, 'learning_rate': 1.0483920916636439e-05, 'epoch': 0.69}
+{'loss': 139.4085, 'grad_norm': 1667.8973908228822, 'learning_rate': 1.0479566590606663e-05, 'epoch': 0.69}
+{'loss': 137.2952, 'grad_norm': 1719.2666119735907, 'learning_rate': 1.0475212173438804e-05, 'epoch': 0.69}
+{'loss': 141.0886, 'grad_norm': 1647.979829960603, 'learning_rate': 1.0470857665960392e-05, 'epoch': 0.69}
+{'loss': 137.928, 'grad_norm': 1725.506398273905, 'learning_rate': 1.0466503068998967e-05, 'epoch': 0.69}
+{'loss': 140.6797, 'grad_norm': 1879.3571625721786, 'learning_rate': 1.0462148383382086e-05, 'epoch': 0.69}
+{'loss': 135.706, 'grad_norm': 1480.3855382582535, 'learning_rate': 1.0457793609937328e-05, 'epoch': 0.69}
+{'loss': 135.6724, 'grad_norm': 2215.7173113003882, 'learning_rate': 1.045343874949228e-05, 'epoch': 0.69}
+{'loss': 139.0578, 'grad_norm': 5651.536388258449, 'learning_rate': 1.0449083802874558e-05, 'epoch': 0.69}
+{'loss': 140.1522, 'grad_norm': 1642.591396237815, 'learning_rate': 1.044472877091178e-05, 'epoch': 0.69}
+{'loss': 141.1333, 'grad_norm': 1870.8189788411232, 'learning_rate': 1.0440373654431594e-05, 'epoch': 0.69}
+{'loss': 142.0753, 'grad_norm': 1953.37339918426, 'learning_rate': 1.0436018454261654e-05, 'epoch': 0.69}
+{'loss': 143.9246, 'grad_norm': 1613.5614843112346, 'learning_rate': 1.0431663171229636e-05, 'epoch': 0.69}
+{'loss': 143.6663, 'grad_norm': 2311.420478375622, 'learning_rate': 1.042730780616323e-05, 'epoch': 0.69}
+{'loss': 137.922, 'grad_norm': 1614.213978672296, 'learning_rate': 1.0422952359890132e-05, 'epoch': 0.69}
+{'loss': 138.6806, 'grad_norm': 1711.4739866685024, 'learning_rate': 1.0418596833238075e-05, 'epoch': 0.69}
+{'loss': 137.951, 'grad_norm': 1608.9503759298734, 'learning_rate': 1.0414241227034784e-05, 'epoch': 0.69}
+{'loss': 136.6541, 'grad_norm': 1680.2194636482088, 'learning_rate': 1.0409885542108017e-05, 'epoch': 0.69}
+{'loss': 139.9028, 'grad_norm': 1473.1076155208489, 'learning_rate': 1.040552977928554e-05, 'epoch': 0.69}
+{'loss': 137.2922, 'grad_norm': 1566.3604874136738, 'learning_rate': 1.0401173939395128e-05, 'epoch': 0.69}
+{'loss': 138.2428, 'grad_norm': 1626.0822007479865, 'learning_rate': 1.0396818023264576e-05, 'epoch': 0.69}
+{'loss': 134.9728, 'grad_norm': 1644.1773564130326, 'learning_rate': 1.03924620317217e-05, 'epoch': 0.69}
+{'loss': 139.0547, 'grad_norm': 1590.3005783574708, 'learning_rate': 1.0388105965594319e-05, 'epoch': 0.69}
+{'loss': 134.5168, 'grad_norm': 1574.7518499387713, 'learning_rate': 1.0383749825710274e-05, 'epoch': 0.69}
+{'loss': 139.3909, 'grad_norm': 2191.689830770434, 'learning_rate': 1.0379393612897413e-05, 'epoch': 0.69}
+{'loss': 137.7778, 'grad_norm': 1977.5980206862184, 'learning_rate': 1.0375037327983606e-05, 'epoch': 0.69}
+{'loss': 140.2337, 'grad_norm': 1637.6497585245863, 'learning_rate': 1.0370680971796732e-05, 'epoch': 0.69}
+{'loss': 136.1813, 'grad_norm': 1741.2358405183988, 'learning_rate': 1.0366324545164679e-05, 'epoch': 0.69}
+{'loss': 133.5375, 'grad_norm': 1580.9222784606798, 'learning_rate': 1.0361968048915362e-05, 'epoch': 0.69}
+{'loss': 142.06, 'grad_norm': 2233.1861807082723, 'learning_rate': 1.0357611483876698e-05, 'epoch': 0.69}
+{'loss': 138.4913, 'grad_norm': 1767.8027671952725, 'learning_rate': 1.0353254850876616e-05, 'epoch': 0.69}
+{'loss': 137.8523, 'grad_norm': 1786.4837797563919, 'learning_rate': 1.0348898150743067e-05, 'epoch': 0.69}
+{'loss': 139.1044, 'grad_norm': 1674.9378618358437, 'learning_rate': 1.0344541384304006e-05, 'epoch': 0.69}
+{'loss': 136.6441, 'grad_norm': 1878.4465147679382, 'learning_rate': 1.0340184552387406e-05, 'epoch': 0.69}
+{'loss': 138.5704, 'grad_norm': 1655.8263029881978, 'learning_rate': 1.0335827655821249e-05, 'epoch': 0.69}
+{'loss': 137.493, 'grad_norm': 1719.2434683440988, 'learning_rate': 1.0331470695433534e-05, 'epoch': 0.69}
+[2025-05-29 02:16:07,398] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step36000 is about to be saved!
+[2025-05-29 02:16:07,464] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-36000/global_step36000/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2025-05-29 02:16:07,464] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-36000/global_step36000/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2025-05-29 02:16:07,525] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-36000/global_step36000/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2025-05-29 02:16:07,695] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-36000/global_step36000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2025-05-29 02:22:01,787] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-36000/global_step36000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2025-05-29 02:22:01,893] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-36000/global_step36000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-05-29 02:22:02,286] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step36000 is ready now!
+[2025-05-29 02:22:02,326] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step36000 is about to be saved!
+[2025-05-29 02:22:02,347] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-36000/global_step36000/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2025-05-29 02:22:02,347] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-36000/global_step36000/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2025-05-29 02:22:02,378] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-36000/global_step36000/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2025-05-29 02:22:02,559] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-36000/global_step36000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2025-05-29 02:28:14,938] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-36000/global_step36000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2025-05-29 02:28:15,025] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-36000/global_step36000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-05-29 02:28:15,710] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step36000 is ready now!
+{'loss': 139.6363, 'grad_norm': 1757.6251660451776, 'learning_rate': 1.0327113672052266e-05, 'epoch': 0.69}
+{'loss': 137.7073, 'grad_norm': 1703.5669755668557, 'learning_rate': 1.0322756586505464e-05, 'epoch': 0.69}
+{'loss': 138.1341, 'grad_norm': 1553.3342809090086, 'learning_rate': 1.0318399439621167e-05, 'epoch': 0.69}
+{'loss': 136.482, 'grad_norm': 1615.8434631889286, 'learning_rate': 1.0314042232227413e-05, 'epoch': 0.69}
+{'loss': 145.1989, 'grad_norm': 1496.4903230374687, 'learning_rate': 1.0309684965152254e-05, 'epoch': 0.69}
+{'loss': 137.2694, 'grad_norm': 1614.8227883910251, 'learning_rate': 1.0305327639223763e-05, 'epoch': 0.69}
+{'loss': 141.9323, 'grad_norm': 1906.8308411386686, 'learning_rate': 1.030097025527001e-05, 'epoch': 0.69}
+{'loss': 137.0404, 'grad_norm': 1634.396996753976, 'learning_rate': 1.0296612814119095e-05, 'epoch': 0.69}
+{'loss': 135.2535, 'grad_norm': 1677.2789226483221, 'learning_rate': 1.029225531659911e-05, 'epoch': 0.69}
+{'loss': 137.0036, 'grad_norm': 1767.47414811446, 'learning_rate': 1.0287897763538159e-05, 'epoch': 0.69}
+{'loss': 139.158, 'grad_norm': 1402.8973339079619, 'learning_rate': 1.0283540155764374e-05, 'epoch': 0.69}
+{'loss': 136.4218, 'grad_norm': 1832.917936009458, 'learning_rate': 1.0279182494105879e-05, 'epoch': 0.69}
+{'loss': 138.0962, 'grad_norm': 1948.7571027536999, 'learning_rate': 1.027482477939082e-05, 'epoch': 0.69}
+{'loss': 142.1899, 'grad_norm': 1701.0168367902247, 'learning_rate': 1.0270467012447346e-05, 'epoch': 0.7}
+{'loss': 134.0144, 'grad_norm': 1746.0216079829117, 'learning_rate': 1.0266109194103617e-05, 'epoch': 0.7}
+{'loss': 141.9987, 'grad_norm': 1631.3601369971898, 'learning_rate': 1.0261751325187807e-05, 'epoch': 0.7}
+{'loss': 137.1207, 'grad_norm': 1718.9369654199998, 'learning_rate': 1.0257393406528095e-05, 'epoch': 0.7}
+{'loss': 138.7842, 'grad_norm': 2289.011405519115, 'learning_rate': 1.0253035438952672e-05, 'epoch': 0.7}
+{'loss': 136.2178, 'grad_norm': 2025.7659202760303, 'learning_rate': 1.024867742328974e-05, 'epoch': 0.7}
+{'loss': 138.156, 'grad_norm': 1791.0274602597153, 'learning_rate': 1.0244319360367496e-05, 'epoch': 0.7}
+{'loss': 138.1675, 'grad_norm': 1684.2575641114631, 'learning_rate': 1.0239961251014174e-05, 'epoch': 0.7}
+{'loss': 134.9346, 'grad_norm': 1652.9947090909152, 'learning_rate': 1.0235603096057989e-05, 'epoch': 0.7}
+{'loss': 141.0391, 'grad_norm': 1837.0855159951598, 'learning_rate': 1.0231244896327182e-05, 'epoch': 0.7}
+{'loss': 137.0167, 'grad_norm': 1792.1927455084099, 'learning_rate': 1.0226886652649994e-05, 'epoch': 0.7}
+{'loss': 137.4028, 'grad_norm': 1606.3023446566651, 'learning_rate': 1.0222528365854674e-05, 'epoch': 0.7}
+{'loss': 137.0701, 'grad_norm': 1556.8645267906898, 'learning_rate': 1.021817003676949e-05, 'epoch': 0.7}
+{'loss': 139.0421, 'grad_norm': 1509.2701572687558, 'learning_rate': 1.0213811666222705e-05, 'epoch': 0.7}
+{'loss': 136.345, 'grad_norm': 1566.3162746715532, 'learning_rate': 1.0209453255042592e-05, 'epoch': 0.7}
+{'loss': 142.1729, 'grad_norm': 1877.6896464385409, 'learning_rate': 1.0205094804057443e-05, 'epoch': 0.7}
+{'loss': 137.1042, 'grad_norm': 1841.1477260441784, 'learning_rate': 1.0200736314095543e-05, 'epoch': 0.7}
+{'loss': 136.6809, 'grad_norm': 1743.2888583541871, 'learning_rate': 1.0196377785985192e-05, 'epoch': 0.7}
+{'loss': 137.2153, 'grad_norm': 1606.9042557963328, 'learning_rate': 1.0192019220554698e-05, 'epoch': 0.7}
+{'loss': 136.0233, 'grad_norm': 1809.4582670126072, 'learning_rate': 1.018766061863237e-05, 'epoch': 0.7}
+{'loss': 141.1456, 'grad_norm': 1710.3233487722723, 'learning_rate': 1.0183301981046532e-05, 'epoch': 0.7}
+{'loss': 139.5261, 'grad_norm': 1528.9211760580113, 'learning_rate': 1.017894330862551e-05, 'epoch': 0.7}
+{'loss': 135.916, 'grad_norm': 1659.4637545492365, 'learning_rate': 1.0174584602197637e-05, 'epoch': 0.7}
+{'loss': 133.8566, 'grad_norm': 1831.2151603494792, 'learning_rate': 1.0170225862591252e-05, 'epoch': 0.7}
+{'loss': 138.035, 'grad_norm': 1588.2352195230421, 'learning_rate': 1.01658670906347e-05, 'epoch': 0.7}
+{'loss': 133.2187, 'grad_norm': 1463.0668790172053, 'learning_rate': 1.016150828715634e-05, 'epoch': 0.7}
+{'loss': 133.4132, 'grad_norm': 1604.459369023434, 'learning_rate': 1.0157149452984523e-05, 'epoch': 0.7}
+{'loss': 137.8411, 'grad_norm': 1793.133889658047, 'learning_rate': 1.0152790588947617e-05, 'epoch': 0.7}
+{'loss': 142.0545, 'grad_norm': 1962.234426413917, 'learning_rate': 1.0148431695873991e-05, 'epoch': 0.7}
+{'loss': 134.8208, 'grad_norm': 2035.3975017551697, 'learning_rate': 1.0144072774592018e-05, 'epoch': 0.7}
+{'loss': 136.4868, 'grad_norm': 1632.5487928534142, 'learning_rate': 1.0139713825930081e-05, 'epoch': 0.7}
+{'loss': 140.3839, 'grad_norm': 1970.6272811561423, 'learning_rate': 1.0135354850716565e-05, 'epoch': 0.7}
+{'loss': 134.5083, 'grad_norm': 1606.4478014114216, 'learning_rate': 1.0130995849779863e-05, 'epoch': 0.7}
+{'loss': 136.9293, 'grad_norm': 1702.5643734266446, 'learning_rate': 1.0126636823948373e-05, 'epoch': 0.7}
+{'loss': 139.177, 'grad_norm': 1710.5853983989036, 'learning_rate': 1.0122277774050486e-05, 'epoch': 0.7}
+{'loss': 134.8191, 'grad_norm': 1589.9785999892777, 'learning_rate': 1.0117918700914617e-05, 'epoch': 0.7}
+{'loss': 134.1902, 'grad_norm': 1726.6031359683955, 'learning_rate': 1.0113559605369169e-05, 'epoch': 0.7}
+{'loss': 135.1074, 'grad_norm': 1789.0986238002408, 'learning_rate': 1.010920048824256e-05, 'epoch': 0.7}
+{'loss': 137.0311, 'grad_norm': 1680.5730008619244, 'learning_rate': 1.010484135036321e-05, 'epoch': 0.7}
+{'loss': 140.5364, 'grad_norm': 1792.8057109945794, 'learning_rate': 1.0100482192559534e-05, 'epoch': 0.7}
+{'loss': 131.9511, 'grad_norm': 1539.6516405894997, 'learning_rate': 1.009612301565996e-05, 'epoch': 0.7}
+{'loss': 135.5507, 'grad_norm': 1619.616331817267, 'learning_rate': 1.009176382049292e-05, 'epoch': 0.7}
+{'loss': 135.9979, 'grad_norm': 1773.9061001152854, 'learning_rate': 1.0087404607886846e-05, 'epoch': 0.7}
+{'loss': 136.9852, 'grad_norm': 3274.4833066965903, 'learning_rate': 1.0083045378670173e-05, 'epoch': 0.7}
+{'loss': 135.6039, 'grad_norm': 1691.6864772117674, 'learning_rate': 1.007868613367134e-05, 'epoch': 0.7}
+{'loss': 133.0134, 'grad_norm': 1580.7553401152743, 'learning_rate': 1.0074326873718792e-05, 'epoch': 0.7}
+{'loss': 135.533, 'grad_norm': 1499.0274562146537, 'learning_rate': 1.0069967599640968e-05, 'epoch': 0.7}
+{'loss': 144.9279, 'grad_norm': 1832.8450000828595, 'learning_rate': 1.0065608312266324e-05, 'epoch': 0.7}
+{'loss': 135.8487, 'grad_norm': 1499.5547446326868, 'learning_rate': 1.0061249012423302e-05, 'epoch': 0.7}
+{'loss': 137.2763, 'grad_norm': 2124.793145440231, 'learning_rate': 1.005688970094036e-05, 'epoch': 0.7}
+{'loss': 133.0731, 'grad_norm': 1598.1927483557356, 'learning_rate': 1.0052530378645953e-05, 'epoch': 0.7}
+{'loss': 136.2184, 'grad_norm': 1837.8291471134507, 'learning_rate': 1.0048171046368534e-05, 'epoch': 0.7}
+{'loss': 137.4591, 'grad_norm': 1508.93253278311, 'learning_rate': 1.0043811704936566e-05, 'epoch': 0.71}
+{'loss': 137.48, 'grad_norm': 1652.6538025569835, 'learning_rate': 1.0039452355178507e-05, 'epoch': 0.71}
+{'loss': 136.3298, 'grad_norm': 1866.6509704854436, 'learning_rate': 1.003509299792282e-05, 'epoch': 0.71}
+{'loss': 136.8727, 'grad_norm': 1661.9660218837257, 'learning_rate': 1.0030733633997968e-05, 'epoch': 0.71}
+{'loss': 137.9942, 'grad_norm': 1828.8329983854196, 'learning_rate': 1.0026374264232417e-05, 'epoch': 0.71}
+{'loss': 142.9717, 'grad_norm': 1884.1253231804333, 'learning_rate': 1.0022014889454634e-05, 'epoch': 0.71}
+{'loss': 136.2864, 'grad_norm': 2108.9759080911435, 'learning_rate': 1.0017655510493082e-05, 'epoch': 0.71}
+{'loss': 135.2389, 'grad_norm': 1609.560539828714, 'learning_rate': 1.0013296128176231e-05, 'epoch': 0.71}
+{'loss': 134.0972, 'grad_norm': 1642.9359915025045, 'learning_rate': 1.0008936743332552e-05, 'epoch': 0.71}
+{'loss': 132.7007, 'grad_norm': 1565.2208065999857, 'learning_rate': 1.0004577356790506e-05, 'epoch': 0.71}
+{'loss': 133.3372, 'grad_norm': 1752.013678858268, 'learning_rate': 1.0000217969378574e-05, 'epoch': 0.71}
+{'loss': 135.5601, 'grad_norm': 1612.9520831419086, 'learning_rate': 9.995858581925213e-06, 'epoch': 0.71}
+{'loss': 136.6907, 'grad_norm': 1717.5887552227357, 'learning_rate': 9.991499195258902e-06, 'epoch': 0.71}
+{'loss': 137.4254, 'grad_norm': 1800.0210852142905, 'learning_rate': 9.987139810208101e-06, 'epoch': 0.71}
+{'loss': 136.6951, 'grad_norm': 1707.8808663851698, 'learning_rate': 9.982780427601286e-06, 'epoch': 0.71}
+{'loss': 135.3098, 'grad_norm': 1757.4901280514712, 'learning_rate': 9.978421048266926e-06, 'epoch': 0.71}
+{'loss': 133.3116, 'grad_norm': 1402.9537240280977, 'learning_rate': 9.97406167303348e-06, 'epoch': 0.71}
+{'loss': 138.4165, 'grad_norm': 1644.7166044960818, 'learning_rate': 9.969702302729429e-06, 'epoch': 0.71}
+{'loss': 138.6803, 'grad_norm': 1711.4463186785808, 'learning_rate': 9.965342938183224e-06, 'epoch': 0.71}
+{'loss': 133.8826, 'grad_norm': 1656.0068024590923, 'learning_rate': 9.960983580223339e-06, 'epoch': 0.71}
+{'loss': 137.3448, 'grad_norm': 1778.378679163877, 'learning_rate': 9.956624229678238e-06, 'epoch': 0.71}
+{'loss': 134.6641, 'grad_norm': 1609.1787868266679, 'learning_rate': 9.952264887376379e-06, 'epoch': 0.71}
+{'loss': 138.8334, 'grad_norm': 1632.16924334938, 'learning_rate': 9.947905554146227e-06, 'epoch': 0.71}
+{'loss': 135.0753, 'grad_norm': 1436.6108707202109, 'learning_rate': 9.943546230816236e-06, 'epoch': 0.71}
+{'loss': 132.5091, 'grad_norm': 1540.8821563386755, 'learning_rate': 9.939186918214867e-06, 'epoch': 0.71}
+{'loss': 134.3043, 'grad_norm': 1498.638301095456, 'learning_rate': 9.934827617170575e-06, 'epoch': 0.71}
+{'loss': 134.7923, 'grad_norm': 1671.8178986146015, 'learning_rate': 9.930468328511812e-06, 'epoch': 0.71}
+{'loss': 137.6752, 'grad_norm': 1762.7406055956806, 'learning_rate': 9.926109053067029e-06, 'epoch': 0.71}
+{'loss': 134.7673, 'grad_norm': 1657.328033276727, 'learning_rate': 9.921749791664668e-06, 'epoch': 0.71}
+{'loss': 133.0957, 'grad_norm': 1607.362339184466, 'learning_rate': 9.917390545133183e-06, 'epoch': 0.71}
+{'loss': 134.0753, 'grad_norm': 1781.883808019968, 'learning_rate': 9.913031314301016e-06, 'epoch': 0.71}
+{'loss': 137.797, 'grad_norm': 1472.726594695489, 'learning_rate': 9.908672099996604e-06, 'epoch': 0.71}
+{'loss': 134.1622, 'grad_norm': 1522.6270705404493, 'learning_rate': 9.90431290304838e-06, 'epoch': 0.71}
+{'loss': 135.9218, 'grad_norm': 1791.4638108685715, 'learning_rate': 9.899953724284778e-06, 'epoch': 0.71}
+{'loss': 139.9809, 'grad_norm': 1580.6893537784697, 'learning_rate': 9.895594564534236e-06, 'epoch': 0.71}
+{'loss': 136.5196, 'grad_norm': 1536.4979843419471, 'learning_rate': 9.891235424625172e-06, 'epoch': 0.71}
+{'loss': 132.608, 'grad_norm': 1552.337183332374, 'learning_rate': 9.886876305386012e-06, 'epoch': 0.71}
+{'loss': 137.7684, 'grad_norm': 1836.0204754134252, 'learning_rate': 9.88251720764517e-06, 'epoch': 0.71}
+{'loss': 139.1641, 'grad_norm': 1955.6541069547663, 'learning_rate': 9.878158132231062e-06, 'epoch': 0.71}
+{'loss': 131.2689, 'grad_norm': 1456.04485824056, 'learning_rate': 9.873799079972102e-06, 'epoch': 0.71}
+{'loss': 136.6597, 'grad_norm': 1828.146066430012, 'learning_rate': 9.869440051696691e-06, 'epoch': 0.71}
+{'loss': 137.0866, 'grad_norm': 1567.2767548163247, 'learning_rate': 9.865081048233234e-06, 'epoch': 0.71}
+{'loss': 140.4293, 'grad_norm': 2097.8949335209913, 'learning_rate': 9.86072207041012e-06, 'epoch': 0.71}
+{'loss': 137.3759, 'grad_norm': 1878.5003104750028, 'learning_rate': 9.856363119055747e-06, 'epoch': 0.71}
+{'loss': 138.9016, 'grad_norm': 2108.7126387853036, 'learning_rate': 9.852004194998503e-06, 'epoch': 0.71}
+{'loss': 135.8931, 'grad_norm': 1651.1515641952808, 'learning_rate': 9.847645299066764e-06, 'epoch': 0.71}
+{'loss': 138.2356, 'grad_norm': 1768.417820698348, 'learning_rate': 9.84328643208891e-06, 'epoch': 0.71}
+{'loss': 144.5598, 'grad_norm': 16256.140773313798, 'learning_rate': 9.838927594893308e-06, 'epoch': 0.71}
+{'loss': 137.927, 'grad_norm': 1865.8732467900752, 'learning_rate': 9.834568788308322e-06, 'epoch': 0.71}
+{'loss': 137.2302, 'grad_norm': 1704.5602837578817, 'learning_rate': 9.830210013162316e-06, 'epoch': 0.71}
+{'loss': 134.7647, 'grad_norm': 2099.8735822143994, 'learning_rate': 9.825851270283639e-06, 'epoch': 0.71}
+{'loss': 136.5723, 'grad_norm': 1853.0106688704332, 'learning_rate': 9.821492560500641e-06, 'epoch': 0.71}
+{'loss': 134.9854, 'grad_norm': 1521.1293936656862, 'learning_rate': 9.817133884641657e-06, 'epoch': 0.72}
+{'loss': 136.6975, 'grad_norm': 1802.3914879467054, 'learning_rate': 9.812775243535024e-06, 'epoch': 0.72}
+{'loss': 137.9748, 'grad_norm': 1662.885657417086, 'learning_rate': 9.808416638009075e-06, 'epoch': 0.72}
+{'loss': 135.5821, 'grad_norm': 1822.7234872607019, 'learning_rate': 9.80405806889212e-06, 'epoch': 0.72}
+{'loss': 137.2309, 'grad_norm': 1443.5331217057194, 'learning_rate': 9.799699537012484e-06, 'epoch': 0.72}
+{'loss': 136.7391, 'grad_norm': 1777.8114515680356, 'learning_rate': 9.795341043198465e-06, 'epoch': 0.72}
+{'loss': 135.3238, 'grad_norm': 1797.367315261061, 'learning_rate': 9.790982588278362e-06, 'epoch': 0.72}
+{'loss': 135.5434, 'grad_norm': 1621.265408856755, 'learning_rate': 9.786624173080477e-06, 'epoch': 0.72}
+{'loss': 135.3577, 'grad_norm': 2053.872242817571, 'learning_rate': 9.782265798433086e-06, 'epoch': 0.72}
+{'loss': 133.6784, 'grad_norm': 1868.5871640301723, 'learning_rate': 9.777907465164468e-06, 'epoch': 0.72}
+{'loss': 133.0938, 'grad_norm': 2118.235964700173, 'learning_rate': 9.773549174102892e-06, 'epoch': 0.72}
+{'loss': 137.7576, 'grad_norm': 2100.635382483504, 'learning_rate': 9.769190926076616e-06, 'epoch': 0.72}
+{'loss': 133.5955, 'grad_norm': 1659.4182018775145, 'learning_rate': 9.7648327219139e-06, 'epoch': 0.72}
+{'loss': 132.2678, 'grad_norm': 1623.150038679052, 'learning_rate': 9.760474562442984e-06, 'epoch': 0.72}
+{'loss': 136.9466, 'grad_norm': 1678.6231520694087, 'learning_rate': 9.756116448492104e-06, 'epoch': 
0.72} +{'loss': 138.7757, 'grad_norm': 1794.232157935617, 'learning_rate': 9.751758380889489e-06, 'epoch': 0.72} +{'loss': 131.7137, 'grad_norm': 1990.1457824433198, 'learning_rate': 9.747400360463352e-06, 'epoch': 0.72} +{'loss': 137.785, 'grad_norm': 1849.1600096869354, 'learning_rate': 9.743042388041907e-06, 'epoch': 0.72} +{'loss': 134.0635, 'grad_norm': 1856.5643713697832, 'learning_rate': 9.738684464453357e-06, 'epoch': 0.72} +{'loss': 133.8066, 'grad_norm': 1562.9842116893249, 'learning_rate': 9.734326590525887e-06, 'epoch': 0.72} +{'loss': 134.131, 'grad_norm': 2003.1018989695829, 'learning_rate': 9.729968767087685e-06, 'epoch': 0.72} +{'loss': 132.5869, 'grad_norm': 1632.0932552127595, 'learning_rate': 9.725610994966916e-06, 'epoch': 0.72} +{'loss': 134.2305, 'grad_norm': 1619.355180890106, 'learning_rate': 9.721253274991746e-06, 'epoch': 0.72} +{'loss': 135.0253, 'grad_norm': 1629.2424613861021, 'learning_rate': 9.716895607990332e-06, 'epoch': 0.72} +{'loss': 136.8965, 'grad_norm': 1436.1265542510248, 'learning_rate': 9.712537994790808e-06, 'epoch': 0.72} +{'loss': 135.1661, 'grad_norm': 1630.2762915873234, 'learning_rate': 9.708180436221312e-06, 'epoch': 0.72} +{'loss': 138.9952, 'grad_norm': 1723.8472673251779, 'learning_rate': 9.70382293310996e-06, 'epoch': 0.72} +{'loss': 131.1853, 'grad_norm': 1634.0449042635098, 'learning_rate': 9.699465486284871e-06, 'epoch': 0.72} +{'loss': 133.0739, 'grad_norm': 1643.3507334896701, 'learning_rate': 9.695108096574141e-06, 'epoch': 0.72} +{'loss': 134.7738, 'grad_norm': 1707.1593172662908, 'learning_rate': 9.690750764805859e-06, 'epoch': 0.72} +{'loss': 133.4453, 'grad_norm': 1530.1512434049519, 'learning_rate': 9.686393491808109e-06, 'epoch': 0.72} +{'loss': 133.2437, 'grad_norm': 1632.121571068843, 'learning_rate': 9.682036278408949e-06, 'epoch': 0.72} +{'loss': 136.4953, 'grad_norm': 1516.4528135068426, 'learning_rate': 9.677679125436438e-06, 'epoch': 0.72} +{'loss': 136.0181, 'grad_norm': 1490.2187497923308, 'learning_rate': 9.673322033718631e-06, 'epoch': 0.72} +{'loss': 131.8146, 'grad_norm': 1852.6139124168208, 'learning_rate': 9.668965004083549e-06, 'epoch': 0.72} +{'loss': 131.7971, 'grad_norm': 1605.4471092524118, 'learning_rate': 9.664608037359217e-06, 'epoch': 0.72} +{'loss': 130.6481, 'grad_norm': 1697.737677053389, 'learning_rate': 9.660251134373646e-06, 'epoch': 0.72} +{'loss': 136.0353, 'grad_norm': 1690.6403827630895, 'learning_rate': 9.655894295954827e-06, 'epoch': 0.72} +{'loss': 134.8919, 'grad_norm': 1694.6470604778292, 'learning_rate': 9.651537522930756e-06, 'epoch': 0.72} +{'loss': 140.119, 'grad_norm': 1612.8417752719631, 'learning_rate': 9.647180816129396e-06, 'epoch': 0.72} +{'loss': 136.1594, 'grad_norm': 1554.6256590348682, 'learning_rate': 9.642824176378712e-06, 'epoch': 0.72} +{'loss': 134.9841, 'grad_norm': 1878.2698155769956, 'learning_rate': 9.638467604506648e-06, 'epoch': 0.72} +{'loss': 137.9042, 'grad_norm': 1859.2363606969825, 'learning_rate': 9.634111101341134e-06, 'epoch': 0.72} +{'loss': 134.8919, 'grad_norm': 1836.6120145521127, 'learning_rate': 9.629754667710106e-06, 'epoch': 0.72} +{'loss': 133.2079, 'grad_norm': 1812.1767894231366, 'learning_rate': 9.625398304441455e-06, 'epoch': 0.72} +{'loss': 137.0831, 'grad_norm': 1703.890230018508, 'learning_rate': 9.62104201236309e-06, 'epoch': 0.72} +{'loss': 130.8697, 'grad_norm': 1691.3447819475202, 'learning_rate': 9.616685792302882e-06, 'epoch': 0.72} +{'loss': 131.0151, 'grad_norm': 1517.9424500116324, 'learning_rate': 9.612329645088695e-06, 'epoch': 
0.72} +{'loss': 135.9938, 'grad_norm': 2092.13202383878, 'learning_rate': 9.607973571548396e-06, 'epoch': 0.72} +{'loss': 133.598, 'grad_norm': 1766.1461224165635, 'learning_rate': 9.603617572509819e-06, 'epoch': 0.72} +{'loss': 134.8504, 'grad_norm': 1814.0383676998078, 'learning_rate': 9.599261648800784e-06, 'epoch': 0.72} +{'loss': 135.3558, 'grad_norm': 1936.8296071747461, 'learning_rate': 9.59490580124911e-06, 'epoch': 0.72} +{'loss': 138.9735, 'grad_norm': 1700.20627802627, 'learning_rate': 9.590550030682584e-06, 'epoch': 0.73} +{'loss': 139.1146, 'grad_norm': 1651.5914634181984, 'learning_rate': 9.586194337928995e-06, 'epoch': 0.73} +{'loss': 132.7935, 'grad_norm': 2093.088019546677, 'learning_rate': 9.581838723816112e-06, 'epoch': 0.73} +{'loss': 131.081, 'grad_norm': 1743.1145654982279, 'learning_rate': 9.577483189171681e-06, 'epoch': 0.73} +{'loss': 133.7373, 'grad_norm': 1690.274929977156, 'learning_rate': 9.573127734823444e-06, 'epoch': 0.73} +{'loss': 131.0236, 'grad_norm': 1658.6401687192201, 'learning_rate': 9.568772361599116e-06, 'epoch': 0.73} +{'loss': 137.0006, 'grad_norm': 1909.1283657333959, 'learning_rate': 9.564417070326412e-06, 'epoch': 0.73} +{'loss': 135.7628, 'grad_norm': 1648.975240484152, 'learning_rate': 9.56006186183302e-06, 'epoch': 0.73} +{'loss': 132.7353, 'grad_norm': 1621.897565943748, 'learning_rate': 9.555706736946613e-06, 'epoch': 0.73} +{'loss': 133.8422, 'grad_norm': 1946.4513534239934, 'learning_rate': 9.551351696494854e-06, 'epoch': 0.73} +{'loss': 135.2294, 'grad_norm': 1684.7016073155598, 'learning_rate': 9.54699674130538e-06, 'epoch': 0.73} +{'loss': 140.1833, 'grad_norm': 3593.7623228097586, 'learning_rate': 9.542641872205822e-06, 'epoch': 0.73} +{'loss': 135.5089, 'grad_norm': 1685.9458385791504, 'learning_rate': 9.538287090023794e-06, 'epoch': 0.73} +{'loss': 145.6522, 'grad_norm': 1878.4764068924092, 'learning_rate': 9.533932395586883e-06, 'epoch': 0.73} +{'loss': 137.2162, 'grad_norm': 2102.178499148933, 'learning_rate': 9.529577789722672e-06, 'epoch': 0.73} +{'loss': 136.9362, 'grad_norm': 1779.098733348868, 'learning_rate': 9.525223273258717e-06, 'epoch': 0.73} +{'loss': 134.1313, 'grad_norm': 1611.6504578423487, 'learning_rate': 9.520868847022566e-06, 'epoch': 0.73} +{'loss': 134.5552, 'grad_norm': 1503.033755157535, 'learning_rate': 9.516514511841745e-06, 'epoch': 0.73} +{'loss': 132.0515, 'grad_norm': 1784.4407961666202, 'learning_rate': 9.51216026854376e-06, 'epoch': 0.73} +{'loss': 136.1773, 'grad_norm': 1706.2778496849064, 'learning_rate': 9.507806117956106e-06, 'epoch': 0.73} +{'loss': 138.2647, 'grad_norm': 1640.643918949294, 'learning_rate': 9.503452060906251e-06, 'epoch': 0.73} +{'loss': 133.8926, 'grad_norm': 1758.5416919318936, 'learning_rate': 9.499098098221659e-06, 'epoch': 0.73} +{'loss': 132.3476, 'grad_norm': 1574.9232666734606, 'learning_rate': 9.494744230729767e-06, 'epoch': 0.73} +{'loss': 134.3211, 'grad_norm': 1665.0326506931717, 'learning_rate': 9.49039045925799e-06, 'epoch': 0.73} +{'loss': 135.0584, 'grad_norm': 1600.8175710069988, 'learning_rate': 9.486036784633738e-06, 'epoch': 0.73} +{'loss': 132.6191, 'grad_norm': 1868.6738151691293, 'learning_rate': 9.481683207684386e-06, 'epoch': 0.73} +{'loss': 137.0317, 'grad_norm': 1660.2690513028058, 'learning_rate': 9.477329729237304e-06, 'epoch': 0.73} +{'loss': 135.4258, 'grad_norm': 1963.927309362809, 'learning_rate': 9.47297635011984e-06, 'epoch': 0.73} +{'loss': 130.8266, 'grad_norm': 1659.215627071408, 'learning_rate': 9.468623071159318e-06, 'epoch': 0.73} 
+{'loss': 132.7542, 'grad_norm': 1815.5917823285376, 'learning_rate': 9.464269893183048e-06, 'epoch': 0.73} +{'loss': 131.867, 'grad_norm': 1561.0332963284657, 'learning_rate': 9.459916817018317e-06, 'epoch': 0.73} +[2025-05-29 04:16:39,659] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step38000 is about to be saved! +[2025-05-29 04:16:39,725] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-38000/global_step38000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-29 04:16:39,725] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-38000/global_step38000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-29 04:16:39,777] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-38000/global_step38000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-29 04:16:39,949] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-38000/global_step38000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-29 04:22:32,590] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-38000/global_step38000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2025-05-29 04:22:32,678] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-38000/global_step38000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-29 04:22:38,603] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step38000 is ready now! +[2025-05-29 04:22:38,664] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step38000 is about to be saved! +[2025-05-29 04:22:38,682] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-38000/global_step38000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-29 04:22:38,682] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-38000/global_step38000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-29 04:22:38,777] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-38000/global_step38000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-29 04:22:41,501] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-38000/global_step38000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-29 04:28:50,547] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-38000/global_step38000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2025-05-29 04:28:50,574] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-38000/global_step38000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-29 04:28:52,196] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step38000 is ready now! 
+{'loss': 135.3419, 'grad_norm': 1755.9905454148782, 'learning_rate': 9.455563843492397e-06, 'epoch': 0.73} +{'loss': 134.5664, 'grad_norm': 1483.0949807945472, 'learning_rate': 9.45121097343254e-06, 'epoch': 0.73} +{'loss': 137.5012, 'grad_norm': 1621.258823001652, 'learning_rate': 9.446858207665976e-06, 'epoch': 0.73} +{'loss': 132.4307, 'grad_norm': 1711.9176884394794, 'learning_rate': 9.442505547019912e-06, 'epoch': 0.73} +{'loss': 138.5402, 'grad_norm': 6283.764191610019, 'learning_rate': 9.438152992321538e-06, 'epoch': 0.73} +{'loss': 133.5833, 'grad_norm': 1844.0803036222806, 'learning_rate': 9.433800544398036e-06, 'epoch': 0.73} +{'loss': 133.2233, 'grad_norm': 1599.1367907293027, 'learning_rate': 9.429448204076544e-06, 'epoch': 0.73} +{'loss': 134.3651, 'grad_norm': 1948.011671282396, 'learning_rate': 9.4250959721842e-06, 'epoch': 0.73} +{'loss': 136.3536, 'grad_norm': 1947.0429947927762, 'learning_rate': 9.420743849548109e-06, 'epoch': 0.73} +{'loss': 131.4141, 'grad_norm': 1627.1287647259605, 'learning_rate': 9.416391836995358e-06, 'epoch': 0.73} +{'loss': 141.1502, 'grad_norm': 1856.7333855628706, 'learning_rate': 9.412039935353023e-06, 'epoch': 0.73} +{'loss': 133.4542, 'grad_norm': 1743.9714304231034, 'learning_rate': 9.407688145448143e-06, 'epoch': 0.73} +{'loss': 130.6478, 'grad_norm': 1541.6435286294957, 'learning_rate': 9.403336468107748e-06, 'epoch': 0.73} +{'loss': 130.7213, 'grad_norm': 1689.2917694021185, 'learning_rate': 9.398984904158839e-06, 'epoch': 0.73} +{'loss': 134.9992, 'grad_norm': 1524.5317634460014, 'learning_rate': 9.394633454428396e-06, 'epoch': 0.73} +{'loss': 132.5181, 'grad_norm': 1755.8459304858002, 'learning_rate': 9.390282119743389e-06, 'epoch': 0.73} +{'loss': 134.447, 'grad_norm': 1727.8432429721586, 'learning_rate': 9.385930900930748e-06, 'epoch': 0.73} +{'loss': 133.8508, 'grad_norm': 1744.5798644203392, 'learning_rate': 9.381579798817397e-06, 'epoch': 0.73} +{'loss': 133.7813, 'grad_norm': 1670.3958270746343, 'learning_rate': 9.377228814230225e-06, 'epoch': 0.73} +{'loss': 133.3085, 'grad_norm': 1505.3503864894872, 'learning_rate': 9.372877947996103e-06, 'epoch': 0.73} +{'loss': 133.102, 'grad_norm': 2185.1034836499734, 'learning_rate': 9.36852720094189e-06, 'epoch': 0.73} +{'loss': 134.5567, 'grad_norm': 1595.5586346478758, 'learning_rate': 9.364176573894404e-06, 'epoch': 0.74} +{'loss': 134.8551, 'grad_norm': 1657.6259904699193, 'learning_rate': 9.359826067680456e-06, 'epoch': 0.74} +{'loss': 132.0026, 'grad_norm': 1512.3315910898, 'learning_rate': 9.355475683126822e-06, 'epoch': 0.74} +{'loss': 132.8136, 'grad_norm': 1592.6556363965663, 'learning_rate': 9.351125421060262e-06, 'epoch': 0.74} +{'loss': 134.2041, 'grad_norm': 1503.7349267941668, 'learning_rate': 9.346775282307516e-06, 'epoch': 0.74} +{'loss': 131.8154, 'grad_norm': 1969.6677395166273, 'learning_rate': 9.34242526769529e-06, 'epoch': 0.74} +{'loss': 132.8213, 'grad_norm': 1862.1881979979175, 'learning_rate': 9.338075378050275e-06, 'epoch': 0.74} +{'loss': 134.787, 'grad_norm': 1573.7758694283152, 'learning_rate': 9.333725614199132e-06, 'epoch': 0.74} +{'loss': 135.0665, 'grad_norm': 1698.6463989145811, 'learning_rate': 9.329375976968502e-06, 'epoch': 0.74} +{'loss': 133.6328, 'grad_norm': 1771.8310228820985, 'learning_rate': 9.325026467185007e-06, 'epoch': 0.74} +{'loss': 132.7063, 'grad_norm': 1898.0597700476576, 'learning_rate': 9.320677085675234e-06, 'epoch': 0.74} +{'loss': 134.217, 'grad_norm': 1870.3243465686946, 'learning_rate': 9.316327833265753e-06, 'epoch': 0.74} 
+{'loss': 132.0596, 'grad_norm': 1645.0368894384685, 'learning_rate': 9.311978710783103e-06, 'epoch': 0.74} +{'loss': 133.0246, 'grad_norm': 1458.4651205212258, 'learning_rate': 9.307629719053804e-06, 'epoch': 0.74} +{'loss': 136.6447, 'grad_norm': 1644.9277903113398, 'learning_rate': 9.303280858904356e-06, 'epoch': 0.74} +{'loss': 136.5177, 'grad_norm': 1765.4960606900654, 'learning_rate': 9.298932131161225e-06, 'epoch': 0.74} +{'loss': 136.6786, 'grad_norm': 1574.1089774373738, 'learning_rate': 9.294583536650847e-06, 'epoch': 0.74} +{'loss': 132.3469, 'grad_norm': 2092.5599369007846, 'learning_rate': 9.29023507619965e-06, 'epoch': 0.74} +{'loss': 134.6362, 'grad_norm': 1631.9145497960283, 'learning_rate': 9.285886750634017e-06, 'epoch': 0.74} +{'loss': 132.2115, 'grad_norm': 1667.1574617188276, 'learning_rate': 9.281538560780322e-06, 'epoch': 0.74} +{'loss': 132.0419, 'grad_norm': 1766.4091756069524, 'learning_rate': 9.277190507464909e-06, 'epoch': 0.74} +{'loss': 130.7984, 'grad_norm': 1666.9795926326951, 'learning_rate': 9.272842591514083e-06, 'epoch': 0.74} +{'loss': 129.1928, 'grad_norm': 1559.999721940786, 'learning_rate': 9.268494813754146e-06, 'epoch': 0.74} +{'loss': 132.5813, 'grad_norm': 1670.829269160357, 'learning_rate': 9.264147175011347e-06, 'epoch': 0.74} +{'loss': 130.4075, 'grad_norm': 1583.166800985242, 'learning_rate': 9.259799676111934e-06, 'epoch': 0.74} +{'loss': 135.3261, 'grad_norm': 1629.0804145897666, 'learning_rate': 9.255452317882114e-06, 'epoch': 0.74} +{'loss': 129.2303, 'grad_norm': 1688.0889128150322, 'learning_rate': 9.251105101148067e-06, 'epoch': 0.74} +{'loss': 135.3445, 'grad_norm': 1754.576367123282, 'learning_rate': 9.246758026735954e-06, 'epoch': 0.74} +{'loss': 136.3952, 'grad_norm': 1655.7363143479477, 'learning_rate': 9.242411095471897e-06, 'epoch': 0.74} +{'loss': 132.7265, 'grad_norm': 1529.2404803286092, 'learning_rate': 9.238064308182005e-06, 'epoch': 0.74} +{'loss': 132.2482, 'grad_norm': 1888.6984575814054, 'learning_rate': 9.233717665692354e-06, 'epoch': 0.74} +{'loss': 133.9466, 'grad_norm': 1566.7986242428462, 'learning_rate': 9.229371168828985e-06, 'epoch': 0.74} +{'loss': 131.8572, 'grad_norm': 1402.244767417885, 'learning_rate': 9.22502481841792e-06, 'epoch': 0.74} +{'loss': 134.5208, 'grad_norm': 1861.9420649024826, 'learning_rate': 9.22067861528515e-06, 'epoch': 0.74} +{'loss': 130.1202, 'grad_norm': 1461.7223454147309, 'learning_rate': 9.21633256025664e-06, 'epoch': 0.74} +{'loss': 159.5065, 'grad_norm': 1611.5637210164825, 'learning_rate': 9.21198665415833e-06, 'epoch': 0.74} +{'loss': 133.1259, 'grad_norm': 1653.190621182001, 'learning_rate': 9.207640897816119e-06, 'epoch': 0.74} +{'loss': 131.0967, 'grad_norm': 1698.905129273534, 'learning_rate': 9.203295292055891e-06, 'epoch': 0.74} +{'loss': 130.6653, 'grad_norm': 1693.3877965636982, 'learning_rate': 9.198949837703493e-06, 'epoch': 0.74} +{'loss': 132.3194, 'grad_norm': 1844.0498026345351, 'learning_rate': 9.194604535584748e-06, 'epoch': 0.74} +{'loss': 133.4348, 'grad_norm': 1781.2297597911147, 'learning_rate': 9.19025938652545e-06, 'epoch': 0.74} +{'loss': 132.4417, 'grad_norm': 1549.1885090545627, 'learning_rate': 9.185914391351361e-06, 'epoch': 0.74} +{'loss': 133.0845, 'grad_norm': 1647.6019205022849, 'learning_rate': 9.181569550888217e-06, 'epoch': 0.74} +{'loss': 133.2928, 'grad_norm': 1698.5506108623963, 'learning_rate': 9.177224865961718e-06, 'epoch': 0.74} +{'loss': 131.9346, 'grad_norm': 2300.5139033593755, 'learning_rate': 9.172880337397541e-06, 'epoch': 0.74} 
+{'loss': 136.6233, 'grad_norm': 1744.917026989413, 'learning_rate': 9.168535966021337e-06, 'epoch': 0.74} +{'loss': 138.3555, 'grad_norm': 1681.1035757609736, 'learning_rate': 9.164191752658715e-06, 'epoch': 0.74} +{'loss': 127.864, 'grad_norm': 1919.9617847504362, 'learning_rate': 9.159847698135265e-06, 'epoch': 0.74} +{'loss': 134.6656, 'grad_norm': 1497.7728366288418, 'learning_rate': 9.155503803276537e-06, 'epoch': 0.74} +{'loss': 133.8895, 'grad_norm': 1598.7479836493267, 'learning_rate': 9.15116006890806e-06, 'epoch': 0.74} +{'loss': 130.9259, 'grad_norm': 1696.914187215561, 'learning_rate': 9.146816495855331e-06, 'epoch': 0.74} +{'loss': 134.2596, 'grad_norm': 1830.399461734299, 'learning_rate': 9.142473084943808e-06, 'epoch': 0.74} +{'loss': 130.6659, 'grad_norm': 2414.139517011379, 'learning_rate': 9.13812983699893e-06, 'epoch': 0.75} +{'loss': 135.048, 'grad_norm': 1696.1676382206283, 'learning_rate': 9.13378675284609e-06, 'epoch': 0.75} +{'loss': 131.511, 'grad_norm': 1600.2812817306838, 'learning_rate': 9.12944383331067e-06, 'epoch': 0.75} +{'loss': 134.0345, 'grad_norm': 1539.701852076236, 'learning_rate': 9.125101079218004e-06, 'epoch': 0.75} +{'loss': 136.5926, 'grad_norm': 1940.9699661247726, 'learning_rate': 9.120758491393402e-06, 'epoch': 0.75} +{'loss': 130.3646, 'grad_norm': 1697.920226788632, 'learning_rate': 9.116416070662138e-06, 'epoch': 0.75} +{'loss': 131.5427, 'grad_norm': 1756.196586144351, 'learning_rate': 9.112073817849457e-06, 'epoch': 0.75} +{'loss': 137.2617, 'grad_norm': 1969.1738827569213, 'learning_rate': 9.107731733780575e-06, 'epoch': 0.75} +{'loss': 134.168, 'grad_norm': 1554.9528348129113, 'learning_rate': 9.103389819280673e-06, 'epoch': 0.75} +{'loss': 129.6521, 'grad_norm': 1688.4534485285121, 'learning_rate': 9.099048075174898e-06, 'epoch': 0.75} +{'loss': 133.6506, 'grad_norm': 1604.4722333344612, 'learning_rate': 9.094706502288365e-06, 'epoch': 0.75} +{'loss': 133.4377, 'grad_norm': 1590.1136121787272, 'learning_rate': 9.090365101446157e-06, 'epoch': 0.75} +{'loss': 133.7125, 'grad_norm': 1784.7435696251828, 'learning_rate': 9.086023873473333e-06, 'epoch': 0.75} +{'loss': 133.6592, 'grad_norm': 1785.8053724111726, 'learning_rate': 9.081682819194902e-06, 'epoch': 0.75} +{'loss': 138.9854, 'grad_norm': 1725.7558855531781, 'learning_rate': 9.077341939435857e-06, 'epoch': 0.75} +{'loss': 130.4649, 'grad_norm': 1610.24871440955, 'learning_rate': 9.073001235021143e-06, 'epoch': 0.75} +{'loss': 130.7896, 'grad_norm': 1824.4517867592551, 'learning_rate': 9.068660706775678e-06, 'epoch': 0.75} +{'loss': 130.678, 'grad_norm': 2601.036797158233, 'learning_rate': 9.064320355524358e-06, 'epoch': 0.75} +{'loss': 128.3528, 'grad_norm': 1486.3674455305036, 'learning_rate': 9.059980182092022e-06, 'epoch': 0.75} +{'loss': 131.4546, 'grad_norm': 1520.6404115983999, 'learning_rate': 9.055640187303499e-06, 'epoch': 0.75} +{'loss': 135.4277, 'grad_norm': 1781.1883427371736, 'learning_rate': 9.051300371983564e-06, 'epoch': 0.75} +{'loss': 133.4824, 'grad_norm': 1737.8024479201345, 'learning_rate': 9.046960736956967e-06, 'epoch': 0.75} +{'loss': 133.2053, 'grad_norm': 1538.3463668018196, 'learning_rate': 9.042621283048432e-06, 'epoch': 0.75} +{'loss': 132.4976, 'grad_norm': 1641.4604423023993, 'learning_rate': 9.03828201108263e-06, 'epoch': 0.75} +{'loss': 137.2259, 'grad_norm': 4250.245727957537, 'learning_rate': 9.033942921884217e-06, 'epoch': 0.75} +{'loss': 137.9955, 'grad_norm': 2082.3920253001875, 'learning_rate': 9.029604016277798e-06, 'epoch': 0.75} +{'loss': 
134.4364, 'grad_norm': 1897.6415014608683, 'learning_rate': 9.025265295087948e-06, 'epoch': 0.75} +{'loss': 129.7424, 'grad_norm': 1864.4271714274885, 'learning_rate': 9.02092675913922e-06, 'epoch': 0.75} +{'loss': 130.4572, 'grad_norm': 1619.0395508230577, 'learning_rate': 9.016588409256108e-06, 'epoch': 0.75} +{'loss': 129.7647, 'grad_norm': 1706.603165401835, 'learning_rate': 9.012250246263091e-06, 'epoch': 0.75} +{'loss': 127.6296, 'grad_norm': 1671.7141664862538, 'learning_rate': 9.007912270984602e-06, 'epoch': 0.75} +{'loss': 133.6934, 'grad_norm': 1699.6328574716208, 'learning_rate': 9.003574484245043e-06, 'epoch': 0.75} +{'loss': 132.665, 'grad_norm': 1455.596827931348, 'learning_rate': 8.999236886868772e-06, 'epoch': 0.75} +{'loss': 131.7238, 'grad_norm': 1639.6505884010865, 'learning_rate': 8.994899479680123e-06, 'epoch': 0.75} +{'loss': 132.493, 'grad_norm': 1801.8482664495561, 'learning_rate': 8.990562263503392e-06, 'epoch': 0.75} +{'loss': 129.816, 'grad_norm': 1409.3401150287198, 'learning_rate': 8.986225239162828e-06, 'epoch': 0.75} +{'loss': 128.3966, 'grad_norm': 1786.5616960260384, 'learning_rate': 8.98188840748265e-06, 'epoch': 0.75} +{'loss': 131.0943, 'grad_norm': 1787.5524691656167, 'learning_rate': 8.977551769287042e-06, 'epoch': 0.75} +{'loss': 130.4425, 'grad_norm': 1584.8417979778308, 'learning_rate': 8.973215325400155e-06, 'epoch': 0.75} +{'loss': 135.8719, 'grad_norm': 1680.0358191608134, 'learning_rate': 8.968879076646093e-06, 'epoch': 0.75} +{'loss': 131.9893, 'grad_norm': 1675.9889792345546, 'learning_rate': 8.964543023848929e-06, 'epoch': 0.75} +{'loss': 134.7655, 'grad_norm': 1826.147470041572, 'learning_rate': 8.960207167832697e-06, 'epoch': 0.75} +{'loss': 135.7036, 'grad_norm': 1611.702137610004, 'learning_rate': 8.955871509421394e-06, 'epoch': 0.75} +{'loss': 130.4307, 'grad_norm': 1453.847898978802, 'learning_rate': 8.951536049438983e-06, 'epoch': 0.75} +{'loss': 130.65, 'grad_norm': 1615.4972828210284, 'learning_rate': 8.947200788709384e-06, 'epoch': 0.75} +{'loss': 133.4856, 'grad_norm': 1638.850957895472, 'learning_rate': 8.942865728056482e-06, 'epoch': 0.75} +{'loss': 128.6004, 'grad_norm': 2295.9070476062147, 'learning_rate': 8.938530868304121e-06, 'epoch': 0.75} +{'loss': 132.1774, 'grad_norm': 1606.5546733798938, 'learning_rate': 8.934196210276108e-06, 'epoch': 0.75} +{'loss': 130.7429, 'grad_norm': 2059.1712287293776, 'learning_rate': 8.929861754796218e-06, 'epoch': 0.75} +{'loss': 131.0208, 'grad_norm': 1967.318604111726, 'learning_rate': 8.925527502688179e-06, 'epoch': 0.75} +{'loss': 131.1323, 'grad_norm': 1628.1552489816718, 'learning_rate': 8.921193454775685e-06, 'epoch': 0.75} +{'loss': 132.7692, 'grad_norm': 1691.551155391383, 'learning_rate': 8.916859611882387e-06, 'epoch': 0.75} +{'loss': 146.0798, 'grad_norm': 2156.703584226341, 'learning_rate': 8.9125259748319e-06, 'epoch': 0.76} +{'loss': 133.3255, 'grad_norm': 1707.899593562021, 'learning_rate': 8.908192544447803e-06, 'epoch': 0.76} +{'loss': 134.6167, 'grad_norm': 1572.8718108254664, 'learning_rate': 8.90385932155363e-06, 'epoch': 0.76} +{'loss': 129.9158, 'grad_norm': 1478.169188092067, 'learning_rate': 8.89952630697288e-06, 'epoch': 0.76} +{'loss': 127.8316, 'grad_norm': 1523.6923881311711, 'learning_rate': 8.895193501529004e-06, 'epoch': 0.76} +{'loss': 132.3945, 'grad_norm': 1637.8874422786173, 'learning_rate': 8.890860906045425e-06, 'epoch': 0.76} +{'loss': 130.7612, 'grad_norm': 1725.1967271151373, 'learning_rate': 8.886528521345523e-06, 'epoch': 0.76} +{'loss': 132.0726, 
'grad_norm': 1751.477638929531, 'learning_rate': 8.882196348252631e-06, 'epoch': 0.76} +{'loss': 129.7743, 'grad_norm': 1679.1125190733503, 'learning_rate': 8.877864387590049e-06, 'epoch': 0.76} +{'loss': 132.7142, 'grad_norm': 1699.867649573446, 'learning_rate': 8.87353264018103e-06, 'epoch': 0.76} +{'loss': 131.9164, 'grad_norm': 1562.3399300672952, 'learning_rate': 8.869201106848794e-06, 'epoch': 0.76} +{'loss': 132.5071, 'grad_norm': 1430.2096658892892, 'learning_rate': 8.86486978841652e-06, 'epoch': 0.76} +{'loss': 132.6512, 'grad_norm': 1662.5330073911257, 'learning_rate': 8.860538685707337e-06, 'epoch': 0.76} +{'loss': 129.5864, 'grad_norm': 1685.5579115550604, 'learning_rate': 8.856207799544342e-06, 'epoch': 0.76} +{'loss': 130.5222, 'grad_norm': 2426.334656371504, 'learning_rate': 8.851877130750589e-06, 'epoch': 0.76} +{'loss': 131.3952, 'grad_norm': 1525.0126011605855, 'learning_rate': 8.84754668014908e-06, 'epoch': 0.76} +{'loss': 131.3839, 'grad_norm': 1420.968940822617, 'learning_rate': 8.8432164485628e-06, 'epoch': 0.76} +{'loss': 132.3306, 'grad_norm': 1850.135487862666, 'learning_rate': 8.838886436814668e-06, 'epoch': 0.76} +{'loss': 129.4526, 'grad_norm': 1567.958436147906, 'learning_rate': 8.834556645727574e-06, 'epoch': 0.76} +{'loss': 133.1971, 'grad_norm': 1874.5672580963744, 'learning_rate': 8.830227076124363e-06, 'epoch': 0.76} +{'loss': 132.8974, 'grad_norm': 1788.8044712982567, 'learning_rate': 8.82589772882783e-06, 'epoch': 0.76} +{'loss': 131.7947, 'grad_norm': 1506.660533049747, 'learning_rate': 8.821568604660747e-06, 'epoch': 0.76} +{'loss': 130.4957, 'grad_norm': 1614.4738449935312, 'learning_rate': 8.817239704445827e-06, 'epoch': 0.76} +{'loss': 136.2041, 'grad_norm': 1535.149274313881, 'learning_rate': 8.812911029005744e-06, 'epoch': 0.76} +{'loss': 128.5828, 'grad_norm': 1831.8152148964468, 'learning_rate': 8.808582579163135e-06, 'epoch': 0.76} +{'loss': 129.5588, 'grad_norm': 1458.6850283566882, 'learning_rate': 8.80425435574058e-06, 'epoch': 0.76} +{'loss': 131.7948, 'grad_norm': 2028.3540700676895, 'learning_rate': 8.799926359560637e-06, 'epoch': 0.76} +{'loss': 131.3459, 'grad_norm': 1590.1801148012273, 'learning_rate': 8.795598591445806e-06, 'epoch': 0.76} +{'loss': 129.9112, 'grad_norm': 1615.8276787457767, 'learning_rate': 8.791271052218543e-06, 'epoch': 0.76} +{'loss': 126.4167, 'grad_norm': 1638.7365182527408, 'learning_rate': 8.786943742701273e-06, 'epoch': 0.76} +{'loss': 131.6862, 'grad_norm': 1556.4134521899794, 'learning_rate': 8.782616663716359e-06, 'epoch': 0.76} +{'loss': 132.4371, 'grad_norm': 1493.4610036911256, 'learning_rate': 8.778289816086138e-06, 'epoch': 0.76} +{'loss': 131.8175, 'grad_norm': 1618.5742006954085, 'learning_rate': 8.773963200632896e-06, 'epoch': 0.76} +{'loss': 133.0187, 'grad_norm': 2410.820436325282, 'learning_rate': 8.769636818178868e-06, 'epoch': 0.76} +{'loss': 132.1811, 'grad_norm': 1673.8037004571927, 'learning_rate': 8.765310669546258e-06, 'epoch': 0.76} +{'loss': 132.3051, 'grad_norm': 1796.050321846637, 'learning_rate': 8.76098475555721e-06, 'epoch': 0.76} +{'loss': 131.7867, 'grad_norm': 1485.1309674925224, 'learning_rate': 8.756659077033838e-06, 'epoch': 0.76} +{'loss': 130.1118, 'grad_norm': 1744.3789924613393, 'learning_rate': 8.752333634798206e-06, 'epoch': 0.76} +{'loss': 128.5506, 'grad_norm': 1519.9043830280182, 'learning_rate': 8.748008429672328e-06, 'epoch': 0.76} +{'loss': 128.0115, 'grad_norm': 1452.975418253918, 'learning_rate': 8.743683462478182e-06, 'epoch': 0.76} +{'loss': 127.8688, 
'grad_norm': 1774.1257343746165, 'learning_rate': 8.739358734037691e-06, 'epoch': 0.76} +{'loss': 129.7165, 'grad_norm': 1582.3646239881145, 'learning_rate': 8.735034245172741e-06, 'epoch': 0.76} +{'loss': 132.1426, 'grad_norm': 2179.5580358027114, 'learning_rate': 8.73070999670517e-06, 'epoch': 0.76} +{'loss': 132.0173, 'grad_norm': 1505.466151726762, 'learning_rate': 8.726385989456764e-06, 'epoch': 0.76} +{'loss': 128.9449, 'grad_norm': 1629.6219552085013, 'learning_rate': 8.722062224249278e-06, 'epoch': 0.76} +{'loss': 132.9419, 'grad_norm': 2109.5104494179905, 'learning_rate': 8.717738701904397e-06, 'epoch': 0.76} +{'loss': 132.1956, 'grad_norm': 1756.7172286815808, 'learning_rate': 8.713415423243788e-06, 'epoch': 0.76} +{'loss': 128.1289, 'grad_norm': 1590.8854105419673, 'learning_rate': 8.709092389089056e-06, 'epoch': 0.76} +{'loss': 132.9965, 'grad_norm': 1615.12594578012, 'learning_rate': 8.704769600261755e-06, 'epoch': 0.76} +{'loss': 133.9962, 'grad_norm': 1699.8091618386354, 'learning_rate': 8.700447057583406e-06, 'epoch': 0.76} +{'loss': 129.0598, 'grad_norm': 1574.4721523451808, 'learning_rate': 8.696124761875467e-06, 'epoch': 0.76} +{'loss': 129.5918, 'grad_norm': 1697.6646471242923, 'learning_rate': 8.69180271395937e-06, 'epoch': 0.76} +{'loss': 133.57, 'grad_norm': 1512.180579727973, 'learning_rate': 8.687480914656481e-06, 'epoch': 0.77} +{'loss': 130.0683, 'grad_norm': 2070.827734457714, 'learning_rate': 8.68315936478813e-06, 'epoch': 0.77} +{'loss': 135.922, 'grad_norm': 1643.318071977263, 'learning_rate': 8.67883806517559e-06, 'epoch': 0.77} +{'loss': 137.2659, 'grad_norm': 1836.084299326771, 'learning_rate': 8.674517016640092e-06, 'epoch': 0.77} +{'loss': 128.8948, 'grad_norm': 1889.0186487821106, 'learning_rate': 8.670196220002828e-06, 'epoch': 0.77} +{'loss': 131.4234, 'grad_norm': 1796.7153522848762, 'learning_rate': 8.665875676084927e-06, 'epoch': 0.77} +{'loss': 135.5963, 'grad_norm': 1809.1240698363856, 'learning_rate': 8.661555385707478e-06, 'epoch': 0.77} +{'loss': 128.4287, 'grad_norm': 1984.6192277267771, 'learning_rate': 8.657235349691518e-06, 'epoch': 0.77} +{'loss': 135.7073, 'grad_norm': 1643.918717238712, 'learning_rate': 8.652915568858036e-06, 'epoch': 0.77} +{'loss': 128.5447, 'grad_norm': 1825.333980410054, 'learning_rate': 8.648596044027983e-06, 'epoch': 0.77} +{'loss': 131.0391, 'grad_norm': 1684.9550882845526, 'learning_rate': 8.644276776022244e-06, 'epoch': 0.77} +{'loss': 131.3867, 'grad_norm': 1810.1791683590463, 'learning_rate': 8.639957765661671e-06, 'epoch': 0.77} +{'loss': 126.5002, 'grad_norm': 1712.5676252486196, 'learning_rate': 8.635639013767053e-06, 'epoch': 0.77} +{'loss': 131.7164, 'grad_norm': 1883.926929809946, 'learning_rate': 8.631320521159138e-06, 'epoch': 0.77} +{'loss': 128.2448, 'grad_norm': 1704.3851961451867, 'learning_rate': 8.62700228865863e-06, 'epoch': 0.77} +{'loss': 127.563, 'grad_norm': 1538.3680968856172, 'learning_rate': 8.622684317086171e-06, 'epoch': 0.77} +{'loss': 130.5917, 'grad_norm': 1703.1712416488322, 'learning_rate': 8.618366607262363e-06, 'epoch': 0.77} +{'loss': 130.243, 'grad_norm': 1627.6393211778998, 'learning_rate': 8.61404916000775e-06, 'epoch': 0.77} +{'loss': 131.9793, 'grad_norm': 1708.6823563383064, 'learning_rate': 8.609731976142833e-06, 'epoch': 0.77} +{'loss': 129.8181, 'grad_norm': 1954.1158284865514, 'learning_rate': 8.605415056488067e-06, 'epoch': 0.77} +{'loss': 132.0327, 'grad_norm': 1517.8714012766236, 'learning_rate': 8.60109840186384e-06, 'epoch': 0.77} +{'loss': 130.2826, 'grad_norm': 
1598.3858090239141, 'learning_rate': 8.596782013090513e-06, 'epoch': 0.77} +{'loss': 130.1667, 'grad_norm': 1533.918627384563, 'learning_rate': 8.592465890988369e-06, 'epoch': 0.77} +[2025-05-29 06:17:19,202] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step40000 is about to be saved! +[2025-05-29 06:17:19,245] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-40000/global_step40000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-29 06:17:19,245] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-40000/global_step40000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-29 06:17:19,382] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-40000/global_step40000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-29 06:17:19,452] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-40000/global_step40000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-29 06:23:17,895] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-40000/global_step40000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2025-05-29 06:23:17,939] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-40000/global_step40000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-29 06:23:26,882] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step40000 is ready now! +[2025-05-29 06:23:26,938] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step40000 is about to be saved! +[2025-05-29 06:23:26,961] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-40000/global_step40000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-29 06:23:26,961] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-40000/global_step40000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-29 06:23:27,059] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-40000/global_step40000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-29 06:23:27,277] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-40000/global_step40000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-29 06:28:54,315] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-40000/global_step40000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2025-05-29 06:28:54,484] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-40000/global_step40000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-29 06:29:43,714] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step40000 is ready now! 
+{'loss': 129.6665, 'grad_norm': 1535.2441178894135, 'learning_rate': 8.588150036377663e-06, 'epoch': 0.77} +{'loss': 129.782, 'grad_norm': 1695.4364649559895, 'learning_rate': 8.583834450078594e-06, 'epoch': 0.77} +{'loss': 134.0368, 'grad_norm': 1783.016674421436, 'learning_rate': 8.579519132911303e-06, 'epoch': 0.77} +{'loss': 128.7715, 'grad_norm': 1607.1177178075163, 'learning_rate': 8.575204085695887e-06, 'epoch': 0.77} +{'loss': 132.2363, 'grad_norm': 1635.4402383052527, 'learning_rate': 8.570889309252382e-06, 'epoch': 0.77} +{'loss': 132.373, 'grad_norm': 1659.8656514355653, 'learning_rate': 8.566574804400785e-06, 'epoch': 0.77} +{'loss': 130.6833, 'grad_norm': 1501.167725190025, 'learning_rate': 8.562260571961037e-06, 'epoch': 0.77} +{'loss': 129.5365, 'grad_norm': 1545.5099218309101, 'learning_rate': 8.557946612753023e-06, 'epoch': 0.77} +{'loss': 128.3234, 'grad_norm': 1607.1966263624286, 'learning_rate': 8.553632927596582e-06, 'epoch': 0.77} +{'loss': 153.0209, 'grad_norm': 25232.54813197923, 'learning_rate': 8.549319517311491e-06, 'epoch': 0.77} +{'loss': 131.0825, 'grad_norm': 1665.1024284760406, 'learning_rate': 8.545006382717487e-06, 'epoch': 0.77} +{'loss': 127.6546, 'grad_norm': 1614.7978029647304, 'learning_rate': 8.54069352463425e-06, 'epoch': 0.77} +{'loss': 129.6048, 'grad_norm': 1484.589733767259, 'learning_rate': 8.536380943881409e-06, 'epoch': 0.77} +{'loss': 132.4557, 'grad_norm': 1484.26101599885, 'learning_rate': 8.53206864127853e-06, 'epoch': 0.77} +{'loss': 130.4965, 'grad_norm': 1655.9706682052408, 'learning_rate': 8.527756617645142e-06, 'epoch': 0.77} +{'loss': 138.7995, 'grad_norm': 1874.4194265839367, 'learning_rate': 8.523444873800705e-06, 'epoch': 0.77} +{'loss': 128.0663, 'grad_norm': 1682.4985605088382, 'learning_rate': 8.519133410564642e-06, 'epoch': 0.77} +{'loss': 129.3183, 'grad_norm': 1568.706995294854, 'learning_rate': 8.514822228756311e-06, 'epoch': 0.77} +{'loss': 127.9593, 'grad_norm': 1492.8825253706586, 'learning_rate': 8.510511329195021e-06, 'epoch': 0.77} +{'loss': 128.5023, 'grad_norm': 1617.5452467954399, 'learning_rate': 8.506200712700029e-06, 'epoch': 0.77} +{'loss': 133.2466, 'grad_norm': 1867.7926613566208, 'learning_rate': 8.501890380090524e-06, 'epoch': 0.77} +{'loss': 132.7448, 'grad_norm': 1776.9565535040276, 'learning_rate': 8.497580332185668e-06, 'epoch': 0.77} +{'loss': 128.3704, 'grad_norm': 1520.614066582208, 'learning_rate': 8.493270569804547e-06, 'epoch': 0.77} +{'loss': 128.7557, 'grad_norm': 1728.9396178620095, 'learning_rate': 8.488961093766201e-06, 'epoch': 0.77} +{'loss': 128.1559, 'grad_norm': 1563.1301426391187, 'learning_rate': 8.484651904889614e-06, 'epoch': 0.77} +{'loss': 128.7884, 'grad_norm': 1455.184819595839, 'learning_rate': 8.480343003993708e-06, 'epoch': 0.77} +{'loss': 127.6356, 'grad_norm': 1677.2892285706885, 'learning_rate': 8.476034391897367e-06, 'epoch': 0.77} +{'loss': 133.1514, 'grad_norm': 1501.0015332942028, 'learning_rate': 8.471726069419412e-06, 'epoch': 0.77} +{'loss': 128.4288, 'grad_norm': 1522.0541488395315, 'learning_rate': 8.467418037378599e-06, 'epoch': 0.77} +{'loss': 132.9857, 'grad_norm': 1830.1579691768482, 'learning_rate': 8.463110296593646e-06, 'epoch': 0.78} +{'loss': 127.8534, 'grad_norm': 1451.7665823388338, 'learning_rate': 8.458802847883199e-06, 'epoch': 0.78} +{'loss': 133.0597, 'grad_norm': 1886.1006880289412, 'learning_rate': 8.454495692065862e-06, 'epoch': 0.78} +{'loss': 130.7499, 'grad_norm': 1868.186424161546, 'learning_rate': 8.450188829960181e-06, 'epoch': 0.78} 
+{'loss': 131.7454, 'grad_norm': 1430.9088974005738, 'learning_rate': 8.445882262384638e-06, 'epoch': 0.78} +{'loss': 129.3309, 'grad_norm': 1711.3075893467014, 'learning_rate': 8.441575990157669e-06, 'epoch': 0.78} +{'loss': 126.8048, 'grad_norm': 1559.2160529686043, 'learning_rate': 8.437270014097638e-06, 'epoch': 0.78} +{'loss': 142.3706, 'grad_norm': 1849.9349072700256, 'learning_rate': 8.432964335022878e-06, 'epoch': 0.78} +{'loss': 130.4382, 'grad_norm': 1655.7737822924314, 'learning_rate': 8.428658953751644e-06, 'epoch': 0.78} +{'loss': 131.743, 'grad_norm': 2163.5692444433475, 'learning_rate': 8.424353871102144e-06, 'epoch': 0.78} +{'loss': 129.086, 'grad_norm': 1533.8274423682458, 'learning_rate': 8.420049087892526e-06, 'epoch': 0.78} +{'loss': 129.0577, 'grad_norm': 1639.1236124552704, 'learning_rate': 8.41574460494088e-06, 'epoch': 0.78} +{'loss': 129.6057, 'grad_norm': 1938.39507943742, 'learning_rate': 8.411440423065246e-06, 'epoch': 0.78} +{'loss': 133.1056, 'grad_norm': 1862.9349646031374, 'learning_rate': 8.4071365430836e-06, 'epoch': 0.78} +{'loss': 131.0106, 'grad_norm': 1641.5661171468057, 'learning_rate': 8.40283296581386e-06, 'epoch': 0.78} +{'loss': 129.1779, 'grad_norm': 1549.9096486985127, 'learning_rate': 8.398529692073894e-06, 'epoch': 0.78} +{'loss': 137.0487, 'grad_norm': 2094.5910574048903, 'learning_rate': 8.394226722681498e-06, 'epoch': 0.78} +{'loss': 130.2472, 'grad_norm': 1483.512476626982, 'learning_rate': 8.389924058454431e-06, 'epoch': 0.78} +{'loss': 126.797, 'grad_norm': 1452.6374632655225, 'learning_rate': 8.385621700210378e-06, 'epoch': 0.78} +{'loss': 129.3531, 'grad_norm': 1494.4333244659624, 'learning_rate': 8.381319648766969e-06, 'epoch': 0.78} +{'loss': 127.2211, 'grad_norm': 1447.4153816412838, 'learning_rate': 8.377017904941777e-06, 'epoch': 0.78} +{'loss': 128.4979, 'grad_norm': 1564.5098418534342, 'learning_rate': 8.372716469552314e-06, 'epoch': 0.78} +{'loss': 124.0637, 'grad_norm': 1477.0229903656962, 'learning_rate': 8.368415343416043e-06, 'epoch': 0.78} +{'loss': 128.0336, 'grad_norm': 1873.6225404194695, 'learning_rate': 8.364114527350357e-06, 'epoch': 0.78} +{'loss': 131.4986, 'grad_norm': 1925.8290649246842, 'learning_rate': 8.359814022172596e-06, 'epoch': 0.78} +{'loss': 126.4314, 'grad_norm': 1664.5367531775555, 'learning_rate': 8.355513828700034e-06, 'epoch': 0.78} +{'loss': 133.2571, 'grad_norm': 1729.5053526900303, 'learning_rate': 8.351213947749899e-06, 'epoch': 0.78} +{'loss': 131.8651, 'grad_norm': 1653.6274568484364, 'learning_rate': 8.346914380139343e-06, 'epoch': 0.78} +{'loss': 128.9145, 'grad_norm': 1744.3117654030664, 'learning_rate': 8.342615126685471e-06, 'epoch': 0.78} +{'loss': 127.7878, 'grad_norm': 1402.4221516220841, 'learning_rate': 8.33831618820533e-06, 'epoch': 0.78} +{'loss': 133.3922, 'grad_norm': 1643.742334086713, 'learning_rate': 8.334017565515892e-06, 'epoch': 0.78} +{'loss': 127.6981, 'grad_norm': 1516.1143511532575, 'learning_rate': 8.329719259434085e-06, 'epoch': 0.78} +{'loss': 129.7535, 'grad_norm': 1602.519759935472, 'learning_rate': 8.325421270776762e-06, 'epoch': 0.78} +{'loss': 132.9763, 'grad_norm': 1746.7055534610354, 'learning_rate': 8.321123600360733e-06, 'epoch': 0.78} +{'loss': 128.6338, 'grad_norm': 1433.9024430745485, 'learning_rate': 8.316826249002737e-06, 'epoch': 0.78} +{'loss': 134.3555, 'grad_norm': 1567.0782375477074, 'learning_rate': 8.31252921751945e-06, 'epoch': 0.78} +{'loss': 128.5674, 'grad_norm': 2000.8855983906467, 'learning_rate': 8.308232506727495e-06, 'epoch': 0.78} 
+{'loss': 131.7644, 'grad_norm': 1811.0800681417509, 'learning_rate': 8.303936117443422e-06, 'epoch': 0.78} +{'loss': 129.1688, 'grad_norm': 1644.8907800159154, 'learning_rate': 8.29964005048374e-06, 'epoch': 0.78} +{'loss': 129.5279, 'grad_norm': 1554.2266768101922, 'learning_rate': 8.295344306664877e-06, 'epoch': 0.78} +{'loss': 130.7267, 'grad_norm': 1486.538733662277, 'learning_rate': 8.291048886803208e-06, 'epoch': 0.78} +{'loss': 128.6232, 'grad_norm': 1581.9340583774172, 'learning_rate': 8.286753791715048e-06, 'epoch': 0.78} +{'loss': 126.1842, 'grad_norm': 1959.9149889804894, 'learning_rate': 8.282459022216644e-06, 'epoch': 0.78} +{'loss': 126.4965, 'grad_norm': 1924.6442500601734, 'learning_rate': 8.27816457912419e-06, 'epoch': 0.78} +{'loss': 125.9115, 'grad_norm': 1578.8490055464665, 'learning_rate': 8.273870463253813e-06, 'epoch': 0.78} +{'loss': 130.0689, 'grad_norm': 1700.143936312748, 'learning_rate': 8.269576675421572e-06, 'epoch': 0.78} +{'loss': 125.6895, 'grad_norm': 1659.4803142447718, 'learning_rate': 8.265283216443476e-06, 'epoch': 0.78} +{'loss': 125.5708, 'grad_norm': 1589.9676514499536, 'learning_rate': 8.26099008713546e-06, 'epoch': 0.78} +{'loss': 127.6005, 'grad_norm': 1527.491450768075, 'learning_rate': 8.256697288313405e-06, 'epoch': 0.78} +{'loss': 130.7187, 'grad_norm': 2330.8697840921263, 'learning_rate': 8.252404820793127e-06, 'epoch': 0.78} +{'loss': 128.6177, 'grad_norm': 1608.7764736995632, 'learning_rate': 8.248112685390373e-06, 'epoch': 0.78} +{'loss': 128.2769, 'grad_norm': 1672.1190839561416, 'learning_rate': 8.243820882920837e-06, 'epoch': 0.78} +{'loss': 128.5455, 'grad_norm': 1846.6048895227414, 'learning_rate': 8.239529414200137e-06, 'epoch': 0.79} +{'loss': 127.9481, 'grad_norm': 1504.392959128216, 'learning_rate': 8.23523828004384e-06, 'epoch': 0.79} +{'loss': 132.2335, 'grad_norm': 1589.8050079398877, 'learning_rate': 8.230947481267446e-06, 'epoch': 0.79} +{'loss': 125.9592, 'grad_norm': 1488.9157008770449, 'learning_rate': 8.226657018686388e-06, 'epoch': 0.79} +{'loss': 130.6015, 'grad_norm': 1827.0689729015671, 'learning_rate': 8.222366893116032e-06, 'epoch': 0.79} +{'loss': 128.6202, 'grad_norm': 1679.6224415973506, 'learning_rate': 8.218077105371687e-06, 'epoch': 0.79} +{'loss': 129.4173, 'grad_norm': 1763.9724255606934, 'learning_rate': 8.213787656268599e-06, 'epoch': 0.79} +{'loss': 130.2009, 'grad_norm': 1655.9653611601955, 'learning_rate': 8.209498546621943e-06, 'epoch': 0.79} +{'loss': 125.7963, 'grad_norm': 1427.4890717516316, 'learning_rate': 8.205209777246835e-06, 'epoch': 0.79} +{'loss': 127.8344, 'grad_norm': 1498.4569722268195, 'learning_rate': 8.20092134895832e-06, 'epoch': 0.79} +{'loss': 127.3965, 'grad_norm': 1468.7215423803311, 'learning_rate': 8.196633262571381e-06, 'epoch': 0.79} +{'loss': 129.8786, 'grad_norm': 1765.5117395149623, 'learning_rate': 8.192345518900946e-06, 'epoch': 0.79} +{'loss': 127.9837, 'grad_norm': 1682.9746100330538, 'learning_rate': 8.18805811876186e-06, 'epoch': 0.79} +{'loss': 128.2103, 'grad_norm': 1703.753719191167, 'learning_rate': 8.183771062968917e-06, 'epoch': 0.79} +{'loss': 127.7026, 'grad_norm': 1577.7793304929905, 'learning_rate': 8.179484352336836e-06, 'epoch': 0.79} +{'loss': 128.5767, 'grad_norm': 1638.2396023785864, 'learning_rate': 8.175197987680274e-06, 'epoch': 0.79} +{'loss': 131.62, 'grad_norm': 1803.8021044229345, 'learning_rate': 8.170911969813829e-06, 'epoch': 0.79} +{'loss': 126.763, 'grad_norm': 1628.9197788560998, 'learning_rate': 8.166626299552021e-06, 'epoch': 0.79} 
+{'loss': 127.9237, 'grad_norm': 1560.7629661565875, 'learning_rate': 8.162340977709317e-06, 'epoch': 0.79} +{'loss': 126.3383, 'grad_norm': 1621.6988247106679, 'learning_rate': 8.158056005100103e-06, 'epoch': 0.79} +{'loss': 129.5781, 'grad_norm': 1507.8741573711202, 'learning_rate': 8.153771382538706e-06, 'epoch': 0.79} +{'loss': 125.5846, 'grad_norm': 1555.366348100434, 'learning_rate': 8.149487110839394e-06, 'epoch': 0.79} +{'loss': 124.4258, 'grad_norm': 1564.051000168742, 'learning_rate': 8.145203190816356e-06, 'epoch': 0.79} +{'loss': 133.5503, 'grad_norm': 1782.1633708600102, 'learning_rate': 8.140919623283723e-06, 'epoch': 0.79} +{'loss': 128.9898, 'grad_norm': 1697.1921049340046, 'learning_rate': 8.13663640905555e-06, 'epoch': 0.79} +{'loss': 133.2102, 'grad_norm': 1564.4725900208891, 'learning_rate': 8.132353548945832e-06, 'epoch': 0.79} +{'loss': 129.7739, 'grad_norm': 1646.8871975507234, 'learning_rate': 8.1280710437685e-06, 'epoch': 0.79} +{'loss': 131.6024, 'grad_norm': 1512.312353923598, 'learning_rate': 8.123788894337405e-06, 'epoch': 0.79} +{'loss': 128.7256, 'grad_norm': 2294.521423390997, 'learning_rate': 8.119507101466343e-06, 'epoch': 0.79} +{'loss': 129.6879, 'grad_norm': 1704.499051910976, 'learning_rate': 8.115225665969033e-06, 'epoch': 0.79} +{'loss': 126.572, 'grad_norm': 1607.7733269003731, 'learning_rate': 8.11094458865913e-06, 'epoch': 0.79} +{'loss': 125.5744, 'grad_norm': 1937.8226218446853, 'learning_rate': 8.106663870350228e-06, 'epoch': 0.79} +{'loss': 128.5441, 'grad_norm': 2072.851263773231, 'learning_rate': 8.102383511855841e-06, 'epoch': 0.79} +{'loss': 126.9567, 'grad_norm': 1565.170126081928, 'learning_rate': 8.098103513989419e-06, 'epoch': 0.79} +{'loss': 129.1505, 'grad_norm': 1440.7323838273778, 'learning_rate': 8.093823877564343e-06, 'epoch': 0.79} +{'loss': 128.6439, 'grad_norm': 1624.1210596392546, 'learning_rate': 8.089544603393925e-06, 'epoch': 0.79} +{'loss': 127.1312, 'grad_norm': 1668.384501719523, 'learning_rate': 8.085265692291418e-06, 'epoch': 0.79} +{'loss': 127.8872, 'grad_norm': 1703.7964645507266, 'learning_rate': 8.08098714506999e-06, 'epoch': 0.79} +{'loss': 126.721, 'grad_norm': 1722.8526684709943, 'learning_rate': 8.076708962542746e-06, 'epoch': 0.79} +{'loss': 129.0685, 'grad_norm': 1569.7095606158648, 'learning_rate': 8.072431145522728e-06, 'epoch': 0.79} +{'loss': 123.9822, 'grad_norm': 1578.5278433716014, 'learning_rate': 8.068153694822898e-06, 'epoch': 0.79} +{'loss': 131.8985, 'grad_norm': 1757.2066207073913, 'learning_rate': 8.063876611256158e-06, 'epoch': 0.79} +{'loss': 129.041, 'grad_norm': 1662.1347547373052, 'learning_rate': 8.059599895635336e-06, 'epoch': 0.79} +{'loss': 126.952, 'grad_norm': 1646.4957614909092, 'learning_rate': 8.055323548773189e-06, 'epoch': 0.79} +{'loss': 126.9427, 'grad_norm': 1655.697334546598, 'learning_rate': 8.051047571482406e-06, 'epoch': 0.79} +{'loss': 127.1268, 'grad_norm': 1546.417302696872, 'learning_rate': 8.046771964575599e-06, 'epoch': 0.79} +{'loss': 128.7763, 'grad_norm': 1503.5129440524172, 'learning_rate': 8.042496728865323e-06, 'epoch': 0.79} +{'loss': 127.8047, 'grad_norm': 1553.7674416068853, 'learning_rate': 8.038221865164055e-06, 'epoch': 0.79} +{'loss': 131.2558, 'grad_norm': 1628.0570420505821, 'learning_rate': 8.033947374284196e-06, 'epoch': 0.79} +{'loss': 131.7196, 'grad_norm': 1518.8667678095242, 'learning_rate': 8.029673257038087e-06, 'epoch': 0.79} +{'loss': 125.7581, 'grad_norm': 1545.6674284093026, 'learning_rate': 8.025399514237983e-06, 'epoch': 0.79} +{'loss': 
131.9953, 'grad_norm': 1759.3160718011432, 'learning_rate': 8.021126146696087e-06, 'epoch': 0.79} +{'loss': 125.8093, 'grad_norm': 1516.4873906881303, 'learning_rate': 8.01685315522452e-06, 'epoch': 0.8} +{'loss': 125.7774, 'grad_norm': 1715.8937772641202, 'learning_rate': 8.012580540635327e-06, 'epoch': 0.8} +{'loss': 124.1942, 'grad_norm': 1740.1668325590304, 'learning_rate': 8.00830830374049e-06, 'epoch': 0.8} +{'loss': 128.5427, 'grad_norm': 1759.6735006127362, 'learning_rate': 8.00403644535191e-06, 'epoch': 0.8} +{'loss': 129.0019, 'grad_norm': 1462.6867735704213, 'learning_rate': 7.999764966281433e-06, 'epoch': 0.8} +{'loss': 132.009, 'grad_norm': 1789.100475572893, 'learning_rate': 7.995493867340817e-06, 'epoch': 0.8} +{'loss': 125.8639, 'grad_norm': 1472.4852246571959, 'learning_rate': 7.991223149341751e-06, 'epoch': 0.8} +{'loss': 125.6485, 'grad_norm': 1565.6883159840627, 'learning_rate': 7.986952813095856e-06, 'epoch': 0.8} +{'loss': 126.9964, 'grad_norm': 1631.5085351767043, 'learning_rate': 7.982682859414671e-06, 'epoch': 0.8} +{'loss': 128.2371, 'grad_norm': 1639.399892054316, 'learning_rate': 7.978413289109677e-06, 'epoch': 0.8} +{'loss': 129.6715, 'grad_norm': 1934.2482311708784, 'learning_rate': 7.974144102992273e-06, 'epoch': 0.8} +{'loss': 127.7244, 'grad_norm': 1560.5776046983776, 'learning_rate': 7.969875301873785e-06, 'epoch': 0.8} +{'loss': 125.4116, 'grad_norm': 1505.3060851714977, 'learning_rate': 7.965606886565468e-06, 'epoch': 0.8} +{'loss': 129.333, 'grad_norm': 1611.7635074842817, 'learning_rate': 7.961338857878496e-06, 'epoch': 0.8} +{'loss': 130.1073, 'grad_norm': 1552.214862550896, 'learning_rate': 7.957071216623987e-06, 'epoch': 0.8} +{'loss': 130.8976, 'grad_norm': 1439.3427206940357, 'learning_rate': 7.952803963612968e-06, 'epoch': 0.8} +{'loss': 127.8988, 'grad_norm': 1413.6946209785167, 'learning_rate': 7.9485370996564e-06, 'epoch': 0.8} +{'loss': 128.4278, 'grad_norm': 1663.432807836346, 'learning_rate': 7.94427062556517e-06, 'epoch': 0.8} +{'loss': 127.3407, 'grad_norm': 1540.7220325362725, 'learning_rate': 7.940004542150087e-06, 'epoch': 0.8} +{'loss': 129.464, 'grad_norm': 1546.9175450398548, 'learning_rate': 7.935738850221892e-06, 'epoch': 0.8} +{'loss': 129.1928, 'grad_norm': 1473.8300322340376, 'learning_rate': 7.93147355059125e-06, 'epoch': 0.8} +{'loss': 129.5689, 'grad_norm': 1454.1447306424348, 'learning_rate': 7.927208644068743e-06, 'epoch': 0.8} +{'loss': 128.2433, 'grad_norm': 1502.4387471360558, 'learning_rate': 7.922944131464889e-06, 'epoch': 0.8} +{'loss': 125.324, 'grad_norm': 1884.304383574987, 'learning_rate': 7.918680013590123e-06, 'epoch': 0.8} +{'loss': 128.942, 'grad_norm': 1705.3288949162272, 'learning_rate': 7.914416291254817e-06, 'epoch': 0.8} +{'loss': 129.3879, 'grad_norm': 1600.4067398470738, 'learning_rate': 7.910152965269253e-06, 'epoch': 0.8} +{'loss': 128.3435, 'grad_norm': 1480.9363669937468, 'learning_rate': 7.90589003644365e-06, 'epoch': 0.8} +{'loss': 126.2803, 'grad_norm': 1685.1155066905642, 'learning_rate': 7.901627505588138e-06, 'epoch': 0.8} +{'loss': 129.4288, 'grad_norm': 1603.10470119751, 'learning_rate': 7.897365373512786e-06, 'epoch': 0.8} +{'loss': 127.4713, 'grad_norm': 5492.512796893564, 'learning_rate': 7.893103641027581e-06, 'epoch': 0.8} +{'loss': 129.6616, 'grad_norm': 1597.299189956771, 'learning_rate': 7.88884230894243e-06, 'epoch': 0.8} +{'loss': 125.6969, 'grad_norm': 1716.7185453366296, 'learning_rate': 7.884581378067171e-06, 'epoch': 0.8} +{'loss': 128.0051, 'grad_norm': 1649.0299294594179, 
'learning_rate': 7.880320849211558e-06, 'epoch': 0.8} +{'loss': 126.2495, 'grad_norm': 1735.2400584664488, 'learning_rate': 7.876060723185275e-06, 'epoch': 0.8} +{'loss': 134.2595, 'grad_norm': 1894.8171303293045, 'learning_rate': 7.871801000797931e-06, 'epoch': 0.8} +{'loss': 127.4439, 'grad_norm': 1756.7933492018471, 'learning_rate': 7.86754168285905e-06, 'epoch': 0.8} +{'loss': 130.6155, 'grad_norm': 1681.9142463463713, 'learning_rate': 7.86328277017809e-06, 'epoch': 0.8} +{'loss': 129.2937, 'grad_norm': 1598.2223062253597, 'learning_rate': 7.859024263564415e-06, 'epoch': 0.8} +{'loss': 125.4085, 'grad_norm': 1589.8464866766014, 'learning_rate': 7.85476616382733e-06, 'epoch': 0.8} +{'loss': 130.18, 'grad_norm': 1766.5081986664356, 'learning_rate': 7.850508471776058e-06, 'epoch': 0.8} +{'loss': 126.3162, 'grad_norm': 1532.1001302736754, 'learning_rate': 7.846251188219736e-06, 'epoch': 0.8} +{'loss': 126.6893, 'grad_norm': 1567.9083178655176, 'learning_rate': 7.841994313967433e-06, 'epoch': 0.8} +{'loss': 124.0538, 'grad_norm': 1323.5895214711345, 'learning_rate': 7.837737849828133e-06, 'epoch': 0.8} +{'loss': 123.1723, 'grad_norm': 1605.820683105086, 'learning_rate': 7.833481796610745e-06, 'epoch': 0.8} +{'loss': 125.1698, 'grad_norm': 1603.1721573049265, 'learning_rate': 7.829226155124109e-06, 'epoch': 0.8} +{'loss': 126.6055, 'grad_norm': 1861.790021854955, 'learning_rate': 7.824970926176967e-06, 'epoch': 0.8} +{'loss': 125.2463, 'grad_norm': 1687.868788895699, 'learning_rate': 7.820716110578003e-06, 'epoch': 0.8} +{'loss': 120.6447, 'grad_norm': 1581.9810480939734, 'learning_rate': 7.816461709135806e-06, 'epoch': 0.8} +{'loss': 126.1703, 'grad_norm': 1727.0950226010546, 'learning_rate': 7.812207722658894e-06, 'epoch': 0.8} +{'loss': 123.5415, 'grad_norm': 1476.61199272816, 'learning_rate': 7.807954151955713e-06, 'epoch': 0.8} +{'loss': 129.165, 'grad_norm': 1664.7844837498799, 'learning_rate': 7.803700997834615e-06, 'epoch': 0.8} +{'loss': 125.67, 'grad_norm': 1678.2726294936265, 'learning_rate': 7.799448261103886e-06, 'epoch': 0.8} +{'loss': 126.2069, 'grad_norm': 1508.2536603580616, 'learning_rate': 7.795195942571722e-06, 'epoch': 0.81} +{'loss': 127.2152, 'grad_norm': 1614.6114778361248, 'learning_rate': 7.790944043046245e-06, 'epoch': 0.81} +{'loss': 126.9287, 'grad_norm': 1493.541637949216, 'learning_rate': 7.786692563335502e-06, 'epoch': 0.81} +{'loss': 126.259, 'grad_norm': 1677.349655118149, 'learning_rate': 7.78244150424745e-06, 'epoch': 0.81} +{'loss': 126.7111, 'grad_norm': 1874.56508179945, 'learning_rate': 7.778190866589978e-06, 'epoch': 0.81} +{'loss': 126.5359, 'grad_norm': 1739.1209672409234, 'learning_rate': 7.77394065117088e-06, 'epoch': 0.81} +{'loss': 130.4336, 'grad_norm': 1616.983868852378, 'learning_rate': 7.769690858797881e-06, 'epoch': 0.81} +{'loss': 126.9618, 'grad_norm': 1731.7168435734632, 'learning_rate': 7.765441490278625e-06, 'epoch': 0.81} +{'loss': 125.3018, 'grad_norm': 1526.7617762365562, 'learning_rate': 7.761192546420672e-06, 'epoch': 0.81} +{'loss': 124.4326, 'grad_norm': 1492.5466971011415, 'learning_rate': 7.7569440280315e-06, 'epoch': 0.81} +{'loss': 126.0465, 'grad_norm': 1762.611582706376, 'learning_rate': 7.752695935918512e-06, 'epoch': 0.81} +{'loss': 125.186, 'grad_norm': 2231.143575892615, 'learning_rate': 7.748448270889022e-06, 'epoch': 0.81} +{'loss': 126.2913, 'grad_norm': 1997.5858893132122, 'learning_rate': 7.744201033750272e-06, 'epoch': 0.81} +{'loss': 125.3664, 'grad_norm': 1723.4193867982453, 'learning_rate': 
7.739954225309417e-06, 'epoch': 0.81}
+{'loss': 125.8096, 'grad_norm': 1423.9678265787168, 'learning_rate': 7.735707846373527e-06, 'epoch': 0.81}
+[2025-05-29 08:18:04,436] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step42000 is about to be saved!
+[2025-05-29 08:18:04,452] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-42000/global_step42000/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2025-05-29 08:18:04,452] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-42000/global_step42000/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2025-05-29 08:18:04,495] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-42000/global_step42000/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2025-05-29 08:18:04,666] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-42000/global_step42000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2025-05-29 08:23:51,337] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-42000/global_step42000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2025-05-29 08:23:51,408] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-42000/global_step42000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-05-29 08:23:52,709] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step42000 is ready now!
+[2025-05-29 08:23:52,739] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step42000 is about to be saved!
+[2025-05-29 08:23:52,761] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-42000/global_step42000/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2025-05-29 08:23:52,761] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-42000/global_step42000/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2025-05-29 08:23:52,794] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-42000/global_step42000/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2025-05-29 08:23:52,858] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-42000/global_step42000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2025-05-29 08:29:38,226] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-42000/global_step42000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2025-05-29 08:29:38,647] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-42000/global_step42000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-05-29 08:29:51,964] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step42000 is ready now!
+{'loss': 123.8211, 'grad_norm': 1870.0561851776451, 'learning_rate': 7.731461897749601e-06, 'epoch': 0.81} +{'loss': 127.1798, 'grad_norm': 1468.0024121973956, 'learning_rate': 7.727216380244544e-06, 'epoch': 0.81} +{'loss': 129.7866, 'grad_norm': 1822.0026940882499, 'learning_rate': 7.72297129466519e-06, 'epoch': 0.81} +{'loss': 126.5488, 'grad_norm': 1533.8177035954852, 'learning_rate': 7.718726641818286e-06, 'epoch': 0.81} +{'loss': 126.0787, 'grad_norm': 1772.397523263235, 'learning_rate': 7.714482422510494e-06, 'epoch': 0.81} +{'loss': 124.5997, 'grad_norm': 2039.8824067835772, 'learning_rate': 7.710238637548401e-06, 'epoch': 0.81} +{'loss': 127.7182, 'grad_norm': 1680.920991373902, 'learning_rate': 7.705995287738498e-06, 'epoch': 0.81} +{'loss': 131.5149, 'grad_norm': 1827.4345534294187, 'learning_rate': 7.70175237388721e-06, 'epoch': 0.81} +{'loss': 125.5263, 'grad_norm': 1645.9584073233066, 'learning_rate': 7.697509896800875e-06, 'epoch': 0.81} +{'loss': 126.0243, 'grad_norm': 1932.881542255196, 'learning_rate': 7.693267857285733e-06, 'epoch': 0.81} +{'loss': 126.0204, 'grad_norm': 1537.536065501508, 'learning_rate': 7.68902625614796e-06, 'epoch': 0.81} +{'loss': 128.1087, 'grad_norm': 1628.0904819087104, 'learning_rate': 7.68478509419364e-06, 'epoch': 0.81} +{'loss': 127.0256, 'grad_norm': 1790.286203459383, 'learning_rate': 7.680544372228768e-06, 'epoch': 0.81} +{'loss': 125.508, 'grad_norm': 1733.6963220095981, 'learning_rate': 7.676304091059273e-06, 'epoch': 0.81} +{'loss': 124.6761, 'grad_norm': 1598.0567267518375, 'learning_rate': 7.67206425149098e-06, 'epoch': 0.81} +{'loss': 127.3982, 'grad_norm': 1642.2359776842143, 'learning_rate': 7.667824854329647e-06, 'epoch': 0.81} +{'loss': 127.1872, 'grad_norm': 1891.471462693233, 'learning_rate': 7.66358590038093e-06, 'epoch': 0.81} +{'loss': 128.5282, 'grad_norm': 1576.5891745592196, 'learning_rate': 7.659347390450416e-06, 'epoch': 0.81} +{'loss': 127.958, 'grad_norm': 1688.1488710890196, 'learning_rate': 7.655109325343605e-06, 'epoch': 0.81} +{'loss': 124.1798, 'grad_norm': 1455.1241335422264, 'learning_rate': 7.650871705865907e-06, 'epoch': 0.81} +{'loss': 129.4971, 'grad_norm': 1556.8800877703536, 'learning_rate': 7.646634532822652e-06, 'epoch': 0.81} +{'loss': 125.1379, 'grad_norm': 1501.460728191943, 'learning_rate': 7.642397807019084e-06, 'epoch': 0.81} +{'loss': 124.7303, 'grad_norm': 1750.300073147513, 'learning_rate': 7.63816152926035e-06, 'epoch': 0.81} +{'loss': 123.962, 'grad_norm': 1755.869112269825, 'learning_rate': 7.63392570035154e-06, 'epoch': 0.81} +{'loss': 125.3861, 'grad_norm': 1555.0370348052465, 'learning_rate': 7.629690321097636e-06, 'epoch': 0.81} +{'loss': 125.1372, 'grad_norm': 1827.6639585266428, 'learning_rate': 7.625455392303536e-06, 'epoch': 0.81} +{'loss': 122.5018, 'grad_norm': 1562.7127390394207, 'learning_rate': 7.621220914774061e-06, 'epoch': 0.81} +{'loss': 124.5177, 'grad_norm': 1518.6265495699536, 'learning_rate': 7.616986889313939e-06, 'epoch': 0.81} +{'loss': 127.6678, 'grad_norm': 1656.8175378052745, 'learning_rate': 7.612753316727818e-06, 'epoch': 0.81} +{'loss': 126.6464, 'grad_norm': 1731.1229133442907, 'learning_rate': 7.608520197820259e-06, 'epoch': 0.81} +{'loss': 124.0909, 'grad_norm': 1567.207749076016, 'learning_rate': 7.604287533395729e-06, 'epoch': 0.81} +{'loss': 125.8519, 'grad_norm': 1551.3089141724415, 'learning_rate': 7.600055324258622e-06, 'epoch': 0.81} +{'loss': 124.7943, 'grad_norm': 1498.8796538451506, 'learning_rate': 7.595823571213228e-06, 'epoch': 0.81} +{'loss': 
122.0255, 'grad_norm': 1650.4090010521772, 'learning_rate': 7.591592275063768e-06, 'epoch': 0.81} +{'loss': 125.012, 'grad_norm': 1645.861042984249, 'learning_rate': 7.58736143661437e-06, 'epoch': 0.81} +{'loss': 128.5699, 'grad_norm': 1603.723869226461, 'learning_rate': 7.583131056669067e-06, 'epoch': 0.81} +{'loss': 126.7291, 'grad_norm': 1566.504827190827, 'learning_rate': 7.578901136031817e-06, 'epoch': 0.81} +{'loss': 123.0026, 'grad_norm': 1793.758779792517, 'learning_rate': 7.5746716755064795e-06, 'epoch': 0.82} +{'loss': 124.07, 'grad_norm': 1531.645041606873, 'learning_rate': 7.570442675896836e-06, 'epoch': 0.82} +{'loss': 130.6169, 'grad_norm': 1702.4565703323706, 'learning_rate': 7.5662141380065805e-06, 'epoch': 0.82} +{'loss': 125.8627, 'grad_norm': 1573.5416868210261, 'learning_rate': 7.561986062639306e-06, 'epoch': 0.82} +{'loss': 136.8251, 'grad_norm': 1865.9753916241136, 'learning_rate': 7.557758450598534e-06, 'epoch': 0.82} +{'loss': 124.7507, 'grad_norm': 1539.7194170789087, 'learning_rate': 7.553531302687686e-06, 'epoch': 0.82} +{'loss': 125.729, 'grad_norm': 1507.2651625126746, 'learning_rate': 7.549304619710103e-06, 'epoch': 0.82} +{'loss': 131.6467, 'grad_norm': 1921.0666489585112, 'learning_rate': 7.545078402469038e-06, 'epoch': 0.82} +{'loss': 129.7211, 'grad_norm': 1603.3125811193872, 'learning_rate': 7.540852651767646e-06, 'epoch': 0.82} +{'loss': 126.3625, 'grad_norm': 1508.3547979347634, 'learning_rate': 7.536627368409005e-06, 'epoch': 0.82} +{'loss': 124.3311, 'grad_norm': 1603.430287961136, 'learning_rate': 7.532402553196092e-06, 'epoch': 0.82} +{'loss': 126.2771, 'grad_norm': 1611.7851190243641, 'learning_rate': 7.5281782069318075e-06, 'epoch': 0.82} +{'loss': 127.0011, 'grad_norm': 1762.5070753813866, 'learning_rate': 7.523954330418959e-06, 'epoch': 0.82} +{'loss': 128.8893, 'grad_norm': 1787.6796700349125, 'learning_rate': 7.519730924460257e-06, 'epoch': 0.82} +{'loss': 127.4957, 'grad_norm': 1573.8258695605632, 'learning_rate': 7.515507989858334e-06, 'epoch': 0.82} +{'loss': 125.411, 'grad_norm': 1515.765629599896, 'learning_rate': 7.5112855274157205e-06, 'epoch': 0.82} +{'loss': 128.2863, 'grad_norm': 1854.1972093949837, 'learning_rate': 7.50706353793487e-06, 'epoch': 0.82} +{'loss': 124.961, 'grad_norm': 1524.8395870409036, 'learning_rate': 7.502842022218142e-06, 'epoch': 0.82} +{'loss': 126.8736, 'grad_norm': 1620.3073924954926, 'learning_rate': 7.498620981067799e-06, 'epoch': 0.82} +{'loss': 127.3781, 'grad_norm': 1926.4583758990136, 'learning_rate': 7.494400415286023e-06, 'epoch': 0.82} +{'loss': 125.8736, 'grad_norm': 1621.7478458946232, 'learning_rate': 7.490180325674894e-06, 'epoch': 0.82} +{'loss': 129.1492, 'grad_norm': 1723.1192801260474, 'learning_rate': 7.485960713036416e-06, 'epoch': 0.82} +{'loss': 124.6202, 'grad_norm': 1590.711184041737, 'learning_rate': 7.4817415781724954e-06, 'epoch': 0.82} +{'loss': 123.0889, 'grad_norm': 1476.6410058453066, 'learning_rate': 7.477522921884946e-06, 'epoch': 0.82} +{'loss': 124.4753, 'grad_norm': 1725.3742494831251, 'learning_rate': 7.473304744975489e-06, 'epoch': 0.82} +{'loss': 124.3773, 'grad_norm': 1779.3240731633507, 'learning_rate': 7.469087048245758e-06, 'epoch': 0.82} +{'loss': 123.606, 'grad_norm': 1732.415782066581, 'learning_rate': 7.464869832497302e-06, 'epoch': 0.82} +{'loss': 126.2887, 'grad_norm': 1813.6551921071512, 'learning_rate': 7.460653098531563e-06, 'epoch': 0.82} +{'loss': 126.6513, 'grad_norm': 1489.673682045778, 'learning_rate': 7.456436847149906e-06, 'epoch': 0.82} +{'loss': 
123.7511, 'grad_norm': 1706.880061178002, 'learning_rate': 7.4522210791535945e-06, 'epoch': 0.82} +{'loss': 124.3169, 'grad_norm': 1491.8796705394336, 'learning_rate': 7.448005795343804e-06, 'epoch': 0.82} +{'loss': 126.6956, 'grad_norm': 1712.1258493819435, 'learning_rate': 7.443790996521624e-06, 'epoch': 0.82} +{'loss': 123.5776, 'grad_norm': 1425.0950072637547, 'learning_rate': 7.439576683488039e-06, 'epoch': 0.82} +{'loss': 124.1086, 'grad_norm': 1359.8367686028287, 'learning_rate': 7.435362857043953e-06, 'epoch': 0.82} +{'loss': 120.3533, 'grad_norm': 1428.3358143959774, 'learning_rate': 7.431149517990167e-06, 'epoch': 0.82} +{'loss': 124.8658, 'grad_norm': 1573.7428192735317, 'learning_rate': 7.426936667127396e-06, 'epoch': 0.82} +{'loss': 127.4182, 'grad_norm': 1560.2922459538397, 'learning_rate': 7.422724305256268e-06, 'epoch': 0.82} +{'loss': 125.2135, 'grad_norm': 1561.043636860999, 'learning_rate': 7.418512433177304e-06, 'epoch': 0.82} +{'loss': 131.7609, 'grad_norm': 1657.0129217550202, 'learning_rate': 7.414301051690944e-06, 'epoch': 0.82} +{'loss': 124.5746, 'grad_norm': 1550.5131163219473, 'learning_rate': 7.410090161597523e-06, 'epoch': 0.82} +{'loss': 121.7083, 'grad_norm': 1912.04638724778, 'learning_rate': 7.405879763697294e-06, 'epoch': 0.82} +{'loss': 127.4755, 'grad_norm': 1780.6583085642171, 'learning_rate': 7.401669858790414e-06, 'epoch': 0.82} +{'loss': 122.1273, 'grad_norm': 1583.3980641901032, 'learning_rate': 7.39746044767694e-06, 'epoch': 0.82} +{'loss': 124.7367, 'grad_norm': 1675.8272915589716, 'learning_rate': 7.393251531156844e-06, 'epoch': 0.82} +{'loss': 123.8683, 'grad_norm': 1589.8071913779177, 'learning_rate': 7.389043110029995e-06, 'epoch': 0.82} +{'loss': 124.4809, 'grad_norm': 1890.1006738912345, 'learning_rate': 7.3848351850961705e-06, 'epoch': 0.82} +{'loss': 130.0568, 'grad_norm': 1702.5171036033703, 'learning_rate': 7.380627757155065e-06, 'epoch': 0.82} +{'loss': 127.8564, 'grad_norm': 1582.7549515429102, 'learning_rate': 7.376420827006261e-06, 'epoch': 0.82} +{'loss': 127.6861, 'grad_norm': 1615.9302566488598, 'learning_rate': 7.372214395449258e-06, 'epoch': 0.82} +{'loss': 128.1967, 'grad_norm': 1731.736780934086, 'learning_rate': 7.368008463283452e-06, 'epoch': 0.82} +{'loss': 124.1466, 'grad_norm': 1615.5607781836693, 'learning_rate': 7.3638030313081525e-06, 'epoch': 0.82} +{'loss': 129.6207, 'grad_norm': 1653.4979301777068, 'learning_rate': 7.359598100322575e-06, 'epoch': 0.82} +{'loss': 127.4061, 'grad_norm': 1606.3139424481226, 'learning_rate': 7.355393671125829e-06, 'epoch': 0.83} +{'loss': 124.9199, 'grad_norm': 1697.8624641959225, 'learning_rate': 7.35118974451694e-06, 'epoch': 0.83} +{'loss': 129.193, 'grad_norm': 1603.833403869032, 'learning_rate': 7.346986321294829e-06, 'epoch': 0.83} +{'loss': 122.0093, 'grad_norm': 1675.0175096954745, 'learning_rate': 7.3427834022583255e-06, 'epoch': 0.83} +{'loss': 123.9885, 'grad_norm': 1607.4121489334666, 'learning_rate': 7.33858098820617e-06, 'epoch': 0.83} +{'loss': 119.4061, 'grad_norm': 1479.4043850913906, 'learning_rate': 7.3343790799369926e-06, 'epoch': 0.83} +{'loss': 122.1903, 'grad_norm': 1482.1249631611151, 'learning_rate': 7.330177678249338e-06, 'epoch': 0.83} +{'loss': 129.2852, 'grad_norm': 1593.9321716331249, 'learning_rate': 7.325976783941653e-06, 'epoch': 0.83} +{'loss': 121.0974, 'grad_norm': 1610.4279457862028, 'learning_rate': 7.321776397812279e-06, 'epoch': 0.83} +{'loss': 127.5081, 'grad_norm': 1516.452677828383, 'learning_rate': 7.317576520659475e-06, 'epoch': 0.83} 
+{'loss': 120.6062, 'grad_norm': 1528.7616289241444, 'learning_rate': 7.313377153281399e-06, 'epoch': 0.83} +{'loss': 125.7294, 'grad_norm': 1799.6525452268825, 'learning_rate': 7.309178296476102e-06, 'epoch': 0.83} +{'loss': 124.9911, 'grad_norm': 1520.1127294087796, 'learning_rate': 7.304979951041554e-06, 'epoch': 0.83} +{'loss': 127.005, 'grad_norm': 1471.2686585858362, 'learning_rate': 7.300782117775611e-06, 'epoch': 0.83} +{'loss': 128.9179, 'grad_norm': 1471.0081804904853, 'learning_rate': 7.296584797476045e-06, 'epoch': 0.83} +{'loss': 127.0053, 'grad_norm': 1718.0594117420796, 'learning_rate': 7.292387990940526e-06, 'epoch': 0.83} +{'loss': 125.5925, 'grad_norm': 1614.761685260497, 'learning_rate': 7.288191698966625e-06, 'epoch': 0.83} +{'loss': 125.2614, 'grad_norm': 1747.845092470497, 'learning_rate': 7.283995922351816e-06, 'epoch': 0.83} +{'loss': 125.0071, 'grad_norm': 1582.8376341495236, 'learning_rate': 7.279800661893472e-06, 'epoch': 0.83} +{'loss': 121.6348, 'grad_norm': 1884.4680029844603, 'learning_rate': 7.275605918388877e-06, 'epoch': 0.83} +{'loss': 124.8317, 'grad_norm': 1460.4364794872104, 'learning_rate': 7.27141169263521e-06, 'epoch': 0.83} +{'loss': 129.1782, 'grad_norm': 2364.56430866904, 'learning_rate': 7.267217985429551e-06, 'epoch': 0.83} +{'loss': 122.2058, 'grad_norm': 1326.8756404754256, 'learning_rate': 7.263024797568884e-06, 'epoch': 0.83} +{'loss': 125.8929, 'grad_norm': 1936.1307326755518, 'learning_rate': 7.258832129850087e-06, 'epoch': 0.83} +{'loss': 130.5443, 'grad_norm': 1488.8816942615906, 'learning_rate': 7.254639983069955e-06, 'epoch': 0.83} +{'loss': 125.3233, 'grad_norm': 1799.1162416855898, 'learning_rate': 7.250448358025171e-06, 'epoch': 0.83} +{'loss': 129.0914, 'grad_norm': 1707.2299345781728, 'learning_rate': 7.246257255512319e-06, 'epoch': 0.83} +{'loss': 127.9883, 'grad_norm': 1686.2467861140394, 'learning_rate': 7.242066676327893e-06, 'epoch': 0.83} +{'loss': 123.8956, 'grad_norm': 1847.3911234111847, 'learning_rate': 7.2378766212682735e-06, 'epoch': 0.83} +{'loss': 123.9962, 'grad_norm': 1665.650041088934, 'learning_rate': 7.233687091129757e-06, 'epoch': 0.83} +{'loss': 122.5976, 'grad_norm': 1543.5602604674189, 'learning_rate': 7.22949808670853e-06, 'epoch': 0.83} +{'loss': 125.4859, 'grad_norm': 1493.653037803279, 'learning_rate': 7.225309608800681e-06, 'epoch': 0.83} +{'loss': 121.3372, 'grad_norm': 1638.165570631612, 'learning_rate': 7.221121658202202e-06, 'epoch': 0.83} +{'loss': 124.2882, 'grad_norm': 1585.5321697387412, 'learning_rate': 7.216934235708974e-06, 'epoch': 0.83} +{'loss': 126.6571, 'grad_norm': 1505.6081991915994, 'learning_rate': 7.212747342116795e-06, 'epoch': 0.83} +{'loss': 126.2316, 'grad_norm': 1748.6032522554558, 'learning_rate': 7.208560978221351e-06, 'epoch': 0.83} +{'loss': 125.2988, 'grad_norm': 1549.4214800644538, 'learning_rate': 7.2043751448182275e-06, 'epoch': 0.83} +{'loss': 125.648, 'grad_norm': 1814.5841637076571, 'learning_rate': 7.200189842702912e-06, 'epoch': 0.83} +{'loss': 122.7578, 'grad_norm': 1657.793142538385, 'learning_rate': 7.1960050726707866e-06, 'epoch': 0.83} +{'loss': 125.5859, 'grad_norm': 1713.2373333576063, 'learning_rate': 7.191820835517141e-06, 'epoch': 0.83} +{'loss': 124.9447, 'grad_norm': 1765.5253831709801, 'learning_rate': 7.187637132037159e-06, 'epoch': 0.83} +{'loss': 123.737, 'grad_norm': 1549.2360417192544, 'learning_rate': 7.183453963025919e-06, 'epoch': 0.83} +{'loss': 122.4418, 'grad_norm': 1396.0702390004349, 'learning_rate': 7.1792713292784025e-06, 'epoch': 0.83} 
+{'loss': 131.704, 'grad_norm': 1615.2657091184926, 'learning_rate': 7.175089231589485e-06, 'epoch': 0.83} +{'loss': 125.9303, 'grad_norm': 1511.0908421132347, 'learning_rate': 7.1709076707539494e-06, 'epoch': 0.83} +{'loss': 122.4348, 'grad_norm': 1627.299096252581, 'learning_rate': 7.166726647566466e-06, 'epoch': 0.83} +{'loss': 128.0677, 'grad_norm': 1566.9191589681504, 'learning_rate': 7.162546162821611e-06, 'epoch': 0.83} +{'loss': 124.5336, 'grad_norm': 1591.5218795737458, 'learning_rate': 7.158366217313848e-06, 'epoch': 0.83} +{'loss': 125.9444, 'grad_norm': 1449.714857363188, 'learning_rate': 7.154186811837549e-06, 'epoch': 0.83} +{'loss': 123.4442, 'grad_norm': 1487.360932126987, 'learning_rate': 7.150007947186982e-06, 'epoch': 0.83} +{'loss': 124.6157, 'grad_norm': 1440.7366359359592, 'learning_rate': 7.145829624156304e-06, 'epoch': 0.83} +{'loss': 126.0772, 'grad_norm': 1454.1608742393878, 'learning_rate': 7.141651843539579e-06, 'epoch': 0.83} +{'loss': 125.2821, 'grad_norm': 1492.3928920220428, 'learning_rate': 7.1374746061307584e-06, 'epoch': 0.84} +{'loss': 125.1907, 'grad_norm': 1492.414777857063, 'learning_rate': 7.133297912723696e-06, 'epoch': 0.84} +{'loss': 125.6397, 'grad_norm': 1693.3167128278576, 'learning_rate': 7.129121764112146e-06, 'epoch': 0.84} +{'loss': 127.5695, 'grad_norm': 1735.5055480025512, 'learning_rate': 7.124946161089751e-06, 'epoch': 0.84} +{'loss': 123.8233, 'grad_norm': 1670.3805464499865, 'learning_rate': 7.120771104450055e-06, 'epoch': 0.84} +{'loss': 126.2361, 'grad_norm': 1464.5915838925669, 'learning_rate': 7.1165965949864934e-06, 'epoch': 0.84} +{'loss': 121.8056, 'grad_norm': 1456.2540830331827, 'learning_rate': 7.1124226334924016e-06, 'epoch': 0.84} +{'loss': 126.9057, 'grad_norm': 1876.2469295908709, 'learning_rate': 7.108249220761016e-06, 'epoch': 0.84} +{'loss': 124.7107, 'grad_norm': 1540.8311456837894, 'learning_rate': 7.1040763575854565e-06, 'epoch': 0.84} +{'loss': 122.7209, 'grad_norm': 1608.1378076433932, 'learning_rate': 7.099904044758749e-06, 'epoch': 0.84} +{'loss': 127.1641, 'grad_norm': 1475.94399036271, 'learning_rate': 7.095732283073806e-06, 'epoch': 0.84} +{'loss': 128.7572, 'grad_norm': 1586.8278495049267, 'learning_rate': 7.091561073323438e-06, 'epoch': 0.84} +{'loss': 126.6825, 'grad_norm': 1788.2209109198159, 'learning_rate': 7.087390416300364e-06, 'epoch': 0.84} +{'loss': 123.5928, 'grad_norm': 1549.1741096614132, 'learning_rate': 7.083220312797177e-06, 'epoch': 0.84} +{'loss': 123.5012, 'grad_norm': 1556.8789324250758, 'learning_rate': 7.079050763606376e-06, 'epoch': 0.84} +{'loss': 125.5141, 'grad_norm': 1719.722789592408, 'learning_rate': 7.074881769520354e-06, 'epoch': 0.84} +{'loss': 124.7584, 'grad_norm': 1862.7855602128916, 'learning_rate': 7.070713331331395e-06, 'epoch': 0.84} +{'loss': 140.0171, 'grad_norm': 1611.0678238891214, 'learning_rate': 7.066545449831686e-06, 'epoch': 0.84} +{'loss': 124.5041, 'grad_norm': 1686.3362844541011, 'learning_rate': 7.062378125813294e-06, 'epoch': 0.84} +{'loss': 123.2415, 'grad_norm': 1565.1200755740583, 'learning_rate': 7.058211360068196e-06, 'epoch': 0.84} +{'loss': 124.9315, 'grad_norm': 1520.9114879086214, 'learning_rate': 7.054045153388248e-06, 'epoch': 0.84} +{'loss': 127.6873, 'grad_norm': 1855.5406561557018, 'learning_rate': 7.049879506565207e-06, 'epoch': 0.84} +{'loss': 122.5455, 'grad_norm': 1670.2338079171484, 'learning_rate': 7.045714420390732e-06, 'epoch': 0.84} +{'loss': 124.6758, 'grad_norm': 1642.2819412703427, 'learning_rate': 7.041549895656356e-06, 'epoch': 
0.84} +{'loss': 122.7267, 'grad_norm': 1821.5938936708721, 'learning_rate': 7.037385933153527e-06, 'epoch': 0.84} +{'loss': 122.2598, 'grad_norm': 1566.4117547894352, 'learning_rate': 7.033222533673565e-06, 'epoch': 0.84} +{'loss': 126.53, 'grad_norm': 1714.540900188494, 'learning_rate': 7.029059698007699e-06, 'epoch': 0.84} +{'loss': 122.9632, 'grad_norm': 1418.737934855184, 'learning_rate': 7.024897426947038e-06, 'epoch': 0.84} +{'loss': 123.9551, 'grad_norm': 1506.1685343923004, 'learning_rate': 7.020735721282603e-06, 'epoch': 0.84} +{'loss': 121.6492, 'grad_norm': 1740.590540385604, 'learning_rate': 7.016574581805284e-06, 'epoch': 0.84} +{'loss': 125.7334, 'grad_norm': 1571.059758079141, 'learning_rate': 7.012414009305884e-06, 'epoch': 0.84} +{'loss': 126.1911, 'grad_norm': 1388.2809538797678, 'learning_rate': 7.008254004575081e-06, 'epoch': 0.84} +{'loss': 138.2247, 'grad_norm': 1733.8378012029302, 'learning_rate': 7.0040945684034555e-06, 'epoch': 0.84} +{'loss': 124.3327, 'grad_norm': 1720.8252923316754, 'learning_rate': 6.999935701581482e-06, 'epoch': 0.84} +{'loss': 124.9543, 'grad_norm': 1694.4707857435137, 'learning_rate': 6.9957774048995174e-06, 'epoch': 0.84} +{'loss': 123.892, 'grad_norm': 1619.4723952983707, 'learning_rate': 6.991619679147818e-06, 'epoch': 0.84} +{'loss': 122.6521, 'grad_norm': 1637.7604128926619, 'learning_rate': 6.987462525116526e-06, 'epoch': 0.84} +{'loss': 123.2961, 'grad_norm': 1597.2150626634632, 'learning_rate': 6.983305943595677e-06, 'epoch': 0.84} +{'loss': 123.7728, 'grad_norm': 1505.3447685726712, 'learning_rate': 6.979149935375205e-06, 'epoch': 0.84} +{'loss': 124.1176, 'grad_norm': 1577.7094491056564, 'learning_rate': 6.9749945012449225e-06, 'epoch': 0.84} +{'loss': 122.2987, 'grad_norm': 1696.134171350137, 'learning_rate': 6.970839641994545e-06, 'epoch': 0.84} +{'loss': 126.2877, 'grad_norm': 1517.310612238268, 'learning_rate': 6.966685358413663e-06, 'epoch': 0.84} +{'loss': 121.9255, 'grad_norm': 1568.946466841836, 'learning_rate': 6.962531651291773e-06, 'epoch': 0.84} +{'loss': 121.8052, 'grad_norm': 1502.9179641350495, 'learning_rate': 6.9583785214182594e-06, 'epoch': 0.84} +{'loss': 123.4488, 'grad_norm': 1775.2793916908238, 'learning_rate': 6.9542259695823884e-06, 'epoch': 0.84} +{'loss': 124.4139, 'grad_norm': 1724.1983946730618, 'learning_rate': 6.950073996573328e-06, 'epoch': 0.84} +{'loss': 126.8929, 'grad_norm': 1686.2724311660293, 'learning_rate': 6.945922603180122e-06, 'epoch': 0.84} +{'loss': 125.5411, 'grad_norm': 1604.0278822811017, 'learning_rate': 6.941771790191716e-06, 'epoch': 0.84} +{'loss': 122.9915, 'grad_norm': 1461.3247968337428, 'learning_rate': 6.937621558396943e-06, 'epoch': 0.84} +{'loss': 121.2583, 'grad_norm': 1578.2204578819926, 'learning_rate': 6.933471908584521e-06, 'epoch': 0.84} +{'loss': 125.4772, 'grad_norm': 1419.708008992405, 'learning_rate': 6.9293228415430625e-06, 'epoch': 0.84} +{'loss': 120.5448, 'grad_norm': 1563.1469626396442, 'learning_rate': 6.925174358061064e-06, 'epoch': 0.84} +{'loss': 123.3889, 'grad_norm': 1624.564365947182, 'learning_rate': 6.921026458926915e-06, 'epoch': 0.85} +{'loss': 122.7071, 'grad_norm': 1741.155934853828, 'learning_rate': 6.9168791449288965e-06, 'epoch': 0.85} +{'loss': 123.0481, 'grad_norm': 1585.0282890098197, 'learning_rate': 6.912732416855171e-06, 'epoch': 0.85} +{'loss': 123.6523, 'grad_norm': 1493.8951362341575, 'learning_rate': 6.908586275493796e-06, 'epoch': 0.85} +{'loss': 122.0176, 'grad_norm': 1576.0711248385605, 'learning_rate': 6.904440721632715e-06, 
'epoch': 0.85}
+{'loss': 120.4068, 'grad_norm': 1547.8652721985045, 'learning_rate': 6.900295756059751e-06, 'epoch': 0.85}
+{'loss': 125.4299, 'grad_norm': 1594.2004962647582, 'learning_rate': 6.896151379562637e-06, 'epoch': 0.85}
+[2025-05-29 10:17:56,473] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step44000 is about to be saved!
+[2025-05-29 10:17:56,504] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-44000/global_step44000/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2025-05-29 10:17:56,504] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-44000/global_step44000/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2025-05-29 10:17:56,614] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-44000/global_step44000/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2025-05-29 10:17:56,642] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-44000/global_step44000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2025-05-29 10:23:58,767] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-44000/global_step44000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2025-05-29 10:23:58,812] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-44000/global_step44000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-05-29 10:23:58,826] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step44000 is ready now!
+[2025-05-29 10:23:58,940] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step44000 is about to be saved!
+[2025-05-29 10:23:58,966] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-44000/global_step44000/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2025-05-29 10:23:58,966] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-44000/global_step44000/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2025-05-29 10:23:59,087] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-44000/global_step44000/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2025-05-29 10:23:59,180] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-44000/global_step44000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2025-05-29 10:29:47,130] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-44000/global_step44000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2025-05-29 10:29:47,210] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-44000/global_step44000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-05-29 10:29:58,139] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step44000 is ready now!
+{'loss': 123.6935, 'grad_norm': 1757.6112912755607, 'learning_rate': 6.892007592928977e-06, 'epoch': 0.85} +{'loss': 136.7211, 'grad_norm': 1469.6162055329155, 'learning_rate': 6.887864396946263e-06, 'epoch': 0.85} +{'loss': 124.795, 'grad_norm': 3013.251752957825, 'learning_rate': 6.8837217924018825e-06, 'epoch': 0.85} +{'loss': 125.2296, 'grad_norm': 1573.9928314251663, 'learning_rate': 6.8795797800831025e-06, 'epoch': 0.85} +{'loss': 124.5564, 'grad_norm': 1611.9622926262382, 'learning_rate': 6.8754383607770855e-06, 'epoch': 0.85} +{'loss': 120.229, 'grad_norm': 1671.3409505807128, 'learning_rate': 6.87129753527088e-06, 'epoch': 0.85} +{'loss': 122.9378, 'grad_norm': 1782.9476334467424, 'learning_rate': 6.867157304351414e-06, 'epoch': 0.85} +{'loss': 122.5747, 'grad_norm': 1426.9635526339182, 'learning_rate': 6.863017668805512e-06, 'epoch': 0.85} +{'loss': 123.1378, 'grad_norm': 1631.7524099104246, 'learning_rate': 6.858878629419874e-06, 'epoch': 0.85} +{'loss': 118.9458, 'grad_norm': 1458.486788337334, 'learning_rate': 6.854740186981102e-06, 'epoch': 0.85} +{'loss': 124.5778, 'grad_norm': 1683.1224436617863, 'learning_rate': 6.850602342275674e-06, 'epoch': 0.85} +{'loss': 123.2199, 'grad_norm': 1627.8865675903528, 'learning_rate': 6.846465096089955e-06, 'epoch': 0.85} +{'loss': 126.1063, 'grad_norm': 1862.832593079097, 'learning_rate': 6.8423284492102e-06, 'epoch': 0.85} +{'loss': 122.3192, 'grad_norm': 1499.3533112619937, 'learning_rate': 6.8381924024225434e-06, 'epoch': 0.85} +{'loss': 121.8646, 'grad_norm': 1287.512722580214, 'learning_rate': 6.834056956513015e-06, 'epoch': 0.85} +{'loss': 123.6237, 'grad_norm': 1379.7795532749703, 'learning_rate': 6.829922112267529e-06, 'epoch': 0.85} +{'loss': 120.1513, 'grad_norm': 1514.1928557860997, 'learning_rate': 6.825787870471873e-06, 'epoch': 0.85} +{'loss': 124.3843, 'grad_norm': 1496.317149396831, 'learning_rate': 6.821654231911736e-06, 'epoch': 0.85} +{'loss': 123.7177, 'grad_norm': 1672.6892212121993, 'learning_rate': 6.8175211973726785e-06, 'epoch': 0.85} +{'loss': 120.9187, 'grad_norm': 1639.827345031978, 'learning_rate': 6.813388767640161e-06, 'epoch': 0.85} +{'loss': 120.8567, 'grad_norm': 1565.7994005693715, 'learning_rate': 6.809256943499519e-06, 'epoch': 0.85} +{'loss': 125.8151, 'grad_norm': 1606.699696625137, 'learning_rate': 6.805125725735971e-06, 'epoch': 0.85} +{'loss': 125.9316, 'grad_norm': 1566.9864804179235, 'learning_rate': 6.800995115134631e-06, 'epoch': 0.85} +{'loss': 122.4424, 'grad_norm': 1672.8610062333487, 'learning_rate': 6.796865112480482e-06, 'epoch': 0.85} +{'loss': 127.2449, 'grad_norm': 1382.1014493854757, 'learning_rate': 6.792735718558408e-06, 'epoch': 0.85} +{'loss': 123.3223, 'grad_norm': 1624.7082494972735, 'learning_rate': 6.788606934153168e-06, 'epoch': 0.85} +{'loss': 121.8117, 'grad_norm': 1486.278288781692, 'learning_rate': 6.784478760049406e-06, 'epoch': 0.85} +{'loss': 121.1655, 'grad_norm': 1526.0648550946798, 'learning_rate': 6.780351197031651e-06, 'epoch': 0.85} +{'loss': 119.8526, 'grad_norm': 1451.3640886905814, 'learning_rate': 6.776224245884313e-06, 'epoch': 0.85} +{'loss': 122.8855, 'grad_norm': 1825.4742122137802, 'learning_rate': 6.772097907391695e-06, 'epoch': 0.85} +{'loss': 121.0515, 'grad_norm': 1941.1643588450668, 'learning_rate': 6.767972182337974e-06, 'epoch': 0.85} +{'loss': 121.1038, 'grad_norm': 1488.6142987795176, 'learning_rate': 6.76384707150721e-06, 'epoch': 0.85} +{'loss': 124.7128, 'grad_norm': 1531.7259996583284, 'learning_rate': 6.7597225756833575e-06, 'epoch': 
0.85} +{'loss': 127.7057, 'grad_norm': 1967.2211819263227, 'learning_rate': 6.755598695650236e-06, 'epoch': 0.85} +{'loss': 121.6369, 'grad_norm': 1387.9565428396888, 'learning_rate': 6.751475432191565e-06, 'epoch': 0.85} +{'loss': 123.4334, 'grad_norm': 1581.7869800853857, 'learning_rate': 6.747352786090944e-06, 'epoch': 0.85} +{'loss': 122.8002, 'grad_norm': 1582.2404218741572, 'learning_rate': 6.743230758131845e-06, 'epoch': 0.85} +{'loss': 120.5303, 'grad_norm': 1774.298216683005, 'learning_rate': 6.7391093490976285e-06, 'epoch': 0.85} +{'loss': 127.7051, 'grad_norm': 1539.7550234496018, 'learning_rate': 6.734988559771538e-06, 'epoch': 0.85} +{'loss': 124.0886, 'grad_norm': 1585.3611837951903, 'learning_rate': 6.730868390936706e-06, 'epoch': 0.85} +{'loss': 123.5734, 'grad_norm': 1701.3094498932958, 'learning_rate': 6.7267488433761315e-06, 'epoch': 0.85} +{'loss': 124.2328, 'grad_norm': 1848.476806532009, 'learning_rate': 6.72262991787271e-06, 'epoch': 0.85} +{'loss': 120.4827, 'grad_norm': 1573.3603868943385, 'learning_rate': 6.718511615209209e-06, 'epoch': 0.85} +{'loss': 121.477, 'grad_norm': 1783.2593937025702, 'learning_rate': 6.714393936168279e-06, 'epoch': 0.85} +{'loss': 121.0161, 'grad_norm': 1481.2091888183186, 'learning_rate': 6.710276881532463e-06, 'epoch': 0.85} +{'loss': 124.5626, 'grad_norm': 2083.8045611574858, 'learning_rate': 6.706160452084169e-06, 'epoch': 0.86} +{'loss': 124.4125, 'grad_norm': 1462.1660003637012, 'learning_rate': 6.7020446486057e-06, 'epoch': 0.86} +{'loss': 121.0973, 'grad_norm': 1725.5706147355256, 'learning_rate': 6.6979294718792275e-06, 'epoch': 0.86} +{'loss': 120.0068, 'grad_norm': 1428.0183691408283, 'learning_rate': 6.693814922686812e-06, 'epoch': 0.86} +{'loss': 123.5326, 'grad_norm': 1718.8756943641333, 'learning_rate': 6.689701001810401e-06, 'epoch': 0.86} +{'loss': 126.4358, 'grad_norm': 1814.710497371072, 'learning_rate': 6.685587710031803e-06, 'epoch': 0.86} +{'loss': 121.7514, 'grad_norm': 1488.2850275920553, 'learning_rate': 6.681475048132729e-06, 'epoch': 0.86} +{'loss': 126.2723, 'grad_norm': 1394.1443991119372, 'learning_rate': 6.677363016894751e-06, 'epoch': 0.86} +{'loss': 121.6987, 'grad_norm': 1481.428427802298, 'learning_rate': 6.673251617099332e-06, 'epoch': 0.86} +{'loss': 121.863, 'grad_norm': 1533.3529370205586, 'learning_rate': 6.669140849527821e-06, 'epoch': 0.86} +{'loss': 122.3599, 'grad_norm': 2500.3142338571274, 'learning_rate': 6.66503071496143e-06, 'epoch': 0.86} +{'loss': 124.7511, 'grad_norm': 1818.7625471993488, 'learning_rate': 6.660921214181266e-06, 'epoch': 0.86} +{'loss': 122.261, 'grad_norm': 1551.0054730473153, 'learning_rate': 6.656812347968302e-06, 'epoch': 0.86} +{'loss': 122.4688, 'grad_norm': 1423.9576199400483, 'learning_rate': 6.652704117103401e-06, 'epoch': 0.86} +{'loss': 120.7304, 'grad_norm': 1424.0156122926892, 'learning_rate': 6.648596522367307e-06, 'epoch': 0.86} +{'loss': 126.8976, 'grad_norm': 1652.56560605021, 'learning_rate': 6.64448956454063e-06, 'epoch': 0.86} +{'loss': 123.0573, 'grad_norm': 1587.8886785322254, 'learning_rate': 6.640383244403874e-06, 'epoch': 0.86} +{'loss': 122.0028, 'grad_norm': 1648.3530067587396, 'learning_rate': 6.636277562737407e-06, 'epoch': 0.86} +{'loss': 122.8943, 'grad_norm': 1574.9719861133194, 'learning_rate': 6.632172520321488e-06, 'epoch': 0.86} +{'loss': 117.7303, 'grad_norm': 1656.5297669188406, 'learning_rate': 6.628068117936253e-06, 'epoch': 0.86} +{'loss': 123.873, 'grad_norm': 1504.353227467709, 'learning_rate': 6.623964356361707e-06, 'epoch': 
0.86} +{'loss': 123.0909, 'grad_norm': 1623.373551927616, 'learning_rate': 6.619861236377745e-06, 'epoch': 0.86} +{'loss': 118.7597, 'grad_norm': 1495.7892601666936, 'learning_rate': 6.615758758764128e-06, 'epoch': 0.86} +{'loss': 120.7651, 'grad_norm': 1776.3968371949377, 'learning_rate': 6.611656924300505e-06, 'epoch': 0.86} +{'loss': 121.3098, 'grad_norm': 1633.3548382141296, 'learning_rate': 6.607555733766404e-06, 'epoch': 0.86} +{'loss': 118.469, 'grad_norm': 1472.93237213372, 'learning_rate': 6.603455187941221e-06, 'epoch': 0.86} +{'loss': 121.36, 'grad_norm': 1523.5272181233781, 'learning_rate': 6.599355287604232e-06, 'epoch': 0.86} +{'loss': 121.1773, 'grad_norm': 1595.9250312077895, 'learning_rate': 6.595256033534598e-06, 'epoch': 0.86} +{'loss': 125.5144, 'grad_norm': 1461.729705385105, 'learning_rate': 6.591157426511346e-06, 'epoch': 0.86} +{'loss': 120.7982, 'grad_norm': 1355.9513897073327, 'learning_rate': 6.5870594673133925e-06, 'epoch': 0.86} +{'loss': 122.6542, 'grad_norm': 1562.6029241576184, 'learning_rate': 6.582962156719521e-06, 'epoch': 0.86} +{'loss': 126.4576, 'grad_norm': 1970.8794819708883, 'learning_rate': 6.578865495508396e-06, 'epoch': 0.86} +{'loss': 123.6567, 'grad_norm': 1464.5053780822668, 'learning_rate': 6.5747694844585565e-06, 'epoch': 0.86} +{'loss': 122.788, 'grad_norm': 1553.7873843521086, 'learning_rate': 6.570674124348415e-06, 'epoch': 0.86} +{'loss': 121.2449, 'grad_norm': 1275.6004341528408, 'learning_rate': 6.5665794159562734e-06, 'epoch': 0.86} +{'loss': 122.3702, 'grad_norm': 1703.0666886888177, 'learning_rate': 6.5624853600602966e-06, 'epoch': 0.86} +{'loss': 118.9063, 'grad_norm': 1335.0513939854582, 'learning_rate': 6.558391957438528e-06, 'epoch': 0.86} +{'loss': 120.4031, 'grad_norm': 1349.97290432344, 'learning_rate': 6.55429920886889e-06, 'epoch': 0.86} +{'loss': 123.9995, 'grad_norm': 1615.297752519338, 'learning_rate': 6.550207115129177e-06, 'epoch': 0.86} +{'loss': 123.3562, 'grad_norm': 1698.0133747472603, 'learning_rate': 6.546115676997062e-06, 'epoch': 0.86} +{'loss': 123.9214, 'grad_norm': 1509.6563764071263, 'learning_rate': 6.542024895250099e-06, 'epoch': 0.86} +{'loss': 121.3318, 'grad_norm': 1556.355477338029, 'learning_rate': 6.537934770665701e-06, 'epoch': 0.86} +{'loss': 119.64, 'grad_norm': 1609.824166214416, 'learning_rate': 6.533845304021173e-06, 'epoch': 0.86} +{'loss': 122.9359, 'grad_norm': 1532.9391706058045, 'learning_rate': 6.529756496093681e-06, 'epoch': 0.86} +{'loss': 121.1948, 'grad_norm': 1417.7279830389068, 'learning_rate': 6.525668347660279e-06, 'epoch': 0.86} +{'loss': 126.3273, 'grad_norm': 1485.3121405146005, 'learning_rate': 6.521580859497888e-06, 'epoch': 0.86} +{'loss': 127.5247, 'grad_norm': 1741.9251959557428, 'learning_rate': 6.517494032383302e-06, 'epoch': 0.86} +{'loss': 122.0216, 'grad_norm': 1531.4406700934053, 'learning_rate': 6.513407867093197e-06, 'epoch': 0.86} +{'loss': 122.6142, 'grad_norm': 1335.8910149957019, 'learning_rate': 6.509322364404112e-06, 'epoch': 0.86} +{'loss': 123.5584, 'grad_norm': 1332.4569442773375, 'learning_rate': 6.50523752509247e-06, 'epoch': 0.86} +{'loss': 123.6, 'grad_norm': 1434.7179584295218, 'learning_rate': 6.501153349934568e-06, 'epoch': 0.86} +{'loss': 118.76, 'grad_norm': 1362.2808793515742, 'learning_rate': 6.497069839706569e-06, 'epoch': 0.86} +{'loss': 120.1633, 'grad_norm': 1657.068805896479, 'learning_rate': 6.492986995184516e-06, 'epoch': 0.87} +{'loss': 122.2124, 'grad_norm': 1613.2559743194734, 'learning_rate': 6.488904817144317e-06, 'epoch': 0.87} 
+{'loss': 122.9756, 'grad_norm': 2580.011216076813, 'learning_rate': 6.4848233063617676e-06, 'epoch': 0.87} +{'loss': 123.0642, 'grad_norm': 1473.6796249736203, 'learning_rate': 6.4807424636125285e-06, 'epoch': 0.87} +{'loss': 119.9796, 'grad_norm': 1466.475967565021, 'learning_rate': 6.476662289672128e-06, 'epoch': 0.87} +{'loss': 121.7387, 'grad_norm': 1813.49551342137, 'learning_rate': 6.47258278531598e-06, 'epoch': 0.87} +{'loss': 120.9844, 'grad_norm': 1503.2994671549402, 'learning_rate': 6.468503951319354e-06, 'epoch': 0.87} +{'loss': 121.9002, 'grad_norm': 1887.9668895931627, 'learning_rate': 6.464425788457412e-06, 'epoch': 0.87} +{'loss': 121.7533, 'grad_norm': 1623.3180943354141, 'learning_rate': 6.4603482975051755e-06, 'epoch': 0.87} +{'loss': 120.0601, 'grad_norm': 1629.739610271337, 'learning_rate': 6.456271479237541e-06, 'epoch': 0.87} +{'loss': 125.3699, 'grad_norm': 1685.922308306844, 'learning_rate': 6.452195334429277e-06, 'epoch': 0.87} +{'loss': 119.7375, 'grad_norm': 1921.4616875371305, 'learning_rate': 6.448119863855022e-06, 'epoch': 0.87} +{'loss': 123.2988, 'grad_norm': 1569.5229736947515, 'learning_rate': 6.444045068289297e-06, 'epoch': 0.87} +{'loss': 119.4145, 'grad_norm': 1772.5507151974234, 'learning_rate': 6.439970948506482e-06, 'epoch': 0.87} +{'loss': 120.7936, 'grad_norm': 1537.2730363065539, 'learning_rate': 6.435897505280832e-06, 'epoch': 0.87} +{'loss': 119.1234, 'grad_norm': 1408.652220274104, 'learning_rate': 6.431824739386477e-06, 'epoch': 0.87} +{'loss': 120.6795, 'grad_norm': 1630.2664662500983, 'learning_rate': 6.427752651597411e-06, 'epoch': 0.87} +{'loss': 120.262, 'grad_norm': 1441.2375537895773, 'learning_rate': 6.4236812426875124e-06, 'epoch': 0.87} +{'loss': 124.125, 'grad_norm': 1568.0599060986876, 'learning_rate': 6.419610513430516e-06, 'epoch': 0.87} +{'loss': 120.3583, 'grad_norm': 1528.5560673236641, 'learning_rate': 6.415540464600038e-06, 'epoch': 0.87} +{'loss': 120.6189, 'grad_norm': 1880.6791631712829, 'learning_rate': 6.411471096969559e-06, 'epoch': 0.87} +{'loss': 120.5365, 'grad_norm': 1556.9797377752802, 'learning_rate': 6.4074024113124275e-06, 'epoch': 0.87} +{'loss': 121.1038, 'grad_norm': 1758.5547115994957, 'learning_rate': 6.403334408401878e-06, 'epoch': 0.87} +{'loss': 121.191, 'grad_norm': 1762.9236033122745, 'learning_rate': 6.399267089010994e-06, 'epoch': 0.87} +{'loss': 116.3453, 'grad_norm': 1645.6362493435481, 'learning_rate': 6.395200453912747e-06, 'epoch': 0.87} +{'loss': 123.8195, 'grad_norm': 1534.2942728072915, 'learning_rate': 6.391134503879963e-06, 'epoch': 0.87} +{'loss': 119.7889, 'grad_norm': 1531.046646542584, 'learning_rate': 6.387069239685355e-06, 'epoch': 0.87} +{'loss': 121.9876, 'grad_norm': 1817.5612886767096, 'learning_rate': 6.383004662101486e-06, 'epoch': 0.87} +{'loss': 119.7718, 'grad_norm': 1686.8877512970896, 'learning_rate': 6.378940771900807e-06, 'epoch': 0.87} +{'loss': 121.5755, 'grad_norm': 1621.1248275025864, 'learning_rate': 6.374877569855629e-06, 'epoch': 0.87} +{'loss': 121.605, 'grad_norm': 1691.1568430120412, 'learning_rate': 6.3708150567381295e-06, 'epoch': 0.87} +{'loss': 122.1994, 'grad_norm': 1865.4462620321688, 'learning_rate': 6.3667532333203655e-06, 'epoch': 0.87} +{'loss': 119.9125, 'grad_norm': 1575.0000218015564, 'learning_rate': 6.362692100374247e-06, 'epoch': 0.87} +{'loss': 119.5969, 'grad_norm': 1667.5513607160092, 'learning_rate': 6.35863165867157e-06, 'epoch': 0.87} +{'loss': 118.3386, 'grad_norm': 1571.849871790416, 'learning_rate': 6.354571908983992e-06, 'epoch': 
0.87} +{'loss': 122.0973, 'grad_norm': 1549.8730670517182, 'learning_rate': 6.350512852083033e-06, 'epoch': 0.87} +{'loss': 122.9266, 'grad_norm': 1514.3360356534624, 'learning_rate': 6.346454488740091e-06, 'epoch': 0.87} +{'loss': 116.0596, 'grad_norm': 1738.4772126154735, 'learning_rate': 6.3423968197264214e-06, 'epoch': 0.87} +{'loss': 126.803, 'grad_norm': 3155.393843284017, 'learning_rate': 6.338339845813164e-06, 'epoch': 0.87} +{'loss': 120.5236, 'grad_norm': 1518.6102054723963, 'learning_rate': 6.3342835677713114e-06, 'epoch': 0.87} +{'loss': 122.6305, 'grad_norm': 1482.9544492214036, 'learning_rate': 6.330227986371728e-06, 'epoch': 0.87} +{'loss': 121.6525, 'grad_norm': 1698.792162616738, 'learning_rate': 6.326173102385149e-06, 'epoch': 0.87} +{'loss': 122.7839, 'grad_norm': 1417.8014941175004, 'learning_rate': 6.322118916582173e-06, 'epoch': 0.87} +{'loss': 120.7957, 'grad_norm': 1561.5005670993858, 'learning_rate': 6.31806542973327e-06, 'epoch': 0.87} +{'loss': 116.3188, 'grad_norm': 1574.038420188892, 'learning_rate': 6.314012642608778e-06, 'epoch': 0.87} +{'loss': 119.0269, 'grad_norm': 1675.5564756245988, 'learning_rate': 6.309960555978894e-06, 'epoch': 0.87} +{'loss': 121.764, 'grad_norm': 1405.2718319338999, 'learning_rate': 6.305909170613689e-06, 'epoch': 0.87} +{'loss': 122.0257, 'grad_norm': 1542.8188925011495, 'learning_rate': 6.301858487283095e-06, 'epoch': 0.87} +{'loss': 120.6749, 'grad_norm': 1396.726499747516, 'learning_rate': 6.2978085067569224e-06, 'epoch': 0.87} +{'loss': 117.4432, 'grad_norm': 1455.4019200991177, 'learning_rate': 6.293759229804835e-06, 'epoch': 0.87} +{'loss': 116.923, 'grad_norm': 1353.298212761294, 'learning_rate': 6.2897106571963695e-06, 'epoch': 0.87} +{'loss': 122.4219, 'grad_norm': 1625.9470170280724, 'learning_rate': 6.285662789700925e-06, 'epoch': 0.87} +{'loss': 118.9573, 'grad_norm': 1488.2888202162455, 'learning_rate': 6.2816156280877675e-06, 'epoch': 0.88} +{'loss': 121.8092, 'grad_norm': 1727.2041737330167, 'learning_rate': 6.2775691731260365e-06, 'epoch': 0.88} +{'loss': 120.6148, 'grad_norm': 1481.0953192572597, 'learning_rate': 6.273523425584726e-06, 'epoch': 0.88} +{'loss': 120.3094, 'grad_norm': 1523.6167605936814, 'learning_rate': 6.269478386232704e-06, 'epoch': 0.88} +{'loss': 121.6111, 'grad_norm': 1363.3090066855725, 'learning_rate': 6.265434055838693e-06, 'epoch': 0.88} +{'loss': 120.0028, 'grad_norm': 1675.7795888140236, 'learning_rate': 6.261390435171293e-06, 'epoch': 0.88} +{'loss': 121.7294, 'grad_norm': 1539.60663584883, 'learning_rate': 6.257347524998967e-06, 'epoch': 0.88} +{'loss': 121.1181, 'grad_norm': 1487.474230407803, 'learning_rate': 6.2533053260900345e-06, 'epoch': 0.88} +{'loss': 118.9305, 'grad_norm': 1585.790846611387, 'learning_rate': 6.24926383921269e-06, 'epoch': 0.88} +{'loss': 119.005, 'grad_norm': 1454.5454490733634, 'learning_rate': 6.245223065134986e-06, 'epoch': 0.88} +{'loss': 120.1794, 'grad_norm': 1556.075078631767, 'learning_rate': 6.241183004624838e-06, 'epoch': 0.88} +{'loss': 123.6654, 'grad_norm': 1624.7981374010653, 'learning_rate': 6.237143658450038e-06, 'epoch': 0.88} +{'loss': 123.2844, 'grad_norm': 1458.7432198678825, 'learning_rate': 6.233105027378227e-06, 'epoch': 0.88} +{'loss': 118.4823, 'grad_norm': 1793.8925129791016, 'learning_rate': 6.229067112176921e-06, 'epoch': 0.88} +{'loss': 119.9743, 'grad_norm': 1372.198783540625, 'learning_rate': 6.2250299136134925e-06, 'epoch': 0.88} +{'loss': 126.2758, 'grad_norm': 1377.5988538224026, 'learning_rate': 6.220993432455181e-06, 'epoch': 
0.88} +{'loss': 120.9139, 'grad_norm': 1491.419042209283, 'learning_rate': 6.216957669469096e-06, 'epoch': 0.88} +{'loss': 120.3521, 'grad_norm': 1563.9860079748025, 'learning_rate': 6.212922625422197e-06, 'epoch': 0.88} +{'loss': 120.2771, 'grad_norm': 1722.357625888901, 'learning_rate': 6.208888301081321e-06, 'epoch': 0.88} +{'loss': 120.0387, 'grad_norm': 1692.5244883105615, 'learning_rate': 6.204854697213154e-06, 'epoch': 0.88} +{'loss': 120.2586, 'grad_norm': 1400.6902285110266, 'learning_rate': 6.200821814584255e-06, 'epoch': 0.88} +{'loss': 121.3671, 'grad_norm': 1641.3383022895246, 'learning_rate': 6.196789653961048e-06, 'epoch': 0.88} +{'loss': 120.6237, 'grad_norm': 1307.501558206824, 'learning_rate': 6.192758216109811e-06, 'epoch': 0.88} +{'loss': 121.1504, 'grad_norm': 1588.139254823646, 'learning_rate': 6.1887275017966915e-06, 'epoch': 0.88} +{'loss': 120.5575, 'grad_norm': 1448.314748586273, 'learning_rate': 6.184697511787695e-06, 'epoch': 0.88} +{'loss': 120.3365, 'grad_norm': 1802.8877652179485, 'learning_rate': 6.180668246848689e-06, 'epoch': 0.88} +{'loss': 121.7098, 'grad_norm': 1625.02935932431, 'learning_rate': 6.176639707745411e-06, 'epoch': 0.88} +{'loss': 120.1188, 'grad_norm': 1694.7895962512594, 'learning_rate': 6.172611895243453e-06, 'epoch': 0.88} +{'loss': 118.379, 'grad_norm': 1464.2592920080278, 'learning_rate': 6.168584810108269e-06, 'epoch': 0.88} +{'loss': 118.0648, 'grad_norm': 1509.4159869872512, 'learning_rate': 6.164558453105178e-06, 'epoch': 0.88} +{'loss': 120.6154, 'grad_norm': 1624.375409946469, 'learning_rate': 6.160532824999357e-06, 'epoch': 0.88} +{'loss': 119.5504, 'grad_norm': 1472.307840077675, 'learning_rate': 6.1565079265558506e-06, 'epoch': 0.88} +{'loss': 122.9993, 'grad_norm': 1443.8151288363126, 'learning_rate': 6.1524837585395605e-06, 'epoch': 0.88} +{'loss': 123.7033, 'grad_norm': 1801.44821926601, 'learning_rate': 6.148460321715246e-06, 'epoch': 0.88} +{'loss': 120.3461, 'grad_norm': 1722.986940743167, 'learning_rate': 6.144437616847539e-06, 'epoch': 0.88} +{'loss': 121.7858, 'grad_norm': 1586.2287730075288, 'learning_rate': 6.140415644700915e-06, 'epoch': 0.88} +{'loss': 119.6255, 'grad_norm': 2143.9787390031015, 'learning_rate': 6.136394406039727e-06, 'epoch': 0.88} +{'loss': 119.4185, 'grad_norm': 1585.9356407699672, 'learning_rate': 6.132373901628182e-06, 'epoch': 0.88} +{'loss': 122.3148, 'grad_norm': 1619.3775785519265, 'learning_rate': 6.128354132230343e-06, 'epoch': 0.88} +{'loss': 120.5802, 'grad_norm': 1576.6305469789393, 'learning_rate': 6.1243350986101416e-06, 'epoch': 0.88} +{'loss': 119.4972, 'grad_norm': 1419.2234592175073, 'learning_rate': 6.120316801531359e-06, 'epoch': 0.88} +{'loss': 117.1799, 'grad_norm': 1485.7377679711594, 'learning_rate': 6.116299241757648e-06, 'epoch': 0.88} +{'loss': 119.0286, 'grad_norm': 1491.6980297286982, 'learning_rate': 6.112282420052518e-06, 'epoch': 0.88} +{'loss': 122.6637, 'grad_norm': 1587.6558616830328, 'learning_rate': 6.108266337179332e-06, 'epoch': 0.88} +{'loss': 119.5979, 'grad_norm': 1615.8389874193801, 'learning_rate': 6.10425099390132e-06, 'epoch': 0.88} +{'loss': 125.392, 'grad_norm': 6001.302460552092, 'learning_rate': 6.10023639098156e-06, 'epoch': 0.88} +{'loss': 117.6675, 'grad_norm': 1510.7064707534075, 'learning_rate': 6.096222529183008e-06, 'epoch': 0.88} +{'loss': 120.5846, 'grad_norm': 1574.702542014146, 'learning_rate': 6.092209409268466e-06, 'epoch': 0.88} +{'loss': 120.4117, 'grad_norm': 1376.180441191313, 'learning_rate': 6.088197032000595e-06, 'epoch': 0.88} 
+{'loss': 122.4068, 'grad_norm': 1726.7129891345464, 'learning_rate': 6.084185398141921e-06, 'epoch': 0.88}
+{'loss': 120.3736, 'grad_norm': 1534.6652531201019, 'learning_rate': 6.0801745084548195e-06, 'epoch': 0.88}
+[2025-05-29 12:18:28,304] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step46000 is about to be saved!
+[2025-05-29 12:18:28,382] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-46000/global_step46000/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2025-05-29 12:18:28,382] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-46000/global_step46000/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2025-05-29 12:18:28,474] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-46000/global_step46000/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2025-05-29 12:18:28,540] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-46000/global_step46000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2025-05-29 12:24:20,655] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-46000/global_step46000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2025-05-29 12:24:20,700] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-46000/global_step46000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-05-29 12:24:34,957] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step46000 is ready now!
+[2025-05-29 12:24:35,072] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step46000 is about to be saved!
+[2025-05-29 12:24:35,099] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-46000/global_step46000/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2025-05-29 12:24:35,099] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-46000/global_step46000/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2025-05-29 12:24:35,238] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-46000/global_step46000/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2025-05-29 12:24:35,324] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-46000/global_step46000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2025-05-29 12:30:19,383] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-46000/global_step46000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2025-05-29 12:30:19,448] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-46000/global_step46000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-05-29 12:30:29,321] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step46000 is ready now!
+{'loss': 120.8557, 'grad_norm': 1585.1232889484959, 'learning_rate': 6.0761643637015355e-06, 'epoch': 0.88} +{'loss': 116.4663, 'grad_norm': 1382.3837303158796, 'learning_rate': 6.072154964644168e-06, 'epoch': 0.89} +{'loss': 116.3321, 'grad_norm': 1684.7295651473082, 'learning_rate': 6.068146312044669e-06, 'epoch': 0.89} +{'loss': 119.062, 'grad_norm': 1586.4731747084452, 'learning_rate': 6.064138406664859e-06, 'epoch': 0.89} +{'loss': 119.0894, 'grad_norm': 1532.8120186572025, 'learning_rate': 6.060131249266404e-06, 'epoch': 0.89} +{'loss': 126.0543, 'grad_norm': 1395.2624994239545, 'learning_rate': 6.056124840610839e-06, 'epoch': 0.89} +{'loss': 121.0895, 'grad_norm': 1430.8439329869839, 'learning_rate': 6.052119181459554e-06, 'epoch': 0.89} +{'loss': 120.1282, 'grad_norm': 1401.2438438166332, 'learning_rate': 6.048114272573791e-06, 'epoch': 0.89} +{'loss': 117.7711, 'grad_norm': 1542.2370181750687, 'learning_rate': 6.0441101147146564e-06, 'epoch': 0.89} +{'loss': 118.588, 'grad_norm': 1585.2865924483547, 'learning_rate': 6.040106708643104e-06, 'epoch': 0.89} +{'loss': 122.1374, 'grad_norm': 1524.16478803419, 'learning_rate': 6.03610405511996e-06, 'epoch': 0.89} +{'loss': 119.4124, 'grad_norm': 1444.9897566855127, 'learning_rate': 6.032102154905897e-06, 'epoch': 0.89} +{'loss': 120.8917, 'grad_norm': 1638.5732978696935, 'learning_rate': 6.028101008761445e-06, 'epoch': 0.89} +{'loss': 120.2913, 'grad_norm': 1362.0037182920248, 'learning_rate': 6.024100617446991e-06, 'epoch': 0.89} +{'loss': 120.4558, 'grad_norm': 1561.577291893263, 'learning_rate': 6.020100981722778e-06, 'epoch': 0.89} +{'loss': 118.2416, 'grad_norm': 1662.2828511787336, 'learning_rate': 6.016102102348914e-06, 'epoch': 0.89} +{'loss': 119.5238, 'grad_norm': 1542.7764882083786, 'learning_rate': 6.012103980085351e-06, 'epoch': 0.89} +{'loss': 118.8429, 'grad_norm': 1932.6423495014928, 'learning_rate': 6.0081066156919066e-06, 'epoch': 0.89} +{'loss': 122.8001, 'grad_norm': 1586.0721267030901, 'learning_rate': 6.004110009928246e-06, 'epoch': 0.89} +{'loss': 118.342, 'grad_norm': 1459.780682455088, 'learning_rate': 6.000114163553894e-06, 'epoch': 0.89} +{'loss': 119.0849, 'grad_norm': 1541.0708286265558, 'learning_rate': 5.9961190773282385e-06, 'epoch': 0.89} +{'loss': 116.7931, 'grad_norm': 1439.6150047988488, 'learning_rate': 5.992124752010509e-06, 'epoch': 0.89} +{'loss': 124.7445, 'grad_norm': 1596.3226938547589, 'learning_rate': 5.988131188359802e-06, 'epoch': 0.89} +{'loss': 118.153, 'grad_norm': 1540.4700486650083, 'learning_rate': 5.984138387135061e-06, 'epoch': 0.89} +{'loss': 120.7565, 'grad_norm': 1480.053794456152, 'learning_rate': 5.980146349095089e-06, 'epoch': 0.89} +{'loss': 118.8959, 'grad_norm': 1637.007744533754, 'learning_rate': 5.9761550749985484e-06, 'epoch': 0.89} +{'loss': 124.8347, 'grad_norm': 1437.115331937414, 'learning_rate': 5.972164565603944e-06, 'epoch': 0.89} +{'loss': 119.0565, 'grad_norm': 1445.978435486414, 'learning_rate': 5.968174821669648e-06, 'epoch': 0.89} +{'loss': 121.7586, 'grad_norm': 1671.452068426098, 'learning_rate': 5.964185843953878e-06, 'epoch': 0.89} +{'loss': 118.9209, 'grad_norm': 1370.9564257517422, 'learning_rate': 5.96019763321471e-06, 'epoch': 0.89} +{'loss': 119.7727, 'grad_norm': 1626.6842013288654, 'learning_rate': 5.9562101902100775e-06, 'epoch': 0.89} +{'loss': 121.3714, 'grad_norm': 1673.440286867783, 'learning_rate': 5.952223515697761e-06, 'epoch': 0.89} +{'loss': 118.212, 'grad_norm': 1493.632420047772, 'learning_rate': 5.948237610435401e-06, 'epoch': 0.89} 
+{'loss': 115.9387, 'grad_norm': 1363.4598156060686, 'learning_rate': 5.944252475180487e-06, 'epoch': 0.89} +{'loss': 120.9167, 'grad_norm': 1564.2397743447054, 'learning_rate': 5.9402681106903635e-06, 'epoch': 0.89} +{'loss': 122.586, 'grad_norm': 1853.21426156551, 'learning_rate': 5.936284517722235e-06, 'epoch': 0.89} +{'loss': 118.9897, 'grad_norm': 1378.153131414202, 'learning_rate': 5.932301697033147e-06, 'epoch': 0.89} +{'loss': 117.0686, 'grad_norm': 1468.980364219876, 'learning_rate': 5.928319649380012e-06, 'epoch': 0.89} +{'loss': 116.2744, 'grad_norm': 1408.5387358145092, 'learning_rate': 5.924338375519583e-06, 'epoch': 0.89} +{'loss': 119.7092, 'grad_norm': 1321.4916879055918, 'learning_rate': 5.92035787620847e-06, 'epoch': 0.89} +{'loss': 120.0966, 'grad_norm': 1490.2898221254327, 'learning_rate': 5.916378152203146e-06, 'epoch': 0.89} +{'loss': 117.308, 'grad_norm': 1392.4332210319913, 'learning_rate': 5.912399204259921e-06, 'epoch': 0.89} +{'loss': 119.3671, 'grad_norm': 1682.7521331970063, 'learning_rate': 5.908421033134969e-06, 'epoch': 0.89} +{'loss': 119.167, 'grad_norm': 1865.4899476900366, 'learning_rate': 5.904443639584307e-06, 'epoch': 0.89} +{'loss': 120.4471, 'grad_norm': 1557.5648902004402, 'learning_rate': 5.90046702436381e-06, 'epoch': 0.89} +{'loss': 116.699, 'grad_norm': 1637.602413434123, 'learning_rate': 5.896491188229212e-06, 'epoch': 0.89} +{'loss': 117.5863, 'grad_norm': 1555.5201654125358, 'learning_rate': 5.892516131936082e-06, 'epoch': 0.89} +{'loss': 121.1491, 'grad_norm': 1641.5118771061022, 'learning_rate': 5.888541856239854e-06, 'epoch': 0.89} +{'loss': 119.4191, 'grad_norm': 1497.120493979406, 'learning_rate': 5.8845683618958105e-06, 'epoch': 0.89} +{'loss': 117.3287, 'grad_norm': 1674.5884231025505, 'learning_rate': 5.88059564965908e-06, 'epoch': 0.89} +{'loss': 121.3757, 'grad_norm': 1523.5396433915273, 'learning_rate': 5.876623720284651e-06, 'epoch': 0.89} +{'loss': 119.3536, 'grad_norm': 1852.3732577486758, 'learning_rate': 5.872652574527361e-06, 'epoch': 0.89} +{'loss': 125.7221, 'grad_norm': 1651.2933689234076, 'learning_rate': 5.8686822131418916e-06, 'epoch': 0.89} +{'loss': 117.0111, 'grad_norm': 1586.3305643665901, 'learning_rate': 5.8647126368827875e-06, 'epoch': 0.9} +{'loss': 119.4693, 'grad_norm': 1511.3347707877926, 'learning_rate': 5.860743846504429e-06, 'epoch': 0.9} +{'loss': 119.6973, 'grad_norm': 1658.479089032038, 'learning_rate': 5.85677584276106e-06, 'epoch': 0.9} +{'loss': 122.7623, 'grad_norm': 1625.5570782584352, 'learning_rate': 5.852808626406772e-06, 'epoch': 0.9} +{'loss': 118.5745, 'grad_norm': 1394.7761100740747, 'learning_rate': 5.848842198195501e-06, 'epoch': 0.9} +{'loss': 114.3349, 'grad_norm': 1896.697917486226, 'learning_rate': 5.844876558881043e-06, 'epoch': 0.9} +{'loss': 119.6247, 'grad_norm': 1578.249281853998, 'learning_rate': 5.840911709217028e-06, 'epoch': 0.9} +{'loss': 125.1916, 'grad_norm': 1607.971870538308, 'learning_rate': 5.836947649956957e-06, 'epoch': 0.9} +{'loss': 116.1277, 'grad_norm': 1702.7415342692486, 'learning_rate': 5.8329843818541665e-06, 'epoch': 0.9} +{'loss': 116.7124, 'grad_norm': 1382.8561652822464, 'learning_rate': 5.829021905661847e-06, 'epoch': 0.9} +{'loss': 120.6895, 'grad_norm': 1451.0972181646398, 'learning_rate': 5.825060222133032e-06, 'epoch': 0.9} +{'loss': 119.0312, 'grad_norm': 1532.99007949686, 'learning_rate': 5.821099332020614e-06, 'epoch': 0.9} +{'loss': 119.1985, 'grad_norm': 1639.2856941688412, 'learning_rate': 5.817139236077336e-06, 'epoch': 0.9} +{'loss': 
121.0666, 'grad_norm': 1590.1115893243368, 'learning_rate': 5.813179935055782e-06, 'epoch': 0.9} +{'loss': 120.2387, 'grad_norm': 1456.9430772052035, 'learning_rate': 5.809221429708385e-06, 'epoch': 0.9} +{'loss': 120.6546, 'grad_norm': 1690.3072708938205, 'learning_rate': 5.805263720787426e-06, 'epoch': 0.9} +{'loss': 121.8316, 'grad_norm': 1675.6253760412535, 'learning_rate': 5.801306809045044e-06, 'epoch': 0.9} +{'loss': 118.082, 'grad_norm': 1309.7387197727558, 'learning_rate': 5.797350695233224e-06, 'epoch': 0.9} +{'loss': 119.3271, 'grad_norm': 1581.758533978753, 'learning_rate': 5.793395380103792e-06, 'epoch': 0.9} +{'loss': 120.9247, 'grad_norm': 1516.5753973843293, 'learning_rate': 5.7894408644084274e-06, 'epoch': 0.9} +{'loss': 119.8621, 'grad_norm': 1533.7215818218617, 'learning_rate': 5.785487148898652e-06, 'epoch': 0.9} +{'loss': 119.0115, 'grad_norm': 1485.86028467387, 'learning_rate': 5.781534234325843e-06, 'epoch': 0.9} +{'loss': 117.3806, 'grad_norm': 1721.5570179098422, 'learning_rate': 5.777582121441227e-06, 'epoch': 0.9} +{'loss': 121.3561, 'grad_norm': 1591.013853447145, 'learning_rate': 5.773630810995872e-06, 'epoch': 0.9} +{'loss': 120.5287, 'grad_norm': 1329.7452181900312, 'learning_rate': 5.769680303740692e-06, 'epoch': 0.9} +{'loss': 124.2985, 'grad_norm': 1371.304368620902, 'learning_rate': 5.765730600426449e-06, 'epoch': 0.9} +{'loss': 116.3226, 'grad_norm': 1682.4783485107794, 'learning_rate': 5.761781701803764e-06, 'epoch': 0.9} +{'loss': 119.7397, 'grad_norm': 1408.4899435631032, 'learning_rate': 5.757833608623087e-06, 'epoch': 0.9} +{'loss': 133.8524, 'grad_norm': 28040.716504847, 'learning_rate': 5.753886321634731e-06, 'epoch': 0.9} +{'loss': 118.8914, 'grad_norm': 1559.3873321322576, 'learning_rate': 5.749939841588846e-06, 'epoch': 0.9} +{'loss': 120.3956, 'grad_norm': 1519.9741049001389, 'learning_rate': 5.745994169235431e-06, 'epoch': 0.9} +{'loss': 119.9585, 'grad_norm': 1651.6733619306974, 'learning_rate': 5.742049305324329e-06, 'epoch': 0.9} +{'loss': 117.0802, 'grad_norm': 1553.5657982448074, 'learning_rate': 5.7381052506052325e-06, 'epoch': 0.9} +{'loss': 121.5495, 'grad_norm': 1619.9325260196283, 'learning_rate': 5.734162005827688e-06, 'epoch': 0.9} +{'loss': 115.9642, 'grad_norm': 1404.2603839159638, 'learning_rate': 5.730219571741074e-06, 'epoch': 0.9} +{'loss': 121.1408, 'grad_norm': 1667.933516457981, 'learning_rate': 5.7262779490946204e-06, 'epoch': 0.9} +{'loss': 117.3538, 'grad_norm': 1497.9447023698967, 'learning_rate': 5.722337138637402e-06, 'epoch': 0.9} +{'loss': 119.0434, 'grad_norm': 1674.3098178353978, 'learning_rate': 5.718397141118341e-06, 'epoch': 0.9} +{'loss': 117.7942, 'grad_norm': 1416.3593172063818, 'learning_rate': 5.7144579572862115e-06, 'epoch': 0.9} +{'loss': 118.9144, 'grad_norm': 1628.2870926011592, 'learning_rate': 5.71051958788962e-06, 'epoch': 0.9} +{'loss': 119.3042, 'grad_norm': 1624.0399380652311, 'learning_rate': 5.7065820336770284e-06, 'epoch': 0.9} +{'loss': 117.4748, 'grad_norm': 1562.1058106742796, 'learning_rate': 5.7026452953967315e-06, 'epoch': 0.9} +{'loss': 113.4422, 'grad_norm': 1557.5444413570106, 'learning_rate': 5.698709373796882e-06, 'epoch': 0.9} +{'loss': 118.2321, 'grad_norm': 1575.4097508519, 'learning_rate': 5.6947742696254794e-06, 'epoch': 0.9} +{'loss': 120.4294, 'grad_norm': 1507.807935890066, 'learning_rate': 5.690839983630354e-06, 'epoch': 0.9} +{'loss': 116.2918, 'grad_norm': 1411.9458134430567, 'learning_rate': 5.68690651655919e-06, 'epoch': 0.9} +{'loss': 117.1647, 'grad_norm': 
1588.2481184129515, 'learning_rate': 5.682973869159509e-06, 'epoch': 0.9} +{'loss': 118.5864, 'grad_norm': 1428.533801427258, 'learning_rate': 5.679042042178684e-06, 'epoch': 0.9} +{'loss': 122.9657, 'grad_norm': 2160.227922506122, 'learning_rate': 5.675111036363935e-06, 'epoch': 0.9} +{'loss': 116.9291, 'grad_norm': 1447.7901058368602, 'learning_rate': 5.671180852462317e-06, 'epoch': 0.9} +{'loss': 118.9068, 'grad_norm': 1639.169063936548, 'learning_rate': 5.667251491220731e-06, 'epoch': 0.9} +{'loss': 117.5218, 'grad_norm': 1462.7236830041438, 'learning_rate': 5.6633229533859194e-06, 'epoch': 0.9} +{'loss': 116.8392, 'grad_norm': 1697.0053911463262, 'learning_rate': 5.659395239704477e-06, 'epoch': 0.91} +{'loss': 119.6912, 'grad_norm': 1608.0742708834618, 'learning_rate': 5.655468350922839e-06, 'epoch': 0.91} +{'loss': 120.7083, 'grad_norm': 1474.907823815801, 'learning_rate': 5.65154228778728e-06, 'epoch': 0.91} +{'loss': 123.035, 'grad_norm': 1504.6443343551284, 'learning_rate': 5.647617051043916e-06, 'epoch': 0.91} +{'loss': 118.4743, 'grad_norm': 1538.2215994315302, 'learning_rate': 5.643692641438707e-06, 'epoch': 0.91} +{'loss': 119.7843, 'grad_norm': 1717.854660159738, 'learning_rate': 5.639769059717462e-06, 'epoch': 0.91} +{'loss': 117.3209, 'grad_norm': 1447.6119704076407, 'learning_rate': 5.635846306625833e-06, 'epoch': 0.91} +{'loss': 117.2082, 'grad_norm': 1455.1934885830526, 'learning_rate': 5.631924382909304e-06, 'epoch': 0.91} +{'loss': 117.9178, 'grad_norm': 1567.6806004014484, 'learning_rate': 5.6280032893132105e-06, 'epoch': 0.91} +{'loss': 116.5286, 'grad_norm': 1557.8441405517217, 'learning_rate': 5.6240830265827215e-06, 'epoch': 0.91} +{'loss': 116.2359, 'grad_norm': 1828.9081549876284, 'learning_rate': 5.620163595462859e-06, 'epoch': 0.91} +{'loss': 121.2459, 'grad_norm': 1575.2851719308774, 'learning_rate': 5.616244996698484e-06, 'epoch': 0.91} +{'loss': 117.6413, 'grad_norm': 1675.398368181119, 'learning_rate': 5.612327231034296e-06, 'epoch': 0.91} +{'loss': 122.127, 'grad_norm': 1591.8594351965282, 'learning_rate': 5.608410299214835e-06, 'epoch': 0.91} +{'loss': 126.1225, 'grad_norm': 1666.7166264668863, 'learning_rate': 5.604494201984485e-06, 'epoch': 0.91} +{'loss': 116.5195, 'grad_norm': 1690.7983842768556, 'learning_rate': 5.6005789400874685e-06, 'epoch': 0.91} +{'loss': 121.2355, 'grad_norm': 1670.0563994759157, 'learning_rate': 5.5966645142678555e-06, 'epoch': 0.91} +{'loss': 119.2165, 'grad_norm': 1599.088955441129, 'learning_rate': 5.592750925269558e-06, 'epoch': 0.91} +{'loss': 116.9707, 'grad_norm': 1439.6680480955545, 'learning_rate': 5.58883817383632e-06, 'epoch': 0.91} +{'loss': 118.7501, 'grad_norm': 1376.2628971854717, 'learning_rate': 5.584926260711732e-06, 'epoch': 0.91} +{'loss': 116.4207, 'grad_norm': 1536.2756859838485, 'learning_rate': 5.581015186639218e-06, 'epoch': 0.91} +{'loss': 118.6002, 'grad_norm': 1652.1347714549843, 'learning_rate': 5.577104952362055e-06, 'epoch': 0.91} +{'loss': 121.3181, 'grad_norm': 1417.0624862150084, 'learning_rate': 5.5731955586233566e-06, 'epoch': 0.91} +{'loss': 118.6499, 'grad_norm': 1477.0540030486354, 'learning_rate': 5.569287006166071e-06, 'epoch': 0.91} +{'loss': 120.889, 'grad_norm': 1682.6422444439509, 'learning_rate': 5.5653792957329885e-06, 'epoch': 0.91} +{'loss': 115.6098, 'grad_norm': 1643.4905457053069, 'learning_rate': 5.561472428066739e-06, 'epoch': 0.91} +{'loss': 116.0993, 'grad_norm': 1328.7112136385401, 'learning_rate': 5.557566403909794e-06, 'epoch': 0.91} +{'loss': 118.2599, 'grad_norm': 
1582.1478916627032, 'learning_rate': 5.553661224004472e-06, 'epoch': 0.91} +{'loss': 114.4967, 'grad_norm': 1465.605316142713, 'learning_rate': 5.549756889092917e-06, 'epoch': 0.91} +{'loss': 116.0175, 'grad_norm': 1472.5999212807071, 'learning_rate': 5.545853399917121e-06, 'epoch': 0.91} +{'loss': 121.7048, 'grad_norm': 1671.7983995541715, 'learning_rate': 5.541950757218908e-06, 'epoch': 0.91} +{'loss': 118.6059, 'grad_norm': 1476.79932538679, 'learning_rate': 5.538048961739951e-06, 'epoch': 0.91} +{'loss': 115.5423, 'grad_norm': 1686.4627930363313, 'learning_rate': 5.534148014221761e-06, 'epoch': 0.91} +{'loss': 117.2204, 'grad_norm': 1438.8631322936403, 'learning_rate': 5.53024791540568e-06, 'epoch': 0.91} +{'loss': 128.2576, 'grad_norm': 1484.818535838287, 'learning_rate': 5.526348666032894e-06, 'epoch': 0.91} +{'loss': 118.1756, 'grad_norm': 1622.9954410753808, 'learning_rate': 5.5224502668444206e-06, 'epoch': 0.91} +{'loss': 116.4052, 'grad_norm': 1714.0668953042516, 'learning_rate': 5.518552718581126e-06, 'epoch': 0.91} +{'loss': 118.0438, 'grad_norm': 1572.1072443402638, 'learning_rate': 5.514656021983716e-06, 'epoch': 0.91} +{'loss': 118.4876, 'grad_norm': 1716.799365681272, 'learning_rate': 5.510760177792724e-06, 'epoch': 0.91} +{'loss': 120.3698, 'grad_norm': 1679.487380029543, 'learning_rate': 5.506865186748527e-06, 'epoch': 0.91} +{'loss': 121.1773, 'grad_norm': 1573.9068103378556, 'learning_rate': 5.502971049591332e-06, 'epoch': 0.91} +{'loss': 117.8166, 'grad_norm': 1611.11860928472, 'learning_rate': 5.499077767061201e-06, 'epoch': 0.91} +{'loss': 120.9648, 'grad_norm': 1418.5707645988255, 'learning_rate': 5.4951853398980216e-06, 'epoch': 0.91} +{'loss': 116.5344, 'grad_norm': 1403.3012563984041, 'learning_rate': 5.491293768841519e-06, 'epoch': 0.91} +{'loss': 118.6063, 'grad_norm': 1558.4768871492415, 'learning_rate': 5.487403054631259e-06, 'epoch': 0.91} +{'loss': 118.5631, 'grad_norm': 1504.298949591543, 'learning_rate': 5.483513198006638e-06, 'epoch': 0.91} +{'loss': 123.0781, 'grad_norm': 1467.0056615419178, 'learning_rate': 5.479624199706897e-06, 'epoch': 0.91} +{'loss': 116.8469, 'grad_norm': 1533.087191383909, 'learning_rate': 5.475736060471117e-06, 'epoch': 0.91} +{'loss': 118.1169, 'grad_norm': 1633.2460455775513, 'learning_rate': 5.471848781038205e-06, 'epoch': 0.91} +{'loss': 115.3213, 'grad_norm': 1407.192741765947, 'learning_rate': 5.4679623621469116e-06, 'epoch': 0.91} +{'loss': 119.6728, 'grad_norm': 1861.6380519586066, 'learning_rate': 5.464076804535816e-06, 'epoch': 0.91} +{'loss': 118.8871, 'grad_norm': 1725.3003851695082, 'learning_rate': 5.460192108943348e-06, 'epoch': 0.91} +{'loss': 119.2191, 'grad_norm': 1665.661571694492, 'learning_rate': 5.456308276107757e-06, 'epoch': 0.92} +{'loss': 115.7916, 'grad_norm': 1640.7188817336191, 'learning_rate': 5.4524253067671465e-06, 'epoch': 0.92} +{'loss': 117.5675, 'grad_norm': 1639.4446701428014, 'learning_rate': 5.44854320165944e-06, 'epoch': 0.92} +{'loss': 117.6143, 'grad_norm': 1461.8690233318216, 'learning_rate': 5.444661961522399e-06, 'epoch': 0.92} +{'loss': 118.7611, 'grad_norm': 1362.7739289561587, 'learning_rate': 5.4407815870936334e-06, 'epoch': 0.92} +{'loss': 116.7051, 'grad_norm': 1493.5278111465382, 'learning_rate': 5.436902079110571e-06, 'epoch': 0.92} +{'loss': 120.5018, 'grad_norm': 1391.5878474992705, 'learning_rate': 5.433023438310492e-06, 'epoch': 0.92} +{'loss': 118.6904, 'grad_norm': 1686.7142424069978, 'learning_rate': 5.4291456654305e-06, 'epoch': 0.92} +{'loss': 116.306, 'grad_norm': 
1502.115767372949, 'learning_rate': 5.425268761207532e-06, 'epoch': 0.92} +{'loss': 118.2124, 'grad_norm': 1482.0798681817828, 'learning_rate': 5.4213927263783725e-06, 'epoch': 0.92} +{'loss': 116.2898, 'grad_norm': 1682.4931453274708, 'learning_rate': 5.417517561679626e-06, 'epoch': 0.92} +{'loss': 117.6512, 'grad_norm': 1737.550344489076, 'learning_rate': 5.413643267847748e-06, 'epoch': 0.92} +{'loss': 115.3887, 'grad_norm': 1622.7216514430338, 'learning_rate': 5.409769845619013e-06, 'epoch': 0.92} +{'loss': 115.289, 'grad_norm': 1453.9892232512916, 'learning_rate': 5.4058972957295345e-06, 'epoch': 0.92} +{'loss': 117.1857, 'grad_norm': 1539.2353227595977, 'learning_rate': 5.402025618915268e-06, 'epoch': 0.92} +{'loss': 115.4653, 'grad_norm': 1473.685343671591, 'learning_rate': 5.398154815911989e-06, 'epoch': 0.92} +{'loss': 117.9891, 'grad_norm': 1625.4188313554384, 'learning_rate': 5.3942848874553235e-06, 'epoch': 0.92} +{'loss': 115.1535, 'grad_norm': 1428.0091578224797, 'learning_rate': 5.39041583428072e-06, 'epoch': 0.92} +{'loss': 116.618, 'grad_norm': 1302.4339976827207, 'learning_rate': 5.38654765712346e-06, 'epoch': 0.92} +{'loss': 115.0911, 'grad_norm': 1538.765898787972, 'learning_rate': 5.3826803567186656e-06, 'epoch': 0.92} +{'loss': 121.0972, 'grad_norm': 2528.372022954965, 'learning_rate': 5.378813933801286e-06, 'epoch': 0.92} +{'loss': 116.3098, 'grad_norm': 1434.1401429424748, 'learning_rate': 5.37494838910611e-06, 'epoch': 0.92} +{'loss': 115.6877, 'grad_norm': 1389.4647539986086, 'learning_rate': 5.3710837233677525e-06, 'epoch': 0.92} +{'loss': 118.0066, 'grad_norm': 1667.5608050005255, 'learning_rate': 5.367219937320663e-06, 'epoch': 0.92} +{'loss': 117.8424, 'grad_norm': 1643.262087063445, 'learning_rate': 5.3633570316991315e-06, 'epoch': 0.92} +{'loss': 120.6111, 'grad_norm': 1632.1033530815346, 'learning_rate': 5.359495007237268e-06, 'epoch': 0.92} +{'loss': 117.9233, 'grad_norm': 1557.6644165245268, 'learning_rate': 5.355633864669027e-06, 'epoch': 0.92} +{'loss': 121.5949, 'grad_norm': 1444.952440211182, 'learning_rate': 5.351773604728189e-06, 'epoch': 0.92} +{'loss': 119.6926, 'grad_norm': 1462.4596658307755, 'learning_rate': 5.347914228148361e-06, 'epoch': 0.92} +{'loss': 111.3259, 'grad_norm': 1382.8473362791708, 'learning_rate': 5.344055735663001e-06, 'epoch': 0.92} +{'loss': 116.8441, 'grad_norm': 1486.4790412657896, 'learning_rate': 5.3401981280053745e-06, 'epoch': 0.92} +{'loss': 119.2072, 'grad_norm': 1683.7045577151598, 'learning_rate': 5.336341405908603e-06, 'epoch': 0.92} +{'loss': 117.9103, 'grad_norm': 1533.0290384299506, 'learning_rate': 5.332485570105621e-06, 'epoch': 0.92} +{'loss': 114.5179, 'grad_norm': 1426.526061305341, 'learning_rate': 5.328630621329199e-06, 'epoch': 0.92} +{'loss': 126.8431, 'grad_norm': 1625.6013565668316, 'learning_rate': 5.324776560311949e-06, 'epoch': 0.92} +{'loss': 115.7566, 'grad_norm': 1365.1594659211057, 'learning_rate': 5.320923387786303e-06, 'epoch': 0.92} +{'loss': 115.0927, 'grad_norm': 1405.0144341368311, 'learning_rate': 5.317071104484524e-06, 'epoch': 0.92} +{'loss': 117.2279, 'grad_norm': 1565.6498414429918, 'learning_rate': 5.313219711138717e-06, 'epoch': 0.92} +{'loss': 116.154, 'grad_norm': 1578.6379169448699, 'learning_rate': 5.309369208480806e-06, 'epoch': 0.92} +{'loss': 118.5215, 'grad_norm': 1357.8123000701812, 'learning_rate': 5.305519597242555e-06, 'epoch': 0.92} +{'loss': 117.7968, 'grad_norm': 1545.9535190937606, 'learning_rate': 5.301670878155552e-06, 'epoch': 0.92} +{'loss': 119.6328, 
'grad_norm': 1659.5102905426618, 'learning_rate': 5.297823051951211e-06, 'epoch': 0.92} +{'loss': 115.2504, 'grad_norm': 1527.7409726462133, 'learning_rate': 5.2939761193607945e-06, 'epoch': 0.92} +[2025-05-29 14:18:59,594] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step48000 is about to be saved! +[2025-05-29 14:18:59,680] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-48000/global_step48000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-29 14:18:59,680] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-48000/global_step48000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-29 14:18:59,814] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-48000/global_step48000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-29 14:18:59,940] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-48000/global_step48000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-29 14:24:43,963] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-48000/global_step48000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2025-05-29 14:24:44,031] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-48000/global_step48000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-29 14:24:44,043] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step48000 is ready now! +[2025-05-29 14:24:44,206] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step48000 is about to be saved! +[2025-05-29 14:24:44,232] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-48000/global_step48000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-29 14:24:44,232] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-48000/global_step48000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-29 14:24:44,287] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-48000/global_step48000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-29 14:24:45,079] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-48000/global_step48000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-29 14:30:48,576] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-48000/global_step48000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2025-05-29 14:30:48,772] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-48000/global_step48000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-29 14:30:59,830] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step48000 is ready now! 
+{'loss': 119.0513, 'grad_norm': 1769.37055985582, 'learning_rate': 5.290130081115372e-06, 'epoch': 0.92} +{'loss': 118.3235, 'grad_norm': 1509.6729375923478, 'learning_rate': 5.286284937945866e-06, 'epoch': 0.92} +{'loss': 117.3566, 'grad_norm': 1388.2659583174595, 'learning_rate': 5.2824406905830115e-06, 'epoch': 0.92} +{'loss': 119.714, 'grad_norm': 1499.1852490428096, 'learning_rate': 5.2785973397573766e-06, 'epoch': 0.92} +{'loss': 113.8628, 'grad_norm': 1369.630822966051, 'learning_rate': 5.274754886199367e-06, 'epoch': 0.92} +{'loss': 116.3204, 'grad_norm': 1710.9160731976724, 'learning_rate': 5.270913330639207e-06, 'epoch': 0.92} +{'loss': 113.8798, 'grad_norm': 1608.188287616272, 'learning_rate': 5.2670726738069634e-06, 'epoch': 0.92} +{'loss': 116.2394, 'grad_norm': 1492.9591603167335, 'learning_rate': 5.263232916432519e-06, 'epoch': 0.92} +{'loss': 119.4055, 'grad_norm': 1694.9329936182505, 'learning_rate': 5.25939405924559e-06, 'epoch': 0.92} +{'loss': 119.802, 'grad_norm': 1732.1708225803268, 'learning_rate': 5.25555610297573e-06, 'epoch': 0.93} +{'loss': 119.7799, 'grad_norm': 1771.709450682253, 'learning_rate': 5.2517190483523036e-06, 'epoch': 0.93} +{'loss': 113.019, 'grad_norm': 1726.6373980128249, 'learning_rate': 5.247882896104525e-06, 'epoch': 0.93} +{'loss': 116.8943, 'grad_norm': 1426.3843071393112, 'learning_rate': 5.244047646961422e-06, 'epoch': 0.93} +{'loss': 116.2907, 'grad_norm': 1722.6618445552206, 'learning_rate': 5.240213301651852e-06, 'epoch': 0.93} +{'loss': 116.1004, 'grad_norm': 1440.968583985474, 'learning_rate': 5.2363798609045115e-06, 'epoch': 0.93} +{'loss': 118.538, 'grad_norm': 1538.6006446035851, 'learning_rate': 5.232547325447908e-06, 'epoch': 0.93} +{'loss': 115.8984, 'grad_norm': 1448.6814125795268, 'learning_rate': 5.228715696010398e-06, 'epoch': 0.93} +{'loss': 117.1385, 'grad_norm': 1480.084708066596, 'learning_rate': 5.224884973320146e-06, 'epoch': 0.93} +{'loss': 115.9785, 'grad_norm': 1567.2936230011223, 'learning_rate': 5.2210551581051516e-06, 'epoch': 0.93} +{'loss': 117.1378, 'grad_norm': 1486.020397844992, 'learning_rate': 5.21722625109325e-06, 'epoch': 0.93} +{'loss': 116.727, 'grad_norm': 1439.764654884647, 'learning_rate': 5.213398253012087e-06, 'epoch': 0.93} +{'loss': 117.9755, 'grad_norm': 1701.349710039242, 'learning_rate': 5.209571164589158e-06, 'epoch': 0.93} +{'loss': 117.168, 'grad_norm': 1487.8212630278456, 'learning_rate': 5.205744986551763e-06, 'epoch': 0.93} +{'loss': 126.8661, 'grad_norm': 1536.4779210854942, 'learning_rate': 5.201919719627042e-06, 'epoch': 0.93} +{'loss': 117.5909, 'grad_norm': 1594.9822578015744, 'learning_rate': 5.198095364541954e-06, 'epoch': 0.93} +{'loss': 117.8262, 'grad_norm': 1425.7641467840847, 'learning_rate': 5.194271922023294e-06, 'epoch': 0.93} +{'loss': 116.6719, 'grad_norm': 1613.7945520818973, 'learning_rate': 5.190449392797682e-06, 'epoch': 0.93} +{'loss': 116.7869, 'grad_norm': 1512.9135114777873, 'learning_rate': 5.1866277775915575e-06, 'epoch': 0.93} +{'loss': 115.7048, 'grad_norm': 1467.2450804576804, 'learning_rate': 5.182807077131191e-06, 'epoch': 0.93} +{'loss': 118.4908, 'grad_norm': 1509.5456160700874, 'learning_rate': 5.178987292142674e-06, 'epoch': 0.93} +{'loss': 117.3417, 'grad_norm': 1433.9781725289834, 'learning_rate': 5.175168423351935e-06, 'epoch': 0.93} +{'loss': 117.4876, 'grad_norm': 1757.606813283585, 'learning_rate': 5.171350471484716e-06, 'epoch': 0.93} +{'loss': 116.2637, 'grad_norm': 1404.2436002213658, 'learning_rate': 5.167533437266598e-06, 'epoch': 0.93} 
+{'loss': 119.9294, 'grad_norm': 1638.635414795771, 'learning_rate': 5.163717321422974e-06, 'epoch': 0.93} +{'loss': 112.8596, 'grad_norm': 1414.3961636786846, 'learning_rate': 5.159902124679066e-06, 'epoch': 0.93} +{'loss': 121.5992, 'grad_norm': 1641.7081413778888, 'learning_rate': 5.156087847759933e-06, 'epoch': 0.93} +{'loss': 121.9122, 'grad_norm': 1603.3863448911939, 'learning_rate': 5.152274491390441e-06, 'epoch': 0.93} +{'loss': 112.8364, 'grad_norm': 1563.2900590429938, 'learning_rate': 5.148462056295298e-06, 'epoch': 0.93} +{'loss': 118.3221, 'grad_norm': 1404.233617896539, 'learning_rate': 5.144650543199025e-06, 'epoch': 0.93} +{'loss': 117.0353, 'grad_norm': 1433.2561721458962, 'learning_rate': 5.140839952825968e-06, 'epoch': 0.93} +{'loss': 111.8769, 'grad_norm': 1314.0557261070821, 'learning_rate': 5.137030285900309e-06, 'epoch': 0.93} +{'loss': 117.9931, 'grad_norm': 1478.4096062581152, 'learning_rate': 5.1332215431460405e-06, 'epoch': 0.93} +{'loss': 116.4159, 'grad_norm': 1824.1757737183373, 'learning_rate': 5.1294137252869935e-06, 'epoch': 0.93} +{'loss': 117.9768, 'grad_norm': 1488.845254040143, 'learning_rate': 5.12560683304681e-06, 'epoch': 0.93} +{'loss': 115.5903, 'grad_norm': 1430.4562783654358, 'learning_rate': 5.121800867148961e-06, 'epoch': 0.93} +{'loss': 119.0031, 'grad_norm': 1642.0136938415226, 'learning_rate': 5.117995828316744e-06, 'epoch': 0.93} +{'loss': 115.5563, 'grad_norm': 1437.6863515819082, 'learning_rate': 5.114191717273277e-06, 'epoch': 0.93} +{'loss': 111.5152, 'grad_norm': 1548.2136159360136, 'learning_rate': 5.110388534741508e-06, 'epoch': 0.93} +{'loss': 116.9412, 'grad_norm': 1563.4679353153213, 'learning_rate': 5.1065862814442e-06, 'epoch': 0.93} +{'loss': 116.7768, 'grad_norm': 1367.7395087190607, 'learning_rate': 5.102784958103939e-06, 'epoch': 0.93} +{'loss': 118.1029, 'grad_norm': 1501.9047184517813, 'learning_rate': 5.098984565443148e-06, 'epoch': 0.93} +{'loss': 118.3706, 'grad_norm': 1513.4763635645972, 'learning_rate': 5.095185104184053e-06, 'epoch': 0.93} +{'loss': 114.9024, 'grad_norm': 1517.244724373099, 'learning_rate': 5.091386575048722e-06, 'epoch': 0.93} +{'loss': 121.0167, 'grad_norm': 1772.8417406329781, 'learning_rate': 5.0875889787590345e-06, 'epoch': 0.93} +{'loss': 118.8679, 'grad_norm': 1477.599097245853, 'learning_rate': 5.083792316036691e-06, 'epoch': 0.93} +{'loss': 117.615, 'grad_norm': 1456.237995828082, 'learning_rate': 5.0799965876032264e-06, 'epoch': 0.93} +{'loss': 115.0603, 'grad_norm': 1667.1858104388425, 'learning_rate': 5.0762017941799845e-06, 'epoch': 0.93} +{'loss': 115.2922, 'grad_norm': 1359.3258985334826, 'learning_rate': 5.072407936488145e-06, 'epoch': 0.93} +{'loss': 113.8147, 'grad_norm': 1392.56278759809, 'learning_rate': 5.068615015248698e-06, 'epoch': 0.93} +{'loss': 114.3841, 'grad_norm': 1527.7217523446266, 'learning_rate': 5.06482303118246e-06, 'epoch': 0.93} +{'loss': 117.1534, 'grad_norm': 1748.4810463618273, 'learning_rate': 5.061031985010065e-06, 'epoch': 0.93} +{'loss': 113.8527, 'grad_norm': 1326.5723774451428, 'learning_rate': 5.057241877451979e-06, 'epoch': 0.94} +{'loss': 117.6719, 'grad_norm': 1614.8810259788156, 'learning_rate': 5.053452709228488e-06, 'epoch': 0.94} +{'loss': 119.5042, 'grad_norm': 1710.275695587888, 'learning_rate': 5.049664481059688e-06, 'epoch': 0.94} +{'loss': 115.5046, 'grad_norm': 1628.4445042173938, 'learning_rate': 5.045877193665508e-06, 'epoch': 0.94} +{'loss': 116.6796, 'grad_norm': 1653.238166997296, 'learning_rate': 5.042090847765688e-06, 'epoch': 0.94} 
+{'loss': 114.379, 'grad_norm': 1802.5567665170006, 'learning_rate': 5.0383054440797985e-06, 'epoch': 0.94} +{'loss': 114.7528, 'grad_norm': 1722.1786560864841, 'learning_rate': 5.034520983327231e-06, 'epoch': 0.94} +{'loss': 118.0437, 'grad_norm': 1419.055980491394, 'learning_rate': 5.030737466227191e-06, 'epoch': 0.94} +{'loss': 114.1707, 'grad_norm': 1514.1568184731295, 'learning_rate': 5.02695489349871e-06, 'epoch': 0.94} +{'loss': 116.6626, 'grad_norm': 1499.55675531486, 'learning_rate': 5.02317326586063e-06, 'epoch': 0.94} +{'loss': 114.5841, 'grad_norm': 1576.6509780902843, 'learning_rate': 5.019392584031628e-06, 'epoch': 0.94} +{'loss': 113.1819, 'grad_norm': 1316.9961744922964, 'learning_rate': 5.015612848730198e-06, 'epoch': 0.94} +{'loss': 116.1795, 'grad_norm': 1479.9848709335108, 'learning_rate': 5.011834060674648e-06, 'epoch': 0.94} +{'loss': 116.5348, 'grad_norm': 1570.9716513125277, 'learning_rate': 5.008056220583106e-06, 'epoch': 0.94} +{'loss': 115.1433, 'grad_norm': 1489.8515226551951, 'learning_rate': 5.00427932917352e-06, 'epoch': 0.94} +{'loss': 115.3371, 'grad_norm': 1472.8568742971074, 'learning_rate': 5.000503387163664e-06, 'epoch': 0.94} +{'loss': 114.7046, 'grad_norm': 1437.7039559940458, 'learning_rate': 4.996728395271133e-06, 'epoch': 0.94} +{'loss': 119.8167, 'grad_norm': 1578.625747831156, 'learning_rate': 4.992954354213331e-06, 'epoch': 0.94} +{'loss': 114.8673, 'grad_norm': 1469.8411119458933, 'learning_rate': 4.989181264707485e-06, 'epoch': 0.94} +{'loss': 113.4577, 'grad_norm': 1497.76996717879, 'learning_rate': 4.985409127470643e-06, 'epoch': 0.94} +{'loss': 118.2154, 'grad_norm': 1628.9520712165597, 'learning_rate': 4.981637943219671e-06, 'epoch': 0.94} +{'loss': 119.7538, 'grad_norm': 2364.7593705665213, 'learning_rate': 4.977867712671263e-06, 'epoch': 0.94} +{'loss': 114.192, 'grad_norm': 1486.495499388884, 'learning_rate': 4.974098436541917e-06, 'epoch': 0.94} +{'loss': 116.2455, 'grad_norm': 1607.3379660782464, 'learning_rate': 4.9703301155479555e-06, 'epoch': 0.94} +{'loss': 117.2561, 'grad_norm': 1550.7043242827408, 'learning_rate': 4.966562750405517e-06, 'epoch': 0.94} +{'loss': 115.3665, 'grad_norm': 1547.821326213576, 'learning_rate': 4.962796341830566e-06, 'epoch': 0.94} +{'loss': 118.9341, 'grad_norm': 1423.7943775274261, 'learning_rate': 4.959030890538883e-06, 'epoch': 0.94} +{'loss': 113.9113, 'grad_norm': 1674.3875763734995, 'learning_rate': 4.955266397246061e-06, 'epoch': 0.94} +{'loss': 113.7418, 'grad_norm': 1777.694254937776, 'learning_rate': 4.951502862667516e-06, 'epoch': 0.94} +{'loss': 114.6381, 'grad_norm': 1357.3676439235974, 'learning_rate': 4.947740287518471e-06, 'epoch': 0.94} +{'loss': 118.2854, 'grad_norm': 1428.3347904032833, 'learning_rate': 4.943978672513984e-06, 'epoch': 0.94} +{'loss': 113.8807, 'grad_norm': 1353.3264010646612, 'learning_rate': 4.940218018368924e-06, 'epoch': 0.94} +{'loss': 114.349, 'grad_norm': 1344.71550288635, 'learning_rate': 4.936458325797972e-06, 'epoch': 0.94} +{'loss': 114.6093, 'grad_norm': 1627.2024358590102, 'learning_rate': 4.932699595515631e-06, 'epoch': 0.94} +{'loss': 117.7034, 'grad_norm': 1911.9936369549089, 'learning_rate': 4.928941828236218e-06, 'epoch': 0.94} +{'loss': 113.7138, 'grad_norm': 1630.1971304585434, 'learning_rate': 4.9251850246738644e-06, 'epoch': 0.94} +{'loss': 116.2365, 'grad_norm': 1636.1974049700211, 'learning_rate': 4.921429185542529e-06, 'epoch': 0.94} +{'loss': 115.4178, 'grad_norm': 1526.9994379382367, 'learning_rate': 4.917674311555984e-06, 'epoch': 0.94} 
+{'loss': 118.6052, 'grad_norm': 1426.2019277877773, 'learning_rate': 4.913920403427812e-06, 'epoch': 0.94} +{'loss': 116.6065, 'grad_norm': 1354.1879164357342, 'learning_rate': 4.910167461871414e-06, 'epoch': 0.94} +{'loss': 115.2056, 'grad_norm': 1578.2054782233918, 'learning_rate': 4.906415487600007e-06, 'epoch': 0.94} +{'loss': 114.7946, 'grad_norm': 1615.2834861051151, 'learning_rate': 4.9026644813266285e-06, 'epoch': 0.94} +{'loss': 116.2821, 'grad_norm': 1764.6322823721441, 'learning_rate': 4.898914443764134e-06, 'epoch': 0.94} +{'loss': 114.1762, 'grad_norm': 1458.2651258103822, 'learning_rate': 4.895165375625185e-06, 'epoch': 0.94} +{'loss': 118.1206, 'grad_norm': 1453.7379644450011, 'learning_rate': 4.891417277622265e-06, 'epoch': 0.94} +{'loss': 113.8716, 'grad_norm': 1723.1506718812886, 'learning_rate': 4.887670150467671e-06, 'epoch': 0.94} +{'loss': 116.2815, 'grad_norm': 1878.093693474404, 'learning_rate': 4.8839239948735175e-06, 'epoch': 0.94} +{'loss': 118.3308, 'grad_norm': 1718.0545576459162, 'learning_rate': 4.880178811551737e-06, 'epoch': 0.94} +{'loss': 114.6106, 'grad_norm': 1470.8891849722845, 'learning_rate': 4.8764346012140715e-06, 'epoch': 0.94} +{'loss': 116.0882, 'grad_norm': 1561.7050691090938, 'learning_rate': 4.872691364572082e-06, 'epoch': 0.94} +{'loss': 116.6659, 'grad_norm': 1517.7965147558932, 'learning_rate': 4.868949102337136e-06, 'epoch': 0.94} +{'loss': 115.563, 'grad_norm': 1750.7048799378877, 'learning_rate': 4.865207815220427e-06, 'epoch': 0.94} +{'loss': 115.5461, 'grad_norm': 1907.160211589921, 'learning_rate': 4.861467503932965e-06, 'epoch': 0.95} +{'loss': 114.6503, 'grad_norm': 1564.0785952629042, 'learning_rate': 4.857728169185564e-06, 'epoch': 0.95} +{'loss': 113.8059, 'grad_norm': 1402.0867795906743, 'learning_rate': 4.8539898116888555e-06, 'epoch': 0.95} +{'loss': 113.0458, 'grad_norm': 1510.9523309091442, 'learning_rate': 4.850252432153283e-06, 'epoch': 0.95} +{'loss': 115.5173, 'grad_norm': 1447.1241834889934, 'learning_rate': 4.846516031289112e-06, 'epoch': 0.95} +{'loss': 113.0391, 'grad_norm': 1670.3654190181894, 'learning_rate': 4.842780609806423e-06, 'epoch': 0.95} +{'loss': 113.4133, 'grad_norm': 1499.4139476571704, 'learning_rate': 4.839046168415099e-06, 'epoch': 0.95} +{'loss': 112.7578, 'grad_norm': 1477.279305794388, 'learning_rate': 4.8353127078248455e-06, 'epoch': 0.95} +{'loss': 115.2532, 'grad_norm': 1552.777646985015, 'learning_rate': 4.831580228745173e-06, 'epoch': 0.95} +{'loss': 114.3653, 'grad_norm': 1545.6962651871263, 'learning_rate': 4.827848731885416e-06, 'epoch': 0.95} +{'loss': 116.8868, 'grad_norm': 1625.8449583805648, 'learning_rate': 4.824118217954722e-06, 'epoch': 0.95} +{'loss': 116.9738, 'grad_norm': 1662.622271771699, 'learning_rate': 4.820388687662043e-06, 'epoch': 0.95} +{'loss': 115.9051, 'grad_norm': 1784.615689410141, 'learning_rate': 4.81666014171615e-06, 'epoch': 0.95} +{'loss': 112.9873, 'grad_norm': 1579.9641215132779, 'learning_rate': 4.812932580825621e-06, 'epoch': 0.95} +{'loss': 114.0112, 'grad_norm': 1349.403626173006, 'learning_rate': 4.809206005698856e-06, 'epoch': 0.95} +{'loss': 116.4572, 'grad_norm': 1555.6962481348626, 'learning_rate': 4.805480417044065e-06, 'epoch': 0.95} +{'loss': 115.3302, 'grad_norm': 1559.2492256223588, 'learning_rate': 4.801755815569265e-06, 'epoch': 0.95} +{'loss': 114.1702, 'grad_norm': 1338.045756921532, 'learning_rate': 4.798032201982291e-06, 'epoch': 0.95} +{'loss': 113.4219, 'grad_norm': 1816.7588230916592, 'learning_rate': 4.794309576990782e-06, 'epoch': 
0.95} +{'loss': 115.5313, 'grad_norm': 1601.6794046259736, 'learning_rate': 4.790587941302205e-06, 'epoch': 0.95} +{'loss': 114.9728, 'grad_norm': 1446.2497266555026, 'learning_rate': 4.786867295623821e-06, 'epoch': 0.95} +{'loss': 117.5639, 'grad_norm': 1301.5465197581489, 'learning_rate': 4.783147640662716e-06, 'epoch': 0.95} +{'loss': 119.4091, 'grad_norm': 1536.6733551842888, 'learning_rate': 4.779428977125784e-06, 'epoch': 0.95} +{'loss': 115.6098, 'grad_norm': 1621.4927576675566, 'learning_rate': 4.775711305719723e-06, 'epoch': 0.95} +{'loss': 118.6901, 'grad_norm': 1465.8780005081585, 'learning_rate': 4.771994627151055e-06, 'epoch': 0.95} +{'loss': 113.7891, 'grad_norm': 1677.7450048236774, 'learning_rate': 4.768278942126102e-06, 'epoch': 0.95} +{'loss': 117.3851, 'grad_norm': 1396.489487219724, 'learning_rate': 4.7645642513510106e-06, 'epoch': 0.95} +{'loss': 112.9227, 'grad_norm': 1353.4447907712567, 'learning_rate': 4.760850555531724e-06, 'epoch': 0.95} +{'loss': 116.602, 'grad_norm': 1453.4764528586536, 'learning_rate': 4.757137855374001e-06, 'epoch': 0.95} +{'loss': 118.041, 'grad_norm': 1678.070345958945, 'learning_rate': 4.753426151583419e-06, 'epoch': 0.95} +{'loss': 113.3022, 'grad_norm': 1281.2837347882396, 'learning_rate': 4.749715444865353e-06, 'epoch': 0.95} +{'loss': 114.6754, 'grad_norm': 1498.9838521840425, 'learning_rate': 4.7460057359250024e-06, 'epoch': 0.95} +{'loss': 114.6542, 'grad_norm': 1635.5381488389057, 'learning_rate': 4.7422970254673665e-06, 'epoch': 0.95} +{'loss': 117.5044, 'grad_norm': 1640.7338284304506, 'learning_rate': 4.7385893141972535e-06, 'epoch': 0.95} +{'loss': 113.0769, 'grad_norm': 1406.7797325284678, 'learning_rate': 4.734882602819296e-06, 'epoch': 0.95} +{'loss': 115.2845, 'grad_norm': 1847.775487354379, 'learning_rate': 4.731176892037918e-06, 'epoch': 0.95} +{'loss': 114.412, 'grad_norm': 1376.90029709217, 'learning_rate': 4.72747218255737e-06, 'epoch': 0.95} +{'loss': 115.3645, 'grad_norm': 1494.7234600054182, 'learning_rate': 4.723768475081701e-06, 'epoch': 0.95} +{'loss': 118.3385, 'grad_norm': 1918.9314734792274, 'learning_rate': 4.720065770314771e-06, 'epoch': 0.95} +{'loss': 114.3544, 'grad_norm': 1584.1126352803046, 'learning_rate': 4.716364068960257e-06, 'epoch': 0.95} +{'loss': 113.051, 'grad_norm': 1431.1539774377252, 'learning_rate': 4.712663371721633e-06, 'epoch': 0.95} +{'loss': 111.3072, 'grad_norm': 1386.4422071229617, 'learning_rate': 4.708963679302196e-06, 'epoch': 0.95} +{'loss': 115.6449, 'grad_norm': 1530.5204046424067, 'learning_rate': 4.705264992405043e-06, 'epoch': 0.95} +{'loss': 112.9609, 'grad_norm': 1386.8994095733833, 'learning_rate': 4.701567311733079e-06, 'epoch': 0.95} +{'loss': 115.7431, 'grad_norm': 1747.8150749790764, 'learning_rate': 4.697870637989025e-06, 'epoch': 0.95} +{'loss': 116.9922, 'grad_norm': 1655.5380088209533, 'learning_rate': 4.694174971875401e-06, 'epoch': 0.95} +{'loss': 116.5599, 'grad_norm': 1469.0236692740434, 'learning_rate': 4.690480314094547e-06, 'epoch': 0.95} +{'loss': 116.0679, 'grad_norm': 1374.0334205301197, 'learning_rate': 4.686786665348606e-06, 'epoch': 0.95} +{'loss': 114.1623, 'grad_norm': 1613.931666635479, 'learning_rate': 4.68309402633952e-06, 'epoch': 0.95} +{'loss': 115.1868, 'grad_norm': 1274.871858312715, 'learning_rate': 4.679402397769057e-06, 'epoch': 0.95} +{'loss': 113.5682, 'grad_norm': 1765.4560927218797, 'learning_rate': 4.675711780338777e-06, 'epoch': 0.95} +{'loss': 115.3506, 'grad_norm': 1404.786610125801, 'learning_rate': 4.672022174750059e-06, 'epoch': 
0.95} +{'loss': 117.7865, 'grad_norm': 1354.5831237343443, 'learning_rate': 4.668333581704086e-06, 'epoch': 0.96} +{'loss': 115.183, 'grad_norm': 1440.8166387764236, 'learning_rate': 4.664646001901842e-06, 'epoch': 0.96} +{'loss': 117.0062, 'grad_norm': 1607.4570365984923, 'learning_rate': 4.660959436044129e-06, 'epoch': 0.96} +{'loss': 117.6301, 'grad_norm': 1633.378833411496, 'learning_rate': 4.6572738848315526e-06, 'epoch': 0.96} +{'loss': 116.221, 'grad_norm': 1390.8836952934998, 'learning_rate': 4.653589348964517e-06, 'epoch': 0.96} +{'loss': 119.2332, 'grad_norm': 1398.661317886182, 'learning_rate': 4.649905829143249e-06, 'epoch': 0.96} +{'loss': 116.2382, 'grad_norm': 1593.2707566263914, 'learning_rate': 4.64622332606777e-06, 'epoch': 0.96} +{'loss': 119.5661, 'grad_norm': 1490.631537807182, 'learning_rate': 4.642541840437916e-06, 'epoch': 0.96} +{'loss': 116.2185, 'grad_norm': 1585.075600312214, 'learning_rate': 4.638861372953325e-06, 'epoch': 0.96} +{'loss': 113.1263, 'grad_norm': 1438.297061904441, 'learning_rate': 4.635181924313438e-06, 'epoch': 0.96} +{'loss': 118.0417, 'grad_norm': 1551.3567874727075, 'learning_rate': 4.631503495217513e-06, 'epoch': 0.96} +{'loss': 115.7001, 'grad_norm': 1418.4777625007225, 'learning_rate': 4.627826086364603e-06, 'epoch': 0.96} +{'loss': 116.6675, 'grad_norm': 1305.8694486092438, 'learning_rate': 4.6241496984535785e-06, 'epoch': 0.96} +{'loss': 113.9852, 'grad_norm': 1341.9333879854507, 'learning_rate': 4.620474332183107e-06, 'epoch': 0.96} +{'loss': 112.8473, 'grad_norm': 1332.9661994163434, 'learning_rate': 4.616799988251659e-06, 'epoch': 0.96} +{'loss': 114.4395, 'grad_norm': 1440.0586178475114, 'learning_rate': 4.613126667357526e-06, 'epoch': 0.96} +{'loss': 117.7685, 'grad_norm': 1470.65287270099, 'learning_rate': 4.609454370198787e-06, 'epoch': 0.96} +{'loss': 117.0254, 'grad_norm': 1751.2392955188493, 'learning_rate': 4.605783097473342e-06, 'epoch': 0.96} +{'loss': 113.8362, 'grad_norm': 1352.7105397674172, 'learning_rate': 4.6021128498788855e-06, 'epoch': 0.96} +{'loss': 116.6477, 'grad_norm': 1390.3206960629882, 'learning_rate': 4.59844362811292e-06, 'epoch': 0.96} +{'loss': 114.0189, 'grad_norm': 1503.1190486270016, 'learning_rate': 4.594775432872757e-06, 'epoch': 0.96} +{'loss': 115.5226, 'grad_norm': 1558.2880741944848, 'learning_rate': 4.591108264855509e-06, 'epoch': 0.96} +{'loss': 113.8581, 'grad_norm': 1580.0994110126803, 'learning_rate': 4.587442124758088e-06, 'epoch': 0.96} +{'loss': 115.1841, 'grad_norm': 1422.8359575652341, 'learning_rate': 4.583777013277227e-06, 'epoch': 0.96} +{'loss': 114.5227, 'grad_norm': 1429.4810033556132, 'learning_rate': 4.580112931109444e-06, 'epoch': 0.96} +{'loss': 118.3298, 'grad_norm': 1754.0885436952676, 'learning_rate': 4.576449878951079e-06, 'epoch': 0.96} +{'loss': 115.8669, 'grad_norm': 1530.0311966242632, 'learning_rate': 4.5727878574982646e-06, 'epoch': 0.96} +{'loss': 114.3816, 'grad_norm': 1436.358013889361, 'learning_rate': 4.5691268674469346e-06, 'epoch': 0.96} +{'loss': 114.7035, 'grad_norm': 1464.5929030108362, 'learning_rate': 4.565466909492844e-06, 'epoch': 0.96} +{'loss': 111.921, 'grad_norm': 1439.6306087901742, 'learning_rate': 4.561807984331532e-06, 'epoch': 0.96} +{'loss': 112.8367, 'grad_norm': 1631.861451350336, 'learning_rate': 4.558150092658357e-06, 'epoch': 0.96} +{'loss': 114.9175, 'grad_norm': 1746.8599105332862, 'learning_rate': 4.55449323516847e-06, 'epoch': 0.96} +{'loss': 113.4995, 'grad_norm': 1502.0761671260916, 'learning_rate': 4.550837412556828e-06, 'epoch': 
0.96} +{'loss': 116.9044, 'grad_norm': 1776.1271319107232, 'learning_rate': 4.547182625518198e-06, 'epoch': 0.96} +{'loss': 114.1977, 'grad_norm': 1585.000737094526, 'learning_rate': 4.543528874747144e-06, 'epoch': 0.96} +[2025-05-29 16:19:39,287] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step50000 is about to be saved! +[2025-05-29 16:19:39,318] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-50000/global_step50000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-29 16:19:39,318] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-50000/global_step50000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-29 16:19:39,426] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-50000/global_step50000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-29 16:19:39,533] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-50000/global_step50000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-29 16:25:16,811] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-50000/global_step50000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2025-05-29 16:25:16,902] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-50000/global_step50000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-29 16:25:20,993] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step50000 is ready now! +[2025-05-29 16:25:21,046] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step50000 is about to be saved! +[2025-05-29 16:25:21,072] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-50000/global_step50000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-29 16:25:21,072] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-50000/global_step50000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-29 16:25:21,198] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-50000/global_step50000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-29 16:25:27,862] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-50000/global_step50000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-29 16:31:32,007] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-50000/global_step50000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2025-05-29 16:31:32,082] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-50000/global_step50000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-29 16:31:32,094] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step50000 is ready now! 
+{'loss': 115.2465, 'grad_norm': 1517.7046582077735, 'learning_rate': 4.539876160938028e-06, 'epoch': 0.96} +{'loss': 113.0816, 'grad_norm': 1480.945495769529, 'learning_rate': 4.536224484785031e-06, 'epoch': 0.96} +{'loss': 115.8995, 'grad_norm': 1773.7893419540453, 'learning_rate': 4.532573846982119e-06, 'epoch': 0.96} +{'loss': 112.558, 'grad_norm': 1551.3053821692884, 'learning_rate': 4.528924248223075e-06, 'epoch': 0.96} +{'loss': 114.591, 'grad_norm': 1397.957551290573, 'learning_rate': 4.525275689201476e-06, 'epoch': 0.96} +{'loss': 116.4884, 'grad_norm': 1451.750855887909, 'learning_rate': 4.5216281706106976e-06, 'epoch': 0.96} +{'loss': 108.557, 'grad_norm': 1334.5935236063603, 'learning_rate': 4.517981693143935e-06, 'epoch': 0.96} +{'loss': 113.1759, 'grad_norm': 1480.134120951325, 'learning_rate': 4.514336257494163e-06, 'epoch': 0.96} +{'loss': 113.5265, 'grad_norm': 1461.3311172611277, 'learning_rate': 4.510691864354178e-06, 'epoch': 0.96} +{'loss': 119.193, 'grad_norm': 1552.7264264282312, 'learning_rate': 4.507048514416569e-06, 'epoch': 0.96} +{'loss': 118.3999, 'grad_norm': 1644.1560594247378, 'learning_rate': 4.50340620837372e-06, 'epoch': 0.96} +{'loss': 114.5466, 'grad_norm': 1526.1987310801123, 'learning_rate': 4.499764946917834e-06, 'epoch': 0.96} +{'loss': 114.3988, 'grad_norm': 1659.8365885041853, 'learning_rate': 4.496124730740899e-06, 'epoch': 0.96} +{'loss': 112.5388, 'grad_norm': 1402.9547640531111, 'learning_rate': 4.492485560534715e-06, 'epoch': 0.96} +{'loss': 111.8898, 'grad_norm': 1507.8295849302904, 'learning_rate': 4.488847436990879e-06, 'epoch': 0.96} +{'loss': 113.5598, 'grad_norm': 1461.5059058539186, 'learning_rate': 4.485210360800786e-06, 'epoch': 0.96} +{'loss': 112.9845, 'grad_norm': 1605.7740021382697, 'learning_rate': 4.481574332655641e-06, 'epoch': 0.96} +{'loss': 115.1071, 'grad_norm': 1540.746126761207, 'learning_rate': 4.4779393532464365e-06, 'epoch': 0.97} +{'loss': 115.5531, 'grad_norm': 1528.201384169696, 'learning_rate': 4.474305423263984e-06, 'epoch': 0.97} +{'loss': 116.2988, 'grad_norm': 1524.0226566947915, 'learning_rate': 4.470672543398877e-06, 'epoch': 0.97} +{'loss': 113.8641, 'grad_norm': 1714.0861586777544, 'learning_rate': 4.467040714341516e-06, 'epoch': 0.97} +{'loss': 115.473, 'grad_norm': 1498.777584365415, 'learning_rate': 4.463409936782111e-06, 'epoch': 0.97} +{'loss': 111.3988, 'grad_norm': 1457.4191063342103, 'learning_rate': 4.459780211410657e-06, 'epoch': 0.97} +{'loss': 117.1959, 'grad_norm': 1715.944894519595, 'learning_rate': 4.456151538916963e-06, 'epoch': 0.97} +{'loss': 118.1416, 'grad_norm': 1490.1011032085062, 'learning_rate': 4.452523919990626e-06, 'epoch': 0.97} +{'loss': 111.2452, 'grad_norm': 1458.2573548058365, 'learning_rate': 4.448897355321049e-06, 'epoch': 0.97} +{'loss': 113.0824, 'grad_norm': 1455.0993787843863, 'learning_rate': 4.445271845597438e-06, 'epoch': 0.97} +{'loss': 113.6094, 'grad_norm': 1427.2380383075995, 'learning_rate': 4.441647391508787e-06, 'epoch': 0.97} +{'loss': 115.1949, 'grad_norm': 1387.9221914366747, 'learning_rate': 4.438023993743904e-06, 'epoch': 0.97} +{'loss': 113.3402, 'grad_norm': 1581.7915955911476, 'learning_rate': 4.434401652991386e-06, 'epoch': 0.97} +{'loss': 117.2713, 'grad_norm': 1362.3124357958457, 'learning_rate': 4.430780369939627e-06, 'epoch': 0.97} +{'loss': 114.7593, 'grad_norm': 1594.7859792700108, 'learning_rate': 4.427160145276834e-06, 'epoch': 0.97} +{'loss': 113.173, 'grad_norm': 1583.8509500806886, 'learning_rate': 4.4235409796909965e-06, 'epoch': 0.97} 
+{'loss': 115.7675, 'grad_norm': 1673.1212776091293, 'learning_rate': 4.419922873869917e-06, 'epoch': 0.97} +{'loss': 116.0898, 'grad_norm': 1421.9221834535597, 'learning_rate': 4.416305828501186e-06, 'epoch': 0.97} +{'loss': 114.2114, 'grad_norm': 1529.232164137315, 'learning_rate': 4.412689844272197e-06, 'epoch': 0.97} +{'loss': 112.3604, 'grad_norm': 1359.9475989072816, 'learning_rate': 4.409074921870135e-06, 'epoch': 0.97} +{'loss': 114.6239, 'grad_norm': 1581.8748290583844, 'learning_rate': 4.405461061981995e-06, 'epoch': 0.97} +{'loss': 113.9555, 'grad_norm': 1572.7683535683466, 'learning_rate': 4.401848265294567e-06, 'epoch': 0.97} +{'loss': 114.4621, 'grad_norm': 1531.8184143105625, 'learning_rate': 4.398236532494434e-06, 'epoch': 0.97} +{'loss': 117.2084, 'grad_norm': 1522.3795282551587, 'learning_rate': 4.3946258642679776e-06, 'epoch': 0.97} +{'loss': 111.8737, 'grad_norm': 1497.071820431437, 'learning_rate': 4.391016261301375e-06, 'epoch': 0.97} +{'loss': 114.0745, 'grad_norm': 1715.6209328418072, 'learning_rate': 4.387407724280609e-06, 'epoch': 0.97} +{'loss': 113.1339, 'grad_norm': 1355.807141201652, 'learning_rate': 4.383800253891459e-06, 'epoch': 0.97} +{'loss': 115.9851, 'grad_norm': 1383.0775262359464, 'learning_rate': 4.380193850819494e-06, 'epoch': 0.97} +{'loss': 115.3405, 'grad_norm': 1630.0460198356866, 'learning_rate': 4.376588515750085e-06, 'epoch': 0.97} +{'loss': 113.4051, 'grad_norm': 1384.2460713753126, 'learning_rate': 4.372984249368393e-06, 'epoch': 0.97} +{'loss': 115.0936, 'grad_norm': 1629.4477124677355, 'learning_rate': 4.369381052359388e-06, 'epoch': 0.97} +{'loss': 113.0888, 'grad_norm': 1655.2719033819233, 'learning_rate': 4.365778925407836e-06, 'epoch': 0.97} +{'loss': 113.8379, 'grad_norm': 1551.9559443703354, 'learning_rate': 4.362177869198287e-06, 'epoch': 0.97} +{'loss': 113.3485, 'grad_norm': 1455.2304287645673, 'learning_rate': 4.358577884415097e-06, 'epoch': 0.97} +{'loss': 111.3181, 'grad_norm': 1463.3648871943528, 'learning_rate': 4.354978971742414e-06, 'epoch': 0.97} +{'loss': 109.7847, 'grad_norm': 1494.019371511242, 'learning_rate': 4.351381131864188e-06, 'epoch': 0.97} +{'loss': 115.2636, 'grad_norm': 1432.2046484260718, 'learning_rate': 4.347784365464163e-06, 'epoch': 0.97} +{'loss': 110.1613, 'grad_norm': 1517.6487190869473, 'learning_rate': 4.344188673225877e-06, 'epoch': 0.97} +{'loss': 112.6293, 'grad_norm': 1933.0640656128833, 'learning_rate': 4.340594055832663e-06, 'epoch': 0.97} +{'loss': 113.8864, 'grad_norm': 1625.2935050346202, 'learning_rate': 4.337000513967649e-06, 'epoch': 0.97} +{'loss': 113.5097, 'grad_norm': 1471.4263540183015, 'learning_rate': 4.333408048313765e-06, 'epoch': 0.97} +{'loss': 113.4709, 'grad_norm': 1605.5785044606514, 'learning_rate': 4.329816659553734e-06, 'epoch': 0.97} +{'loss': 112.2017, 'grad_norm': 1540.7092304478335, 'learning_rate': 4.326226348370071e-06, 'epoch': 0.97} +{'loss': 115.764, 'grad_norm': 1449.6499576305387, 'learning_rate': 4.322637115445088e-06, 'epoch': 0.97} +{'loss': 113.8623, 'grad_norm': 1467.0009929993005, 'learning_rate': 4.319048961460886e-06, 'epoch': 0.97} +{'loss': 110.4894, 'grad_norm': 1279.8116553700215, 'learning_rate': 4.3154618870993755e-06, 'epoch': 0.97} +{'loss': 114.6898, 'grad_norm': 1488.842880458186, 'learning_rate': 4.311875893042253e-06, 'epoch': 0.97} +{'loss': 111.8933, 'grad_norm': 1470.7464026000246, 'learning_rate': 4.308290979971008e-06, 'epoch': 0.97} +{'loss': 113.6029, 'grad_norm': 1435.7867771929427, 'learning_rate': 4.3047071485669265e-06, 'epoch': 
0.97} +{'loss': 114.6961, 'grad_norm': 1384.9684529435071, 'learning_rate': 4.301124399511086e-06, 'epoch': 0.97} +{'loss': 113.9656, 'grad_norm': 1289.1324348907826, 'learning_rate': 4.297542733484364e-06, 'epoch': 0.97} +{'loss': 115.5499, 'grad_norm': 1313.3219493129347, 'learning_rate': 4.293962151167433e-06, 'epoch': 0.97} +{'loss': 113.5984, 'grad_norm': 1597.105390991814, 'learning_rate': 4.290382653240755e-06, 'epoch': 0.98} +{'loss': 116.6376, 'grad_norm': 1539.3171847639865, 'learning_rate': 4.286804240384585e-06, 'epoch': 0.98} +{'loss': 111.5669, 'grad_norm': 1395.518647844515, 'learning_rate': 4.283226913278974e-06, 'epoch': 0.98} +{'loss': 116.6945, 'grad_norm': 2003.4450831511288, 'learning_rate': 4.279650672603762e-06, 'epoch': 0.98} +{'loss': 116.3036, 'grad_norm': 1418.4130988456845, 'learning_rate': 4.276075519038594e-06, 'epoch': 0.98} +{'loss': 112.2738, 'grad_norm': 1372.7593204811158, 'learning_rate': 4.2725014532629015e-06, 'epoch': 0.98} +{'loss': 113.6254, 'grad_norm': 1397.2221786540224, 'learning_rate': 4.268928475955908e-06, 'epoch': 0.98} +{'loss': 113.3914, 'grad_norm': 1355.5909573589743, 'learning_rate': 4.265356587796631e-06, 'epoch': 0.98} +{'loss': 116.2427, 'grad_norm': 1608.837322324851, 'learning_rate': 4.2617857894638755e-06, 'epoch': 0.98} +{'loss': 113.681, 'grad_norm': 1356.3279118942162, 'learning_rate': 4.2582160816362526e-06, 'epoch': 0.98} +{'loss': 112.3928, 'grad_norm': 1686.043702420675, 'learning_rate': 4.2546474649921595e-06, 'epoch': 0.98} +{'loss': 114.8057, 'grad_norm': 1597.0067583749437, 'learning_rate': 4.251079940209783e-06, 'epoch': 0.98} +{'loss': 110.3436, 'grad_norm': 1435.1758084832509, 'learning_rate': 4.2475135079671045e-06, 'epoch': 0.98} +{'loss': 113.2323, 'grad_norm': 1310.5921564610517, 'learning_rate': 4.243948168941896e-06, 'epoch': 0.98} +{'loss': 111.3226, 'grad_norm': 1841.437206755618, 'learning_rate': 4.240383923811724e-06, 'epoch': 0.98} +{'loss': 113.6019, 'grad_norm': 1520.8671499826137, 'learning_rate': 4.236820773253954e-06, 'epoch': 0.98} +{'loss': 115.432, 'grad_norm': 1629.7570627838618, 'learning_rate': 4.23325871794573e-06, 'epoch': 0.98} +{'loss': 114.6432, 'grad_norm': 1439.6830753795864, 'learning_rate': 4.229697758563995e-06, 'epoch': 0.98} +{'loss': 112.8957, 'grad_norm': 1636.9247382202618, 'learning_rate': 4.226137895785482e-06, 'epoch': 0.98} +{'loss': 114.9948, 'grad_norm': 1387.1003937717494, 'learning_rate': 4.222579130286716e-06, 'epoch': 0.98} +{'loss': 113.5275, 'grad_norm': 1349.5295786312245, 'learning_rate': 4.219021462744021e-06, 'epoch': 0.98} +{'loss': 114.4135, 'grad_norm': 1247.4059644121717, 'learning_rate': 4.215464893833498e-06, 'epoch': 0.98} +{'loss': 113.3427, 'grad_norm': 1372.6189013142919, 'learning_rate': 4.211909424231052e-06, 'epoch': 0.98} +{'loss': 111.9288, 'grad_norm': 1460.0072714840123, 'learning_rate': 4.208355054612365e-06, 'epoch': 0.98} +{'loss': 113.6063, 'grad_norm': 1364.8666667965426, 'learning_rate': 4.204801785652925e-06, 'epoch': 0.98} +{'loss': 111.3437, 'grad_norm': 1384.6041179885526, 'learning_rate': 4.201249618028008e-06, 'epoch': 0.98} +{'loss': 111.819, 'grad_norm': 1369.651092775729, 'learning_rate': 4.197698552412672e-06, 'epoch': 0.98} +{'loss': 110.6443, 'grad_norm': 1584.1084397705786, 'learning_rate': 4.194148589481772e-06, 'epoch': 0.98} +{'loss': 112.2845, 'grad_norm': 1349.142143002644, 'learning_rate': 4.190599729909949e-06, 'epoch': 0.98} +{'loss': 141.9409, 'grad_norm': 1540.8068567832413, 'learning_rate': 4.18705197437164e-06, 
'epoch': 0.98} +{'loss': 113.469, 'grad_norm': 1511.4096184774353, 'learning_rate': 4.1835053235410726e-06, 'epoch': 0.98} +{'loss': 112.8677, 'grad_norm': 1868.1998723846311, 'learning_rate': 4.17995977809226e-06, 'epoch': 0.98} +{'loss': 111.4304, 'grad_norm': 1627.282505586079, 'learning_rate': 4.1764153386990045e-06, 'epoch': 0.98} +{'loss': 112.1024, 'grad_norm': 1520.1133126801603, 'learning_rate': 4.172872006034899e-06, 'epoch': 0.98} +{'loss': 109.0457, 'grad_norm': 1581.7610718733285, 'learning_rate': 4.169329780773331e-06, 'epoch': 0.98} +{'loss': 112.1104, 'grad_norm': 1477.304378469318, 'learning_rate': 4.1657886635874775e-06, 'epoch': 0.98} +{'loss': 113.0462, 'grad_norm': 1445.330699950917, 'learning_rate': 4.162248655150297e-06, 'epoch': 0.98} +{'loss': 119.7843, 'grad_norm': 1554.9070431633677, 'learning_rate': 4.1587097561345435e-06, 'epoch': 0.98} +{'loss': 112.7558, 'grad_norm': 1465.5510938454981, 'learning_rate': 4.155171967212754e-06, 'epoch': 0.98} +{'loss': 116.2319, 'grad_norm': 1557.3614487088187, 'learning_rate': 4.151635289057265e-06, 'epoch': 0.98} +{'loss': 112.8955, 'grad_norm': 1599.3291568033533, 'learning_rate': 4.148099722340192e-06, 'epoch': 0.98} +{'loss': 113.206, 'grad_norm': 1562.599629061088, 'learning_rate': 4.144565267733448e-06, 'epoch': 0.98} +{'loss': 115.0878, 'grad_norm': 1487.159281307663, 'learning_rate': 4.141031925908727e-06, 'epoch': 0.98} +{'loss': 115.783, 'grad_norm': 1443.671132742109, 'learning_rate': 4.137499697537512e-06, 'epoch': 0.98} +{'loss': 108.9107, 'grad_norm': 1357.3224166634632, 'learning_rate': 4.133968583291082e-06, 'epoch': 0.98} +{'loss': 110.3964, 'grad_norm': 1588.8049584082776, 'learning_rate': 4.130438583840492e-06, 'epoch': 0.98} +{'loss': 110.7462, 'grad_norm': 1520.692400950371, 'learning_rate': 4.126909699856603e-06, 'epoch': 0.98} +{'loss': 116.6037, 'grad_norm': 1462.1718748497815, 'learning_rate': 4.123381932010044e-06, 'epoch': 0.98} +{'loss': 114.9287, 'grad_norm': 1435.5066379823402, 'learning_rate': 4.119855280971243e-06, 'epoch': 0.98} +{'loss': 113.4878, 'grad_norm': 1732.2895039725945, 'learning_rate': 4.116329747410418e-06, 'epoch': 0.98} +{'loss': 112.671, 'grad_norm': 1455.5743052095959, 'learning_rate': 4.112805331997563e-06, 'epoch': 0.98} +{'loss': 111.9667, 'grad_norm': 1504.623878962298, 'learning_rate': 4.109282035402476e-06, 'epoch': 0.98} +{'loss': 113.0525, 'grad_norm': 1424.9896096331122, 'learning_rate': 4.10575985829473e-06, 'epoch': 0.99} +{'loss': 112.7144, 'grad_norm': 1321.6067206193984, 'learning_rate': 4.102238801343684e-06, 'epoch': 0.99} +{'loss': 109.8775, 'grad_norm': 1521.1394006389653, 'learning_rate': 4.098718865218496e-06, 'epoch': 0.99} +{'loss': 113.6617, 'grad_norm': 1371.3264553836962, 'learning_rate': 4.095200050588096e-06, 'epoch': 0.99} +{'loss': 112.5875, 'grad_norm': 1778.797927804825, 'learning_rate': 4.091682358121219e-06, 'epoch': 0.99} +{'loss': 113.042, 'grad_norm': 1585.1517065386292, 'learning_rate': 4.088165788486368e-06, 'epoch': 0.99} +{'loss': 109.4879, 'grad_norm': 1634.4868802252556, 'learning_rate': 4.084650342351841e-06, 'epoch': 0.99} +{'loss': 115.7097, 'grad_norm': 1845.061067921527, 'learning_rate': 4.08113602038573e-06, 'epoch': 0.99} +{'loss': 116.0515, 'grad_norm': 1582.1816928363367, 'learning_rate': 4.0776228232558955e-06, 'epoch': 0.99} +{'loss': 113.7896, 'grad_norm': 1483.389686664255, 'learning_rate': 4.074110751630005e-06, 'epoch': 0.99} +{'loss': 115.0078, 'grad_norm': 1532.1522630767897, 'learning_rate': 4.070599806175496e-06, 
'epoch': 0.99} +{'loss': 112.0119, 'grad_norm': 1563.3573222744285, 'learning_rate': 4.067089987559595e-06, 'epoch': 0.99} +{'loss': 113.9477, 'grad_norm': 1492.5832889596143, 'learning_rate': 4.063581296449324e-06, 'epoch': 0.99} +{'loss': 114.3865, 'grad_norm': 1600.120830571315, 'learning_rate': 4.0600737335114756e-06, 'epoch': 0.99} +{'loss': 112.4798, 'grad_norm': 1344.2593888192198, 'learning_rate': 4.056567299412647e-06, 'epoch': 0.99} +{'loss': 111.1371, 'grad_norm': 1385.930748232, 'learning_rate': 4.053061994819202e-06, 'epoch': 0.99} +{'loss': 114.0146, 'grad_norm': 1407.0418299855066, 'learning_rate': 4.049557820397297e-06, 'epoch': 0.99} +{'loss': 113.5493, 'grad_norm': 1574.8421742065025, 'learning_rate': 4.04605477681288e-06, 'epoch': 0.99} +{'loss': 114.155, 'grad_norm': 1351.08555778987, 'learning_rate': 4.042552864731672e-06, 'epoch': 0.99} +{'loss': 113.1824, 'grad_norm': 1482.3853831780093, 'learning_rate': 4.039052084819194e-06, 'epoch': 0.99} +{'loss': 129.4149, 'grad_norm': 1611.2960461573566, 'learning_rate': 4.035552437740737e-06, 'epoch': 0.99} +{'loss': 111.5354, 'grad_norm': 1526.97704855883, 'learning_rate': 4.032053924161385e-06, 'epoch': 0.99} +{'loss': 113.1389, 'grad_norm': 7341.844693472737, 'learning_rate': 4.028556544746e-06, 'epoch': 0.99} +{'loss': 113.4582, 'grad_norm': 1792.9492599764126, 'learning_rate': 4.0250603001592416e-06, 'epoch': 0.99} +{'loss': 113.6575, 'grad_norm': 1733.5278470748067, 'learning_rate': 4.021565191065536e-06, 'epoch': 0.99} +{'loss': 111.8638, 'grad_norm': 1329.660324883457, 'learning_rate': 4.018071218129112e-06, 'epoch': 0.99} +{'loss': 116.3446, 'grad_norm': 1658.084518543948, 'learning_rate': 4.014578382013967e-06, 'epoch': 0.99} +{'loss': 111.8539, 'grad_norm': 1339.7562149927853, 'learning_rate': 4.011086683383888e-06, 'epoch': 0.99} +{'loss': 116.5356, 'grad_norm': 1364.373992601231, 'learning_rate': 4.007596122902453e-06, 'epoch': 0.99} +{'loss': 113.596, 'grad_norm': 1458.4302013874471, 'learning_rate': 4.004106701233009e-06, 'epoch': 0.99} +{'loss': 110.8878, 'grad_norm': 1572.3464183467338, 'learning_rate': 4.000618419038702e-06, 'epoch': 0.99} +{'loss': 111.3539, 'grad_norm': 1935.8159730422428, 'learning_rate': 3.997131276982452e-06, 'epoch': 0.99} +{'loss': 111.8716, 'grad_norm': 1903.13938641954, 'learning_rate': 3.9936452757269595e-06, 'epoch': 0.99} +{'loss': 119.2853, 'grad_norm': 10775.883581407465, 'learning_rate': 3.990160415934721e-06, 'epoch': 0.99} +{'loss': 113.8712, 'grad_norm': 1704.2577662534961, 'learning_rate': 3.986676698268001e-06, 'epoch': 0.99} +{'loss': 114.0094, 'grad_norm': 1315.714843807229, 'learning_rate': 3.98319412338886e-06, 'epoch': 0.99} +{'loss': 117.375, 'grad_norm': 2605.4648575029696, 'learning_rate': 3.979712691959136e-06, 'epoch': 0.99} +{'loss': 111.1759, 'grad_norm': 1558.1358642285802, 'learning_rate': 3.976232404640441e-06, 'epoch': 0.99} +{'loss': 112.499, 'grad_norm': 1510.7479097094124, 'learning_rate': 3.972753262094188e-06, 'epoch': 0.99} +{'loss': 113.3904, 'grad_norm': 1403.8659971031793, 'learning_rate': 3.969275264981554e-06, 'epoch': 0.99} +{'loss': 111.0138, 'grad_norm': 1443.981742065408, 'learning_rate': 3.965798413963514e-06, 'epoch': 0.99} +{'loss': 110.3525, 'grad_norm': 1379.3298669496082, 'learning_rate': 3.962322709700814e-06, 'epoch': 0.99} +{'loss': 113.9848, 'grad_norm': 1428.7702392661188, 'learning_rate': 3.958848152853983e-06, 'epoch': 0.99} +{'loss': 109.5108, 'grad_norm': 1394.6945413397525, 'learning_rate': 3.955374744083342e-06, 'epoch': 
0.99} +{'loss': 113.2774, 'grad_norm': 1260.0548624473522, 'learning_rate': 3.951902484048978e-06, 'epoch': 0.99} +{'loss': 115.5628, 'grad_norm': 1419.4759997720646, 'learning_rate': 3.9484313734107775e-06, 'epoch': 0.99} +{'loss': 110.4783, 'grad_norm': 1509.4817674301232, 'learning_rate': 3.944961412828393e-06, 'epoch': 0.99} +{'loss': 111.7885, 'grad_norm': 1585.065645009544, 'learning_rate': 3.941492602961264e-06, 'epoch': 0.99} +{'loss': 112.7781, 'grad_norm': 1537.689890497647, 'learning_rate': 3.938024944468618e-06, 'epoch': 0.99} +{'loss': 112.1025, 'grad_norm': 1490.3899054108458, 'learning_rate': 3.934558438009452e-06, 'epoch': 0.99} +{'loss': 110.1594, 'grad_norm': 1415.568341487176, 'learning_rate': 3.931093084242554e-06, 'epoch': 0.99} +{'loss': 112.7513, 'grad_norm': 1374.77714603213, 'learning_rate': 3.927628883826488e-06, 'epoch': 0.99} +{'loss': 116.0627, 'grad_norm': 1806.66839101904, 'learning_rate': 3.924165837419596e-06, 'epoch': 1.0} +{'loss': 112.3063, 'grad_norm': 1357.1116312132233, 'learning_rate': 3.92070394568001e-06, 'epoch': 1.0} +{'loss': 116.2372, 'grad_norm': 1529.4185315049115, 'learning_rate': 3.9172432092656345e-06, 'epoch': 1.0} +{'loss': 110.1143, 'grad_norm': 1346.9378291419719, 'learning_rate': 3.913783628834152e-06, 'epoch': 1.0} +{'loss': 112.7523, 'grad_norm': 1298.8748605595954, 'learning_rate': 3.91032520504304e-06, 'epoch': 1.0} +{'loss': 115.142, 'grad_norm': 1682.0884940247297, 'learning_rate': 3.906867938549536e-06, 'epoch': 1.0} +{'loss': 113.7981, 'grad_norm': 1357.3009085152614, 'learning_rate': 3.903411830010676e-06, 'epoch': 1.0} +{'loss': 112.0204, 'grad_norm': 1380.137015654999, 'learning_rate': 3.899956880083265e-06, 'epoch': 1.0} +{'loss': 115.33, 'grad_norm': 1620.1078753250843, 'learning_rate': 3.896503089423888e-06, 'epoch': 1.0} +{'loss': 110.2936, 'grad_norm': 1397.8354875776752, 'learning_rate': 3.893050458688916e-06, 'epoch': 1.0} +{'loss': 113.3123, 'grad_norm': 1537.3197669754588, 'learning_rate': 3.889598988534493e-06, 'epoch': 1.0} +{'loss': 112.7646, 'grad_norm': 1420.3178766626584, 'learning_rate': 3.886148679616549e-06, 'epoch': 1.0} +{'loss': 111.7486, 'grad_norm': 1694.1045942795447, 'learning_rate': 3.882699532590789e-06, 'epoch': 1.0} +{'loss': 110.7354, 'grad_norm': 1503.4326803121714, 'learning_rate': 3.879251548112692e-06, 'epoch': 1.0} +{'loss': 111.8096, 'grad_norm': 1744.7060019404141, 'learning_rate': 3.87580472683753e-06, 'epoch': 1.0} +{'loss': 114.3077, 'grad_norm': 1556.995241709018, 'learning_rate': 3.8723590694203385e-06, 'epoch': 1.0} +{'loss': 115.6621, 'grad_norm': 1357.9241046634688, 'learning_rate': 3.868914576515947e-06, 'epoch': 1.0} +{'loss': 109.7489, 'grad_norm': 1724.4795765629576, 'learning_rate': 3.865471248778951e-06, 'epoch': 1.0} +{'loss': 109.8162, 'grad_norm': 1579.1657014386274, 'learning_rate': 3.8620290868637274e-06, 'epoch': 1.0} +{'loss': 114.0322, 'grad_norm': 1455.881447850924, 'learning_rate': 3.85858809142444e-06, 'epoch': 1.0} +{'loss': 112.2055, 'grad_norm': 1376.4707998175115, 'learning_rate': 3.855148263115017e-06, 'epoch': 1.0} +{'loss': 111.1098, 'grad_norm': 1267.0174152427903, 'learning_rate': 3.851709602589181e-06, 'epoch': 1.0} +{'loss': 115.1362, 'grad_norm': 1517.862121065589, 'learning_rate': 3.848272110500418e-06, 'epoch': 1.0} +{'loss': 109.7807, 'grad_norm': 1388.0258644559053, 'learning_rate': 3.844835787501996e-06, 'epoch': 1.0} +{'loss': 108.8583, 'grad_norm': 1481.9600871921612, 'learning_rate': 3.841400634246969e-06, 'epoch': 1.0} +{'loss': 110.5648, 
'grad_norm': 1489.7661133054448, 'learning_rate': 3.837966651388155e-06, 'epoch': 1.0} +{'eval_loss': 124.3641128540039, 'eval_runtime': 18.1924, 'eval_samples_per_second': 38.258, 'eval_steps_per_second': 2.419, 'epoch': 1.0} +{'loss': 103.8209, 'grad_norm': 1131.038519340876, 'learning_rate': 3.834533839578168e-06, 'epoch': 1.0} +[2025-05-29 18:20:24,816] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step52000 is about to be saved! +[2025-05-29 18:20:24,890] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-52000/global_step52000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-29 18:20:24,890] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-52000/global_step52000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-29 18:20:24,927] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-52000/global_step52000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-29 18:20:25,305] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-52000/global_step52000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-29 18:26:15,450] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-52000/global_step52000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2025-05-29 18:26:15,779] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-52000/global_step52000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-29 18:26:15,790] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step52000 is ready now! +[2025-05-29 18:26:15,883] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step52000 is about to be saved! +[2025-05-29 18:26:15,906] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-52000/global_step52000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-29 18:26:15,906] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-52000/global_step52000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-29 18:26:15,941] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-52000/global_step52000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-29 18:26:16,249] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-52000/global_step52000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-29 18:31:58,813] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-52000/global_step52000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2025-05-29 18:31:58,965] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-52000/global_step52000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-29 18:32:08,372] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step52000 is ready now! 
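The {'loss': ...} and {'eval_loss': ...} records above are emitted by the trainer as Python dict literals, one per logging step, so the loss, grad_norm, and learning-rate trajectories can be recovered directly from this file. A minimal parsing sketch, assuming the log is saved as output.log (the filename and the regex/ast.literal_eval approach are illustrative conveniences, not part of the original run):

    import ast
    import re

    metrics = []
    with open("output.log", encoding="utf-8") as f:
        for line in f:
            # Match dict-literal records such as
            # {'loss': 113.4, 'grad_norm': ..., 'epoch': 0.98} and {'eval_loss': ...}
            for rec in re.findall(r"\{'(?:eval_)?loss'.*?\}", line):
                metrics.append(ast.literal_eval(rec))

    train = [r for r in metrics if "loss" in r]
    evals = [r for r in metrics if "eval_loss" in r]
    print(len(train), "train records;", len(evals), "eval records")
    if evals:
        print("latest eval_loss:", evals[-1]["eval_loss"])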
+{'loss': 96.943, 'grad_norm': 1401.5813486758857, 'learning_rate': 3.831102199469379e-06, 'epoch': 1.0} +{'loss': 96.9975, 'grad_norm': 1327.6360806285359, 'learning_rate': 3.827671731713945e-06, 'epoch': 1.0} +{'loss': 94.9161, 'grad_norm': 1325.6095898627998, 'learning_rate': 3.82424243696381e-06, 'epoch': 1.0} +{'loss': 99.3973, 'grad_norm': 1514.1324051935328, 'learning_rate': 3.820814315870676e-06, 'epoch': 1.0} +{'loss': 96.8743, 'grad_norm': 1198.6929764487516, 'learning_rate': 3.817387369086041e-06, 'epoch': 1.0} +{'loss': 95.3924, 'grad_norm': 1228.6874240143984, 'learning_rate': 3.8139615972611654e-06, 'epoch': 1.0} +{'loss': 95.212, 'grad_norm': 1163.7666469033616, 'learning_rate': 3.81053700104709e-06, 'epoch': 1.0} +{'loss': 97.7919, 'grad_norm': 1155.2997359581148, 'learning_rate': 3.8071135810946415e-06, 'epoch': 1.0} +{'loss': 97.4762, 'grad_norm': 1213.380329441513, 'learning_rate': 3.803691338054406e-06, 'epoch': 1.0} +{'loss': 95.3431, 'grad_norm': 1161.3896120659874, 'learning_rate': 3.8002702725767646e-06, 'epoch': 1.0} +{'loss': 96.489, 'grad_norm': 1373.4722468569412, 'learning_rate': 3.796850385311861e-06, 'epoch': 1.0} +{'loss': 94.3933, 'grad_norm': 1388.6973317527227, 'learning_rate': 3.7934316769096203e-06, 'epoch': 1.0} +{'loss': 97.9782, 'grad_norm': 1196.4765269472364, 'learning_rate': 3.790014148019738e-06, 'epoch': 1.0} +{'loss': 94.377, 'grad_norm': 1180.778607156742, 'learning_rate': 3.7865977992916945e-06, 'epoch': 1.0} +{'loss': 98.107, 'grad_norm': 1199.203265571379, 'learning_rate': 3.7831826313747454e-06, 'epoch': 1.0} +{'loss': 95.8131, 'grad_norm': 1179.318116914337, 'learning_rate': 3.779768644917914e-06, 'epoch': 1.0} +{'loss': 95.7177, 'grad_norm': 1270.095520739344, 'learning_rate': 3.776355840570003e-06, 'epoch': 1.0} +{'loss': 97.389, 'grad_norm': 1291.1363769756763, 'learning_rate': 3.7729442189795873e-06, 'epoch': 1.0} +{'loss': 96.8909, 'grad_norm': 1094.1538843491603, 'learning_rate': 3.7695337807950236e-06, 'epoch': 1.0} +{'loss': 95.7944, 'grad_norm': 1214.5832551169842, 'learning_rate': 3.7661245266644453e-06, 'epoch': 1.0} +{'loss': 97.4081, 'grad_norm': 1138.807474687909, 'learning_rate': 3.762716457235751e-06, 'epoch': 1.0} +{'loss': 97.4186, 'grad_norm': 1297.7008632416182, 'learning_rate': 3.7593095731566186e-06, 'epoch': 1.0} +{'loss': 97.543, 'grad_norm': 1195.8180921707321, 'learning_rate': 3.755903875074499e-06, 'epoch': 1.0} +{'loss': 97.0451, 'grad_norm': 1410.5646816965184, 'learning_rate': 3.7524993636366223e-06, 'epoch': 1.0} +{'loss': 95.9479, 'grad_norm': 1338.5572553759605, 'learning_rate': 3.7490960394899954e-06, 'epoch': 1.0} +{'loss': 96.8482, 'grad_norm': 1317.8433930740446, 'learning_rate': 3.745693903281391e-06, 'epoch': 1.01} +{'loss': 95.5979, 'grad_norm': 1221.4766107099788, 'learning_rate': 3.7422929556573595e-06, 'epoch': 1.01} +{'loss': 95.4295, 'grad_norm': 1207.0556845016679, 'learning_rate': 3.738893197264222e-06, 'epoch': 1.01} +{'loss': 97.1923, 'grad_norm': 1238.7571331914583, 'learning_rate': 3.735494628748082e-06, 'epoch': 1.01} +{'loss': 96.3542, 'grad_norm': 1262.9427961778288, 'learning_rate': 3.7320972507548146e-06, 'epoch': 1.01} +{'loss': 94.9349, 'grad_norm': 1102.3662385543628, 'learning_rate': 3.7287010639300635e-06, 'epoch': 1.01} +{'loss': 97.3288, 'grad_norm': 1259.4873054355885, 'learning_rate': 3.725306068919249e-06, 'epoch': 1.01} +{'loss': 96.8945, 'grad_norm': 1333.5255861737403, 'learning_rate': 3.72191226636756e-06, 'epoch': 1.01} +{'loss': 96.606, 'grad_norm': 
1266.9157613722198, 'learning_rate': 3.718519656919969e-06, 'epoch': 1.01} +{'loss': 94.7858, 'grad_norm': 1314.7170170492354, 'learning_rate': 3.7151282412212197e-06, 'epoch': 1.01} +{'loss': 94.7236, 'grad_norm': 1346.9523364268985, 'learning_rate': 3.7117380199158204e-06, 'epoch': 1.01} +{'loss': 97.2838, 'grad_norm': 1634.1011929511992, 'learning_rate': 3.7083489936480576e-06, 'epoch': 1.01} +{'loss': 95.1772, 'grad_norm': 1169.1872792823265, 'learning_rate': 3.7049611630619888e-06, 'epoch': 1.01} +{'loss': 95.0149, 'grad_norm': 1126.9088571696816, 'learning_rate': 3.7015745288014492e-06, 'epoch': 1.01} +{'loss': 96.5316, 'grad_norm': 1232.5645356840143, 'learning_rate': 3.698189091510046e-06, 'epoch': 1.01} +{'loss': 99.1108, 'grad_norm': 1202.5366178767915, 'learning_rate': 3.694804851831154e-06, 'epoch': 1.01} +{'loss': 95.9944, 'grad_norm': 1073.8649660005792, 'learning_rate': 3.6914218104079227e-06, 'epoch': 1.01} +{'loss': 94.3818, 'grad_norm': 1039.962675804095, 'learning_rate': 3.688039967883269e-06, 'epoch': 1.01} +{'loss': 95.7866, 'grad_norm': 1124.9643323076543, 'learning_rate': 3.684659324899894e-06, 'epoch': 1.01} +{'loss': 97.4815, 'grad_norm': 1129.2662445033823, 'learning_rate': 3.6812798821002636e-06, 'epoch': 1.01} +{'loss': 96.3079, 'grad_norm': 1349.9847617885484, 'learning_rate': 3.6779016401266153e-06, 'epoch': 1.01} +{'loss': 97.0402, 'grad_norm': 1307.7872053861051, 'learning_rate': 3.6745245996209566e-06, 'epoch': 1.01} +{'loss': 95.5863, 'grad_norm': 1248.4371935498966, 'learning_rate': 3.671148761225072e-06, 'epoch': 1.01} +{'loss': 95.1227, 'grad_norm': 1359.4788904910636, 'learning_rate': 3.6677741255805076e-06, 'epoch': 1.01} +{'loss': 95.1693, 'grad_norm': 1210.891260749001, 'learning_rate': 3.664400693328595e-06, 'epoch': 1.01} +{'loss': 94.6436, 'grad_norm': 1182.5310024266082, 'learning_rate': 3.66102846511043e-06, 'epoch': 1.01} +{'loss': 96.2782, 'grad_norm': 1258.9900340712934, 'learning_rate': 3.6576574415668787e-06, 'epoch': 1.01} +{'loss': 94.2418, 'grad_norm': 1196.1965856269674, 'learning_rate': 3.6542876233385773e-06, 'epoch': 1.01} +{'loss': 95.5562, 'grad_norm': 1366.755535888022, 'learning_rate': 3.650919011065933e-06, 'epoch': 1.01} +{'loss': 94.1258, 'grad_norm': 1220.367761147946, 'learning_rate': 3.647551605389128e-06, 'epoch': 1.01} +{'loss': 94.236, 'grad_norm': 1138.99649299921, 'learning_rate': 3.6441854069481174e-06, 'epoch': 1.01} +{'loss': 96.9785, 'grad_norm': 1141.2772684114989, 'learning_rate': 3.640820416382618e-06, 'epoch': 1.01} +{'loss': 95.0246, 'grad_norm': 1121.7919260364815, 'learning_rate': 3.6374566343321203e-06, 'epoch': 1.01} +{'loss': 94.138, 'grad_norm': 1178.6804376951116, 'learning_rate': 3.6340940614358853e-06, 'epoch': 1.01} +{'loss': 97.1416, 'grad_norm': 1227.6965431490964, 'learning_rate': 3.6307326983329463e-06, 'epoch': 1.01} +{'loss': 99.0159, 'grad_norm': 1213.2490916191698, 'learning_rate': 3.62737254566211e-06, 'epoch': 1.01} +{'loss': 95.8824, 'grad_norm': 1209.527554529029, 'learning_rate': 3.6240136040619445e-06, 'epoch': 1.01} +{'loss': 95.5024, 'grad_norm': 1420.5970449105707, 'learning_rate': 3.620655874170792e-06, 'epoch': 1.01} +{'loss': 95.3845, 'grad_norm': 1099.2609366694232, 'learning_rate': 3.6172993566267623e-06, 'epoch': 1.01} +{'loss': 97.7594, 'grad_norm': 1288.3198945969225, 'learning_rate': 3.613944052067737e-06, 'epoch': 1.01} +{'loss': 96.565, 'grad_norm': 1135.1256952348085, 'learning_rate': 3.6105899611313732e-06, 'epoch': 1.01} +{'loss': 95.7735, 'grad_norm': 
1340.6339459739847, 'learning_rate': 3.6072370844550865e-06, 'epoch': 1.01} +{'loss': 95.3582, 'grad_norm': 1292.6871485984173, 'learning_rate': 3.6038854226760655e-06, 'epoch': 1.01} +{'loss': 96.8321, 'grad_norm': 1211.857960126768, 'learning_rate': 3.600534976431266e-06, 'epoch': 1.01} +{'loss': 97.5602, 'grad_norm': 1351.7271334559368, 'learning_rate': 3.597185746357419e-06, 'epoch': 1.01} +{'loss': 95.3473, 'grad_norm': 1200.2989830942986, 'learning_rate': 3.5938377330910245e-06, 'epoch': 1.01} +{'loss': 96.8528, 'grad_norm': 1145.6964583572333, 'learning_rate': 3.590490937268345e-06, 'epoch': 1.01} +{'loss': 96.0142, 'grad_norm': 1215.5148984507719, 'learning_rate': 3.5871453595254122e-06, 'epoch': 1.01} +{'loss': 94.2589, 'grad_norm': 1135.912955937718, 'learning_rate': 3.5838010004980263e-06, 'epoch': 1.01} +{'loss': 97.5026, 'grad_norm': 1189.7971138013445, 'learning_rate': 3.580457860821761e-06, 'epoch': 1.01} +{'loss': 96.5355, 'grad_norm': 1431.9516652245295, 'learning_rate': 3.577115941131959e-06, 'epoch': 1.01} +{'loss': 94.9596, 'grad_norm': 1198.2777624456805, 'learning_rate': 3.573775242063723e-06, 'epoch': 1.01} +{'loss': 96.4997, 'grad_norm': 1177.6028560555444, 'learning_rate': 3.5704357642519295e-06, 'epoch': 1.02} +{'loss': 93.3366, 'grad_norm': 1460.442346830118, 'learning_rate': 3.567097508331218e-06, 'epoch': 1.02} +{'loss': 97.0301, 'grad_norm': 1265.999750253904, 'learning_rate': 3.563760474936001e-06, 'epoch': 1.02} +{'loss': 96.1556, 'grad_norm': 1398.9051247350853, 'learning_rate': 3.5604246647004627e-06, 'epoch': 1.02} +{'loss': 95.1777, 'grad_norm': 1224.3214709452898, 'learning_rate': 3.557090078258545e-06, 'epoch': 1.02} +{'loss': 96.6686, 'grad_norm': 1210.579181543313, 'learning_rate': 3.55375671624396e-06, 'epoch': 1.02} +{'loss': 95.6142, 'grad_norm': 1391.2323073013247, 'learning_rate': 3.550424579290187e-06, 'epoch': 1.02} +{'loss': 95.3759, 'grad_norm': 1311.702355576851, 'learning_rate': 3.547093668030479e-06, 'epoch': 1.02} +{'loss': 96.449, 'grad_norm': 1427.7901032286893, 'learning_rate': 3.5437639830978456e-06, 'epoch': 1.02} +{'loss': 99.167, 'grad_norm': 1272.3699995644295, 'learning_rate': 3.5404355251250755e-06, 'epoch': 1.02} +{'loss': 95.3254, 'grad_norm': 1123.4574530962452, 'learning_rate': 3.5371082947447133e-06, 'epoch': 1.02} +{'loss': 94.9779, 'grad_norm': 1179.0177845240657, 'learning_rate': 3.533782292589071e-06, 'epoch': 1.02} +{'loss': 95.2919, 'grad_norm': 1128.419673527548, 'learning_rate': 3.530457519290238e-06, 'epoch': 1.02} +{'loss': 95.3862, 'grad_norm': 1306.5431394538298, 'learning_rate': 3.527133975480057e-06, 'epoch': 1.02} +{'loss': 95.4976, 'grad_norm': 1211.5752368413712, 'learning_rate': 3.5238116617901486e-06, 'epoch': 1.02} +{'loss': 95.4304, 'grad_norm': 1109.229316556971, 'learning_rate': 3.5204905788518893e-06, 'epoch': 1.02} +{'loss': 96.0964, 'grad_norm': 1224.9168917223978, 'learning_rate': 3.517170727296426e-06, 'epoch': 1.02} +{'loss': 95.2547, 'grad_norm': 1210.5085957326887, 'learning_rate': 3.5138521077546762e-06, 'epoch': 1.02} +{'loss': 95.2714, 'grad_norm': 1176.2662735253036, 'learning_rate': 3.510534720857313e-06, 'epoch': 1.02} +{'loss': 96.2522, 'grad_norm': 1366.6615689099963, 'learning_rate': 3.507218567234788e-06, 'epoch': 1.02} +{'loss': 94.735, 'grad_norm': 1290.0280363224867, 'learning_rate': 3.503903647517307e-06, 'epoch': 1.02} +{'loss': 95.5039, 'grad_norm': 1090.1334064543544, 'learning_rate': 3.5005899623348493e-06, 'epoch': 1.02} +{'loss': 94.9572, 'grad_norm': 1166.3297827917545, 
'learning_rate': 3.49727751231715e-06, 'epoch': 1.02} +{'loss': 96.9611, 'grad_norm': 1471.4091698324569, 'learning_rate': 3.4939662980937194e-06, 'epoch': 1.02} +{'loss': 94.9742, 'grad_norm': 1262.5623100858586, 'learning_rate': 3.4906563202938326e-06, 'epoch': 1.02} +{'loss': 93.3166, 'grad_norm': 1346.6842260678745, 'learning_rate': 3.487347579546524e-06, 'epoch': 1.02} +{'loss': 98.2698, 'grad_norm': 1271.4316403282758, 'learning_rate': 3.4840400764805948e-06, 'epoch': 1.02} +{'loss': 94.6518, 'grad_norm': 3852.952876224986, 'learning_rate': 3.4807338117246082e-06, 'epoch': 1.02} +{'loss': 95.1775, 'grad_norm': 1123.3582718432137, 'learning_rate': 3.4774287859068988e-06, 'epoch': 1.02} +{'loss': 97.2023, 'grad_norm': 1303.6779490128067, 'learning_rate': 3.474124999655565e-06, 'epoch': 1.02} +{'loss': 95.0988, 'grad_norm': 1171.6296431285875, 'learning_rate': 3.4708224535984634e-06, 'epoch': 1.02} +{'loss': 94.2217, 'grad_norm': 1276.5727095074872, 'learning_rate': 3.46752114836322e-06, 'epoch': 1.02} +{'loss': 95.699, 'grad_norm': 1276.4996488619965, 'learning_rate': 3.4642210845772183e-06, 'epoch': 1.02} +{'loss': 99.1064, 'grad_norm': 1232.0056653027934, 'learning_rate': 3.4609222628676153e-06, 'epoch': 1.02} +{'loss': 95.81, 'grad_norm': 1149.7187557805103, 'learning_rate': 3.4576246838613313e-06, 'epoch': 1.02} +{'loss': 95.453, 'grad_norm': 1278.3554062272033, 'learning_rate': 3.454328348185042e-06, 'epoch': 1.02} +{'loss': 95.3043, 'grad_norm': 1291.9024940370032, 'learning_rate': 3.4510332564651926e-06, 'epoch': 1.02} +{'loss': 94.5264, 'grad_norm': 1086.7412381464674, 'learning_rate': 3.4477394093279883e-06, 'epoch': 1.02} +{'loss': 96.8152, 'grad_norm': 1229.3038036560938, 'learning_rate': 3.444446807399402e-06, 'epoch': 1.02} +{'loss': 95.804, 'grad_norm': 1178.3619325681107, 'learning_rate': 3.4411554513051737e-06, 'epoch': 1.02} +{'loss': 94.3625, 'grad_norm': 1189.2441978750442, 'learning_rate': 3.437865341670795e-06, 'epoch': 1.02} +{'loss': 96.831, 'grad_norm': 1115.8172373171806, 'learning_rate': 3.4345764791215298e-06, 'epoch': 1.02} +{'loss': 96.2886, 'grad_norm': 1133.4419777079904, 'learning_rate': 3.431288864282398e-06, 'epoch': 1.02} +{'loss': 93.7242, 'grad_norm': 1395.269711255663, 'learning_rate': 3.428002497778192e-06, 'epoch': 1.02} +{'loss': 94.297, 'grad_norm': 1444.156832256435, 'learning_rate': 3.4247173802334554e-06, 'epoch': 1.02} +{'loss': 96.1714, 'grad_norm': 1247.947680748035, 'learning_rate': 3.4214335122725074e-06, 'epoch': 1.02} +{'loss': 96.0519, 'grad_norm': 1141.8629942116338, 'learning_rate': 3.4181508945194185e-06, 'epoch': 1.02} +{'loss': 95.1635, 'grad_norm': 1188.2422886214738, 'learning_rate': 3.414869527598025e-06, 'epoch': 1.02} +{'loss': 94.7542, 'grad_norm': 1270.4465881941326, 'learning_rate': 3.4115894121319303e-06, 'epoch': 1.02} +{'loss': 94.5409, 'grad_norm': 1114.0781540350254, 'learning_rate': 3.40831054874449e-06, 'epoch': 1.02} +{'loss': 97.6894, 'grad_norm': 1071.5326132252408, 'learning_rate': 3.405032938058835e-06, 'epoch': 1.02} +{'loss': 94.0826, 'grad_norm': 1410.9780626367892, 'learning_rate': 3.401756580697849e-06, 'epoch': 1.02} +{'loss': 94.4411, 'grad_norm': 1333.4333722657998, 'learning_rate': 3.3984814772841734e-06, 'epoch': 1.03} +{'loss': 95.4117, 'grad_norm': 1295.8220082492687, 'learning_rate': 3.3952076284402257e-06, 'epoch': 1.03} +{'loss': 96.6864, 'grad_norm': 1427.2067276596658, 'learning_rate': 3.3919350347881696e-06, 'epoch': 1.03} +{'loss': 92.348, 'grad_norm': 1196.8260409667168, 'learning_rate': 
3.3886636969499444e-06, 'epoch': 1.03} +{'loss': 97.2459, 'grad_norm': 1109.4291894486882, 'learning_rate': 3.385393615547239e-06, 'epoch': 1.03} +{'loss': 97.2092, 'grad_norm': 1111.2039561732292, 'learning_rate': 3.3821247912015066e-06, 'epoch': 1.03} +{'loss': 94.9778, 'grad_norm': 1152.04540047209, 'learning_rate': 3.378857224533968e-06, 'epoch': 1.03} +{'loss': 95.1833, 'grad_norm': 1204.9850438276637, 'learning_rate': 3.375590916165594e-06, 'epoch': 1.03} +{'loss': 94.269, 'grad_norm': 1183.6937782043804, 'learning_rate': 3.3723258667171298e-06, 'epoch': 1.03} +{'loss': 94.2471, 'grad_norm': 1027.193218230385, 'learning_rate': 3.3690620768090708e-06, 'epoch': 1.03} +{'loss': 95.7896, 'grad_norm': 1237.9704018872128, 'learning_rate': 3.365799547061671e-06, 'epoch': 1.03} +{'loss': 95.4652, 'grad_norm': 1185.0820701408302, 'learning_rate': 3.3625382780949576e-06, 'epoch': 1.03} +{'loss': 94.325, 'grad_norm': 1216.3449840720782, 'learning_rate': 3.3592782705287043e-06, 'epoch': 1.03} +{'loss': 94.6448, 'grad_norm': 1157.3719247867086, 'learning_rate': 3.356019524982457e-06, 'epoch': 1.03} +{'loss': 94.1866, 'grad_norm': 1114.478227678605, 'learning_rate': 3.3527620420755135e-06, 'epoch': 1.03} +{'loss': 93.4485, 'grad_norm': 1119.3086736750836, 'learning_rate': 3.3495058224269317e-06, 'epoch': 1.03} +{'loss': 95.2965, 'grad_norm': 1176.1971533084163, 'learning_rate': 3.346250866655538e-06, 'epoch': 1.03} +{'loss': 98.1841, 'grad_norm': 1049.691439115736, 'learning_rate': 3.342997175379906e-06, 'epoch': 1.03} +{'loss': 92.8122, 'grad_norm': 1003.7481793028064, 'learning_rate': 3.3397447492183833e-06, 'epoch': 1.03} +{'loss': 95.2925, 'grad_norm': 1052.1874354097208, 'learning_rate': 3.336493588789064e-06, 'epoch': 1.03} +{'loss': 96.0388, 'grad_norm': 1095.913037101014, 'learning_rate': 3.3332436947098047e-06, 'epoch': 1.03} +{'loss': 95.1454, 'grad_norm': 1234.08035966421, 'learning_rate': 3.3299950675982305e-06, 'epoch': 1.03} +{'loss': 93.8114, 'grad_norm': 1241.133331248731, 'learning_rate': 3.326747708071717e-06, 'epoch': 1.03} +{'loss': 93.4797, 'grad_norm': 1096.7537873850079, 'learning_rate': 3.3235016167473943e-06, 'epoch': 1.03} +{'loss': 93.7127, 'grad_norm': 1391.3305544770471, 'learning_rate': 3.3202567942421684e-06, 'epoch': 1.03} +{'loss': 93.3312, 'grad_norm': 1245.9915414473442, 'learning_rate': 3.317013241172684e-06, 'epoch': 1.03} +{'loss': 93.1607, 'grad_norm': 1204.5863618700257, 'learning_rate': 3.313770958155362e-06, 'epoch': 1.03} +{'loss': 96.0404, 'grad_norm': 1297.57391976549, 'learning_rate': 3.3105299458063724e-06, 'epoch': 1.03} +{'loss': 93.8894, 'grad_norm': 1265.782685841648, 'learning_rate': 3.307290204741641e-06, 'epoch': 1.03} +{'loss': 97.5559, 'grad_norm': 2230.1562314226358, 'learning_rate': 3.304051735576862e-06, 'epoch': 1.03} +{'loss': 96.0277, 'grad_norm': 1233.583354532451, 'learning_rate': 3.300814538927478e-06, 'epoch': 1.03} +{'loss': 96.0528, 'grad_norm': 1153.91781795926, 'learning_rate': 3.297578615408701e-06, 'epoch': 1.03} +{'loss': 95.0207, 'grad_norm': 1073.4983824105718, 'learning_rate': 3.294343965635489e-06, 'epoch': 1.03} +{'loss': 94.43, 'grad_norm': 1126.4948117734493, 'learning_rate': 3.2911105902225615e-06, 'epoch': 1.03} +{'loss': 94.7633, 'grad_norm': 1367.5248267141442, 'learning_rate': 3.287878489784404e-06, 'epoch': 1.03} +{'loss': 93.051, 'grad_norm': 1171.5083975373561, 'learning_rate': 3.2846476649352455e-06, 'epoch': 1.03} +{'loss': 97.6404, 'grad_norm': 1181.540175571969, 'learning_rate': 3.281418116289089e-06, 
'epoch': 1.03} +{'loss': 94.6051, 'grad_norm': 1398.3617619841955, 'learning_rate': 3.2781898444596804e-06, 'epoch': 1.03} +{'loss': 95.4397, 'grad_norm': 1160.185668669416, 'learning_rate': 3.274962850060528e-06, 'epoch': 1.03} +{'loss': 96.016, 'grad_norm': 1144.9979673246064, 'learning_rate': 3.271737133704904e-06, 'epoch': 1.03} +{'loss': 94.3073, 'grad_norm': 1381.0556610648498, 'learning_rate': 3.2685126960058246e-06, 'epoch': 1.03} +{'loss': 94.0489, 'grad_norm': 1174.7212192844402, 'learning_rate': 3.2652895375760772e-06, 'epoch': 1.03} +{'loss': 94.5677, 'grad_norm': 1454.1472136338182, 'learning_rate': 3.262067659028196e-06, 'epoch': 1.03} +{'loss': 96.0216, 'grad_norm': 1284.335354103843, 'learning_rate': 3.2588470609744725e-06, 'epoch': 1.03} +{'loss': 94.9646, 'grad_norm': 1366.9751499825775, 'learning_rate': 3.2556277440269637e-06, 'epoch': 1.03} +{'loss': 97.109, 'grad_norm': 1192.9703932223736, 'learning_rate': 3.2524097087974706e-06, 'epoch': 1.03} +{'loss': 94.9755, 'grad_norm': 1240.8456723700656, 'learning_rate': 3.249192955897562e-06, 'epoch': 1.03} +{'loss': 93.4792, 'grad_norm': 1067.9275325666872, 'learning_rate': 3.245977485938556e-06, 'epoch': 1.03} +{'loss': 95.0422, 'grad_norm': 1208.5527130138876, 'learning_rate': 3.242763299531526e-06, 'epoch': 1.03} +{'loss': 96.0672, 'grad_norm': 1279.161701726287, 'learning_rate': 3.239550397287311e-06, 'epoch': 1.03} +{'loss': 94.6802, 'grad_norm': 1202.3017155656262, 'learning_rate': 3.2363387798164915e-06, 'epoch': 1.03} +{'loss': 94.1852, 'grad_norm': 1237.1051858518065, 'learning_rate': 3.233128447729419e-06, 'epoch': 1.03} +{'loss': 92.4375, 'grad_norm': 1148.5783317585847, 'learning_rate': 3.229919401636189e-06, 'epoch': 1.04} +{'loss': 95.4828, 'grad_norm': 1293.6732552612896, 'learning_rate': 3.226711642146655e-06, 'epoch': 1.04} +{'loss': 93.6796, 'grad_norm': 1114.8371373468958, 'learning_rate': 3.2235051698704345e-06, 'epoch': 1.04} +{'loss': 94.101, 'grad_norm': 1178.453637861665, 'learning_rate': 3.220299985416886e-06, 'epoch': 1.04} +{'loss': 95.9366, 'grad_norm': 1171.5290323228292, 'learning_rate': 3.2170960893951398e-06, 'epoch': 1.04} +{'loss': 92.6026, 'grad_norm': 1119.0359666328022, 'learning_rate': 3.2138934824140667e-06, 'epoch': 1.04} +{'loss': 95.2066, 'grad_norm': 1253.4781088844907, 'learning_rate': 3.2106921650822997e-06, 'epoch': 1.04} +{'loss': 94.505, 'grad_norm': 1138.6607729167956, 'learning_rate': 3.2074921380082224e-06, 'epoch': 1.04} +{'loss': 95.8093, 'grad_norm': 1298.0958017234452, 'learning_rate': 3.2042934017999795e-06, 'epoch': 1.04} +{'loss': 93.8494, 'grad_norm': 1311.5828535474589, 'learning_rate': 3.20109595706547e-06, 'epoch': 1.04} +{'loss': 95.3795, 'grad_norm': 1171.8450972905214, 'learning_rate': 3.197899804412342e-06, 'epoch': 1.04} +{'loss': 95.0683, 'grad_norm': 1216.843320278211, 'learning_rate': 3.1947049444480006e-06, 'epoch': 1.04} +{'loss': 94.9724, 'grad_norm': 1164.7313845955066, 'learning_rate': 3.1915113777796e-06, 'epoch': 1.04} +{'loss': 95.6665, 'grad_norm': 1526.0451480516845, 'learning_rate': 3.1883191050140605e-06, 'epoch': 1.04} +{'loss': 92.7482, 'grad_norm': 1071.0393592649575, 'learning_rate': 3.1851281267580504e-06, 'epoch': 1.04} +{'loss': 94.8518, 'grad_norm': 1368.726908483801, 'learning_rate': 3.18193844361799e-06, 'epoch': 1.04} +{'loss': 94.3677, 'grad_norm': 1187.369487563253, 'learning_rate': 3.178750056200054e-06, 'epoch': 1.04} +{'loss': 93.9137, 'grad_norm': 1176.8333046664843, 'learning_rate': 3.1755629651101684e-06, 'epoch': 1.04} 
+{'loss': 94.7363, 'grad_norm': 1322.721227019864, 'learning_rate': 3.1723771709540207e-06, 'epoch': 1.04} +[2025-05-29 20:20:34,258] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step54000 is about to be saved! +[2025-05-29 20:20:34,347] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-54000/global_step54000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-29 20:20:34,347] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-54000/global_step54000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-29 20:20:34,434] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-54000/global_step54000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-29 20:20:34,507] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-54000/global_step54000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-29 20:26:14,835] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-54000/global_step54000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2025-05-29 20:26:14,903] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-54000/global_step54000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-29 20:26:15,318] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step54000 is ready now! +[2025-05-29 20:26:15,477] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step54000 is about to be saved! +[2025-05-29 20:26:15,500] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-54000/global_step54000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-29 20:26:15,500] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-54000/global_step54000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-29 20:26:15,597] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-54000/global_step54000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-29 20:26:15,707] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-54000/global_step54000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-29 20:33:07,522] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-54000/global_step54000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2025-05-29 20:33:07,634] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-54000/global_step54000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-29 20:33:12,542] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step54000 is ready now! 
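Each checkpoint save above writes ZeRO-partitioned shards: zero_pp_rank_0_mp_rank_00_model_states.pt holds the module states and bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt the bf16 optimizer partitions, one pair per data-parallel rank. To export a single consolidated fp32 state dict for inference, DeepSpeed ships a zero_to_fp32 utility; a sketch assuming a standard DeepSpeed install (the checkpoint path is taken from the log, everything else is illustrative):

    from deepspeed.utils.zero_to_fp32 import (
        get_fp32_state_dict_from_zero_checkpoint,
    )

    # Parent checkpoint directory; the 'latest' file inside it points DeepSpeed
    # at the global_step54000 subdirectory seen in the log above.
    ckpt_dir = "hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-54000"

    # Reconstructs full-precision weights on CPU from the partitioned states.
    state_dict = get_fp32_state_dict_from_zero_checkpoint(ckpt_dir)
    print(sum(t.numel() for t in state_dict.values()), "parameters consolidated")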
+{'loss': 97.2771, 'grad_norm': 1329.5281659132604, 'learning_rate': 3.169192674337049e-06, 'epoch': 1.04} +{'loss': 93.8634, 'grad_norm': 1213.4455113686206, 'learning_rate': 3.166009475864442e-06, 'epoch': 1.04} +{'loss': 94.7479, 'grad_norm': 1061.4259085527233, 'learning_rate': 3.162827576141142e-06, 'epoch': 1.04} +{'loss': 93.8547, 'grad_norm': 1088.0267767309651, 'learning_rate': 3.159646975771842e-06, 'epoch': 1.04} +{'loss': 96.0777, 'grad_norm': 1187.5126579056202, 'learning_rate': 3.1564676753609948e-06, 'epoch': 1.04} +{'loss': 96.9366, 'grad_norm': 1269.8441306613508, 'learning_rate': 3.153289675512807e-06, 'epoch': 1.04} +{'loss': 94.0383, 'grad_norm': 1066.0270558190232, 'learning_rate': 3.15011297683123e-06, 'epoch': 1.04} +{'loss': 96.2089, 'grad_norm': 1155.1223098159455, 'learning_rate': 3.1469375799199717e-06, 'epoch': 1.04} +{'loss': 96.9195, 'grad_norm': 1279.2248484612007, 'learning_rate': 3.1437634853824907e-06, 'epoch': 1.04} +{'loss': 95.3075, 'grad_norm': 1253.7243271443747, 'learning_rate': 3.140590693822002e-06, 'epoch': 1.04} +{'loss': 94.6421, 'grad_norm': 1276.942313587443, 'learning_rate': 3.1374192058414755e-06, 'epoch': 1.04} +{'loss': 101.3536, 'grad_norm': 1191.8721842907237, 'learning_rate': 3.134249022043625e-06, 'epoch': 1.04} +{'loss': 94.6031, 'grad_norm': 1270.8981887034206, 'learning_rate': 3.13108014303092e-06, 'epoch': 1.04} +{'loss': 94.6992, 'grad_norm': 1103.404085064421, 'learning_rate': 3.12791256940558e-06, 'epoch': 1.04} +{'loss': 93.6259, 'grad_norm': 1192.6932636652937, 'learning_rate': 3.1247463017695833e-06, 'epoch': 1.04} +{'loss': 93.0395, 'grad_norm': 1151.617817733111, 'learning_rate': 3.1215813407246566e-06, 'epoch': 1.04} +{'loss': 94.4499, 'grad_norm': 1154.4340164486505, 'learning_rate': 3.118417686872276e-06, 'epoch': 1.04} +{'loss': 95.4579, 'grad_norm': 1219.5038453878622, 'learning_rate': 3.1152553408136686e-06, 'epoch': 1.04} +{'loss': 93.2123, 'grad_norm': 1161.59603342642, 'learning_rate': 3.112094303149813e-06, 'epoch': 1.04} +{'loss': 95.8171, 'grad_norm': 1253.144245833372, 'learning_rate': 3.108934574481445e-06, 'epoch': 1.04} +{'loss': 93.0828, 'grad_norm': 1240.124367550652, 'learning_rate': 3.1057761554090483e-06, 'epoch': 1.04} +{'loss': 95.8987, 'grad_norm': 1243.9983878185392, 'learning_rate': 3.1026190465328553e-06, 'epoch': 1.04} +{'loss': 96.4302, 'grad_norm': 1211.610405942876, 'learning_rate': 3.099463248452852e-06, 'epoch': 1.04} +{'loss': 93.9572, 'grad_norm': 1259.2291386021666, 'learning_rate': 3.0963087617687737e-06, 'epoch': 1.04} +{'loss': 97.3918, 'grad_norm': 1114.2929434874627, 'learning_rate': 3.0931555870801033e-06, 'epoch': 1.04} +{'loss': 94.4295, 'grad_norm': 1271.1913712594157, 'learning_rate': 3.0900037249860827e-06, 'epoch': 1.04} +{'loss': 94.0365, 'grad_norm': 1059.5455763865336, 'learning_rate': 3.0868531760857037e-06, 'epoch': 1.04} +{'loss': 93.8506, 'grad_norm': 1122.282475200121, 'learning_rate': 3.0837039409776993e-06, 'epoch': 1.04} +{'loss': 94.8126, 'grad_norm': 1194.203345316825, 'learning_rate': 3.0805560202605587e-06, 'epoch': 1.04} +{'loss': 93.7029, 'grad_norm': 1197.0916722655948, 'learning_rate': 3.0774094145325195e-06, 'epoch': 1.04} +{'loss': 96.3828, 'grad_norm': 1121.798022721816, 'learning_rate': 3.074264124391574e-06, 'epoch': 1.04} +{'loss': 94.0371, 'grad_norm': 1090.0192129701213, 'learning_rate': 3.0711201504354628e-06, 'epoch': 1.04} +{'loss': 93.9056, 'grad_norm': 1207.992269323312, 'learning_rate': 3.0679774932616723e-06, 'epoch': 1.04} +{'loss': 93.1835, 
'grad_norm': 1136.0222818842983, 'learning_rate': 3.064836153467442e-06, 'epoch': 1.05} +{'loss': 95.9167, 'grad_norm': 1253.0338030203447, 'learning_rate': 3.0616961316497563e-06, 'epoch': 1.05} +{'loss': 93.5756, 'grad_norm': 1100.0076770971991, 'learning_rate': 3.0585574284053552e-06, 'epoch': 1.05} +{'loss': 91.4423, 'grad_norm': 1211.673589385108, 'learning_rate': 3.0554200443307324e-06, 'epoch': 1.05} +{'loss': 98.4185, 'grad_norm': 1248.3616892122511, 'learning_rate': 3.052283980022118e-06, 'epoch': 1.05} +{'loss': 95.4338, 'grad_norm': 1185.000028195395, 'learning_rate': 3.0491492360755003e-06, 'epoch': 1.05} +{'loss': 93.4778, 'grad_norm': 1096.5873484974688, 'learning_rate': 3.04601581308661e-06, 'epoch': 1.05} +{'loss': 92.8917, 'grad_norm': 1053.131355795886, 'learning_rate': 3.0428837116509334e-06, 'epoch': 1.05} +{'loss': 93.3649, 'grad_norm': 1162.2696187616893, 'learning_rate': 3.0397529323637075e-06, 'epoch': 1.05} +{'loss': 94.3535, 'grad_norm': 1257.7753071132859, 'learning_rate': 3.0366234758199097e-06, 'epoch': 1.05} +{'loss': 92.8138, 'grad_norm': 1163.6370868426773, 'learning_rate': 3.033495342614271e-06, 'epoch': 1.05} +{'loss': 94.0311, 'grad_norm': 1208.7069560753437, 'learning_rate': 3.0303685333412657e-06, 'epoch': 1.05} +{'loss': 96.296, 'grad_norm': 1159.2716080014907, 'learning_rate': 3.0272430485951244e-06, 'epoch': 1.05} +{'loss': 95.3154, 'grad_norm': 1208.4778759420647, 'learning_rate': 3.024118888969826e-06, 'epoch': 1.05} +{'loss': 95.0888, 'grad_norm': 1120.7692957421166, 'learning_rate': 3.02099605505909e-06, 'epoch': 1.05} +{'loss': 90.5309, 'grad_norm': 1062.2910626593243, 'learning_rate': 3.0178745474563886e-06, 'epoch': 1.05} +{'loss': 93.0295, 'grad_norm': 1065.3906007594335, 'learning_rate': 3.0147543667549374e-06, 'epoch': 1.05} +{'loss': 92.6384, 'grad_norm': 1214.1782016717166, 'learning_rate': 3.011635513547706e-06, 'epoch': 1.05} +{'loss': 94.3193, 'grad_norm': 1250.4030564745058, 'learning_rate': 3.008517988427415e-06, 'epoch': 1.05} +{'loss': 95.046, 'grad_norm': 1382.9262992587824, 'learning_rate': 3.005401791986522e-06, 'epoch': 1.05} +{'loss': 94.0776, 'grad_norm': 1318.5297002603265, 'learning_rate': 3.002286924817237e-06, 'epoch': 1.05} +{'loss': 93.3159, 'grad_norm': 1257.700995043592, 'learning_rate': 2.9991733875115183e-06, 'epoch': 1.05} +{'loss': 92.4024, 'grad_norm': 1110.7604274312785, 'learning_rate': 2.996061180661063e-06, 'epoch': 1.05} +{'loss': 94.3185, 'grad_norm': 1182.0147616540614, 'learning_rate': 2.9929503048573372e-06, 'epoch': 1.05} +{'loss': 92.823, 'grad_norm': 1187.7711051043696, 'learning_rate': 2.9898407606915323e-06, 'epoch': 1.05} +{'loss': 94.2954, 'grad_norm': 1179.4863665882815, 'learning_rate': 2.986732548754595e-06, 'epoch': 1.05} +{'loss': 94.0649, 'grad_norm': 1246.7846933664684, 'learning_rate': 2.9836256696372178e-06, 'epoch': 1.05} +{'loss': 93.2366, 'grad_norm': 1097.0867660543265, 'learning_rate': 2.9805201239298363e-06, 'epoch': 1.05} +{'loss': 95.0524, 'grad_norm': 1604.16418800446, 'learning_rate': 2.97741591222264e-06, 'epoch': 1.05} +{'loss': 95.3135, 'grad_norm': 1392.2598064771566, 'learning_rate': 2.974313035105565e-06, 'epoch': 1.05} +{'loss': 97.7969, 'grad_norm': 1164.2814842075452, 'learning_rate': 2.9712114931682857e-06, 'epoch': 1.05} +{'loss': 94.0738, 'grad_norm': 1140.5915126888747, 'learning_rate': 2.9681112870002283e-06, 'epoch': 1.05} +{'loss': 94.2875, 'grad_norm': 1333.4956799887848, 'learning_rate': 2.9650124171905613e-06, 'epoch': 1.05} +{'loss': 94.4277, 'grad_norm': 
1156.862465774802, 'learning_rate': 2.961914884328203e-06, 'epoch': 1.05} +{'loss': 94.256, 'grad_norm': 1076.727570931445, 'learning_rate': 2.9588186890018223e-06, 'epoch': 1.05} +{'loss': 95.4419, 'grad_norm': 1265.2951832432468, 'learning_rate': 2.9557238317998236e-06, 'epoch': 1.05} +{'loss': 92.5574, 'grad_norm': 1188.477009763779, 'learning_rate': 2.952630313310362e-06, 'epoch': 1.05} +{'loss': 93.1784, 'grad_norm': 1091.1517885093613, 'learning_rate': 2.9495381341213336e-06, 'epoch': 1.05} +{'loss': 93.1353, 'grad_norm': 1293.9999647093784, 'learning_rate': 2.946447294820389e-06, 'epoch': 1.05} +{'loss': 95.5411, 'grad_norm': 1323.2968661230357, 'learning_rate': 2.9433577959949223e-06, 'epoch': 1.05} +{'loss': 94.5687, 'grad_norm': 1195.426619734143, 'learning_rate': 2.940269638232065e-06, 'epoch': 1.05} +{'loss': 95.8826, 'grad_norm': 1342.0794719643343, 'learning_rate': 2.9371828221187004e-06, 'epoch': 1.05} +{'loss': 93.2316, 'grad_norm': 1254.118277404053, 'learning_rate': 2.9340973482414515e-06, 'epoch': 1.05} +{'loss': 92.7395, 'grad_norm': 1186.4089140335623, 'learning_rate': 2.9310132171866924e-06, 'epoch': 1.05} +{'loss': 93.7226, 'grad_norm': 1600.315526515834, 'learning_rate': 2.927930429540542e-06, 'epoch': 1.05} +{'loss': 94.1023, 'grad_norm': 1096.7169493872173, 'learning_rate': 2.9248489858888595e-06, 'epoch': 1.05} +{'loss': 95.965, 'grad_norm': 1261.4041738689657, 'learning_rate': 2.92176888681725e-06, 'epoch': 1.05} +{'loss': 95.3241, 'grad_norm': 1173.1479351972418, 'learning_rate': 2.9186901329110605e-06, 'epoch': 1.05} +{'loss': 92.8183, 'grad_norm': 1137.5746553922288, 'learning_rate': 2.915612724755388e-06, 'epoch': 1.05} +{'loss': 93.8451, 'grad_norm': 1277.8054040347474, 'learning_rate': 2.9125366629350747e-06, 'epoch': 1.05} +{'loss': 96.501, 'grad_norm': 1089.516172308136, 'learning_rate': 2.9094619480347007e-06, 'epoch': 1.05} +{'loss': 94.7015, 'grad_norm': 1107.4980714583112, 'learning_rate': 2.9063885806385916e-06, 'epoch': 1.05} +{'loss': 91.9382, 'grad_norm': 1069.2984099415166, 'learning_rate': 2.9033165613308155e-06, 'epoch': 1.06} +{'loss': 93.5144, 'grad_norm': 1147.6531181162306, 'learning_rate': 2.9002458906951904e-06, 'epoch': 1.06} +{'loss': 94.9286, 'grad_norm': 1190.2637116870537, 'learning_rate': 2.8971765693152767e-06, 'epoch': 1.06} +{'loss': 97.9872, 'grad_norm': 1271.4289696465812, 'learning_rate': 2.894108597774374e-06, 'epoch': 1.06} +{'loss': 93.6789, 'grad_norm': 1151.9190861987686, 'learning_rate': 2.8910419766555274e-06, 'epoch': 1.06} +{'loss': 95.7379, 'grad_norm': 1238.942043003239, 'learning_rate': 2.887976706541523e-06, 'epoch': 1.06} +{'loss': 91.7652, 'grad_norm': 1099.6963645090634, 'learning_rate': 2.8849127880148954e-06, 'epoch': 1.06} +{'loss': 91.749, 'grad_norm': 1406.384298411642, 'learning_rate': 2.8818502216579215e-06, 'epoch': 1.06} +{'loss': 91.038, 'grad_norm': 1025.1239428280237, 'learning_rate': 2.8787890080526183e-06, 'epoch': 1.06} +{'loss': 96.538, 'grad_norm': 1256.3800421433907, 'learning_rate': 2.875729147780745e-06, 'epoch': 1.06} +{'loss': 92.8858, 'grad_norm': 1465.8467861052618, 'learning_rate': 2.8726706414238038e-06, 'epoch': 1.06} +{'loss': 94.489, 'grad_norm': 1427.7299380610896, 'learning_rate': 2.869613489563046e-06, 'epoch': 1.06} +{'loss': 94.7126, 'grad_norm': 1371.1476889534094, 'learning_rate': 2.8665576927794546e-06, 'epoch': 1.06} +{'loss': 94.3613, 'grad_norm': 1291.4948269276376, 'learning_rate': 2.86350325165377e-06, 'epoch': 1.06} +{'loss': 93.7832, 'grad_norm': 1297.0587277170896, 
'learning_rate': 2.8604501667664588e-06, 'epoch': 1.06} +{'loss': 95.7223, 'grad_norm': 1281.108843079447, 'learning_rate': 2.857398438697737e-06, 'epoch': 1.06} +{'loss': 92.8446, 'grad_norm': 1156.4704693222363, 'learning_rate': 2.854348068027568e-06, 'epoch': 1.06} +{'loss': 92.5611, 'grad_norm': 1217.5311754716574, 'learning_rate': 2.8512990553356467e-06, 'epoch': 1.06} +{'loss': 95.3202, 'grad_norm': 1288.0882245791126, 'learning_rate': 2.8482514012014216e-06, 'epoch': 1.06} +{'loss': 94.6699, 'grad_norm': 1163.19071224782, 'learning_rate': 2.845205106204073e-06, 'epoch': 1.06} +{'loss': 95.7905, 'grad_norm': 1170.480525315819, 'learning_rate': 2.8421601709225233e-06, 'epoch': 1.06} +{'loss': 93.6983, 'grad_norm': 1332.417678496253, 'learning_rate': 2.8391165959354473e-06, 'epoch': 1.06} +{'loss': 94.1996, 'grad_norm': 1199.0501273173015, 'learning_rate': 2.8360743818212475e-06, 'epoch': 1.06} +{'loss': 94.3362, 'grad_norm': 1190.7267526379194, 'learning_rate': 2.833033529158079e-06, 'epoch': 1.06} +{'loss': 93.2982, 'grad_norm': 1197.7025775570294, 'learning_rate': 2.829994038523832e-06, 'epoch': 1.06} +{'loss': 97.3018, 'grad_norm': 1187.3107602488094, 'learning_rate': 2.8269559104961353e-06, 'epoch': 1.06} +{'loss': 93.3215, 'grad_norm': 1152.269599322288, 'learning_rate': 2.8239191456523683e-06, 'epoch': 1.06} +{'loss': 93.0425, 'grad_norm': 1180.0566861675977, 'learning_rate': 2.82088374456964e-06, 'epoch': 1.06} +{'loss': 93.0442, 'grad_norm': 1178.8411819568512, 'learning_rate': 2.8178497078248122e-06, 'epoch': 1.06} +{'loss': 95.7132, 'grad_norm': 1176.4804143756492, 'learning_rate': 2.814817035994479e-06, 'epoch': 1.06} +{'loss': 92.2459, 'grad_norm': 1267.6228585862916, 'learning_rate': 2.811785729654972e-06, 'epoch': 1.06} +{'loss': 95.4757, 'grad_norm': 1127.370214914609, 'learning_rate': 2.808755789382376e-06, 'epoch': 1.06} +{'loss': 92.2852, 'grad_norm': 1197.9622080951426, 'learning_rate': 2.805727215752503e-06, 'epoch': 1.06} +{'loss': 95.3621, 'grad_norm': 1164.5153390986852, 'learning_rate': 2.8027000093409153e-06, 'epoch': 1.06} +{'loss': 93.2367, 'grad_norm': 1294.7882971373538, 'learning_rate': 2.79967417072291e-06, 'epoch': 1.06} +{'loss': 95.1136, 'grad_norm': 1091.7695467425374, 'learning_rate': 2.7966497004735214e-06, 'epoch': 1.06} +{'loss': 94.5369, 'grad_norm': 1138.0532208066859, 'learning_rate': 2.7936265991675336e-06, 'epoch': 1.06} +{'loss': 93.7835, 'grad_norm': 1155.3054048497356, 'learning_rate': 2.7906048673794593e-06, 'epoch': 1.06} +{'loss': 94.6832, 'grad_norm': 1129.8746309530777, 'learning_rate': 2.7875845056835617e-06, 'epoch': 1.06} +{'loss': 92.909, 'grad_norm': 1218.802567414967, 'learning_rate': 2.7845655146538354e-06, 'epoch': 1.06} +{'loss': 94.1185, 'grad_norm': 1238.330680232797, 'learning_rate': 2.781547894864014e-06, 'epoch': 1.06} +{'loss': 94.4944, 'grad_norm': 1111.8611296144338, 'learning_rate': 2.778531646887579e-06, 'epoch': 1.06} +{'loss': 93.4169, 'grad_norm': 1239.3769237155193, 'learning_rate': 2.7755167712977447e-06, 'epoch': 1.06} +{'loss': 92.3506, 'grad_norm': 1149.4610054493205, 'learning_rate': 2.7725032686674625e-06, 'epoch': 1.06} +{'loss': 93.4334, 'grad_norm': 1177.2879501409493, 'learning_rate': 2.7694911395694324e-06, 'epoch': 1.06} +{'loss': 92.268, 'grad_norm': 1107.834641983209, 'learning_rate': 2.766480384576079e-06, 'epoch': 1.06} +{'loss': 94.7856, 'grad_norm': 1132.0894787461464, 'learning_rate': 2.7634710042595836e-06, 'epoch': 1.06} +{'loss': 94.5319, 'grad_norm': 1231.151851362583, 'learning_rate': 
2.7604629991918506e-06, 'epoch': 1.06} +{'loss': 93.6531, 'grad_norm': 1151.2570597996637, 'learning_rate': 2.757456369944528e-06, 'epoch': 1.06} +{'loss': 93.0871, 'grad_norm': 1176.1002656207936, 'learning_rate': 2.754451117089009e-06, 'epoch': 1.06} +{'loss': 93.2573, 'grad_norm': 1130.5450849306158, 'learning_rate': 2.7514472411964123e-06, 'epoch': 1.06} +{'loss': 93.7976, 'grad_norm': 1150.3535808546637, 'learning_rate': 2.7484447428376094e-06, 'epoch': 1.06} +{'loss': 92.9182, 'grad_norm': 1319.8488119509893, 'learning_rate': 2.7454436225832005e-06, 'epoch': 1.07} +{'loss': 95.2446, 'grad_norm': 1231.04524532164, 'learning_rate': 2.7424438810035224e-06, 'epoch': 1.07} +{'loss': 92.6905, 'grad_norm': 1151.7902799552412, 'learning_rate': 2.73944551866866e-06, 'epoch': 1.07} +{'loss': 91.2183, 'grad_norm': 1433.0118846946978, 'learning_rate': 2.736448536148424e-06, 'epoch': 1.07} +{'loss': 94.6309, 'grad_norm': 1193.9977661508365, 'learning_rate': 2.733452934012375e-06, 'epoch': 1.07} +{'loss': 92.8988, 'grad_norm': 1212.5778889801993, 'learning_rate': 2.730458712829801e-06, 'epoch': 1.07} +{'loss': 93.1523, 'grad_norm': 1075.7892326420015, 'learning_rate': 2.727465873169729e-06, 'epoch': 1.07} +{'loss': 93.3285, 'grad_norm': 1228.1254998434054, 'learning_rate': 2.724474415600933e-06, 'epoch': 1.07} +{'loss': 92.1895, 'grad_norm': 1245.0422292618969, 'learning_rate': 2.72148434069191e-06, 'epoch': 1.07} +{'loss': 93.7469, 'grad_norm': 1164.239272155791, 'learning_rate': 2.718495649010908e-06, 'epoch': 1.07} +{'loss': 92.9895, 'grad_norm': 1072.7908049731209, 'learning_rate': 2.7155083411259044e-06, 'epoch': 1.07} +{'loss': 92.3931, 'grad_norm': 1144.1550871692732, 'learning_rate': 2.7125224176046093e-06, 'epoch': 1.07} +{'loss': 91.9331, 'grad_norm': 1122.7190622211724, 'learning_rate': 2.709537879014482e-06, 'epoch': 1.07} +{'loss': 96.0409, 'grad_norm': 1336.7726201571436, 'learning_rate': 2.7065547259227078e-06, 'epoch': 1.07} +{'loss': 93.5739, 'grad_norm': 1328.6347116128125, 'learning_rate': 2.7035729588962165e-06, 'epoch': 1.07} +{'loss': 92.9827, 'grad_norm': 1075.5741615062261, 'learning_rate': 2.7005925785016697e-06, 'epoch': 1.07} +{'loss': 92.3005, 'grad_norm': 1073.2980486239828, 'learning_rate': 2.697613585305462e-06, 'epoch': 1.07} +{'loss': 92.4719, 'grad_norm': 1133.5006599387186, 'learning_rate': 2.6946359798737367e-06, 'epoch': 1.07} +{'loss': 94.8051, 'grad_norm': 1307.6463986385297, 'learning_rate': 2.6916597627723585e-06, 'epoch': 1.07} +{'loss': 92.4229, 'grad_norm': 1119.7756260160104, 'learning_rate': 2.6886849345669407e-06, 'epoch': 1.07} +{'loss': 94.9342, 'grad_norm': 1116.4592795037986, 'learning_rate': 2.685711495822827e-06, 'epoch': 1.07} +{'loss': 92.5428, 'grad_norm': 1126.5959848962357, 'learning_rate': 2.6827394471050916e-06, 'epoch': 1.07} +{'loss': 92.3541, 'grad_norm': 1130.3239295444132, 'learning_rate': 2.6797687889785585e-06, 'epoch': 1.07} +{'loss': 93.5189, 'grad_norm': 1198.6893117004006, 'learning_rate': 2.676799522007771e-06, 'epoch': 1.07} +{'loss': 93.2274, 'grad_norm': 1250.3517657546736, 'learning_rate': 2.673831646757026e-06, 'epoch': 1.07} +{'loss': 93.2648, 'grad_norm': 1151.6641889024143, 'learning_rate': 2.6708651637903394e-06, 'epoch': 1.07} +{'loss': 94.0883, 'grad_norm': 1089.36263435265, 'learning_rate': 2.667900073671471e-06, 'epoch': 1.07} +{'loss': 94.0853, 'grad_norm': 1191.5853028278657, 'learning_rate': 2.6649363769639103e-06, 'epoch': 1.07} +{'loss': 94.0208, 'grad_norm': 1221.9380105490297, 'learning_rate': 
2.661974074230891e-06, 'epoch': 1.07} +{'loss': 93.2518, 'grad_norm': 1279.9092823735068, 'learning_rate': 2.659013166035378e-06, 'epoch': 1.07} +{'loss': 92.8936, 'grad_norm': 1143.8067454247487, 'learning_rate': 2.656053652940067e-06, 'epoch': 1.07} +{'loss': 92.3697, 'grad_norm': 1160.7803528779941, 'learning_rate': 2.6530955355073927e-06, 'epoch': 1.07} +{'loss': 93.1401, 'grad_norm': 1124.09000332089, 'learning_rate': 2.6501388142995187e-06, 'epoch': 1.07} +{'loss': 93.4725, 'grad_norm': 1196.3797921960693, 'learning_rate': 2.6471834898783532e-06, 'epoch': 1.07} +{'loss': 94.2962, 'grad_norm': 1156.781934553396, 'learning_rate': 2.6442295628055346e-06, 'epoch': 1.07} +{'loss': 92.9391, 'grad_norm': 1227.7658276733434, 'learning_rate': 2.6412770336424343e-06, 'epoch': 1.07} +{'loss': 93.8308, 'grad_norm': 1156.8920657198942, 'learning_rate': 2.638325902950155e-06, 'epoch': 1.07} +{'loss': 94.3256, 'grad_norm': 1302.1332216741916, 'learning_rate': 2.6353761712895377e-06, 'epoch': 1.07} +{'loss': 93.8229, 'grad_norm': 1095.6506289708525, 'learning_rate': 2.632427839221159e-06, 'epoch': 1.07} +{'loss': 93.4885, 'grad_norm': 1295.3335632726962, 'learning_rate': 2.6294809073053294e-06, 'epoch': 1.07} +{'loss': 92.5756, 'grad_norm': 1156.6460615611181, 'learning_rate': 2.6265353761020897e-06, 'epoch': 1.07} +{'loss': 92.6477, 'grad_norm': 1257.9234247034217, 'learning_rate': 2.6235912461712167e-06, 'epoch': 1.07} +{'loss': 93.9302, 'grad_norm': 1200.172517433838, 'learning_rate': 2.620648518072215e-06, 'epoch': 1.07} +{'loss': 91.1803, 'grad_norm': 1106.1617354817843, 'learning_rate': 2.617707192364335e-06, 'epoch': 1.07} +{'loss': 94.2245, 'grad_norm': 1121.8154443369122, 'learning_rate': 2.614767269606554e-06, 'epoch': 1.07} +{'loss': 93.7577, 'grad_norm': 1223.9359638781593, 'learning_rate': 2.6118287503575813e-06, 'epoch': 1.07} +{'loss': 92.8195, 'grad_norm': 1085.4732198364438, 'learning_rate': 2.608891635175859e-06, 'epoch': 1.07} +{'loss': 92.0825, 'grad_norm': 1175.390214878512, 'learning_rate': 2.605955924619562e-06, 'epoch': 1.07} +{'loss': 92.1944, 'grad_norm': 1156.451462493239, 'learning_rate': 2.603021619246604e-06, 'epoch': 1.07} +{'loss': 92.2236, 'grad_norm': 1193.6865182657814, 'learning_rate': 2.6000887196146306e-06, 'epoch': 1.07} +{'loss': 91.2247, 'grad_norm': 1173.9857696843599, 'learning_rate': 2.597157226281014e-06, 'epoch': 1.07} +{'loss': 92.9008, 'grad_norm': 1275.4327401600628, 'learning_rate': 2.594227139802864e-06, 'epoch': 1.07} +{'loss': 92.9322, 'grad_norm': 1201.4246343810134, 'learning_rate': 2.5912984607370173e-06, 'epoch': 1.08} +{'loss': 94.4326, 'grad_norm': 1192.8867300648592, 'learning_rate': 2.588371189640052e-06, 'epoch': 1.08} +{'loss': 94.3476, 'grad_norm': 1255.3230974303735, 'learning_rate': 2.585445327068277e-06, 'epoch': 1.08} +{'loss': 93.9514, 'grad_norm': 1152.8752433945353, 'learning_rate': 2.582520873577726e-06, 'epoch': 1.08} +{'loss': 93.1178, 'grad_norm': 1115.1471120672654, 'learning_rate': 2.5795978297241733e-06, 'epoch': 1.08} +{'loss': 93.5557, 'grad_norm': 1338.1829463201816, 'learning_rate': 2.576676196063116e-06, 'epoch': 1.08} +{'loss': 92.5032, 'grad_norm': 1089.5247412167512, 'learning_rate': 2.573755973149793e-06, 'epoch': 1.08} +{'loss': 93.8267, 'grad_norm': 1086.523210711824, 'learning_rate': 2.5708371615391738e-06, 'epoch': 1.08} +{'loss': 95.9426, 'grad_norm': 1232.3832955295954, 'learning_rate': 2.5679197617859543e-06, 'epoch': 1.08} +{'loss': 94.1733, 'grad_norm': 1197.0558324225058, 'learning_rate': 
2.5650037744445635e-06, 'epoch': 1.08} +{'loss': 95.4486, 'grad_norm': 1143.7601098402652, 'learning_rate': 2.5620892000691643e-06, 'epoch': 1.08} +[2025-05-29 22:21:46,406] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step56000 is about to be saved! +[2025-05-29 22:21:46,437] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-56000/global_step56000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-29 22:21:46,437] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-56000/global_step56000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-29 22:21:46,540] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-56000/global_step56000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-29 22:21:46,654] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-56000/global_step56000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-29 22:27:32,975] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-56000/global_step56000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2025-05-29 22:27:33,042] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-56000/global_step56000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-29 22:27:39,829] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step56000 is ready now! +[2025-05-29 22:27:39,867] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step56000 is about to be saved! +[2025-05-29 22:27:39,925] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-56000/global_step56000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-29 22:27:39,925] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-56000/global_step56000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-29 22:27:40,038] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-56000/global_step56000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-29 22:27:40,125] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-56000/global_step56000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-29 22:33:20,347] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-56000/global_step56000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2025-05-29 22:33:20,418] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-56000/global_step56000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-29 22:33:27,313] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step56000 is ready now! 
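grad_norm in this stretch sits mostly in the 1,000-1,500 range, but a few records spike well above it (for example ~10776 and ~7342 near epoch 0.99, and ~3853 near epoch 1.02). A quick outlier check once the records are parsed, reusing the metrics list from the earlier sketch (the 5x-median threshold is an arbitrary illustrative choice, not something the run itself enforces):

    from statistics import median

    norms = [r["grad_norm"] for r in metrics if "grad_norm" in r]
    baseline = median(norms)
    # Flag steps whose reported gradient norm exceeds 5x the run's median.
    spikes = [(i, n) for i, n in enumerate(norms) if n > 5 * baseline]
    print(f"median grad_norm ~ {baseline:.0f}; {len(spikes)} spike(s) above 5x median")
    for i, n in spikes:
        print(f"  record {i}: grad_norm {n:.0f}")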
+{'loss': 93.8922, 'grad_norm': 1153.5951756608508, 'learning_rate': 2.5591760392136467e-06, 'epoch': 1.08} +{'loss': 97.9346, 'grad_norm': 1121.7848713687508, 'learning_rate': 2.5562642924316396e-06, 'epoch': 1.08} +{'loss': 93.5951, 'grad_norm': 1067.6620215926514, 'learning_rate': 2.553353960276499e-06, 'epoch': 1.08} +{'loss': 93.9908, 'grad_norm': 1049.7958254736636, 'learning_rate': 2.550445043301312e-06, 'epoch': 1.08} +{'loss': 93.3327, 'grad_norm': 1168.0824270088578, 'learning_rate': 2.5475375420588944e-06, 'epoch': 1.08} +{'loss': 93.4701, 'grad_norm': 1121.867512877792, 'learning_rate': 2.5446314571017937e-06, 'epoch': 1.08} +{'loss': 94.0896, 'grad_norm': 1167.0451019995764, 'learning_rate': 2.541726788982294e-06, 'epoch': 1.08} +{'loss': 91.4736, 'grad_norm': 1172.5124905157945, 'learning_rate': 2.538823538252405e-06, 'epoch': 1.08} +{'loss': 93.8921, 'grad_norm': 1139.1366484655302, 'learning_rate': 2.5359217054638696e-06, 'epoch': 1.08} +{'loss': 92.025, 'grad_norm': 1115.556926136395, 'learning_rate': 2.5330212911681563e-06, 'epoch': 1.08} +{'loss': 91.9724, 'grad_norm': 1103.4948612077837, 'learning_rate': 2.530122295916465e-06, 'epoch': 1.08} +{'loss': 93.1032, 'grad_norm': 1067.7905352964149, 'learning_rate': 2.5272247202597334e-06, 'epoch': 1.08} +{'loss': 93.2282, 'grad_norm': 1115.7163157911416, 'learning_rate': 2.524328564748625e-06, 'epoch': 1.08} +{'loss': 91.7269, 'grad_norm': 1137.2949202002621, 'learning_rate': 2.5214338299335306e-06, 'epoch': 1.08} +{'loss': 93.3939, 'grad_norm': 1128.712741692944, 'learning_rate': 2.518540516364573e-06, 'epoch': 1.08} +{'loss': 93.2687, 'grad_norm': 1090.2662469413697, 'learning_rate': 2.515648624591601e-06, 'epoch': 1.08} +{'loss': 94.7684, 'grad_norm': 1212.614514117773, 'learning_rate': 2.5127581551642022e-06, 'epoch': 1.08} +{'loss': 94.239, 'grad_norm': 1228.3679086761758, 'learning_rate': 2.5098691086316906e-06, 'epoch': 1.08} +{'loss': 93.3166, 'grad_norm': 1118.3727583861566, 'learning_rate': 2.5069814855431065e-06, 'epoch': 1.08} +{'loss': 93.318, 'grad_norm': 1327.0269547748965, 'learning_rate': 2.504095286447219e-06, 'epoch': 1.08} +{'loss': 91.7302, 'grad_norm': 1173.3072298310958, 'learning_rate': 2.5012105118925267e-06, 'epoch': 1.08} +{'loss': 92.2628, 'grad_norm': 1282.5523834395142, 'learning_rate': 2.498327162427264e-06, 'epoch': 1.08} +{'loss': 90.8093, 'grad_norm': 1098.4109567552434, 'learning_rate': 2.495445238599392e-06, 'epoch': 1.08} +{'loss': 93.1341, 'grad_norm': 1195.3208163888994, 'learning_rate': 2.4925647409565967e-06, 'epoch': 1.08} +{'loss': 93.4319, 'grad_norm': 1161.8582833343978, 'learning_rate': 2.4896856700462946e-06, 'epoch': 1.08} +{'loss': 93.1498, 'grad_norm': 1082.5900890208927, 'learning_rate': 2.4868080264156282e-06, 'epoch': 1.08} +{'loss': 92.992, 'grad_norm': 1116.5837565070963, 'learning_rate': 2.4839318106114774e-06, 'epoch': 1.08} +{'loss': 92.7622, 'grad_norm': 1094.6840626992678, 'learning_rate': 2.4810570231804463e-06, 'epoch': 1.08} +{'loss': 92.7631, 'grad_norm': 1168.5279786474728, 'learning_rate': 2.4781836646688662e-06, 'epoch': 1.08} +{'loss': 92.9565, 'grad_norm': 1234.9404122165338, 'learning_rate': 2.4753117356227964e-06, 'epoch': 1.08} +{'loss': 92.4322, 'grad_norm': 1034.5487534556844, 'learning_rate': 2.4724412365880225e-06, 'epoch': 1.08} +{'loss': 91.56, 'grad_norm': 1110.273464937762, 'learning_rate': 2.469572168110066e-06, 'epoch': 1.08} +{'loss': 91.961, 'grad_norm': 1153.0170154162336, 'learning_rate': 2.466704530734173e-06, 'epoch': 1.08} +{'loss': 
92.0856, 'grad_norm': 1109.1837529071865, 'learning_rate': 2.4638383250053156e-06, 'epoch': 1.08} +{'loss': 92.3111, 'grad_norm': 1174.8416458475053, 'learning_rate': 2.460973551468194e-06, 'epoch': 1.08} +{'loss': 93.7508, 'grad_norm': 1147.6702927879476, 'learning_rate': 2.458110210667238e-06, 'epoch': 1.08} +{'loss': 93.8899, 'grad_norm': 1159.1759606159385, 'learning_rate': 2.4552483031466e-06, 'epoch': 1.08} +{'loss': 91.9419, 'grad_norm': 1116.1494024456565, 'learning_rate': 2.452387829450169e-06, 'epoch': 1.08} +{'loss': 92.7967, 'grad_norm': 1132.8340127880888, 'learning_rate': 2.449528790121559e-06, 'epoch': 1.08} +{'loss': 92.6826, 'grad_norm': 1249.0879105251909, 'learning_rate': 2.446671185704106e-06, 'epoch': 1.08} +{'loss': 91.7859, 'grad_norm': 1042.0810968271676, 'learning_rate': 2.443815016740877e-06, 'epoch': 1.08} +{'loss': 94.7396, 'grad_norm': 1196.3609749055977, 'learning_rate': 2.4409602837746625e-06, 'epoch': 1.09} +{'loss': 92.5828, 'grad_norm': 1194.3745896803437, 'learning_rate': 2.4381069873479866e-06, 'epoch': 1.09} +{'loss': 95.0313, 'grad_norm': 1093.9083024100705, 'learning_rate': 2.4352551280031002e-06, 'epoch': 1.09} +{'loss': 92.4631, 'grad_norm': 1178.8606042267666, 'learning_rate': 2.4324047062819744e-06, 'epoch': 1.09} +{'loss': 92.164, 'grad_norm': 1255.7662630761274, 'learning_rate': 2.4295557227263122e-06, 'epoch': 1.09} +{'loss': 93.3323, 'grad_norm': 1328.2649197500991, 'learning_rate': 2.426708177877537e-06, 'epoch': 1.09} +{'loss': 90.7459, 'grad_norm': 1250.879790505826, 'learning_rate': 2.423862072276808e-06, 'epoch': 1.09} +{'loss': 92.3627, 'grad_norm': 1112.7183958384612, 'learning_rate': 2.4210174064650084e-06, 'epoch': 1.09} +{'loss': 96.1408, 'grad_norm': 1173.489594693941, 'learning_rate': 2.418174180982744e-06, 'epoch': 1.09} +{'loss': 93.9447, 'grad_norm': 1152.2383393677037, 'learning_rate': 2.415332396370348e-06, 'epoch': 1.09} +{'loss': 92.584, 'grad_norm': 1146.57294465003, 'learning_rate': 2.4124920531678773e-06, 'epoch': 1.09} +{'loss': 91.933, 'grad_norm': 1197.5316446400145, 'learning_rate': 2.409653151915122e-06, 'epoch': 1.09} +{'loss': 92.8962, 'grad_norm': 1158.1992003678663, 'learning_rate': 2.4068156931515973e-06, 'epoch': 1.09} +{'loss': 94.182, 'grad_norm': 1426.3496067044962, 'learning_rate': 2.403979677416537e-06, 'epoch': 1.09} +{'loss': 92.6112, 'grad_norm': 1126.0023531545503, 'learning_rate': 2.4011451052489064e-06, 'epoch': 1.09} +{'loss': 93.7309, 'grad_norm': 1203.3254817671504, 'learning_rate': 2.398311977187391e-06, 'epoch': 1.09} +{'loss': 92.3467, 'grad_norm': 1115.1891178735932, 'learning_rate': 2.3954802937704092e-06, 'epoch': 1.09} +{'loss': 95.1687, 'grad_norm': 1229.504817542022, 'learning_rate': 2.392650055536103e-06, 'epoch': 1.09} +{'loss': 91.7486, 'grad_norm': 1092.2477174314645, 'learning_rate': 2.3898212630223385e-06, 'epoch': 1.09} +{'loss': 95.7647, 'grad_norm': 1121.7006083065392, 'learning_rate': 2.3869939167667034e-06, 'epoch': 1.09} +{'loss': 92.3753, 'grad_norm': 1086.552837832647, 'learning_rate': 2.3841680173065127e-06, 'epoch': 1.09} +{'loss': 92.6414, 'grad_norm': 1098.0843129589161, 'learning_rate': 2.3813435651788107e-06, 'epoch': 1.09} +{'loss': 91.1204, 'grad_norm': 1106.0522019194275, 'learning_rate': 2.3785205609203655e-06, 'epoch': 1.09} +{'loss': 94.1793, 'grad_norm': 1174.0698410973916, 'learning_rate': 2.3756990050676666e-06, 'epoch': 1.09} +{'loss': 91.8473, 'grad_norm': 1213.4904841865089, 'learning_rate': 2.3728788981569286e-06, 'epoch': 1.09} +{'loss': 92.1211, 
'grad_norm': 1085.756898814106, 'learning_rate': 2.3700602407240892e-06, 'epoch': 1.09} +{'loss': 94.8334, 'grad_norm': 1116.5320954481626, 'learning_rate': 2.3672430333048168e-06, 'epoch': 1.09} +{'loss': 93.8238, 'grad_norm': 1157.4499986050864, 'learning_rate': 2.3644272764345034e-06, 'epoch': 1.09} +{'loss': 93.8466, 'grad_norm': 1232.542426801539, 'learning_rate': 2.3616129706482604e-06, 'epoch': 1.09} +{'loss': 95.4907, 'grad_norm': 1182.3768638481877, 'learning_rate': 2.3588001164809247e-06, 'epoch': 1.09} +{'loss': 92.7953, 'grad_norm': 1161.674453879012, 'learning_rate': 2.355988714467057e-06, 'epoch': 1.09} +{'loss': 92.2105, 'grad_norm': 1117.926164129124, 'learning_rate': 2.353178765140949e-06, 'epoch': 1.09} +{'loss': 93.3631, 'grad_norm': 1110.659898007899, 'learning_rate': 2.3503702690366024e-06, 'epoch': 1.09} +{'loss': 92.4889, 'grad_norm': 1251.2324453621573, 'learning_rate': 2.3475632266877602e-06, 'epoch': 1.09} +{'loss': 92.7201, 'grad_norm': 1161.8702195094431, 'learning_rate': 2.344757638627875e-06, 'epoch': 1.09} +{'loss': 93.9576, 'grad_norm': 1139.8721523673928, 'learning_rate': 2.3419535053901264e-06, 'epoch': 1.09} +{'loss': 93.0717, 'grad_norm': 1278.5413612152788, 'learning_rate': 2.339150827507425e-06, 'epoch': 1.09} +{'loss': 92.5534, 'grad_norm': 1238.918184941268, 'learning_rate': 2.3363496055123912e-06, 'epoch': 1.09} +{'loss': 91.3919, 'grad_norm': 1176.3499704464496, 'learning_rate': 2.3335498399373845e-06, 'epoch': 1.09} +{'loss': 93.2523, 'grad_norm': 1126.447189269974, 'learning_rate': 2.330751531314477e-06, 'epoch': 1.09} +{'loss': 93.1706, 'grad_norm': 1115.0639643058723, 'learning_rate': 2.3279546801754615e-06, 'epoch': 1.09} +{'loss': 93.1656, 'grad_norm': 1151.4991816865809, 'learning_rate': 2.325159287051866e-06, 'epoch': 1.09} +{'loss': 92.1046, 'grad_norm': 1076.8627452163123, 'learning_rate': 2.322365352474928e-06, 'epoch': 1.09} +{'loss': 93.1103, 'grad_norm': 1167.5975102395992, 'learning_rate': 2.31957287697562e-06, 'epoch': 1.09} +{'loss': 93.1106, 'grad_norm': 1202.5775998130398, 'learning_rate': 2.3167818610846293e-06, 'epoch': 1.09} +{'loss': 92.1, 'grad_norm': 1796.7542985545876, 'learning_rate': 2.313992305332363e-06, 'epoch': 1.09} +{'loss': 92.5589, 'grad_norm': 1046.6213863430075, 'learning_rate': 2.3112042102489627e-06, 'epoch': 1.09} +{'loss': 90.3963, 'grad_norm': 1070.085878462876, 'learning_rate': 2.30841757636428e-06, 'epoch': 1.09} +{'loss': 92.3804, 'grad_norm': 1131.7950982417462, 'learning_rate': 2.3056324042078994e-06, 'epoch': 1.09} +{'loss': 92.2175, 'grad_norm': 1110.674711565401, 'learning_rate': 2.302848694309118e-06, 'epoch': 1.09} +{'loss': 91.9736, 'grad_norm': 1069.2948843374336, 'learning_rate': 2.300066447196957e-06, 'epoch': 1.09} +{'loss': 94.8186, 'grad_norm': 1273.6329104319207, 'learning_rate': 2.2972856634001682e-06, 'epoch': 1.09} +{'loss': 91.859, 'grad_norm': 986.7103185079551, 'learning_rate': 2.2945063434472135e-06, 'epoch': 1.1} +{'loss': 92.8184, 'grad_norm': 1168.553526382917, 'learning_rate': 2.291728487866287e-06, 'epoch': 1.1} +{'loss': 93.0176, 'grad_norm': 1200.8779705689158, 'learning_rate': 2.2889520971852976e-06, 'epoch': 1.1} +{'loss': 92.9299, 'grad_norm': 1328.3134792655153, 'learning_rate': 2.2861771719318736e-06, 'epoch': 1.1} +{'loss': 92.1217, 'grad_norm': 1240.9607929481604, 'learning_rate': 2.283403712633375e-06, 'epoch': 1.1} +{'loss': 92.1877, 'grad_norm': 1083.766901556073, 'learning_rate': 2.2806317198168737e-06, 'epoch': 1.1} +{'loss': 93.7462, 'grad_norm': 
1157.689293770678, 'learning_rate': 2.277861194009169e-06, 'epoch': 1.1} +{'loss': 90.37, 'grad_norm': 1132.0017197969873, 'learning_rate': 2.2750921357367785e-06, 'epoch': 1.1} +{'loss': 91.4832, 'grad_norm': 1238.3238355665271, 'learning_rate': 2.2723245455259357e-06, 'epoch': 1.1} +{'loss': 92.1776, 'grad_norm': 1117.4440437101516, 'learning_rate': 2.2695584239026092e-06, 'epoch': 1.1} +{'loss': 92.0513, 'grad_norm': 1161.0104928354594, 'learning_rate': 2.2667937713924727e-06, 'epoch': 1.1} +{'loss': 93.8148, 'grad_norm': 1049.6181271549256, 'learning_rate': 2.2640305885209336e-06, 'epoch': 1.1} +{'loss': 94.5211, 'grad_norm': 1152.2231403213013, 'learning_rate': 2.261268875813112e-06, 'epoch': 1.1} +{'loss': 89.6618, 'grad_norm': 1042.7383322578985, 'learning_rate': 2.258508633793847e-06, 'epoch': 1.1} +{'loss': 93.508, 'grad_norm': 1106.569400620749, 'learning_rate': 2.2557498629877095e-06, 'epoch': 1.1} +{'loss': 93.5414, 'grad_norm': 1156.5405177118603, 'learning_rate': 2.2529925639189796e-06, 'epoch': 1.1} +{'loss': 92.6635, 'grad_norm': 1113.2177194816347, 'learning_rate': 2.2502367371116595e-06, 'epoch': 1.1} +{'loss': 93.3679, 'grad_norm': 1269.5844970910418, 'learning_rate': 2.247482383089478e-06, 'epoch': 1.1} +{'loss': 94.5449, 'grad_norm': 1074.6413590810675, 'learning_rate': 2.2447295023758755e-06, 'epoch': 1.1} +{'loss': 91.38, 'grad_norm': 1122.4459162165865, 'learning_rate': 2.241978095494022e-06, 'epoch': 1.1} +{'loss': 93.4045, 'grad_norm': 1092.3530227396798, 'learning_rate': 2.239228162966798e-06, 'epoch': 1.1} +{'loss': 92.0434, 'grad_norm': 1045.9162661396474, 'learning_rate': 2.2364797053168052e-06, 'epoch': 1.1} +{'loss': 92.3552, 'grad_norm': 1170.8082027940693, 'learning_rate': 2.2337327230663753e-06, 'epoch': 1.1} +{'loss': 93.121, 'grad_norm': 1099.836038397293, 'learning_rate': 2.2309872167375447e-06, 'epoch': 1.1} +{'loss': 93.1699, 'grad_norm': 1253.1493720342094, 'learning_rate': 2.2282431868520815e-06, 'epoch': 1.1} +{'loss': 90.603, 'grad_norm': 1109.8536982019791, 'learning_rate': 2.2255006339314667e-06, 'epoch': 1.1} +{'loss': 89.9024, 'grad_norm': 1043.7917883202178, 'learning_rate': 2.2227595584968985e-06, 'epoch': 1.1} +{'loss': 93.4572, 'grad_norm': 1168.775962703853, 'learning_rate': 2.2200199610693053e-06, 'epoch': 1.1} +{'loss': 92.6528, 'grad_norm': 1169.4857168166714, 'learning_rate': 2.2172818421693197e-06, 'epoch': 1.1} +{'loss': 91.2978, 'grad_norm': 1037.2267430264014, 'learning_rate': 2.2145452023173074e-06, 'epoch': 1.1} +{'loss': 91.4992, 'grad_norm': 1141.1771776356002, 'learning_rate': 2.2118100420333434e-06, 'epoch': 1.1} +{'loss': 92.1459, 'grad_norm': 1566.2527773888453, 'learning_rate': 2.2090763618372225e-06, 'epoch': 1.1} +{'loss': 92.7353, 'grad_norm': 1015.6568535636756, 'learning_rate': 2.206344162248466e-06, 'epoch': 1.1} +{'loss': 90.4439, 'grad_norm': 1101.5969253036315, 'learning_rate': 2.2036134437863012e-06, 'epoch': 1.1} +{'loss': 94.714, 'grad_norm': 1082.0623496294027, 'learning_rate': 2.200884206969689e-06, 'epoch': 1.1} +{'loss': 89.6477, 'grad_norm': 1161.6916081441757, 'learning_rate': 2.1981564523172947e-06, 'epoch': 1.1} +{'loss': 93.8973, 'grad_norm': 1286.4725565273648, 'learning_rate': 2.1954301803475075e-06, 'epoch': 1.1} +{'loss': 92.4255, 'grad_norm': 1216.3447665095619, 'learning_rate': 2.19270539157844e-06, 'epoch': 1.1} +{'loss': 92.2081, 'grad_norm': 1079.1072601967078, 'learning_rate': 2.189982086527913e-06, 'epoch': 1.1} +{'loss': 93.9001, 'grad_norm': 1200.2319832333842, 'learning_rate': 
2.1872602657134757e-06, 'epoch': 1.1} +{'loss': 91.2799, 'grad_norm': 1155.39483515346, 'learning_rate': 2.1845399296523872e-06, 'epoch': 1.1} +{'loss': 92.3871, 'grad_norm': 1093.7184849000032, 'learning_rate': 2.1818210788616244e-06, 'epoch': 1.1} +{'loss': 92.8364, 'grad_norm': 1220.7457393592817, 'learning_rate': 2.1791037138578896e-06, 'epoch': 1.1} +{'loss': 92.8288, 'grad_norm': 1076.833102117835, 'learning_rate': 2.176387835157594e-06, 'epoch': 1.1} +{'loss': 91.9603, 'grad_norm': 1114.0886013262095, 'learning_rate': 2.1736734432768747e-06, 'epoch': 1.1} +{'loss': 93.0926, 'grad_norm': 2553.9126471606546, 'learning_rate': 2.1709605387315792e-06, 'epoch': 1.1} +{'loss': 90.3464, 'grad_norm': 1021.0659526553565, 'learning_rate': 2.168249122037275e-06, 'epoch': 1.1} +{'loss': 91.8834, 'grad_norm': 1164.628470529332, 'learning_rate': 2.1655391937092427e-06, 'epoch': 1.1} +{'loss': 90.6923, 'grad_norm': 1108.120018318992, 'learning_rate': 2.1628307542624894e-06, 'epoch': 1.1} +{'loss': 91.9187, 'grad_norm': 1084.3699529468709, 'learning_rate': 2.1601238042117356e-06, 'epoch': 1.1} +{'loss': 94.3287, 'grad_norm': 1148.3989712343719, 'learning_rate': 2.1574183440714137e-06, 'epoch': 1.1} +{'loss': 91.2647, 'grad_norm': 1247.6037967794298, 'learning_rate': 2.1547143743556765e-06, 'epoch': 1.1} +{'loss': 93.3576, 'grad_norm': 1094.702363878661, 'learning_rate': 2.152011895578391e-06, 'epoch': 1.11} +{'loss': 91.4249, 'grad_norm': 1205.5358580949887, 'learning_rate': 2.1493109082531473e-06, 'epoch': 1.11} +{'loss': 91.5629, 'grad_norm': 1071.7185748536854, 'learning_rate': 2.146611412893248e-06, 'epoch': 1.11} +{'loss': 90.8578, 'grad_norm': 1028.8231270620067, 'learning_rate': 2.143913410011712e-06, 'epoch': 1.11} +{'loss': 92.0505, 'grad_norm': 1298.1635695899463, 'learning_rate': 2.1412169001212733e-06, 'epoch': 1.11} +{'loss': 89.3492, 'grad_norm': 1091.3907929589848, 'learning_rate': 2.1385218837343825e-06, 'epoch': 1.11} +{'loss': 91.9739, 'grad_norm': 1031.0621327172562, 'learning_rate': 2.135828361363208e-06, 'epoch': 1.11} +{'loss': 91.8709, 'grad_norm': 1294.2173309559576, 'learning_rate': 2.1331363335196386e-06, 'epoch': 1.11} +{'loss': 92.9253, 'grad_norm': 1198.801535670884, 'learning_rate': 2.1304458007152694e-06, 'epoch': 1.11} +{'loss': 93.959, 'grad_norm': 1266.4937170575706, 'learning_rate': 2.1277567634614184e-06, 'epoch': 1.11} +{'loss': 90.2867, 'grad_norm': 1134.9121428379433, 'learning_rate': 2.1250692222691127e-06, 'epoch': 1.11} +{'loss': 91.6941, 'grad_norm': 1098.1694864149254, 'learning_rate': 2.1223831776491043e-06, 'epoch': 1.11} +{'loss': 91.1112, 'grad_norm': 1092.8929926803178, 'learning_rate': 2.119698630111857e-06, 'epoch': 1.11} +{'loss': 90.3013, 'grad_norm': 1155.8315190014505, 'learning_rate': 2.1170155801675473e-06, 'epoch': 1.11} +{'loss': 90.6917, 'grad_norm': 1157.6162113335463, 'learning_rate': 2.1143340283260696e-06, 'epoch': 1.11} +{'loss': 91.0453, 'grad_norm': 1084.2666445311072, 'learning_rate': 2.111653975097029e-06, 'epoch': 1.11} +{'loss': 92.9136, 'grad_norm': 1197.286359476684, 'learning_rate': 2.108975420989755e-06, 'epoch': 1.11} +{'loss': 94.7935, 'grad_norm': 1162.3739804839886, 'learning_rate': 2.106298366513283e-06, 'epoch': 1.11} +{'loss': 92.5101, 'grad_norm': 1163.205726206285, 'learning_rate': 2.103622812176371e-06, 'epoch': 1.11} +{'loss': 92.2128, 'grad_norm': 1049.4824741005032, 'learning_rate': 2.1009487584874875e-06, 'epoch': 1.11} +{'loss': 90.9987, 'grad_norm': 1061.4019280642965, 'learning_rate': 2.0982762059548135e-06, 
'epoch': 1.11} +{'loss': 91.2017, 'grad_norm': 1158.6007189633124, 'learning_rate': 2.095605155086252e-06, 'epoch': 1.11} +{'loss': 92.74, 'grad_norm': 1122.7690902909917, 'learning_rate': 2.0929356063894125e-06, 'epoch': 1.11} +{'loss': 91.1044, 'grad_norm': 1092.67899223162, 'learning_rate': 2.090267560371627e-06, 'epoch': 1.11} +{'loss': 91.3643, 'grad_norm': 1046.8198364938526, 'learning_rate': 2.0876010175399364e-06, 'epoch': 1.11} +{'loss': 90.4686, 'grad_norm': 1066.8065421526824, 'learning_rate': 2.0849359784010947e-06, 'epoch': 1.11} +{'loss': 91.242, 'grad_norm': 1120.1248315629773, 'learning_rate': 2.082272443461577e-06, 'epoch': 1.11} +{'loss': 91.771, 'grad_norm': 1214.3130643252218, 'learning_rate': 2.0796104132275642e-06, 'epoch': 1.11} +{'loss': 92.776, 'grad_norm': 1263.5120602169025, 'learning_rate': 2.07694988820496e-06, 'epoch': 1.11} +{'loss': 94.8806, 'grad_norm': 1365.391665332687, 'learning_rate': 2.0742908688993746e-06, 'epoch': 1.11} +{'loss': 90.3984, 'grad_norm': 1139.9108346034739, 'learning_rate': 2.071633355816136e-06, 'epoch': 1.11} +{'loss': 92.8764, 'grad_norm': 1132.037069148229, 'learning_rate': 2.0689773494602827e-06, 'epoch': 1.11} +{'loss': 91.7637, 'grad_norm': 1051.8514000487587, 'learning_rate': 2.0663228503365695e-06, 'epoch': 1.11} +{'loss': 91.2552, 'grad_norm': 1189.0577287307126, 'learning_rate': 2.0636698589494684e-06, 'epoch': 1.11} +{'loss': 93.7447, 'grad_norm': 1093.2991315433424, 'learning_rate': 2.061018375803159e-06, 'epoch': 1.11} +{'loss': 94.0268, 'grad_norm': 1239.2694686632713, 'learning_rate': 2.058368401401535e-06, 'epoch': 1.11} +{'loss': 92.986, 'grad_norm': 1083.1316774418815, 'learning_rate': 2.0557199362482005e-06, 'epoch': 1.11} +{'loss': 92.3449, 'grad_norm': 1174.2040621277602, 'learning_rate': 2.0530729808464812e-06, 'epoch': 1.11} +{'loss': 92.0372, 'grad_norm': 1152.389403759043, 'learning_rate': 2.050427535699413e-06, 'epoch': 1.11} +{'loss': 92.05, 'grad_norm': 1158.4397929081993, 'learning_rate': 2.047783601309742e-06, 'epoch': 1.11} +{'loss': 91.7698, 'grad_norm': 1146.9984415481038, 'learning_rate': 2.0451411781799257e-06, 'epoch': 1.11} +{'loss': 91.0068, 'grad_norm': 1119.422277671072, 'learning_rate': 2.0425002668121376e-06, 'epoch': 1.11} +{'loss': 91.5293, 'grad_norm': 1133.8535323503218, 'learning_rate': 2.0398608677082633e-06, 'epoch': 1.11} +{'loss': 91.5558, 'grad_norm': 1165.82145374594, 'learning_rate': 2.037222981369905e-06, 'epoch': 1.11} +{'loss': 91.3472, 'grad_norm': 1074.6350654235355, 'learning_rate': 2.0345866082983713e-06, 'epoch': 1.11} +{'loss': 90.8596, 'grad_norm': 1013.4417934446639, 'learning_rate': 2.031951748994685e-06, 'epoch': 1.11} +{'loss': 91.0929, 'grad_norm': 1128.2894229921328, 'learning_rate': 2.029318403959577e-06, 'epoch': 1.11} +{'loss': 92.1541, 'grad_norm': 1223.9174472854322, 'learning_rate': 2.0266865736935005e-06, 'epoch': 1.11} +{'loss': 90.8837, 'grad_norm': 1199.9476302976816, 'learning_rate': 2.0240562586966174e-06, 'epoch': 1.11} +{'loss': 91.8036, 'grad_norm': 967.6635592630669, 'learning_rate': 2.0214274594687955e-06, 'epoch': 1.11} +{'loss': 89.778, 'grad_norm': 1172.7473131418335, 'learning_rate': 2.0188001765096198e-06, 'epoch': 1.11} +{'loss': 91.3511, 'grad_norm': 1053.0696425464025, 'learning_rate': 2.016174410318383e-06, 'epoch': 1.11} +{'loss': 90.9407, 'grad_norm': 1320.4183261868043, 'learning_rate': 2.0135501613940957e-06, 'epoch': 1.12} +{'loss': 91.9806, 'grad_norm': 1179.0300307595614, 'learning_rate': 2.010927430235479e-06, 'epoch': 1.12} 
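Each {'loss': ..., 'grad_norm': ..., 'learning_rate': ..., 'epoch': ...} record in this log is a flat Python dict literal, so the training curves can be recovered from the raw file with the standard library alone. A minimal sketch (parse_log and the file name are illustrative, not from the run):

```python
# Sketch only: pull the loss / grad_norm / learning_rate records straight
# out of the raw log so the curves can be inspected or plotted.
import ast
import re

# Each record is a flat dict literal; re.S tolerates records that were
# wrapped across line breaks when the log was copied around.
RECORD = re.compile(r"\{'loss':.*?\}", re.S)

def parse_log(path="output.log"):
    with open(path, encoding="utf-8") as f:
        return [ast.literal_eval(m.group()) for m in RECORD.finditer(f.read())]

records = parse_log()
print(len(records), records[0]["loss"], records[-1]["learning_rate"])
```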
+{'loss': 92.0314, 'grad_norm': 1097.31795189891, 'learning_rate': 2.0083062173409607e-06, 'epoch': 1.12}
+[2025-05-30 00:22:01,337] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step58000 is about to be saved!
+[2025-05-30 00:22:01,361] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-58000/global_step58000/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2025-05-30 00:22:01,361] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-58000/global_step58000/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2025-05-30 00:22:01,412] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-58000/global_step58000/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2025-05-30 00:22:01,543] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-58000/global_step58000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2025-05-30 00:28:02,398] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-58000/global_step58000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2025-05-30 00:28:02,433] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-58000/global_step58000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-05-30 00:28:11,870] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step58000 is ready now!
+[2025-05-30 00:28:11,902] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step58000 is about to be saved!
+[2025-05-30 00:28:11,926] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-58000/global_step58000/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2025-05-30 00:28:11,926] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-58000/global_step58000/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2025-05-30 00:28:12,007] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-58000/global_step58000/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2025-05-30 00:28:12,113] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-58000/global_step58000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2025-05-30 00:33:59,047] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-58000/global_step58000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2025-05-30 00:33:59,122] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-58000/global_step58000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-05-30 00:34:07,553] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step58000 is ready now!
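Note that global_step58000, like global_step56000 before it, is committed twice back to back, and that almost all of the wall time goes into the bf16 optimizer shard (about 361 s versus about 0.05 s for the model states), so each 2000-step save interval costs roughly twelve minutes. The bracketed timestamps make this easy to quantify; a minimal sketch using values copied from the lines above (the helper name is illustrative):

```python
# Sketch only: measure checkpoint phase durations from the log timestamps.
from datetime import datetime

FMT = "%Y-%m-%d %H:%M:%S,%f"  # the bracketed timestamp format used above

def seconds_between(start: str, end: str) -> float:
    return (datetime.strptime(end, FMT) - datetime.strptime(start, FMT)).total_seconds()

# Optimizer-state shard, first save pass of checkpoint-58000:
print(seconds_between("2025-05-30 00:22:01,543", "2025-05-30 00:28:02,398"))  # ~360.9 s
# Model states, same pass:
print(seconds_between("2025-05-30 00:22:01,361", "2025-05-30 00:22:01,412"))  # ~0.05 s
```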
+{'loss': 90.4744, 'grad_norm': 1076.7180236528795, 'learning_rate': 2.005686523208683e-06, 'epoch': 1.12} +{'loss': 91.7743, 'grad_norm': 1114.6317901797843, 'learning_rate': 2.0030683483364977e-06, 'epoch': 1.12} +{'loss': 93.4553, 'grad_norm': 1142.5054795527717, 'learning_rate': 2.000451693221971e-06, 'epoch': 1.12} +{'loss': 91.396, 'grad_norm': 1059.2698172988833, 'learning_rate': 1.9978365583623827e-06, 'epoch': 1.12} +{'loss': 90.4678, 'grad_norm': 1118.2498432967845, 'learning_rate': 1.9952229442547157e-06, 'epoch': 1.12} +{'loss': 89.5109, 'grad_norm': 1199.088896773199, 'learning_rate': 1.992610851395669e-06, 'epoch': 1.12} +{'loss': 91.0615, 'grad_norm': 1127.171698025415, 'learning_rate': 1.990000280281649e-06, 'epoch': 1.12} +{'loss': 92.8656, 'grad_norm': 1080.1765893381137, 'learning_rate': 1.987391231408777e-06, 'epoch': 1.12} +{'loss': 91.424, 'grad_norm': 1203.0571653786565, 'learning_rate': 1.984783705272887e-06, 'epoch': 1.12} +{'loss': 91.9531, 'grad_norm': 1170.8983259718354, 'learning_rate': 1.9821777023695178e-06, 'epoch': 1.12} +{'loss': 90.6396, 'grad_norm': 1243.1480631641236, 'learning_rate': 1.979573223193919e-06, 'epoch': 1.12} +{'loss': 89.752, 'grad_norm': 1060.661644598094, 'learning_rate': 1.976970268241055e-06, 'epoch': 1.12} +{'loss': 93.0326, 'grad_norm': 1135.6466053659678, 'learning_rate': 1.9743688380055946e-06, 'epoch': 1.12} +{'loss': 92.7848, 'grad_norm': 1131.5211401397967, 'learning_rate': 1.9717689329819224e-06, 'epoch': 1.12} +{'loss': 90.1233, 'grad_norm': 1134.0209254330807, 'learning_rate': 1.9691705536641335e-06, 'epoch': 1.12} +{'loss': 91.2367, 'grad_norm': 1316.4315176756606, 'learning_rate': 1.9665737005460295e-06, 'epoch': 1.12} +{'loss': 93.1513, 'grad_norm': 1146.0841951133375, 'learning_rate': 1.9639783741211218e-06, 'epoch': 1.12} +{'loss': 92.4774, 'grad_norm': 1086.9177924536798, 'learning_rate': 1.961384574882631e-06, 'epoch': 1.12} +{'loss': 95.2095, 'grad_norm': 1178.3094270708539, 'learning_rate': 1.958792303323491e-06, 'epoch': 1.12} +{'loss': 90.3475, 'grad_norm': 1080.835393516925, 'learning_rate': 1.956201559936348e-06, 'epoch': 1.12} +{'loss': 92.7374, 'grad_norm': 1286.8940646156218, 'learning_rate': 1.95361234521355e-06, 'epoch': 1.12} +{'loss': 91.6908, 'grad_norm': 1180.694781088709, 'learning_rate': 1.9510246596471592e-06, 'epoch': 1.12} +{'loss': 90.6723, 'grad_norm': 1082.730385298431, 'learning_rate': 1.948438503728942e-06, 'epoch': 1.12} +{'loss': 89.4194, 'grad_norm': 1200.8680970217365, 'learning_rate': 1.945853877950382e-06, 'epoch': 1.12} +{'loss': 92.0196, 'grad_norm': 1137.704765315387, 'learning_rate': 1.94327078280267e-06, 'epoch': 1.12} +{'loss': 91.6767, 'grad_norm': 1173.6339462636822, 'learning_rate': 1.9406892187767012e-06, 'epoch': 1.12} +{'loss': 93.3837, 'grad_norm': 1266.2036251847237, 'learning_rate': 1.9381091863630843e-06, 'epoch': 1.12} +{'loss': 91.8495, 'grad_norm': 1212.3264580820558, 'learning_rate': 1.9355306860521307e-06, 'epoch': 1.12} +{'loss': 92.3775, 'grad_norm': 1038.9138645779478, 'learning_rate': 1.9329537183338686e-06, 'epoch': 1.12} +{'loss': 90.6867, 'grad_norm': 1033.2128588445423, 'learning_rate': 1.9303782836980355e-06, 'epoch': 1.12} +{'loss': 91.8539, 'grad_norm': 1121.1765622750092, 'learning_rate': 1.92780438263407e-06, 'epoch': 1.12} +{'loss': 89.9388, 'grad_norm': 1165.400089920751, 'learning_rate': 1.9252320156311234e-06, 'epoch': 1.12} +{'loss': 92.0701, 'grad_norm': 1275.0071982067336, 'learning_rate': 1.9226611831780516e-06, 'epoch': 1.12} +{'loss': 91.9294, 
'grad_norm': 1093.347716671893, 'learning_rate': 1.9200918857634254e-06, 'epoch': 1.12} +{'loss': 92.1759, 'grad_norm': 1085.5748891512624, 'learning_rate': 1.917524123875524e-06, 'epoch': 1.12} +{'loss': 91.6773, 'grad_norm': 1176.3996433373143, 'learning_rate': 1.9149578980023275e-06, 'epoch': 1.12} +{'loss': 91.2575, 'grad_norm': 1041.473326608437, 'learning_rate': 1.9123932086315287e-06, 'epoch': 1.12} +{'loss': 92.358, 'grad_norm': 1350.0064132035864, 'learning_rate': 1.9098300562505266e-06, 'epoch': 1.12} +{'loss': 92.4882, 'grad_norm': 1096.2926001923959, 'learning_rate': 1.9072684413464305e-06, 'epoch': 1.12} +{'loss': 91.6981, 'grad_norm': 1309.4810946315208, 'learning_rate': 1.9047083644060583e-06, 'epoch': 1.12} +{'loss': 91.0246, 'grad_norm': 1221.1347497770155, 'learning_rate': 1.902149825915932e-06, 'epoch': 1.12} +{'loss': 90.88, 'grad_norm': 1134.7659178439903, 'learning_rate': 1.8995928263622843e-06, 'epoch': 1.12} +{'loss': 91.1439, 'grad_norm': 1038.573977629687, 'learning_rate': 1.8970373662310492e-06, 'epoch': 1.12} +{'loss': 92.604, 'grad_norm': 1070.0816282077387, 'learning_rate': 1.8944834460078754e-06, 'epoch': 1.12} +{'loss': 91.679, 'grad_norm': 1123.80576586423, 'learning_rate': 1.891931066178122e-06, 'epoch': 1.12} +{'loss': 91.7897, 'grad_norm': 1073.3772079779183, 'learning_rate': 1.8893802272268435e-06, 'epoch': 1.12} +{'loss': 92.4703, 'grad_norm': 1071.1954175174278, 'learning_rate': 1.886830929638811e-06, 'epoch': 1.12} +{'loss': 90.4427, 'grad_norm': 949.5489397641165, 'learning_rate': 1.8842831738984957e-06, 'epoch': 1.12} +{'loss': 90.0321, 'grad_norm': 1071.0744999030642, 'learning_rate': 1.881736960490086e-06, 'epoch': 1.12} +{'loss': 91.38, 'grad_norm': 1095.239475246875, 'learning_rate': 1.8791922898974634e-06, 'epoch': 1.13} +{'loss': 91.2303, 'grad_norm': 1122.7525046376638, 'learning_rate': 1.8766491626042316e-06, 'epoch': 1.13} +{'loss': 91.6775, 'grad_norm': 1083.7953666186095, 'learning_rate': 1.874107579093688e-06, 'epoch': 1.13} +{'loss': 91.7136, 'grad_norm': 1104.612381527554, 'learning_rate': 1.8715675398488419e-06, 'epoch': 1.13} +{'loss': 89.5864, 'grad_norm': 1067.236966161828, 'learning_rate': 1.869029045352412e-06, 'epoch': 1.13} +{'loss': 93.5054, 'grad_norm': 1138.5479418866257, 'learning_rate': 1.866492096086815e-06, 'epoch': 1.13} +{'loss': 93.2932, 'grad_norm': 1038.930054543777, 'learning_rate': 1.8639566925341867e-06, 'epoch': 1.13} +{'loss': 92.4221, 'grad_norm': 1217.4577055741277, 'learning_rate': 1.8614228351763563e-06, 'epoch': 1.13} +{'loss': 90.5008, 'grad_norm': 1068.5328325372761, 'learning_rate': 1.8588905244948653e-06, 'epoch': 1.13} +{'loss': 91.3116, 'grad_norm': 1194.6422367726486, 'learning_rate': 1.8563597609709626e-06, 'epoch': 1.13} +{'loss': 90.1267, 'grad_norm': 1101.3772704552125, 'learning_rate': 1.8538305450855987e-06, 'epoch': 1.13} +{'loss': 91.7383, 'grad_norm': 1202.1452938382774, 'learning_rate': 1.8513028773194353e-06, 'epoch': 1.13} +{'loss': 92.4237, 'grad_norm': 1443.5903783473448, 'learning_rate': 1.8487767581528348e-06, 'epoch': 1.13} +{'loss': 91.145, 'grad_norm': 1174.409672645726, 'learning_rate': 1.8462521880658668e-06, 'epoch': 1.13} +{'loss': 89.7865, 'grad_norm': 1182.06989204825, 'learning_rate': 1.8437291675383096e-06, 'epoch': 1.13} +{'loss': 91.0838, 'grad_norm': 1428.9912261637012, 'learning_rate': 1.8412076970496407e-06, 'epoch': 1.13} +{'loss': 90.0997, 'grad_norm': 1269.8021160350488, 'learning_rate': 1.8386877770790524e-06, 'epoch': 1.13} +{'loss': 89.274, 'grad_norm': 
1111.0312740440572, 'learning_rate': 1.8361694081054337e-06, 'epoch': 1.13} +{'loss': 90.6828, 'grad_norm': 1079.346449169227, 'learning_rate': 1.8336525906073799e-06, 'epoch': 1.13} +{'loss': 90.2281, 'grad_norm': 1095.6051790339043, 'learning_rate': 1.831137325063198e-06, 'epoch': 1.13} +{'loss': 90.0648, 'grad_norm': 1175.325945992512, 'learning_rate': 1.8286236119508916e-06, 'epoch': 1.13} +{'loss': 91.0581, 'grad_norm': 1409.6690063322756, 'learning_rate': 1.8261114517481771e-06, 'epoch': 1.13} +{'loss': 91.3815, 'grad_norm': 1065.7493832089538, 'learning_rate': 1.8236008449324694e-06, 'epoch': 1.13} +{'loss': 90.6251, 'grad_norm': 1239.8277960488583, 'learning_rate': 1.8210917919808891e-06, 'epoch': 1.13} +{'loss': 92.0595, 'grad_norm': 1017.2748678992663, 'learning_rate': 1.8185842933702668e-06, 'epoch': 1.13} +{'loss': 89.7493, 'grad_norm': 1062.225269977965, 'learning_rate': 1.8160783495771316e-06, 'epoch': 1.13} +{'loss': 94.1852, 'grad_norm': 1112.0882109232593, 'learning_rate': 1.8135739610777224e-06, 'epoch': 1.13} +{'loss': 91.5253, 'grad_norm': 1063.47387408214, 'learning_rate': 1.8110711283479776e-06, 'epoch': 1.13} +{'loss': 90.001, 'grad_norm': 1171.1712384850334, 'learning_rate': 1.8085698518635397e-06, 'epoch': 1.13} +{'loss': 90.1103, 'grad_norm': 1042.185385765338, 'learning_rate': 1.8060701320997631e-06, 'epoch': 1.13} +{'loss': 90.8764, 'grad_norm': 1084.2920359023408, 'learning_rate': 1.8035719695316955e-06, 'epoch': 1.13} +{'loss': 89.6517, 'grad_norm': 1011.8529277441736, 'learning_rate': 1.8010753646340984e-06, 'epoch': 1.13} +{'loss': 91.6948, 'grad_norm': 1077.8672550098095, 'learning_rate': 1.7985803178814332e-06, 'epoch': 1.13} +{'loss': 91.7319, 'grad_norm': 1068.2483107822893, 'learning_rate': 1.7960868297478607e-06, 'epoch': 1.13} +{'loss': 92.8723, 'grad_norm': 1082.1583294888658, 'learning_rate': 1.7935949007072539e-06, 'epoch': 1.13} +{'loss': 91.587, 'grad_norm': 1138.4856505621758, 'learning_rate': 1.7911045312331853e-06, 'epoch': 1.13} +{'loss': 92.3676, 'grad_norm': 1134.759878521146, 'learning_rate': 1.7886157217989275e-06, 'epoch': 1.13} +{'loss': 92.6995, 'grad_norm': 1044.0278513663188, 'learning_rate': 1.7861284728774652e-06, 'epoch': 1.13} +{'loss': 91.6321, 'grad_norm': 1124.3955927060376, 'learning_rate': 1.7836427849414773e-06, 'epoch': 1.13} +{'loss': 91.8382, 'grad_norm': 1146.46471710156, 'learning_rate': 1.7811586584633545e-06, 'epoch': 1.13} +{'loss': 92.0303, 'grad_norm': 1113.6048261716824, 'learning_rate': 1.7786760939151837e-06, 'epoch': 1.13} +{'loss': 91.0163, 'grad_norm': 1164.4042445899706, 'learning_rate': 1.7761950917687577e-06, 'epoch': 1.13} +{'loss': 91.3854, 'grad_norm': 1165.2406876942578, 'learning_rate': 1.7737156524955756e-06, 'epoch': 1.13} +{'loss': 89.6902, 'grad_norm': 1160.5228392331305, 'learning_rate': 1.771237776566832e-06, 'epoch': 1.13} +{'loss': 91.1764, 'grad_norm': 1077.8162530729462, 'learning_rate': 1.7687614644534333e-06, 'epoch': 1.13} +{'loss': 93.2899, 'grad_norm': 1104.5152284602268, 'learning_rate': 1.7662867166259834e-06, 'epoch': 1.13} +{'loss': 91.1116, 'grad_norm': 1166.1559284379118, 'learning_rate': 1.7638135335547856e-06, 'epoch': 1.13} +{'loss': 90.4312, 'grad_norm': 1040.582843035152, 'learning_rate': 1.7613419157098565e-06, 'epoch': 1.13} +{'loss': 93.1976, 'grad_norm': 1007.6731701266442, 'learning_rate': 1.7588718635609026e-06, 'epoch': 1.13} +{'loss': 89.8686, 'grad_norm': 1112.8972876133523, 'learning_rate': 1.7564033775773447e-06, 'epoch': 1.13} +{'loss': 90.9189, 'grad_norm': 
1091.1252068411513, 'learning_rate': 1.753936458228297e-06, 'epoch': 1.13} +{'loss': 90.7184, 'grad_norm': 1049.7226219227703, 'learning_rate': 1.7514711059825773e-06, 'epoch': 1.13} +{'loss': 90.4657, 'grad_norm': 1196.0797767928177, 'learning_rate': 1.7490073213087133e-06, 'epoch': 1.14} +{'loss': 91.3435, 'grad_norm': 1159.4968395500152, 'learning_rate': 1.7465451046749227e-06, 'epoch': 1.14} +{'loss': 89.0988, 'grad_norm': 1141.7908952163996, 'learning_rate': 1.7440844565491377e-06, 'epoch': 1.14} +{'loss': 93.1448, 'grad_norm': 1098.1720253712042, 'learning_rate': 1.7416253773989822e-06, 'epoch': 1.14} +{'loss': 92.3111, 'grad_norm': 1074.6554487566805, 'learning_rate': 1.7391678676917856e-06, 'epoch': 1.14} +{'loss': 92.1513, 'grad_norm': 1230.1297682797265, 'learning_rate': 1.7367119278945822e-06, 'epoch': 1.14} +{'loss': 90.3665, 'grad_norm': 1062.357897519684, 'learning_rate': 1.7342575584741018e-06, 'epoch': 1.14} +{'loss': 91.0174, 'grad_norm': 970.6112766774704, 'learning_rate': 1.7318047598967835e-06, 'epoch': 1.14} +{'loss': 89.2126, 'grad_norm': 1137.9249902096499, 'learning_rate': 1.7293535326287614e-06, 'epoch': 1.14} +{'loss': 91.945, 'grad_norm': 1146.0205628114406, 'learning_rate': 1.7269038771358704e-06, 'epoch': 1.14} +{'loss': 89.304, 'grad_norm': 1187.2187937549832, 'learning_rate': 1.7244557938836538e-06, 'epoch': 1.14} +{'loss': 94.7587, 'grad_norm': 1099.781317500202, 'learning_rate': 1.722009283337347e-06, 'epoch': 1.14} +{'loss': 91.1257, 'grad_norm': 1014.7058585795859, 'learning_rate': 1.7195643459618971e-06, 'epoch': 1.14} +{'loss': 88.3381, 'grad_norm': 987.2698314293548, 'learning_rate': 1.7171209822219427e-06, 'epoch': 1.14} +{'loss': 91.8352, 'grad_norm': 1276.135230160353, 'learning_rate': 1.7146791925818273e-06, 'epoch': 1.14} +{'loss': 91.2559, 'grad_norm': 1183.6635636779406, 'learning_rate': 1.7122389775055937e-06, 'epoch': 1.14} +{'loss': 92.3802, 'grad_norm': 1143.279355359735, 'learning_rate': 1.7098003374569893e-06, 'epoch': 1.14} +{'loss': 92.7065, 'grad_norm': 1149.1849813065073, 'learning_rate': 1.7073632728994572e-06, 'epoch': 1.14} +{'loss': 92.4096, 'grad_norm': 1018.6869204128371, 'learning_rate': 1.7049277842961464e-06, 'epoch': 1.14} +{'loss': 89.5448, 'grad_norm': 1100.006004492795, 'learning_rate': 1.7024938721099015e-06, 'epoch': 1.14} +{'loss': 90.869, 'grad_norm': 1030.885369064586, 'learning_rate': 1.700061536803268e-06, 'epoch': 1.14} +{'loss': 91.4983, 'grad_norm': 1051.0793909995107, 'learning_rate': 1.697630778838497e-06, 'epoch': 1.14} +{'loss': 90.0776, 'grad_norm': 1038.6680425124266, 'learning_rate': 1.6952015986775317e-06, 'epoch': 1.14} +{'loss': 90.359, 'grad_norm': 1199.489289573606, 'learning_rate': 1.6927739967820245e-06, 'epoch': 1.14} +{'loss': 91.0476, 'grad_norm': 1298.1532541615509, 'learning_rate': 1.690347973613321e-06, 'epoch': 1.14} +{'loss': 90.5967, 'grad_norm': 1154.515703673051, 'learning_rate': 1.6879235296324658e-06, 'epoch': 1.14} +{'loss': 91.6578, 'grad_norm': 1144.000259411635, 'learning_rate': 1.6855006653002126e-06, 'epoch': 1.14} +{'loss': 90.8412, 'grad_norm': 1122.7191027920817, 'learning_rate': 1.683079381077003e-06, 'epoch': 1.14} +{'loss': 91.93, 'grad_norm': 1403.6894442227167, 'learning_rate': 1.6806596774229888e-06, 'epoch': 1.14} +{'loss': 91.7328, 'grad_norm': 1181.674148951686, 'learning_rate': 1.6782415547980158e-06, 'epoch': 1.14} +{'loss': 90.8155, 'grad_norm': 1088.214848117759, 'learning_rate': 1.675825013661626e-06, 'epoch': 1.14} +{'loss': 91.7679, 'grad_norm': 
1050.8342635080185, 'learning_rate': 1.6734100544730713e-06, 'epoch': 1.14} +{'loss': 91.3662, 'grad_norm': 1197.405883972634, 'learning_rate': 1.6709966776912912e-06, 'epoch': 1.14} +{'loss': 90.9868, 'grad_norm': 1008.2747603783155, 'learning_rate': 1.6685848837749353e-06, 'epoch': 1.14} +{'loss': 90.6353, 'grad_norm': 1145.0170515389643, 'learning_rate': 1.6661746731823458e-06, 'epoch': 1.14} +{'loss': 89.9245, 'grad_norm': 1023.6096920991896, 'learning_rate': 1.663766046371561e-06, 'epoch': 1.14} +{'loss': 91.9193, 'grad_norm': 1617.6891679879877, 'learning_rate': 1.661359003800328e-06, 'epoch': 1.14} +{'loss': 93.1332, 'grad_norm': 1407.8785422855926, 'learning_rate': 1.658953545926084e-06, 'epoch': 1.14} +{'loss': 94.6384, 'grad_norm': 1285.241052878103, 'learning_rate': 1.6565496732059715e-06, 'epoch': 1.14} +{'loss': 89.9369, 'grad_norm': 1238.7921414126415, 'learning_rate': 1.6541473860968272e-06, 'epoch': 1.14} +{'loss': 94.4688, 'grad_norm': 999.9969005239598, 'learning_rate': 1.6517466850551855e-06, 'epoch': 1.14} +{'loss': 89.3453, 'grad_norm': 1137.8561962623232, 'learning_rate': 1.6493475705372863e-06, 'epoch': 1.14} +{'loss': 94.3976, 'grad_norm': 1153.7261263533383, 'learning_rate': 1.6469500429990592e-06, 'epoch': 1.14} +{'loss': 89.7919, 'grad_norm': 993.451463606156, 'learning_rate': 1.644554102896142e-06, 'epoch': 1.14} +{'loss': 89.2774, 'grad_norm': 1183.7883504461859, 'learning_rate': 1.6421597506838605e-06, 'epoch': 1.14} +{'loss': 90.5935, 'grad_norm': 1373.1973483229856, 'learning_rate': 1.639766986817244e-06, 'epoch': 1.14} +{'loss': 89.9585, 'grad_norm': 1185.9357051206227, 'learning_rate': 1.637375811751023e-06, 'epoch': 1.14} +{'loss': 92.4697, 'grad_norm': 1314.3955639189257, 'learning_rate': 1.6349862259396189e-06, 'epoch': 1.14} +{'loss': 89.694, 'grad_norm': 1206.8602652866007, 'learning_rate': 1.632598229837158e-06, 'epoch': 1.14} +{'loss': 89.4827, 'grad_norm': 1068.5342321529722, 'learning_rate': 1.6302118238974584e-06, 'epoch': 1.14} +{'loss': 91.2351, 'grad_norm': 1160.9725917637022, 'learning_rate': 1.627827008574042e-06, 'epoch': 1.14} +{'loss': 91.3031, 'grad_norm': 1075.2828680742855, 'learning_rate': 1.62544378432012e-06, 'epoch': 1.14} +{'loss': 92.1406, 'grad_norm': 1348.6154711802108, 'learning_rate': 1.6230621515886102e-06, 'epoch': 1.15} +{'loss': 90.6114, 'grad_norm': 1010.5611513684756, 'learning_rate': 1.620682110832127e-06, 'epoch': 1.15} +{'loss': 92.7677, 'grad_norm': 1134.848291120292, 'learning_rate': 1.6183036625029758e-06, 'epoch': 1.15} +{'loss': 91.3, 'grad_norm': 1061.4741061925074, 'learning_rate': 1.6159268070531642e-06, 'epoch': 1.15} +{'loss': 90.1187, 'grad_norm': 1018.8873224423866, 'learning_rate': 1.613551544934393e-06, 'epoch': 1.15} +{'loss': 92.1799, 'grad_norm': 1510.4090723071204, 'learning_rate': 1.6111778765980657e-06, 'epoch': 1.15} +{'loss': 91.1004, 'grad_norm': 1171.1104437344811, 'learning_rate': 1.6088058024952824e-06, 'epoch': 1.15} +{'loss': 90.9312, 'grad_norm': 1106.3242898834797, 'learning_rate': 1.606435323076837e-06, 'epoch': 1.15} +{'loss': 92.5464, 'grad_norm': 1086.6756472814968, 'learning_rate': 1.6040664387932214e-06, 'epoch': 1.15} +{'loss': 93.3648, 'grad_norm': 1099.7837768831762, 'learning_rate': 1.6016991500946211e-06, 'epoch': 1.15} +{'loss': 89.8291, 'grad_norm': 1175.0048336015861, 'learning_rate': 1.5993334574309238e-06, 'epoch': 1.15} +{'loss': 91.3785, 'grad_norm': 1093.7903351738846, 'learning_rate': 1.5969693612517169e-06, 'epoch': 1.15} +{'loss': 91.7484, 'grad_norm': 
1238.4474678660736, 'learning_rate': 1.5946068620062738e-06, 'epoch': 1.15} +{'loss': 89.8274, 'grad_norm': 1083.0303096168495, 'learning_rate': 1.5922459601435713e-06, 'epoch': 1.15} +{'loss': 92.8258, 'grad_norm': 1996.6323752711958, 'learning_rate': 1.5898866561122805e-06, 'epoch': 1.15} +{'loss': 91.7195, 'grad_norm': 1062.3486013241347, 'learning_rate': 1.5875289503607694e-06, 'epoch': 1.15} +{'loss': 90.8937, 'grad_norm': 1038.6487693435554, 'learning_rate': 1.5851728433371072e-06, 'epoch': 1.15} +{'loss': 90.7621, 'grad_norm': 1114.018906661619, 'learning_rate': 1.5828183354890504e-06, 'epoch': 1.15} +{'loss': 92.7422, 'grad_norm': 991.1142525123593, 'learning_rate': 1.5804654272640564e-06, 'epoch': 1.15} +{'loss': 90.5942, 'grad_norm': 1067.037680973182, 'learning_rate': 1.5781141191092764e-06, 'epoch': 1.15} +{'loss': 92.4463, 'grad_norm': 1141.2253421739465, 'learning_rate': 1.5757644114715597e-06, 'epoch': 1.15} +{'loss': 89.6687, 'grad_norm': 1209.0060029292813, 'learning_rate': 1.573416304797455e-06, 'epoch': 1.15} +{'loss': 89.1917, 'grad_norm': 1099.0123077268288, 'learning_rate': 1.5710697995332004e-06, 'epoch': 1.15} +{'loss': 91.4072, 'grad_norm': 1203.6566269582481, 'learning_rate': 1.5687248961247303e-06, 'epoch': 1.15} +{'loss': 88.8233, 'grad_norm': 1083.376158040288, 'learning_rate': 1.5663815950176742e-06, 'epoch': 1.15} +{'loss': 91.0055, 'grad_norm': 1050.6079486461929, 'learning_rate': 1.5640398966573622e-06, 'epoch': 1.15} +{'loss': 93.1069, 'grad_norm': 1118.8118948807462, 'learning_rate': 1.5616998014888185e-06, 'epoch': 1.15} +{'loss': 90.7036, 'grad_norm': 1151.68713970555, 'learning_rate': 1.5593613099567595e-06, 'epoch': 1.15} +{'loss': 90.6793, 'grad_norm': 1066.3682899101361, 'learning_rate': 1.5570244225055965e-06, 'epoch': 1.15} +{'loss': 91.6558, 'grad_norm': 1287.3198218887412, 'learning_rate': 1.5546891395794372e-06, 'epoch': 1.15} +{'loss': 90.3137, 'grad_norm': 1046.9642475416645, 'learning_rate': 1.552355461622086e-06, 'epoch': 1.15} +{'loss': 90.1671, 'grad_norm': 1077.382977923728, 'learning_rate': 1.5500233890770434e-06, 'epoch': 1.15} +{'loss': 90.5086, 'grad_norm': 1082.420975737178, 'learning_rate': 1.5476929223875004e-06, 'epoch': 1.15} +{'loss': 90.6672, 'grad_norm': 1085.7862839946845, 'learning_rate': 1.545364061996346e-06, 'epoch': 1.15} +{'loss': 89.7387, 'grad_norm': 1053.0626798331275, 'learning_rate': 1.5430368083461612e-06, 'epoch': 1.15} +{'loss': 91.7647, 'grad_norm': 1059.0555839351643, 'learning_rate': 1.5407111618792213e-06, 'epoch': 1.15} +{'loss': 90.2322, 'grad_norm': 1260.4433263170497, 'learning_rate': 1.5383871230375025e-06, 'epoch': 1.15} +{'loss': 91.6916, 'grad_norm': 1221.1324065636747, 'learning_rate': 1.5360646922626709e-06, 'epoch': 1.15} +{'loss': 91.1917, 'grad_norm': 1097.5929787867806, 'learning_rate': 1.533743869996086e-06, 'epoch': 1.15} +{'loss': 90.7588, 'grad_norm': 1045.219784856314, 'learning_rate': 1.5314246566788026e-06, 'epoch': 1.15} +{'loss': 92.2184, 'grad_norm': 1213.0734305402061, 'learning_rate': 1.5291070527515684e-06, 'epoch': 1.15} +{'loss': 89.2458, 'grad_norm': 1118.7605749250563, 'learning_rate': 1.5267910586548296e-06, 'epoch': 1.15} +{'loss': 89.0229, 'grad_norm': 1213.2572207839933, 'learning_rate': 1.5244766748287244e-06, 'epoch': 1.15} +{'loss': 90.9032, 'grad_norm': 1169.9660184602064, 'learning_rate': 1.522163901713083e-06, 'epoch': 1.15} +{'loss': 92.6002, 'grad_norm': 1062.6364527154174, 'learning_rate': 1.5198527397474306e-06, 'epoch': 1.15} +{'loss': 88.8741, 'grad_norm': 
1089.6023424156765, 'learning_rate': 1.5175431893709836e-06, 'epoch': 1.15}
+{'loss': 91.562, 'grad_norm': 1186.0464102228805, 'learning_rate': 1.5152352510226588e-06, 'epoch': 1.15}
+[2025-05-30 02:22:27,813] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step60000 is about to be saved!
+[2025-05-30 02:22:27,855] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-60000/global_step60000/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2025-05-30 02:22:27,856] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-60000/global_step60000/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2025-05-30 02:22:27,955] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-60000/global_step60000/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2025-05-30 02:22:28,022] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-60000/global_step60000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2025-05-30 02:28:15,176] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-60000/global_step60000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2025-05-30 02:28:15,245] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-60000/global_step60000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-05-30 02:28:18,641] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step60000 is ready now!
+[2025-05-30 02:28:18,672] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step60000 is about to be saved!
+[2025-05-30 02:28:18,688] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-60000/global_step60000/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2025-05-30 02:28:18,688] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-60000/global_step60000/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2025-05-30 02:28:18,774] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-60000/global_step60000/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2025-05-30 02:28:18,836] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-60000/global_step60000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2025-05-30 02:34:31,186] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-60000/global_step60000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2025-05-30 02:34:31,338] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-60000/global_step60000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-05-30 02:34:33,352] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step60000 is ready now!
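The per-step loss in this stretch is noisy (roughly 87 to 98) and grad_norm occasionally spikes far above its ~1100 baseline (about 2554 and 1997 earlier in this section, and about 2542 a few records below). Building on the hypothetical parse_log sketch earlier, one way to smooth the curve and flag those spikes:

```python
# Sketch only, reusing the hypothetical parse_log defined earlier: smooth
# the per-step loss with an exponential moving average and flag grad_norm
# outliers relative to the median.
def ema(values, beta=0.98):
    out, avg = [], values[0]
    for v in values:
        avg = beta * avg + (1 - beta) * v
        out.append(avg)
    return out

records = parse_log()
losses = [r["loss"] for r in records]
norms = [r["grad_norm"] for r in records]

median = sorted(norms)[len(norms) // 2]
spikes = [(i, g) for i, g in enumerate(norms) if g > 2 * median]
print(f"last smoothed loss: {ema(losses)[-1]:.2f}; grad_norm spikes >2x median: {len(spikes)}")
```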
+{'loss': 89.0977, 'grad_norm': 1125.7979674708772, 'learning_rate': 1.512928925141063e-06, 'epoch': 1.15} +{'loss': 90.906, 'grad_norm': 1059.5411590820454, 'learning_rate': 1.5106242121644965e-06, 'epoch': 1.15} +{'loss': 90.6459, 'grad_norm': 1172.592246313393, 'learning_rate': 1.5083211125309505e-06, 'epoch': 1.15} +{'loss': 87.4762, 'grad_norm': 948.667963672447, 'learning_rate': 1.5060196266781114e-06, 'epoch': 1.15} +{'loss': 90.4219, 'grad_norm': 1105.5476492748373, 'learning_rate': 1.5037197550433614e-06, 'epoch': 1.15} +{'loss': 90.5929, 'grad_norm': 1097.874200723919, 'learning_rate': 1.5014214980637754e-06, 'epoch': 1.16} +{'loss': 91.7009, 'grad_norm': 1344.0379776968396, 'learning_rate': 1.4991248561761197e-06, 'epoch': 1.16} +{'loss': 88.9816, 'grad_norm': 1112.0055195892303, 'learning_rate': 1.4968298298168526e-06, 'epoch': 1.16} +{'loss': 93.3202, 'grad_norm': 2542.280140085783, 'learning_rate': 1.4945364194221258e-06, 'epoch': 1.16} +{'loss': 90.9103, 'grad_norm': 1146.23839466887, 'learning_rate': 1.4922446254277856e-06, 'epoch': 1.16} +{'loss': 93.5486, 'grad_norm': 1084.2218384765667, 'learning_rate': 1.489954448269374e-06, 'epoch': 1.16} +{'loss': 87.8553, 'grad_norm': 1124.242912214507, 'learning_rate': 1.4876658883821193e-06, 'epoch': 1.16} +{'loss': 93.0259, 'grad_norm': 1135.8104455315397, 'learning_rate': 1.485378946200946e-06, 'epoch': 1.16} +{'loss': 92.5435, 'grad_norm': 1143.046026829204, 'learning_rate': 1.4830936221604687e-06, 'epoch': 1.16} +{'loss': 91.7693, 'grad_norm': 1133.4531469520844, 'learning_rate': 1.4808099166949974e-06, 'epoch': 1.16} +{'loss': 91.012, 'grad_norm': 1220.1458271198871, 'learning_rate': 1.4785278302385353e-06, 'epoch': 1.16} +{'loss': 87.8741, 'grad_norm': 1141.7112158835905, 'learning_rate': 1.4762473632247754e-06, 'epoch': 1.16} +{'loss': 90.0145, 'grad_norm': 952.1150306911137, 'learning_rate': 1.473968516087103e-06, 'epoch': 1.16} +{'loss': 90.5811, 'grad_norm': 1042.5981702720246, 'learning_rate': 1.4716912892585933e-06, 'epoch': 1.16} +{'loss': 91.6296, 'grad_norm': 1103.4256005528116, 'learning_rate': 1.4694156831720185e-06, 'epoch': 1.16} +{'loss': 91.1479, 'grad_norm': 1163.638329540071, 'learning_rate': 1.467141698259844e-06, 'epoch': 1.16} +{'loss': 89.8507, 'grad_norm': 1112.5645907679564, 'learning_rate': 1.4648693349542208e-06, 'epoch': 1.16} +{'loss': 90.6344, 'grad_norm': 1160.1129545134659, 'learning_rate': 1.4625985936869946e-06, 'epoch': 1.16} +{'loss': 89.1229, 'grad_norm': 1076.5906568274822, 'learning_rate': 1.4603294748897012e-06, 'epoch': 1.16} +{'loss': 90.6389, 'grad_norm': 1124.6193451464226, 'learning_rate': 1.4580619789935745e-06, 'epoch': 1.16} +{'loss': 87.4865, 'grad_norm': 1025.7942716043028, 'learning_rate': 1.4557961064295302e-06, 'epoch': 1.16} +{'loss': 91.0704, 'grad_norm': 1055.1212635886814, 'learning_rate': 1.4535318576281854e-06, 'epoch': 1.16} +{'loss': 89.505, 'grad_norm': 1063.67524479157, 'learning_rate': 1.4512692330198418e-06, 'epoch': 1.16} +{'loss': 89.4679, 'grad_norm': 997.2542809197143, 'learning_rate': 1.4490082330344913e-06, 'epoch': 1.16} +{'loss': 89.4795, 'grad_norm': 1004.9830265497337, 'learning_rate': 1.4467488581018263e-06, 'epoch': 1.16} +{'loss': 88.2693, 'grad_norm': 1180.9236685006051, 'learning_rate': 1.444491108651218e-06, 'epoch': 1.16} +{'loss': 91.0384, 'grad_norm': 1083.3601849247138, 'learning_rate': 1.4422349851117411e-06, 'epoch': 1.16} +{'loss': 91.9622, 'grad_norm': 1210.4486208680632, 'learning_rate': 1.4399804879121516e-06, 'epoch': 1.16} +{'loss': 
90.3608, 'grad_norm': 1291.7236769761553, 'learning_rate': 1.4377276174808984e-06, 'epoch': 1.16} +{'loss': 89.4409, 'grad_norm': 1043.7385802142016, 'learning_rate': 1.4354763742461265e-06, 'epoch': 1.16} +{'loss': 89.2999, 'grad_norm': 1029.188585854472, 'learning_rate': 1.4332267586356652e-06, 'epoch': 1.16} +{'loss': 91.2337, 'grad_norm': 1031.7539284074435, 'learning_rate': 1.4309787710770407e-06, 'epoch': 1.16} +{'loss': 89.8042, 'grad_norm': 1013.713713523937, 'learning_rate': 1.4287324119974644e-06, 'epoch': 1.16} +{'loss': 90.3056, 'grad_norm': 1102.0008168706988, 'learning_rate': 1.426487681823837e-06, 'epoch': 1.16} +{'loss': 90.9497, 'grad_norm': 1037.8258287253723, 'learning_rate': 1.4242445809827587e-06, 'epoch': 1.16} +{'loss': 91.546, 'grad_norm': 1130.496736574555, 'learning_rate': 1.4220031099005094e-06, 'epoch': 1.16} +{'loss': 90.83, 'grad_norm': 1187.6637267568076, 'learning_rate': 1.4197632690030682e-06, 'epoch': 1.16} +{'loss': 89.9532, 'grad_norm': 1042.0965218010942, 'learning_rate': 1.4175250587160982e-06, 'epoch': 1.16} +{'loss': 92.1199, 'grad_norm': 1061.8819436854935, 'learning_rate': 1.4152884794649523e-06, 'epoch': 1.16} +{'loss': 89.7792, 'grad_norm': 1132.2671419298595, 'learning_rate': 1.4130535316746818e-06, 'epoch': 1.16} +{'loss': 89.7232, 'grad_norm': 1107.6967214621404, 'learning_rate': 1.4108202157700146e-06, 'epoch': 1.16} +{'loss': 90.7662, 'grad_norm': 1094.2619041817604, 'learning_rate': 1.4085885321753823e-06, 'epoch': 1.16} +{'loss': 89.5576, 'grad_norm': 1114.0995549930315, 'learning_rate': 1.4063584813148979e-06, 'epoch': 1.16} +{'loss': 91.541, 'grad_norm': 1464.931191138752, 'learning_rate': 1.4041300636123622e-06, 'epoch': 1.16} +{'loss': 90.7226, 'grad_norm': 1169.4107517406217, 'learning_rate': 1.4019032794912757e-06, 'epoch': 1.16} +{'loss': 90.3127, 'grad_norm': 1008.6919382178585, 'learning_rate': 1.3996781293748162e-06, 'epoch': 1.16} +{'loss': 90.7074, 'grad_norm': 1136.7012489370559, 'learning_rate': 1.3974546136858624e-06, 'epoch': 1.16} +{'loss': 88.9744, 'grad_norm': 1067.217040854703, 'learning_rate': 1.395232732846975e-06, 'epoch': 1.16} +{'loss': 92.0618, 'grad_norm': 1118.1591231089992, 'learning_rate': 1.393012487280404e-06, 'epoch': 1.16} +{'loss': 90.1638, 'grad_norm': 1083.4233411202968, 'learning_rate': 1.390793877408093e-06, 'epoch': 1.16} +{'loss': 89.9485, 'grad_norm': 1150.4073885362302, 'learning_rate': 1.3885769036516706e-06, 'epoch': 1.16} +{'loss': 89.3697, 'grad_norm': 1102.062207640895, 'learning_rate': 1.3863615664324591e-06, 'epoch': 1.16} +{'loss': 96.1297, 'grad_norm': 1086.8591132190766, 'learning_rate': 1.3841478661714647e-06, 'epoch': 1.17} +{'loss': 88.0585, 'grad_norm': 1180.8709271398457, 'learning_rate': 1.3819358032893837e-06, 'epoch': 1.17} +{'loss': 91.2509, 'grad_norm': 1072.3487941309745, 'learning_rate': 1.3797253782066056e-06, 'epoch': 1.17} +{'loss': 91.4883, 'grad_norm': 1078.7605137270273, 'learning_rate': 1.3775165913432032e-06, 'epoch': 1.17} +{'loss': 94.1038, 'grad_norm': 979.6847842627053, 'learning_rate': 1.3753094431189385e-06, 'epoch': 1.17} +{'loss': 94.9593, 'grad_norm': 1160.5591798729124, 'learning_rate': 1.373103933953268e-06, 'epoch': 1.17} +{'loss': 90.913, 'grad_norm': 998.7975165955437, 'learning_rate': 1.370900064265327e-06, 'epoch': 1.17} +{'loss': 90.5781, 'grad_norm': 982.227236082802, 'learning_rate': 1.36869783447395e-06, 'epoch': 1.17} +{'loss': 91.0284, 'grad_norm': 1039.0672232561233, 'learning_rate': 1.366497244997652e-06, 'epoch': 1.17} +{'loss': 88.6395, 
'grad_norm': 1139.2842608248027, 'learning_rate': 1.364298296254637e-06, 'epoch': 1.17} +{'loss': 88.5804, 'grad_norm': 1065.6397510179363, 'learning_rate': 1.3621009886628033e-06, 'epoch': 1.17} +{'loss': 91.4622, 'grad_norm': 1178.9089262982056, 'learning_rate': 1.3599053226397275e-06, 'epoch': 1.17} +{'loss': 91.5504, 'grad_norm': 1165.686353878078, 'learning_rate': 1.3577112986026841e-06, 'epoch': 1.17} +{'loss': 90.8605, 'grad_norm': 1087.6682075633976, 'learning_rate': 1.35551891696863e-06, 'epoch': 1.17} +{'loss': 89.7933, 'grad_norm': 1083.9073033269474, 'learning_rate': 1.3533281781542085e-06, 'epoch': 1.17} +{'loss': 89.0246, 'grad_norm': 1053.39431922564, 'learning_rate': 1.3511390825757575e-06, 'epoch': 1.17} +{'loss': 89.96, 'grad_norm': 1065.8328107771579, 'learning_rate': 1.3489516306492967e-06, 'epoch': 1.17} +{'loss': 88.5511, 'grad_norm': 1119.6408391750163, 'learning_rate': 1.3467658227905323e-06, 'epoch': 1.17} +{'loss': 89.3854, 'grad_norm': 1059.6175715317047, 'learning_rate': 1.3445816594148654e-06, 'epoch': 1.17} +{'loss': 90.0761, 'grad_norm': 1107.7532571621457, 'learning_rate': 1.342399140937376e-06, 'epoch': 1.17} +{'loss': 91.224, 'grad_norm': 1172.527042271721, 'learning_rate': 1.3402182677728392e-06, 'epoch': 1.17} +{'loss': 90.6813, 'grad_norm': 1100.7168978051443, 'learning_rate': 1.3380390403357136e-06, 'epoch': 1.17} +{'loss': 91.7773, 'grad_norm': 1314.285722813821, 'learning_rate': 1.335861459040141e-06, 'epoch': 1.17} +{'loss': 90.9462, 'grad_norm': 1020.1332747054955, 'learning_rate': 1.3336855242999592e-06, 'epoch': 1.17} +{'loss': 89.2261, 'grad_norm': 1073.16248906127, 'learning_rate': 1.3315112365286853e-06, 'epoch': 1.17} +{'loss': 90.1249, 'grad_norm': 1158.6393584827772, 'learning_rate': 1.32933859613953e-06, 'epoch': 1.17} +{'loss': 90.1391, 'grad_norm': 1258.840005604274, 'learning_rate': 1.3271676035453851e-06, 'epoch': 1.17} +{'loss': 88.5608, 'grad_norm': 1123.9205240982642, 'learning_rate': 1.3249982591588307e-06, 'epoch': 1.17} +{'loss': 89.0787, 'grad_norm': 1024.7314019648313, 'learning_rate': 1.3228305633921378e-06, 'epoch': 1.17} +{'loss': 90.9258, 'grad_norm': 1049.4595155725497, 'learning_rate': 1.3206645166572573e-06, 'epoch': 1.17} +{'loss': 93.3209, 'grad_norm': 1048.9063440292632, 'learning_rate': 1.3185001193658342e-06, 'epoch': 1.17} +{'loss': 90.4351, 'grad_norm': 1148.6248469049929, 'learning_rate': 1.316337371929195e-06, 'epoch': 1.17} +{'loss': 90.4412, 'grad_norm': 982.9517646098984, 'learning_rate': 1.3141762747583498e-06, 'epoch': 1.17} +{'loss': 90.5124, 'grad_norm': 1123.9615180956023, 'learning_rate': 1.3120168282640055e-06, 'epoch': 1.17} +{'loss': 87.0383, 'grad_norm': 1084.2415187868903, 'learning_rate': 1.3098590328565453e-06, 'epoch': 1.17} +{'loss': 88.9739, 'grad_norm': 1164.6896793783903, 'learning_rate': 1.3077028889460408e-06, 'epoch': 1.17} +{'loss': 90.2081, 'grad_norm': 1107.9231362399453, 'learning_rate': 1.3055483969422556e-06, 'epoch': 1.17} +{'loss': 89.3806, 'grad_norm': 1180.1761293471613, 'learning_rate': 1.3033955572546297e-06, 'epoch': 1.17} +{'loss': 90.5486, 'grad_norm': 1206.5968419478552, 'learning_rate': 1.3012443702923006e-06, 'epoch': 1.17} +{'loss': 89.8014, 'grad_norm': 1097.4040860749978, 'learning_rate': 1.299094836464081e-06, 'epoch': 1.17} +{'loss': 89.56, 'grad_norm': 1067.0424381847422, 'learning_rate': 1.2969469561784731e-06, 'epoch': 1.17} +{'loss': 90.0458, 'grad_norm': 1051.9520959677955, 'learning_rate': 1.2948007298436692e-06, 'epoch': 1.17} +{'loss': 89.8095, 'grad_norm': 
1143.9245595082114, 'learning_rate': 1.29265615786754e-06, 'epoch': 1.17} +{'loss': 89.2557, 'grad_norm': 1080.4813844221621, 'learning_rate': 1.2905132406576492e-06, 'epoch': 1.17} +{'loss': 90.2799, 'grad_norm': 1069.251459021701, 'learning_rate': 1.2883719786212424e-06, 'epoch': 1.17} +{'loss': 89.4095, 'grad_norm': 1038.7594560118112, 'learning_rate': 1.2862323721652447e-06, 'epoch': 1.17} +{'loss': 89.8644, 'grad_norm': 1320.9590656676849, 'learning_rate': 1.2840944216962802e-06, 'epoch': 1.17} +{'loss': 89.9893, 'grad_norm': 1039.3405397090946, 'learning_rate': 1.281958127620644e-06, 'epoch': 1.17} +{'loss': 89.7662, 'grad_norm': 1210.3880385979282, 'learning_rate': 1.2798234903443285e-06, 'epoch': 1.17} +{'loss': 92.0059, 'grad_norm': 1087.1424639700456, 'learning_rate': 1.2776905102730031e-06, 'epoch': 1.17} +{'loss': 90.894, 'grad_norm': 1055.2640333644824, 'learning_rate': 1.275559187812022e-06, 'epoch': 1.17} +{'loss': 90.4957, 'grad_norm': 1261.795125183131, 'learning_rate': 1.273429523366434e-06, 'epoch': 1.17} +{'loss': 89.2893, 'grad_norm': 950.7324996856107, 'learning_rate': 1.2713015173409592e-06, 'epoch': 1.18} +{'loss': 91.3777, 'grad_norm': 1154.5107327004123, 'learning_rate': 1.2691751701400145e-06, 'epoch': 1.18} +{'loss': 89.387, 'grad_norm': 1072.6592317853203, 'learning_rate': 1.2670504821676944e-06, 'epoch': 1.18} +{'loss': 92.2974, 'grad_norm': 1046.9907848214389, 'learning_rate': 1.2649274538277779e-06, 'epoch': 1.18} +{'loss': 90.1446, 'grad_norm': 1048.2597537698446, 'learning_rate': 1.2628060855237357e-06, 'epoch': 1.18} +{'loss': 92.4974, 'grad_norm': 1223.3163112377863, 'learning_rate': 1.260686377658713e-06, 'epoch': 1.18} +{'loss': 89.4453, 'grad_norm': 1027.2770570991522, 'learning_rate': 1.25856833063555e-06, 'epoch': 1.18} +{'loss': 88.7595, 'grad_norm': 1157.620587534671, 'learning_rate': 1.2564519448567615e-06, 'epoch': 1.18} +{'loss': 89.9149, 'grad_norm': 1069.5971401880436, 'learning_rate': 1.2543372207245508e-06, 'epoch': 1.18} +{'loss': 88.0101, 'grad_norm': 1016.8032160659938, 'learning_rate': 1.25222415864081e-06, 'epoch': 1.18} +{'loss': 92.3286, 'grad_norm': 1036.9862875704116, 'learning_rate': 1.250112759007105e-06, 'epoch': 1.18} +{'loss': 90.7075, 'grad_norm': 1116.5934016011431, 'learning_rate': 1.2480030222246974e-06, 'epoch': 1.18} +{'loss': 89.5446, 'grad_norm': 1083.1851976160717, 'learning_rate': 1.245894948694525e-06, 'epoch': 1.18} +{'loss': 88.3145, 'grad_norm': 1066.760671031899, 'learning_rate': 1.2437885388172076e-06, 'epoch': 1.18} +{'loss': 88.1277, 'grad_norm': 1042.0184030952228, 'learning_rate': 1.241683792993058e-06, 'epoch': 1.18} +{'loss': 91.7563, 'grad_norm': 1057.1435067507402, 'learning_rate': 1.2395807116220648e-06, 'epoch': 1.18} +{'loss': 92.1065, 'grad_norm': 1191.9331709524183, 'learning_rate': 1.2374792951039061e-06, 'epoch': 1.18} +{'loss': 90.2147, 'grad_norm': 1224.7319046085638, 'learning_rate': 1.2353795438379378e-06, 'epoch': 1.18} +{'loss': 89.9694, 'grad_norm': 1019.0384026099359, 'learning_rate': 1.2332814582232023e-06, 'epoch': 1.18} +{'loss': 90.1691, 'grad_norm': 1086.0552416207338, 'learning_rate': 1.2311850386584244e-06, 'epoch': 1.18} +{'loss': 88.7939, 'grad_norm': 1086.5143387501753, 'learning_rate': 1.2290902855420139e-06, 'epoch': 1.18} +{'loss': 90.3645, 'grad_norm': 1068.8855048931189, 'learning_rate': 1.2269971992720653e-06, 'epoch': 1.18} +{'loss': 91.1917, 'grad_norm': 1093.2957651635672, 'learning_rate': 1.2249057802463527e-06, 'epoch': 1.18} +{'loss': 87.6212, 'grad_norm': 
996.6662290194292, 'learning_rate': 1.2228160288623347e-06, 'epoch': 1.18} +{'loss': 89.9555, 'grad_norm': 1146.409589762545, 'learning_rate': 1.2207279455171516e-06, 'epoch': 1.18} +{'loss': 91.7793, 'grad_norm': 1208.5954200074468, 'learning_rate': 1.2186415306076281e-06, 'epoch': 1.18} +{'loss': 87.4939, 'grad_norm': 1057.674788989821, 'learning_rate': 1.2165567845302762e-06, 'epoch': 1.18} +{'loss': 98.2273, 'grad_norm': 1043.295207960268, 'learning_rate': 1.214473707681284e-06, 'epoch': 1.18} +{'loss': 90.213, 'grad_norm': 1052.6307918145753, 'learning_rate': 1.212392300456523e-06, 'epoch': 1.18} +{'loss': 92.0154, 'grad_norm': 1165.4348933610954, 'learning_rate': 1.21031256325155e-06, 'epoch': 1.18} +{'loss': 89.9582, 'grad_norm': 1065.9751167905565, 'learning_rate': 1.2082344964616032e-06, 'epoch': 1.18} +{'loss': 90.2857, 'grad_norm': 1059.4846422048176, 'learning_rate': 1.2061581004816069e-06, 'epoch': 1.18} +{'loss': 90.675, 'grad_norm': 1145.560548622167, 'learning_rate': 1.204083375706163e-06, 'epoch': 1.18} +{'loss': 89.1117, 'grad_norm': 1105.6037076424357, 'learning_rate': 1.2020103225295587e-06, 'epoch': 1.18} +{'loss': 92.362, 'grad_norm': 1128.9825837295039, 'learning_rate': 1.1999389413457574e-06, 'epoch': 1.18} +{'loss': 88.8208, 'grad_norm': 989.8468066122264, 'learning_rate': 1.1978692325484142e-06, 'epoch': 1.18} +{'loss': 90.2922, 'grad_norm': 1075.3625154300782, 'learning_rate': 1.1958011965308624e-06, 'epoch': 1.18} +{'loss': 89.6188, 'grad_norm': 977.1132502201386, 'learning_rate': 1.1937348336861165e-06, 'epoch': 1.18} +{'loss': 90.6429, 'grad_norm': 1096.6373490417172, 'learning_rate': 1.1916701444068724e-06, 'epoch': 1.18} +{'loss': 90.8466, 'grad_norm': 1116.4252729089176, 'learning_rate': 1.189607129085507e-06, 'epoch': 1.18} +{'loss': 87.4463, 'grad_norm': 1097.2915496699327, 'learning_rate': 1.187545788114084e-06, 'epoch': 1.18} +{'loss': 89.3668, 'grad_norm': 982.6195422346237, 'learning_rate': 1.1854861218843472e-06, 'epoch': 1.18} +{'loss': 90.1935, 'grad_norm': 1070.6026776973467, 'learning_rate': 1.1834281307877194e-06, 'epoch': 1.18} +{'loss': 90.3886, 'grad_norm': 1097.202250472509, 'learning_rate': 1.1813718152153054e-06, 'epoch': 1.18} +{'loss': 90.4274, 'grad_norm': 1379.070439434496, 'learning_rate': 1.1793171755578913e-06, 'epoch': 1.18} +{'loss': 91.8355, 'grad_norm': 1042.3814062028284, 'learning_rate': 1.1772642122059496e-06, 'epoch': 1.18} +{'loss': 90.177, 'grad_norm': 1027.3797419457574, 'learning_rate': 1.1752129255496314e-06, 'epoch': 1.18} +{'loss': 92.054, 'grad_norm': 1061.05141156769, 'learning_rate': 1.173163315978766e-06, 'epoch': 1.18} +{'loss': 90.9753, 'grad_norm': 1099.5772014620666, 'learning_rate': 1.1711153838828683e-06, 'epoch': 1.18} +{'loss': 89.8331, 'grad_norm': 1086.2522216220655, 'learning_rate': 1.1690691296511291e-06, 'epoch': 1.18} +{'loss': 88.4648, 'grad_norm': 1085.0678428039705, 'learning_rate': 1.1670245536724267e-06, 'epoch': 1.18} +{'loss': 89.3703, 'grad_norm': 1037.2267191891465, 'learning_rate': 1.164981656335319e-06, 'epoch': 1.18} +{'loss': 91.9609, 'grad_norm': 1097.6694107573562, 'learning_rate': 1.1629404380280417e-06, 'epoch': 1.19} +{'loss': 91.2961, 'grad_norm': 1070.5417837346174, 'learning_rate': 1.160900899138514e-06, 'epoch': 1.19} +{'loss': 91.3113, 'grad_norm': 1120.7016991017479, 'learning_rate': 1.1588630400543344e-06, 'epoch': 1.19} +{'loss': 88.6137, 'grad_norm': 1128.7770190756992, 'learning_rate': 1.156826861162781e-06, 'epoch': 1.19} +{'loss': 88.0625, 'grad_norm': 
1001.5371814023371, 'learning_rate': 1.1547923628508162e-06, 'epoch': 1.19} +{'loss': 89.4605, 'grad_norm': 1144.0816151666986, 'learning_rate': 1.1527595455050844e-06, 'epoch': 1.19} +{'loss': 88.3579, 'grad_norm': 1113.9847839910867, 'learning_rate': 1.1507284095119042e-06, 'epoch': 1.19} +{'loss': 90.2667, 'grad_norm': 1135.7469978572392, 'learning_rate': 1.148698955257279e-06, 'epoch': 1.19} +{'loss': 91.5523, 'grad_norm': 1006.1506639686432, 'learning_rate': 1.1466711831268895e-06, 'epoch': 1.19} +{'loss': 90.8253, 'grad_norm': 1286.1453888402075, 'learning_rate': 1.1446450935060992e-06, 'epoch': 1.19} +{'loss': 94.9389, 'grad_norm': 9864.226826690936, 'learning_rate': 1.1426206867799561e-06, 'epoch': 1.19} +{'loss': 91.7602, 'grad_norm': 1161.7255481050981, 'learning_rate': 1.1405979633331798e-06, 'epoch': 1.19} +{'loss': 89.5524, 'grad_norm': 1091.0759177565935, 'learning_rate': 1.1385769235501742e-06, 'epoch': 1.19} +{'loss': 90.4095, 'grad_norm': 1086.863451720264, 'learning_rate': 1.136557567815021e-06, 'epoch': 1.19} +{'loss': 91.9396, 'grad_norm': 1060.7613318485683, 'learning_rate': 1.1345398965114861e-06, 'epoch': 1.19} +{'loss': 91.6961, 'grad_norm': 1006.2540805013022, 'learning_rate': 1.1325239100230157e-06, 'epoch': 1.19} +{'loss': 93.2539, 'grad_norm': 1019.9513655653617, 'learning_rate': 1.1305096087327294e-06, 'epoch': 1.19} +{'loss': 90.5293, 'grad_norm': 1127.9898510774178, 'learning_rate': 1.1284969930234313e-06, 'epoch': 1.19} +{'loss': 88.8443, 'grad_norm': 1000.8115603470346, 'learning_rate': 1.126486063277602e-06, 'epoch': 1.19} +{'loss': 88.9219, 'grad_norm': 1088.9740414972327, 'learning_rate': 1.1244768198774047e-06, 'epoch': 1.19} +{'loss': 89.3265, 'grad_norm': 1025.7777450089056, 'learning_rate': 1.1224692632046864e-06, 'epoch': 1.19} +{'loss': 91.6837, 'grad_norm': 1117.2578312413452, 'learning_rate': 1.1204633936409625e-06, 'epoch': 1.19} +{'loss': 89.1999, 'grad_norm': 1087.4383317164104, 'learning_rate': 1.1184592115674364e-06, 'epoch': 1.19} +{'loss': 88.2122, 'grad_norm': 1016.5131252335938, 'learning_rate': 1.1164567173649842e-06, 'epoch': 1.19} +{'loss': 89.9984, 'grad_norm': 1156.0309051107156, 'learning_rate': 1.1144559114141683e-06, 'epoch': 1.19} +{'loss': 91.1201, 'grad_norm': 1028.4204100187892, 'learning_rate': 1.112456794095229e-06, 'epoch': 1.19} +{'loss': 88.8127, 'grad_norm': 1007.3523814471213, 'learning_rate': 1.1104593657880812e-06, 'epoch': 1.19} +{'loss': 89.285, 'grad_norm': 1118.5829210974503, 'learning_rate': 1.1084636268723226e-06, 'epoch': 1.19} +{'loss': 89.9541, 'grad_norm': 1099.461620140619, 'learning_rate': 1.106469577727224e-06, 'epoch': 1.19} +{'loss': 89.9998, 'grad_norm': 1066.5267771526321, 'learning_rate': 1.1044772187317443e-06, 'epoch': 1.19} +{'loss': 89.4657, 'grad_norm': 1200.4534932266422, 'learning_rate': 1.1024865502645177e-06, 'epoch': 1.19} +{'loss': 89.8569, 'grad_norm': 1069.2508113766764, 'learning_rate': 1.1004975727038536e-06, 'epoch': 1.19} +{'loss': 88.0561, 'grad_norm': 1029.2933552829245, 'learning_rate': 1.098510286427743e-06, 'epoch': 1.19} +{'loss': 89.8023, 'grad_norm': 1120.6890053198892, 'learning_rate': 1.0965246918138529e-06, 'epoch': 1.19} +{'loss': 87.7889, 'grad_norm': 1019.6591990595432, 'learning_rate': 1.0945407892395321e-06, 'epoch': 1.19} +{'loss': 90.2906, 'grad_norm': 1014.582009000927, 'learning_rate': 1.0925585790818093e-06, 'epoch': 1.19} +{'loss': 90.4463, 'grad_norm': 1100.8970427461647, 'learning_rate': 1.0905780617173866e-06, 'epoch': 1.19} +{'loss': 90.1501, 'grad_norm': 
934.9528148307606, 'learning_rate': 1.0885992375226472e-06, 'epoch': 1.19} +{'loss': 87.4981, 'grad_norm': 1074.658146349437, 'learning_rate': 1.0866221068736504e-06, 'epoch': 1.19} +[2025-05-30 04:22:39,074] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step62000 is about to be saved! +[2025-05-30 04:22:39,116] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-62000/global_step62000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-30 04:22:39,116] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-62000/global_step62000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-30 04:22:39,214] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-62000/global_step62000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-30 04:22:39,241] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-62000/global_step62000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-30 04:28:27,944] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-62000/global_step62000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2025-05-30 04:28:28,042] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-62000/global_step62000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-30 04:28:29,538] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step62000 is ready now! +[2025-05-30 04:28:29,589] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step62000 is about to be saved! +[2025-05-30 04:28:29,614] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-62000/global_step62000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-30 04:28:29,614] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-62000/global_step62000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-30 04:28:29,796] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-62000/global_step62000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-30 04:28:30,220] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-62000/global_step62000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-30 04:34:47,530] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-62000/global_step62000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2025-05-30 04:34:47,586] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-62000/global_step62000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-30 04:34:53,633] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step62000 is ready now! 
+{'loss': 90.0052, 'grad_norm': 1551.5634854770753, 'learning_rate': 1.0846466701461378e-06, 'epoch': 1.19} +{'loss': 90.3737, 'grad_norm': 1068.149911864991, 'learning_rate': 1.0826729277155224e-06, 'epoch': 1.19} +{'loss': 89.6711, 'grad_norm': 1021.8513583504274, 'learning_rate': 1.0807008799569052e-06, 'epoch': 1.19} +{'loss': 91.1081, 'grad_norm': 1205.5049115794736, 'learning_rate': 1.0787305272450553e-06, 'epoch': 1.19} +{'loss': 87.9784, 'grad_norm': 1229.808393329302, 'learning_rate': 1.0767618699544224e-06, 'epoch': 1.19} +{'loss': 89.7653, 'grad_norm': 1124.7733624680914, 'learning_rate': 1.0747949084591402e-06, 'epoch': 1.19} +{'loss': 89.6926, 'grad_norm': 994.9835555510065, 'learning_rate': 1.0728296431330098e-06, 'epoch': 1.19} +{'loss': 89.082, 'grad_norm': 1041.2542241781525, 'learning_rate': 1.0708660743495203e-06, 'epoch': 1.19} +{'loss': 89.4907, 'grad_norm': 1051.7187681458224, 'learning_rate': 1.0689042024818307e-06, 'epoch': 1.19} +{'loss': 90.4547, 'grad_norm': 1121.0322242043856, 'learning_rate': 1.0669440279027788e-06, 'epoch': 1.19} +{'loss': 90.4016, 'grad_norm': 973.6049675973668, 'learning_rate': 1.0649855509848838e-06, 'epoch': 1.19} +{'loss': 88.7435, 'grad_norm': 1067.3593996561494, 'learning_rate': 1.0630287721003385e-06, 'epoch': 1.19} +{'loss': 90.3322, 'grad_norm': 1000.4925881114918, 'learning_rate': 1.0610736916210152e-06, 'epoch': 1.19} +{'loss': 90.7946, 'grad_norm': 1055.4718486031948, 'learning_rate': 1.0591203099184633e-06, 'epoch': 1.2} +{'loss': 89.7704, 'grad_norm': 1071.602510189393, 'learning_rate': 1.0571686273639049e-06, 'epoch': 1.2} +{'loss': 88.79, 'grad_norm': 1064.5129866916075, 'learning_rate': 1.0552186443282464e-06, 'epoch': 1.2} +{'loss': 89.1371, 'grad_norm': 1037.7982733743786, 'learning_rate': 1.053270361182065e-06, 'epoch': 1.2} +{'loss': 90.4338, 'grad_norm': 1183.228414935971, 'learning_rate': 1.0513237782956199e-06, 'epoch': 1.2} +{'loss': 90.2717, 'grad_norm': 1039.4450336498378, 'learning_rate': 1.049378896038844e-06, 'epoch': 1.2} +{'loss': 91.3785, 'grad_norm': 1133.6130058182846, 'learning_rate': 1.0474357147813463e-06, 'epoch': 1.2} +{'loss': 90.772, 'grad_norm': 1138.8173435539775, 'learning_rate': 1.0454942348924157e-06, 'epoch': 1.2} +{'loss': 89.3799, 'grad_norm': 1039.9883009767061, 'learning_rate': 1.043554456741015e-06, 'epoch': 1.2} +{'loss': 90.9896, 'grad_norm': 1199.714425078007, 'learning_rate': 1.0416163806957857e-06, 'epoch': 1.2} +{'loss': 89.9625, 'grad_norm': 1096.966983813145, 'learning_rate': 1.0396800071250458e-06, 'epoch': 1.2} +{'loss': 92.8259, 'grad_norm': 1071.269492524814, 'learning_rate': 1.037745336396785e-06, 'epoch': 1.2} +{'loss': 88.186, 'grad_norm': 1002.1451538395459, 'learning_rate': 1.035812368878677e-06, 'epoch': 1.2} +{'loss': 88.2229, 'grad_norm': 1040.4980391863833, 'learning_rate': 1.0338811049380636e-06, 'epoch': 1.2} +{'loss': 90.7987, 'grad_norm': 1023.5001641414258, 'learning_rate': 1.0319515449419737e-06, 'epoch': 1.2} +{'loss': 87.5336, 'grad_norm': 1056.3734014000438, 'learning_rate': 1.0300236892571013e-06, 'epoch': 1.2} +{'loss': 91.2443, 'grad_norm': 973.9631064528645, 'learning_rate': 1.0280975382498225e-06, 'epoch': 1.2} +{'loss': 91.9932, 'grad_norm': 1089.2783577968296, 'learning_rate': 1.0261730922861845e-06, 'epoch': 1.2} +{'loss': 89.6611, 'grad_norm': 1009.1070629639104, 'learning_rate': 1.024250351731918e-06, 'epoch': 1.2} +{'loss': 90.4855, 'grad_norm': 1143.0624457371111, 'learning_rate': 1.0223293169524263e-06, 'epoch': 1.2} +{'loss': 89.0875, 'grad_norm': 
1007.1113309947477, 'learning_rate': 1.020409988312786e-06, 'epoch': 1.2} +{'loss': 89.2111, 'grad_norm': 1341.8631575404606, 'learning_rate': 1.0184923661777523e-06, 'epoch': 1.2} +{'loss': 90.0918, 'grad_norm': 1113.4636979933393, 'learning_rate': 1.0165764509117514e-06, 'epoch': 1.2} +{'loss': 89.7477, 'grad_norm': 1090.399979474299, 'learning_rate': 1.0146622428788943e-06, 'epoch': 1.2} +{'loss': 88.5831, 'grad_norm': 1122.7357379856699, 'learning_rate': 1.0127497424429577e-06, 'epoch': 1.2} +{'loss': 90.2224, 'grad_norm': 1178.2510389087176, 'learning_rate': 1.0108389499674008e-06, 'epoch': 1.2} +{'loss': 89.8232, 'grad_norm': 1172.9120906113333, 'learning_rate': 1.0089298658153556e-06, 'epoch': 1.2} +{'loss': 89.9483, 'grad_norm': 1083.8171947152941, 'learning_rate': 1.007022490349626e-06, 'epoch': 1.2} +{'loss': 87.2921, 'grad_norm': 1059.1447177436958, 'learning_rate': 1.0051168239327002e-06, 'epoch': 1.2} +{'loss': 89.5508, 'grad_norm': 1014.739593483353, 'learning_rate': 1.0032128669267306e-06, 'epoch': 1.2} +{'loss': 99.5556, 'grad_norm': 1070.0296516069518, 'learning_rate': 1.0013106196935528e-06, 'epoch': 1.2} +{'loss': 88.1592, 'grad_norm': 1076.9997374282482, 'learning_rate': 9.994100825946763e-07, 'epoch': 1.2} +{'loss': 88.6358, 'grad_norm': 986.7039797554721, 'learning_rate': 9.97511255991279e-07, 'epoch': 1.2} +{'loss': 93.246, 'grad_norm': 1191.1331852483368, 'learning_rate': 9.956141402442243e-07, 'epoch': 1.2} +{'loss': 89.9264, 'grad_norm': 1024.023205257407, 'learning_rate': 9.93718735714041e-07, 'epoch': 1.2} +{'loss': 88.0329, 'grad_norm': 1005.8093321336617, 'learning_rate': 9.918250427609411e-07, 'epoch': 1.2} +{'loss': 90.6464, 'grad_norm': 1070.4306963846616, 'learning_rate': 9.899330617448033e-07, 'epoch': 1.2} +{'loss': 88.7228, 'grad_norm': 1045.5151170831193, 'learning_rate': 9.880427930251834e-07, 'epoch': 1.2} +{'loss': 89.3981, 'grad_norm': 1165.694141684441, 'learning_rate': 9.861542369613176e-07, 'epoch': 1.2} +{'loss': 89.1914, 'grad_norm': 1130.7305636576268, 'learning_rate': 9.842673939121084e-07, 'epoch': 1.2} +{'loss': 90.3114, 'grad_norm': 1182.3800381168535, 'learning_rate': 9.823822642361381e-07, 'epoch': 1.2} +{'loss': 91.2085, 'grad_norm': 1131.1050351669949, 'learning_rate': 9.80498848291661e-07, 'epoch': 1.2} +{'loss': 89.6349, 'grad_norm': 1070.7636815047097, 'learning_rate': 9.786171464366057e-07, 'epoch': 1.2} +{'loss': 88.1699, 'grad_norm': 1173.311171994486, 'learning_rate': 9.76737159028578e-07, 'epoch': 1.2} +{'loss': 89.9851, 'grad_norm': 1046.1955929564267, 'learning_rate': 9.74858886424852e-07, 'epoch': 1.2} +{'loss': 89.1711, 'grad_norm': 1096.9243475749383, 'learning_rate': 9.729823289823826e-07, 'epoch': 1.2} +{'loss': 91.4638, 'grad_norm': 1105.0863869436855, 'learning_rate': 9.711074870577952e-07, 'epoch': 1.2} +{'loss': 89.6531, 'grad_norm': 931.6633524105691, 'learning_rate': 9.692343610073874e-07, 'epoch': 1.2} +{'loss': 88.852, 'grad_norm': 1211.3409252177903, 'learning_rate': 9.673629511871363e-07, 'epoch': 1.2} +{'loss': 88.6839, 'grad_norm': 1031.0203468819968, 'learning_rate': 9.654932579526854e-07, 'epoch': 1.2} +{'loss': 88.9684, 'grad_norm': 1095.507042806996, 'learning_rate': 9.636252816593606e-07, 'epoch': 1.2} +{'loss': 92.9338, 'grad_norm': 1140.8050080175385, 'learning_rate': 9.617590226621543e-07, 'epoch': 1.2} +{'loss': 91.7519, 'grad_norm': 1097.7115623621023, 'learning_rate': 9.59894481315734e-07, 'epoch': 1.21} +{'loss': 88.7403, 'grad_norm': 1123.3129244329896, 'learning_rate': 9.580316579744453e-07, 
'epoch': 1.21} +{'loss': 91.1316, 'grad_norm': 990.4341264541902, 'learning_rate': 9.561705529923027e-07, 'epoch': 1.21} +{'loss': 90.852, 'grad_norm': 1254.8770203198549, 'learning_rate': 9.543111667229932e-07, 'epoch': 1.21} +{'loss': 88.8215, 'grad_norm': 1097.1585152744237, 'learning_rate': 9.52453499519882e-07, 'epoch': 1.21} +{'loss': 86.7569, 'grad_norm': 1067.4102212121263, 'learning_rate': 9.505975517360033e-07, 'epoch': 1.21} +{'loss': 88.1831, 'grad_norm': 1080.3553716693916, 'learning_rate': 9.487433237240695e-07, 'epoch': 1.21} +{'loss': 89.4, 'grad_norm': 997.8816568949109, 'learning_rate': 9.468908158364598e-07, 'epoch': 1.21} +{'loss': 87.9497, 'grad_norm': 1072.7707787124136, 'learning_rate': 9.450400284252281e-07, 'epoch': 1.21} +{'loss': 89.658, 'grad_norm': 996.222326126536, 'learning_rate': 9.431909618421076e-07, 'epoch': 1.21} +{'loss': 89.4344, 'grad_norm': 1087.6606278042532, 'learning_rate': 9.41343616438496e-07, 'epoch': 1.21} +{'loss': 89.6398, 'grad_norm': 1131.7195991225658, 'learning_rate': 9.39497992565469e-07, 'epoch': 1.21} +{'loss': 91.6305, 'grad_norm': 1092.3614230687956, 'learning_rate': 9.376540905737752e-07, 'epoch': 1.21} +{'loss': 88.4635, 'grad_norm': 1067.5925974972572, 'learning_rate': 9.358119108138309e-07, 'epoch': 1.21} +{'loss': 86.4593, 'grad_norm': 1049.9952535177713, 'learning_rate': 9.339714536357313e-07, 'epoch': 1.21} +{'loss': 89.2448, 'grad_norm': 1116.8385282310664, 'learning_rate': 9.321327193892405e-07, 'epoch': 1.21} +{'loss': 88.2816, 'grad_norm': 1074.2782973720418, 'learning_rate': 9.302957084237984e-07, 'epoch': 1.21} +{'loss': 88.0492, 'grad_norm': 1041.919072279686, 'learning_rate': 9.28460421088514e-07, 'epoch': 1.21} +{'loss': 86.9952, 'grad_norm': 978.3043304371496, 'learning_rate': 9.266268577321679e-07, 'epoch': 1.21} +{'loss': 90.3065, 'grad_norm': 1116.0146220160018, 'learning_rate': 9.247950187032195e-07, 'epoch': 1.21} +{'loss': 86.5929, 'grad_norm': 991.1986808710177, 'learning_rate': 9.229649043497924e-07, 'epoch': 1.21} +{'loss': 89.8317, 'grad_norm': 1062.8867150807323, 'learning_rate': 9.211365150196883e-07, 'epoch': 1.21} +{'loss': 89.8272, 'grad_norm': 1042.3274766317486, 'learning_rate': 9.193098510603804e-07, 'epoch': 1.21} +{'loss': 92.1139, 'grad_norm': 1089.2097173159132, 'learning_rate': 9.174849128190077e-07, 'epoch': 1.21} +{'loss': 91.3031, 'grad_norm': 1110.1152685620023, 'learning_rate': 9.156617006423929e-07, 'epoch': 1.21} +{'loss': 89.9685, 'grad_norm': 1257.9565302036435, 'learning_rate': 9.138402148770176e-07, 'epoch': 1.21} +{'loss': 88.8022, 'grad_norm': 1097.7065870385654, 'learning_rate': 9.12020455869046e-07, 'epoch': 1.21} +{'loss': 89.643, 'grad_norm': 1056.928995335234, 'learning_rate': 9.102024239643092e-07, 'epoch': 1.21} +{'loss': 89.1182, 'grad_norm': 982.7138811523498, 'learning_rate': 9.083861195083088e-07, 'epoch': 1.21} +{'loss': 88.512, 'grad_norm': 1151.6442202227656, 'learning_rate': 9.065715428462218e-07, 'epoch': 1.21} +{'loss': 88.8123, 'grad_norm': 1023.7853215364381, 'learning_rate': 9.047586943228925e-07, 'epoch': 1.21} +{'loss': 87.3212, 'grad_norm': 1200.6802248616045, 'learning_rate': 9.029475742828442e-07, 'epoch': 1.21} +{'loss': 89.609, 'grad_norm': 1012.7983340710929, 'learning_rate': 9.011381830702626e-07, 'epoch': 1.21} +{'loss': 88.5453, 'grad_norm': 1103.2600465575097, 'learning_rate': 8.993305210290093e-07, 'epoch': 1.21} +{'loss': 90.2521, 'grad_norm': 993.1502104831889, 'learning_rate': 8.975245885026207e-07, 'epoch': 1.21} +{'loss': 89.602, 'grad_norm': 
1061.5216507853306, 'learning_rate': 8.957203858342956e-07, 'epoch': 1.21} +{'loss': 89.2821, 'grad_norm': 998.3942859407268, 'learning_rate': 8.93917913366914e-07, 'epoch': 1.21} +{'loss': 87.0529, 'grad_norm': 1117.5030937304537, 'learning_rate': 8.92117171443021e-07, 'epoch': 1.21} +{'loss': 86.2705, 'grad_norm': 1080.3378899230543, 'learning_rate': 8.903181604048339e-07, 'epoch': 1.21} +{'loss': 90.2404, 'grad_norm': 999.3423937542946, 'learning_rate': 8.885208805942391e-07, 'epoch': 1.21} +{'loss': 89.9337, 'grad_norm': 1147.0522952235283, 'learning_rate': 8.867253323527991e-07, 'epoch': 1.21} +{'loss': 88.6099, 'grad_norm': 1009.7029260108718, 'learning_rate': 8.849315160217465e-07, 'epoch': 1.21} +{'loss': 87.9439, 'grad_norm': 1028.1739337265863, 'learning_rate': 8.831394319419795e-07, 'epoch': 1.21} +{'loss': 87.5991, 'grad_norm': 1083.0895480189183, 'learning_rate': 8.813490804540714e-07, 'epoch': 1.21} +{'loss': 91.4831, 'grad_norm': 1165.0275780358436, 'learning_rate': 8.795604618982645e-07, 'epoch': 1.21} +{'loss': 88.796, 'grad_norm': 1110.9734820806427, 'learning_rate': 8.777735766144712e-07, 'epoch': 1.21} +{'loss': 90.0429, 'grad_norm': 1244.537842138518, 'learning_rate': 8.759884249422812e-07, 'epoch': 1.21} +{'loss': 87.6924, 'grad_norm': 1030.742807172026, 'learning_rate': 8.742050072209451e-07, 'epoch': 1.21} +{'loss': 87.2101, 'grad_norm': 1101.2062227769445, 'learning_rate': 8.724233237893897e-07, 'epoch': 1.21} +{'loss': 90.4212, 'grad_norm': 1253.228333771848, 'learning_rate': 8.706433749862087e-07, 'epoch': 1.21} +{'loss': 87.9088, 'grad_norm': 1061.0050956790153, 'learning_rate': 8.68865161149669e-07, 'epoch': 1.21} +{'loss': 90.2652, 'grad_norm': 1026.3688411752835, 'learning_rate': 8.670886826177094e-07, 'epoch': 1.21} +{'loss': 88.5118, 'grad_norm': 1094.0724770094746, 'learning_rate': 8.653139397279342e-07, 'epoch': 1.22} +{'loss': 90.2087, 'grad_norm': 1122.7476875200355, 'learning_rate': 8.635409328176214e-07, 'epoch': 1.22} +{'loss': 90.9953, 'grad_norm': 1079.3417387037046, 'learning_rate': 8.617696622237159e-07, 'epoch': 1.22} +{'loss': 90.6983, 'grad_norm': 1068.5777379651488, 'learning_rate': 8.600001282828341e-07, 'epoch': 1.22} +{'loss': 88.9849, 'grad_norm': 1051.3364562197432, 'learning_rate': 8.58232331331268e-07, 'epoch': 1.22} +{'loss': 88.8447, 'grad_norm': 1017.1536895884841, 'learning_rate': 8.564662717049699e-07, 'epoch': 1.22} +{'loss': 86.9618, 'grad_norm': 997.5765080494476, 'learning_rate': 8.54701949739567e-07, 'epoch': 1.22} +{'loss': 87.727, 'grad_norm': 1054.1340514149754, 'learning_rate': 8.529393657703544e-07, 'epoch': 1.22} +{'loss': 89.77, 'grad_norm': 1044.4868198102263, 'learning_rate': 8.511785201322998e-07, 'epoch': 1.22} +{'loss': 89.3118, 'grad_norm': 1098.6154161374914, 'learning_rate': 8.494194131600408e-07, 'epoch': 1.22} +{'loss': 89.5048, 'grad_norm': 1121.3960340174792, 'learning_rate': 8.476620451878803e-07, 'epoch': 1.22} +{'loss': 89.4715, 'grad_norm': 1105.199934403933, 'learning_rate': 8.459064165497932e-07, 'epoch': 1.22} +{'loss': 87.9746, 'grad_norm': 1096.5096676866121, 'learning_rate': 8.441525275794215e-07, 'epoch': 1.22} +{'loss': 90.0374, 'grad_norm': 1215.8737775316865, 'learning_rate': 8.42400378610082e-07, 'epoch': 1.22} +{'loss': 87.9551, 'grad_norm': 957.180492099868, 'learning_rate': 8.406499699747584e-07, 'epoch': 1.22} +{'loss': 88.9446, 'grad_norm': 1047.9654453769112, 'learning_rate': 8.389013020061021e-07, 'epoch': 1.22} +{'loss': 88.7135, 'grad_norm': 1116.1369572026513, 'learning_rate': 
8.371543750364319e-07, 'epoch': 1.22} +{'loss': 88.7866, 'grad_norm': 1000.5025387146026, 'learning_rate': 8.354091893977401e-07, 'epoch': 1.22} +{'loss': 91.6669, 'grad_norm': 1083.5695104558852, 'learning_rate': 8.336657454216856e-07, 'epoch': 1.22} +{'loss': 89.9217, 'grad_norm': 1189.3914829231971, 'learning_rate': 8.319240434395992e-07, 'epoch': 1.22} +{'loss': 89.2717, 'grad_norm': 950.1038913238425, 'learning_rate': 8.301840837824782e-07, 'epoch': 1.22} +{'loss': 89.7995, 'grad_norm': 918.6500451478465, 'learning_rate': 8.28445866780987e-07, 'epoch': 1.22} +{'loss': 89.3927, 'grad_norm': 1034.5323759796725, 'learning_rate': 8.267093927654623e-07, 'epoch': 1.22} +{'loss': 90.2842, 'grad_norm': 1065.6112399449369, 'learning_rate': 8.249746620659049e-07, 'epoch': 1.22} +{'loss': 89.3214, 'grad_norm': 1083.4317817618808, 'learning_rate': 8.232416750119921e-07, 'epoch': 1.22} +{'loss': 88.1062, 'grad_norm': 1157.099211596909, 'learning_rate': 8.215104319330636e-07, 'epoch': 1.22} +{'loss': 86.1724, 'grad_norm': 1033.475888796802, 'learning_rate': 8.197809331581297e-07, 'epoch': 1.22} +{'loss': 89.6818, 'grad_norm': 1031.5517248334172, 'learning_rate': 8.180531790158674e-07, 'epoch': 1.22} +{'loss': 87.395, 'grad_norm': 1066.0626659623001, 'learning_rate': 8.163271698346242e-07, 'epoch': 1.22} +{'loss': 90.9426, 'grad_norm': 1126.3059600042866, 'learning_rate': 8.146029059424143e-07, 'epoch': 1.22} +{'loss': 91.4389, 'grad_norm': 1101.5034617391932, 'learning_rate': 8.128803876669256e-07, 'epoch': 1.22} +{'loss': 90.0356, 'grad_norm': 1224.7316283641194, 'learning_rate': 8.111596153355061e-07, 'epoch': 1.22} +{'loss': 91.638, 'grad_norm': 993.5951145660347, 'learning_rate': 8.094405892751766e-07, 'epoch': 1.22} +{'loss': 89.3296, 'grad_norm': 1173.0177423915409, 'learning_rate': 8.077233098126247e-07, 'epoch': 1.22} +{'loss': 89.1984, 'grad_norm': 1612.2957281406461, 'learning_rate': 8.060077772742059e-07, 'epoch': 1.22} +{'loss': 89.6781, 'grad_norm': 1068.995271891377, 'learning_rate': 8.042939919859482e-07, 'epoch': 1.22} +{'loss': 88.634, 'grad_norm': 1019.7542101812413, 'learning_rate': 8.025819542735424e-07, 'epoch': 1.22} +{'loss': 87.3363, 'grad_norm': 1056.240799759289, 'learning_rate': 8.008716644623471e-07, 'epoch': 1.22} +{'loss': 90.2811, 'grad_norm': 995.3201976822685, 'learning_rate': 7.991631228773889e-07, 'epoch': 1.22} +{'loss': 89.0568, 'grad_norm': 1024.8888660430441, 'learning_rate': 7.974563298433646e-07, 'epoch': 1.22} +{'loss': 88.5063, 'grad_norm': 1027.7305516488038, 'learning_rate': 7.957512856846417e-07, 'epoch': 1.22} +{'loss': 87.964, 'grad_norm': 1134.7069791357674, 'learning_rate': 7.940479907252463e-07, 'epoch': 1.22} +{'loss': 88.1113, 'grad_norm': 1008.3271086671348, 'learning_rate': 7.923464452888796e-07, 'epoch': 1.22} +{'loss': 89.4476, 'grad_norm': 984.31351972484, 'learning_rate': 7.906466496989051e-07, 'epoch': 1.22} +{'loss': 89.6882, 'grad_norm': 1189.347482036099, 'learning_rate': 7.889486042783579e-07, 'epoch': 1.22} +{'loss': 88.0329, 'grad_norm': 994.3518038298049, 'learning_rate': 7.872523093499396e-07, 'epoch': 1.22} +{'loss': 88.6179, 'grad_norm': 1082.234972362621, 'learning_rate': 7.855577652360191e-07, 'epoch': 1.22} +{'loss': 89.2778, 'grad_norm': 1048.640692573571, 'learning_rate': 7.8386497225863e-07, 'epoch': 1.22} +{'loss': 88.3818, 'grad_norm': 1025.8419406265018, 'learning_rate': 7.821739307394749e-07, 'epoch': 1.22} +{'loss': 88.7068, 'grad_norm': 931.6120240387821, 'learning_rate': 7.804846409999245e-07, 'epoch': 1.22} 
+{'loss': 88.0962, 'grad_norm': 1007.5532796115444, 'learning_rate': 7.787971033610176e-07, 'epoch': 1.22} +{'loss': 90.3369, 'grad_norm': 994.3223600702645, 'learning_rate': 7.771113181434564e-07, 'epoch': 1.22} +{'loss': 88.9386, 'grad_norm': 1069.278187782002, 'learning_rate': 7.754272856676126e-07, 'epoch': 1.23} +{'loss': 88.9485, 'grad_norm': 1030.2843428186973, 'learning_rate': 7.737450062535224e-07, 'epoch': 1.23} +{'loss': 91.827, 'grad_norm': 1194.605441224065, 'learning_rate': 7.720644802208909e-07, 'epoch': 1.23} +{'loss': 89.2204, 'grad_norm': 1030.8457782416076, 'learning_rate': 7.703857078890919e-07, 'epoch': 1.23} +{'loss': 90.3071, 'grad_norm': 1128.5397996236798, 'learning_rate': 7.687086895771622e-07, 'epoch': 1.23} +{'loss': 90.4261, 'grad_norm': 1008.0431472912688, 'learning_rate': 7.670334256038081e-07, 'epoch': 1.23} +{'loss': 88.8407, 'grad_norm': 1005.6707650298454, 'learning_rate': 7.65359916287397e-07, 'epoch': 1.23} +{'loss': 89.9202, 'grad_norm': 1052.8441989641562, 'learning_rate': 7.636881619459724e-07, 'epoch': 1.23} +{'loss': 88.6695, 'grad_norm': 968.4914173975645, 'learning_rate': 7.620181628972334e-07, 'epoch': 1.23} +{'loss': 91.5045, 'grad_norm': 1114.2486413524075, 'learning_rate': 7.603499194585562e-07, 'epoch': 1.23} +{'loss': 88.2283, 'grad_norm': 1252.1450173823455, 'learning_rate': 7.586834319469749e-07, 'epoch': 1.23} +{'loss': 88.0104, 'grad_norm': 1084.026572677962, 'learning_rate': 7.570187006791951e-07, 'epoch': 1.23} +{'loss': 89.0898, 'grad_norm': 1039.0201656433494, 'learning_rate': 7.553557259715827e-07, 'epoch': 1.23} +{'loss': 88.8237, 'grad_norm': 1205.5343979225258, 'learning_rate': 7.536945081401769e-07, 'epoch': 1.23} +{'loss': 89.3341, 'grad_norm': 1046.5570345713068, 'learning_rate': 7.52035047500681e-07, 'epoch': 1.23} +{'loss': 88.6506, 'grad_norm': 997.0346618576497, 'learning_rate': 7.503773443684615e-07, 'epoch': 1.23} +{'loss': 88.2894, 'grad_norm': 1059.689832623854, 'learning_rate': 7.487213990585529e-07, 'epoch': 1.23} +{'loss': 86.6521, 'grad_norm': 1039.8202110713246, 'learning_rate': 7.470672118856537e-07, 'epoch': 1.23} +{'loss': 88.322, 'grad_norm': 1035.6575144230867, 'learning_rate': 7.454147831641312e-07, 'epoch': 1.23} +{'loss': 85.7535, 'grad_norm': 1009.8171285471532, 'learning_rate': 7.437641132080187e-07, 'epoch': 1.23} +{'loss': 91.0006, 'grad_norm': 1142.455217146979, 'learning_rate': 7.42115202331013e-07, 'epoch': 1.23} +{'loss': 88.6341, 'grad_norm': 1086.5022782051701, 'learning_rate': 7.404680508464767e-07, 'epoch': 1.23} +{'loss': 86.6843, 'grad_norm': 1073.0351116992524, 'learning_rate': 7.388226590674374e-07, 'epoch': 1.23} +{'loss': 89.3119, 'grad_norm': 1103.0363501539543, 'learning_rate': 7.371790273065916e-07, 'epoch': 1.23} +{'loss': 89.3687, 'grad_norm': 1134.708862377482, 'learning_rate': 7.355371558763002e-07, 'epoch': 1.23} +{'loss': 88.2424, 'grad_norm': 1013.2828836076771, 'learning_rate': 7.338970450885885e-07, 'epoch': 1.23} +{'loss': 87.1044, 'grad_norm': 1108.7069616960298, 'learning_rate': 7.322586952551447e-07, 'epoch': 1.23} +{'loss': 87.9608, 'grad_norm': 974.2294543731725, 'learning_rate': 7.306221066873265e-07, 'epoch': 1.23} +{'loss': 89.2735, 'grad_norm': 981.9901307586537, 'learning_rate': 7.28987279696155e-07, 'epoch': 1.23} +{'loss': 89.4793, 'grad_norm': 1128.3019603582986, 'learning_rate': 7.273542145923196e-07, 'epoch': 1.23} +{'loss': 91.0723, 'grad_norm': 1091.6959417303601, 'learning_rate': 7.257229116861708e-07, 'epoch': 1.23} +[2025-05-30 06:22:57,027] [INFO] 
[logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step64000 is about to be saved! +[2025-05-30 06:22:57,077] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-64000/global_step64000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-30 06:22:57,077] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-64000/global_step64000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-30 06:22:57,207] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-64000/global_step64000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-30 06:22:57,341] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-64000/global_step64000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-30 06:31:32,942] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-64000/global_step64000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2025-05-30 06:31:33,022] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-64000/global_step64000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-30 06:31:34,070] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step64000 is ready now! +[2025-05-30 06:31:34,123] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step64000 is about to be saved! +[2025-05-30 06:31:34,146] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-64000/global_step64000/zero_pp_rank_0_mp_rank_00_model_states.pt +[2025-05-30 06:31:34,146] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-64000/global_step64000/zero_pp_rank_0_mp_rank_00_model_states.pt... +[2025-05-30 06:31:34,209] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-64000/global_step64000/zero_pp_rank_0_mp_rank_00_model_states.pt. +[2025-05-30 06:31:34,303] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-64000/global_step64000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2025-05-30 06:37:11,319] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-64000/global_step64000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2025-05-30 06:37:11,414] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-64000/global_step64000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2025-05-30 06:37:21,658] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step64000 is ready now! 
+{'loss': 87.9787, 'grad_norm': 1061.9958396184572, 'learning_rate': 7.24093371287724e-07, 'epoch': 1.23} +{'loss': 86.2009, 'grad_norm': 1033.4657002158333, 'learning_rate': 7.224655937066616e-07, 'epoch': 1.23} +{'loss': 89.7837, 'grad_norm': 994.1389556850361, 'learning_rate': 7.208395792523315e-07, 'epoch': 1.23} +{'loss': 90.2948, 'grad_norm': 1160.139260648496, 'learning_rate': 7.192153282337477e-07, 'epoch': 1.23} +{'loss': 89.6399, 'grad_norm': 1126.0931991017367, 'learning_rate': 7.175928409595844e-07, 'epoch': 1.23} +{'loss': 90.2257, 'grad_norm': 1104.396892873046, 'learning_rate': 7.15972117738184e-07, 'epoch': 1.23} +{'loss': 87.6985, 'grad_norm': 991.7808469091683, 'learning_rate': 7.143531588775509e-07, 'epoch': 1.23} +{'loss': 87.6374, 'grad_norm': 954.0592178748386, 'learning_rate': 7.127359646853593e-07, 'epoch': 1.23} +{'loss': 89.8419, 'grad_norm': 1189.8920280917366, 'learning_rate': 7.111205354689454e-07, 'epoch': 1.23} +{'loss': 86.6229, 'grad_norm': 1014.6525351749449, 'learning_rate': 7.095068715353082e-07, 'epoch': 1.23} +{'loss': 89.7372, 'grad_norm': 1089.7628636986476, 'learning_rate': 7.078949731911122e-07, 'epoch': 1.23} +{'loss': 89.5662, 'grad_norm': 1076.550570377061, 'learning_rate': 7.062848407426859e-07, 'epoch': 1.23} +{'loss': 91.0975, 'grad_norm': 1018.1141395563409, 'learning_rate': 7.046764744960255e-07, 'epoch': 1.23} +{'loss': 87.3072, 'grad_norm': 1059.6733776764638, 'learning_rate': 7.030698747567855e-07, 'epoch': 1.23} +{'loss': 93.286, 'grad_norm': 1256.626567223584, 'learning_rate': 7.014650418302938e-07, 'epoch': 1.23} +{'loss': 89.7531, 'grad_norm': 1079.9759714784213, 'learning_rate': 6.99861976021533e-07, 'epoch': 1.23} +{'loss': 89.9408, 'grad_norm': 1059.228499087811, 'learning_rate': 6.982606776351531e-07, 'epoch': 1.23} +{'loss': 88.5521, 'grad_norm': 1071.4799550246853, 'learning_rate': 6.966611469754714e-07, 'epoch': 1.23} +{'loss': 87.5901, 'grad_norm': 1038.3332059522331, 'learning_rate': 6.95063384346466e-07, 'epoch': 1.23} +{'loss': 89.7526, 'grad_norm': 1085.2087337515607, 'learning_rate': 6.934673900517808e-07, 'epoch': 1.23} +{'loss': 87.2416, 'grad_norm': 1086.0424091611258, 'learning_rate': 6.918731643947219e-07, 'epoch': 1.23} +{'loss': 89.0602, 'grad_norm': 1128.2375763869818, 'learning_rate': 6.90280707678258e-07, 'epoch': 1.24} +{'loss': 87.8301, 'grad_norm': 1039.0812829082042, 'learning_rate': 6.886900202050284e-07, 'epoch': 1.24} +{'loss': 92.0846, 'grad_norm': 1274.5455931319534, 'learning_rate': 6.871011022773267e-07, 'epoch': 1.24} +{'loss': 86.6419, 'grad_norm': 1147.9421288588878, 'learning_rate': 6.855139541971179e-07, 'epoch': 1.24} +{'loss': 88.191, 'grad_norm': 1056.050370702246, 'learning_rate': 6.839285762660275e-07, 'epoch': 1.24} +{'loss': 88.4798, 'grad_norm': 1120.4688590676612, 'learning_rate': 6.823449687853434e-07, 'epoch': 1.24} +{'loss': 86.9927, 'grad_norm': 1111.0286259351824, 'learning_rate': 6.807631320560204e-07, 'epoch': 1.24} +{'loss': 90.7758, 'grad_norm': 1117.4564196449437, 'learning_rate': 6.791830663786725e-07, 'epoch': 1.24} +{'loss': 88.8819, 'grad_norm': 1094.0361720820806, 'learning_rate': 6.776047720535816e-07, 'epoch': 1.24} +{'loss': 87.5402, 'grad_norm': 1082.0462840912833, 'learning_rate': 6.760282493806914e-07, 'epoch': 1.24} +{'loss': 89.0988, 'grad_norm': 1095.7039177531383, 'learning_rate': 6.744534986596041e-07, 'epoch': 1.24} +{'loss': 88.7984, 'grad_norm': 1058.952108273115, 'learning_rate': 6.728805201895949e-07, 'epoch': 1.24} +{'loss': 89.0821, 'grad_norm': 
1042.3427006646343, 'learning_rate': 6.713093142695915e-07, 'epoch': 1.24} +{'loss': 90.7609, 'grad_norm': 1020.093859623003, 'learning_rate': 6.697398811981948e-07, 'epoch': 1.24} +{'loss': 89.1679, 'grad_norm': 1033.9298341101369, 'learning_rate': 6.681722212736619e-07, 'epoch': 1.24} +{'loss': 92.3117, 'grad_norm': 1063.8375052703534, 'learning_rate': 6.666063347939134e-07, 'epoch': 1.24} +{'loss': 91.1611, 'grad_norm': 1032.915405233595, 'learning_rate': 6.650422220565378e-07, 'epoch': 1.24} +{'loss': 87.4246, 'grad_norm': 957.5462300724943, 'learning_rate': 6.634798833587785e-07, 'epoch': 1.24} +{'loss': 87.9314, 'grad_norm': 1115.280079755195, 'learning_rate': 6.619193189975515e-07, 'epoch': 1.24} +{'loss': 88.4976, 'grad_norm': 1088.9324120938873, 'learning_rate': 6.60360529269427e-07, 'epoch': 1.24} +{'loss': 87.9979, 'grad_norm': 1071.7549478046187, 'learning_rate': 6.588035144706418e-07, 'epoch': 1.24} +{'loss': 89.0344, 'grad_norm': 1029.998568951854, 'learning_rate': 6.572482748970976e-07, 'epoch': 1.24} +{'loss': 89.0072, 'grad_norm': 1011.9889666250491, 'learning_rate': 6.556948108443529e-07, 'epoch': 1.24} +{'loss': 88.7446, 'grad_norm': 1022.4592402197799, 'learning_rate': 6.541431226076345e-07, 'epoch': 1.24} +{'loss': 92.2881, 'grad_norm': 9631.955906882778, 'learning_rate': 6.525932104818278e-07, 'epoch': 1.24} +{'loss': 91.0229, 'grad_norm': 1184.5886522820738, 'learning_rate': 6.510450747614816e-07, 'epoch': 1.24} +{'loss': 89.8185, 'grad_norm': 1105.051354201067, 'learning_rate': 6.494987157408106e-07, 'epoch': 1.24} +{'loss': 89.7617, 'grad_norm': 1030.2924387103405, 'learning_rate': 6.479541337136863e-07, 'epoch': 1.24} +{'loss': 85.5664, 'grad_norm': 970.5007503055874, 'learning_rate': 6.46411328973644e-07, 'epoch': 1.24} +{'loss': 89.1034, 'grad_norm': 986.9553728579333, 'learning_rate': 6.448703018138869e-07, 'epoch': 1.24} +{'loss': 88.2503, 'grad_norm': 1060.4072095555184, 'learning_rate': 6.433310525272696e-07, 'epoch': 1.24} +{'loss': 88.6853, 'grad_norm': 1147.8332825035554, 'learning_rate': 6.417935814063203e-07, 'epoch': 1.24} +{'loss': 94.2417, 'grad_norm': 1085.9284349414409, 'learning_rate': 6.402578887432232e-07, 'epoch': 1.24} +{'loss': 87.5047, 'grad_norm': 1056.8426918010173, 'learning_rate': 6.387239748298213e-07, 'epoch': 1.24} +{'loss': 89.7186, 'grad_norm': 1091.9000044226245, 'learning_rate': 6.371918399576282e-07, 'epoch': 1.24} +{'loss': 89.7927, 'grad_norm': 1049.746274895536, 'learning_rate': 6.356614844178122e-07, 'epoch': 1.24} +{'loss': 89.0247, 'grad_norm': 1047.2289952690796, 'learning_rate': 6.341329085012083e-07, 'epoch': 1.24} +{'loss': 88.5801, 'grad_norm': 1190.8731662554146, 'learning_rate': 6.326061124983085e-07, 'epoch': 1.24} +{'loss': 89.4459, 'grad_norm': 1137.3024204607457, 'learning_rate': 6.310810966992698e-07, 'epoch': 1.24} +{'loss': 88.4398, 'grad_norm': 1191.516343157978, 'learning_rate': 6.295578613939113e-07, 'epoch': 1.24} +{'loss': 90.5215, 'grad_norm': 968.8770436206171, 'learning_rate': 6.280364068717116e-07, 'epoch': 1.24} +{'loss': 87.0384, 'grad_norm': 1153.2415207883244, 'learning_rate': 6.265167334218125e-07, 'epoch': 1.24} +{'loss': 89.1941, 'grad_norm': 1243.5360356845374, 'learning_rate': 6.249988413330166e-07, 'epoch': 1.24} +{'loss': 88.1868, 'grad_norm': 1041.158260747478, 'learning_rate': 6.234827308937863e-07, 'epoch': 1.24} +{'loss': 88.6856, 'grad_norm': 1066.7677744623816, 'learning_rate': 6.219684023922501e-07, 'epoch': 1.24} +{'loss': 87.9589, 'grad_norm': 980.9306561180616, 'learning_rate': 
6.204558561161911e-07, 'epoch': 1.24} +{'loss': 88.3923, 'grad_norm': 994.5417505145333, 'learning_rate': 6.189450923530627e-07, 'epoch': 1.24} +{'loss': 89.0399, 'grad_norm': 1051.9647856474905, 'learning_rate': 6.174361113899707e-07, 'epoch': 1.24} +{'loss': 91.2849, 'grad_norm': 1084.6582108781533, 'learning_rate': 6.159289135136858e-07, 'epoch': 1.24} +{'loss': 88.2744, 'grad_norm': 1033.1804129699462, 'learning_rate': 6.144234990106413e-07, 'epoch': 1.24} +{'loss': 87.2689, 'grad_norm': 984.6713786580617, 'learning_rate': 6.129198681669279e-07, 'epoch': 1.24} +{'loss': 101.8615, 'grad_norm': 992.7667811950645, 'learning_rate': 6.11418021268303e-07, 'epoch': 1.24} +{'loss': 86.8037, 'grad_norm': 991.8871085503185, 'learning_rate': 6.099179586001791e-07, 'epoch': 1.25} +{'loss': 89.0414, 'grad_norm': 1046.550375377468, 'learning_rate': 6.084196804476317e-07, 'epoch': 1.25} +{'loss': 88.1724, 'grad_norm': 1079.8415293710361, 'learning_rate': 6.069231870953985e-07, 'epoch': 1.25} +{'loss': 87.8744, 'grad_norm': 1068.7818816725273, 'learning_rate': 6.054284788278764e-07, 'epoch': 1.25} +{'loss': 86.9524, 'grad_norm': 1048.5253746468372, 'learning_rate': 6.039355559291237e-07, 'epoch': 1.25} +{'loss': 89.6617, 'grad_norm': 1072.4049172630957, 'learning_rate': 6.024444186828615e-07, 'epoch': 1.25} +{'loss': 90.4485, 'grad_norm': 1104.226225479058, 'learning_rate': 6.009550673724662e-07, 'epoch': 1.25} +{'loss': 86.7894, 'grad_norm': 1078.5809034739852, 'learning_rate': 5.99467502280977e-07, 'epoch': 1.25} +{'loss': 87.7851, 'grad_norm': 1039.3619305178026, 'learning_rate': 5.979817236910979e-07, 'epoch': 1.25} +{'loss': 87.4478, 'grad_norm': 944.0931259109312, 'learning_rate': 5.964977318851906e-07, 'epoch': 1.25} +{'loss': 89.8411, 'grad_norm': 1041.810625138178, 'learning_rate': 5.950155271452751e-07, 'epoch': 1.25} +{'loss': 89.6883, 'grad_norm': 1091.8794527298344, 'learning_rate': 5.93535109753034e-07, 'epoch': 1.25} +{'loss': 87.7551, 'grad_norm': 1020.9227974342426, 'learning_rate': 5.920564799898065e-07, 'epoch': 1.25} +{'loss': 90.4561, 'grad_norm': 1164.4586810204755, 'learning_rate': 5.905796381365991e-07, 'epoch': 1.25} +{'loss': 90.5196, 'grad_norm': 996.181576318406, 'learning_rate': 5.89104584474074e-07, 'epoch': 1.25} +{'loss': 88.5801, 'grad_norm': 1012.0262361754992, 'learning_rate': 5.876313192825544e-07, 'epoch': 1.25} +{'loss': 88.4321, 'grad_norm': 1034.1752256224122, 'learning_rate': 5.861598428420223e-07, 'epoch': 1.25} +{'loss': 89.3444, 'grad_norm': 1035.6239960521877, 'learning_rate': 5.846901554321205e-07, 'epoch': 1.25} +{'loss': 87.7303, 'grad_norm': 991.1303147077263, 'learning_rate': 5.832222573321533e-07, 'epoch': 1.25} +{'loss': 88.2693, 'grad_norm': 1024.7109673001498, 'learning_rate': 5.817561488210844e-07, 'epoch': 1.25} +{'loss': 89.5512, 'grad_norm': 1057.3846679142944, 'learning_rate': 5.802918301775362e-07, 'epoch': 1.25} +{'loss': 87.6059, 'grad_norm': 944.8744024954952, 'learning_rate': 5.788293016797919e-07, 'epoch': 1.25} +{'loss': 87.4838, 'grad_norm': 1068.2896397751838, 'learning_rate': 5.773685636057924e-07, 'epoch': 1.25} +{'loss': 87.6785, 'grad_norm': 1008.8620735188481, 'learning_rate': 5.759096162331423e-07, 'epoch': 1.25} +{'loss': 86.7501, 'grad_norm': 1059.2452939278091, 'learning_rate': 5.744524598391032e-07, 'epoch': 1.25} +{'loss': 88.7938, 'grad_norm': 1039.1603531750914, 'learning_rate': 5.729970947005981e-07, 'epoch': 1.25} +{'loss': 88.6628, 'grad_norm': 1048.1765900812986, 'learning_rate': 5.715435210942055e-07, 'epoch': 1.25} 
+{'loss': 88.8099, 'grad_norm': 1190.936456743384, 'learning_rate': 5.70091739296168e-07, 'epoch': 1.25} +{'loss': 88.9225, 'grad_norm': 1044.2237267891278, 'learning_rate': 5.686417495823848e-07, 'epoch': 1.25} +{'loss': 88.5567, 'grad_norm': 1080.2094078238952, 'learning_rate': 5.671935522284177e-07, 'epoch': 1.25} +{'loss': 91.0999, 'grad_norm': 1060.3935555954397, 'learning_rate': 5.657471475094867e-07, 'epoch': 1.25} +{'loss': 87.5729, 'grad_norm': 990.7136147987497, 'learning_rate': 5.643025357004673e-07, 'epoch': 1.25} +{'loss': 85.9537, 'grad_norm': 993.1125554939497, 'learning_rate': 5.628597170758965e-07, 'epoch': 1.25} +{'loss': 87.7086, 'grad_norm': 1065.6815632453763, 'learning_rate': 5.61418691909974e-07, 'epoch': 1.25} +{'loss': 91.0937, 'grad_norm': 1004.808835891872, 'learning_rate': 5.599794604765574e-07, 'epoch': 1.25} +{'loss': 88.3531, 'grad_norm': 1138.6052658215774, 'learning_rate': 5.585420230491589e-07, 'epoch': 1.25} +{'loss': 90.4459, 'grad_norm': 1264.5018138523048, 'learning_rate': 5.571063799009546e-07, 'epoch': 1.25} +{'loss': 90.9624, 'grad_norm': 1056.075305681877, 'learning_rate': 5.556725313047751e-07, 'epoch': 1.25} +{'loss': 89.6386, 'grad_norm': 1049.1577210544838, 'learning_rate': 5.542404775331145e-07, 'epoch': 1.25} +{'loss': 89.2811, 'grad_norm': 1106.1770433148743, 'learning_rate': 5.528102188581252e-07, 'epoch': 1.25} +{'loss': 87.9496, 'grad_norm': 1010.233095780277, 'learning_rate': 5.513817555516176e-07, 'epoch': 1.25} +{'loss': 87.5698, 'grad_norm': 989.3405701454634, 'learning_rate': 5.499550878850579e-07, 'epoch': 1.25} +{'loss': 88.7648, 'grad_norm': 975.7647395699017, 'learning_rate': 5.485302161295758e-07, 'epoch': 1.25} +{'loss': 91.02, 'grad_norm': 1123.634432974002, 'learning_rate': 5.471071405559547e-07, 'epoch': 1.25} +{'loss': 87.2284, 'grad_norm': 1029.1320260714515, 'learning_rate': 5.456858614346428e-07, 'epoch': 1.25} +{'loss': 87.9036, 'grad_norm': 983.7172346107455, 'learning_rate': 5.442663790357439e-07, 'epoch': 1.25} +{'loss': 96.6151, 'grad_norm': 1111.5811532000507, 'learning_rate': 5.428486936290178e-07, 'epoch': 1.25} +{'loss': 88.8446, 'grad_norm': 1068.1580518188107, 'learning_rate': 5.414328054838869e-07, 'epoch': 1.25} +{'loss': 89.3942, 'grad_norm': 1035.9643190854015, 'learning_rate': 5.40018714869428e-07, 'epoch': 1.25} +{'loss': 88.3128, 'grad_norm': 998.9999689629083, 'learning_rate': 5.386064220543807e-07, 'epoch': 1.25} +{'loss': 88.2617, 'grad_norm': 1020.2531324258534, 'learning_rate': 5.371959273071414e-07, 'epoch': 1.25} +{'loss': 90.9618, 'grad_norm': 1167.0638496624265, 'learning_rate': 5.357872308957623e-07, 'epoch': 1.25} +{'loss': 89.6954, 'grad_norm': 1118.6131377261545, 'learning_rate': 5.343803330879571e-07, 'epoch': 1.26} +{'loss': 89.4251, 'grad_norm': 1020.4626703398204, 'learning_rate': 5.32975234151093e-07, 'epoch': 1.26} +{'loss': 88.1625, 'grad_norm': 999.3391917264408, 'learning_rate': 5.31571934352203e-07, 'epoch': 1.26} +{'loss': 88.8646, 'grad_norm': 992.3591181642022, 'learning_rate': 5.301704339579716e-07, 'epoch': 1.26} +{'loss': 87.3467, 'grad_norm': 1001.741138503521, 'learning_rate': 5.287707332347458e-07, 'epoch': 1.26} +{'loss': 88.5044, 'grad_norm': 1044.3295050891913, 'learning_rate': 5.273728324485261e-07, 'epoch': 1.26} +{'loss': 86.4658, 'grad_norm': 1026.6860394349405, 'learning_rate': 5.259767318649711e-07, 'epoch': 1.26} +{'loss': 89.3688, 'grad_norm': 1117.3194147919153, 'learning_rate': 5.24582431749403e-07, 'epoch': 1.26} +{'loss': 87.7381, 'grad_norm': 
1076.0601230942327, 'learning_rate': 5.231899323667989e-07, 'epoch': 1.26} +{'loss': 87.2331, 'grad_norm': 1019.4459596979691, 'learning_rate': 5.217992339817912e-07, 'epoch': 1.26} +{'loss': 88.2212, 'grad_norm': 971.1147135583378, 'learning_rate': 5.204103368586722e-07, 'epoch': 1.26} +{'loss': 89.3295, 'grad_norm': 1049.9227463433367, 'learning_rate': 5.190232412613893e-07, 'epoch': 1.26} +{'loss': 86.8453, 'grad_norm': 966.3188410716679, 'learning_rate': 5.176379474535509e-07, 'epoch': 1.26} +{'loss': 86.2275, 'grad_norm': 1172.677874139398, 'learning_rate': 5.16254455698425e-07, 'epoch': 1.26} +{'loss': 87.4955, 'grad_norm': 1037.8665945200048, 'learning_rate': 5.148727662589326e-07, 'epoch': 1.26} +{'loss': 87.2191, 'grad_norm': 987.4974257640419, 'learning_rate': 5.134928793976512e-07, 'epoch': 1.26} +{'loss': 87.8055, 'grad_norm': 980.1597651256621, 'learning_rate': 5.12114795376818e-07, 'epoch': 1.26} +{'loss': 89.8678, 'grad_norm': 1014.4867238805962, 'learning_rate': 5.107385144583299e-07, 'epoch': 1.26} +{'loss': 90.4011, 'grad_norm': 1084.2369642014683, 'learning_rate': 5.093640369037389e-07, 'epoch': 1.26} +{'loss': 86.8005, 'grad_norm': 1080.0919415195572, 'learning_rate': 5.079913629742539e-07, 'epoch': 1.26} +{'loss': 87.6702, 'grad_norm': 1018.3467733372365, 'learning_rate': 5.066204929307406e-07, 'epoch': 1.26} +{'loss': 86.8736, 'grad_norm': 1152.6869241895592, 'learning_rate': 5.052514270337217e-07, 'epoch': 1.26} +{'loss': 87.5051, 'grad_norm': 981.2611980490807, 'learning_rate': 5.038841655433801e-07, 'epoch': 1.26} +{'loss': 89.4393, 'grad_norm': 1140.3644227750262, 'learning_rate': 5.025187087195538e-07, 'epoch': 1.26} +{'loss': 90.3332, 'grad_norm': 1087.7951975676024, 'learning_rate': 5.011550568217371e-07, 'epoch': 1.26} +{'loss': 87.5468, 'grad_norm': 1071.0662123561806, 'learning_rate': 4.99793210109083e-07, 'epoch': 1.26} +{'loss': 87.5084, 'grad_norm': 1014.6275360763791, 'learning_rate': 4.984331688403976e-07, 'epoch': 1.26} +{'loss': 89.7585, 'grad_norm': 1126.3709658951504, 'learning_rate': 4.970749332741487e-07, 'epoch': 1.26} +{'loss': 88.6827, 'grad_norm': 1131.0977531273982, 'learning_rate': 4.957185036684575e-07, 'epoch': 1.26} +{'loss': 89.5316, 'grad_norm': 1196.3077507549165, 'learning_rate': 4.943638802811057e-07, 'epoch': 1.26} +{'loss': 88.7819, 'grad_norm': 1053.2207070530724, 'learning_rate': 4.930110633695284e-07, 'epoch': 1.26} +{'loss': 87.0051, 'grad_norm': 988.4230095518593, 'learning_rate': 4.916600531908178e-07, 'epoch': 1.26} +{'loss': 89.1883, 'grad_norm': 1023.8387918410925, 'learning_rate': 4.90310850001724e-07, 'epoch': 1.26} +{'loss': 85.6344, 'grad_norm': 1014.7746488315746, 'learning_rate': 4.889634540586518e-07, 'epoch': 1.26} +{'loss': 89.6733, 'grad_norm': 1239.4309193228, 'learning_rate': 4.876178656176655e-07, 'epoch': 1.26} +{'loss': 87.6311, 'grad_norm': 1040.2311997888728, 'learning_rate': 4.862740849344849e-07, 'epoch': 1.26} +{'loss': 88.85, 'grad_norm': 1030.8245810975802, 'learning_rate': 4.849321122644846e-07, 'epoch': 1.26} +{'loss': 89.2616, 'grad_norm': 1106.4398667830878, 'learning_rate': 4.835919478626938e-07, 'epoch': 1.26} +{'loss': 88.3116, 'grad_norm': 1145.4225311665791, 'learning_rate': 4.822535919838045e-07, 'epoch': 1.26} +{'loss': 88.9381, 'grad_norm': 995.957380743514, 'learning_rate': 4.809170448821609e-07, 'epoch': 1.26} +{'loss': 88.8795, 'grad_norm': 1079.0123106409117, 'learning_rate': 4.795823068117622e-07, 'epoch': 1.26} +{'loss': 90.1047, 'grad_norm': 1089.5241963170454, 'learning_rate': 
4.782493780262676e-07, 'epoch': 1.26} +{'loss': 87.3885, 'grad_norm': 1004.2474094305361, 'learning_rate': 4.769182587789878e-07, 'epoch': 1.26} +{'loss': 89.5859, 'grad_norm': 1199.4546294874165, 'learning_rate': 4.755889493228938e-07, 'epoch': 1.26} +{'loss': 87.2372, 'grad_norm': 1050.0171669004421, 'learning_rate': 4.7426144991061127e-07, 'epoch': 1.26} +{'loss': 88.016, 'grad_norm': 1032.9901954773245, 'learning_rate': 4.7293576079442185e-07, 'epoch': 1.26} +{'loss': 88.0205, 'grad_norm': 1083.012383180698, 'learning_rate': 4.7161188222626163e-07, 'epoch': 1.26} +{'loss': 85.7243, 'grad_norm': 946.239407381953, 'learning_rate': 4.702898144577228e-07, 'epoch': 1.26} +{'loss': 87.7087, 'grad_norm': 988.3334129644162, 'learning_rate': 4.689695577400566e-07, 'epoch': 1.26} +{'loss': 90.5188, 'grad_norm': 1023.4828172138202, 'learning_rate': 4.6765111232417004e-07, 'epoch': 1.26} +{'loss': 89.1048, 'grad_norm': 1031.9390874561168, 'learning_rate': 4.663344784606205e-07, 'epoch': 1.26} +{'loss': 85.7455, 'grad_norm': 936.7595247673604, 'learning_rate': 4.6501965639962563e-07, 'epoch': 1.26} +{'loss': 87.1945, 'grad_norm': 1060.273065740735, 'learning_rate': 4.6370664639105666e-07, 'epoch': 1.27} +{'loss': 89.3881, 'grad_norm': 1364.869296812164, 'learning_rate': 4.6239544868444287e-07, 'epoch': 1.27} +{'loss': 89.0111, 'grad_norm': 1125.812312324819, 'learning_rate': 4.610860635289671e-07, 'epoch': 1.27} +{'loss': 87.0056, 'grad_norm': 1020.3218754362146, 'learning_rate': 4.597784911734693e-07, 'epoch': 1.27} +{'loss': 97.3607, 'grad_norm': 1016.3652434254171, 'learning_rate': 4.584727318664428e-07, 'epoch': 1.27} +{'loss': 89.8963, 'grad_norm': 1032.4078267405794, 'learning_rate': 4.5716878585603696e-07, 'epoch': 1.27} +{'loss': 88.0174, 'grad_norm': 1021.6740495588583, 'learning_rate': 4.5586665339005687e-07, 'epoch': 1.27} +{'loss': 88.0598, 'grad_norm': 1045.2290298971745, 'learning_rate': 4.545663347159657e-07, 'epoch': 1.27} +{'loss': 89.6553, 'grad_norm': 1122.859334731621, 'learning_rate': 4.5326783008087793e-07, 'epoch': 1.27} +{'loss': 85.8787, 'grad_norm': 1014.652637180311, 'learning_rate': 4.5197113973156403e-07, 'epoch': 1.27} +{'loss': 87.9072, 'grad_norm': 1055.1533273601726, 'learning_rate': 4.5067626391445016e-07, 'epoch': 1.27} +{'loss': 89.2866, 'grad_norm': 1119.7637751768652, 'learning_rate': 4.4938320287562067e-07, 'epoch': 1.27} +{'loss': 87.4782, 'grad_norm': 987.169670227009, 'learning_rate': 4.4809195686080775e-07, 'epoch': 1.27} +{'loss': 87.0583, 'grad_norm': 976.7638476393885, 'learning_rate': 4.468025261154074e-07, 'epoch': 1.27} +{'loss': 90.4114, 'grad_norm': 971.758633072883, 'learning_rate': 4.4551491088446476e-07, 'epoch': 1.27} +{'loss': 87.6833, 'grad_norm': 1029.5154207375563, 'learning_rate': 4.4422911141267954e-07, 'epoch': 1.27} +{'loss': 88.404, 'grad_norm': 1037.1587256155633, 'learning_rate': 4.429451279444119e-07, 'epoch': 1.27} +{'loss': 87.2745, 'grad_norm': 1015.7741162044041, 'learning_rate': 4.416629607236711e-07, 'epoch': 1.27} +{'loss': 91.0204, 'grad_norm': 953.6696774114565, 'learning_rate': 4.403826099941244e-07, 'epoch': 1.27} +{'loss': 89.4529, 'grad_norm': 975.1772913457144, 'learning_rate': 4.3910407599909386e-07, 'epoch': 1.27} +{'loss': 86.3533, 'grad_norm': 939.2228748601515, 'learning_rate': 4.3782735898155183e-07, 'epoch': 1.27} +{'loss': 90.2353, 'grad_norm': 1053.4814367525132, 'learning_rate': 4.365524591841341e-07, 'epoch': 1.27} +{'loss': 88.8807, 'grad_norm': 981.1214482458923, 'learning_rate': 4.352793768491215e-07, 
'epoch': 1.27}
+[2025-05-30 08:25:31,985] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step66000 is about to be saved!
+[2025-05-30 08:25:32,016] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-66000/global_step66000/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2025-05-30 08:25:32,016] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-66000/global_step66000/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2025-05-30 08:25:32,166] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-66000/global_step66000/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2025-05-30 08:25:32,238] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-66000/global_step66000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2025-05-30 08:31:43,223] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-66000/global_step66000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2025-05-30 08:31:43,294] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-66000/global_step66000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-05-30 08:31:47,740] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step66000 is ready now!
+[2025-05-30 08:31:47,793] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step66000 is about to be saved!
+[2025-05-30 08:31:47,812] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-66000/global_step66000/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2025-05-30 08:31:47,812] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-66000/global_step66000/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2025-05-30 08:31:47,946] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-66000/global_step66000/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2025-05-30 08:31:48,105] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-66000/global_step66000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2025-05-30 08:37:55,599] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-66000/global_step66000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2025-05-30 08:37:55,671] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-66000/global_step66000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-05-30 08:37:56,080] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step66000 is ready now!
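The {'loss': ..., 'grad_norm': ..., 'learning_rate': ..., 'epoch': ...} records interleaved with these checkpoint events are the HF Trainer's per-logging-step metrics (logging_steps=10 in the config above). A minimal Python sketch for pulling them out of this file, assuming the hypothetical path output.log and that each record sits on its own line once the leading diff '+' is stripped:

import ast

def parse_metrics(log_path="output.log"):  # hypothetical path for this log
    """Collect the {'loss': ...} metric records from a trainer log."""
    records = []
    with open(log_path) as f:
        for raw in f:
            line = raw.lstrip("+").strip()  # drop the diff '+' prefix
            if line.startswith("{'loss'") and line.endswith("}"):
                try:
                    records.append(ast.literal_eval(line))
                except (ValueError, SyntaxError):
                    pass  # skip records wrapped across physical lines
    return records

metrics = parse_metrics()
if metrics:
    print(len(metrics), "records; last:", metrics[-1])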
+{'loss': 87.534, 'grad_norm': 1067.2307640924544, 'learning_rate': 4.3400811221845693e-07, 'epoch': 1.27} +{'loss': 88.5815, 'grad_norm': 1078.610416838028, 'learning_rate': 4.32738665533734e-07, 'epoch': 1.27} +{'loss': 87.3144, 'grad_norm': 1019.6994908864556, 'learning_rate': 4.314710370361985e-07, 'epoch': 1.27} +{'loss': 86.8871, 'grad_norm': 936.5374696647325, 'learning_rate': 4.302052269667589e-07, 'epoch': 1.27} +{'loss': 86.0753, 'grad_norm': 993.1902987368089, 'learning_rate': 4.289412355659694e-07, 'epoch': 1.27} +{'loss': 90.1927, 'grad_norm': 1052.077208715696, 'learning_rate': 4.2767906307404573e-07, 'epoch': 1.27} +{'loss': 89.7794, 'grad_norm': 1304.8298626787132, 'learning_rate': 4.264187097308503e-07, 'epoch': 1.27} +{'loss': 88.4707, 'grad_norm': 998.4124822621435, 'learning_rate': 4.251601757759061e-07, 'epoch': 1.27} +{'loss': 89.9227, 'grad_norm': 1052.1107086289276, 'learning_rate': 4.2390346144838836e-07, 'epoch': 1.27} +{'loss': 87.74, 'grad_norm': 972.4634042533976, 'learning_rate': 4.2264856698712497e-07, 'epoch': 1.27} +{'loss': 89.1505, 'grad_norm': 1018.1914794855592, 'learning_rate': 4.2139549263060185e-07, 'epoch': 1.27} +{'loss': 88.276, 'grad_norm': 1200.6212896779894, 'learning_rate': 4.201442386169552e-07, 'epoch': 1.27} +{'loss': 91.083, 'grad_norm': 957.0996697106041, 'learning_rate': 4.188948051839736e-07, 'epoch': 1.27} +{'loss': 87.9362, 'grad_norm': 994.5234849205007, 'learning_rate': 4.176471925691084e-07, 'epoch': 1.27} +{'loss': 87.6799, 'grad_norm': 983.0352044930706, 'learning_rate': 4.1640140100945304e-07, 'epoch': 1.27} +{'loss': 89.1752, 'grad_norm': 1031.0100133361334, 'learning_rate': 4.151574307417661e-07, 'epoch': 1.27} +{'loss': 87.3168, 'grad_norm': 1019.1302695611911, 'learning_rate': 4.1391528200245277e-07, 'epoch': 1.27} +{'loss': 87.9491, 'grad_norm': 1057.0511938378277, 'learning_rate': 4.126749550275733e-07, 'epoch': 1.27} +{'loss': 87.2307, 'grad_norm': 1099.9098683994393, 'learning_rate': 4.114364500528456e-07, 'epoch': 1.27} +{'loss': 86.2068, 'grad_norm': 1051.9558932757718, 'learning_rate': 4.1019976731363376e-07, 'epoch': 1.27} +{'loss': 86.8132, 'grad_norm': 1028.7035316674246, 'learning_rate': 4.0896490704496415e-07, 'epoch': 1.27} +{'loss': 88.9733, 'grad_norm': 1024.9933924760733, 'learning_rate': 4.0773186948151246e-07, 'epoch': 1.27} +{'loss': 86.863, 'grad_norm': 1027.594888073485, 'learning_rate': 4.065006548576056e-07, 'epoch': 1.27} +{'loss': 88.6613, 'grad_norm': 1023.0228535670689, 'learning_rate': 4.052712634072298e-07, 'epoch': 1.27} +{'loss': 87.76, 'grad_norm': 1007.8288071575872, 'learning_rate': 4.0404369536402143e-07, 'epoch': 1.27} +{'loss': 90.3628, 'grad_norm': 1284.2091544101115, 'learning_rate': 4.028179509612673e-07, 'epoch': 1.27} +{'loss': 88.2153, 'grad_norm': 990.8764319061531, 'learning_rate': 4.015940304319155e-07, 'epoch': 1.27} +{'loss': 89.436, 'grad_norm': 1036.8872984163222, 'learning_rate': 4.003719340085599e-07, 'epoch': 1.27} +{'loss': 87.447, 'grad_norm': 1216.6809621449686, 'learning_rate': 3.9915166192345365e-07, 'epoch': 1.27} +{'loss': 88.1861, 'grad_norm': 965.7278093827335, 'learning_rate': 3.979332144084991e-07, 'epoch': 1.28} +{'loss': 85.162, 'grad_norm': 1038.512114198436, 'learning_rate': 3.96716591695252e-07, 'epoch': 1.28} +{'loss': 87.4196, 'grad_norm': 942.3638019157886, 'learning_rate': 3.9550179401492415e-07, 'epoch': 1.28} +{'loss': 87.6526, 'grad_norm': 895.687952188131, 'learning_rate': 3.9428882159837645e-07, 'epoch': 1.28} +{'loss': 87.624, 'grad_norm': 
911.0565522281943, 'learning_rate': 3.9307767467613e-07, 'epoch': 1.28} +{'loss': 88.9495, 'grad_norm': 1006.6203419422859, 'learning_rate': 3.918683534783496e-07, 'epoch': 1.28} +{'loss': 86.6378, 'grad_norm': 972.4518700089586, 'learning_rate': 3.9066085823485923e-07, 'epoch': 1.28} +{'loss': 91.1738, 'grad_norm': 996.4329003387317, 'learning_rate': 3.8945518917513636e-07, 'epoch': 1.28} +{'loss': 87.5352, 'grad_norm': 1037.957926664089, 'learning_rate': 3.8825134652830664e-07, 'epoch': 1.28} +{'loss': 87.6389, 'grad_norm': 1229.3244050254439, 'learning_rate': 3.870493305231537e-07, 'epoch': 1.28} +{'loss': 88.631, 'grad_norm': 1037.5686769155352, 'learning_rate': 3.858491413881105e-07, 'epoch': 1.28} +{'loss': 86.8355, 'grad_norm': 988.1706182624154, 'learning_rate': 3.8465077935126325e-07, 'epoch': 1.28} +{'loss': 86.4844, 'grad_norm': 1066.0579441435732, 'learning_rate': 3.834542446403544e-07, 'epoch': 1.28} +{'loss': 88.5356, 'grad_norm': 998.8438113470608, 'learning_rate': 3.822595374827742e-07, 'epoch': 1.28} +{'loss': 90.0113, 'grad_norm': 1058.7093557945861, 'learning_rate': 3.8106665810556885e-07, 'epoch': 1.28} +{'loss': 90.6233, 'grad_norm': 1052.8960005017157, 'learning_rate': 3.7987560673543697e-07, 'epoch': 1.28} +{'loss': 88.6339, 'grad_norm': 969.3881397904503, 'learning_rate': 3.7868638359872753e-07, 'epoch': 1.28} +{'loss': 89.8478, 'grad_norm': 1203.5003929804745, 'learning_rate': 3.7749898892144423e-07, 'epoch': 1.28} +{'loss': 88.4225, 'grad_norm': 1028.1996133883033, 'learning_rate': 3.76313422929242e-07, 'epoch': 1.28} +{'loss': 88.521, 'grad_norm': 1060.7548970213916, 'learning_rate': 3.751296858474318e-07, 'epoch': 1.28} +{'loss': 89.5752, 'grad_norm': 1009.1498217196835, 'learning_rate': 3.739477779009704e-07, 'epoch': 1.28} +{'loss': 89.0834, 'grad_norm': 1023.0581432171113, 'learning_rate': 3.7276769931447134e-07, 'epoch': 1.28} +{'loss': 88.5158, 'grad_norm': 928.5159824483403, 'learning_rate': 3.7158945031220197e-07, 'epoch': 1.28} +{'loss': 88.211, 'grad_norm': 944.6278747903012, 'learning_rate': 3.704130311180776e-07, 'epoch': 1.28} +{'loss': 88.9638, 'grad_norm': 1032.3860883781722, 'learning_rate': 3.6923844195566937e-07, 'epoch': 1.28} +{'loss': 88.3715, 'grad_norm': 1074.787997347902, 'learning_rate': 3.680656830481999e-07, 'epoch': 1.28} +{'loss': 89.2841, 'grad_norm': 965.4850090064335, 'learning_rate': 3.6689475461854085e-07, 'epoch': 1.28} +{'loss': 88.1625, 'grad_norm': 973.6742672750847, 'learning_rate': 3.657256568892187e-07, 'epoch': 1.28} +{'loss': 88.1794, 'grad_norm': 1102.3344206196225, 'learning_rate': 3.6455839008241346e-07, 'epoch': 1.28} +{'loss': 90.6507, 'grad_norm': 967.5086872113541, 'learning_rate': 3.633929544199566e-07, 'epoch': 1.28} +{'loss': 88.2421, 'grad_norm': 1155.2328010440674, 'learning_rate': 3.622293501233276e-07, 'epoch': 1.28} +{'loss': 88.3058, 'grad_norm': 1109.2076886026548, 'learning_rate': 3.6106757741366293e-07, 'epoch': 1.28} +{'loss': 88.8347, 'grad_norm': 1053.804996012788, 'learning_rate': 3.599076365117471e-07, 'epoch': 1.28} +{'loss': 87.4937, 'grad_norm': 1032.8459310916012, 'learning_rate': 3.587495276380193e-07, 'epoch': 1.28} +{'loss': 91.41, 'grad_norm': 1083.2874459353754, 'learning_rate': 3.5759325101257013e-07, 'epoch': 1.28} +{'loss': 87.505, 'grad_norm': 1122.8914220864947, 'learning_rate': 3.5643880685514164e-07, 'epoch': 1.28} +{'loss': 88.2859, 'grad_norm': 955.9383012227762, 'learning_rate': 3.552861953851261e-07, 'epoch': 1.28} +{'loss': 85.8777, 'grad_norm': 971.5107216089026, 
'learning_rate': 3.541354168215683e-07, 'epoch': 1.28} +{'loss': 89.1906, 'grad_norm': 1024.4921136958233, 'learning_rate': 3.5298647138316657e-07, 'epoch': 1.28} +{'loss': 88.1354, 'grad_norm': 1034.5721064662441, 'learning_rate': 3.518393592882696e-07, 'epoch': 1.28} +{'loss': 88.7172, 'grad_norm': 1036.21453106577, 'learning_rate': 3.5069408075487757e-07, 'epoch': 1.28} +{'loss': 88.2302, 'grad_norm': 1224.797497496614, 'learning_rate': 3.4955063600064177e-07, 'epoch': 1.28} +{'loss': 88.3252, 'grad_norm': 1158.5240681083442, 'learning_rate': 3.484090252428629e-07, 'epoch': 1.28} +{'loss': 89.0938, 'grad_norm': 1076.5125784067898, 'learning_rate': 3.4726924869849967e-07, 'epoch': 1.28} +{'loss': 89.8308, 'grad_norm': 1047.8347614711236, 'learning_rate': 3.461313065841565e-07, 'epoch': 1.28} +{'loss': 87.9294, 'grad_norm': 1006.0033536544186, 'learning_rate': 3.4499519911609156e-07, 'epoch': 1.28} +{'loss': 90.286, 'grad_norm': 1222.1036039565229, 'learning_rate': 3.438609265102133e-07, 'epoch': 1.28} +{'loss': 86.801, 'grad_norm': 937.3946191406691, 'learning_rate': 3.4272848898207924e-07, 'epoch': 1.28} +{'loss': 88.2475, 'grad_norm': 1076.0024675249244, 'learning_rate': 3.4159788674690386e-07, 'epoch': 1.28} +{'loss': 87.4363, 'grad_norm': 1111.8657017028884, 'learning_rate': 3.404691200195498e-07, 'epoch': 1.28} +{'loss': 88.156, 'grad_norm': 973.6334749707833, 'learning_rate': 3.39342189014531e-07, 'epoch': 1.28} +{'loss': 87.3287, 'grad_norm': 992.8778143465786, 'learning_rate': 3.3821709394601056e-07, 'epoch': 1.28} +{'loss': 86.112, 'grad_norm': 1046.5104239371979, 'learning_rate': 3.37093835027803e-07, 'epoch': 1.29} +{'loss': 85.5205, 'grad_norm': 984.5369442970346, 'learning_rate': 3.3597241247337873e-07, 'epoch': 1.29} +{'loss': 88.728, 'grad_norm': 1060.8553523620283, 'learning_rate': 3.3485282649585615e-07, 'epoch': 1.29} +{'loss': 88.3569, 'grad_norm': 1000.5049226210637, 'learning_rate': 3.3373507730800167e-07, 'epoch': 1.29} +{'loss': 87.2639, 'grad_norm': 963.9527539435044, 'learning_rate': 3.3261916512223767e-07, 'epoch': 1.29} +{'loss': 88.3172, 'grad_norm': 995.826834117526, 'learning_rate': 3.315050901506323e-07, 'epoch': 1.29} +{'loss': 88.1728, 'grad_norm': 1150.6921348584951, 'learning_rate': 3.3039285260490715e-07, 'epoch': 1.29} +{'loss': 89.355, 'grad_norm': 1042.2834675699035, 'learning_rate': 3.292824526964389e-07, 'epoch': 1.29} +{'loss': 85.4949, 'grad_norm': 1018.0296495465223, 'learning_rate': 3.281738906362475e-07, 'epoch': 1.29} +{'loss': 90.083, 'grad_norm': 982.1080493596764, 'learning_rate': 3.270671666350078e-07, 'epoch': 1.29} +{'loss': 88.3409, 'grad_norm': 1005.5298085374101, 'learning_rate': 3.2596228090304496e-07, 'epoch': 1.29} +{'loss': 87.4613, 'grad_norm': 1018.4442471024855, 'learning_rate': 3.2485923365033203e-07, 'epoch': 1.29} +{'loss': 88.3339, 'grad_norm': 1104.1274530060223, 'learning_rate': 3.237580250864969e-07, 'epoch': 1.29} +{'loss': 87.8548, 'grad_norm': 1054.4250590086574, 'learning_rate': 3.2265865542081666e-07, 'epoch': 1.29} +{'loss': 86.3777, 'grad_norm': 914.3912823846429, 'learning_rate': 3.2156112486221747e-07, 'epoch': 1.29} +{'loss': 86.3351, 'grad_norm': 1115.8615921508872, 'learning_rate': 3.204654336192781e-07, 'epoch': 1.29} +{'loss': 88.2708, 'grad_norm': 979.1970036883674, 'learning_rate': 3.19371581900223e-07, 'epoch': 1.29} +{'loss': 86.2249, 'grad_norm': 980.46501112696, 'learning_rate': 3.1827956991293374e-07, 'epoch': 1.29} +{'loss': 88.2017, 'grad_norm': 1083.5792364353576, 'learning_rate': 
3.1718939786493873e-07, 'epoch': 1.29} +{'loss': 87.5338, 'grad_norm': 1021.2187245794379, 'learning_rate': 3.1610106596341785e-07, 'epoch': 1.29} +{'loss': 88.586, 'grad_norm': 968.5578665395501, 'learning_rate': 3.150145744151989e-07, 'epoch': 1.29} +{'loss': 88.1251, 'grad_norm': 932.8607194068627, 'learning_rate': 3.139299234267612e-07, 'epoch': 1.29} +{'loss': 86.2982, 'grad_norm': 994.3822493653295, 'learning_rate': 3.1284711320423543e-07, 'epoch': 1.29} +{'loss': 87.1867, 'grad_norm': 988.3375833351926, 'learning_rate': 3.1176614395340364e-07, 'epoch': 1.29} +{'loss': 87.8856, 'grad_norm': 938.7600452625119, 'learning_rate': 3.1068701587969375e-07, 'epoch': 1.29} +{'loss': 88.2478, 'grad_norm': 970.0505943393988, 'learning_rate': 3.096097291881861e-07, 'epoch': 1.29} +{'loss': 88.6247, 'grad_norm': 1116.7780551147036, 'learning_rate': 3.085342840836103e-07, 'epoch': 1.29} +{'loss': 87.8867, 'grad_norm': 957.0577506838898, 'learning_rate': 3.074606807703484e-07, 'epoch': 1.29} +{'loss': 87.9896, 'grad_norm': 1041.659883957387, 'learning_rate': 3.0638891945243055e-07, 'epoch': 1.29} +{'loss': 88.3798, 'grad_norm': 1024.630147325568, 'learning_rate': 3.053190003335371e-07, 'epoch': 1.29} +{'loss': 87.5804, 'grad_norm': 1041.7626897175433, 'learning_rate': 3.042509236169977e-07, 'epoch': 1.29} +{'loss': 89.9175, 'grad_norm': 1039.2404956055295, 'learning_rate': 3.031846895057922e-07, 'epoch': 1.29} +{'loss': 87.3799, 'grad_norm': 1425.2505351281482, 'learning_rate': 3.021202982025506e-07, 'epoch': 1.29} +{'loss': 89.2327, 'grad_norm': 1043.4858936372639, 'learning_rate': 3.0105774990955463e-07, 'epoch': 1.29} +{'loss': 88.513, 'grad_norm': 1084.2223010354285, 'learning_rate': 2.999970448287304e-07, 'epoch': 1.29} +{'loss': 86.3249, 'grad_norm': 1003.6145383021876, 'learning_rate': 2.9893818316166e-07, 'epoch': 1.29} +{'loss': 87.1756, 'grad_norm': 1024.1848820336336, 'learning_rate': 2.978811651095692e-07, 'epoch': 1.29} +{'loss': 86.4503, 'grad_norm': 956.1864108862072, 'learning_rate': 2.968259908733384e-07, 'epoch': 1.29} +{'loss': 88.2746, 'grad_norm': 1034.239485860772, 'learning_rate': 2.9577266065349716e-07, 'epoch': 1.29} +{'loss': 88.1062, 'grad_norm': 950.5865801898216, 'learning_rate': 2.94721174650221e-07, 'epoch': 1.29} +{'loss': 87.9145, 'grad_norm': 943.5633020524253, 'learning_rate': 2.9367153306333663e-07, 'epoch': 1.29} +{'loss': 89.1593, 'grad_norm': 4180.014139817965, 'learning_rate': 2.926237360923212e-07, 'epoch': 1.29} +{'loss': 89.4917, 'grad_norm': 1253.115965063049, 'learning_rate': 2.9157778393629987e-07, 'epoch': 1.29} +{'loss': 87.6237, 'grad_norm': 959.0904684368068, 'learning_rate': 2.9053367679405145e-07, 'epoch': 1.29} +{'loss': 85.7171, 'grad_norm': 903.4182614312305, 'learning_rate': 2.8949141486399825e-07, 'epoch': 1.29} +{'loss': 88.7477, 'grad_norm': 1100.9208688113988, 'learning_rate': 2.8845099834421517e-07, 'epoch': 1.29} +{'loss': 89.0567, 'grad_norm': 944.7307386988479, 'learning_rate': 2.8741242743242416e-07, 'epoch': 1.29} +{'loss': 88.141, 'grad_norm': 1084.57321793479, 'learning_rate': 2.8637570232599834e-07, 'epoch': 1.29} +{'loss': 90.8234, 'grad_norm': 1192.5360568277792, 'learning_rate': 2.8534082322196123e-07, 'epoch': 1.29} +{'loss': 89.1871, 'grad_norm': 1046.7215864975356, 'learning_rate': 2.843077903169833e-07, 'epoch': 1.29} +{'loss': 86.8839, 'grad_norm': 932.0554347433949, 'learning_rate': 2.832766038073842e-07, 'epoch': 1.29} +{'loss': 86.9981, 'grad_norm': 1009.6392735767797, 'learning_rate': 2.8224726388913493e-07, 'epoch': 
1.29} +{'loss': 88.8172, 'grad_norm': 1013.5057489648979, 'learning_rate': 2.812197707578501e-07, 'epoch': 1.3} +{'loss': 87.743, 'grad_norm': 990.0103396985045, 'learning_rate': 2.801941246088002e-07, 'epoch': 1.3} +{'loss': 88.4944, 'grad_norm': 951.6634464782219, 'learning_rate': 2.791703256369027e-07, 'epoch': 1.3} +{'loss': 89.3432, 'grad_norm': 1024.9282231862671, 'learning_rate': 2.781483740367208e-07, 'epoch': 1.3} +{'loss': 87.745, 'grad_norm': 1068.7772395349707, 'learning_rate': 2.7712827000247023e-07, 'epoch': 1.3} +{'loss': 90.3945, 'grad_norm': 1038.172188037931, 'learning_rate': 2.761100137280115e-07, 'epoch': 1.3} +{'loss': 88.9512, 'grad_norm': 990.6454849082388, 'learning_rate': 2.750936054068598e-07, 'epoch': 1.3} +{'loss': 89.7198, 'grad_norm': 1176.63576001356, 'learning_rate': 2.740790452321751e-07, 'epoch': 1.3} +{'loss': 88.3533, 'grad_norm': 1066.430385013159, 'learning_rate': 2.730663333967676e-07, 'epoch': 1.3} +{'loss': 90.1772, 'grad_norm': 2189.5345946449443, 'learning_rate': 2.720554700930944e-07, 'epoch': 1.3} +{'loss': 88.8383, 'grad_norm': 1160.0363248279966, 'learning_rate': 2.7104645551326193e-07, 'epoch': 1.3} +{'loss': 87.4284, 'grad_norm': 1009.2346486618973, 'learning_rate': 2.7003928984902783e-07, 'epoch': 1.3} +{'loss': 87.8429, 'grad_norm': 929.3934489922365, 'learning_rate': 2.690339732917968e-07, 'epoch': 1.3} +{'loss': 87.7517, 'grad_norm': 1209.9756924591725, 'learning_rate': 2.680305060326205e-07, 'epoch': 1.3} +{'loss': 87.0497, 'grad_norm': 1122.67804179435, 'learning_rate': 2.6702888826219965e-07, 'epoch': 1.3} +{'loss': 89.6317, 'grad_norm': 1112.9461003660706, 'learning_rate': 2.6602912017088534e-07, 'epoch': 1.3} +{'loss': 87.5284, 'grad_norm': 952.1981182770486, 'learning_rate': 2.650312019486767e-07, 'epoch': 1.3} +{'loss': 86.6546, 'grad_norm': 1027.9775610515844, 'learning_rate': 2.6403513378522093e-07, 'epoch': 1.3} +{'loss': 86.5092, 'grad_norm': 998.2980856400451, 'learning_rate': 2.630409158698122e-07, 'epoch': 1.3} +{'loss': 87.442, 'grad_norm': 917.5570938565992, 'learning_rate': 2.6204854839139483e-07, 'epoch': 1.3} +{'loss': 86.0374, 'grad_norm': 998.9315861948435, 'learning_rate': 2.6105803153855915e-07, 'epoch': 1.3} +{'loss': 89.7419, 'grad_norm': 979.6018946865331, 'learning_rate': 2.6006936549954784e-07, 'epoch': 1.3} +{'loss': 89.5219, 'grad_norm': 1045.7319488522724, 'learning_rate': 2.590825504622496e-07, 'epoch': 1.3} +{'loss': 89.245, 'grad_norm': 1085.9290425959057, 'learning_rate': 2.5809758661420105e-07, 'epoch': 1.3} +{'loss': 88.0186, 'grad_norm': 1169.418613825608, 'learning_rate': 2.5711447414258686e-07, 'epoch': 1.3} +{'loss': 89.0898, 'grad_norm': 1012.1600526560047, 'learning_rate': 2.5613321323423866e-07, 'epoch': 1.3} +{'loss': 87.3156, 'grad_norm': 1028.4598967464246, 'learning_rate': 2.551538040756396e-07, 'epoch': 1.3} +{'loss': 88.0197, 'grad_norm': 1204.1479963090774, 'learning_rate': 2.5417624685292073e-07, 'epoch': 1.3} +{'loss': 88.8531, 'grad_norm': 1080.771926430721, 'learning_rate': 2.532005417518568e-07, 'epoch': 1.3} +{'loss': 86.7865, 'grad_norm': 961.4393421056063, 'learning_rate': 2.522266889578751e-07, 'epoch': 1.3} +{'loss': 87.5392, 'grad_norm': 986.3073380307615, 'learning_rate': 2.512546886560474e-07, 'epoch': 1.3} +{'loss': 92.0205, 'grad_norm': 1196.9353255869842, 'learning_rate': 2.502845410310961e-07, 'epoch': 1.3} +{'loss': 88.2694, 'grad_norm': 1001.3216776630297, 'learning_rate': 2.4931624626739035e-07, 'epoch': 1.3} +{'loss': 88.7613, 'grad_norm': 955.6950631514107, 
'learning_rate': 2.483498045489485e-07, 'epoch': 1.3} +{'loss': 86.9218, 'grad_norm': 963.7068283228351, 'learning_rate': 2.4738521605943477e-07, 'epoch': 1.3} +{'loss': 87.4716, 'grad_norm': 1031.915444404534, 'learning_rate': 2.464224809821614e-07, 'epoch': 1.3} +{'loss': 87.8403, 'grad_norm': 1091.8341062351583, 'learning_rate': 2.4546159950009085e-07, 'epoch': 1.3} +{'loss': 86.6576, 'grad_norm': 974.0693071561341, 'learning_rate': 2.445025717958294e-07, 'epoch': 1.3} +{'loss': 88.0123, 'grad_norm': 1060.616985660485, 'learning_rate': 2.4354539805163557e-07, 'epoch': 1.3} +{'loss': 87.1277, 'grad_norm': 1032.7206918766049, 'learning_rate': 2.425900784494106e-07, 'epoch': 1.3} +{'loss': 88.3152, 'grad_norm': 1213.1275861110225, 'learning_rate': 2.416366131707071e-07, 'epoch': 1.3} +{'loss': 88.5654, 'grad_norm': 973.773531884777, 'learning_rate': 2.4068500239672445e-07, 'epoch': 1.3} +{'loss': 88.9684, 'grad_norm': 1025.812632392404, 'learning_rate': 2.3973524630830804e-07, 'epoch': 1.3} +{'loss': 88.8647, 'grad_norm': 1150.191972588681, 'learning_rate': 2.3878734508595456e-07, 'epoch': 1.3} +{'loss': 88.4441, 'grad_norm': 1073.4958814851418, 'learning_rate': 2.3784129890980334e-07, 'epoch': 1.3} +{'loss': 86.7349, 'grad_norm': 938.2308321508485, 'learning_rate': 2.3689710795964272e-07, 'epoch': 1.3} +{'loss': 90.1415, 'grad_norm': 1138.4335889879294, 'learning_rate': 2.3595477241491138e-07, 'epoch': 1.3} +{'loss': 86.3429, 'grad_norm': 1017.7583745133959, 'learning_rate': 2.3501429245469164e-07, 'epoch': 1.3} +{'loss': 87.792, 'grad_norm': 1002.5951662439354, 'learning_rate': 2.340756682577172e-07, 'epoch': 1.3} +{'loss': 89.7713, 'grad_norm': 1046.3163508774433, 'learning_rate': 2.3313890000236316e-07, 'epoch': 1.3} +{'loss': 86.8788, 'grad_norm': 977.7834782696926, 'learning_rate': 2.322039878666571e-07, 'epoch': 1.3} +{'loss': 88.2021, 'grad_norm': 1008.8405199991938, 'learning_rate': 2.3127093202827356e-07, 'epoch': 1.3} +{'loss': 87.3934, 'grad_norm': 1088.755413446534, 'learning_rate': 2.3033973266452958e-07, 'epoch': 1.31} +{'loss': 86.9689, 'grad_norm': 979.1641952494475, 'learning_rate': 2.2941038995239585e-07, 'epoch': 1.31} +{'loss': 89.1719, 'grad_norm': 1007.0693503038641, 'learning_rate': 2.284829040684866e-07, 'epoch': 1.31} +{'loss': 89.3962, 'grad_norm': 919.6515695912348, 'learning_rate': 2.275572751890609e-07, 'epoch': 1.31} +{'loss': 88.8747, 'grad_norm': 971.2257876204799, 'learning_rate': 2.2663350349003134e-07, 'epoch': 1.31} +{'loss': 89.266, 'grad_norm': 1025.3947230235874, 'learning_rate': 2.257115891469508e-07, 'epoch': 1.31} +{'loss': 89.5586, 'grad_norm': 1008.1676706777615, 'learning_rate': 2.247915323350247e-07, 'epoch': 1.31} +{'loss': 89.2437, 'grad_norm': 1034.8539149429919, 'learning_rate': 2.23873333229101e-07, 'epoch': 1.31} +{'loss': 89.5193, 'grad_norm': 1095.9422446995475, 'learning_rate': 2.2295699200367782e-07, 'epoch': 1.31} +{'loss': 86.327, 'grad_norm': 1031.6112663484503, 'learning_rate': 2.2204250883290035e-07, 'epoch': 1.31} +{'loss': 89.242, 'grad_norm': 947.34429753449, 'learning_rate': 2.2112988389055622e-07, 'epoch': 1.31} +{'loss': 87.2344, 'grad_norm': 956.4052930847561, 'learning_rate': 2.202191173500845e-07, 'epoch': 1.31} +{'loss': 87.5878, 'grad_norm': 977.719589802586, 'learning_rate': 2.1931020938457116e-07, 'epoch': 1.31} +{'loss': 87.6706, 'grad_norm': 1073.9163478715725, 'learning_rate': 2.1840316016674468e-07, 'epoch': 1.31} +{'loss': 89.2306, 'grad_norm': 1007.9001982402051, 'learning_rate': 2.1749796986898608e-07, 
'epoch': 1.31}
+[2025-05-30 10:26:11,302] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step68000 is about to be saved!
+[2025-05-30 10:26:11,332] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-68000/global_step68000/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2025-05-30 10:26:11,332] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-68000/global_step68000/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2025-05-30 10:26:11,478] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-68000/global_step68000/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2025-05-30 10:26:11,511] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-68000/global_step68000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2025-05-30 10:32:25,134] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-68000/global_step68000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2025-05-30 10:32:25,219] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-68000/global_step68000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-05-30 10:32:26,602] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step68000 is ready now!
+[2025-05-30 10:32:26,677] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step68000 is about to be saved!
+[2025-05-30 10:32:26,702] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-68000/global_step68000/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2025-05-30 10:32:26,703] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-68000/global_step68000/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2025-05-30 10:32:26,764] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-68000/global_step68000/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2025-05-30 10:32:27,036] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-68000/global_step68000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2025-05-30 10:38:36,155] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-68000/global_step68000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2025-05-30 10:38:36,230] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-68000/global_step68000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-05-30 10:38:42,165] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step68000 is ready now!
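The learning_rate values in these records trace the cosine decay configured above (lr_scheduler_type=cosine, learning_rate=2e-05, num_train_epochs=1.4). A worked check, a sketch under the assumption of negligible warmup, which this excerpt does not show either way:

import math

def cosine_lr(progress, base_lr=2e-05):
    """Cosine decay as in transformers' get_cosine_schedule_with_warmup
    (num_cycles=0.5); progress is the fraction of training completed."""
    return base_lr * 0.5 * (1.0 + math.cos(math.pi * progress))

# At epoch 1.31 of num_train_epochs=1.4, progress ~ 1.31 / 1.4 ~ 0.936,
# giving ~2.0e-07 -- consistent with the ~2.2e-07 down to ~1.9e-07 rates
# logged around the step-68000 checkpoint above.
print(cosine_lr(1.31 / 1.4))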
+{'loss': 85.2475, 'grad_norm': 1192.9297873582525, 'learning_rate': 2.1659463866331775e-07, 'epoch': 1.31} +{'loss': 87.4505, 'grad_norm': 946.2906247942249, 'learning_rate': 2.1569316672141128e-07, 'epoch': 1.31} +{'loss': 87.316, 'grad_norm': 1023.5771606110181, 'learning_rate': 2.1479355421458514e-07, 'epoch': 1.31} +{'loss': 89.568, 'grad_norm': 1331.9888864308248, 'learning_rate': 2.1389580131380373e-07, 'epoch': 1.31} +{'loss': 87.7742, 'grad_norm': 1082.4593526768342, 'learning_rate': 2.1299990818968053e-07, 'epoch': 1.31} +{'loss': 86.2468, 'grad_norm': 1136.0133440524805, 'learning_rate': 2.121058750124705e-07, 'epoch': 1.31} +{'loss': 87.2197, 'grad_norm': 963.9082491726817, 'learning_rate': 2.112137019520788e-07, 'epoch': 1.31} +{'loss': 89.2541, 'grad_norm': 1028.3882504048463, 'learning_rate': 2.103233891780576e-07, 'epoch': 1.31} +{'loss': 87.9172, 'grad_norm': 993.5501210403175, 'learning_rate': 2.094349368596016e-07, 'epoch': 1.31} +{'loss': 88.5815, 'grad_norm': 1000.8652335723433, 'learning_rate': 2.0854834516555789e-07, 'epoch': 1.31} +{'loss': 86.6355, 'grad_norm': 962.0103884765233, 'learning_rate': 2.0766361426441505e-07, 'epoch': 1.31} +{'loss': 88.1316, 'grad_norm': 961.2642283283437, 'learning_rate': 2.0678074432430861e-07, 'epoch': 1.31} +{'loss': 86.2803, 'grad_norm': 950.1341811112403, 'learning_rate': 2.0589973551302322e-07, 'epoch': 1.31} +{'loss': 87.3417, 'grad_norm': 1519.9609055007147, 'learning_rate': 2.050205879979872e-07, 'epoch': 1.31} +{'loss': 86.6129, 'grad_norm': 1052.1820992789233, 'learning_rate': 2.0414330194627573e-07, 'epoch': 1.31} +{'loss': 86.6454, 'grad_norm': 970.8635815919531, 'learning_rate': 2.0326787752461218e-07, 'epoch': 1.31} +{'loss': 88.1475, 'grad_norm': 1110.8530570468172, 'learning_rate': 2.0239431489936233e-07, 'epoch': 1.31} +{'loss': 88.4034, 'grad_norm': 996.6900218817675, 'learning_rate': 2.0152261423654118e-07, 'epoch': 1.31} +{'loss': 87.138, 'grad_norm': 955.9178091501813, 'learning_rate': 2.006527757018084e-07, 'epoch': 1.31} +{'loss': 87.7979, 'grad_norm': 1035.8389205309522, 'learning_rate': 1.9978479946047292e-07, 'epoch': 1.31} +{'loss': 88.0619, 'grad_norm': 1093.6131824331094, 'learning_rate': 1.9891868567748496e-07, 'epoch': 1.31} +{'loss': 88.7819, 'grad_norm': 1043.9009860411065, 'learning_rate': 1.9805443451744178e-07, 'epoch': 1.31} +{'loss': 89.0084, 'grad_norm': 1178.3782095127842, 'learning_rate': 1.9719204614459087e-07, 'epoch': 1.31} +{'loss': 89.7529, 'grad_norm': 989.7679205908005, 'learning_rate': 1.9633152072282003e-07, 'epoch': 1.31} +{'loss': 87.6097, 'grad_norm': 1001.0902470412394, 'learning_rate': 1.9547285841566843e-07, 'epoch': 1.31} +{'loss': 88.0568, 'grad_norm': 947.252080415401, 'learning_rate': 1.9461605938631667e-07, 'epoch': 1.31} +{'loss': 86.5105, 'grad_norm': 1041.146307022585, 'learning_rate': 1.937611237975934e-07, 'epoch': 1.31} +{'loss': 88.8964, 'grad_norm': 1134.408672843241, 'learning_rate': 1.9290805181197414e-07, 'epoch': 1.31} +{'loss': 87.938, 'grad_norm': 1220.7586060931258, 'learning_rate': 1.9205684359157596e-07, 'epoch': 1.31} +{'loss': 87.8115, 'grad_norm': 983.6393246690869, 'learning_rate': 1.9120749929816717e-07, 'epoch': 1.31} +{'loss': 91.251, 'grad_norm': 1110.0143048204873, 'learning_rate': 1.9036001909315981e-07, 'epoch': 1.31} +{'loss': 89.3706, 'grad_norm': 1027.835477415659, 'learning_rate': 1.8951440313760838e-07, 'epoch': 1.31} +{'loss': 87.2802, 'grad_norm': 1043.1362182534515, 'learning_rate': 1.8867065159221766e-07, 'epoch': 1.31} +{'loss': 89.0908, 
'grad_norm': 1011.9440976720005, 'learning_rate': 1.8782876461733712e-07, 'epoch': 1.31} +{'loss': 87.5085, 'grad_norm': 954.513390243256, 'learning_rate': 1.8698874237295993e-07, 'epoch': 1.31} +{'loss': 89.3293, 'grad_norm': 1104.4335998763318, 'learning_rate': 1.861505850187273e-07, 'epoch': 1.31} +{'loss': 86.9735, 'grad_norm': 1029.114379315574, 'learning_rate': 1.8531429271392287e-07, 'epoch': 1.31} +{'loss': 88.5079, 'grad_norm': 961.2353682795065, 'learning_rate': 1.8447986561747954e-07, 'epoch': 1.32} +{'loss': 89.0948, 'grad_norm': 1022.4383231024831, 'learning_rate': 1.836473038879727e-07, 'epoch': 1.32} +{'loss': 88.0866, 'grad_norm': 1001.5614363429968, 'learning_rate': 1.8281660768362686e-07, 'epoch': 1.32} +{'loss': 88.4006, 'grad_norm': 1015.142279056549, 'learning_rate': 1.8198777716230688e-07, 'epoch': 1.32} +{'loss': 85.3501, 'grad_norm': 1007.225518480179, 'learning_rate': 1.8116081248152782e-07, 'epoch': 1.32} +{'loss': 88.2399, 'grad_norm': 1011.0473108642984, 'learning_rate': 1.803357137984463e-07, 'epoch': 1.32} +{'loss': 87.8917, 'grad_norm': 988.9635690866107, 'learning_rate': 1.7951248126986787e-07, 'epoch': 1.32} +{'loss': 86.0175, 'grad_norm': 973.1936824975086, 'learning_rate': 1.786911150522419e-07, 'epoch': 1.32} +{'loss': 89.0579, 'grad_norm': 1008.5700042882964, 'learning_rate': 1.7787161530166242e-07, 'epoch': 1.32} +{'loss': 87.4392, 'grad_norm': 930.7821058095162, 'learning_rate': 1.7705398217386927e-07, 'epoch': 1.32} +{'loss': 90.2269, 'grad_norm': 1078.7917639792452, 'learning_rate': 1.7623821582424594e-07, 'epoch': 1.32} +{'loss': 89.54, 'grad_norm': 1033.2428740200319, 'learning_rate': 1.754243164078262e-07, 'epoch': 1.32} +{'loss': 87.9742, 'grad_norm': 1044.3672935598195, 'learning_rate': 1.74612284079283e-07, 'epoch': 1.32} +{'loss': 87.4499, 'grad_norm': 1044.7836210097937, 'learning_rate': 1.738021189929384e-07, 'epoch': 1.32} +{'loss': 88.1232, 'grad_norm': 976.7095582620346, 'learning_rate': 1.729938213027571e-07, 'epoch': 1.32} +{'loss': 88.4414, 'grad_norm': 1320.5105936341167, 'learning_rate': 1.7218739116235061e-07, 'epoch': 1.32} +{'loss': 89.8237, 'grad_norm': 1002.6032888616, 'learning_rate': 1.7138282872497524e-07, 'epoch': 1.32} +{'loss': 87.5913, 'grad_norm': 973.8891281519794, 'learning_rate': 1.7058013414353203e-07, 'epoch': 1.32} +{'loss': 87.6792, 'grad_norm': 987.3799974964865, 'learning_rate': 1.6977930757056783e-07, 'epoch': 1.32} +{'loss': 87.192, 'grad_norm': 1123.7640122286546, 'learning_rate': 1.68980349158272e-07, 'epoch': 1.32} +{'loss': 86.813, 'grad_norm': 1003.034080734824, 'learning_rate': 1.68183259058482e-07, 'epoch': 1.32} +{'loss': 87.897, 'grad_norm': 951.4793539985793, 'learning_rate': 1.6738803742267994e-07, 'epoch': 1.32} +{'loss': 86.3419, 'grad_norm': 972.8774577248968, 'learning_rate': 1.6659468440198835e-07, 'epoch': 1.32} +{'loss': 86.5987, 'grad_norm': 1112.9916199815768, 'learning_rate': 1.658032001471821e-07, 'epoch': 1.32} +{'loss': 86.7564, 'grad_norm': 932.4801360705217, 'learning_rate': 1.6501358480867424e-07, 'epoch': 1.32} +{'loss': 87.1175, 'grad_norm': 1372.7192587770814, 'learning_rate': 1.6422583853652586e-07, 'epoch': 1.32} +{'loss': 88.7671, 'grad_norm': 1016.2726010097441, 'learning_rate': 1.6343996148044384e-07, 'epoch': 1.32} +{'loss': 90.9565, 'grad_norm': 1216.7168015168006, 'learning_rate': 1.6265595378977539e-07, 'epoch': 1.32} +{'loss': 86.5766, 'grad_norm': 1158.0019935930736, 'learning_rate': 1.6187381561351912e-07, 'epoch': 1.32} +{'loss': 89.1304, 'grad_norm': 
933.5261416332992, 'learning_rate': 1.6109354710031167e-07, 'epoch': 1.32} +{'loss': 87.5476, 'grad_norm': 952.5709348536352, 'learning_rate': 1.6031514839843776e-07, 'epoch': 1.32} +{'loss': 87.3968, 'grad_norm': 964.2454969001408, 'learning_rate': 1.5953861965582683e-07, 'epoch': 1.32} +{'loss': 88.5854, 'grad_norm': 1279.2461438876112, 'learning_rate': 1.58763961020052e-07, 'epoch': 1.32} +{'loss': 86.7534, 'grad_norm': 1024.346141697806, 'learning_rate': 1.579911726383321e-07, 'epoch': 1.32} +{'loss': 91.0752, 'grad_norm': 1000.7524041527469, 'learning_rate': 1.5722025465752965e-07, 'epoch': 1.32} +{'loss': 86.9279, 'grad_norm': 919.8035447913545, 'learning_rate': 1.5645120722415087e-07, 'epoch': 1.32} +{'loss': 86.9795, 'grad_norm': 992.5596526145313, 'learning_rate': 1.5568403048434877e-07, 'epoch': 1.32} +{'loss': 87.8739, 'grad_norm': 922.3448111448542, 'learning_rate': 1.54918724583919e-07, 'epoch': 1.32} +{'loss': 88.8755, 'grad_norm': 1182.4827164804867, 'learning_rate': 1.5415528966830294e-07, 'epoch': 1.32} +{'loss': 89.1134, 'grad_norm': 950.868031351102, 'learning_rate': 1.533937258825857e-07, 'epoch': 1.32} +{'loss': 87.437, 'grad_norm': 981.526687485204, 'learning_rate': 1.5263403337149595e-07, 'epoch': 1.32} +{'loss': 87.5231, 'grad_norm': 994.116064319888, 'learning_rate': 1.5187621227940708e-07, 'epoch': 1.32} +{'loss': 89.3553, 'grad_norm': 1031.7430108751887, 'learning_rate': 1.5112026275033941e-07, 'epoch': 1.32} +{'loss': 88.2907, 'grad_norm': 1010.6836130790458, 'learning_rate': 1.5036618492795473e-07, 'epoch': 1.32} +{'loss': 87.8248, 'grad_norm': 1042.8588986414206, 'learning_rate': 1.4961397895555953e-07, 'epoch': 1.32} +{'loss': 88.8343, 'grad_norm': 1033.701043047029, 'learning_rate': 1.4886364497610495e-07, 'epoch': 1.32} +{'loss': 88.619, 'grad_norm': 994.6782518360867, 'learning_rate': 1.4811518313218698e-07, 'epoch': 1.32} +{'loss': 88.8399, 'grad_norm': 1061.073370353421, 'learning_rate': 1.4736859356604404e-07, 'epoch': 1.32} +{'loss': 89.875, 'grad_norm': 1032.5163673593322, 'learning_rate': 1.4662387641956155e-07, 'epoch': 1.32} +{'loss': 87.076, 'grad_norm': 1017.2998635386689, 'learning_rate': 1.4588103183426738e-07, 'epoch': 1.32} +{'loss': 85.5119, 'grad_norm': 1050.5224794971116, 'learning_rate': 1.4514005995133197e-07, 'epoch': 1.32} +{'loss': 85.3647, 'grad_norm': 1015.3562635262094, 'learning_rate': 1.4440096091157263e-07, 'epoch': 1.32} +{'loss': 87.8548, 'grad_norm': 1063.309957040085, 'learning_rate': 1.4366373485545039e-07, 'epoch': 1.33} +{'loss': 85.5251, 'grad_norm': 926.5825678113621, 'learning_rate': 1.4292838192306869e-07, 'epoch': 1.33} +{'loss': 90.0194, 'grad_norm': 1052.1747865687742, 'learning_rate': 1.4219490225417688e-07, 'epoch': 1.33} +{'loss': 88.0186, 'grad_norm': 972.5550812605665, 'learning_rate': 1.4146329598816567e-07, 'epoch': 1.33} +{'loss': 86.8516, 'grad_norm': 1090.400170270598, 'learning_rate': 1.4073356326407162e-07, 'epoch': 1.33} +{'loss': 87.9412, 'grad_norm': 963.5500862068395, 'learning_rate': 1.40005704220576e-07, 'epoch': 1.33} +{'loss': 86.2778, 'grad_norm': 1044.6183973721813, 'learning_rate': 1.3927971899600378e-07, 'epoch': 1.33} +{'loss': 87.5015, 'grad_norm': 1132.632731405899, 'learning_rate': 1.3855560772832122e-07, 'epoch': 1.33} +{'loss': 86.4542, 'grad_norm': 974.7294878625107, 'learning_rate': 1.3783337055514156e-07, 'epoch': 1.33} +{'loss': 88.1407, 'grad_norm': 1074.8414286179407, 'learning_rate': 1.3711300761371949e-07, 'epoch': 1.33} +{'loss': 87.1928, 'grad_norm': 994.4385769601863, 
'learning_rate': 1.3639451904095547e-07, 'epoch': 1.33} +{'loss': 91.0386, 'grad_norm': 942.3958340310667, 'learning_rate': 1.3567790497339362e-07, 'epoch': 1.33} +{'loss': 87.7389, 'grad_norm': 989.7819736621865, 'learning_rate': 1.3496316554722056e-07, 'epoch': 1.33} +{'loss': 89.0414, 'grad_norm': 1030.2390350188944, 'learning_rate': 1.342503008982665e-07, 'epoch': 1.33} +{'loss': 87.1879, 'grad_norm': 953.0192896822222, 'learning_rate': 1.3353931116200647e-07, 'epoch': 1.33} +{'loss': 87.92, 'grad_norm': 994.0240481064342, 'learning_rate': 1.3283019647355788e-07, 'epoch': 1.33} +{'loss': 87.2126, 'grad_norm': 1086.3580515003086, 'learning_rate': 1.321229569676852e-07, 'epoch': 1.33} +{'loss': 89.3817, 'grad_norm': 1048.914388862556, 'learning_rate': 1.3141759277879197e-07, 'epoch': 1.33} +{'loss': 87.7631, 'grad_norm': 944.3896880996413, 'learning_rate': 1.3071410404092876e-07, 'epoch': 1.33} +{'loss': 87.5264, 'grad_norm': 1074.2557976284115, 'learning_rate': 1.3001249088778756e-07, 'epoch': 1.33} +{'loss': 88.6911, 'grad_norm': 1097.2395676402427, 'learning_rate': 1.293127534527039e-07, 'epoch': 1.33} +{'loss': 85.7871, 'grad_norm': 1059.068164452973, 'learning_rate': 1.2861489186865917e-07, 'epoch': 1.33} +{'loss': 90.3536, 'grad_norm': 1014.3780014731856, 'learning_rate': 1.279189062682773e-07, 'epoch': 1.33} +{'loss': 86.7069, 'grad_norm': 977.9359731949125, 'learning_rate': 1.272247967838236e-07, 'epoch': 1.33} +{'loss': 87.5494, 'grad_norm': 1009.7568374525803, 'learning_rate': 1.2653256354721032e-07, 'epoch': 1.33} +{'loss': 87.5568, 'grad_norm': 943.6320793065876, 'learning_rate': 1.258422066899878e-07, 'epoch': 1.33} +{'loss': 88.8588, 'grad_norm': 1114.0718997277997, 'learning_rate': 1.2515372634335666e-07, 'epoch': 1.33} +{'loss': 87.7472, 'grad_norm': 1167.2634091824325, 'learning_rate': 1.2446712263815663e-07, 'epoch': 1.33} +{'loss': 87.2542, 'grad_norm': 958.2185553059672, 'learning_rate': 1.2378239570487117e-07, 'epoch': 1.33} +{'loss': 86.2363, 'grad_norm': 1044.5106024904555, 'learning_rate': 1.2309954567362836e-07, 'epoch': 1.33} +{'loss': 87.7598, 'grad_norm': 1048.1277592853967, 'learning_rate': 1.2241857267419666e-07, 'epoch': 1.33} +{'loss': 87.5969, 'grad_norm': 1026.7083965876195, 'learning_rate': 1.217394768359914e-07, 'epoch': 1.33} +{'loss': 87.9861, 'grad_norm': 963.4751490707558, 'learning_rate': 1.210622582880716e-07, 'epoch': 1.33} +{'loss': 87.1228, 'grad_norm': 1053.6229543712018, 'learning_rate': 1.203869171591343e-07, 'epoch': 1.33} +{'loss': 88.9013, 'grad_norm': 1052.1376612892961, 'learning_rate': 1.1971345357752573e-07, 'epoch': 1.33} +{'loss': 89.0505, 'grad_norm': 960.0266539065085, 'learning_rate': 1.1904186767123016e-07, 'epoch': 1.33} +{'loss': 88.5179, 'grad_norm': 1045.9412787787535, 'learning_rate': 1.1837215956787996e-07, 'epoch': 1.33} +{'loss': 85.1552, 'grad_norm': 1034.1200046103183, 'learning_rate': 1.1770432939474662e-07, 'epoch': 1.33} +{'loss': 88.3539, 'grad_norm': 919.1174055536665, 'learning_rate': 1.1703837727874757e-07, 'epoch': 1.33} +{'loss': 87.5385, 'grad_norm': 1033.1514101252299, 'learning_rate': 1.1637430334644151e-07, 'epoch': 1.33} +{'loss': 87.8079, 'grad_norm': 1047.8050495367618, 'learning_rate': 1.1571210772402975e-07, 'epoch': 1.33} +{'loss': 87.7306, 'grad_norm': 909.2420821949602, 'learning_rate': 1.1505179053735827e-07, 'epoch': 1.33} +{'loss': 88.102, 'grad_norm': 1053.1073914447163, 'learning_rate': 1.1439335191191558e-07, 'epoch': 1.33} +{'loss': 86.9749, 'grad_norm': 961.8341104210888, 'learning_rate': 
1.1373679197283493e-07, 'epoch': 1.33} +{'loss': 87.8442, 'grad_norm': 970.778079776642, 'learning_rate': 1.1308211084488763e-07, 'epoch': 1.33} +{'loss': 88.0741, 'grad_norm': 1015.770368176254, 'learning_rate': 1.1242930865249191e-07, 'epoch': 1.33} +{'loss': 86.7621, 'grad_norm': 920.8529277648046, 'learning_rate': 1.1177838551970854e-07, 'epoch': 1.33} +{'loss': 86.8567, 'grad_norm': 1960.028176032558, 'learning_rate': 1.111293415702408e-07, 'epoch': 1.33} +{'loss': 90.3623, 'grad_norm': 1982.6425737178079, 'learning_rate': 1.1048217692743556e-07, 'epoch': 1.33} +{'loss': 88.0065, 'grad_norm': 1175.936095604249, 'learning_rate': 1.0983689171427891e-07, 'epoch': 1.33} +{'loss': 89.4344, 'grad_norm': 993.0496023436694, 'learning_rate': 1.0919348605340385e-07, 'epoch': 1.33} +{'loss': 87.1583, 'grad_norm': 930.346095562839, 'learning_rate': 1.0855196006708701e-07, 'epoch': 1.33} +{'loss': 86.7555, 'grad_norm': 988.2302458698597, 'learning_rate': 1.07912313877242e-07, 'epoch': 1.34} +{'loss': 89.7553, 'grad_norm': 991.3680409116122, 'learning_rate': 1.0727454760543154e-07, 'epoch': 1.34} +{'loss': 87.0474, 'grad_norm': 998.5772741003566, 'learning_rate': 1.066386613728565e-07, 'epoch': 1.34} +{'loss': 86.6234, 'grad_norm': 975.8963996727728, 'learning_rate': 1.0600465530036353e-07, 'epoch': 1.34} +{'loss': 86.4827, 'grad_norm': 1000.0683301857006, 'learning_rate': 1.053725295084418e-07, 'epoch': 1.34} +{'loss': 86.8814, 'grad_norm': 1072.9129283380505, 'learning_rate': 1.0474228411721966e-07, 'epoch': 1.34} +{'loss': 87.6715, 'grad_norm': 932.7091728290271, 'learning_rate': 1.0411391924647241e-07, 'epoch': 1.34} +{'loss': 86.6586, 'grad_norm': 1009.0369755439087, 'learning_rate': 1.0348743501561564e-07, 'epoch': 1.34} +{'loss': 89.5883, 'grad_norm': 1000.4911113141661, 'learning_rate': 1.0286283154370635e-07, 'epoch': 1.34} +{'loss': 86.1749, 'grad_norm': 960.8104100603246, 'learning_rate': 1.0224010894944958e-07, 'epoch': 1.34} +{'loss': 87.6698, 'grad_norm': 1008.7875164860511, 'learning_rate': 1.0161926735118511e-07, 'epoch': 1.34} +{'loss': 90.0852, 'grad_norm': 1060.0901245435891, 'learning_rate': 1.0100030686690298e-07, 'epoch': 1.34} +{'loss': 88.2329, 'grad_norm': 1015.7995189755972, 'learning_rate': 1.0038322761422914e-07, 'epoch': 1.34} +{'loss': 85.9226, 'grad_norm': 942.8717401768159, 'learning_rate': 9.976802971043532e-08, 'epoch': 1.34} +{'loss': 87.5734, 'grad_norm': 1049.443176773189, 'learning_rate': 9.915471327243798e-08, 'epoch': 1.34} +{'loss': 86.956, 'grad_norm': 1041.224403736942, 'learning_rate': 9.854327841678946e-08, 'epoch': 1.34} +{'loss': 87.5276, 'grad_norm': 1068.1517787959065, 'learning_rate': 9.793372525969125e-08, 'epoch': 1.34} +{'loss': 88.1194, 'grad_norm': 1072.2787969882725, 'learning_rate': 9.732605391698403e-08, 'epoch': 1.34} +{'loss': 88.855, 'grad_norm': 1106.6883462586563, 'learning_rate': 9.672026450414984e-08, 'epoch': 1.34} +{'loss': 86.96, 'grad_norm': 1047.711622693989, 'learning_rate': 9.611635713631662e-08, 'epoch': 1.34} +{'loss': 89.1907, 'grad_norm': 1041.6172685016215, 'learning_rate': 9.551433192824922e-08, 'epoch': 1.34} +{'loss': 87.0573, 'grad_norm': 950.6484464028559, 'learning_rate': 9.491418899436167e-08, 'epoch': 1.34} +{'loss': 86.3641, 'grad_norm': 1023.5924823502779, 'learning_rate': 9.431592844870497e-08, 'epoch': 1.34} +{'loss': 89.6064, 'grad_norm': 939.0975344638732, 'learning_rate': 9.371955040497371e-08, 'epoch': 1.34} +{'loss': 89.1514, 'grad_norm': 1176.5589963099455, 'learning_rate': 9.312505497650726e-08, 'epoch': 
1.34} +{'loss': 86.314, 'grad_norm': 987.7711534789539, 'learning_rate': 9.253244227628188e-08, 'epoch': 1.34} +{'loss': 87.4466, 'grad_norm': 944.8159689701668, 'learning_rate': 9.194171241692084e-08, 'epoch': 1.34} +{'loss': 89.634, 'grad_norm': 1104.9714877583274, 'learning_rate': 9.135286551068879e-08, 'epoch': 1.34} +{'loss': 89.218, 'grad_norm': 997.5334471082004, 'learning_rate': 9.076590166949062e-08, 'epoch': 1.34} +{'loss': 86.346, 'grad_norm': 1002.8054845222072, 'learning_rate': 9.01808210048749e-08, 'epoch': 1.34} +{'loss': 88.1865, 'grad_norm': 1097.9067853834624, 'learning_rate': 8.959762362803159e-08, 'epoch': 1.34} +{'loss': 87.481, 'grad_norm': 1020.5046108134213, 'learning_rate': 8.901630964979314e-08, 'epoch': 1.34} +{'loss': 86.0373, 'grad_norm': 916.5317898006372, 'learning_rate': 8.843687918063449e-08, 'epoch': 1.34} +{'loss': 86.4017, 'grad_norm': 1014.1151513384792, 'learning_rate': 8.78593323306709e-08, 'epoch': 1.34} +{'loss': 86.2399, 'grad_norm': 1019.7670849303364, 'learning_rate': 8.728366920966125e-08, 'epoch': 1.34} +{'loss': 87.7714, 'grad_norm': 981.5241068033171, 'learning_rate': 8.670988992700802e-08, 'epoch': 1.34} +{'loss': 88.8423, 'grad_norm': 1090.4584299437647, 'learning_rate': 8.613799459175066e-08, 'epoch': 1.34} +{'loss': 86.1156, 'grad_norm': 1022.1468154122882, 'learning_rate': 8.556798331257555e-08, 'epoch': 1.34} +{'loss': 87.8703, 'grad_norm': 925.1698781532649, 'learning_rate': 8.499985619780827e-08, 'epoch': 1.34} +{'loss': 88.8096, 'grad_norm': 992.110038172553, 'learning_rate': 8.4433613355418e-08, 'epoch': 1.34} +{'loss': 86.9421, 'grad_norm': 979.0444029528661, 'learning_rate': 8.386925489301534e-08, 'epoch': 1.34} +{'loss': 89.6975, 'grad_norm': 1148.6703822701745, 'learning_rate': 8.330678091785005e-08, 'epoch': 1.34} +{'loss': 87.9146, 'grad_norm': 915.7124398413803, 'learning_rate': 8.274619153681884e-08, 'epoch': 1.34} +{'loss': 86.0652, 'grad_norm': 958.4027858931914, 'learning_rate': 8.21874868564565e-08, 'epoch': 1.34} +{'loss': 88.8284, 'grad_norm': 978.2180596883929, 'learning_rate': 8.163066698294031e-08, 'epoch': 1.34} +{'loss': 85.897, 'grad_norm': 904.0351939369677, 'learning_rate': 8.107573202209007e-08, 'epoch': 1.34} +{'loss': 86.935, 'grad_norm': 948.9426107285246, 'learning_rate': 8.052268207936697e-08, 'epoch': 1.34} +{'loss': 88.5961, 'grad_norm': 1069.2882919777337, 'learning_rate': 7.997151725987473e-08, 'epoch': 1.34} +{'loss': 88.4792, 'grad_norm': 1058.2989530163095, 'learning_rate': 7.942223766835732e-08, 'epoch': 1.34} +{'loss': 85.4532, 'grad_norm': 919.7473880210032, 'learning_rate': 7.887484340920237e-08, 'epoch': 1.34} +{'loss': 87.4283, 'grad_norm': 1067.702223779541, 'learning_rate': 7.832933458643665e-08, 'epoch': 1.34} +{'loss': 87.5256, 'grad_norm': 1199.1004835679223, 'learning_rate': 7.778571130373059e-08, 'epoch': 1.34} +{'loss': 90.9455, 'grad_norm': 1167.0796239015287, 'learning_rate': 7.724397366439706e-08, 'epoch': 1.35} +{'loss': 87.9362, 'grad_norm': 1025.4363429571445, 'learning_rate': 7.670412177138598e-08, 'epoch': 1.35} +{'loss': 86.8816, 'grad_norm': 1017.3635207729292, 'learning_rate': 7.616615572729636e-08, 'epoch': 1.35} +{'loss': 86.7268, 'grad_norm': 1141.525531477595, 'learning_rate': 7.563007563436198e-08, 'epoch': 1.35} +{'loss': 92.1843, 'grad_norm': 1005.6708712093525, 'learning_rate': 7.509588159446135e-08, 'epoch': 1.35} +{'loss': 88.662, 'grad_norm': 975.9642065005564, 'learning_rate': 7.456357370911549e-08, 'epoch': 1.35} +{'loss': 88.113, 'grad_norm': 
961.2883821967412, 'learning_rate': 7.403315207948236e-08, 'epoch': 1.35}
+[2025-05-30 12:27:09,940] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step70000 is about to be saved!
+[2025-05-30 12:27:09,956] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-70000/global_step70000/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2025-05-30 12:27:09,956] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-70000/global_step70000/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2025-05-30 12:27:09,998] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-70000/global_step70000/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2025-05-30 12:27:10,148] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-70000/global_step70000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2025-05-30 12:33:04,852] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-70000/global_step70000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2025-05-30 12:33:04,934] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-70000/global_step70000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-05-30 12:33:06,325] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step70000 is ready now!
+[2025-05-30 12:33:06,403] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step70000 is about to be saved!
+[2025-05-30 12:33:06,428] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-70000/global_step70000/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2025-05-30 12:33:06,429] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-70000/global_step70000/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2025-05-30 12:33:06,546] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-70000/global_step70000/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2025-05-30 12:33:06,662] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-70000/global_step70000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2025-05-30 12:39:16,263] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-70000/global_step70000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2025-05-30 12:39:16,372] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-70000/global_step70000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-05-30 12:39:25,757] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step70000 is ready now!
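Each checkpoint-NNNNN/global_stepNNNNN directory saved above holds ZeRO-partitioned model shards (zero_pp_rank_*) and bf16 optimizer shards (bf16_zero_pp_rank_*), not a plain state dict. A sketch of the standard consolidation path using DeepSpeed's zero_to_fp32 utility; the run itself does not do this, and the output filename is hypothetical:

import torch
from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint

# Merge the ZeRO model and optimizer shards into one fp32 state dict on CPU.
state_dict = get_fp32_state_dict_from_zero_checkpoint(
    "hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-70000",
    tag="global_step70000",
)
torch.save(state_dict, "consolidated_fp32.bin")  # hypothetical output name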
+{'loss': 88.5508, 'grad_norm': 1015.2324739872023, 'learning_rate': 7.350461680636911e-08, 'epoch': 1.35}
+{'loss': 88.416, 'grad_norm': 954.7174463689851, 'learning_rate': 7.29779679902165e-08, 'epoch': 1.35}
+{'loss': 88.3958, 'grad_norm': 1011.9947649433252, 'learning_rate': 7.245320573111003e-08, 'epoch': 1.35}
+{'loss': 87.8456, 'grad_norm': 985.3668414897388, 'learning_rate': 7.193033012877993e-08, 'epoch': 1.35}
+{'loss': 88.3964, 'grad_norm': 947.13838338677, 'learning_rate': 7.140934128259336e-08, 'epoch': 1.35}
+{'loss': 85.1781, 'grad_norm': 913.0372251815663, 'learning_rate': 7.089023929155892e-08, 'epoch': 1.35}
+{'loss': 88.3753, 'grad_norm': 1124.8566473760877, 'learning_rate': 7.03730242543299e-08, 'epoch': 1.35}
+{'loss': 90.7503, 'grad_norm': 1024.1407044245798, 'learning_rate': 6.98576962691977e-08, 'epoch': 1.35}
+{'loss': 89.0101, 'grad_norm': 970.0284660650161, 'learning_rate': 6.934425543409839e-08, 'epoch': 1.35}
+{'loss': 88.7427, 'grad_norm': 927.2076832791624, 'learning_rate': 6.883270184660618e-08, 'epoch': 1.35}
+{'loss': 87.4561, 'grad_norm': 922.3873087582327, 'learning_rate': 6.832303560393882e-08, 'epoch': 1.35}
+{'loss': 85.8838, 'grad_norm': 950.3637421290334, 'learning_rate': 6.781525680295442e-08, 'epoch': 1.35}
+{'loss': 87.9312, 'grad_norm': 956.7060699970328, 'learning_rate': 6.730936554015243e-08, 'epoch': 1.35}
+{'loss': 87.7664, 'grad_norm': 962.6133348625468, 'learning_rate': 6.680536191167263e-08, 'epoch': 1.35}
+{'loss': 88.2995, 'grad_norm': 978.7816358116617, 'learning_rate': 6.630324601329952e-08, 'epoch': 1.35}
+{'loss': 89.3606, 'grad_norm': 1106.633033160485, 'learning_rate': 6.580301794045563e-08, 'epoch': 1.35}
+{'loss': 86.8125, 'grad_norm': 1031.382721599589, 'learning_rate': 6.530467778820493e-08, 'epoch': 1.35}
+{'loss': 90.3971, 'grad_norm': 1050.2765506840312, 'learning_rate': 6.480822565125388e-08, 'epoch': 1.35}
+{'loss': 88.4683, 'grad_norm': 1020.0256538910764, 'learning_rate': 6.431366162394814e-08, 'epoch': 1.35}
+{'loss': 87.0688, 'grad_norm': 1024.846805845699, 'learning_rate': 6.382098580027806e-08, 'epoch': 1.35}
+{'loss': 87.7939, 'grad_norm': 1032.5718265570497, 'learning_rate': 6.33301982738721e-08, 'epoch': 1.35}
+{'loss': 86.6412, 'grad_norm': 962.517890070462, 'learning_rate': 6.284129913800119e-08, 'epoch': 1.35}
+{'loss': 86.0784, 'grad_norm': 966.7077099942377, 'learning_rate': 6.235428848557546e-08, 'epoch': 1.35}
+{'loss': 87.1701, 'grad_norm': 995.9970905653934, 'learning_rate': 6.186916640914975e-08, 'epoch': 1.35}
+{'loss': 85.5125, 'grad_norm': 880.1079740667956, 'learning_rate': 6.138593300091699e-08, 'epoch': 1.35}
+{'loss': 89.258, 'grad_norm': 1028.8537252261085, 'learning_rate': 6.090458835271151e-08, 'epoch': 1.35}
+{'loss': 86.8775, 'grad_norm': 1081.566303290676, 'learning_rate': 6.042513255601012e-08, 'epoch': 1.35}
+{'loss': 88.0884, 'grad_norm': 1082.247347122673, 'learning_rate': 5.994756570192994e-08, 'epoch': 1.35}
+{'loss': 87.7516, 'grad_norm': 957.1503692183421, 'learning_rate': 5.947188788122726e-08, 'epoch': 1.35}
+{'loss': 87.1252, 'grad_norm': 1027.69432915017, 'learning_rate': 5.8998099184304217e-08, 'epoch': 1.35}
+{'loss': 86.8296, 'grad_norm': 927.5920867662082, 'learning_rate': 5.852619970119766e-08, 'epoch': 1.35}
+{'loss': 87.6991, 'grad_norm': 940.4219745857731, 'learning_rate': 5.805618952159142e-08, 'epoch': 1.35}
+{'loss': 88.6249, 'grad_norm': 980.8579347204316, 'learning_rate': 5.758806873480627e-08, 'epoch': 1.35}
+{'loss': 87.9151, 'grad_norm': 1127.0222210621296, 'learning_rate': 5.7121837429804374e-08, 'epoch': 1.35}
+{'loss': 88.363, 'grad_norm': 977.2135249970161, 'learning_rate': 5.66574956951893e-08, 'epoch': 1.35}
+{'loss': 87.2064, 'grad_norm': 940.8628635513492, 'learning_rate': 5.6195043619207136e-08, 'epoch': 1.35}
+{'loss': 88.0438, 'grad_norm': 1226.6240869912062, 'learning_rate': 5.5734481289744235e-08, 'epoch': 1.35}
+{'loss': 87.6735, 'grad_norm': 969.0131969351859, 'learning_rate': 5.5275808794325035e-08, 'epoch': 1.35}
+{'loss': 86.7672, 'grad_norm': 974.059591626203, 'learning_rate': 5.481902622011648e-08, 'epoch': 1.35}
+{'loss': 86.5584, 'grad_norm': 993.224314306425, 'learning_rate': 5.436413365392912e-08, 'epoch': 1.35}
+{'loss': 88.8249, 'grad_norm': 946.9379183623244, 'learning_rate': 5.3911131182209364e-08, 'epoch': 1.35}
+{'loss': 87.4125, 'grad_norm': 936.3196669848517, 'learning_rate': 5.346001889104946e-08, 'epoch': 1.35}
+{'loss': 90.6388, 'grad_norm': 899.4148566648854, 'learning_rate': 5.3010796866178604e-08, 'epoch': 1.35}
+{'loss': 87.3507, 'grad_norm': 1012.8184658772714, 'learning_rate': 5.25634651929674e-08, 'epoch': 1.35}
+{'loss': 88.6118, 'grad_norm': 957.2373846633621, 'learning_rate': 5.2118023956428954e-08, 'epoch': 1.35}
+{'loss': 88.2696, 'grad_norm': 964.5866058939087, 'learning_rate': 5.167447324121666e-08, 'epoch': 1.36}
+{'loss': 87.2884, 'grad_norm': 1111.613316770151, 'learning_rate': 5.123281313162198e-08, 'epoch': 1.36}
+{'loss': 87.0563, 'grad_norm': 865.4550842834648, 'learning_rate': 5.079304371158222e-08, 'epoch': 1.36}
+{'loss': 87.7151, 'grad_norm': 1003.2003413987039, 'learning_rate': 5.035516506466942e-08, 'epoch': 1.36}
+{'loss': 87.7322, 'grad_norm': 1077.9944809929093, 'learning_rate': 4.991917727409923e-08, 'epoch': 1.36}
+{'loss': 88.6635, 'grad_norm': 1007.2534041942016, 'learning_rate': 4.9485080422729813e-08, 'epoch': 1.36}
+{'loss': 86.3859, 'grad_norm': 932.2559435395646, 'learning_rate': 4.9052874593057406e-08, 'epoch': 1.36}
+{'loss': 86.5749, 'grad_norm': 946.8762553684294, 'learning_rate': 4.862255986721964e-08, 'epoch': 1.36}
+{'loss': 89.0943, 'grad_norm': 1045.4208702745418, 'learning_rate': 4.8194136326994434e-08, 'epoch': 1.36}
+{'loss': 90.1303, 'grad_norm': 948.7840982652582, 'learning_rate': 4.7767604053799987e-08, 'epoch': 1.36}
+{'loss': 88.5195, 'grad_norm': 1094.1749702488216, 'learning_rate': 4.73429631286948e-08, 'epoch': 1.36}
+{'loss': 87.7601, 'grad_norm': 972.4990907457444, 'learning_rate': 4.6920213632382084e-08, 'epoch': 1.36}
+{'loss': 88.2297, 'grad_norm': 930.2346403396441, 'learning_rate': 4.6499355645198696e-08, 'epoch': 1.36}
+{'loss': 92.2327, 'grad_norm': 2264.078843204098, 'learning_rate': 4.608038924712732e-08, 'epoch': 1.36}
+{'loss': 87.764, 'grad_norm': 1057.7342580607624, 'learning_rate': 4.566331451778872e-08, 'epoch': 1.36}
+{'loss': 87.3673, 'grad_norm': 989.4441965109818, 'learning_rate': 4.524813153644614e-08, 'epoch': 1.36}
+{'loss': 86.5538, 'grad_norm': 1058.0262936360598, 'learning_rate': 4.483484038199981e-08, 'epoch': 1.36}
+{'loss': 88.1301, 'grad_norm': 1011.6088514861996, 'learning_rate': 4.442344113299579e-08, 'epoch': 1.36}
+{'loss': 86.0506, 'grad_norm': 1103.1022500862275, 'learning_rate': 4.4013933867614877e-08, 'epoch': 1.36}
+{'loss': 87.9762, 'grad_norm': 994.1709652500863, 'learning_rate': 4.3606318663680366e-08, 'epoch': 1.36}
+{'loss': 87.6325, 'grad_norm': 1044.0341072339006, 'learning_rate': 4.320059559865919e-08, 'epoch': 1.36}
+{'loss': 88.5599, 'grad_norm': 1001.6131703487874, 'learning_rate': 4.279676474965411e-08, 'epoch': 1.36}
+{'loss': 85.4031, 'grad_norm': 996.9532327764684, 'learning_rate': 4.23948261934104e-08, 'epoch': 1.36}
+{'loss': 88.8189, 'grad_norm': 941.4667280970889, 'learning_rate': 4.199478000631474e-08, 'epoch': 1.36}
+{'loss': 87.3012, 'grad_norm': 988.9907061965832, 'learning_rate': 4.159662626439187e-08, 'epoch': 1.36}
+{'loss': 86.8461, 'grad_norm': 929.6410106392639, 'learning_rate': 4.120036504330682e-08, 'epoch': 1.36}
+{'loss': 87.0199, 'grad_norm': 983.2783849770223, 'learning_rate': 4.080599641836824e-08, 'epoch': 1.36}
+{'loss': 88.0811, 'grad_norm': 949.652705312469, 'learning_rate': 4.041352046452285e-08, 'epoch': 1.36}
+{'loss': 87.208, 'grad_norm': 930.5004760215958, 'learning_rate': 4.002293725635653e-08, 'epoch': 1.36}
+{'loss': 87.8266, 'grad_norm': 1044.581815800321, 'learning_rate': 3.963424686809658e-08, 'epoch': 1.36}
+{'loss': 93.4472, 'grad_norm': 1042.515302791247, 'learning_rate': 3.924744937361169e-08, 'epoch': 1.36}
+{'loss': 86.1, 'grad_norm': 925.7868452917998, 'learning_rate': 3.8862544846409725e-08, 'epoch': 1.36}
+{'loss': 89.4086, 'grad_norm': 968.9133366167827, 'learning_rate': 3.8479533359639943e-08, 'epoch': 1.36}
+{'loss': 88.6372, 'grad_norm': 1017.0911859578382, 'learning_rate': 3.8098414986088574e-08, 'epoch': 1.36}
+{'loss': 86.9526, 'grad_norm': 1030.898712954814, 'learning_rate': 3.771918979818545e-08, 'epoch': 1.36}
+{'loss': 86.7351, 'grad_norm': 890.8933503806697, 'learning_rate': 3.7341857867999595e-08, 'epoch': 1.36}
+{'loss': 88.3305, 'grad_norm': 951.5027312019686, 'learning_rate': 3.696641926724032e-08, 'epoch': 1.36}
+{'loss': 87.6302, 'grad_norm': 1007.3209808498447, 'learning_rate': 3.659287406725609e-08, 'epoch': 1.36}
+{'loss': 88.1745, 'grad_norm': 929.0235937059809, 'learning_rate': 3.622122233903791e-08, 'epoch': 1.36}
+{'loss': 86.2331, 'grad_norm': 1167.057189103408, 'learning_rate': 3.5851464153213725e-08, 'epoch': 1.36}
+{'loss': 87.6291, 'grad_norm': 990.1252782037627, 'learning_rate': 3.5483599580055094e-08, 'epoch': 1.36}
+{'loss': 87.5176, 'grad_norm': 988.5420415176393, 'learning_rate': 3.511762868946944e-08, 'epoch': 1.36}
+{'loss': 86.9421, 'grad_norm': 1002.7267238322582, 'learning_rate': 3.4753551551010014e-08, 'epoch': 1.36}
+{'loss': 87.127, 'grad_norm': 1076.5911562848187, 'learning_rate': 3.4391368233864796e-08, 'epoch': 1.36}
+{'loss': 88.8963, 'grad_norm': 929.7758235632779, 'learning_rate': 3.40310788068654e-08, 'epoch': 1.36}
+{'loss': 87.8862, 'grad_norm': 906.9706435707932, 'learning_rate': 3.3672683338480396e-08, 'epoch': 1.36}
+{'loss': 88.3995, 'grad_norm': 1050.531712848183, 'learning_rate': 3.331618189682084e-08, 'epoch': 1.36}
+{'loss': 87.2321, 'grad_norm': 1012.5760509285927, 'learning_rate': 3.296157454963811e-08, 'epoch': 1.36}
+{'loss': 87.6604, 'grad_norm': 1234.331994322304, 'learning_rate': 3.260886136432273e-08, 'epoch': 1.36}
+{'loss': 87.9956, 'grad_norm': 1021.9149201990874, 'learning_rate': 3.225804240790331e-08, 'epoch': 1.36}
+{'loss': 88.4971, 'grad_norm': 1067.1578384335653, 'learning_rate': 3.190911774705319e-08, 'epoch': 1.36}
+{'loss': 89.5863, 'grad_norm': 1090.2012826147889, 'learning_rate': 3.156208744808042e-08, 'epoch': 1.36}
+{'loss': 87.2859, 'grad_norm': 1030.9130206487798, 'learning_rate': 3.1216951576936714e-08, 'epoch': 1.37}
+{'loss': 88.2898, 'grad_norm': 1093.990422952287, 'learning_rate': 3.087371019921293e-08, 'epoch': 1.37}
+{'loss': 89.1, 'grad_norm': 1103.3136642407117, 'learning_rate': 3.0532363380139094e-08, 'epoch': 1.37}
+{'loss': 86.6286, 'grad_norm': 1102.4751241457368, 'learning_rate': 3.019291118458445e-08, 'epoch': 1.37}
+{'loss': 86.0204, 'grad_norm': 1027.7073500947586, 'learning_rate': 2.9855353677061825e-08, 'epoch': 1.37}
+{'loss': 86.9846, 'grad_norm': 992.4277901531111, 'learning_rate': 2.951969092171991e-08, 'epoch': 1.37}
+{'loss': 86.482, 'grad_norm': 957.3894243081533, 'learning_rate': 2.9185922982348792e-08, 'epoch': 1.37}
+{'loss': 87.3528, 'grad_norm': 1014.9515586191095, 'learning_rate': 2.8854049922379946e-08, 'epoch': 1.37}
+{'loss': 88.8309, 'grad_norm': 973.0975598382304, 'learning_rate': 2.8524071804881816e-08, 'epoch': 1.37}
+{'loss': 88.8769, 'grad_norm': 1005.3138891554185, 'learning_rate': 2.8195988692564236e-08, 'epoch': 1.37}
+{'loss': 88.9154, 'grad_norm': 985.5152873621449, 'learning_rate': 2.7869800647779554e-08, 'epoch': 1.37}
+{'loss': 86.38, 'grad_norm': 950.2290168328781, 'learning_rate': 2.7545507732514853e-08, 'epoch': 1.37}
+{'loss': 88.2005, 'grad_norm': 949.555300340844, 'learning_rate': 2.722311000839972e-08, 'epoch': 1.37}
+{'loss': 86.624, 'grad_norm': 944.6112409339528, 'learning_rate': 2.690260753670404e-08, 'epoch': 1.37}
+{'loss': 86.4638, 'grad_norm': 966.3432116834203, 'learning_rate': 2.658400037833686e-08, 'epoch': 1.37}
+{'loss': 85.4038, 'grad_norm': 865.9011546595628, 'learning_rate': 2.6267288593848637e-08, 'epoch': 1.37}
+{'loss': 88.7737, 'grad_norm': 898.1898810277587, 'learning_rate': 2.5952472243425673e-08, 'epoch': 1.37}
+{'loss': 88.3412, 'grad_norm': 995.75676981736, 'learning_rate': 2.563955138689789e-08, 'epoch': 1.37}
+{'loss': 87.8373, 'grad_norm': 999.4923231494721, 'learning_rate': 2.532852608373215e-08, 'epoch': 1.37}
+{'loss': 87.2165, 'grad_norm': 956.0492141529646, 'learning_rate': 2.5019396393037854e-08, 'epoch': 1.37}
+{'loss': 87.5606, 'grad_norm': 1075.451936427107, 'learning_rate': 2.4712162373562443e-08, 'epoch': 1.37}
+{'loss': 88.6751, 'grad_norm': 1023.1550608879714, 'learning_rate': 2.4406824083694768e-08, 'epoch': 1.37}
+{'loss': 88.5718, 'grad_norm': 1100.048589173061, 'learning_rate': 2.4103381581459527e-08, 'epoch': 1.37}
+{'loss': 85.8406, 'grad_norm': 913.6996267981973, 'learning_rate': 2.3801834924525037e-08, 'epoch': 1.37}
+{'loss': 88.337, 'grad_norm': 970.2411948880884, 'learning_rate': 2.3502184170198784e-08, 'epoch': 1.37}
+{'loss': 88.0562, 'grad_norm': 964.6791731682778, 'learning_rate': 2.320442937542633e-08, 'epoch': 1.37}
+{'loss': 87.2715, 'grad_norm': 959.4747074506259, 'learning_rate': 2.2908570596793523e-08, 'epoch': 1.37}
+{'loss': 86.3263, 'grad_norm': 958.341283444579, 'learning_rate': 2.2614607890526496e-08, 'epoch': 1.37}
+{'loss': 87.6527, 'grad_norm': 986.4250264056336, 'learning_rate': 2.2322541312490565e-08, 'epoch': 1.37}
+{'loss': 87.8862, 'grad_norm': 1010.6211360989117, 'learning_rate': 2.2032370918190217e-08, 'epoch': 1.37}
+{'loss': 87.1094, 'grad_norm': 989.2936987831342, 'learning_rate': 2.1744096762771337e-08, 'epoch': 1.37}
+{'loss': 88.0699, 'grad_norm': 975.5941856123171, 'learning_rate': 2.1457718901018998e-08, 'epoch': 1.37}
+{'loss': 87.4269, 'grad_norm': 908.3064327162865, 'learning_rate': 2.1173237387355218e-08, 'epoch': 1.37}
+{'loss': 87.9803, 'grad_norm': 1070.0857829633612, 'learning_rate': 2.0890652275843416e-08, 'epoch': 1.37}
+{'loss': 87.6016, 'grad_norm': 1130.384017849355, 'learning_rate': 2.060996362018841e-08, 'epoch': 1.37}
+{'loss': 86.377, 'grad_norm': 937.0624267669327, 'learning_rate': 2.033117147373309e-08, 'epoch': 1.37}
+{'loss': 87.7305, 'grad_norm': 1023.3514529677414, 'learning_rate': 2.005427588945841e-08, 'epoch': 1.37}
+{'loss': 86.8899, 'grad_norm': 1090.448324108016, 'learning_rate': 1.9779276919987822e-08, 'epoch': 1.37}
+{'loss': 88.0151, 'grad_norm': 1032.9336275445796, 'learning_rate': 1.950617461758064e-08, 'epoch': 1.37}
+{'loss': 85.6417, 'grad_norm': 996.3796772083945, 'learning_rate': 1.9234969034140904e-08, 'epoch': 1.37}
+{'loss': 88.1467, 'grad_norm': 1004.3338499927982, 'learning_rate': 1.8965660221206274e-08, 'epoch': 1.37}
+{'loss': 86.9477, 'grad_norm': 1037.6756952186506, 'learning_rate': 1.8698248229960246e-08, 'epoch': 1.37}
+{'loss': 87.3329, 'grad_norm': 1293.8039644591843, 'learning_rate': 1.843273311121885e-08, 'epoch': 1.37}
+{'loss': 87.1406, 'grad_norm': 952.2651647909577, 'learning_rate': 1.816911491544393e-08, 'epoch': 1.37}
+{'loss': 88.0563, 'grad_norm': 949.7462567130285, 'learning_rate': 1.7907393692732088e-08, 'epoch': 1.37}
+{'loss': 90.0579, 'grad_norm': 1012.1623808738989, 'learning_rate': 1.7647569492823537e-08, 'epoch': 1.37}
+{'loss': 86.7791, 'grad_norm': 907.4608428570249, 'learning_rate': 1.738964236509433e-08, 'epoch': 1.37}
+{'loss': 86.3022, 'grad_norm': 930.2420531860478, 'learning_rate': 1.7133612358563034e-08, 'epoch': 1.37}
+{'loss': 87.123, 'grad_norm': 1016.2114527024702, 'learning_rate': 1.6879479521885178e-08, 'epoch': 1.37}
+{'loss': 87.9375, 'grad_norm': 1098.588909244141, 'learning_rate': 1.6627243903357682e-08, 'epoch': 1.37}
+{'loss': 84.9348, 'grad_norm': 932.9089959766746, 'learning_rate': 1.6376905550914422e-08, 'epoch': 1.37}
+{'loss': 90.0433, 'grad_norm': 1050.4948342698717, 'learning_rate': 1.61284645121329e-08, 'epoch': 1.37}
+{'loss': 87.1582, 'grad_norm': 979.9177108034568, 'learning_rate': 1.588192083422535e-08, 'epoch': 1.38}
+{'loss': 89.2692, 'grad_norm': 942.1975017332254, 'learning_rate': 1.5637274564046513e-08, 'epoch': 1.38}
+{'loss': 87.9194, 'grad_norm': 994.0100422515802, 'learning_rate': 1.5394525748089196e-08, 'epoch': 1.38}
+{'loss': 85.7516, 'grad_norm': 971.325677578681, 'learning_rate': 1.51536744324865e-08, 'epoch': 1.38}
+{'loss': 87.8383, 'grad_norm': 1064.8501669617592, 'learning_rate': 1.4914720663009585e-08, 'epoch': 1.38}
+{'loss': 87.4511, 'grad_norm': 964.6379461988205, 'learning_rate': 1.467766448507102e-08, 'epoch': 1.38}
+{'loss': 87.7798, 'grad_norm': 1031.3250205540824, 'learning_rate': 1.4442505943720319e-08, 'epoch': 1.38}
+{'loss': 87.0063, 'grad_norm': 1030.0719208839635, 'learning_rate': 1.4209245083648405e-08, 'epoch': 1.38}
+{'loss': 86.3678, 'grad_norm': 962.0494781608716, 'learning_rate': 1.3977881949185368e-08, 'epoch': 1.38}
+{'loss': 85.6435, 'grad_norm': 927.0839753783084, 'learning_rate': 1.3748416584298263e-08, 'epoch': 1.38}
+{'loss': 88.9005, 'grad_norm': 953.8681846517652, 'learning_rate': 1.3520849032597761e-08, 'epoch': 1.38}
+{'loss': 86.0561, 'grad_norm': 916.9786272556966, 'learning_rate': 1.3295179337329267e-08, 'epoch': 1.38}
+{'loss': 87.5395, 'grad_norm': 965.2239622618708, 'learning_rate': 1.3071407541379589e-08, 'epoch': 1.38}
+{'loss': 86.5425, 'grad_norm': 972.7732220732743, 'learning_rate': 1.2849533687276928e-08, 'epoch': 1.38}
+{'loss': 87.2125, 'grad_norm': 1064.1312428565095, 'learning_rate': 1.2629557817184223e-08, 'epoch': 1.38}
+{'loss': 87.9327, 'grad_norm': 961.1740457626382, 'learning_rate': 1.2411479972908035e-08, 'epoch': 1.38}
+{'loss': 87.1781, 'grad_norm': 984.9308295746899, 'learning_rate': 1.219530019589299e-08, 'epoch': 1.38}
+{'loss': 86.7171, 'grad_norm': 1087.7869704230052, 'learning_rate': 1.1981018527219557e-08, 'epoch': 1.38}
+{'loss': 89.2048, 'grad_norm': 986.6645675613387, 'learning_rate': 1.1768635007614049e-08, 'epoch': 1.38}
+{'loss': 88.1814, 'grad_norm': 1002.0723666468804, 'learning_rate': 1.1558149677435293e-08, 'epoch': 1.38}
+{'loss': 86.5086, 'grad_norm': 1012.002546402622, 'learning_rate': 1.1349562576685736e-08, 'epoch': 1.38}
+{'loss': 89.5048, 'grad_norm': 1120.4263864279058, 'learning_rate': 1.1142873745006998e-08, 'epoch': 1.38}
+{'loss': 91.85, 'grad_norm': 1053.7276124049745, 'learning_rate': 1.0938083221676554e-08, 'epoch': 1.38}
+{'loss': 87.3343, 'grad_norm': 960.3942428068962, 'learning_rate': 1.073519104561438e-08, 'epoch': 1.38}
+{'loss': 88.3265, 'grad_norm': 1014.47749634448, 'learning_rate': 1.053419725537852e-08, 'epoch': 1.38}
+{'loss': 88.3569, 'grad_norm': 921.7805068806679, 'learning_rate': 1.033510188916731e-08, 'epoch': 1.38}
+{'loss': 86.3528, 'grad_norm': 971.947021625914, 'learning_rate': 1.0137904984816038e-08, 'epoch': 1.38}
+{'loss': 88.6345, 'grad_norm': 1075.2645826681357, 'learning_rate': 9.942606579800285e-09, 'epoch': 1.38}
+{'loss': 88.9311, 'grad_norm': 1041.834863243411, 'learning_rate': 9.749206711237025e-09, 'epoch': 1.38}
+{'loss': 88.0186, 'grad_norm': 979.2433944776103, 'learning_rate': 9.55770541587797e-09, 'epoch': 1.38}
+{'loss': 86.9119, 'grad_norm': 940.8547206560268, 'learning_rate': 9.368102730118456e-09, 'epoch': 1.38}
+{'loss': 86.353, 'grad_norm': 942.8084458236752, 'learning_rate': 9.180398689990765e-09, 'epoch': 1.38}
+{'loss': 87.6875, 'grad_norm': 915.5622632522583, 'learning_rate': 8.994593331165257e-09, 'epoch': 1.38}
+{'loss': 88.9142, 'grad_norm': 992.1894471494141, 'learning_rate': 8.810686688953684e-09, 'epoch': 1.38}
+{'loss': 91.3402, 'grad_norm': 1001.4867118362066, 'learning_rate': 8.628678798306977e-09, 'epoch': 1.38}
+{'loss': 90.513, 'grad_norm': 1022.2141941975134, 'learning_rate': 8.448569693814135e-09, 'epoch': 1.38}
+{'loss': 86.5447, 'grad_norm': 1005.4401419380401, 'learning_rate': 8.270359409702222e-09, 'epoch': 1.38}
+{'loss': 87.3186, 'grad_norm': 944.5150290856143, 'learning_rate': 8.094047979839703e-09, 'epoch': 1.38}
+{'loss': 86.5252, 'grad_norm': 992.048390607863, 'learning_rate': 7.91963543773422e-09, 'epoch': 1.38}
+{'loss': 88.7467, 'grad_norm': 965.2598935153469, 'learning_rate': 7.747121816530368e-09, 'epoch': 1.38}
+{'loss': 87.0624, 'grad_norm': 1139.226266062103, 'learning_rate': 7.576507149013034e-09, 'epoch': 1.38}
+{'loss': 88.4979, 'grad_norm': 1024.3878015402204, 'learning_rate': 7.407791467606284e-09, 'epoch': 1.38}
+{'loss': 86.7747, 'grad_norm': 988.3276696926416, 'learning_rate': 7.240974804374467e-09, 'epoch': 1.38}
+{'loss': 87.383, 'grad_norm': 919.1639645142038, 'learning_rate': 7.076057191017782e-09, 'epoch': 1.38}
+{'loss': 89.7887, 'grad_norm': 1008.4203987923904, 'learning_rate': 6.913038658880045e-09, 'epoch': 1.38}
+{'loss': 85.947, 'grad_norm': 1054.4898787581205, 'learning_rate': 6.751919238939808e-09, 'epoch': 1.38}
+{'loss': 89.1598, 'grad_norm': 939.6549917059373, 'learning_rate': 6.592698961818134e-09, 'epoch': 1.38}
+{'loss': 86.2084, 'grad_norm': 966.5131775047538, 'learning_rate': 6.435377857771929e-09, 'epoch': 1.38}
+{'loss': 87.3941, 'grad_norm': 1042.7069806522366, 'learning_rate': 6.279955956699502e-09, 'epoch': 1.38}
+{'loss': 87.8212, 'grad_norm': 1032.4986901765726, 'learning_rate': 6.126433288138334e-09, 'epoch': 1.38}
+{'loss': 87.086, 'grad_norm': 1182.0574475879032, 'learning_rate': 5.9748098812639765e-09, 'epoch': 1.38}
+[2025-05-30 14:27:39,330] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step72000 is about to be saved!
+[2025-05-30 14:27:39,417] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-72000/global_step72000/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2025-05-30 14:27:39,417] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-72000/global_step72000/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2025-05-30 14:27:39,521] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-72000/global_step72000/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2025-05-30 14:27:39,634] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-72000/global_step72000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2025-05-30 14:33:23,414] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-72000/global_step72000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2025-05-30 14:33:23,525] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-72000/global_step72000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-05-30 14:33:37,070] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step72000 is ready now!
+[2025-05-30 14:33:37,154] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step72000 is about to be saved!
+[2025-05-30 14:33:37,180] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-72000/global_step72000/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2025-05-30 14:33:37,180] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-72000/global_step72000/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2025-05-30 14:33:37,273] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-72000/global_step72000/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2025-05-30 14:33:37,386] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-72000/global_step72000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2025-05-30 14:36:41,998] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-72000/global_step72000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2025-05-30 14:36:42,409] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-72000/global_step72000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-05-30 14:39:35,707] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step72000 is ready now!
+{'loss': 88.1532, 'grad_norm': 967.3171277546878, 'learning_rate': 5.825085764891159e-09, 'epoch': 1.38}
+{'loss': 89.0635, 'grad_norm': 984.587977526743, 'learning_rate': 5.677260967473786e-09, 'epoch': 1.39}
+{'loss': 86.2324, 'grad_norm': 979.8283623773779, 'learning_rate': 5.531335517104941e-09, 'epoch': 1.39}
+{'loss': 88.9177, 'grad_norm': 1007.1759706507947, 'learning_rate': 5.3873094415168855e-09, 'epoch': 1.39}
+{'loss': 88.4266, 'grad_norm': 964.2203955005554, 'learning_rate': 5.2451827680810584e-09, 'epoch': 1.39}
+{'loss': 85.4763, 'grad_norm': 1071.1277928242914, 'learning_rate': 5.104955523805854e-09, 'epoch': 1.39}
+{'loss': 88.2506, 'grad_norm': 1002.7543011389782, 'learning_rate': 4.966627735342178e-09, 'epoch': 1.39}
+{'loss': 87.6855, 'grad_norm': 934.2748021679942, 'learning_rate': 4.830199428977889e-09, 'epoch': 1.39}
+{'loss': 86.696, 'grad_norm': 981.9862772287422, 'learning_rate': 4.695670630640026e-09, 'epoch': 1.39}
+{'loss': 88.1686, 'grad_norm': 966.7820402903133, 'learning_rate': 4.563041365894805e-09, 'epoch': 1.39}
+{'loss': 88.9959, 'grad_norm': 1030.5604644130851, 'learning_rate': 4.432311659946509e-09, 'epoch': 1.39}
+{'loss': 87.4966, 'grad_norm': 987.1222061284278, 'learning_rate': 4.303481537640819e-09, 'epoch': 1.39}
+{'loss': 88.3241, 'grad_norm': 1049.8232089882238, 'learning_rate': 4.176551023460373e-09, 'epoch': 1.39}
+{'loss': 87.4826, 'grad_norm': 1119.865700829389, 'learning_rate': 4.051520141526988e-09, 'epoch': 1.39}
+{'loss': 86.5117, 'grad_norm': 980.0928936940231, 'learning_rate': 3.928388915602766e-09, 'epoch': 1.39}
+{'loss': 87.4828, 'grad_norm': 973.6333858789903, 'learning_rate': 3.807157369086767e-09, 'epoch': 1.39}
+{'loss': 89.0409, 'grad_norm': 980.2798627290723, 'learning_rate': 3.6878255250183406e-09, 'epoch': 1.39}
+{'loss': 88.2948, 'grad_norm': 934.5297990239596, 'learning_rate': 3.570393406077122e-09, 'epoch': 1.39}
+{'loss': 87.6006, 'grad_norm': 935.6267915295247, 'learning_rate': 3.4548610345774835e-09, 'epoch': 1.39}
+{'loss': 87.8388, 'grad_norm': 993.9074844434567, 'learning_rate': 3.341228432478527e-09, 'epoch': 1.39}
+{'loss': 86.3768, 'grad_norm': 964.63898557126, 'learning_rate': 3.22949562137298e-09, 'epoch': 1.39}
+{'loss': 87.0855, 'grad_norm': 1062.1843540665054, 'learning_rate': 3.119662622496078e-09, 'epoch': 1.39}
+{'loss': 85.8699, 'grad_norm': 979.2057198116161, 'learning_rate': 3.0117294567200138e-09, 'epoch': 1.39}
+{'loss': 87.0225, 'grad_norm': 918.8243811289104, 'learning_rate': 2.9056961445572686e-09, 'epoch': 1.39}
+{'loss': 87.529, 'grad_norm': 1021.008379921173, 'learning_rate': 2.8015627061583895e-09, 'epoch': 1.39}
+{'loss': 86.2294, 'grad_norm': 1031.0795036982356, 'learning_rate': 2.6993291613131025e-09, 'epoch': 1.39}
+{'loss': 87.5893, 'grad_norm': 1034.042081991301, 'learning_rate': 2.59899552945031e-09, 'epoch': 1.39}
+{'loss': 87.5226, 'grad_norm': 1436.8294464204064, 'learning_rate': 2.5005618296380927e-09, 'epoch': 1.39}
+{'loss': 87.0244, 'grad_norm': 989.1170765730982, 'learning_rate': 2.4040280805825988e-09, 'epoch': 1.39}
+{'loss': 87.581, 'grad_norm': 1011.8219386841323, 'learning_rate': 2.309394300629153e-09, 'epoch': 1.39}
+{'loss': 87.4662, 'grad_norm': 981.0615251667069, 'learning_rate': 2.216660507762258e-09, 'epoch': 1.39}
+{'loss': 89.342, 'grad_norm': 959.8987893088489, 'learning_rate': 2.1258267196055947e-09, 'epoch': 1.39}
+{'loss': 86.9227, 'grad_norm': 1073.1306066321881, 'learning_rate': 2.03689295342091e-09, 'epoch': 1.39}
+{'loss': 85.1955, 'grad_norm': 909.7588695164392, 'learning_rate': 1.94985922611024e-09, 'epoch': 1.39}
+{'loss': 90.6588, 'grad_norm': 1060.910868839086, 'learning_rate': 1.8647255542125764e-09, 'epoch': 1.39}
+{'loss': 87.0466, 'grad_norm': 1060.0430638472687, 'learning_rate': 1.7814919539083097e-09, 'epoch': 1.39}
+{'loss': 86.3542, 'grad_norm': 955.6049579823401, 'learning_rate': 1.7001584410147876e-09, 'epoch': 1.39}
+{'loss': 87.1863, 'grad_norm': 1069.5627101507996, 'learning_rate': 1.6207250309874246e-09, 'epoch': 1.39}
+{'loss': 87.8861, 'grad_norm': 975.670813742135, 'learning_rate': 1.5431917389252538e-09, 'epoch': 1.39}
+{'loss': 87.5462, 'grad_norm': 1032.3730610539315, 'learning_rate': 1.4675585795598247e-09, 'epoch': 1.39}
+{'loss': 88.9961, 'grad_norm': 991.4524355540151, 'learning_rate': 1.3938255672651945e-09, 'epoch': 1.39}
+{'loss': 87.302, 'grad_norm': 945.2255224308649, 'learning_rate': 1.3219927160545987e-09, 'epoch': 1.39}
+{'loss': 87.4277, 'grad_norm': 995.1412370083196, 'learning_rate': 1.2520600395793393e-09, 'epoch': 1.39}
+{'loss': 86.2278, 'grad_norm': 897.0071765155246, 'learning_rate': 1.184027551129896e-09, 'epoch': 1.39}
+{'loss': 87.9403, 'grad_norm': 1044.4392681499835, 'learning_rate': 1.117895263633706e-09, 'epoch': 1.39}
+{'loss': 87.9398, 'grad_norm': 943.8361738402376, 'learning_rate': 1.053663189660714e-09, 'epoch': 1.39}
+{'loss': 87.5199, 'grad_norm': 1000.1663918118003, 'learning_rate': 9.913313414167125e-10, 'epoch': 1.39}
+{'loss': 87.1486, 'grad_norm': 921.2309769974447, 'learning_rate': 9.308997307466705e-10, 'epoch': 1.39}
+{'loss': 88.9418, 'grad_norm': 993.7136228988124, 'learning_rate': 8.723683691369556e-10, 'epoch': 1.39}
+{'loss': 88.9942, 'grad_norm': 931.7740818322811, 'learning_rate': 8.157372677097819e-10, 'epoch': 1.39}
+{'loss': 88.2906, 'grad_norm': 916.3514055043717, 'learning_rate': 7.610064372276516e-10, 'epoch': 1.39}
+{'loss': 87.8611, 'grad_norm': 1015.6541129402533, 'learning_rate': 7.081758880911338e-10, 'epoch': 1.39}
+{'loss': 84.6571, 'grad_norm': 913.2451122940839, 'learning_rate': 6.572456303421959e-10, 'epoch': 1.39}
+{'loss': 86.056, 'grad_norm': 985.7263941937796, 'learning_rate': 6.082156736575418e-10, 'epoch': 1.4}
+{'loss': 86.2987, 'grad_norm': 926.2774022579998, 'learning_rate': 5.610860273574936e-10, 'epoch': 1.4}
+{'loss': 88.3001, 'grad_norm': 1099.3029957252807, 'learning_rate': 5.158567003960002e-10, 'epoch': 1.4}
+{'loss': 88.5232, 'grad_norm': 1027.3323604009, 'learning_rate': 4.725277013695184e-10, 'epoch': 1.4}
+{'loss': 88.7987, 'grad_norm': 1003.650052156073, 'learning_rate': 4.3109903851368264e-10, 'epoch': 1.4}
+{'loss': 86.7593, 'grad_norm': 1206.8616422323043, 'learning_rate': 3.9157071969997407e-10, 'epoch': 1.4}
+{'loss': 86.6146, 'grad_norm': 986.8570253575689, 'learning_rate': 3.5394275244127196e-10, 'epoch': 1.4}
+{'loss': 86.7285, 'grad_norm': 979.0140703915079, 'learning_rate': 3.1821514388852284e-10, 'epoch': 1.4}
+{'loss': 87.5085, 'grad_norm': 976.9007442175745, 'learning_rate': 2.843879008307404e-10, 'epoch': 1.4}
+{'loss': 87.8175, 'grad_norm': 937.3688376834692, 'learning_rate': 2.524610296972263e-10, 'epoch': 1.4}
+{'loss': 85.633, 'grad_norm': 926.1713720457545, 'learning_rate': 2.2243453655645953e-10, 'epoch': 1.4}
+{'loss': 89.213, 'grad_norm': 946.246180286084, 'learning_rate': 1.94308427112766e-10, 'epoch': 1.4}
+{'loss': 87.9667, 'grad_norm': 963.3474275234415, 'learning_rate': 1.6808270671186954e-10, 'epoch': 1.4}
+{'loss': 86.8075, 'grad_norm': 1045.970829926005, 'learning_rate': 1.437573803386716e-10, 'epoch': 1.4}
+{'loss': 86.6823, 'grad_norm': 963.6018089899023, 'learning_rate': 1.2133245261503058e-10, 'epoch': 1.4}
+{'loss': 86.8503, 'grad_norm': 1157.6557831153566, 'learning_rate': 1.0080792780309267e-10, 'epoch': 1.4}
+{'loss': 86.2735, 'grad_norm': 939.2311201668213, 'learning_rate': 8.218380980418161e-11, 'epoch': 1.4}
+{'loss': 86.8267, 'grad_norm': 1004.3151618096059, 'learning_rate': 6.546010215546794e-11, 'epoch': 1.4}
+{'loss': 88.8226, 'grad_norm': 940.3336835887027, 'learning_rate': 5.0636808037740624e-11, 'epoch': 1.4}
+{'loss': 88.6209, 'grad_norm': 934.5937486504141, 'learning_rate': 3.771393026541503e-11, 'epoch': 1.4}
+{'loss': 87.5885, 'grad_norm': 1032.2253562318701, 'learning_rate': 2.6691471297635162e-11, 'epoch': 1.4}
+{'loss': 86.7914, 'grad_norm': 924.0314650780757, 'learning_rate': 1.7569433226061194e-11, 'epoch': 1.4}
+{'loss': 87.1133, 'grad_norm': 963.7887650169927, 'learning_rate': 1.0347817785971714e-11, 'epoch': 1.4}
+{'loss': 86.5877, 'grad_norm': 907.7821380863663, 'learning_rate': 5.026626349602382e-12, 'epoch': 1.4}
+{'loss': 92.1708, 'grad_norm': 1027.722779756652, 'learning_rate': 1.6058599272561482e-12, 'epoch': 1.4}
+{'loss': 89.2123, 'grad_norm': 952.5352064649396, 'learning_rate': 8.551916952370676e-14, 'epoch': 1.4}
+[2025-05-30 15:22:44,915] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step72793 is about to be saved!
+[2025-05-30 15:22:44,932] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-72793/global_step72793/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2025-05-30 15:22:44,932] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-72793/global_step72793/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2025-05-30 15:22:45,029] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-72793/global_step72793/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2025-05-30 15:22:45,170] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-72793/global_step72793/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2025-05-30 15:30:39,656] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-72793/global_step72793/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2025-05-30 15:30:39,685] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-72793/global_step72793/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-05-30 15:30:42,704] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step72793 is ready now!
+[2025-05-30 15:30:42,757] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step72793 is about to be saved!
+[2025-05-30 15:30:42,784] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-72793/global_step72793/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2025-05-30 15:30:42,784] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-72793/global_step72793/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2025-05-30 15:30:42,979] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-72793/global_step72793/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2025-05-30 15:30:43,236] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-72793/global_step72793/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2025-05-30 15:36:29,939] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-72793/global_step72793/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2025-05-30 15:36:29,999] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/checkpoint-72793/global_step72793/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-05-30 15:36:39,562] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step72793 is ready now!
+{'eval_loss': 111.6994857788086, 'eval_runtime': 17.7847, 'eval_samples_per_second': 39.135, 'eval_steps_per_second': 2.474, 'epoch': 1.4}
+{'train_runtime': 264406.7938, 'train_samples_per_second': 17.62, 'train_steps_per_second': 0.275, 'train_loss': 149.02320945810703, 'epoch': 1.4}
+***** train metrics *****
+  epoch                    =        1.4
+  total_flos               = 43975769GF
+  train_loss               =   149.0232
+  train_runtime            = 3 days, 1:26:46.79
+  train_samples            =   19473081
+  train_samples_per_second =      17.62
+  train_steps_per_second   =      0.275
+2025-05-30 15:36:58 - INFO - __main__ - *** Save model ***
+[2025-05-30 15:36:59,445] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step72793 is about to be saved!
+[2025-05-30 15:36:59,534] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/global_step72793/zero_pp_rank_0_mp_rank_00_model_states.pt
+[2025-05-30 15:36:59,534] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/global_step72793/zero_pp_rank_0_mp_rank_00_model_states.pt...
+[2025-05-30 15:36:59,620] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/global_step72793/zero_pp_rank_0_mp_rank_00_model_states.pt.
+[2025-05-30 15:36:59,695] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/global_step72793/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
+[2025-05-30 15:43:00,118] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/global_step72793/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
+[2025-05-30 15:43:00,282] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B/global_step72793/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
+[2025-05-30 15:43:06,465] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step72793 is ready now!
+2025-05-30 15:43:06 - INFO - __main__ - Model saved to hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B
+2025-05-30 15:43:06 - INFO - __main__ - *** Evaluate ***
+***** eval metrics *****
+  epoch                   =        1.4
+  eval_loss               =   111.6995
+  eval_runtime            = 0:00:17.63
+  eval_samples            =       4096
+  eval_samples_per_second =     39.462
+  eval_steps_per_second   =      2.495
+2025-05-30 15:43:24 - INFO - __main__ - *** Training complete ***
+wandb:
+wandb: 🚀 View run hybrid_8B_mla16_mamba16_Fix160_qr2048_qh64_stage2_data11B at: https://wandb.ai/mrgzadeh/huggingface/runs/y5dnnyl6
+wandb: Find logs at: wandb/run-20250527_141012-y5dnnyl6/logs