|
|
1: W1124 00:08:17.923000 737761 torch/distributed/run.py:792] |
|
|
1: W1124 00:08:17.923000 737761 torch/distributed/run.py:792] ***************************************** |
|
|
1: W1124 00:08:17.923000 737761 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. |
|
|
1: W1124 00:08:17.923000 737761 torch/distributed/run.py:792] ***************************************** |
|
|
0: W1124 00:08:17.924000 3081902 torch/distributed/run.py:792] |
|
|
0: W1124 00:08:17.924000 3081902 torch/distributed/run.py:792] ***************************************** |
|
|
0: W1124 00:08:17.924000 3081902 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. |
|
|
0: W1124 00:08:17.924000 3081902 torch/distributed/run.py:792] ***************************************** |
|
|
2: W1124 00:08:17.928000 1779991 torch/distributed/run.py:792] |
|
|
2: W1124 00:08:17.928000 1779991 torch/distributed/run.py:792] ***************************************** |
|
|
2: W1124 00:08:17.928000 1779991 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. |
|
|
2: W1124 00:08:17.928000 1779991 torch/distributed/run.py:792] ***************************************** |
|
|
3: W1124 00:08:17.934000 3626745 torch/distributed/run.py:792] |
|
|
3: W1124 00:08:17.934000 3626745 torch/distributed/run.py:792] ***************************************** |
|
|
3: W1124 00:08:17.934000 3626745 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. |
|
|
3: W1124 00:08:17.934000 3626745 torch/distributed/run.py:792] ***************************************** |
|
|
2: [2025-11-24 00:08:36,323] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:119] [PID:1780066] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing`[39m |
|
|
0: [2025-11-24 00:08:36,323] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:119] [PID:3081979] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing`[39m |
|
|
2: [2025-11-24 00:08:36,323] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:218] [PID:1780066] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing[39m |
|
|
0: [2025-11-24 00:08:36,323] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:218] [PID:3081979] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing[39m |
|
|
3: [2025-11-24 00:08:36,434] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:119] [PID:3626820] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing`[39m |
|
|
3: [2025-11-24 00:08:36,434] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:218] [PID:3626820] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing[39m |
|
|
1: [2025-11-24 00:08:36,535] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:119] [PID:737836] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing`[39m |
|
|
1: [2025-11-24 00:08:36,535] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:218] [PID:737836] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing[39m |
|
|
0: [33m[2025-11-24 00:08:40,005] [WARNING] [axolotl.utils.config.normalize_config:139] [PID:3081979] [RANK:0] Invalid value for save_steps (1.6666666666666667) from saves_per_epoch and/or num_epochs. Saving at training end only.[39m |
|
|
0: [2025-11-24 00:08:40,025] [INFO] [axolotl.cli.config.load_cfg:245] [PID:3081979] [RANK:0] config: |
|
|
0: { |
|
|
0: "activation_offloading": false, |
|
|
0: "auto_resume_from_checkpoints": true, |
|
|
0: "axolotl_config_path": "/lustre/fswork/projects/rech/dgo/udv55np/train/tmp/1763939290349239182.yaml", |
|
|
0: "base_model": "/lustre/fswork/projects/rech/qwv/udv55np/Gemma/base/gemma-3-12b", |
|
|
0: "base_model_config": "/lustre/fswork/projects/rech/qwv/udv55np/Gemma/base/gemma-3-12b", |
|
|
0: "batch_size": 16, |
|
|
0: "bf16": true, |
|
|
0: "capabilities": { |
|
|
0: "bf16": true, |
|
|
0: "compute_capability": "sm_90", |
|
|
0: "fp8": false, |
|
|
0: "n_gpu": 16, |
|
|
0: "n_node": 1 |
|
|
0: }, |
|
|
0: "chat_template": "gemma3", |
|
|
0: "context_parallel_size": 1, |
|
|
0: "dataloader_num_workers": 16, |
|
|
0: "dataloader_pin_memory": true, |
|
|
0: "dataloader_prefetch_factor": 256, |
|
|
0: "dataset_prepared_path": "/lustre/fswork/projects/rech/dgo/udv55np/dataset_gemma/Nemotron-Super-49B-v1_5/split_0", |
|
|
0: "dataset_processes": 192, |
|
|
0: "datasets": [ |
|
|
0: { |
|
|
0: "chat_template": "tokenizer_default", |
|
|
0: "data_files": [ |
|
|
0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0007.jsonl", |
|
|
0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0009.jsonl", |
|
|
0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0005.jsonl", |
|
|
0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0006.jsonl", |
|
|
0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0014.jsonl", |
|
|
0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0010.jsonl", |
|
|
0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0012.jsonl", |
|
|
0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0008.jsonl", |
|
|
0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0001.jsonl", |
|
|
0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0002.jsonl", |
|
|
0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0013.jsonl", |
|
|
0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0015.jsonl", |
|
|
0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0004.jsonl", |
|
|
0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0011.jsonl", |
|
|
0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0000.jsonl", |
|
|
0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0003.jsonl" |
|
|
0: ], |
|
|
0: "ds_type": "json", |
|
|
0: "field_messages": "conversations", |
|
|
0: "message_property_mappings": { |
|
|
0: "content": "content", |
|
|
0: "role": "role" |
|
|
0: }, |
|
|
0: "path": "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking", |
|
|
0: "trust_remote_code": false, |
|
|
0: "type": "chat_template" |
|
|
0: } |
|
|
0: ], |
|
|
0: "ddp": true, |
|
|
0: "deepspeed": { |
|
|
0: "bf16": { |
|
|
0: "enabled": true |
|
|
0: }, |
|
|
0: "gradient_accumulation_steps": "auto", |
|
|
0: "gradient_clipping": "auto", |
|
|
0: "train_batch_size": "auto", |
|
|
0: "train_micro_batch_size_per_gpu": "auto", |
|
|
0: "wall_clock_breakdown": false, |
|
|
0: "zero_optimization": { |
|
|
0: "contiguous_gradients": true, |
|
|
0: "overlap_comm": true, |
|
|
0: "reduce_bucket_size": "auto", |
|
|
0: "stage": 3, |
|
|
0: "stage3_gather_16bit_weights_on_model_save": true, |
|
|
0: "stage3_param_persistence_threshold": "auto", |
|
|
0: "stage3_prefetch_bucket_size": "auto", |
|
|
0: "sub_group_size": 0 |
|
|
0: } |
|
|
0: }, |
|
|
0: "device": "cuda:0", |
|
|
0: "device_map": { |
|
|
0: "": 0 |
|
|
0: }, |
|
|
0: "dion_rank_fraction": 1.0, |
|
|
0: "dion_rank_multiple_of": 1, |
|
|
0: "env_capabilities": { |
|
|
0: "torch_version": "2.6.0" |
|
|
0: }, |
|
|
0: "eot_tokens": [ |
|
|
0: "<end_of_turn>" |
|
|
0: ], |
|
|
0: "eval_batch_size": 1, |
|
|
0: "eval_causal_lm_metrics": [ |
|
|
0: "sacrebleu", |
|
|
0: "comet", |
|
|
0: "ter", |
|
|
0: "chrf" |
|
|
0: ], |
|
|
0: "eval_max_new_tokens": 128, |
|
|
0: "eval_sample_packing": true, |
|
|
0: "eval_table_size": 0, |
|
|
0: "evals_per_epoch": 0, |
|
|
0: "flash_attention": true, |
|
|
0: "fp16": false, |
|
|
0: "gradient_accumulation_steps": 1, |
|
|
0: "gradient_checkpointing": true, |
|
|
0: "gradient_checkpointing_kwargs": { |
|
|
0: "use_reentrant": true |
|
|
0: }, |
|
|
0: "is_multimodal": true, |
|
|
0: "learning_rate": 2e-06, |
|
|
0: "lisa_layers_attribute": "model.layers", |
|
|
0: "load_best_model_at_end": false, |
|
|
0: "load_in_4bit": false, |
|
|
0: "load_in_8bit": false, |
|
|
0: "local_rank": 0, |
|
|
0: "logging_steps": 10, |
|
|
0: "lora_dropout": 0.0, |
|
|
0: "loraplus_lr_embedding": 1e-06, |
|
|
0: "lr_scheduler": "warmup_stable_decay", |
|
|
0: "lr_scheduler_kwargs": { |
|
|
0: "min_lr_ratio": 0.1, |
|
|
0: "num_decay_steps": 200 |
|
|
0: }, |
|
|
0: "max_prompt_len": 512, |
|
|
0: "mean_resizing_embeddings": false, |
|
|
0: "micro_batch_size": 1, |
|
|
0: "model_config_type": "gemma3", |
|
|
0: "num_epochs": 0.6, |
|
|
0: "optimizer": "adamw_torch_fused", |
|
|
0: "output_dir": "/lustre/fswork/projects/rech/dgo/udv55np/ift/Nemotron-Super-49B-v1_5/gemma-3-12b/0", |
|
|
0: "pad_to_sequence_len": true, |
|
|
0: "pretrain_multipack_attn": true, |
|
|
0: "pretrain_multipack_buffer_size": 10000, |
|
|
0: "processor_config": "/lustre/fswork/projects/rech/qwv/udv55np/Gemma/base/gemma-3-12b", |
|
|
0: "profiler_steps_start": 0, |
|
|
0: "qlora_sharded_model_loading": false, |
|
|
0: "ray_num_workers": 1, |
|
|
0: "resources_per_worker": { |
|
|
0: "GPU": 1 |
|
|
0: }, |
|
|
0: "sample_packing": true, |
|
|
0: "sample_packing_bin_size": 200, |
|
|
0: "sample_packing_group_size": 100000, |
|
|
0: "save_only_model": true, |
|
|
0: "save_safetensors": true, |
|
|
0: "save_total_limit": 20, |
|
|
0: "saves_per_epoch": 1, |
|
|
0: "sequence_len": 16384, |
|
|
0: "shuffle_before_merging_datasets": false, |
|
|
0: "shuffle_merged_datasets": true, |
|
|
0: "skip_prepare_dataset": false, |
|
|
0: "strict": false, |
|
|
0: "tensor_parallel_size": 1, |
|
|
0: "tf32": false, |
|
|
0: "tiled_mlp_use_original_mlp": true, |
|
|
0: "tokenizer_config": "/lustre/fswork/projects/rech/qwv/udv55np/Gemma/base/gemma-3-27b", |
|
|
0: "torch_dtype": "torch.bfloat16", |
|
|
0: "train_on_inputs": false, |
|
|
0: "trl": { |
|
|
0: "log_completions": false, |
|
|
0: "mask_truncated_completions": false, |
|
|
0: "ref_model_mixup_alpha": 0.9, |
|
|
0: "ref_model_sync_steps": 64, |
|
|
0: "scale_rewards": true, |
|
|
0: "sync_ref_model": false, |
|
|
0: "use_vllm": false, |
|
|
0: "vllm_server_host": "0.0.0.0", |
|
|
0: "vllm_server_port": 8000 |
|
|
0: }, |
|
|
0: "use_ray": false, |
|
|
0: "use_tensorboard": true, |
|
|
0: "val_set_size": 0.0, |
|
|
0: "vllm": { |
|
|
0: "device": "auto", |
|
|
0: "dtype": "auto", |
|
|
0: "gpu_memory_utilization": 0.9, |
|
|
0: "host": "0.0.0.0", |
|
|
0: "port": 8000 |
|
|
0: }, |
|
|
0: "warmup_steps": 100, |
|
|
0: "weight_decay": 0.0, |
|
|
0: "world_size": 16 |
|
|
0: }[39m |
|
|
0: [2025-11-24 00:08:40,026] [INFO] [axolotl.cli.checks.check_user_token:35] [PID:3081979] [RANK:0] Skipping HuggingFace token verification because HF_HUB_OFFLINE is set to True. Only local files will be used.[39m |
|
|
0: [2025-11-24 00:08:41,217] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:472] [PID:3081979] [RANK:0] Loading prepared dataset from disk at /lustre/fswork/projects/rech/dgo/udv55np/dataset_gemma/Nemotron-Super-49B-v1_5/split_0/06698e902d3dba325ca34849b1dea5ea...[39m |
|
|
0: [2025-11-24 00:09:14,927] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:436] [PID:3081979] [RANK:0] gather_len_batches: [18976, 18976, 18976, 18975, 18977, 18976, 18975, 18976, 18976, 18975, 18976, 18976, 18976, 18976, 18976, 18976][39m |
|
|
0: [2025-11-24 00:09:14,950] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:495] [PID:3081979] [RANK:0] sample_packing_eff_est across ranks: [0.9989354014396667, 0.9988301396369934, 0.9989880323410034, 0.9988827705383301, 0.9988827705383301, 0.9988827705383301, 0.9989354014396667, 0.9989354014396667, 0.9989354014396667, 0.9988827705383301, 0.9989354014396667, 0.9988827705383301, 0.9988827705383301, 0.9988827705383301, 0.9988827705383301, 0.9989354014396667][39m |
|
|
0: [2025-11-24 00:09:14,959] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:127] [PID:3081979] [RANK:0] Maximum number of steps set at 711[39m |
|
|
3: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. |
|
|
1: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. |
|
|
3: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. |
|
|
1: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. |
|
|
2: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. |
|
|
2: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. |
|
|
2: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. |
|
|
2: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. |
|
|
1: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. |
|
|
1: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. |
|
|
0: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. |
|
|
3: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. |
|
|
3: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. |
|
|
0: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. |
|
|
0: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. |
|
|
0: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. |
|
|
0: [2025-11-24 00:09:22,718] [INFO] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:110] [PID:3081979] [RANK:0] Patched Trainer.evaluation_loop with nanmean loss calculation[39m |
|
|
0: [2025-11-24 00:09:22,719] [INFO] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:164] [PID:3081979] [RANK:0] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation[39m |
|
|
3:
Loading checkpoint shards: 0%| | 0/5 [00:00<?, ?it/s]
Loading checkpoint shards: 0%| | 0/5 [00:00<?, ?it/s]
Loading checkpoint shards: 0%| | 0/5 [00:00<?, ?it/s]
Loading checkpoint shards: 0%| | 0/5 [00:00<?, ?it/s]
Loading checkpoint shards: 20%|ββ | 1/5 [00:11<00:47, 11.79s/it]
Loading checkpoint shards: 20%|ββ | 1/5 [00:11<00:47, 11.79s/it]
Loading checkpoint shards: 20%|ββ | 1/5 [00:11<00:47, 11.79s/it]
Loading checkpoint shards: 20%|ββ | 1/5 [00:11<00:47, 11.79s/it]
Loading checkpoint shards: 40%|ββββ | 2/5 [00:22<00:32, 10.98s/it]
Loading checkpoint shards: 40%|ββββ | 2/5 [00:22<00:32, 10.97s/it]
Loading checkpoint shards: 40%|ββββ | 2/5 [00:22<00:32, 10.98s/it]
Loading checkpoint shards: 40%|ββββ | 2/5 [00:22<00:32, 10.98s/it]
Loading checkpoint shards: 60%|ββββββ | 3/5 [00:31<00:20, 10.27s/it]
Loading checkpoint shards: 60%|ββββοΏ½ |
|
|
1:
Loading checkpoint shards: 0%| | 0/5 [00:00<?, ?it/s]
Loading checkpoint shards: 0%| | 0/5 [00:00<?, ?it/s]
Loading checkpoint shards: 0%| | 0/5 [00:00<?, ?it/s]
Loading checkpoint shards: 0%| | 0/5 [00:00<?, ?it/s]
Loading checkpoint shards: 20%|ββ | 1/5 [00:11<00:47, 11.79s/it]
Loading checkpoint shards: 20%|ββ | 1/5 [00:11<00:47, 11.79s/it]
Loading checkpoint shards: 20%|ββ | 1/5 [00:11<00:47, 11.79s/it]
Loading checkpoint shards: 20%|ββ | 1/5 [00:11<00:47, 11.79s/it]
Loading checkpoint shards: 40%|ββββ | 2/5 [00:22<00:32, 10.98s/it]
Loading checkpoint shards: 40%|ββββ | 2/5 [00:22<00:32, 10.98s/it]
Loading checkpoint shards: 40%|ββββ | 2/5 [00:22<00:32, 10.97s/it]
Loading checkpoint shards: 40%|ββββ | 2/5 [00:22<00:32, 10.98s/it]
Loading checkpoint shards: 60%|ββββββ | 3/5 [00:31<00:20, 10.27s/it]
Loading checkpoint shards: 60%|ββββοΏ½ |
|
|
2:
Loading checkpoint shards: 0%| | 0/5 [00:00<?, ?it/s]
Loading checkpoint shards: 0%| | 0/5 [00:00<?, ?it/s]
Loading checkpoint shards: 0%| | 0/5 [00:00<?, ?it/s]
Loading checkpoint shards: 0%| | 0/5 [00:00<?, ?it/s]
Loading checkpoint shards: 20%|ββ | 1/5 [00:11<00:47, 11.79s/it]
Loading checkpoint shards: 20%|ββ | 1/5 [00:11<00:47, 11.79s/it]
Loading checkpoint shards: 20%|ββ | 1/5 [00:11<00:47, 11.79s/it]
Loading checkpoint shards: 20%|ββ | 1/5 [00:11<00:47, 11.79s/it]
Loading checkpoint shards: 40%|ββββ | 2/5 [00:22<00:32, 10.98s/it]
Loading checkpoint shards: 40%|ββββ | 2/5 [00:22<00:32, 10.98s/it]
Loading checkpoint shards: 40%|ββββ | 2/5 [00:22<00:32, 10.98s/it]
Loading checkpoint shards: 40%|ββββ | 2/5 [00:22<00:32, 10.98s/it]
Loading checkpoint shards: 60%|ββββββ | 3/5 [00:31<00:20, 10.27s/it]
Loading checkpoint shards: 60%|ββββοΏ½ |
|
|
0:
Loading checkpoint shards: 0%| | 0/5 [00:00<?, ?it/s]
Loading checkpoint shards: 0%| | 0/5 [00:00<?, ?it/s]
Loading checkpoint shards: 0%| | 0/5 [00:00<?, ?it/s]
Loading checkpoint shards: 0%| | 0/5 [00:00<?, ?it/s]
Loading checkpoint shards: 20%|ββ | 1/5 [00:11<00:47, 11.79s/it]
Loading checkpoint shards: 20%|ββ | 1/5 [00:11<00:47, 11.79s/it]
Loading checkpoint shards: 20%|ββ | 1/5 [00:11<00:47, 11.79s/it]
Loading checkpoint shards: 20%|ββ | 1/5 [00:15<01:00, 15.22s/it]
Loading checkpoint shards: 40%|ββββ | 2/5 [00:22<00:32, 10.98s/it]
Loading checkpoint shards: 40%|ββββ | 2/5 [00:22<00:32, 10.98s/it]
Loading checkpoint shards: 40%|ββββ | 2/5 [00:22<00:32, 10.97s/it]
Loading checkpoint shards: 40%|ββββ | 2/5 [00:25<00:36, 12.21s/it]
Loading checkpoint shards: 60%|ββββββ | 3/5 [00:31<00:20, 10.27s/it]
Loading checkpoint shards: 60%|ββββοΏ½ |
|
|
0: οΏ½β | 3/5 [00:31<00:20, 10.27s/it]
Loading checkpoint shards: 60%|ββββββ | 3/5 [00:31<00:20, 10.27s/it]
Loading checkpoint shards: 60%|ββββββ | 3/5 [00:35<00:22, 11.47s/it]
Loading checkpoint shards: 80%|ββββββββ | 4/5 [00:41<00:10, 10.19s/it]
Loading checkpoint shards: 80%|ββββββββ | 4/5 [00:41<00:10, 10.19s/it]
Loading checkpoint shards: 80%|ββββββββ | 4/5 [00:41<00:10, 10.19s/it]
Loading checkpoint shards: 80%|ββββββββ | 4/5 [00:45<00:10, 10.54s/it]
Loading checkpoint shards: 100%|ββββββββββ| 5/5 [00:51<00:00, 9.95s/it]
Loading checkpoint shards: 100%|ββββββββββ| 5/5 [00:51<00:00, 10.25s/it] |
|
|
3: οΏ½β | 3/5 [00:31<00:20, 10.27s/it]
Loading checkpoint shards: 60%|ββββββ | 3/5 [00:31<00:20, 10.27s/it]
Loading checkpoint shards: 60%|ββββββ | 3/5 [00:31<00:20, 10.27s/it]
Loading checkpoint shards: 80%|ββββββββ | 4/5 [00:41<00:10, 10.19s/it]
Loading checkpoint shards: 80%|ββββββββ | 4/5 [00:41<00:10, 10.19s/it]
Loading checkpoint shards: 80%|ββββββββ | 4/5 [00:41<00:10, 10.19s/it]
Loading checkpoint shards: 80%|ββββββββ | 4/5 [00:41<00:10, 10.19s/it]
Loading checkpoint shards: 100%|ββββββββββ| 5/5 [00:51<00:00, 9.95s/it]
Loading checkpoint shards: 100%|ββββββββββ| 5/5 [00:51<00:00, 9.95s/it]
Loading checkpoint shards: 100%|ββββββββββ| 5/5 [00:51<00:00, 9.95s/it]
Loading checkpoint shards: 100%|ββββββββββ| 5/5 [00:51<00:00, 10.25s/it] |
|
|
1: οΏ½β | 3/5 [00:31<00:20, 10.27s/it]
Loading checkpoint shards: 60%|ββββββ | 3/5 [00:31<00:20, 10.27s/it]
Loading checkpoint shards: 60%|ββββββ | 3/5 [00:31<00:20, 10.27s/it]
Loading checkpoint shards: 80%|ββββββββ | 4/5 [00:41<00:10, 10.19s/it]
Loading checkpoint shards: 80%|ββββββββ | 4/5 [00:41<00:10, 10.19s/it]
Loading checkpoint shards: 80%|ββββββββ | 4/5 [00:41<00:10, 10.19s/it]
Loading checkpoint shards: 80%|ββββββββ | 4/5 [00:41<00:10, 10.19s/it]
Loading checkpoint shards: 100%|ββββββββββ| 5/5 [00:51<00:00, 9.95s/it]
Loading checkpoint shards: 100%|ββββββββββ| 5/5 [00:51<00:00, 9.95s/it]
Loading checkpoint shards: 100%|ββββββββββ| 5/5 [00:51<00:00, 9.95s/it]
Loading checkpoint shards: 100%|ββββββββββ| 5/5 [00:51<00:00, 10.25s/it]
Loading checkpoint shards: 100%|ββββββββββ| 5/5 [00:51<00:00, 10.25 |
|
|
1: s/it] |
|
|
1: |
|
|
3:
Loading checkpoint shards: 100%|ββββββββββ| 5/5 [00:51<00:00, 10.25s/it]
Loading checkpoint shards: 100%|ββββββββββ| 5/5 [00:51<00:00, 10.25s/it] |
|
|
0:
Loading checkpoint shards: 100%|ββββββββββ| 5/5 [00:51<00:00, 9.95s/it]
Loading checkpoint shards: 100%|ββββββββββ| 5/5 [00:51<00:00, 9.95s/it]
Loading checkpoint shards: 100%|ββββββββββ| 5/5 [00:51<00:00, 10.25s/it] |
|
|
1:
Loading checkpoint shards: 100%|ββββββββββ| 5/5 [00:51<00:00, 10.25s/it] |
|
|
3: |
|
|
0:
Loading checkpoint shards: 100%|ββββββββββ| 5/5 [00:51<00:00, 10.25s/it] |
|
|
2: οΏ½β | 3/5 [00:31<00:20, 10.27s/it]
Loading checkpoint shards: 60%|ββββββ | 3/5 [00:31<00:20, 10.27s/it]
Loading checkpoint shards: 60%|ββββββ | 3/5 [00:31<00:20, 10.27s/it]
Loading checkpoint shards: 80%|ββββββββ | 4/5 [00:41<00:10, 10.19s/it]
Loading checkpoint shards: 80%|ββββββββ | 4/5 [00:41<00:10, 10.19s/it]
Loading checkpoint shards: 80%|ββββββββ | 4/5 [00:41<00:10, 10.19s/it]
Loading checkpoint shards: 80%|ββββββββ | 4/5 [00:41<00:10, 10.19s/it]
Loading checkpoint shards: 100%|ββββββββββ| 5/5 [00:51<00:00, 9.95s/it]
Loading checkpoint shards: 100%|ββββββββββ| 5/5 [00:51<00:00, 9.95s/it]
Loading checkpoint shards: 100%|ββββββββββ| 5/5 [00:51<00:00, 9.95s/it]
Loading checkpoint shards: 100%|ββββββββββ| 5/5 [00:51<00:00, 10.25s/it] |
|
|
3:
Loading checkpoint shards: 100%|ββββββββββ| 5/5 [00:51<00:00, 9.95s/it]
Loading checkpoint shards: 100%|ββββββββββ| 5/5 [00:51<00:00, 10.25s/it] |
|
|
2:
Loading checkpoint shards: 100%|ββββββββββ| 5/5 [00:51<00:00, 10.25s/it] |
|
|
2:
Loading checkpoint shards: 100%|ββββββββββ| 5/5 [00:51<00:00, 10.25s/it] |
|
|
2:
Loading checkpoint shards: 100%|ββββββββββ| 5/5 [00:51<00:00, 9.95s/it]
Loading checkpoint shards: 100%|ββββββββββ| 5/5 [00:51<00:00, 10.25s/it] |
|
|
1:
Loading checkpoint shards: 100%|ββββββββββ| 5/5 [00:51<00:00, 9.95s/it]
Loading checkpoint shards: 100%|ββββββββββ| 5/5 [00:51<00:00, 10.25s/it] |
|
|
0:
Loading checkpoint shards: 100%|ββββββββββ| 5/5 [00:54<00:00, 10.29s/it]
Loading checkpoint shards: 100%|ββββββββββ| 5/5 [00:54<00:00, 10.97s/it] |
|
|
0: [2025-11-24 00:10:19,017] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:345] [PID:3081979] [RANK:0] Converting modules to torch.bfloat16[39m |
|
|
0: [2025-11-24 00:10:22,748] [INFO] [axolotl.train.save_initial_configs:416] [PID:3081979] [RANK:0] Pre-saving tokenizer to /lustre/fswork/projects/rech/dgo/udv55np/ift/Nemotron-Super-49B-v1_5/gemma-3-12b/0...[39m |
|
|
0: [2025-11-24 00:10:23,317] [INFO] [axolotl.train.save_initial_configs:419] [PID:3081979] [RANK:0] Pre-saving model config to /lustre/fswork/projects/rech/dgo/udv55np/ift/Nemotron-Super-49B-v1_5/gemma-3-12b/0...[39m |
|
|
0: [2025-11-24 00:10:23,327] [INFO] [axolotl.train.save_initial_configs:423] [PID:3081979] [RANK:0] Pre-saving processor to /lustre/fswork/projects/rech/dgo/udv55np/ift/Nemotron-Super-49B-v1_5/gemma-3-12b/0...[39m |
|
|
0: [2025-11-24 00:10:26,392] [INFO] [axolotl.train.execute_training:203] [PID:3081979] [RANK:0] Starting trainer...[39m |
|
|
0: [2025-11-24 00:11:58,358] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:436] [PID:3081979] [RANK:0] gather_len_batches: [18976, 18976, 18976, 18976, 18976, 18976, 18976, 18976, 18976, 18976, 18976, 18976, 18976, 18976, 18976, 18976][39m |
|
|
0: Parameter Offload - Persistent parameters statistics: param_count = 563, numel = 1166448 |
|
|
0: {'loss': 0.6182, 'grad_norm': 2.8189747023390073, 'learning_rate': 3.62e-07, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.01} |
|
|
0:
0%| | 0/711 [00:00<?, ?it/s]
0%| | 1/711 [03:14<38:20:16, 194.39s/it]
0%| | 2/711 [03:19<16:22:13, 83.12s/it]
0%| | 3/711 [03:24<9:20:24, 47.49s/it]
1%| | 4/711 [03:29<6:02:21, 30.75s/it]
1%| | 5/711 [03:35<4:14:24, 21.62s/it]
1%| | 6/711 [03:40<3:09:35, 16.14s/it]
1%| | 7/711 [03:45<2:27:08, 12.54s/it]
1%| | 8/711 [03:50<1:59:06, 10.17s/it]
1%|β | 9/711 [03:56<1:40:22, 8.58s/it]
1%|β | 10/711 [04:01<1:27:41, 7.51s/it]
1%|β | 10/711 [04:01<1:27:41, 7.51s/it]
2%|β | 11/711 [04:06<1:18:52, 6.76s/it]
2%|β | 12/711 [04:11<1:12:59, 6.27s/it]
2%|β | 13/711 [04:16<1:08:45, 5.91s/it]
2%|β | 14/711 [04:21<1:06:01, 5.68s/it]
2%|β | 15/711 [04:26<1:03:51, 5.51s/it]
2%|β | 16/711 [04:31<1:02:15, 5.38s/it]
2%|β | 17/711 [04:37<1:01:45, 5.34s/i |
|
|
0: {'loss': 0.5822, 'grad_norm': 1.7276350224873818, 'learning_rate': 5.420000000000001e-07, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.02} |
|
|
0: {'loss': 0.5571, 'grad_norm': 2.161001413543057, 'learning_rate': 7.219999999999999e-07, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.03} |
|
|
0: t]
3%|β | 18/711 [04:42<1:01:12, 5.30s/it]
3%|β | 19/711 [04:47<1:00:37, 5.26s/it]
3%|β | 20/711 [04:52<1:00:10, 5.23s/it]
3%|β | 20/711 [04:52<1:00:10, 5.23s/it]
3%|β | 21/711 [04:57<59:44, 5.19s/it]
3%|β | 22/711 [05:02<59:22, 5.17s/it]
3%|β | 23/711 [05:07<59:00, 5.15s/it]
3%|β | 24/711 [05:13<58:57, 5.15s/it]
4%|β | 25/711 [05:18<59:09, 5.17s/it]
4%|β | 26/711 [05:23<58:51, 5.16s/it]
4%|β | 27/711 [05:28<58:27, 5.13s/it]
4%|β | 28/711 [05:33<58:17, 5.12s/it]
4%|β | 29/711 [05:38<58:07, 5.11s/it]
4%|β | 30/711 [05:44<59:10, 5.21s/it]
4%|β | 30/711 [05:44<59:10, 5.21s/it]
4%|β | 31/711 [05:49<58:50, 5.19s/it]
5%|β | 32/711 [05:54<58:36, 5.18s/it]
5%|β | 33/711 [05:59<59:31, 5 |
|
|
0: {'loss': 0.5218, 'grad_norm': 1.0906530849609661, 'learning_rate': 9.020000000000001e-07, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.03} |
|
|
0: .27s/it]
5%|β | 34/711 [06:04<58:50, 5.22s/it]
5%|β | 35/711 [06:10<58:21, 5.18s/it]
5%|β | 36/711 [06:15<57:59, 5.16s/it]
5%|β | 37/711 [06:20<57:35, 5.13s/it]
5%|β | 38/711 [06:25<57:22, 5.12s/it]
5%|β | 39/711 [06:30<57:16, 5.11s/it]
6%|β | 40/711 [06:35<57:20, 5.13s/it]
6%|β | 40/711 [06:35<57:20, 5.13s/it]
6%|β | 41/711 [06:40<57:23, 5.14s/it]
6%|β | 42/711 [06:45<57:09, 5.13s/it]
6%|β | 43/711 [06:51<57:31, 5.17s/it]
6%|β | 44/711 [06:56<57:25, 5.17s/it]
6%|β | 45/711 [07:01<57:08, 5.15s/it]
6%|β | 46/711 [07:06<57:01, 5.15s/it]
7%|β | 47/711 [07:11<56:53, 5.14s/it]
7%|β | 48/711 [07:16<57:06, 5.17s/it]
7%|β | 49/711 [07:22<57:03, 5.17s/it]
7%|β | 50/711 [07:27<57:26, 5.21s/it]
|
|
|
0: {'loss': 0.4959, 'grad_norm': 0.8804401875885586, 'learning_rate': 1.082e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.04} |
|
|
0: {'loss': 0.4729, 'grad_norm': 1.7572079203155466, 'learning_rate': 1.262e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.05} |
|
|
0:
7%|β | 50/711 [07:27<57:26, 5.21s/it]
7%|β | 51/711 [07:32<57:06, 5.19s/it]
7%|β | 52/711 [07:37<57:02, 5.19s/it]
7%|β | 53/711 [07:43<57:29, 5.24s/it]
8%|β | 54/711 [07:48<58:04, 5.30s/it]
8%|β | 55/711 [07:53<57:20, 5.25s/it]
8%|β | 56/711 [07:59<57:54, 5.30s/it]
8%|β | 57/711 [08:04<57:10, 5.25s/it]
8%|β | 58/711 [08:09<57:55, 5.32s/it]
8%|β | 59/711 [08:14<57:11, 5.26s/it]
8%|β | 60/711 [08:19<56:30, 5.21s/it]
8%|β | 60/711 [08:19<56:30, 5.21s/it]
9%|β | 61/711 [08:25<56:35, 5.22s/it]
9%|β | 62/711 [08:30<56:18, 5.21s/it]
9%|β | 63/711 [08:35<57:08, 5.29s/it]
9%|β | 64/711 [08:41<56:59, 5.28s/it]
9%|β | 65/711 [08:46<56:22, 5.24s/it]
9%|β | 66/711 [08:51<56:36, 5.27s/it]
9%|β | 67/711 [08:56<56:06, 5.23s/it]
1 |
|
|
0: {'loss': 0.4737, 'grad_norm': 1.0832691281380182, 'learning_rate': 1.442e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.06} |
|
|
0: {'loss': 0.4648, 'grad_norm': 0.9351167776948649, 'learning_rate': 1.622e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.07} |
|
|
0: 0%|β | 68/711 [09:01<55:32, 5.18s/it]
10%|β | 69/711 [09:06<55:12, 5.16s/it]
10%|β | 70/711 [09:12<55:17, 5.18s/it]
10%|β | 70/711 [09:12<55:17, 5.18s/it]
10%|β | 71/711 [09:17<54:55, 5.15s/it]
10%|β | 72/711 [09:22<54:39, 5.13s/it]
10%|β | 73/711 [09:27<54:25, 5.12s/it]
10%|β | 74/711 [09:32<54:18, 5.12s/it]
11%|β | 75/711 [09:37<54:05, 5.10s/it]
11%|β | 76/711 [09:42<54:01, 5.11s/it]
11%|β | 77/711 [09:47<53:56, 5.11s/it]
11%|β | 78/711 [09:52<54:06, 5.13s/it]
11%|β | 79/711 [09:57<53:56, 5.12s/it]
11%|ββ | 80/711 [10:03<54:04, 5.14s/it]
11%|ββ | 80/711 [10:03<54:04, 5.14s/it]
11%|ββ | 81/711 [10:08<55:07, 5.25s/it]
12%|ββ | 82/711 [10:13<54:50, 5.23s/it]
12%|ββ | 83/711 [10:19<55:29, 5.30s/it |
|
|
0: {'loss': 0.4437, 'grad_norm': 1.0944333242533355, 'learning_rate': 1.802e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.08} |
|
|
0: ]
12%|ββ | 84/711 [10:24<54:57, 5.26s/it]
12%|ββ | 85/711 [10:29<54:21, 5.21s/it]
12%|ββ | 86/711 [10:35<55:05, 5.29s/it]
12%|ββ | 87/711 [10:40<54:33, 5.25s/it]
12%|ββ | 88/711 [10:45<54:02, 5.20s/it]
13%|ββ | 89/711 [10:50<53:35, 5.17s/it]
13%|ββ | 90/711 [10:55<53:14, 5.14s/it]
13%|ββ | 90/711 [10:55<53:14, 5.14s/it]
13%|ββ | 91/711 [11:00<52:59, 5.13s/it]
13%|ββ | 92/711 [11:05<53:42, 5.21s/it]
13%|ββ | 93/711 [11:11<54:32, 5.30s/it]
13%|ββ | 94/711 [11:16<54:50, 5.33s/it]
13%|ββ | 95/711 [11:22<55:36, 5.42s/it]
14%|ββ | 96/711 [11:27<55:11, 5.38s/it]
14%|ββ | 97/711 [11:32<54:13, 5.30s/it]
14%|ββ | 98/711 [11:38<54:39, 5.35s/it]
14%|ββ | 99/711 [11:43<53:58, 5.29s/it]
14%|ββ | 100/711 [11:48<53:15, 5.23s/it]
|
|
|
0: {'loss': 0.4312, 'grad_norm': 0.821415164120209, 'learning_rate': 1.982e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.08} |
|
|
0: {'loss': 0.4519, 'grad_norm': 1.098049116364939, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.09} |
|
|
0:
14%|ββ | 100/711 [11:48<53:15, 5.23s/it]
14%|ββ | 101/711 [11:53<52:46, 5.19s/it]
14%|ββ | 102/711 [11:58<52:21, 5.16s/it]
14%|ββ | 103/711 [12:03<52:00, 5.13s/it]
15%|ββ | 104/711 [12:08<51:44, 5.11s/it]
15%|ββ | 105/711 [12:14<51:39, 5.11s/it]
15%|ββ | 106/711 [12:19<51:43, 5.13s/it]
15%|ββ | 107/711 [12:24<51:34, 5.12s/it]
15%|ββ | 108/711 [12:29<51:22, 5.11s/it]
15%|ββ | 109/711 [12:34<51:11, 5.10s/it]
15%|ββ | 110/711 [12:39<51:14, 5.12s/it]
15%|ββ | 110/711 [12:39<51:14, 5.12s/it]
16%|ββ | 111/711 [12:44<51:43, 5.17s/it]
16%|ββ | 112/711 [12:50<51:33, 5.16s/it]
16%|ββ | 113/711 [12:55<52:08, 5.23s/it]
16%|ββ | 114/711 [13:00<51:48, 5.21s/it]
16%|ββ | 115/711 [13:05<51:23, 5.17s/it]
16%|ββ | |
|
|
0: {'loss': 0.4418, 'grad_norm': 0.8654847799165983, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.1} |
|
|
0: {'loss': 0.4272, 'grad_norm': 0.8743836149172823, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.11} |
|
|
0: 116/711 [13:10<51:17, 5.17s/it]
16%|ββ | 117/711 [13:16<51:03, 5.16s/it]
17%|ββ | 118/711 [13:21<50:59, 5.16s/it]
17%|ββ | 119/711 [13:26<51:20, 5.20s/it]
17%|ββ | 120/711 [13:31<50:54, 5.17s/it]
17%|ββ | 120/711 [13:31<50:54, 5.17s/it]
17%|ββ | 121/711 [13:36<50:42, 5.16s/it]
17%|ββ | 122/711 [13:41<50:37, 5.16s/it]
17%|ββ | 123/711 [13:47<50:33, 5.16s/it]
17%|ββ | 124/711 [13:52<50:44, 5.19s/it]
18%|ββ | 125/711 [13:57<51:31, 5.28s/it]
18%|ββ | 126/711 [14:03<52:11, 5.35s/it]
18%|ββ | 127/711 [14:08<52:12, 5.36s/it]
18%|ββ | 128/711 [14:14<52:10, 5.37s/it]
18%|ββ | 129/711 [14:19<51:21, 5.29s/it]
18%|ββ | 130/711 [14:24<50:50, 5.25s/it]
18%|ββ | 130/711 [14:24<50:50, 5.25s/it]
18%|ββ | 131/ |
|
|
0: {'loss': 0.4317, 'grad_norm': 0.886837889977122, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.12} |
|
|
0: 711 [14:29<50:24, 5.22s/it]
19%|ββ | 132/711 [14:34<50:04, 5.19s/it]
19%|ββ | 133/711 [14:39<50:21, 5.23s/it]
19%|ββ | 134/711 [14:45<49:58, 5.20s/it]
19%|ββ | 135/711 [14:50<49:37, 5.17s/it]
19%|ββ | 136/711 [14:55<49:54, 5.21s/it]
19%|ββ | 137/711 [15:00<49:29, 5.17s/it]
19%|ββ | 138/711 [15:05<49:14, 5.16s/it]
20%|ββ | 139/711 [15:10<49:05, 5.15s/it]
20%|ββ | 140/711 [15:16<49:19, 5.18s/it]
20%|ββ | 140/711 [15:16<49:19, 5.18s/it]
20%|ββ | 141/711 [15:21<49:04, 5.17s/it]
20%|ββ | 142/711 [15:26<48:43, 5.14s/it]
20%|ββ | 143/711 [15:31<48:31, 5.13s/it]
20%|ββ | 144/711 [15:36<48:59, 5.19s/it]
20%|ββ | 145/711 [15:41<48:54, 5.19s/it]
21%|ββ | 146/711 [15:46<48:39, 5.17s/it]
21%|ββ | 147/711 [15:52<48:22, 5.15s/it]
21%|ββ | 148/7 |
|
|
0: {'loss': 0.4309, 'grad_norm': 1.0717708423744885, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.13} |
|
|
0: {'loss': 0.4316, 'grad_norm': 0.8573484702226136, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.13} |
|
|
0: 11 [15:57<48:11, 5.14s/it]
21%|ββ | 149/711 [16:02<48:06, 5.14s/it]
21%|ββ | 150/711 [16:07<48:06, 5.14s/it]
21%|ββ | 150/711 [16:07<48:06, 5.14s/it]
21%|ββ | 151/711 [16:12<48:00, 5.14s/it]
21%|βββ | 152/711 [16:17<48:35, 5.22s/it]
22%|βββ | 153/711 [16:23<48:21, 5.20s/it]
22%|βββ | 154/711 [16:28<48:00, 5.17s/it]
22%|βββ | 155/711 [16:33<47:54, 5.17s/it]
22%|βββ | 156/711 [16:38<47:42, 5.16s/it]
22%|βββ | 157/711 [16:43<47:36, 5.16s/it]
22%|βββ | 158/711 [16:48<47:27, 5.15s/it]
22%|βββ | 159/711 [16:54<48:09, 5.23s/it]
23%|βββ | 160/711 [16:59<47:40, 5.19s/it]
23%|βββ | 160/711 [16:59<47:40, 5.19s/it]
23%|βββ | 161/711 [17:04<47:17, 5.16s/it]
23%|βββ | 162/711 [17:09<47:17, 5.17s/it]
23%|οΏ½ |
|
|
0: {'loss': 0.4239, 'grad_norm': 0.8728825320101697, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.14} |
|
|
0: οΏ½οΏ½ββ | 163/711 [17:14<46:56, 5.14s/it]
23%|βββ | 164/711 [17:20<47:49, 5.25s/it]
23%|βββ | 165/711 [17:25<47:18, 5.20s/it]
23%|βββ | 166/711 [17:30<47:54, 5.27s/it]
23%|βββ | 167/711 [17:35<47:21, 5.22s/it]
24%|βββ | 168/711 [17:41<47:10, 5.21s/it]
24%|βββ | 169/711 [17:46<46:57, 5.20s/it]
24%|βββ | 170/711 [17:51<46:38, 5.17s/it]
24%|βββ | 170/711 [17:51<46:38, 5.17s/it]
24%|βββ | 171/711 [17:56<46:21, 5.15s/it]
24%|βββ | 172/711 [18:01<46:17, 5.15s/it]
24%|βββ | 173/711 [18:06<46:18, 5.16s/it]
24%|βββ | 174/711 [18:11<46:01, 5.14s/it]
25%|βββ | 175/711 [18:16<45:50, 5.13s/it]
25%|βββ | 176/711 [18:22<45:49, 5.14s/it]
25%|βββ | 177/711 [18:27<45:49, 5.15s/it]
25%|βββ | 178/711 [18:32<46:45, 5.26s/it]
25%|βββ | 179/ |
|
|
0: {'loss': 0.4173, 'grad_norm': 2.470513995230686, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.15} |
|
|
0: {'loss': 0.4151, 'grad_norm': 0.9038938137872402, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.16} |
|
|
0: 711 [18:37<46:12, 5.21s/it]
25%|βββ | 180/711 [18:43<45:49, 5.18s/it]
25%|βββ | 180/711 [18:43<45:49, 5.18s/it]
25%|βββ | 181/711 [18:48<45:33, 5.16s/it]
26%|βββ | 182/711 [18:53<45:22, 5.15s/it]
26%|βββ | 183/711 [18:58<45:10, 5.13s/it]
26%|βββ | 184/711 [19:03<45:33, 5.19s/it]
26%|βββ | 185/711 [19:08<45:10, 5.15s/it]
26%|βββ | 186/711 [19:13<44:56, 5.14s/it]
26%|βββ | 187/711 [19:18<44:44, 5.12s/it]
26%|βββ | 188/711 [19:24<45:35, 5.23s/it]
27%|βββ | 189/711 [19:29<45:10, 5.19s/it]
27%|βββ | 190/711 [19:34<44:59, 5.18s/it]
27%|βββ | 190/711 [19:34<44:59, 5.18s/it]
27%|βββ | 191/711 [19:39<44:41, 5.16s/it]
27%|βββ | 192/711 [19:44<44:29, 5.14s/it]
27%|βββ | 193/711 [19:50<44:25, 5.15s/i |
|
|
0: {'loss': 0.4194, 'grad_norm': 2.3527260378633015, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.17} |
|
|
0: t]
27%|βββ | 194/711 [19:55<44:10, 5.13s/it]
27%|βββ | 195/711 [20:00<44:25, 5.17s/it]
28%|βββ | 196/711 [20:05<44:10, 5.15s/it]
28%|βββ | 197/711 [20:10<44:27, 5.19s/it]
28%|βββ | 198/711 [20:16<45:08, 5.28s/it]
28%|βββ | 199/711 [20:21<44:34, 5.22s/it]
28%|βββ | 200/711 [20:26<44:17, 5.20s/it]
28%|βββ | 200/711 [20:26<44:17, 5.20s/it]
28%|βββ | 201/711 [20:31<43:55, 5.17s/it]
28%|βββ | 202/711 [20:36<43:35, 5.14s/it]
29%|βββ | 203/711 [20:41<43:37, 5.15s/it]
29%|βββ | 204/711 [20:47<44:14, 5.24s/it]
29%|βββ | 205/711 [20:52<44:01, 5.22s/it]
29%|βββ | 206/711 [20:57<43:43, 5.20s/it]
29%|βββ | 207/711 [21:02<43:30, 5.18s/it]
29%|βββ | 208/711 [21:07<43:14, 5.16s/it]
29%|βββ | 209/711 [21:12<43:02, 5.15s/it]
30%|βββ |
|
|
0: {'loss': 0.413, 'grad_norm': 0.893185793908122, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.18} |
|
|
0: {'loss': 0.4217, 'grad_norm': 1.160958862723743, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.19} |
|
|
0: | 210/711 [21:18<42:50, 5.13s/it]
30%|βββ | 210/711 [21:18<42:50, 5.13s/it]
30%|βββ | 211/711 [21:23<42:40, 5.12s/it]
30%|βββ | 212/711 [21:28<42:38, 5.13s/it]
30%|βββ | 213/711 [21:33<42:38, 5.14s/it]
30%|βββ | 214/711 [21:38<42:36, 5.14s/it]
30%|βββ | 215/711 [21:43<42:24, 5.13s/it]
30%|βββ | 216/711 [21:48<42:15, 5.12s/it]
31%|βββ | 217/711 [21:53<42:11, 5.12s/it]
31%|βββ | 218/711 [21:59<42:01, 5.12s/it]
31%|βββ | 219/711 [22:04<42:01, 5.12s/it]
31%|βββ | 220/711 [22:09<41:50, 5.11s/it]
31%|βββ | 220/711 [22:09<41:50, 5.11s/it]
31%|βββ | 221/711 [22:14<42:03, 5.15s/it]
31%|βββ | 222/711 [22:19<42:36, 5.23s/it]
31%|ββββ | 223/711 [22:25<42:39, 5.25s/it]
32%|ββββ | 224/711 [22:30<42 |
|
|
0: {'loss': 0.4102, 'grad_norm': 0.8461972280700218, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.19} |
|
|
0: :22, 5.22s/it]
32%|ββββ | 225/711 [22:35<42:13, 5.21s/it]
32%|ββββ | 226/711 [22:40<42:40, 5.28s/it]
32%|ββββ | 227/711 [22:46<42:11, 5.23s/it]
32%|ββββ | 228/711 [22:51<41:45, 5.19s/it]
32%|ββββ | 229/711 [22:56<41:33, 5.17s/it]
32%|ββββ | 230/711 [23:01<41:16, 5.15s/it]
32%|ββββ | 230/711 [23:01<41:16, 5.15s/it]
32%|ββββ | 231/711 [23:06<41:02, 5.13s/it]
33%|ββββ | 232/711 [23:11<40:52, 5.12s/it]
33%|ββββ | 233/711 [23:16<40:57, 5.14s/it]
33%|ββββ | 234/711 [23:21<40:58, 5.15s/it]
33%|ββββ | 235/711 [23:27<40:52, 5.15s/it]
33%|ββββ | 236/711 [23:32<40:41, 5.14s/it]
33%|ββββ | 237/711 [23:37<40:32, 5.13s/it]
33%|ββββ | 238/711 [23:42<40:21, 5.12s/it]
34%|ββββ | 239/711 [23:47<40:22, 5.13s/it]
34%|ββββ | 240/ |
|
|
0: {'loss': 0.4155, 'grad_norm': 0.8314605620161184, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.2} |
|
|
0: {'loss': 0.4045, 'grad_norm': 0.8328744939735258, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.21} |
|
|
0: 711 [23:52<40:14, 5.13s/it]
34%|ββββ | 240/711 [23:52<40:14, 5.13s/it]
34%|ββββ | 241/711 [23:57<40:04, 5.12s/it]
34%|ββββ | 242/711 [24:03<40:30, 5.18s/it]
34%|ββββ | 243/711 [24:08<40:55, 5.25s/it]
34%|ββββ | 244/711 [24:13<40:38, 5.22s/it]
34%|ββββ | 245/711 [24:18<40:16, 5.18s/it]
35%|ββββ | 246/711 [24:23<40:06, 5.18s/it]
35%|ββββ | 247/711 [24:29<40:20, 5.22s/it]
35%|ββββ | 248/711 [24:34<39:58, 5.18s/it]
35%|ββββ | 249/711 [24:39<39:40, 5.15s/it]
35%|ββββ | 250/711 [24:44<39:34, 5.15s/it]
35%|ββββ | 250/711 [24:44<39:34, 5.15s/it]
35%|ββββ | 251/711 [24:49<39:23, 5.14s/it]
35%|ββββ | 252/711 [24:54<39:08, 5.12s/it]
36%|ββββ | 253/711 [24:59<39:13, 5.14s/it]
36%|ββββ |
|
|
0: {'loss': 0.4005, 'grad_norm': 0.8810433727017853, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.22} |
|
|
0: | 254/711 [25:05<39:10, 5.14s/it]
36%|ββββ | 255/711 [25:10<39:07, 5.15s/it]
36%|ββββ | 256/711 [25:15<39:17, 5.18s/it]
36%|ββββ | 257/711 [25:20<38:57, 5.15s/it]
36%|ββββ | 258/711 [25:26<39:48, 5.27s/it]
36%|ββββ | 259/711 [25:31<39:17, 5.22s/it]
37%|ββββ | 260/711 [25:36<38:58, 5.19s/it]
37%|ββββ | 260/711 [25:36<38:58, 5.19s/it]
37%|ββββ | 261/711 [25:41<38:40, 5.16s/it]
37%|ββββ | 262/711 [25:46<38:26, 5.14s/it]
37%|ββββ | 263/711 [25:51<38:26, 5.15s/it]
37%|ββββ | 264/711 [25:56<38:28, 5.16s/it]
37%|ββββ | 265/711 [26:02<38:16, 5.15s/it]
37%|ββββ | 266/711 [26:07<38:12, 5.15s/it]
38%|ββββ | 267/711 [26:12<38:50, 5.25s/it]
38%|ββββ | 268/711 [26:18<39:17, 5.32s/it]
38%|ββββ | 269/711 [26:23<38:44, 5.26s/it]
38%|βοΏ½ |
|
|
0: {'loss': 0.4021, 'grad_norm': 1.0060252029086465, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.23} |
|
|
0: [2025-11-24 00:38:52,072] [WARNING] [stage3.py:2150:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time |
|
|
0: {'loss': 0.4124, 'grad_norm': 0.9014415482740915, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.24} |
|
|
0: οΏ½ββ | 270/711 [26:28<38:17, 5.21s/it]
38%|ββββ | 270/711 [26:28<38:17, 5.21s/it]
38%|ββββ | 271/711 [26:33<37:58, 5.18s/it]
38%|ββββ | 272/711 [26:38<37:42, 5.15s/it]
38%|ββββ | 273/711 [26:43<37:32, 5.14s/it]
39%|ββββ | 274/711 [26:49<39:58, 5.49s/it]
39%|ββββ | 275/711 [26:55<39:08, 5.39s/it]
39%|ββββ | 276/711 [27:00<38:28, 5.31s/it]
39%|ββββ | 277/711 [27:05<38:44, 5.36s/it]
39%|ββββ | 278/711 [27:10<38:16, 5.30s/it]
39%|ββββ | 279/711 [27:16<37:45, 5.24s/it]
39%|ββββ | 280/711 [27:21<37:26, 5.21s/it]
39%|ββββ | 280/711 [27:21<37:26, 5.21s/it]
40%|ββββ | 281/711 [27:26<38:16, 5.34s/it]
40%|ββββ | 282/711 [27:31<37:40, 5.27s/it]
40%|ββββ | 283/711 [27:37<37:37, 5.28s/it]
40% |
|
|
0: {'loss': 0.3928, 'grad_norm': 1.1303634088009527, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.24} |
|
|
0: |ββββ | 284/711 [27:42<37:57, 5.33s/it]
40%|ββββ | 285/711 [27:47<37:34, 5.29s/it]
40%|ββββ | 286/711 [27:53<37:14, 5.26s/it]
40%|ββββ | 287/711 [27:58<37:31, 5.31s/it]
41%|ββββ | 288/711 [28:03<37:04, 5.26s/it]
41%|ββββ | 289/711 [28:08<36:40, 5.21s/it]
41%|ββββ | 290/711 [28:13<36:19, 5.18s/it]
41%|ββββ | 290/711 [28:13<36:19, 5.18s/it]
41%|ββββ | 291/711 [28:19<36:32, 5.22s/it]
41%|ββββ | 292/711 [28:24<36:57, 5.29s/it]
41%|ββββ | 293/711 [28:29<36:57, 5.31s/it]
41%|βββββ | 294/711 [28:35<36:36, 5.27s/it]
41%|βββββ | 295/711 [28:40<36:18, 5.24s/it]
42%|βββββ | 296/711 [28:45<35:59, 5.20s/it]
42%|βββββ | 297/711 [28:50<36:22, 5.27s/it]
42%|βββββ | 298/711 [28:55<36:02, 5.24s/it]
42%|βββββ | 299/711 [29: |
|
|
0: {'loss': 0.4025, 'grad_norm': 0.8789278025527175, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.25} |
|
|
0: {'loss': 0.4015, 'grad_norm': 0.7615557087401322, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.26} |
|
|
0: 01<35:58, 5.24s/it]
42%|βββββ | 300/711 [29:06<36:01, 5.26s/it]
42%|βββββ | 300/711 [29:06<36:01, 5.26s/it]
42%|βββββ | 301/711 [29:11<35:42, 5.23s/it]
42%|βββββ | 302/711 [29:16<35:28, 5.20s/it]
43%|βββββ | 303/711 [29:21<35:11, 5.17s/it]
43%|βββββ | 304/711 [29:27<35:03, 5.17s/it]
43%|βββββ | 305/711 [29:32<35:16, 5.21s/it]
43%|βββββ | 306/711 [29:37<34:57, 5.18s/it]
43%|βββββ | 307/711 [29:42<34:42, 5.15s/it]
43%|βββββ | 308/711 [29:47<34:33, 5.15s/it]
43%|βββββ | 309/711 [29:53<35:23, 5.28s/it]
44%|βββββ | 310/711 [29:59<36:23, 5.45s/it]
44%|βββββ | 310/711 [29:59<36:23, 5.45s/it]
44%|βββββ | 311/711 [30:04<35:40, 5.35s/it]
44%|βββββ | 312/711 [30:09<35:47, 5.38s/it]
|
|
|
0: {'loss': 0.4047, 'grad_norm': 1.0096950251075136, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.27} |
|
|
0: 44%|βββββ | 313/711 [30:14<35:07, 5.30s/it]
44%|βββββ | 314/711 [30:20<35:18, 5.34s/it]
44%|βββββ | 315/711 [30:25<35:09, 5.33s/it]
44%|βββββ | 316/711 [30:30<34:42, 5.27s/it]
45%|βββββ | 317/711 [30:35<34:17, 5.22s/it]
45%|βββββ | 318/711 [30:40<34:02, 5.20s/it]
45%|βββββ | 319/711 [30:46<33:48, 5.18s/it]
45%|βββββ | 320/711 [30:51<33:36, 5.16s/it]
45%|βββββ | 320/711 [30:51<33:36, 5.16s/it]
45%|βββββ | 321/711 [30:56<33:30, 5.16s/it]
45%|βββββ | 322/711 [31:01<33:19, 5.14s/it]
45%|βββββ | 323/711 [31:06<33:37, 5.20s/it]
46%|βββββ | 324/711 [31:11<33:25, 5.18s/it]
46%|βββββ | 325/711 [31:17<33:17, 5.17s/it]
46%|βββββ | 326/711 [31:22<33:09, 5.17s/it]
46%|βββββ | 327/711 [31:27<33:08, 5.18s/it]
46%|βββ |
|
|
0: {'loss': 0.4025, 'grad_norm': 0.9227721040091849, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.28} |
|
|
0: {'loss': 0.3987, 'grad_norm': 1.8038936518267323, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.29} |
|
|
0: ββ | 328/711 [31:32<32:55, 5.16s/it]
46%|βββββ | 329/711 [31:37<32:44, 5.14s/it]
46%|βββββ | 330/711 [31:42<32:58, 5.19s/it]
46%|βββββ | 330/711 [31:42<32:58, 5.19s/it]
47%|βββββ | 331/711 [31:48<32:46, 5.18s/it]
47%|βββββ | 332/711 [31:53<32:35, 5.16s/it]
47%|βββββ | 333/711 [31:58<33:05, 5.25s/it]
47%|βββββ | 334/711 [32:03<32:47, 5.22s/it]
47%|βββββ | 335/711 [32:09<33:10, 5.29s/it]
47%|βββββ | 336/711 [32:14<32:41, 5.23s/it]
47%|βββββ | 337/711 [32:19<32:21, 5.19s/it]
48%|βββββ | 338/711 [32:24<32:19, 5.20s/it]
48%|βββββ | 339/711 [32:29<32:05, 5.18s/it]
48%|βββββ | 340/711 [32:34<31:51, 5.15s/it]
48%|βββββ | 340/711 [32:34<31:51, 5.15s/it]
48%|βββββ | 341/711 [ |
|
|
0: {'loss': 0.4004, 'grad_norm': 0.8530478906547682, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.3} |
|
|
0: 32:40<31:54, 5.17s/it]
48%|βββββ | 342/711 [32:45<31:45, 5.16s/it]
48%|βββββ | 343/711 [32:50<31:37, 5.16s/it]
48%|βββββ | 344/711 [32:55<31:44, 5.19s/it]
49%|βββββ | 345/711 [33:00<31:29, 5.16s/it]
49%|βββββ | 346/711 [33:05<31:18, 5.15s/it]
49%|βββββ | 347/711 [33:10<31:07, 5.13s/it]
49%|βββββ | 348/711 [33:16<30:59, 5.12s/it]
49%|βββββ | 349/711 [33:21<30:57, 5.13s/it]
49%|βββββ | 350/711 [33:26<30:52, 5.13s/it]
49%|βββββ | 350/711 [33:26<30:52, 5.13s/it]
49%|βββββ | 351/711 [33:31<30:45, 5.13s/it]
50%|βββββ | 352/711 [33:36<30:36, 5.12s/it]
50%|βββββ | 353/711 [33:41<30:32, 5.12s/it]
50%|βββββ | 354/711 [33:46<30:26, 5.12s/it]
50%|βββββ | 355/711 [33:51<30:20, 5.11s/it]
50%|βββββ | 356/711 [33:57<30:13, |
|
|
0: {'loss': 0.4055, 'grad_norm': 0.8072887895552343, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.3} |
|
|
0: 5.11s/it]
50%|βββββ | 357/711 [34:02<30:10, 5.11s/it]
50%|βββββ | 358/711 [34:07<30:07, 5.12s/it]
50%|βββββ | 359/711 [34:12<30:03, 5.12s/it]
51%|βββββ | 360/711 [34:17<29:57, 5.12s/it]
51%|βββββ | 360/711 [34:17<29:57, 5.12s/it]
51%|βββββ | 361/711 [34:22<29:53, 5.12s/it]
51%|βββββ | 362/711 [34:27<29:46, 5.12s/it]
51%|βββββ | 363/711 [34:32<29:41, 5.12s/it]
51%|βββββ | 364/711 [34:38<29:39, 5.13s/it]
51%|ββββββ | 365/711 [34:43<29:59, 5.20s/it]
51%|ββββββ | 366/711 [34:48<29:44, 5.17s/it]
52%|ββββββ | 367/711 [34:53<29:30, 5.15s/it]
52%|ββββββ | 368/711 [34:58<29:27, 5.15s/it]
52%|ββββββ | 369/711 [35:03<29:14, 5.13s/it]
52%|ββββββ | 370/711 [35:09<29:11, 5.14s/it]
|
|
|
0: {'loss': 0.4024, 'grad_norm': 0.8486839849343547, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.31} |
|
|
0: {'loss': 0.4021, 'grad_norm': 0.8529581759108179, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.32} |
|
|
0:
52%|ββββββ | 370/711 [35:09<29:11, 5.14s/it]
52%|ββββββ | 371/711 [35:14<29:01, 5.12s/it]
52%|ββββββ | 372/711 [35:19<28:54, 5.12s/it]
52%|ββββββ | 373/711 [35:24<29:01, 5.15s/it]
53%|ββββββ | 374/711 [35:29<28:53, 5.14s/it]
53%|ββββββ | 375/711 [35:34<28:43, 5.13s/it]
53%|ββββββ | 376/711 [35:39<28:33, 5.11s/it]
53%|ββββββ | 377/711 [35:44<28:25, 5.11s/it]
53%|ββββββ | 378/711 [35:49<28:21, 5.11s/it]
53%|ββββββ | 379/711 [35:55<28:52, 5.22s/it]
53%|ββββββ | 380/711 [36:00<28:33, 5.18s/it]
53%|ββββββ | 380/711 [36:00<28:33, 5.18s/it]
54%|ββββββ | 381/711 [36:05<28:53, 5.25s/it]
54%|ββββββ | 382/711 [36:11<28:32, 5.21s/it]
54%|ββββββ | 383/711 [36:16<28:38, 5.24s/it]
54%|ββββββ | 384/711 [36:2 |
|
|
0: {'loss': 0.3972, 'grad_norm': 0.8357610935717936, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.33} |
|
|
0: 1<28:17, 5.19s/it]
54%|ββββββ | 385/711 [36:26<28:02, 5.16s/it]
54%|ββββββ | 386/711 [36:31<27:52, 5.15s/it]
54%|ββββββ | 387/711 [36:36<27:46, 5.14s/it]
55%|ββββββ | 388/711 [36:41<27:35, 5.12s/it]
55%|ββββββ | 389/711 [36:47<27:35, 5.14s/it]
55%|ββββββ | 390/711 [36:52<27:38, 5.17s/it]
55%|ββββββ | 390/711 [36:52<27:38, 5.17s/it]
55%|ββββββ | 391/711 [36:57<27:26, 5.14s/it]
55%|ββββββ | 392/711 [37:02<27:35, 5.19s/it]
55%|ββββββ | 393/711 [37:07<27:27, 5.18s/it]
55%|ββββββ | 394/711 [37:12<27:12, 5.15s/it]
56%|ββββββ | 395/711 [37:18<27:39, 5.25s/it]
56%|ββββββ | 396/711 [37:23<27:19, 5.20s/it]
56%|ββββββ | 397/711 [37:28<27:03, 5.17s/it]
56%|ββββββ | 398/711 [37:33<26:52, 5.15s/it]
56%|ββββββ |
|
|
0: {'loss': 0.3859, 'grad_norm': 0.8058568338786659, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.34} |
|
|
0: {'loss': 0.3898, 'grad_norm': 0.7954384150397931, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.35} |
|
|
0: | 399/711 [37:38<26:46, 5.15s/it]
56%|ββββββ | 400/711 [37:43<26:35, 5.13s/it]
56%|ββββββ | 400/711 [37:43<26:35, 5.13s/it]
56%|ββββββ | 401/711 [37:48<26:27, 5.12s/it]
57%|ββββββ | 402/711 [37:54<26:18, 5.11s/it]
57%|ββββββ | 403/711 [37:59<26:11, 5.10s/it]
57%|ββββββ | 404/711 [38:04<26:10, 5.11s/it]
57%|ββββββ | 405/711 [38:09<26:34, 5.21s/it]
57%|ββββββ | 406/711 [38:14<26:25, 5.20s/it]
57%|ββββββ | 407/711 [38:19<26:11, 5.17s/it]
57%|ββββββ | 408/711 [38:25<26:03, 5.16s/it]
58%|ββββββ | 409/711 [38:30<26:30, 5.27s/it]
58%|ββββββ | 410/711 [38:35<26:14, 5.23s/it]
58%|ββββββ | 410/711 [38:35<26:14, 5.23s/it]
58%|ββββββ | 411/711 [38:41<26:30, 5.30s/it]
58%|βββοΏ½ |
|
|
0: {'loss': 0.3925, 'grad_norm': 0.8145567494437453, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.35} |
|
|
0: οΏ½οΏ½ββ | 412/711 [38:46<26:10, 5.25s/it]
58%|ββββββ | 413/711 [38:51<25:55, 5.22s/it]
58%|ββββββ | 414/711 [38:56<25:40, 5.19s/it]
58%|ββββββ | 415/711 [39:01<25:28, 5.16s/it]
59%|ββββββ | 416/711 [39:07<25:49, 5.25s/it]
59%|ββββββ | 417/711 [39:12<25:31, 5.21s/it]
59%|ββββββ | 418/711 [39:17<25:17, 5.18s/it]
59%|ββββββ | 419/711 [39:22<25:06, 5.16s/it]
59%|ββββββ | 420/711 [39:27<24:59, 5.15s/it]
59%|ββββββ | 420/711 [39:27<24:59, 5.15s/it]
59%|ββββββ | 421/711 [39:32<24:49, 5.14s/it]
59%|ββββββ | 422/711 [39:37<24:44, 5.14s/it]
59%|ββββββ | 423/711 [39:43<24:35, 5.12s/it]
60%|ββββββ | 424/711 [39:48<24:42, 5.17s/it]
60%|ββββββ | 425/711 [39:53<24:34, 5.16s/it]
60%|ββββββ | 426/711 [39:58<24:30, 5.16s/ |
|
|
0: {'loss': 0.3927, 'grad_norm': 0.8237856091804933, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.36} |
|
|
0: it]
60%|ββββββ | 427/711 [40:03<24:40, 5.21s/it]
60%|ββββββ | 428/711 [40:09<24:55, 5.28s/it]
60%|ββββββ | 429/711 [40:14<24:33, 5.23s/it]
60%|ββββββ | 430/711 [40:19<24:23, 5.21s/it]
60%|ββββββ | 430/711 [40:19<24:23, 5.21s/it]
61%|ββββββ | 431/711 [40:24<24:10, 5.18s/it]
61%|ββββββ | 432/711 [40:29<23:56, 5.15s/it]
61%|ββββββ | 433/711 [40:34<23:50, 5.14s/it]
61%|ββββββ | 434/711 [40:40<23:39, 5.12s/it]
61%|ββββββ | 435/711 [40:45<23:32, 5.12s/it]
61%|βββββββ | 436/711 [40:50<24:14, 5.29s/it]
61%|βββββββ | 437/711 [40:56<24:11, 5.30s/it]
62%|βββββββ | 438/711 [41:01<23:49, 5.24s/it]
62%|βββββββ | 439/711 [41:06<23:33, 5.20s/it]
62%|βββββββ | 440/711 [41:11<23:37, 5.23s/it]
|
|
|
0: {'loss': 0.3937, 'grad_norm': 0.8553439901672909, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.37} |
|
|
0: {'loss': 0.3873, 'grad_norm': 0.8286249080798415, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.38} |
|
|
0:
62%|βββββββ | 440/711 [41:11<23:37, 5.23s/it]
62%|βββββββ | 441/711 [41:16<23:23, 5.20s/it]
62%|βββββββ | 442/711 [41:21<23:13, 5.18s/it]
62%|βββββββ | 443/711 [41:27<23:01, 5.16s/it]
62%|βββββββ | 444/711 [41:32<22:57, 5.16s/it]
63%|βββββββ | 445/711 [41:37<23:14, 5.24s/it]
63%|βββββββ | 446/711 [41:42<22:58, 5.20s/it]
63%|βββββββ | 447/711 [41:47<22:45, 5.17s/it]
63%|βββββββ | 448/711 [41:52<22:34, 5.15s/it]
63%|βββββββ | 449/711 [41:58<22:43, 5.20s/it]
63%|βββββββ | 450/711 [42:03<22:32, 5.18s/it]
63%|βββββββ | 450/711 [42:03<22:32, 5.18s/it]
63%|βββββββ | 451/711 [42:08<22:20, 5.16s/it]
64%|βββββββ | 452/711 [42:13<22:14, 5.15s/it]
64%|βββββββ | 453/711 [42:18<22:09, 5 |
|
|
0: {'loss': 0.384, 'grad_norm': 0.8623716385758442, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.39} |
|
|
0: .15s/it]
64%|βββββββ | 454/711 [42:23<22:03, 5.15s/it]
64%|βββββββ | 455/711 [42:29<21:56, 5.14s/it]
64%|βββββββ | 456/711 [42:34<21:51, 5.14s/it]
64%|βββββββ | 457/711 [42:39<21:44, 5.14s/it]
64%|βββββββ | 458/711 [42:44<21:40, 5.14s/it]
65%|βββββββ | 459/711 [42:49<21:39, 5.16s/it]
65%|βββββββ | 460/711 [42:54<21:46, 5.21s/it]
65%|βββββββ | 460/711 [42:54<21:46, 5.21s/it]
65%|βββββββ | 461/711 [43:00<21:38, 5.19s/it]
65%|βββββββ | 462/711 [43:05<21:30, 5.18s/it]
65%|βββββββ | 463/711 [43:10<21:25, 5.18s/it]
65%|βββββββ | 464/711 [43:15<21:17, 5.17s/it]
65%|βββββββ | 465/711 [43:20<21:05, 5.15s/it]
66%|βββββββ | 466/711 [43:25<20:58, 5.14s/it]
66%|βββββββ | 467/711 [43:30<20:53, 5.14s/it]
66% |
|
|
0: {'loss': 0.3893, 'grad_norm': 0.7980262942281969, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.4} |
|
|
0: {'loss': 0.3928, 'grad_norm': 0.9024656134697462, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.4} |
|
|
0: |βββββββ | 468/711 [43:36<20:50, 5.14s/it]
66%|βββββββ | 469/711 [43:41<21:10, 5.25s/it]
66%|βββββββ | 470/711 [43:46<20:53, 5.20s/it]
66%|βββββββ | 470/711 [43:46<20:53, 5.20s/it]
66%|βββββββ | 471/711 [43:51<20:45, 5.19s/it]
66%|βββββββ | 472/711 [43:57<20:40, 5.19s/it]
67%|βββββββ | 473/711 [44:02<20:28, 5.16s/it]
67%|βββββββ | 474/711 [44:07<20:23, 5.16s/it]
67%|βββββββ | 475/711 [44:12<20:12, 5.14s/it]
67%|βββββββ | 476/711 [44:18<20:43, 5.29s/it]
67%|βββββββ | 477/711 [44:23<20:23, 5.23s/it]
67%|βββββββ | 478/711 [44:28<20:24, 5.26s/it]
67%|βββββββ | 479/711 [44:33<20:10, 5.22s/it]
68%|βββββββ | 480/711 [44:38<19:59, 5.19s/it]
68%|βββββββ | |
|
|
0: {'loss': 0.3747, 'grad_norm': 1.4532167164219425, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.41} |
|
|
0: 480/711 [44:38<19:59, 5.19s/it]
68%|βββββββ | 481/711 [44:44<20:13, 5.28s/it]
68%|βββββββ | 482/711 [44:49<20:00, 5.24s/it]
68%|βββββββ | 483/711 [44:54<19:48, 5.21s/it]
68%|βββββββ | 484/711 [44:59<19:39, 5.20s/it]
68%|βββββββ | 485/711 [45:04<19:33, 5.19s/it]
68%|βββββββ | 486/711 [45:10<19:26, 5.18s/it]
68%|βββββββ | 487/711 [45:15<19:19, 5.18s/it]
69%|βββββββ | 488/711 [45:20<19:09, 5.15s/it]
69%|βββββββ | 489/711 [45:25<19:24, 5.25s/it]
69%|βββββββ | 490/711 [45:30<19:09, 5.20s/it]
69%|βββββββ | 490/711 [45:30<19:09, 5.20s/it]
69%|βββββββ | 491/711 [45:36<19:16, 5.26s/it]
69%|βββββββ | 492/711 [45:41<19:00, 5.21s/it]
69%|βββββββ | 493/711 [45:46<18:47, 5.17s/it]
69%|βββββββ | 494/711 [45: |
|
|
0: {'loss': 0.3797, 'grad_norm': 0.8355553409451639, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.42} |
|
|
0: 51<19:02, 5.27s/it]
70%|βββββββ | 495/711 [45:56<18:45, 5.21s/it]
70%|βββββββ | 496/711 [46:02<18:34, 5.19s/it]
70%|βββββββ | 497/711 [46:07<18:38, 5.22s/it]
70%|βββββββ | 498/711 [46:12<18:25, 5.19s/it]
70%|βββββββ | 499/711 [46:18<18:55, 5.35s/it]
70%|βββββββ | 500/711 [46:23<18:34, 5.28s/it]
70%|βββββββ | 500/711 [46:23<18:34, 5.28s/it]
70%|βββββββ | 501/711 [46:28<18:22, 5.25s/it]
71%|βββββββ | 502/711 [46:33<18:12, 5.23s/it]
71%|βββββββ | 503/711 [46:38<18:04, 5.21s/it]
71%|βββββββ | 504/711 [46:44<17:53, 5.19s/it]
71%|βββββββ | 505/711 [46:49<17:43, 5.16s/it]
71%|βββββββ | 506/711 [46:54<17:36, 5.16s/it]
71%|ββββββββ | 507/711 [46:59<17:29, 5.14s/it]
71%|ββββββββ | 508/711 [47:04<17:21, |
|
|
0: {'loss': 0.3917, 'grad_norm': 0.8425621928447311, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.43} |
|
|
0: {'loss': 0.3825, 'grad_norm': 0.7816311224212293, 'learning_rate': 1.9929032311830302e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.44} |
|
|
0: 5.13s/it]
72%|ββββββββ | 509/711 [47:09<17:16, 5.13s/it]
72%|ββββββββ | 510/711 [47:14<17:09, 5.12s/it]
72%|ββββββββ | 510/711 [47:14<17:09, 5.12s/it]
72%|ββββββββ | 511/711 [47:20<17:21, 5.21s/it]
72%|ββββββββ | 512/711 [47:25<17:12, 5.19s/it]
72%|ββββββββ | 513/711 [47:30<17:00, 5.16s/it]
72%|ββββββββ | 514/711 [47:35<16:51, 5.14s/it]
72%|ββββββββ | 515/711 [47:40<16:45, 5.13s/it]
73%|ββββββββ | 516/711 [47:45<16:40, 5.13s/it]
73%|ββββββββ | 517/711 [47:50<16:32, 5.12s/it]
73%|ββββββββ | 518/711 [47:55<16:30, 5.13s/it]
73%|ββββββββ | 519/711 [48:01<16:25, 5.13s/it]
73%|ββββββββ | 520/711 [48:06<16:42, 5.25s/it]
73%|ββββββββ | 520/711 [48:06<16:4 |
|
|
0: {'loss': 0.3819, 'grad_norm': 0.8516269037345293, 'learning_rate': 1.9642643171092486e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.45} |
|
|
0: 2, 5.25s/it]
73%|ββββββββ | 521/711 [48:11<16:30, 5.21s/it]
73%|ββββββββ | 522/711 [48:16<16:18, 5.18s/it]
74%|ββββββββ | 523/711 [48:22<16:20, 5.22s/it]
74%|ββββββββ | 524/711 [48:27<16:36, 5.33s/it]
74%|ββββββββ | 525/711 [48:32<16:20, 5.27s/it]
74%|ββββββββ | 526/711 [48:37<16:07, 5.23s/it]
74%|ββββββββ | 527/711 [48:43<15:53, 5.18s/it]
74%|ββββββββ | 528/711 [48:48<15:54, 5.22s/it]
74%|ββββββββ | 529/711 [48:53<15:42, 5.18s/it]
75%|ββββββββ | 530/711 [48:58<15:38, 5.18s/it]
75%|ββββββββ | 530/711 [48:58<15:38, 5.18s/it]
75%|ββββββββ | 531/711 [49:03<15:27, 5.15s/it]
75%|ββββββββ | 532/711 [49:08<15:19, 5.14s/it]
75%|ββββββββ | 533/711 [49:13<15:11, 5.12s/it]
75%|ββββββββ | 53 |
|
|
0: {'loss': 0.3918, 'grad_norm': 0.9451621145813273, 'learning_rate': 1.9143443472194176e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.46} |
|
|
0: 4/711 [49:18<15:05, 5.12s/it]
75%|ββββββββ | 535/711 [49:24<14:58, 5.11s/it]
75%|ββββββββ | 536/711 [49:29<15:04, 5.17s/it]
76%|ββββββββ | 537/711 [49:34<14:59, 5.17s/it]
76%|ββββββββ | 538/711 [49:39<14:48, 5.14s/it]
76%|ββββββββ | 539/711 [49:44<14:46, 5.16s/it]
76%|ββββββββ | 540/711 [49:49<14:41, 5.15s/it]
76%|ββββββββ | 540/711 [49:49<14:41, 5.15s/it]
76%|ββββββββ | 541/711 [49:55<14:33, 5.14s/it]
76%|ββββββββ | 542/711 [50:00<14:26, 5.13s/it]
76%|ββββββββ | 543/711 [50:05<14:23, 5.14s/it]
77%|ββββββββ | 544/711 [50:10<14:17, 5.13s/it]
77%|ββββββββ | 545/711 [50:15<14:10, 5.12s/it]
77%|ββββββββ | 546/711 [50:20<14:04, 5.12s/it]
77%|ββββββββ | 547/711 [50:25<13:59, 5.12s/it]
77%|ββββοΏ½ |
|
|
0: {'loss': 0.3907, 'grad_norm': 0.8481507125427856, 'learning_rate': 1.8443725168471053e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.46} |
|
|
0: {'loss': 0.3803, 'grad_norm': 0.8683953024369212, 'learning_rate': 1.7560717646792703e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.47} |
|
|
0: οΏ½οΏ½βββ | 548/711 [50:30<13:52, 5.11s/it]
77%|ββββββββ | 549/711 [50:35<13:47, 5.11s/it]
77%|ββββββββ | 550/711 [50:41<13:42, 5.11s/it]
77%|ββββββββ | 550/711 [50:41<13:42, 5.11s/it]
77%|ββββββββ | 551/711 [50:46<13:40, 5.13s/it]
78%|ββββββββ | 552/711 [50:51<13:36, 5.13s/it]
78%|ββββββββ | 553/711 [50:56<13:29, 5.12s/it]
78%|ββββββββ | 554/711 [51:01<13:24, 5.12s/it]
78%|ββββββββ | 555/711 [51:06<13:21, 5.14s/it]
78%|ββββββββ | 556/711 [51:11<13:14, 5.12s/it]
78%|ββββββββ | 557/711 [51:17<13:25, 5.23s/it]
78%|ββββββββ | 558/711 [51:22<13:32, 5.31s/it]
79%|ββββββββ | 559/711 [51:27<13:18, 5.25s/it]
79%|ββββββββ | 560/711 [51:33<13:06, 5.21s/it]
79%|βββοΏ½ |
|
|
0: {'loss': 0.3904, 'grad_norm': 0.8636181194234771, 'learning_rate': 1.6516163482876789e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.48} |
|
|
0: οΏ½ββββ | 560/711 [51:33<13:06, 5.21s/it]
79%|ββββββββ | 561/711 [51:38<12:58, 5.19s/it]
79%|ββββββββ | 562/711 [51:43<13:13, 5.33s/it]
79%|ββββββββ | 563/711 [51:49<13:11, 5.35s/it]
79%|ββββββββ | 564/711 [51:54<12:58, 5.29s/it]
79%|ββββββββ | 565/711 [51:59<12:47, 5.26s/it]
80%|ββββββββ | 566/711 [52:04<12:34, 5.21s/it]
80%|ββββββββ | 567/711 [52:10<12:40, 5.28s/it]
80%|ββββββββ | 568/711 [52:15<12:28, 5.23s/it]
80%|ββββββββ | 569/711 [52:20<12:27, 5.26s/it]
80%|ββββββββ | 570/711 [52:25<12:14, 5.21s/it]
80%|ββββββββ | 570/711 [52:25<12:14, 5.21s/it]
80%|ββββββββ | 571/711 [52:31<12:14, 5.24s/it]
80%|ββββββββ | 572/711 [52:36<12:04, 5.21s/it]
81%|ββββββββ | 573/711 [52:41<12:02, 5.24s/it] |
|
|
0: {'loss': 0.3828, 'grad_norm': 0.8879170447340103, 'learning_rate': 1.5335783066915436e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.49} |
|
|
0:
81%|ββββββββ | 574/711 [52:46<11:53, 5.21s/it]
81%|ββββββββ | 575/711 [52:51<11:44, 5.18s/it]
81%|ββββββββ | 576/711 [52:56<11:38, 5.17s/it]
81%|ββββββββ | 577/711 [53:02<11:46, 5.27s/it]
81%|βββββββββ | 578/711 [53:07<11:49, 5.33s/it]
81%|βββββββββ | 579/711 [53:12<11:34, 5.26s/it]
82%|βββββββββ | 580/711 [53:18<11:25, 5.23s/it]
82%|βββββββββ | 580/711 [53:18<11:25, 5.23s/it]
82%|βββββββββ | 581/711 [53:23<11:15, 5.19s/it]
82%|βββββββββ | 582/711 [53:28<11:06, 5.16s/it]
82%|βββββββββ | 583/711 [53:33<10:59, 5.15s/it]
82%|βββββββββ | 584/711 [53:38<10:52, 5.14s/it]
82%|βββββββββ | 585/711 [53:43<10:46, 5.13s/it]
82%|βββββββββ | 586/711 [53:48<10:42, 5.14s/it]
83%|βββββββοΏ½ |
|
|
0: {'loss': 0.3763, 'grad_norm': 0.829865464468096, 'learning_rate': 1.4048641282207622e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 76.53, 'epoch': 0.5} |
|
|
0: οΏ½β | 587/711 [53:54<11:03, 5.35s/it]
83%|βββββββββ | 588/711 [53:59<10:55, 5.33s/it]
83%|βββββββββ | 589/711 [54:05<10:40, 5.25s/it]
83%|βββββββββ | 590/711 [54:10<10:43, 5.32s/it]
83%|βββββββββ | 590/711 [54:10<10:43, 5.32s/it]
83%|βββββββββ | 591/711 [54:15<10:32, 5.27s/it]
83%|βββββββββ | 592/711 [54:20<10:23, 5.24s/it]
83%|βββββββββ | 593/711 [54:25<10:14, 5.21s/it]
84%|βββββββββ | 594/711 [54:31<10:07, 5.19s/it]
84%|βββββββββ | 595/711 [54:36<09:59, 5.17s/it]
84%|βββββββββ | 596/711 [54:41<09:51, 5.14s/it]
84%|βββββββββ | 597/711 [54:46<09:53, 5.21s/it]
84%|βββββββββ | 598/711 [54:51<09:45, 5.18s/it]
84%|βββββββββ | 599/711 [54:57<10:07, 5.42s/it]
84%|βββββββββ | 600/711 [55:02 |
|
|
0: {'loss': 0.3895, 'grad_norm': 1.0461353857982227, 'learning_rate': 1.2686431831271522e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 76.53, 'epoch': 0.51} |
|
|
0: {'loss': 0.3726, 'grad_norm': 0.8291006806562558, 'learning_rate': 1.1282696831703153e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 76.53, 'epoch': 0.51} |
|
|
0: <09:53, 5.35s/it]
84%|βββββββββ | 600/711 [55:02<09:53, 5.35s/it]
85%|βββββββββ | 601/711 [55:08<09:48, 5.35s/it]
85%|βββββββββ | 602/711 [55:13<09:35, 5.28s/it]
85%|βββββββββ | 603/711 [55:18<09:24, 5.23s/it]
85%|βββββββββ | 604/711 [55:23<09:22, 5.26s/it]
85%|βββββββββ | 605/711 [55:28<09:12, 5.22s/it]
85%|βββββββββ | 606/711 [55:34<09:27, 5.41s/it]
85%|βββββββββ | 607/711 [55:40<09:29, 5.47s/it]
86%|βββββββββ | 608/711 [55:45<09:17, 5.41s/it]
86%|βββββββββ | 609/711 [55:50<09:03, 5.32s/it]
86%|βββββββββ | 610/711 [55:56<08:56, 5.31s/it]
86%|βββββββββ | 610/711 [55:56<08:56, 5.31s/it]
86%|βββββββββ | 611/711 [56:01<08:45, 5.25s/it]
86%|ββββοΏ½ |
|
|
0: {'loss': 0.3749, 'grad_norm': 0.8273940149239204, 'learning_rate': 9.87200089792126e-07, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 76.53, 'epoch': 0.52} |
|
|
0: οΏ½ββββ | 612/711 [56:06<08:36, 5.22s/it]
86%|βββββββββ | 613/711 [56:11<08:34, 5.24s/it]
86%|βββββββββ | 614/711 [56:16<08:25, 5.22s/it]
86%|βββββββββ | 615/711 [56:21<08:17, 5.19s/it]
87%|βββββββββ | 616/711 [56:28<08:41, 5.48s/it]
87%|βββββββββ | 617/711 [56:33<08:24, 5.37s/it]
87%|βββββββββ | 618/711 [56:38<08:23, 5.41s/it]
87%|βββββββββ | 619/711 [56:43<08:09, 5.33s/it]
87%|βββββββββ | 620/711 [56:49<07:59, 5.27s/it]
87%|βββββββββ | 620/711 [56:49<07:59, 5.27s/it]
87%|βββββββββ | 621/711 [56:54<07:51, 5.24s/it]
87%|βββββββββ | 622/711 [56:59<07:43, 5.21s/it]
88%|βββββββββ | 623/711 [57:04<07:44, 5.28s/it]
88%|βββββββββ | 624/711 [57:09<07:35, 5.23s/it]
88%|βββββββββ | 625/7 |
|
|
0: {'loss': 0.3717, 'grad_norm': 0.7609373973543032, 'learning_rate': 8.489080045646937e-07, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 76.53, 'epoch': 0.53} |
|
|
0: 11 [57:14<07:26, 5.19s/it]
88%|βββββββββ | 626/711 [57:20<07:19, 5.17s/it]
88%|βββββββββ | 627/711 [57:25<07:12, 5.14s/it]
88%|βββββββββ | 628/711 [57:30<07:07, 5.15s/it]
88%|βββββββββ | 629/711 [57:35<07:03, 5.16s/it]
89%|βββββββββ | 630/711 [57:41<07:25, 5.49s/it]
89%|βββββββββ | 630/711 [57:41<07:25, 5.49s/it]
89%|βββββββββ | 631/711 [57:47<07:13, 5.42s/it]
89%|βββββββββ | 632/711 [57:52<07:01, 5.33s/it]
89%|βββββββββ | 633/711 [57:57<06:51, 5.28s/it]
89%|βββββββββ | 634/711 [58:02<06:42, 5.23s/it]
89%|βββββββββ | 635/711 [58:07<06:35, 5.21s/it]
89%|βββββββββ | 636/711 [58:13<06:36, 5.28s/it]
90%|βββββββββ | 637/711 [58:18<06:26, 5.23s/it]
90%|βββββββββ | 638/711 [58:23<06:19, 5.2 |
|
|
0: {'loss': 0.3746, 'grad_norm': 0.8664669621801867, 'learning_rate': 7.167986375914345e-07, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 76.53, 'epoch': 0.54} |
|
|
0: {'loss': 0.3774, 'grad_norm': 0.7890688699500655, 'learning_rate': 5.941249599330827e-07, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 76.53, 'epoch': 0.55} |
|
|
0: 0s/it]
90%|βββββββββ | 639/711 [58:28<06:19, 5.28s/it]
90%|βββββββββ | 640/711 [58:33<06:11, 5.24s/it]
90%|βββββββββ | 640/711 [58:33<06:11, 5.24s/it]
90%|βββββββββ | 641/711 [58:39<06:10, 5.30s/it]
90%|βββββββββ | 642/711 [58:44<06:02, 5.25s/it]
90%|βββββββββ | 643/711 [58:49<05:53, 5.20s/it]
91%|βββββββββ | 644/711 [58:54<05:46, 5.17s/it]
91%|βββββββββ | 645/711 [59:00<05:46, 5.26s/it]
91%|βββββββββ | 646/711 [59:05<05:38, 5.21s/it]
91%|βββββββββ | 647/711 [59:10<05:31, 5.18s/it]
91%|βββββββββ | 648/711 [59:15<05:25, 5.17s/it]
91%|ββββββββββ| 649/711 [59:20<05:18, 5.14s/it]
91%|ββββββββββ| 650/711 [59:25<05:18, 5.23s/it]
91%|βββββββοΏ½ |
|
|
0: {'loss': 0.3821, 'grad_norm': 0.8161369753902079, 'learning_rate': 4.839076046641801e-07, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 76.53, 'epoch': 0.56} |
|
|
0: οΏ½οΏ½ββ| 650/711 [59:25<05:18, 5.23s/it]
92%|ββββββββββ| 651/711 [59:31<05:11, 5.20s/it]
92%|ββββββββββ| 652/711 [59:36<05:04, 5.16s/it]
92%|ββββββββββ| 653/711 [59:41<05:07, 5.31s/it]
92%|ββββββββββ| 654/711 [59:46<04:58, 5.24s/it]
92%|ββββββββββ| 655/711 [59:52<04:52, 5.22s/it]
92%|ββββββββββ| 656/711 [59:57<04:51, 5.30s/it]
92%|ββββββββββ| 657/711 [1:00:02<04:43, 5.26s/it]
93%|ββββββββββ| 658/711 [1:00:07<04:36, 5.22s/it]
93%|ββββββββββ| 659/711 [1:00:13<04:32, 5.23s/it]
93%|ββββββββββ| 660/711 [1:00:18<04:24, 5.19s/it]
93%|ββββββββββ| 660/711 [1:00:18<04:24, 5.19s/it]
93%|ββββββββββ| 661/711 [1:00:23<04:23, 5.28s/it]
93%|ββββββββββ| 662/711 [1:00:28<04:16, 5.23s/it]
93% |
|
|
0: {'loss': 0.3655, 'grad_norm': 0.8247018856496126, 'learning_rate': 3.888604888618786e-07, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 76.53, 'epoch': 0.56} |
|
|
0: |ββββββββββ| 663/711 [1:00:33<04:09, 5.20s/it]
93%|ββββββββββ| 664/711 [1:00:39<04:02, 5.17s/it]
94%|ββββββββββ| 665/711 [1:00:44<03:57, 5.16s/it]
94%|ββββββββββ| 666/711 [1:00:49<03:54, 5.21s/it]
94%|ββββββββββ| 667/711 [1:00:54<03:48, 5.19s/it]
94%|ββββββββββ| 668/711 [1:00:59<03:41, 5.16s/it]
94%|ββββββββββ| 669/711 [1:01:04<03:35, 5.14s/it]
94%|ββββββββββ| 670/711 [1:01:10<03:33, 5.22s/it]
94%|ββββββββββ| 670/711 [1:01:10<03:33, 5.22s/it]
94%|ββββββββββ| 671/711 [1:01:15<03:27, 5.18s/it]
95%|ββββββββββ| 672/711 [1:01:20<03:23, 5.22s/it]
95%|ββββββββββ| 673/711 [1:01:25<03:17, 5.20s/it]
95%|ββββββββββ| 674/711 [1:01:30<03:10, 5.16s/it]
95%|ββββββββββ| 67 |
|
|
0: {'loss': 0.3832, 'grad_norm': 0.7788264476641573, 'learning_rate': 3.1132398796052294e-07, 'memory/max_mem_active(gib)': 69.55, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 76.53, 'epoch': 0.57} |
|
|
0: 5/711 [1:01:36<03:06, 5.19s/it]
95%|ββββββββββ| 676/711 [1:01:42<03:13, 5.53s/it]
95%|ββββββββββ| 677/711 [1:01:48<03:11, 5.63s/it]
95%|ββββββββββ| 678/711 [1:01:53<03:02, 5.52s/it]
95%|ββββββββββ| 679/711 [1:01:58<02:52, 5.38s/it]
96%|ββββββββββ| 680/711 [1:02:03<02:44, 5.29s/it]
96%|ββββββββββ| 680/711 [1:02:03<02:44, 5.29s/it]
96%|ββββββββββ| 681/711 [1:02:08<02:37, 5.25s/it]
96%|ββββββββββ| 682/711 [1:02:14<02:33, 5.31s/it]
96%|ββββββββββ| 683/711 [1:02:19<02:27, 5.25s/it]
96%|ββββββββββ| 684/711 [1:02:24<02:20, 5.21s/it]
96%|ββββββββββ| 685/711 [1:02:29<02:14, 5.18s/it]
96%|ββββββββββ| 686/711 [1:02:34<02:09, 5.17s/it]
97%|ββββββββββ| 687/711 [1:02:39<02:03, 5.17s/it]
9 |
|
|
0: {'loss': 0.3818, 'grad_norm': 0.7875994058840892, 'learning_rate': 2.532073079411971e-07, 'memory/max_mem_active(gib)': 69.55, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 76.53, 'epoch': 0.58} |
|
|
0: 7%|ββββββββββ| 688/711 [1:02:45<01:58, 5.15s/it]
97%|ββββββββββ| 689/711 [1:02:50<01:53, 5.15s/it]
97%|ββββββββββ| 690/711 [1:02:55<01:47, 5.14s/it]
97%|ββββββββββ| 690/711 [1:02:55<01:47, 5.14s/it]
97%|ββββββββββ| 691/711 [1:03:00<01:43, 5.19s/it]
97%|ββββββββββ| 692/711 [1:03:05<01:37, 5.16s/it]
97%|ββββββββββ| 693/711 [1:03:10<01:32, 5.14s/it]
98%|ββββββββββ| 694/711 [1:03:15<01:27, 5.13s/it]
98%|ββββββββββ| 695/711 [1:03:21<01:22, 5.13s/it]
98%|ββββββββββ| 696/711 [1:03:26<01:17, 5.14s/it]
98%|ββββββββββ| 697/711 [1:03:31<01:11, 5.14s/it]
98%|ββββββββββ| 698/711 [1:03:36<01:06, 5.14s/it]
98%|ββββββββββ| 699/711 [1:03:41<01:02, 5.17s/it]
98%|ββββββββββ| |
|
|
0: {'loss': 0.3763, 'grad_norm': 0.7322258611860605, 'learning_rate': 2.1594147434418026e-07, 'memory/max_mem_active(gib)': 69.55, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 76.53, 'epoch': 0.59} |
|
|
0: {'loss': 0.3734, 'grad_norm': 0.7653492019720352, 'learning_rate': 2.0044409567084156e-07, 'memory/max_mem_active(gib)': 69.55, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 76.53, 'epoch': 0.6} |
|
|
0: 700/711 [1:03:47<00:57, 5.24s/it]
98%|ββββββββββ| 700/711 [1:03:47<00:57, 5.24s/it]
99%|ββββββββββ| 701/711 [1:03:52<00:52, 5.26s/it]
99%|ββββββββββ| 702/711 [1:03:57<00:47, 5.23s/it]
99%|ββββββββββ| 703/711 [1:04:02<00:41, 5.20s/it]
99%|ββββββββββ| 704/711 [1:04:07<00:36, 5.20s/it]
99%|ββββββββββ| 705/711 [1:04:13<00:31, 5.30s/it]
99%|ββββββββββ| 706/711 [1:04:18<00:26, 5.24s/it]
99%|ββββββββββ| 707/711 [1:04:23<00:21, 5.27s/it]
100%|ββββββββββ| 708/711 [1:04:28<00:15, 5.21s/it]
100%|ββββββββββ| 709/711 [1:04:34<00:10, 5.19s/it]
100%|ββββββββββ| 710/711 [1:04:39<00:05, 5.28s/it]
100%|ββββββββββ| 710/711 [1:04:39<00:05, 5.28s/it]
100%|ββββοΏ½ |
|
|
0: [2025-11-24 01:16:53,191] [INFO] [axolotl.core.trainers.base._save:613] [PID:3081979] [RANK:0] Saving model checkpoint to /lustre/fswork/projects/rech/dgo/udv55np/ift/Nemotron-Super-49B-v1_5/gemma-3-12b/0/checkpoint-711[39m |
|
|
0: [2025-11-24 01:17:11,725] [INFO] [axolotl.core.trainers.base._save:662] [PID:3081979] [RANK:0] Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`[39m |
|
|
0: {'train_runtime': 3910.4069, 'train_samples_per_second': 2.909, 'train_steps_per_second': 0.182, 'train_loss': 0.4125736778295493, 'memory/max_mem_active(gib)': 69.55, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 76.53, 'epoch': 0.6} |
|
|
0: οΏ½βββββ| 711/711 [1:04:44<00:00, 5.22s/it]
100%|ββββββββββ| 711/711 [1:05:10<00:00, 5.22s/it]
100%|ββββββββββ| 711/711 [1:05:10<00:00, 5.50s/it] |
|
|
0: [2025-11-24 01:17:21,056] [INFO] [axolotl.train.save_trained_model:228] [PID:3081979] [RANK:0] Training completed! Saving trained model to /lustre/fswork/projects/rech/dgo/udv55np/ift/Nemotron-Super-49B-v1_5/gemma-3-12b/0.[39m |
|
|
0: [2025-11-24 01:17:26,694] [INFO] [axolotl.core.trainers.base._save:613] [PID:3081979] [RANK:0] Saving model checkpoint to /lustre/fswork/projects/rech/dgo/udv55np/ift/Nemotron-Super-49B-v1_5/gemma-3-12b/0[39m |
|
|
0: [2025-11-24 01:17:44,493] [INFO] [axolotl.core.trainers.base._save:662] [PID:3081979] [RANK:0] Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`[39m |
|
|
0: [2025-11-24 01:17:44,817] [INFO] [axolotl.train.save_trained_model:350] [PID:3081979] [RANK:0] Model successfully saved to /lustre/fswork/projects/rech/dgo/udv55np/ift/Nemotron-Super-49B-v1_5/gemma-3-12b/0[39m |
|
|
|