Nicolas-BZRD committed on
Commit
cc05668
·
verified ·
1 Parent(s): db54019

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ tags:
4
+ - generated_from_trainer
5
+ model-index:
6
+ - name: lustre/fswork/projects/rech/dgo/udv55np/ift/Nemotron-Super-49B-v1_5/gemma-3-12b/0
7
+ results: []
8
+ ---
9
+
10
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
11
+ should probably proofread and complete it, then remove this comment. -->
12
+
13
+ [<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
14
+ <details><summary>See axolotl config</summary>
15
+
16
+ axolotl version: `0.12.2`
17
+ ```yaml
18
+ base_model: /lustre/fswork/projects/rech/qwv/udv55np/Gemma/base/gemma-3-12b
19
+
20
+ datasets:
21
+ - path: /lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking
22
+ ds_type: json
23
+ type: chat_template
24
+ field_messages: conversations
25
+ data_files:
26
+ - /lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0007.jsonl
27
+ - /lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0009.jsonl
28
+ - /lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0005.jsonl
29
+ - /lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0006.jsonl
30
+ - /lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0014.jsonl
31
+ - /lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0010.jsonl
32
+ - /lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0012.jsonl
33
+ - /lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0008.jsonl
34
+ - /lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0001.jsonl
35
+ - /lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0002.jsonl
36
+ - /lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0013.jsonl
37
+ - /lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0015.jsonl
38
+ - /lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0004.jsonl
39
+ - /lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0011.jsonl
40
+ - /lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0000.jsonl
41
+ - /lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0003.jsonl
42
+
43
+ dataset_prepared_path: /lustre/fswork/projects/rech/dgo/udv55np/dataset_gemma/Nemotron-Super-49B-v1_5/split_0
44
+ tokenizer_config: "/lustre/fswork/projects/rech/qwv/udv55np/Gemma/base/gemma-3-27b"
45
+ chat_template: gemma3
46
+ eot_tokens:
47
+ - "<end_of_turn>"
48
+
49
+ shuffle_merged_datasets: true
50
+ output_dir: /lustre/fswork/projects/rech/dgo/udv55np/ift/Nemotron-Super-49B-v1_5/gemma-3-12b/0
51
+
52
+ sequence_len: 16384
53
+ sample_packing: true
54
+
55
+ gradient_accumulation_steps: 1
56
+ micro_batch_size: 1
57
+ num_epochs: 0.6
58
+ auto_resume_from_checkpoints: true
59
+
60
+ optimizer: adamw_torch_fused
61
+ lr_scheduler: warmup_stable_decay
62
+ learning_rate: 2e-6
63
+ lr_scheduler_kwargs:
64
+ num_decay_steps: 200
65
+ min_lr_ratio: 0.1
66
+ warmup_steps: 100
67
+
68
+ bf16: true
69
+ tf32: false
70
+
71
+ gradient_checkpointing: true
72
+ logging_steps: 10
73
+ flash_attention: true
74
+
75
+ evals_per_epoch: 0
76
+ saves_per_epoch: 1
77
+ save_total_limit: 20
78
+ save_only_model: true
79
+
80
+ use_tensorboard: true
81
+ deepspeed: /lustre/fswork/projects/rech/qwv/udv55np/axolotl/zero3.json
82
+
83
+ ```
84
+
85
+ </details><br>
86
+
87
+ # lustre/fswork/projects/rech/dgo/udv55np/ift/Nemotron-Super-49B-v1_5/gemma-3-12b/0
88
+
89
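+ This model is a fine-tuned version of gemma-3-12b (loaded from a local checkpoint; see `base_model` in the axolotl config above), trained on the local `Nemotron-Super-49B-v1_5/no_thinking` JSONL dataset.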
90
+
91
+ ## Model description
92
+
93
+ More information needed
94
+
95
+ ## Intended uses & limitations
96
+
97
+ More information needed
98
+
99
+ ## Training and evaluation data
100
+
101
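+ Training used the 16 JSONL shards (`0000.jsonl`–`0015.jsonl`) of the local `Nemotron-Super-49B-v1_5/no_thinking` dataset listed in the axolotl config above, formatted with the `gemma3` chat template. No evaluation was run (`evals_per_epoch: 0`). More information needed.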
102
+
103
+ ## Training procedure
104
+
105
+ ### Training hyperparameters
106
+
107
+ The following hyperparameters were used during training:
108
+ - learning_rate: 2e-06
109
+ - train_batch_size: 1
110
+ - eval_batch_size: 1
111
+ - seed: 42
112
+ - distributed_type: multi-GPU
113
+ - num_devices: 16
114
+ - total_train_batch_size: 16
115
+ - total_eval_batch_size: 16
116
+ - optimizer: Use OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
117
+ - lr_scheduler_type: warmup_stable_decay
118
+ - lr_scheduler_warmup_steps: 100
119
+ - training_steps: 711
120
+
121
+ ### Training results
122
+
123
+
124
+
125
+ ### Framework versions
126
+
127
+ - Transformers 4.55.2
128
+ - Pytorch 2.6.0+cu124
129
+ - Datasets 4.0.0
130
+ - Tokenizers 0.21.1
added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<image_soft_token>": 262144
3
+ }
chat_template.jinja ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{ bos_token }}
2
+ {%- if messages[0]['role'] == 'system' -%}
3
+ {%- if messages[0]['content'] is string -%}
4
+ {%- set first_user_prefix = messages[0]['content'] + '
5
+
6
+ ' -%}
7
+ {%- else -%}
8
+ {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
9
+
10
+ ' -%}
11
+ {%- endif -%}
12
+ {%- set loop_messages = messages[1:] -%}
13
+ {%- else -%}
14
+ {%- set first_user_prefix = "" -%}
15
+ {%- set loop_messages = messages -%}
16
+ {%- endif -%}
17
+ {%- for message in loop_messages -%}
18
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
19
+ {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
20
+ {%- endif -%}
21
+ {%- if (message['role'] == 'assistant') -%}
22
+ {%- set role = "model" -%}
23
+ {%- else -%}
24
+ {%- set role = message['role'] -%}
25
+ {%- endif -%}
26
+ {{ '<start_of_turn>' + role + '
27
+ ' + (first_user_prefix if loop.first else "") }}
28
+ {%- if message['content'] is string -%}
29
+ {{ message['content'] | trim }}
30
+ {%- elif message['content'] is iterable -%}
31
+ {%- for item in message['content'] -%}
32
+ {%- if item['type'] == 'image' -%}
33
+ {{ '<start_of_image>' }}
34
+ {%- elif item['type'] == 'text' -%}
35
+ {{ item['text'] | trim }}
36
+ {%- endif -%}
37
+ {%- endfor -%}
38
+ {%- else -%}
39
+ {{ raise_exception("Invalid content type") }}
40
+ {%- endif -%}
41
+ {{ '<end_of_turn>
42
+ ' }}
43
+ {%- endfor -%}
44
+ {%- if add_generation_prompt -%}
45
+ {{'<start_of_turn>model
46
+ '}}
47
+ {%- endif -%}
config.json ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Gemma3ForConditionalGeneration"
4
+ ],
5
+ "boi_token_index": 255999,
6
+ "eoi_token_index": 256000,
7
+ "image_token_index": 262144,
8
+ "initializer_range": 0.02,
9
+ "mm_tokens_per_image": 256,
10
+ "model_type": "gemma3",
11
+ "text_config": {
12
+ "_sliding_window_pattern": 6,
13
+ "attention_bias": false,
14
+ "attention_dropout": 0.0,
15
+ "attn_logit_softcapping": null,
16
+ "final_logit_softcapping": null,
17
+ "head_dim": 256,
18
+ "hidden_activation": "gelu_pytorch_tanh",
19
+ "hidden_size": 3840,
20
+ "initializer_range": 0.02,
21
+ "intermediate_size": 15360,
22
+ "layer_types": [
23
+ "sliding_attention",
24
+ "sliding_attention",
25
+ "sliding_attention",
26
+ "sliding_attention",
27
+ "sliding_attention",
28
+ "full_attention",
29
+ "sliding_attention",
30
+ "sliding_attention",
31
+ "sliding_attention",
32
+ "sliding_attention",
33
+ "sliding_attention",
34
+ "full_attention",
35
+ "sliding_attention",
36
+ "sliding_attention",
37
+ "sliding_attention",
38
+ "sliding_attention",
39
+ "sliding_attention",
40
+ "full_attention",
41
+ "sliding_attention",
42
+ "sliding_attention",
43
+ "sliding_attention",
44
+ "sliding_attention",
45
+ "sliding_attention",
46
+ "full_attention",
47
+ "sliding_attention",
48
+ "sliding_attention",
49
+ "sliding_attention",
50
+ "sliding_attention",
51
+ "sliding_attention",
52
+ "full_attention",
53
+ "sliding_attention",
54
+ "sliding_attention",
55
+ "sliding_attention",
56
+ "sliding_attention",
57
+ "sliding_attention",
58
+ "full_attention",
59
+ "sliding_attention",
60
+ "sliding_attention",
61
+ "sliding_attention",
62
+ "sliding_attention",
63
+ "sliding_attention",
64
+ "full_attention",
65
+ "sliding_attention",
66
+ "sliding_attention",
67
+ "sliding_attention",
68
+ "sliding_attention",
69
+ "sliding_attention",
70
+ "full_attention"
71
+ ],
72
+ "max_position_embeddings": 131072,
73
+ "model_type": "gemma3_text",
74
+ "num_attention_heads": 16,
75
+ "num_hidden_layers": 48,
76
+ "num_key_value_heads": 8,
77
+ "query_pre_attn_scalar": 256,
78
+ "rms_norm_eps": 1e-06,
79
+ "rope_local_base_freq": 10000.0,
80
+ "rope_scaling": {
81
+ "factor": 8.0,
82
+ "rope_type": "linear"
83
+ },
84
+ "rope_theta": 1000000.0,
85
+ "sliding_window": 1024,
86
+ "torch_dtype": "bfloat16",
87
+ "use_cache": false,
88
+ "vocab_size": 262208
89
+ },
90
+ "torch_dtype": "bfloat16",
91
+ "transformers_version": "4.55.2",
92
+ "vision_config": {
93
+ "attention_dropout": 0.0,
94
+ "hidden_act": "gelu_pytorch_tanh",
95
+ "hidden_size": 1152,
96
+ "image_size": 896,
97
+ "intermediate_size": 4304,
98
+ "layer_norm_eps": 1e-06,
99
+ "model_type": "siglip_vision_model",
100
+ "num_attention_heads": 16,
101
+ "num_channels": 3,
102
+ "num_hidden_layers": 27,
103
+ "patch_size": 14,
104
+ "torch_dtype": "bfloat16",
105
+ "vision_use_head": false
106
+ }
107
+ }
generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 2,
3
+ "cache_implementation": "hybrid",
4
+ "do_sample": true,
5
+ "eos_token_id": [
6
+ 1,
7
+ 106
8
+ ],
9
+ "pad_token_id": 0,
10
+ "top_k": 64,
11
+ "top_p": 0.95,
12
+ "transformers_version": "4.55.2"
13
+ }
model-00001-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48713f75a388bf872e79b3cdf174382aa5b2969c2681393afca5e2d6d8a14763
3
+ size 4979902192
model-00002-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a420cf54d4f15c12aec81b16c493493710751552f296f122a04765066848758
3
+ size 4931296592
model-00003-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac5e315f04c8f2e2fc82758ab450578958518e2b2fb4ec1743775b5f4d6a6683
3
+ size 4931296656
model-00004-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a851bc5d8dfee5bd6a242188d006af820dca1c9775340bb3e543fd5b466041f
3
+ size 4931296656
model-00005-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d2e99d57cd426e6d1e5004830b646ddaf5f52c50fdb117706ffa11356ccb63a
3
+ size 4601000928
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": null,
3
+ "do_normalize": true,
4
+ "do_pan_and_scan": null,
5
+ "do_rescale": true,
6
+ "do_resize": true,
7
+ "image_mean": [
8
+ 0.5,
9
+ 0.5,
10
+ 0.5
11
+ ],
12
+ "image_processor_type": "Gemma3ImageProcessor",
13
+ "image_seq_length": 256,
14
+ "image_std": [
15
+ 0.5,
16
+ 0.5,
17
+ 0.5
18
+ ],
19
+ "pan_and_scan_max_num_crops": null,
20
+ "pan_and_scan_min_crop_size": null,
21
+ "pan_and_scan_min_ratio_to_activate": null,
22
+ "processor_class": "Gemma3Processor",
23
+ "resample": 2,
24
+ "rescale_factor": 0.00392156862745098,
25
+ "size": {
26
+ "height": 896,
27
+ "width": 896
28
+ }
29
+ }
processor_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "image_seq_length": 256,
3
+ "processor_class": "Gemma3Processor"
4
+ }
runs/Nov24_00-10-21_jzxh071/events.out.tfevents.1763939522.jzxh071.3081979.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f37098112ad379ad01786305414c07051d2d994842774ecf72b33167987104ed
3
+ size 42188
slurm.out ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0
  0%| | 0/711 [00:00<?, ?it/s]
1
  0%| | 1/711 [03:14<38:20:16, 194.39s/it]
2
  0%| | 2/711 [03:19<16:22:13, 83.12s/it]
3
  0%| | 3/711 [03:24<9:20:24, 47.49s/it]
4
  1%| | 4/711 [03:29<6:02:21, 30.75s/it]
5
  1%| | 5/711 [03:35<4:14:24, 21.62s/it]
6
  1%| | 6/711 [03:40<3:09:35, 16.14s/it]
7
  1%| | 7/711 [03:45<2:27:08, 12.54s/it]
8
  1%| | 8/711 [03:50<1:59:06, 10.17s/it]
9
  1%|▏ | 9/711 [03:56<1:40:22, 8.58s/it]
10
  1%|▏ | 10/711 [04:01<1:27:41, 7.51s/it]
11
 
12
  1%|▏ | 10/711 [04:01<1:27:41, 7.51s/it]
13
  2%|▏ | 11/711 [04:06<1:18:52, 6.76s/it]
14
  2%|▏ | 12/711 [04:11<1:12:59, 6.27s/it]
15
  2%|▏ | 13/711 [04:16<1:08:45, 5.91s/it]
16
  2%|▏ | 14/711 [04:21<1:06:01, 5.68s/it]
17
  2%|▏ | 15/711 [04:26<1:03:51, 5.51s/it]
18
  2%|▏ | 16/711 [04:31<1:02:15, 5.38s/it]
19
  2%|▏ | 17/711 [04:37<1:01:45, 5.34s/it]
 
 
 
20
  3%|▎ | 18/711 [04:42<1:01:12, 5.30s/it]
21
  3%|▎ | 19/711 [04:47<1:00:37, 5.26s/it]
22
  3%|▎ | 20/711 [04:52<1:00:10, 5.23s/it]
23
 
24
  3%|▎ | 20/711 [04:52<1:00:10, 5.23s/it]
25
  3%|▎ | 21/711 [04:57<59:44, 5.19s/it]
26
  3%|▎ | 22/711 [05:02<59:22, 5.17s/it]
27
  3%|▎ | 23/711 [05:07<59:00, 5.15s/it]
28
  3%|▎ | 24/711 [05:13<58:57, 5.15s/it]
29
  4%|▎ | 25/711 [05:18<59:09, 5.17s/it]
30
  4%|▎ | 26/711 [05:23<58:51, 5.16s/it]
31
  4%|▍ | 27/711 [05:28<58:27, 5.13s/it]
32
  4%|▍ | 28/711 [05:33<58:17, 5.12s/it]
33
  4%|▍ | 29/711 [05:38<58:07, 5.11s/it]
34
  4%|▍ | 30/711 [05:44<59:10, 5.21s/it]
35
 
36
  4%|▍ | 30/711 [05:44<59:10, 5.21s/it]
37
  4%|▍ | 31/711 [05:49<58:50, 5.19s/it]
38
  5%|▍ | 32/711 [05:54<58:36, 5.18s/it]
39
  5%|▍ | 33/711 [05:59<59:31, 5
 
 
40
  5%|▍ | 34/711 [06:04<58:50, 5.22s/it]
41
  5%|▍ | 35/711 [06:10<58:21, 5.18s/it]
42
  5%|▌ | 36/711 [06:15<57:59, 5.16s/it]
43
  5%|▌ | 37/711 [06:20<57:35, 5.13s/it]
44
  5%|▌ | 38/711 [06:25<57:22, 5.12s/it]
45
  5%|▌ | 39/711 [06:30<57:16, 5.11s/it]
46
  6%|▌ | 40/711 [06:35<57:20, 5.13s/it]
47
 
48
  6%|▌ | 40/711 [06:35<57:20, 5.13s/it]
49
  6%|▌ | 41/711 [06:40<57:23, 5.14s/it]
50
  6%|▌ | 42/711 [06:45<57:09, 5.13s/it]
51
  6%|▌ | 43/711 [06:51<57:31, 5.17s/it]
52
  6%|▌ | 44/711 [06:56<57:25, 5.17s/it]
53
  6%|▋ | 45/711 [07:01<57:08, 5.15s/it]
54
  6%|▋ | 46/711 [07:06<57:01, 5.15s/it]
55
  7%|▋ | 47/711 [07:11<56:53, 5.14s/it]
56
  7%|▋ | 48/711 [07:16<57:06, 5.17s/it]
57
  7%|▋ | 49/711 [07:22<57:03, 5.17s/it]
58
  7%|▋ | 50/711 [07:27<57:26, 5.21s/it]
59
 
 
 
 
60
  7%|▋ | 50/711 [07:27<57:26, 5.21s/it]
61
  7%|▋ | 51/711 [07:32<57:06, 5.19s/it]
62
  7%|▋ | 52/711 [07:37<57:02, 5.19s/it]
63
  7%|▋ | 53/711 [07:43<57:29, 5.24s/it]
64
  8%|▊ | 54/711 [07:48<58:04, 5.30s/it]
65
  8%|▊ | 55/711 [07:53<57:20, 5.25s/it]
66
  8%|▊ | 56/711 [07:59<57:54, 5.30s/it]
67
  8%|▊ | 57/711 [08:04<57:10, 5.25s/it]
68
  8%|▊ | 58/711 [08:09<57:55, 5.32s/it]
69
  8%|▊ | 59/711 [08:14<57:11, 5.26s/it]
70
  8%|▊ | 60/711 [08:19<56:30, 5.21s/it]
71
 
72
  8%|▊ | 60/711 [08:19<56:30, 5.21s/it]
73
  9%|▊ | 61/711 [08:25<56:35, 5.22s/it]
74
  9%|▊ | 62/711 [08:30<56:18, 5.21s/it]
75
  9%|▉ | 63/711 [08:35<57:08, 5.29s/it]
76
  9%|▉ | 64/711 [08:41<56:59, 5.28s/it]
77
  9%|▉ | 65/711 [08:46<56:22, 5.24s/it]
78
  9%|▉ | 66/711 [08:51<56:36, 5.27s/it]
79
  9%|▉ | 67/711 [08:56<56:06, 5.23s/it]
80
  1
 
 
 
81
  10%|▉ | 69/711 [09:06<55:12, 5.16s/it]
82
  10%|▉ | 70/711 [09:12<55:17, 5.18s/it]
83
 
84
  10%|▉ | 70/711 [09:12<55:17, 5.18s/it]
85
  10%|▉ | 71/711 [09:17<54:55, 5.15s/it]
86
  10%|█ | 72/711 [09:22<54:39, 5.13s/it]
87
  10%|█ | 73/711 [09:27<54:25, 5.12s/it]
88
  10%|█ | 74/711 [09:32<54:18, 5.12s/it]
89
  11%|█ | 75/711 [09:37<54:05, 5.10s/it]
90
  11%|█ | 76/711 [09:42<54:01, 5.11s/it]
91
  11%|█ | 77/711 [09:47<53:56, 5.11s/it]
92
  11%|█ | 78/711 [09:52<54:06, 5.13s/it]
93
  11%|█ | 79/711 [09:57<53:56, 5.12s/it]
94
  11%|█▏ | 80/711 [10:03<54:04, 5.14s/it]
95
 
96
  11%|█▏ | 80/711 [10:03<54:04, 5.14s/it]
97
  11%|█▏ | 81/711 [10:08<55:07, 5.25s/it]
98
  12%|█▏ | 82/711 [10:13<54:50, 5.23s/it]
99
  12%|█▏ | 83/711 [10:19<55:29, 5.30s/it]
 
 
100
  12%|█▏ | 84/711 [10:24<54:57, 5.26s/it]
101
  12%|█▏ | 85/711 [10:29<54:21, 5.21s/it]
102
  12%|█▏ | 86/711 [10:35<55:05, 5.29s/it]
103
  12%|█▏ | 87/711 [10:40<54:33, 5.25s/it]
104
  12%|█▏ | 88/711 [10:45<54:02, 5.20s/it]
105
  13%|█▎ | 89/711 [10:50<53:35, 5.17s/it]
106
  13%|█▎ | 90/711 [10:55<53:14, 5.14s/it]
107
 
108
  13%|█▎ | 90/711 [10:55<53:14, 5.14s/it]
109
  13%|█▎ | 91/711 [11:00<52:59, 5.13s/it]
110
  13%|█▎ | 92/711 [11:05<53:42, 5.21s/it]
111
  13%|█▎ | 93/711 [11:11<54:32, 5.30s/it]
112
  13%|█▎ | 94/711 [11:16<54:50, 5.33s/it]
113
  13%|█▎ | 95/711 [11:22<55:36, 5.42s/it]
114
  14%|█▎ | 96/711 [11:27<55:11, 5.38s/it]
115
  14%|█▎ | 97/711 [11:32<54:13, 5.30s/it]
116
  14%|█▍ | 98/711 [11:38<54:39, 5.35s/it]
117
  14%|█▍ | 99/711 [11:43<53:58, 5.29s/it]
118
  14%|█▍ | 100/711 [11:48<53:15, 5.23s/it]
119
 
 
 
 
120
  14%|█▍ | 100/711 [11:48<53:15, 5.23s/it]
121
  14%|█▍ | 101/711 [11:53<52:46, 5.19s/it]
122
  14%|█▍ | 102/711 [11:58<52:21, 5.16s/it]
123
  14%|█▍ | 103/711 [12:03<52:00, 5.13s/it]
124
  15%|█▍ | 104/711 [12:08<51:44, 5.11s/it]
125
  15%|█▍ | 105/711 [12:14<51:39, 5.11s/it]
126
  15%|█▍ | 106/711 [12:19<51:43, 5.13s/it]
127
  15%|█▌ | 107/711 [12:24<51:34, 5.12s/it]
128
  15%|█▌ | 108/711 [12:29<51:22, 5.11s/it]
129
  15%|█▌ | 109/711 [12:34<51:11, 5.10s/it]
130
  15%|█▌ | 110/711 [12:39<51:14, 5.12s/it]
131
 
132
  15%|█▌ | 110/711 [12:39<51:14, 5.12s/it]
133
  16%|█▌ | 111/711 [12:44<51:43, 5.17s/it]
134
  16%|█▌ | 112/711 [12:50<51:33, 5.16s/it]
135
  16%|█▌ | 113/711 [12:55<52:08, 5.23s/it]
136
  16%|█▌ | 114/711 [13:00<51:48, 5.21s/it]
137
  16%|█▌ | 115/711 [13:05<51:23, 5.17s/it]
138
  16%|█▋ |
 
 
 
139
  16%|█▋ | 117/711 [13:16<51:03, 5.16s/it]
140
  17%|█▋ | 118/711 [13:21<50:59, 5.16s/it]
141
  17%|█▋ | 119/711 [13:26<51:20, 5.20s/it]
142
  17%|█▋ | 120/711 [13:31<50:54, 5.17s/it]
143
 
144
  17%|█▋ | 120/711 [13:31<50:54, 5.17s/it]
145
  17%|█▋ | 121/711 [13:36<50:42, 5.16s/it]
146
  17%|█▋ | 122/711 [13:41<50:37, 5.16s/it]
147
  17%|█▋ | 123/711 [13:47<50:33, 5.16s/it]
148
  17%|█▋ | 124/711 [13:52<50:44, 5.19s/it]
149
  18%|█▊ | 125/711 [13:57<51:31, 5.28s/it]
150
  18%|█▊ | 126/711 [14:03<52:11, 5.35s/it]
151
  18%|█▊ | 127/711 [14:08<52:12, 5.36s/it]
152
  18%|█▊ | 128/711 [14:14<52:10, 5.37s/it]
153
  18%|█▊ | 129/711 [14:19<51:21, 5.29s/it]
154
  18%|█▊ | 130/711 [14:24<50:50, 5.25s/it]
155
 
156
  18%|█▊ | 130/711 [14:24<50:50, 5.25s/it]
157
  18%|█▊ | 131/
 
 
158
  19%|█▊ | 132/711 [14:34<50:04, 5.19s/it]
159
  19%|█▊ | 133/711 [14:39<50:21, 5.23s/it]
160
  19%|█▉ | 134/711 [14:45<49:58, 5.20s/it]
161
  19%|█▉ | 135/711 [14:50<49:37, 5.17s/it]
162
  19%|█▉ | 136/711 [14:55<49:54, 5.21s/it]
163
  19%|█▉ | 137/711 [15:00<49:29, 5.17s/it]
164
  19%|█▉ | 138/711 [15:05<49:14, 5.16s/it]
165
  20%|█▉ | 139/711 [15:10<49:05, 5.15s/it]
166
  20%|█▉ | 140/711 [15:16<49:19, 5.18s/it]
167
 
168
  20%|█▉ | 140/711 [15:16<49:19, 5.18s/it]
169
  20%|█▉ | 141/711 [15:21<49:04, 5.17s/it]
170
  20%|█▉ | 142/711 [15:26<48:43, 5.14s/it]
171
  20%|██ | 143/711 [15:31<48:31, 5.13s/it]
172
  20%|██ | 144/711 [15:36<48:59, 5.19s/it]
173
  20%|██ | 145/711 [15:41<48:54, 5.19s/it]
174
  21%|██ | 146/711 [15:46<48:39, 5.17s/it]
175
  21%|██ | 147/711 [15:52<48:22, 5.15s/it]
176
  21%|██ | 148/7
 
 
 
177
  21%|██ | 149/711 [16:02<48:06, 5.14s/it]
178
  21%|██ | 150/711 [16:07<48:06, 5.14s/it]
179
 
180
  21%|██ | 150/711 [16:07<48:06, 5.14s/it]
181
  21%|██ | 151/711 [16:12<48:00, 5.14s/it]
182
  21%|██▏ | 152/711 [16:17<48:35, 5.22s/it]
183
  22%|██▏ | 153/711 [16:23<48:21, 5.20s/it]
184
  22%|██▏ | 154/711 [16:28<48:00, 5.17s/it]
185
  22%|██▏ | 155/711 [16:33<47:54, 5.17s/it]
186
  22%|██▏ | 156/711 [16:38<47:42, 5.16s/it]
187
  22%|██▏ | 157/711 [16:43<47:36, 5.16s/it]
188
  22%|██▏ | 158/711 [16:48<47:27, 5.15s/it]
189
  22%|██▏ | 159/711 [16:54<48:09, 5.23s/it]
190
  23%|██▎ | 160/711 [16:59<47:40, 5.19s/it]
191
 
192
  23%|██▎ | 160/711 [16:59<47:40, 5.19s/it]
193
  23%|██▎ | 161/711 [17:04<47:17, 5.16s/it]
194
  23%|██▎ | 162/711 [17:09<47:17, 5.17s/it]
195
  23%|�
 
 
196
  23%|██▎ | 164/711 [17:20<47:49, 5.25s/it]
197
  23%|██▎ | 165/711 [17:25<47:18, 5.20s/it]
198
  23%|██▎ | 166/711 [17:30<47:54, 5.27s/it]
199
  23%|██▎ | 167/711 [17:35<47:21, 5.22s/it]
200
  24%|██▎ | 168/711 [17:41<47:10, 5.21s/it]
201
  24%|██▍ | 169/711 [17:46<46:57, 5.20s/it]
202
  24%|██▍ | 170/711 [17:51<46:38, 5.17s/it]
203
 
204
  24%|██▍ | 170/711 [17:51<46:38, 5.17s/it]
205
  24%|██▍ | 171/711 [17:56<46:21, 5.15s/it]
206
  24%|██▍ | 172/711 [18:01<46:17, 5.15s/it]
207
  24%|██▍ | 173/711 [18:06<46:18, 5.16s/it]
208
  24%|██▍ | 174/711 [18:11<46:01, 5.14s/it]
209
  25%|██▍ | 175/711 [18:16<45:50, 5.13s/it]
210
  25%|██▍ | 176/711 [18:22<45:49, 5.14s/it]
211
  25%|██▍ | 177/711 [18:27<45:49, 5.15s/it]
212
  25%|██▌ | 178/711 [18:32<46:45, 5.26s/it]
213
  25%|██▌ | 179/
 
 
 
214
  25%|██▌ | 180/711 [18:43<45:49, 5.18s/it]
215
 
216
  25%|██▌ | 180/711 [18:43<45:49, 5.18s/it]
217
  25%|██▌ | 181/711 [18:48<45:33, 5.16s/it]
218
  26%|██▌ | 182/711 [18:53<45:22, 5.15s/it]
219
  26%|██▌ | 183/711 [18:58<45:10, 5.13s/it]
220
  26%|██▌ | 184/711 [19:03<45:33, 5.19s/it]
221
  26%|██▌ | 185/711 [19:08<45:10, 5.15s/it]
222
  26%|██▌ | 186/711 [19:13<44:56, 5.14s/it]
223
  26%|██▋ | 187/711 [19:18<44:44, 5.12s/it]
224
  26%|██▋ | 188/711 [19:24<45:35, 5.23s/it]
225
  27%|██▋ | 189/711 [19:29<45:10, 5.19s/it]
226
  27%|██▋ | 190/711 [19:34<44:59, 5.18s/it]
227
 
228
  27%|██▋ | 190/711 [19:34<44:59, 5.18s/it]
229
  27%|██▋ | 191/711 [19:39<44:41, 5.16s/it]
230
  27%|██▋ | 192/711 [19:44<44:29, 5.14s/it]
231
  27%|██▋ | 193/711 [19:50<44:25, 5.15s/it]
 
 
232
  27%|██▋ | 194/711 [19:55<44:10, 5.13s/it]
233
  27%|██▋ | 195/711 [20:00<44:25, 5.17s/it]
234
  28%|██▊ | 196/711 [20:05<44:10, 5.15s/it]
235
  28%|██▊ | 197/711 [20:10<44:27, 5.19s/it]
236
  28%|██▊ | 198/711 [20:16<45:08, 5.28s/it]
237
  28%|██▊ | 199/711 [20:21<44:34, 5.22s/it]
238
  28%|██▊ | 200/711 [20:26<44:17, 5.20s/it]
239
 
240
  28%|██▊ | 200/711 [20:26<44:17, 5.20s/it]
241
  28%|██▊ | 201/711 [20:31<43:55, 5.17s/it]
242
  28%|██▊ | 202/711 [20:36<43:35, 5.14s/it]
243
  29%|██▊ | 203/711 [20:41<43:37, 5.15s/it]
244
  29%|██▊ | 204/711 [20:47<44:14, 5.24s/it]
245
  29%|██▉ | 205/711 [20:52<44:01, 5.22s/it]
246
  29%|██▉ | 206/711 [20:57<43:43, 5.20s/it]
247
  29%|██▉ | 207/711 [21:02<43:30, 5.18s/it]
248
  29%|██▉ | 208/711 [21:07<43:14, 5.16s/it]
249
  29%|██▉ | 209/711 [21:12<43:02, 5.15s/it]
250
  30%|██▉
 
 
 
251
 
252
  30%|██▉ | 210/711 [21:18<42:50, 5.13s/it]
253
  30%|██▉ | 211/711 [21:23<42:40, 5.12s/it]
254
  30%|██▉ | 212/711 [21:28<42:38, 5.13s/it]
255
  30%|██▉ | 213/711 [21:33<42:38, 5.14s/it]
256
  30%|███ | 214/711 [21:38<42:36, 5.14s/it]
257
  30%|███ | 215/711 [21:43<42:24, 5.13s/it]
258
  30%|███ | 216/711 [21:48<42:15, 5.12s/it]
259
  31%|███ | 217/711 [21:53<42:11, 5.12s/it]
260
  31%|███ | 218/711 [21:59<42:01, 5.12s/it]
261
  31%|███ | 219/711 [22:04<42:01, 5.12s/it]
262
  31%|███ | 220/711 [22:09<41:50, 5.11s/it]
263
 
264
  31%|███ | 220/711 [22:09<41:50, 5.11s/it]
265
  31%|███ | 221/711 [22:14<42:03, 5.15s/it]
266
  31%|███ | 222/711 [22:19<42:36, 5.23s/it]
267
  31%|███▏ | 223/711 [22:25<42:39, 5.25s/it]
268
  32%|███▏ | 224/711 [22:30<42
 
 
269
  32%|███▏ | 225/711 [22:35<42:13, 5.21s/it]
270
  32%|███▏ | 226/711 [22:40<42:40, 5.28s/it]
271
  32%|███▏ | 227/711 [22:46<42:11, 5.23s/it]
272
  32%|███▏ | 228/711 [22:51<41:45, 5.19s/it]
273
  32%|███▏ | 229/711 [22:56<41:33, 5.17s/it]
274
  32%|███▏ | 230/711 [23:01<41:16, 5.15s/it]
275
 
276
  32%|███▏ | 230/711 [23:01<41:16, 5.15s/it]
277
  32%|███▏ | 231/711 [23:06<41:02, 5.13s/it]
278
  33%|███▎ | 232/711 [23:11<40:52, 5.12s/it]
279
  33%|███▎ | 233/711 [23:16<40:57, 5.14s/it]
280
  33%|███▎ | 234/711 [23:21<40:58, 5.15s/it]
281
  33%|███▎ | 235/711 [23:27<40:52, 5.15s/it]
282
  33%|███▎ | 236/711 [23:32<40:41, 5.14s/it]
283
  33%|███▎ | 237/711 [23:37<40:32, 5.13s/it]
284
  33%|███▎ | 238/711 [23:42<40:21, 5.12s/it]
285
  34%|███▎ | 239/711 [23:47<40:22, 5.13s/it]
286
  34%|███▍ | 240/
 
 
 
287
 
288
  34%|███▍ | 240/711 [23:52<40:14, 5.13s/it]
289
  34%|███▍ | 241/711 [23:57<40:04, 5.12s/it]
290
  34%|███▍ | 242/711 [24:03<40:30, 5.18s/it]
291
  34%|███▍ | 243/711 [24:08<40:55, 5.25s/it]
292
  34%|███▍ | 244/711 [24:13<40:38, 5.22s/it]
293
  34%|███▍ | 245/711 [24:18<40:16, 5.18s/it]
294
  35%|███▍ | 246/711 [24:23<40:06, 5.18s/it]
295
  35%|███▍ | 247/711 [24:29<40:20, 5.22s/it]
296
  35%|███▍ | 248/711 [24:34<39:58, 5.18s/it]
297
  35%|███▌ | 249/711 [24:39<39:40, 5.15s/it]
298
  35%|███▌ | 250/711 [24:44<39:34, 5.15s/it]
299
 
300
  35%|███▌ | 250/711 [24:44<39:34, 5.15s/it]
301
  35%|███▌ | 251/711 [24:49<39:23, 5.14s/it]
302
  35%|███▌ | 252/711 [24:54<39:08, 5.12s/it]
303
  36%|███▌ | 253/711 [24:59<39:13, 5.14s/it]
304
  36%|███▌
 
 
305
  36%|███▌ | 255/711 [25:10<39:07, 5.15s/it]
306
  36%|███▌ | 256/711 [25:15<39:17, 5.18s/it]
307
  36%|███▌ | 257/711 [25:20<38:57, 5.15s/it]
308
  36%|███▋ | 258/711 [25:26<39:48, 5.27s/it]
309
  36%|███▋ | 259/711 [25:31<39:17, 5.22s/it]
310
  37%|███▋ | 260/711 [25:36<38:58, 5.19s/it]
311
 
312
  37%|███▋ | 260/711 [25:36<38:58, 5.19s/it]
313
  37%|███▋ | 261/711 [25:41<38:40, 5.16s/it]
314
  37%|███▋ | 262/711 [25:46<38:26, 5.14s/it]
315
  37%|███▋ | 263/711 [25:51<38:26, 5.15s/it]
316
  37%|███▋ | 264/711 [25:56<38:28, 5.16s/it]
317
  37%|███▋ | 265/711 [26:02<38:16, 5.15s/it]
318
  37%|███▋ | 266/711 [26:07<38:12, 5.15s/it]
319
  38%|███▊ | 267/711 [26:12<38:50, 5.25s/it]
320
  38%|███▊ | 268/711 [26:18<39:17, 5.32s/it]
321
  38%|███▊ | 269/711 [26:23<38:44, 5.26s/it]
322
  38%|█�
 
 
 
 
323
 
324
  38%|███▊ | 270/711 [26:28<38:17, 5.21s/it]
325
  38%|███▊ | 271/711 [26:33<37:58, 5.18s/it]
326
  38%|███▊ | 272/711 [26:38<37:42, 5.15s/it]
327
  38%|███▊ | 273/711 [26:43<37:32, 5.14s/it]
328
  39%|███▊ | 274/711 [26:49<39:58, 5.49s/it]
329
  39%|███▊ | 275/711 [26:55<39:08, 5.39s/it]
330
  39%|███▉ | 276/711 [27:00<38:28, 5.31s/it]
331
  39%|███▉ | 277/711 [27:05<38:44, 5.36s/it]
332
  39%|███▉ | 278/711 [27:10<38:16, 5.30s/it]
333
  39%|███▉ | 279/711 [27:16<37:45, 5.24s/it]
334
  39%|███▉ | 280/711 [27:21<37:26, 5.21s/it]
335
 
336
  39%|███▉ | 280/711 [27:21<37:26, 5.21s/it]
337
  40%|███▉ | 281/711 [27:26<38:16, 5.34s/it]
338
  40%|███▉ | 282/711 [27:31<37:40, 5.27s/it]
339
  40%|███▉ | 283/711 [27:37<37:37, 5.28s/it]
340
  40%
 
 
341
  40%|████ | 285/711 [27:47<37:34, 5.29s/it]
342
  40%|████ | 286/711 [27:53<37:14, 5.26s/it]
343
  40%|████ | 287/711 [27:58<37:31, 5.31s/it]
344
  41%|████ | 288/711 [28:03<37:04, 5.26s/it]
345
  41%|████ | 289/711 [28:08<36:40, 5.21s/it]
346
  41%|████ | 290/711 [28:13<36:19, 5.18s/it]
347
 
348
  41%|████ | 290/711 [28:13<36:19, 5.18s/it]
349
  41%|████ | 291/711 [28:19<36:32, 5.22s/it]
350
  41%|████ | 292/711 [28:24<36:57, 5.29s/it]
351
  41%|████ | 293/711 [28:29<36:57, 5.31s/it]
352
  41%|████▏ | 294/711 [28:35<36:36, 5.27s/it]
353
  41%|████▏ | 295/711 [28:40<36:18, 5.24s/it]
354
  42%|████▏ | 296/711 [28:45<35:59, 5.20s/it]
355
  42%|████▏ | 297/711 [28:50<36:22, 5.27s/it]
356
  42%|████▏ | 298/711 [28:55<36:02, 5.24s/it]
357
  42%|████▏ | 299/711 [29:
 
 
 
358
  42%|████▏ | 300/711 [29:06<36:01, 5.26s/it]
359
 
360
  42%|████▏ | 300/711 [29:06<36:01, 5.26s/it]
361
  42%|████▏ | 301/711 [29:11<35:42, 5.23s/it]
362
  42%|████▏ | 302/711 [29:16<35:28, 5.20s/it]
363
  43%|████▎ | 303/711 [29:21<35:11, 5.17s/it]
364
  43%|████▎ | 304/711 [29:27<35:03, 5.17s/it]
365
  43%|████▎ | 305/711 [29:32<35:16, 5.21s/it]
366
  43%|████▎ | 306/711 [29:37<34:57, 5.18s/it]
367
  43%|████▎ | 307/711 [29:42<34:42, 5.15s/it]
368
  43%|████▎ | 308/711 [29:47<34:33, 5.15s/it]
369
  43%|████▎ | 309/711 [29:53<35:23, 5.28s/it]
370
  44%|████▎ | 310/711 [29:59<36:23, 5.45s/it]
371
 
372
  44%|████▎ | 310/711 [29:59<36:23, 5.45s/it]
373
  44%|████▎ | 311/711 [30:04<35:40, 5.35s/it]
374
  44%|████▍ | 312/711 [30:09<35:47, 5.38s/it]
375
 
 
 
376
  44%|████▍ | 314/711 [30:20<35:18, 5.34s/it]
377
  44%|████▍ | 315/711 [30:25<35:09, 5.33s/it]
378
  44%|████▍ | 316/711 [30:30<34:42, 5.27s/it]
379
  45%|████▍ | 317/711 [30:35<34:17, 5.22s/it]
380
  45%|████▍ | 318/711 [30:40<34:02, 5.20s/it]
381
  45%|████▍ | 319/711 [30:46<33:48, 5.18s/it]
382
  45%|████▌ | 320/711 [30:51<33:36, 5.16s/it]
383
 
384
  45%|████▌ | 320/711 [30:51<33:36, 5.16s/it]
385
  45%|████▌ | 321/711 [30:56<33:30, 5.16s/it]
386
  45%|████▌ | 322/711 [31:01<33:19, 5.14s/it]
387
  45%|████▌ | 323/711 [31:06<33:37, 5.20s/it]
388
  46%|████▌ | 324/711 [31:11<33:25, 5.18s/it]
389
  46%|████▌ | 325/711 [31:17<33:17, 5.17s/it]
390
  46%|████▌ | 326/711 [31:22<33:09, 5.17s/it]
391
  46%|████▌ | 327/711 [31:27<33:08, 5.18s/it]
392
  46%|███
 
 
 
393
  46%|████▋ | 329/711 [31:37<32:44, 5.14s/it]
394
  46%|████▋ | 330/711 [31:42<32:58, 5.19s/it]
395
 
396
  46%|████▋ | 330/711 [31:42<32:58, 5.19s/it]
397
  47%|████▋ | 331/711 [31:48<32:46, 5.18s/it]
398
  47%|████▋ | 332/711 [31:53<32:35, 5.16s/it]
399
  47%|████▋ | 333/711 [31:58<33:05, 5.25s/it]
400
  47%|████▋ | 334/711 [32:03<32:47, 5.22s/it]
401
  47%|████▋ | 335/711 [32:09<33:10, 5.29s/it]
402
  47%|████▋ | 336/711 [32:14<32:41, 5.23s/it]
403
  47%|████▋ | 337/711 [32:19<32:21, 5.19s/it]
404
  48%|████▊ | 338/711 [32:24<32:19, 5.20s/it]
405
  48%|████▊ | 339/711 [32:29<32:05, 5.18s/it]
406
  48%|████▊ | 340/711 [32:34<31:51, 5.15s/it]
407
 
408
  48%|████▊ | 340/711 [32:34<31:51, 5.15s/it]
409
  48%|████▊ | 341/711 [
 
 
410
  48%|████▊ | 342/711 [32:45<31:45, 5.16s/it]
411
  48%|████▊ | 343/711 [32:50<31:37, 5.16s/it]
412
  48%|████▊ | 344/711 [32:55<31:44, 5.19s/it]
413
  49%|████▊ | 345/711 [33:00<31:29, 5.16s/it]
414
  49%|████▊ | 346/711 [33:05<31:18, 5.15s/it]
415
  49%|████▉ | 347/711 [33:10<31:07, 5.13s/it]
416
  49%|████▉ | 348/711 [33:16<30:59, 5.12s/it]
417
  49%|████▉ | 349/711 [33:21<30:57, 5.13s/it]
418
  49%|████▉ | 350/711 [33:26<30:52, 5.13s/it]
419
 
420
  49%|████▉ | 350/711 [33:26<30:52, 5.13s/it]
421
  49%|████▉ | 351/711 [33:31<30:45, 5.13s/it]
422
  50%|████▉ | 352/711 [33:36<30:36, 5.12s/it]
423
  50%|████▉ | 353/711 [33:41<30:32, 5.12s/it]
424
  50%|████▉ | 354/711 [33:46<30:26, 5.12s/it]
425
  50%|████▉ | 355/711 [33:51<30:20, 5.11s/it]
426
  50%|█████ | 356/711 [33:57<30:13,
 
 
427
  50%|█████ | 357/711 [34:02<30:10, 5.11s/it]
428
  50%|█████ | 358/711 [34:07<30:07, 5.12s/it]
429
  50%|█████ | 359/711 [34:12<30:03, 5.12s/it]
430
  51%|█████ | 360/711 [34:17<29:57, 5.12s/it]
431
 
432
  51%|█████ | 360/711 [34:17<29:57, 5.12s/it]
433
  51%|█████ | 361/711 [34:22<29:53, 5.12s/it]
434
  51%|█████ | 362/711 [34:27<29:46, 5.12s/it]
435
  51%|█████ | 363/711 [34:32<29:41, 5.12s/it]
436
  51%|█████ | 364/711 [34:38<29:39, 5.13s/it]
437
  51%|█████▏ | 365/711 [34:43<29:59, 5.20s/it]
438
  51%|█████▏ | 366/711 [34:48<29:44, 5.17s/it]
439
  52%|█████▏ | 367/711 [34:53<29:30, 5.15s/it]
440
  52%|█████▏ | 368/711 [34:58<29:27, 5.15s/it]
441
  52%|█████▏ | 369/711 [35:03<29:14, 5.13s/it]
442
  52%|█████▏ | 370/711 [35:09<29:11, 5.14s/it]
443
 
 
 
 
444
  52%|█████▏ | 370/711 [35:09<29:11, 5.14s/it]
445
  52%|█████▏ | 371/711 [35:14<29:01, 5.12s/it]
446
  52%|█████▏ | 372/711 [35:19<28:54, 5.12s/it]
447
  52%|█████▏ | 373/711 [35:24<29:01, 5.15s/it]
448
  53%|█████▎ | 374/711 [35:29<28:53, 5.14s/it]
449
  53%|█████▎ | 375/711 [35:34<28:43, 5.13s/it]
450
  53%|█████▎ | 376/711 [35:39<28:33, 5.11s/it]
451
  53%|█████▎ | 377/711 [35:44<28:25, 5.11s/it]
452
  53%|█████▎ | 378/711 [35:49<28:21, 5.11s/it]
453
  53%|█████▎ | 379/711 [35:55<28:52, 5.22s/it]
454
  53%|█████▎ | 380/711 [36:00<28:33, 5.18s/it]
455
 
456
  53%|█████▎ | 380/711 [36:00<28:33, 5.18s/it]
457
  54%|█████▎ | 381/711 [36:05<28:53, 5.25s/it]
458
  54%|█████▎ | 382/711 [36:11<28:32, 5.21s/it]
459
  54%|█████▍ | 383/711 [36:16<28:38, 5.24s/it]
460
  54%|█████▍ | 384/711 [36:2
 
 
461
  54%|█████▍ | 385/711 [36:26<28:02, 5.16s/it]
462
  54%|█████▍ | 386/711 [36:31<27:52, 5.15s/it]
463
  54%|█████▍ | 387/711 [36:36<27:46, 5.14s/it]
464
  55%|█████▍ | 388/711 [36:41<27:35, 5.12s/it]
465
  55%|█████▍ | 389/711 [36:47<27:35, 5.14s/it]
466
  55%|█████▍ | 390/711 [36:52<27:38, 5.17s/it]
467
 
468
  55%|█████▍ | 390/711 [36:52<27:38, 5.17s/it]
469
  55%|█████▍ | 391/711 [36:57<27:26, 5.14s/it]
470
  55%|█████▌ | 392/711 [37:02<27:35, 5.19s/it]
471
  55%|█████▌ | 393/711 [37:07<27:27, 5.18s/it]
472
  55%|█████▌ | 394/711 [37:12<27:12, 5.15s/it]
473
  56%|█████▌ | 395/711 [37:18<27:39, 5.25s/it]
474
  56%|█████▌ | 396/711 [37:23<27:19, 5.20s/it]
475
  56%|█████▌ | 397/711 [37:28<27:03, 5.17s/it]
476
  56%|█████▌ | 398/711 [37:33<26:52, 5.15s/it]
477
  56%|█████▌
 
 
 
478
  56%|█████▋ | 400/711 [37:43<26:35, 5.13s/it]
479
 
480
  56%|█████▋ | 400/711 [37:43<26:35, 5.13s/it]
481
  56%|█████▋ | 401/711 [37:48<26:27, 5.12s/it]
482
  57%|█████▋ | 402/711 [37:54<26:18, 5.11s/it]
483
  57%|█████▋ | 403/711 [37:59<26:11, 5.10s/it]
484
  57%|█████▋ | 404/711 [38:04<26:10, 5.11s/it]
485
  57%|█████▋ | 405/711 [38:09<26:34, 5.21s/it]
486
  57%|█████▋ | 406/711 [38:14<26:25, 5.20s/it]
487
  57%|█████▋ | 407/711 [38:19<26:11, 5.17s/it]
488
  57%|█████▋ | 408/711 [38:25<26:03, 5.16s/it]
489
  58%|█████▊ | 409/711 [38:30<26:30, 5.27s/it]
490
  58%|█████▊ | 410/711 [38:35<26:14, 5.23s/it]
491
 
492
  58%|█████▊ | 410/711 [38:35<26:14, 5.23s/it]
493
  58%|█████▊ | 411/711 [38:41<26:30, 5.30s/it]
494
  58%|███�
 
 
495
  58%|█████▊ | 413/711 [38:51<25:55, 5.22s/it]
496
  58%|█████▊ | 414/711 [38:56<25:40, 5.19s/it]
497
  58%|█████▊ | 415/711 [39:01<25:28, 5.16s/it]
498
  59%|█████▊ | 416/711 [39:07<25:49, 5.25s/it]
499
  59%|█████▊ | 417/711 [39:12<25:31, 5.21s/it]
500
  59%|█████▉ | 418/711 [39:17<25:17, 5.18s/it]
501
  59%|█████▉ | 419/711 [39:22<25:06, 5.16s/it]
502
  59%|█████▉ | 420/711 [39:27<24:59, 5.15s/it]
503
 
504
  59%|█████▉ | 420/711 [39:27<24:59, 5.15s/it]
505
  59%|█████▉ | 421/711 [39:32<24:49, 5.14s/it]
506
  59%|█████▉ | 422/711 [39:37<24:44, 5.14s/it]
507
  59%|█████▉ | 423/711 [39:43<24:35, 5.12s/it]
508
  60%|█████▉ | 424/711 [39:48<24:42, 5.17s/it]
509
  60%|█████▉ | 425/711 [39:53<24:34, 5.16s/it]
510
  60%|█████▉ | 426/711 [39:58<24:30, 5.16s/
 
 
511
  60%|██████ | 427/711 [40:03<24:40, 5.21s/it]
512
  60%|██████ | 428/711 [40:09<24:55, 5.28s/it]
513
  60%|██████ | 429/711 [40:14<24:33, 5.23s/it]
514
  60%|██████ | 430/711 [40:19<24:23, 5.21s/it]
515
 
516
  60%|██████ | 430/711 [40:19<24:23, 5.21s/it]
517
  61%|██████ | 431/711 [40:24<24:10, 5.18s/it]
518
  61%|██████ | 432/711 [40:29<23:56, 5.15s/it]
519
  61%|██████ | 433/711 [40:34<23:50, 5.14s/it]
520
  61%|██████ | 434/711 [40:40<23:39, 5.12s/it]
521
  61%|██████ | 435/711 [40:45<23:32, 5.12s/it]
522
  61%|██████▏ | 436/711 [40:50<24:14, 5.29s/it]
523
  61%|██████▏ | 437/711 [40:56<24:11, 5.30s/it]
524
  62%|██████▏ | 438/711 [41:01<23:49, 5.24s/it]
525
  62%|██████▏ | 439/711 [41:06<23:33, 5.20s/it]
526
  62%|██████▏ | 440/711 [41:11<23:37, 5.23s/it]
527
 
 
 
 
528
  62%|██████▏ | 440/711 [41:11<23:37, 5.23s/it]
529
  62%|██████▏ | 441/711 [41:16<23:23, 5.20s/it]
530
  62%|██████▏ | 442/711 [41:21<23:13, 5.18s/it]
531
  62%|██████▏ | 443/711 [41:27<23:01, 5.16s/it]
532
  62%|██████▏ | 444/711 [41:32<22:57, 5.16s/it]
533
  63%|██████▎ | 445/711 [41:37<23:14, 5.24s/it]
534
  63%|██████▎ | 446/711 [41:42<22:58, 5.20s/it]
535
  63%|██████▎ | 447/711 [41:47<22:45, 5.17s/it]
536
  63%|██████▎ | 448/711 [41:52<22:34, 5.15s/it]
537
  63%|██████▎ | 449/711 [41:58<22:43, 5.20s/it]
538
  63%|██████▎ | 450/711 [42:03<22:32, 5.18s/it]
539
 
540
  63%|██████▎ | 450/711 [42:03<22:32, 5.18s/it]
541
  63%|██████▎ | 451/711 [42:08<22:20, 5.16s/it]
542
  64%|██████▎ | 452/711 [42:13<22:14, 5.15s/it]
543
  64%|██████▎ | 453/711 [42:18<22:09, 5
 
 
544
  64%|██████▍ | 454/711 [42:23<22:03, 5.15s/it]
545
  64%|██████▍ | 455/711 [42:29<21:56, 5.14s/it]
546
  64%|██████▍ | 456/711 [42:34<21:51, 5.14s/it]
547
  64%|██████▍ | 457/711 [42:39<21:44, 5.14s/it]
548
  64%|██████▍ | 458/711 [42:44<21:40, 5.14s/it]
549
  65%|██████▍ | 459/711 [42:49<21:39, 5.16s/it]
550
  65%|██████▍ | 460/711 [42:54<21:46, 5.21s/it]
551
 
552
  65%|██████▍ | 460/711 [42:54<21:46, 5.21s/it]
553
  65%|██████▍ | 461/711 [43:00<21:38, 5.19s/it]
554
  65%|██████▍ | 462/711 [43:05<21:30, 5.18s/it]
555
  65%|██████▌ | 463/711 [43:10<21:25, 5.18s/it]
556
  65%|██████▌ | 464/711 [43:15<21:17, 5.17s/it]
557
  65%|██████▌ | 465/711 [43:20<21:05, 5.15s/it]
558
  66%|██████▌ | 466/711 [43:25<20:58, 5.14s/it]
559
  66%|██████▌ | 467/711 [43:30<20:53, 5.14s/it]
560
  66%
 
 
 
561
  66%|██████▌ | 469/711 [43:41<21:10, 5.25s/it]
562
  66%|██████▌ | 470/711 [43:46<20:53, 5.20s/it]
563
 
564
  66%|██████▌ | 470/711 [43:46<20:53, 5.20s/it]
565
  66%|██████▌ | 471/711 [43:51<20:45, 5.19s/it]
566
  66%|██████▋ | 472/711 [43:57<20:40, 5.19s/it]
567
  67%|██████▋ | 473/711 [44:02<20:28, 5.16s/it]
568
  67%|██████▋ | 474/711 [44:07<20:23, 5.16s/it]
569
  67%|██████▋ | 475/711 [44:12<20:12, 5.14s/it]
570
  67%|██████▋ | 476/711 [44:18<20:43, 5.29s/it]
571
  67%|██████▋ | 477/711 [44:23<20:23, 5.23s/it]
572
  67%|██████▋ | 478/711 [44:28<20:24, 5.26s/it]
573
  67%|██████▋ | 479/711 [44:33<20:10, 5.22s/it]
574
  68%|██████▊ | 480/711 [44:38<19:59, 5.19s/it]
575
 
576
  68%|██████▊ |
 
 
577
  68%|██████▊ | 481/711 [44:44<20:13, 5.28s/it]
578
  68%|██████▊ | 482/711 [44:49<20:00, 5.24s/it]
579
  68%|██████▊ | 483/711 [44:54<19:48, 5.21s/it]
580
  68%|██████▊ | 484/711 [44:59<19:39, 5.20s/it]
581
  68%|██████▊ | 485/711 [45:04<19:33, 5.19s/it]
582
  68%|██████▊ | 486/711 [45:10<19:26, 5.18s/it]
583
  68%|██████▊ | 487/711 [45:15<19:19, 5.18s/it]
584
  69%|██████▊ | 488/711 [45:20<19:09, 5.15s/it]
585
  69%|██████▉ | 489/711 [45:25<19:24, 5.25s/it]
586
  69%|██████▉ | 490/711 [45:30<19:09, 5.20s/it]
587
 
588
  69%|██████▉ | 490/711 [45:30<19:09, 5.20s/it]
589
  69%|██████▉ | 491/711 [45:36<19:16, 5.26s/it]
590
  69%|██████▉ | 492/711 [45:41<19:00, 5.21s/it]
591
  69%|██████▉ | 493/711 [45:46<18:47, 5.17s/it]
592
  69%|██████▉ | 494/711 [45:
 
 
593
  70%|██████▉ | 495/711 [45:56<18:45, 5.21s/it]
594
  70%|██████▉ | 496/711 [46:02<18:34, 5.19s/it]
595
  70%|██████▉ | 497/711 [46:07<18:38, 5.22s/it]
596
  70%|███████ | 498/711 [46:12<18:25, 5.19s/it]
597
  70%|███████ | 499/711 [46:18<18:55, 5.35s/it]
598
  70%|███████ | 500/711 [46:23<18:34, 5.28s/it]
599
 
600
  70%|███████ | 500/711 [46:23<18:34, 5.28s/it]
601
  70%|███████ | 501/711 [46:28<18:22, 5.25s/it]
602
  71%|███████ | 502/711 [46:33<18:12, 5.23s/it]
603
  71%|███████ | 503/711 [46:38<18:04, 5.21s/it]
604
  71%|███████ | 504/711 [46:44<17:53, 5.19s/it]
605
  71%|███████ | 505/711 [46:49<17:43, 5.16s/it]
606
  71%|███████ | 506/711 [46:54<17:36, 5.16s/it]
607
  71%|███████▏ | 507/711 [46:59<17:29, 5.14s/it]
608
  71%|███████▏ | 508/711 [47:04<17:21,
 
 
 
609
  72%|███████▏ | 509/711 [47:09<17:16, 5.13s/it]
610
  72%|███████▏ | 510/711 [47:14<17:09, 5.12s/it]
611
 
612
  72%|███████▏ | 510/711 [47:14<17:09, 5.12s/it]
613
  72%|███████▏ | 511/711 [47:20<17:21, 5.21s/it]
614
  72%|███████▏ | 512/711 [47:25<17:12, 5.19s/it]
615
  72%|███████▏ | 513/711 [47:30<17:00, 5.16s/it]
616
  72%|███████▏ | 514/711 [47:35<16:51, 5.14s/it]
617
  72%|███████▏ | 515/711 [47:40<16:45, 5.13s/it]
618
  73%|███████▎ | 516/711 [47:45<16:40, 5.13s/it]
619
  73%|███████▎ | 517/711 [47:50<16:32, 5.12s/it]
620
  73%|███████▎ | 518/711 [47:55<16:30, 5.13s/it]
621
  73%|███████▎ | 519/711 [48:01<16:25, 5.13s/it]
622
  73%|███████▎ | 520/711 [48:06<16:42, 5.25s/it]
623
 
624
  73%|███████▎ | 520/711 [48:06<16:4
 
 
625
  73%|███████▎ | 521/711 [48:11<16:30, 5.21s/it]
626
  73%|███████▎ | 522/711 [48:16<16:18, 5.18s/it]
627
  74%|███████▎ | 523/711 [48:22<16:20, 5.22s/it]
628
  74%|███████▎ | 524/711 [48:27<16:36, 5.33s/it]
629
  74%|███████▍ | 525/711 [48:32<16:20, 5.27s/it]
630
  74%|███████▍ | 526/711 [48:37<16:07, 5.23s/it]
631
  74%|███████▍ | 527/711 [48:43<15:53, 5.18s/it]
632
  74%|███████▍ | 528/711 [48:48<15:54, 5.22s/it]
633
  74%|███████▍ | 529/711 [48:53<15:42, 5.18s/it]
634
  75%|███████▍ | 530/711 [48:58<15:38, 5.18s/it]
635
 
636
  75%|███████▍ | 530/711 [48:58<15:38, 5.18s/it]
637
  75%|███████▍ | 531/711 [49:03<15:27, 5.15s/it]
638
  75%|███████▍ | 532/711 [49:08<15:19, 5.14s/it]
639
  75%|███████▍ | 533/711 [49:13<15:11, 5.12s/it]
640
  75%|███████▌ | 53
 
 
641
  75%|███████▌ | 535/711 [49:24<14:58, 5.11s/it]
642
  75%|███████▌ | 536/711 [49:29<15:04, 5.17s/it]
643
  76%|███████▌ | 537/711 [49:34<14:59, 5.17s/it]
644
  76%|███████▌ | 538/711 [49:39<14:48, 5.14s/it]
645
  76%|███████▌ | 539/711 [49:44<14:46, 5.16s/it]
646
  76%|███████▌ | 540/711 [49:49<14:41, 5.15s/it]
647
 
648
  76%|███████▌ | 540/711 [49:49<14:41, 5.15s/it]
649
  76%|███████▌ | 541/711 [49:55<14:33, 5.14s/it]
650
  76%|███████▌ | 542/711 [50:00<14:26, 5.13s/it]
651
  76%|███████▋ | 543/711 [50:05<14:23, 5.14s/it]
652
  77%|███████▋ | 544/711 [50:10<14:17, 5.13s/it]
653
  77%|███████▋ | 545/711 [50:15<14:10, 5.12s/it]
654
  77%|███████▋ | 546/711 [50:20<14:04, 5.12s/it]
655
  77%|███████▋ | 547/711 [50:25<13:59, 5.12s/it]
656
  77%|████�
 
 
 
657
  77%|███████▋ | 549/711 [50:35<13:47, 5.11s/it]
658
  77%|███████▋ | 550/711 [50:41<13:42, 5.11s/it]
659
 
660
  77%|███████▋ | 550/711 [50:41<13:42, 5.11s/it]
661
  77%|███████▋ | 551/711 [50:46<13:40, 5.13s/it]
662
  78%|███████▊ | 552/711 [50:51<13:36, 5.13s/it]
663
  78%|███████▊ | 553/711 [50:56<13:29, 5.12s/it]
664
  78%|███████▊ | 554/711 [51:01<13:24, 5.12s/it]
665
  78%|███████▊ | 555/711 [51:06<13:21, 5.14s/it]
666
  78%|███████▊ | 556/711 [51:11<13:14, 5.12s/it]
667
  78%|███████▊ | 557/711 [51:17<13:25, 5.23s/it]
668
  78%|███████▊ | 558/711 [51:22<13:32, 5.31s/it]
669
  79%|███████▊ | 559/711 [51:27<13:18, 5.25s/it]
670
  79%|███████▉ | 560/711 [51:33<13:06, 5.21s/it]
671
 
672
  79%|███�
 
 
673
  79%|███████▉ | 561/711 [51:38<12:58, 5.19s/it]
674
  79%|███████▉ | 562/711 [51:43<13:13, 5.33s/it]
675
  79%|███████▉ | 563/711 [51:49<13:11, 5.35s/it]
676
  79%|███████▉ | 564/711 [51:54<12:58, 5.29s/it]
677
  79%|███████▉ | 565/711 [51:59<12:47, 5.26s/it]
678
  80%|███████▉ | 566/711 [52:04<12:34, 5.21s/it]
679
  80%|███████▉ | 567/711 [52:10<12:40, 5.28s/it]
680
  80%|███████▉ | 568/711 [52:15<12:28, 5.23s/it]
681
  80%|████████ | 569/711 [52:20<12:27, 5.26s/it]
682
  80%|████████ | 570/711 [52:25<12:14, 5.21s/it]
683
 
684
  80%|████████ | 570/711 [52:25<12:14, 5.21s/it]
685
  80%|████████ | 571/711 [52:31<12:14, 5.24s/it]
686
  80%|████████ | 572/711 [52:36<12:04, 5.21s/it]
687
  81%|████████ | 573/711 [52:41<12:02, 5.24s/it]
 
 
688
  81%|████████ | 574/711 [52:46<11:53, 5.21s/it]
689
  81%|████████ | 575/711 [52:51<11:44, 5.18s/it]
690
  81%|████████ | 576/711 [52:56<11:38, 5.17s/it]
691
  81%|████████ | 577/711 [53:02<11:46, 5.27s/it]
692
  81%|████████▏ | 578/711 [53:07<11:49, 5.33s/it]
693
  81%|████████▏ | 579/711 [53:12<11:34, 5.26s/it]
694
  82%|████████▏ | 580/711 [53:18<11:25, 5.23s/it]
695
 
696
  82%|████████▏ | 580/711 [53:18<11:25, 5.23s/it]
697
  82%|████████▏ | 581/711 [53:23<11:15, 5.19s/it]
698
  82%|████████▏ | 582/711 [53:28<11:06, 5.16s/it]
699
  82%|████████▏ | 583/711 [53:33<10:59, 5.15s/it]
700
  82%|████████▏ | 584/711 [53:38<10:52, 5.14s/it]
701
  82%|████████▏ | 585/711 [53:43<10:46, 5.13s/it]
702
  82%|████████▏ | 586/711 [53:48<10:42, 5.14s/it]
703
  83%|███████�
 
 
704
  83%|████████▎ | 588/711 [53:59<10:55, 5.33s/it]
705
  83%|████████▎ | 589/711 [54:05<10:40, 5.25s/it]
706
  83%|████████▎ | 590/711 [54:10<10:43, 5.32s/it]
707
 
708
  83%|████████▎ | 590/711 [54:10<10:43, 5.32s/it]
709
  83%|████████▎ | 591/711 [54:15<10:32, 5.27s/it]
710
  83%|████████▎ | 592/711 [54:20<10:23, 5.24s/it]
711
  83%|████████▎ | 593/711 [54:25<10:14, 5.21s/it]
712
  84%|████████▎ | 594/711 [54:31<10:07, 5.19s/it]
713
  84%|████████▎ | 595/711 [54:36<09:59, 5.17s/it]
714
  84%|████████▍ | 596/711 [54:41<09:51, 5.14s/it]
715
  84%|████████▍ | 597/711 [54:46<09:53, 5.21s/it]
716
  84%|████████▍ | 598/711 [54:51<09:45, 5.18s/it]
717
  84%|████████▍ | 599/711 [54:57<10:07, 5.42s/it]
718
  84%|████████▍ | 600/711 [55:02
 
 
 
719
 
720
  84%|████████▍ | 600/711 [55:02<09:53, 5.35s/it]
721
  85%|████████▍ | 601/711 [55:08<09:48, 5.35s/it]
722
  85%|████████▍ | 602/711 [55:13<09:35, 5.28s/it]
723
  85%|████████▍ | 603/711 [55:18<09:24, 5.23s/it]
724
  85%|████████▍ | 604/711 [55:23<09:22, 5.26s/it]
725
  85%|████████▌ | 605/711 [55:28<09:12, 5.22s/it]
726
  85%|████████▌ | 606/711 [55:34<09:27, 5.41s/it]
727
  85%|████████▌ | 607/711 [55:40<09:29, 5.47s/it]
728
  86%|████████▌ | 608/711 [55:45<09:17, 5.41s/it]
729
  86%|████████▌ | 609/711 [55:50<09:03, 5.32s/it]
730
  86%|████████▌ | 610/711 [55:56<08:56, 5.31s/it]
731
 
732
  86%|████████▌ | 610/711 [55:56<08:56, 5.31s/it]
733
  86%|████████▌ | 611/711 [56:01<08:45, 5.25s/it]
734
  86%|████�
 
 
735
  86%|████████▌ | 613/711 [56:11<08:34, 5.24s/it]
736
  86%|████████▋ | 614/711 [56:16<08:25, 5.22s/it]
737
  86%|████████▋ | 615/711 [56:21<08:17, 5.19s/it]
738
  87%|████████▋ | 616/711 [56:28<08:41, 5.48s/it]
739
  87%|████████▋ | 617/711 [56:33<08:24, 5.37s/it]
740
  87%|████████▋ | 618/711 [56:38<08:23, 5.41s/it]
741
  87%|████████▋ | 619/711 [56:43<08:09, 5.33s/it]
742
  87%|████████▋ | 620/711 [56:49<07:59, 5.27s/it]
743
 
744
  87%|████████▋ | 620/711 [56:49<07:59, 5.27s/it]
745
  87%|████████▋ | 621/711 [56:54<07:51, 5.24s/it]
746
  87%|████████▋ | 622/711 [56:59<07:43, 5.21s/it]
747
  88%|████████▊ | 623/711 [57:04<07:44, 5.28s/it]
748
  88%|████████▊ | 624/711 [57:09<07:35, 5.23s/it]
749
  88%|████████▊ | 625/7
 
 
750
  88%|████████▊ | 626/711 [57:20<07:19, 5.17s/it]
751
  88%|████████▊ | 627/711 [57:25<07:12, 5.14s/it]
752
  88%|████████▊ | 628/711 [57:30<07:07, 5.15s/it]
753
  88%|████████▊ | 629/711 [57:35<07:03, 5.16s/it]
754
  89%|████████▊ | 630/711 [57:41<07:25, 5.49s/it]
755
 
756
  89%|████████▊ | 630/711 [57:41<07:25, 5.49s/it]
757
  89%|████████▊ | 631/711 [57:47<07:13, 5.42s/it]
758
  89%|████████▉ | 632/711 [57:52<07:01, 5.33s/it]
759
  89%|████████▉ | 633/711 [57:57<06:51, 5.28s/it]
760
  89%|████████▉ | 634/711 [58:02<06:42, 5.23s/it]
761
  89%|████████▉ | 635/711 [58:07<06:35, 5.21s/it]
762
  89%|████████▉ | 636/711 [58:13<06:36, 5.28s/it]
763
  90%|████████▉ | 637/711 [58:18<06:26, 5.23s/it]
764
  90%|████████▉ | 638/711 [58:23<06:19, 5.2
 
 
 
765
  90%|████████▉ | 639/711 [58:28<06:19, 5.28s/it]
766
  90%|█████████ | 640/711 [58:33<06:11, 5.24s/it]
767
 
768
  90%|█████████ | 640/711 [58:33<06:11, 5.24s/it]
769
  90%|█████████ | 641/711 [58:39<06:10, 5.30s/it]
770
  90%|█████████ | 642/711 [58:44<06:02, 5.25s/it]
771
  90%|█████████ | 643/711 [58:49<05:53, 5.20s/it]
772
  91%|█████████ | 644/711 [58:54<05:46, 5.17s/it]
773
  91%|█████████ | 645/711 [59:00<05:46, 5.26s/it]
774
  91%|█████████ | 646/711 [59:05<05:38, 5.21s/it]
775
  91%|█████████ | 647/711 [59:10<05:31, 5.18s/it]
776
  91%|█████████ | 648/711 [59:15<05:25, 5.17s/it]
777
  91%|█████████▏| 649/711 [59:20<05:18, 5.14s/it]
778
  91%|█████████▏| 650/711 [59:25<05:18, 5.23s/it]
779
 
780
  91%|███████�
 
 
781
  92%|█████████▏| 651/711 [59:31<05:11, 5.20s/it]
782
  92%|█████████▏| 652/711 [59:36<05:04, 5.16s/it]
783
  92%|█████████▏| 653/711 [59:41<05:07, 5.31s/it]
784
  92%|█████████▏| 654/711 [59:46<04:58, 5.24s/it]
785
  92%|█████████▏| 655/711 [59:52<04:52, 5.22s/it]
786
  92%|█████████▏| 656/711 [59:57<04:51, 5.30s/it]
787
  92%|█████████▏| 657/711 [1:00:02<04:43, 5.26s/it]
788
  93%|█████████▎| 658/711 [1:00:07<04:36, 5.22s/it]
789
  93%|█████████▎| 659/711 [1:00:13<04:32, 5.23s/it]
790
  93%|█████████▎| 660/711 [1:00:18<04:24, 5.19s/it]
791
 
792
  93%|█████████▎| 660/711 [1:00:18<04:24, 5.19s/it]
793
  93%|█████████▎| 661/711 [1:00:23<04:23, 5.28s/it]
794
  93%|█████████▎| 662/711 [1:00:28<04:16, 5.23s/it]
795
  93%
 
 
796
  93%|█████████▎| 664/711 [1:00:39<04:02, 5.17s/it]
797
  94%|█████████▎| 665/711 [1:00:44<03:57, 5.16s/it]
798
  94%|█████████▎| 666/711 [1:00:49<03:54, 5.21s/it]
799
  94%|█████████▍| 667/711 [1:00:54<03:48, 5.19s/it]
800
  94%|█████████▍| 668/711 [1:00:59<03:41, 5.16s/it]
801
  94%|█████████▍| 669/711 [1:01:04<03:35, 5.14s/it]
802
  94%|█████████▍| 670/711 [1:01:10<03:33, 5.22s/it]
803
 
804
  94%|█████████▍| 670/711 [1:01:10<03:33, 5.22s/it]
805
  94%|█████████▍| 671/711 [1:01:15<03:27, 5.18s/it]
806
  95%|█████████▍| 672/711 [1:01:20<03:23, 5.22s/it]
807
  95%|█████████▍| 673/711 [1:01:25<03:17, 5.20s/it]
808
  95%|█████████▍| 674/711 [1:01:30<03:10, 5.16s/it]
809
  95%|█████████▍| 67
 
 
810
  95%|█████████▌| 676/711 [1:01:42<03:13, 5.53s/it]
811
  95%|█████████▌| 677/711 [1:01:48<03:11, 5.63s/it]
812
  95%|█████████▌| 678/711 [1:01:53<03:02, 5.52s/it]
813
  95%|█████████▌| 679/711 [1:01:58<02:52, 5.38s/it]
814
  96%|█████████▌| 680/711 [1:02:03<02:44, 5.29s/it]
815
 
816
  96%|█████████▌| 680/711 [1:02:03<02:44, 5.29s/it]
817
  96%|█████████▌| 681/711 [1:02:08<02:37, 5.25s/it]
818
  96%|█████████▌| 682/711 [1:02:14<02:33, 5.31s/it]
819
  96%|█████████▌| 683/711 [1:02:19<02:27, 5.25s/it]
820
  96%|█████████▌| 684/711 [1:02:24<02:20, 5.21s/it]
821
  96%|█████████▋| 685/711 [1:02:29<02:14, 5.18s/it]
822
  96%|█████████▋| 686/711 [1:02:34<02:09, 5.17s/it]
823
  97%|█████████▋| 687/711 [1:02:39<02:03, 5.17s/it]
824
  9
 
 
825
  97%|█████████▋| 689/711 [1:02:50<01:53, 5.15s/it]
826
  97%|█████████▋| 690/711 [1:02:55<01:47, 5.14s/it]
827
 
828
  97%|█████████▋| 690/711 [1:02:55<01:47, 5.14s/it]
829
  97%|█████████▋| 691/711 [1:03:00<01:43, 5.19s/it]
830
  97%|█████████▋| 692/711 [1:03:05<01:37, 5.16s/it]
831
  97%|█████████▋| 693/711 [1:03:10<01:32, 5.14s/it]
832
  98%|█████████▊| 694/711 [1:03:15<01:27, 5.13s/it]
833
  98%|█████████▊| 695/711 [1:03:21<01:22, 5.13s/it]
834
  98%|█████████▊| 696/711 [1:03:26<01:17, 5.14s/it]
835
  98%|█████████▊| 697/711 [1:03:31<01:11, 5.14s/it]
836
  98%|█████████▊| 698/711 [1:03:36<01:06, 5.14s/it]
837
  98%|█████████▊| 699/711 [1:03:41<01:02, 5.17s/it]
838
  98%|█████████▊|
 
 
 
839
 
840
  98%|█████████▊| 700/711 [1:03:47<00:57, 5.24s/it]
841
  99%|█████████▊| 701/711 [1:03:52<00:52, 5.26s/it]
842
  99%|█████████▊| 702/711 [1:03:57<00:47, 5.23s/it]
843
  99%|█████████▉| 703/711 [1:04:02<00:41, 5.20s/it]
844
  99%|█████████▉| 704/711 [1:04:07<00:36, 5.20s/it]
845
  99%|█████████▉| 705/711 [1:04:13<00:31, 5.30s/it]
846
  99%|█████████▉| 706/711 [1:04:18<00:26, 5.24s/it]
847
  99%|█████████▉| 707/711 [1:04:23<00:21, 5.27s/it]
848
 
 
 
 
 
849
 
 
 
 
 
 
1
+ 1: W1124 00:08:17.923000 737761 torch/distributed/run.py:792]
2
+ 1: W1124 00:08:17.923000 737761 torch/distributed/run.py:792] *****************************************
3
+ 1: W1124 00:08:17.923000 737761 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
4
+ 1: W1124 00:08:17.923000 737761 torch/distributed/run.py:792] *****************************************
5
+ 0: W1124 00:08:17.924000 3081902 torch/distributed/run.py:792]
6
+ 0: W1124 00:08:17.924000 3081902 torch/distributed/run.py:792] *****************************************
7
+ 0: W1124 00:08:17.924000 3081902 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
8
+ 0: W1124 00:08:17.924000 3081902 torch/distributed/run.py:792] *****************************************
9
+ 2: W1124 00:08:17.928000 1779991 torch/distributed/run.py:792]
10
+ 2: W1124 00:08:17.928000 1779991 torch/distributed/run.py:792] *****************************************
11
+ 2: W1124 00:08:17.928000 1779991 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
12
+ 2: W1124 00:08:17.928000 1779991 torch/distributed/run.py:792] *****************************************
13
+ 3: W1124 00:08:17.934000 3626745 torch/distributed/run.py:792]
14
+ 3: W1124 00:08:17.934000 3626745 torch/distributed/run.py:792] *****************************************
15
+ 3: W1124 00:08:17.934000 3626745 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
16
+ 3: W1124 00:08:17.934000 3626745 torch/distributed/run.py:792] *****************************************
17
+ 2: [2025-11-24 00:08:36,323] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:119] [PID:1780066] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing`
18
+ 0: [2025-11-24 00:08:36,323] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:119] [PID:3081979] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing`
19
+ 2: [2025-11-24 00:08:36,323] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:218] [PID:1780066] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing
20
+ 0: [2025-11-24 00:08:36,323] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:218] [PID:3081979] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing
21
+ 3: [2025-11-24 00:08:36,434] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:119] [PID:3626820] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing`
22
+ 3: [2025-11-24 00:08:36,434] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:218] [PID:3626820] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing
23
+ 1: [2025-11-24 00:08:36,535] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:119] [PID:737836] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing`
24
+ 1: [2025-11-24 00:08:36,535] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:218] [PID:737836] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing
25
+ 0: [2025-11-24 00:08:40,005] [WARNING] [axolotl.utils.config.normalize_config:139] [PID:3081979] [RANK:0] Invalid value for save_steps (1.6666666666666667) from saves_per_epoch and/or num_epochs. Saving at training end only.
26
+ 0: [2025-11-24 00:08:40,025] [INFO] [axolotl.cli.config.load_cfg:245] [PID:3081979] [RANK:0] config:
27
+ 0: {
28
+ 0: "activation_offloading": false,
29
+ 0: "auto_resume_from_checkpoints": true,
30
+ 0: "axolotl_config_path": "/lustre/fswork/projects/rech/dgo/udv55np/train/tmp/1763939290349239182.yaml",
31
+ 0: "base_model": "/lustre/fswork/projects/rech/qwv/udv55np/Gemma/base/gemma-3-12b",
32
+ 0: "base_model_config": "/lustre/fswork/projects/rech/qwv/udv55np/Gemma/base/gemma-3-12b",
33
+ 0: "batch_size": 16,
34
+ 0: "bf16": true,
35
+ 0: "capabilities": {
36
+ 0: "bf16": true,
37
+ 0: "compute_capability": "sm_90",
38
+ 0: "fp8": false,
39
+ 0: "n_gpu": 16,
40
+ 0: "n_node": 1
41
+ 0: },
42
+ 0: "chat_template": "gemma3",
43
+ 0: "context_parallel_size": 1,
44
+ 0: "dataloader_num_workers": 16,
45
+ 0: "dataloader_pin_memory": true,
46
+ 0: "dataloader_prefetch_factor": 256,
47
+ 0: "dataset_prepared_path": "/lustre/fswork/projects/rech/dgo/udv55np/dataset_gemma/Nemotron-Super-49B-v1_5/split_0",
48
+ 0: "dataset_processes": 192,
49
+ 0: "datasets": [
50
+ 0: {
51
+ 0: "chat_template": "tokenizer_default",
52
+ 0: "data_files": [
53
+ 0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0007.jsonl",
54
+ 0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0009.jsonl",
55
+ 0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0005.jsonl",
56
+ 0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0006.jsonl",
57
+ 0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0014.jsonl",
58
+ 0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0010.jsonl",
59
+ 0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0012.jsonl",
60
+ 0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0008.jsonl",
61
+ 0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0001.jsonl",
62
+ 0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0002.jsonl",
63
+ 0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0013.jsonl",
64
+ 0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0015.jsonl",
65
+ 0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0004.jsonl",
66
+ 0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0011.jsonl",
67
+ 0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0000.jsonl",
68
+ 0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0003.jsonl"
69
+ 0: ],
70
+ 0: "ds_type": "json",
71
+ 0: "field_messages": "conversations",
72
+ 0: "message_property_mappings": {
73
+ 0: "content": "content",
74
+ 0: "role": "role"
75
+ 0: },
76
+ 0: "path": "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking",
77
+ 0: "trust_remote_code": false,
78
+ 0: "type": "chat_template"
79
+ 0: }
80
+ 0: ],
81
+ 0: "ddp": true,
82
+ 0: "deepspeed": {
83
+ 0: "bf16": {
84
+ 0: "enabled": true
85
+ 0: },
86
+ 0: "gradient_accumulation_steps": "auto",
87
+ 0: "gradient_clipping": "auto",
88
+ 0: "train_batch_size": "auto",
89
+ 0: "train_micro_batch_size_per_gpu": "auto",
90
+ 0: "wall_clock_breakdown": false,
91
+ 0: "zero_optimization": {
92
+ 0: "contiguous_gradients": true,
93
+ 0: "overlap_comm": true,
94
+ 0: "reduce_bucket_size": "auto",
95
+ 0: "stage": 3,
96
+ 0: "stage3_gather_16bit_weights_on_model_save": true,
97
+ 0: "stage3_param_persistence_threshold": "auto",
98
+ 0: "stage3_prefetch_bucket_size": "auto",
99
+ 0: "sub_group_size": 0
100
+ 0: }
101
+ 0: },
102
+ 0: "device": "cuda:0",
103
+ 0: "device_map": {
104
+ 0: "": 0
105
+ 0: },
106
+ 0: "dion_rank_fraction": 1.0,
107
+ 0: "dion_rank_multiple_of": 1,
108
+ 0: "env_capabilities": {
109
+ 0: "torch_version": "2.6.0"
110
+ 0: },
111
+ 0: "eot_tokens": [
112
+ 0: "<end_of_turn>"
113
+ 0: ],
114
+ 0: "eval_batch_size": 1,
115
+ 0: "eval_causal_lm_metrics": [
116
+ 0: "sacrebleu",
117
+ 0: "comet",
118
+ 0: "ter",
119
+ 0: "chrf"
120
+ 0: ],
121
+ 0: "eval_max_new_tokens": 128,
122
+ 0: "eval_sample_packing": true,
123
+ 0: "eval_table_size": 0,
124
+ 0: "evals_per_epoch": 0,
125
+ 0: "flash_attention": true,
126
+ 0: "fp16": false,
127
+ 0: "gradient_accumulation_steps": 1,
128
+ 0: "gradient_checkpointing": true,
129
+ 0: "gradient_checkpointing_kwargs": {
130
+ 0: "use_reentrant": true
131
+ 0: },
132
+ 0: "is_multimodal": true,
133
+ 0: "learning_rate": 2e-06,
134
+ 0: "lisa_layers_attribute": "model.layers",
135
+ 0: "load_best_model_at_end": false,
136
+ 0: "load_in_4bit": false,
137
+ 0: "load_in_8bit": false,
138
+ 0: "local_rank": 0,
139
+ 0: "logging_steps": 10,
140
+ 0: "lora_dropout": 0.0,
141
+ 0: "loraplus_lr_embedding": 1e-06,
142
+ 0: "lr_scheduler": "warmup_stable_decay",
143
+ 0: "lr_scheduler_kwargs": {
144
+ 0: "min_lr_ratio": 0.1,
145
+ 0: "num_decay_steps": 200
146
+ 0: },
147
+ 0: "max_prompt_len": 512,
148
+ 0: "mean_resizing_embeddings": false,
149
+ 0: "micro_batch_size": 1,
150
+ 0: "model_config_type": "gemma3",
151
+ 0: "num_epochs": 0.6,
152
+ 0: "optimizer": "adamw_torch_fused",
153
+ 0: "output_dir": "/lustre/fswork/projects/rech/dgo/udv55np/ift/Nemotron-Super-49B-v1_5/gemma-3-12b/0",
154
+ 0: "pad_to_sequence_len": true,
155
+ 0: "pretrain_multipack_attn": true,
156
+ 0: "pretrain_multipack_buffer_size": 10000,
157
+ 0: "processor_config": "/lustre/fswork/projects/rech/qwv/udv55np/Gemma/base/gemma-3-12b",
158
+ 0: "profiler_steps_start": 0,
159
+ 0: "qlora_sharded_model_loading": false,
160
+ 0: "ray_num_workers": 1,
161
+ 0: "resources_per_worker": {
162
+ 0: "GPU": 1
163
+ 0: },
164
+ 0: "sample_packing": true,
165
+ 0: "sample_packing_bin_size": 200,
166
+ 0: "sample_packing_group_size": 100000,
167
+ 0: "save_only_model": true,
168
+ 0: "save_safetensors": true,
169
+ 0: "save_total_limit": 20,
170
+ 0: "saves_per_epoch": 1,
171
+ 0: "sequence_len": 16384,
172
+ 0: "shuffle_before_merging_datasets": false,
173
+ 0: "shuffle_merged_datasets": true,
174
+ 0: "skip_prepare_dataset": false,
175
+ 0: "strict": false,
176
+ 0: "tensor_parallel_size": 1,
177
+ 0: "tf32": false,
178
+ 0: "tiled_mlp_use_original_mlp": true,
179
+ 0: "tokenizer_config": "/lustre/fswork/projects/rech/qwv/udv55np/Gemma/base/gemma-3-27b",
180
+ 0: "torch_dtype": "torch.bfloat16",
181
+ 0: "train_on_inputs": false,
182
+ 0: "trl": {
183
+ 0: "log_completions": false,
184
+ 0: "mask_truncated_completions": false,
185
+ 0: "ref_model_mixup_alpha": 0.9,
186
+ 0: "ref_model_sync_steps": 64,
187
+ 0: "scale_rewards": true,
188
+ 0: "sync_ref_model": false,
189
+ 0: "use_vllm": false,
190
+ 0: "vllm_server_host": "0.0.0.0",
191
+ 0: "vllm_server_port": 8000
192
+ 0: },
193
+ 0: "use_ray": false,
194
+ 0: "use_tensorboard": true,
195
+ 0: "val_set_size": 0.0,
196
+ 0: "vllm": {
197
+ 0: "device": "auto",
198
+ 0: "dtype": "auto",
199
+ 0: "gpu_memory_utilization": 0.9,
200
+ 0: "host": "0.0.0.0",
201
+ 0: "port": 8000
202
+ 0: },
203
+ 0: "warmup_steps": 100,
204
+ 0: "weight_decay": 0.0,
205
+ 0: "world_size": 16
206
+ 0: }
207
+ 0: [2025-11-24 00:08:40,026] [INFO] [axolotl.cli.checks.check_user_token:35] [PID:3081979] [RANK:0] Skipping HuggingFace token verification because HF_HUB_OFFLINE is set to True. Only local files will be used.
208
+ 0: [2025-11-24 00:08:41,217] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:472] [PID:3081979] [RANK:0] Loading prepared dataset from disk at /lustre/fswork/projects/rech/dgo/udv55np/dataset_gemma/Nemotron-Super-49B-v1_5/split_0/06698e902d3dba325ca34849b1dea5ea...
209
+ 0: [2025-11-24 00:09:14,927] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:436] [PID:3081979] [RANK:0] gather_len_batches: [18976, 18976, 18976, 18975, 18977, 18976, 18975, 18976, 18976, 18975, 18976, 18976, 18976, 18976, 18976, 18976]
210
+ 0: [2025-11-24 00:09:14,950] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:495] [PID:3081979] [RANK:0] sample_packing_eff_est across ranks: [0.9989354014396667, 0.9988301396369934, 0.9989880323410034, 0.9988827705383301, 0.9988827705383301, 0.9988827705383301, 0.9989354014396667, 0.9989354014396667, 0.9989354014396667, 0.9988827705383301, 0.9989354014396667, 0.9988827705383301, 0.9988827705383301, 0.9988827705383301, 0.9988827705383301, 0.9989354014396667]
211
+ 0: [2025-11-24 00:09:14,959] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:127] [PID:3081979] [RANK:0] Maximum number of steps set at 711
212
+ 3: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
213
+ 1: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
214
+ 3: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
215
+ 1: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
216
+ 2: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
217
+ 2: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
218
+ 2: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
219
+ 2: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
220
+ 1: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
221
+ 1: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
222
+ 0: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
223
+ 3: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
224
+ 3: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
225
+ 0: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
226
+ 0: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
227
+ 0: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
228
+ 0: [2025-11-24 00:09:22,718] [INFO] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:110] [PID:3081979] [RANK:0] Patched Trainer.evaluation_loop with nanmean loss calculation
229
+ 0: [2025-11-24 00:09:22,719] [INFO] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:164] [PID:3081979] [RANK:0] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
230
+ 3:
231
+ 1:
232
+ 2:
233
+ 0:
234
+ 0: �█ | 3/5 [00:31<00:20, 10.27s/it]
235
+ 3: �█ | 3/5 [00:31<00:20, 10.27s/it]
236
+ 1: �█ | 3/5 [00:31<00:20, 10.27s/it]
237
+ 1: s/it]
238
+ 1:
239
+ 3:
240
+ 0:
241
+ 1:
242
+ 3:
243
+ 0:
244
+ 2: �█ | 3/5 [00:31<00:20, 10.27s/it]
245
+ 3:
246
+ 2:
247
+ 2:
248
+ 2:
249
+ 1:
250
+ 0:
251
+ 0: [2025-11-24 00:10:19,017] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:345] [PID:3081979] [RANK:0] Converting modules to torch.bfloat16
252
+ 0: [2025-11-24 00:10:22,748] [INFO] [axolotl.train.save_initial_configs:416] [PID:3081979] [RANK:0] Pre-saving tokenizer to /lustre/fswork/projects/rech/dgo/udv55np/ift/Nemotron-Super-49B-v1_5/gemma-3-12b/0...
253
+ 0: [2025-11-24 00:10:23,317] [INFO] [axolotl.train.save_initial_configs:419] [PID:3081979] [RANK:0] Pre-saving model config to /lustre/fswork/projects/rech/dgo/udv55np/ift/Nemotron-Super-49B-v1_5/gemma-3-12b/0...
254
+ 0: [2025-11-24 00:10:23,327] [INFO] [axolotl.train.save_initial_configs:423] [PID:3081979] [RANK:0] Pre-saving processor to /lustre/fswork/projects/rech/dgo/udv55np/ift/Nemotron-Super-49B-v1_5/gemma-3-12b/0...
255
+ 0: [2025-11-24 00:10:26,392] [INFO] [axolotl.train.execute_training:203] [PID:3081979] [RANK:0] Starting trainer...
256
+ 0: [2025-11-24 00:11:58,358] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:436] [PID:3081979] [RANK:0] gather_len_batches: [18976, 18976, 18976, 18976, 18976, 18976, 18976, 18976, 18976, 18976, 18976, 18976, 18976, 18976, 18976, 18976]
257
+ 0: Parameter Offload - Persistent parameters statistics: param_count = 563, numel = 1166448
258
+ 0: {'loss': 0.6182, 'grad_norm': 2.8189747023390073, 'learning_rate': 3.62e-07, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.01}
259
+ 0:
260
  0%| | 0/711 [00:00<?, ?it/s]
261
  0%| | 1/711 [03:14<38:20:16, 194.39s/it]
262
  0%| | 2/711 [03:19<16:22:13, 83.12s/it]
263
  0%| | 3/711 [03:24<9:20:24, 47.49s/it]
264
  1%| | 4/711 [03:29<6:02:21, 30.75s/it]
265
  1%| | 5/711 [03:35<4:14:24, 21.62s/it]
266
  1%| | 6/711 [03:40<3:09:35, 16.14s/it]
267
  1%| | 7/711 [03:45<2:27:08, 12.54s/it]
268
  1%| | 8/711 [03:50<1:59:06, 10.17s/it]
269
  1%|▏ | 9/711 [03:56<1:40:22, 8.58s/it]
270
  1%|▏ | 10/711 [04:01<1:27:41, 7.51s/it]
271
 
272
  1%|▏ | 10/711 [04:01<1:27:41, 7.51s/it]
273
  2%|▏ | 11/711 [04:06<1:18:52, 6.76s/it]
274
  2%|▏ | 12/711 [04:11<1:12:59, 6.27s/it]
275
  2%|▏ | 13/711 [04:16<1:08:45, 5.91s/it]
276
  2%|▏ | 14/711 [04:21<1:06:01, 5.68s/it]
277
  2%|▏ | 15/711 [04:26<1:03:51, 5.51s/it]
278
  2%|▏ | 16/711 [04:31<1:02:15, 5.38s/it]
279
  2%|▏ | 17/711 [04:37<1:01:45, 5.34s/i
280
+ 0: {'loss': 0.5822, 'grad_norm': 1.7276350224873818, 'learning_rate': 5.420000000000001e-07, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.02}
281
+ 0: {'loss': 0.5571, 'grad_norm': 2.161001413543057, 'learning_rate': 7.219999999999999e-07, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.03}
282
+ 0: t]
283
  3%|▎ | 18/711 [04:42<1:01:12, 5.30s/it]
284
  3%|▎ | 19/711 [04:47<1:00:37, 5.26s/it]
285
  3%|▎ | 20/711 [04:52<1:00:10, 5.23s/it]
286
 
287
  3%|▎ | 20/711 [04:52<1:00:10, 5.23s/it]
288
  3%|▎ | 21/711 [04:57<59:44, 5.19s/it]
289
  3%|▎ | 22/711 [05:02<59:22, 5.17s/it]
290
  3%|▎ | 23/711 [05:07<59:00, 5.15s/it]
291
  3%|▎ | 24/711 [05:13<58:57, 5.15s/it]
292
  4%|▎ | 25/711 [05:18<59:09, 5.17s/it]
293
  4%|▎ | 26/711 [05:23<58:51, 5.16s/it]
294
  4%|▍ | 27/711 [05:28<58:27, 5.13s/it]
295
  4%|▍ | 28/711 [05:33<58:17, 5.12s/it]
296
  4%|▍ | 29/711 [05:38<58:07, 5.11s/it]
297
  4%|▍ | 30/711 [05:44<59:10, 5.21s/it]
298
 
299
  4%|▍ | 30/711 [05:44<59:10, 5.21s/it]
300
  4%|▍ | 31/711 [05:49<58:50, 5.19s/it]
301
  5%|▍ | 32/711 [05:54<58:36, 5.18s/it]
302
  5%|▍ | 33/711 [05:59<59:31, 5
303
+ 0: {'loss': 0.5218, 'grad_norm': 1.0906530849609661, 'learning_rate': 9.020000000000001e-07, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.03}
304
+ 0: .27s/it]
305
  5%|▍ | 34/711 [06:04<58:50, 5.22s/it]
306
  5%|▍ | 35/711 [06:10<58:21, 5.18s/it]
307
  5%|▌ | 36/711 [06:15<57:59, 5.16s/it]
308
  5%|▌ | 37/711 [06:20<57:35, 5.13s/it]
309
  5%|▌ | 38/711 [06:25<57:22, 5.12s/it]
310
  5%|▌ | 39/711 [06:30<57:16, 5.11s/it]
311
  6%|▌ | 40/711 [06:35<57:20, 5.13s/it]
312
 
313
  6%|▌ | 40/711 [06:35<57:20, 5.13s/it]
314
  6%|▌ | 41/711 [06:40<57:23, 5.14s/it]
315
  6%|▌ | 42/711 [06:45<57:09, 5.13s/it]
316
  6%|▌ | 43/711 [06:51<57:31, 5.17s/it]
317
  6%|▌ | 44/711 [06:56<57:25, 5.17s/it]
318
  6%|▋ | 45/711 [07:01<57:08, 5.15s/it]
319
  6%|▋ | 46/711 [07:06<57:01, 5.15s/it]
320
  7%|▋ | 47/711 [07:11<56:53, 5.14s/it]
321
  7%|▋ | 48/711 [07:16<57:06, 5.17s/it]
322
  7%|▋ | 49/711 [07:22<57:03, 5.17s/it]
323
  7%|▋ | 50/711 [07:27<57:26, 5.21s/it]
324
 
325
+ 0: {'loss': 0.4959, 'grad_norm': 0.8804401875885586, 'learning_rate': 1.082e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.04}
326
+ 0: {'loss': 0.4729, 'grad_norm': 1.7572079203155466, 'learning_rate': 1.262e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.05}
327
+ 0:
328
  7%|▋ | 50/711 [07:27<57:26, 5.21s/it]
329
  7%|▋ | 51/711 [07:32<57:06, 5.19s/it]
330
  7%|▋ | 52/711 [07:37<57:02, 5.19s/it]
331
  7%|▋ | 53/711 [07:43<57:29, 5.24s/it]
332
  8%|▊ | 54/711 [07:48<58:04, 5.30s/it]
333
  8%|▊ | 55/711 [07:53<57:20, 5.25s/it]
334
  8%|▊ | 56/711 [07:59<57:54, 5.30s/it]
335
  8%|▊ | 57/711 [08:04<57:10, 5.25s/it]
336
  8%|▊ | 58/711 [08:09<57:55, 5.32s/it]
337
  8%|▊ | 59/711 [08:14<57:11, 5.26s/it]
338
  8%|▊ | 60/711 [08:19<56:30, 5.21s/it]
339
 
340
  8%|▊ | 60/711 [08:19<56:30, 5.21s/it]
341
  9%|▊ | 61/711 [08:25<56:35, 5.22s/it]
342
  9%|▊ | 62/711 [08:30<56:18, 5.21s/it]
343
  9%|▉ | 63/711 [08:35<57:08, 5.29s/it]
344
  9%|▉ | 64/711 [08:41<56:59, 5.28s/it]
345
  9%|▉ | 65/711 [08:46<56:22, 5.24s/it]
346
  9%|▉ | 66/711 [08:51<56:36, 5.27s/it]
347
  9%|▉ | 67/711 [08:56<56:06, 5.23s/it]
348
  1
349
+ 0: {'loss': 0.4737, 'grad_norm': 1.0832691281380182, 'learning_rate': 1.442e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.06}
350
+ 0: {'loss': 0.4648, 'grad_norm': 0.9351167776948649, 'learning_rate': 1.622e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.07}
351
+ 0: 0%|▉ | 68/711 [09:01<55:32, 5.18s/it]
352
  10%|▉ | 69/711 [09:06<55:12, 5.16s/it]
353
  10%|▉ | 70/711 [09:12<55:17, 5.18s/it]
354
 
355
  10%|▉ | 70/711 [09:12<55:17, 5.18s/it]
356
  10%|▉ | 71/711 [09:17<54:55, 5.15s/it]
357
  10%|█ | 72/711 [09:22<54:39, 5.13s/it]
358
  10%|█ | 73/711 [09:27<54:25, 5.12s/it]
359
  10%|█ | 74/711 [09:32<54:18, 5.12s/it]
360
  11%|█ | 75/711 [09:37<54:05, 5.10s/it]
361
  11%|█ | 76/711 [09:42<54:01, 5.11s/it]
362
  11%|█ | 77/711 [09:47<53:56, 5.11s/it]
363
  11%|█ | 78/711 [09:52<54:06, 5.13s/it]
364
  11%|█ | 79/711 [09:57<53:56, 5.12s/it]
365
  11%|█▏ | 80/711 [10:03<54:04, 5.14s/it]
366
 
367
  11%|█▏ | 80/711 [10:03<54:04, 5.14s/it]
368
  11%|█▏ | 81/711 [10:08<55:07, 5.25s/it]
369
  12%|█▏ | 82/711 [10:13<54:50, 5.23s/it]
370
  12%|█▏ | 83/711 [10:19<55:29, 5.30s/it
371
+ 0: {'loss': 0.4437, 'grad_norm': 1.0944333242533355, 'learning_rate': 1.802e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.08}
372
+ 0: ]
373
  12%|█▏ | 84/711 [10:24<54:57, 5.26s/it]
374
  12%|█▏ | 85/711 [10:29<54:21, 5.21s/it]
375
  12%|█▏ | 86/711 [10:35<55:05, 5.29s/it]
376
  12%|█▏ | 87/711 [10:40<54:33, 5.25s/it]
377
  12%|█▏ | 88/711 [10:45<54:02, 5.20s/it]
378
  13%|█▎ | 89/711 [10:50<53:35, 5.17s/it]
379
  13%|█▎ | 90/711 [10:55<53:14, 5.14s/it]
380
 
381
  13%|█▎ | 90/711 [10:55<53:14, 5.14s/it]
382
  13%|█▎ | 91/711 [11:00<52:59, 5.13s/it]
383
  13%|█▎ | 92/711 [11:05<53:42, 5.21s/it]
384
  13%|█▎ | 93/711 [11:11<54:32, 5.30s/it]
385
  13%|█▎ | 94/711 [11:16<54:50, 5.33s/it]
386
  13%|█▎ | 95/711 [11:22<55:36, 5.42s/it]
387
  14%|█▎ | 96/711 [11:27<55:11, 5.38s/it]
388
  14%|█▎ | 97/711 [11:32<54:13, 5.30s/it]
389
  14%|█▍ | 98/711 [11:38<54:39, 5.35s/it]
390
  14%|█▍ | 99/711 [11:43<53:58, 5.29s/it]
391
  14%|█▍ | 100/711 [11:48<53:15, 5.23s/it]
392
 
393
+ 0: {'loss': 0.4312, 'grad_norm': 0.821415164120209, 'learning_rate': 1.982e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.08}
394
+ 0: {'loss': 0.4519, 'grad_norm': 1.098049116364939, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.09}
395
+ 0:
396
  14%|█▍ | 100/711 [11:48<53:15, 5.23s/it]
397
  14%|█▍ | 101/711 [11:53<52:46, 5.19s/it]
398
  14%|█▍ | 102/711 [11:58<52:21, 5.16s/it]
399
  14%|█▍ | 103/711 [12:03<52:00, 5.13s/it]
400
  15%|█▍ | 104/711 [12:08<51:44, 5.11s/it]
401
  15%|█▍ | 105/711 [12:14<51:39, 5.11s/it]
402
  15%|█▍ | 106/711 [12:19<51:43, 5.13s/it]
403
  15%|█▌ | 107/711 [12:24<51:34, 5.12s/it]
404
  15%|█▌ | 108/711 [12:29<51:22, 5.11s/it]
405
  15%|█▌ | 109/711 [12:34<51:11, 5.10s/it]
406
  15%|█▌ | 110/711 [12:39<51:14, 5.12s/it]
407
 
408
  15%|█▌ | 110/711 [12:39<51:14, 5.12s/it]
409
  16%|█▌ | 111/711 [12:44<51:43, 5.17s/it]
410
  16%|█▌ | 112/711 [12:50<51:33, 5.16s/it]
411
  16%|█▌ | 113/711 [12:55<52:08, 5.23s/it]
412
  16%|█▌ | 114/711 [13:00<51:48, 5.21s/it]
413
  16%|█▌ | 115/711 [13:05<51:23, 5.17s/it]
414
  16%|█▋ |
415
+ 0: {'loss': 0.4418, 'grad_norm': 0.8654847799165983, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.1}
416
+ 0: {'loss': 0.4272, 'grad_norm': 0.8743836149172823, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.11}
417
+ 0: 116/711 [13:10<51:17, 5.17s/it]
418
  16%|█▋ | 117/711 [13:16<51:03, 5.16s/it]
419
  17%|█▋ | 118/711 [13:21<50:59, 5.16s/it]
420
  17%|█▋ | 119/711 [13:26<51:20, 5.20s/it]
421
  17%|█▋ | 120/711 [13:31<50:54, 5.17s/it]
422
 
423
  17%|█▋ | 120/711 [13:31<50:54, 5.17s/it]
424
  17%|█▋ | 121/711 [13:36<50:42, 5.16s/it]
425
  17%|█▋ | 122/711 [13:41<50:37, 5.16s/it]
426
  17%|█▋ | 123/711 [13:47<50:33, 5.16s/it]
427
  17%|█▋ | 124/711 [13:52<50:44, 5.19s/it]
428
  18%|█▊ | 125/711 [13:57<51:31, 5.28s/it]
429
  18%|█▊ | 126/711 [14:03<52:11, 5.35s/it]
430
  18%|█▊ | 127/711 [14:08<52:12, 5.36s/it]
431
  18%|█▊ | 128/711 [14:14<52:10, 5.37s/it]
432
  18%|█▊ | 129/711 [14:19<51:21, 5.29s/it]
433
  18%|█▊ | 130/711 [14:24<50:50, 5.25s/it]
434
 
435
  18%|█▊ | 130/711 [14:24<50:50, 5.25s/it]
436
  18%|█▊ | 131/
437
+ 0: {'loss': 0.4317, 'grad_norm': 0.886837889977122, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.12}
438
+ 0: 711 [14:29<50:24, 5.22s/it]
439
  19%|█▊ | 132/711 [14:34<50:04, 5.19s/it]
440
  19%|█▊ | 133/711 [14:39<50:21, 5.23s/it]
441
  19%|█▉ | 134/711 [14:45<49:58, 5.20s/it]
442
  19%|█▉ | 135/711 [14:50<49:37, 5.17s/it]
443
  19%|█▉ | 136/711 [14:55<49:54, 5.21s/it]
444
  19%|█▉ | 137/711 [15:00<49:29, 5.17s/it]
445
  19%|█▉ | 138/711 [15:05<49:14, 5.16s/it]
446
  20%|█▉ | 139/711 [15:10<49:05, 5.15s/it]
447
  20%|█▉ | 140/711 [15:16<49:19, 5.18s/it]
448
 
449
  20%|█▉ | 140/711 [15:16<49:19, 5.18s/it]
450
  20%|█▉ | 141/711 [15:21<49:04, 5.17s/it]
451
  20%|█▉ | 142/711 [15:26<48:43, 5.14s/it]
452
  20%|██ | 143/711 [15:31<48:31, 5.13s/it]
453
  20%|██ | 144/711 [15:36<48:59, 5.19s/it]
454
  20%|██ | 145/711 [15:41<48:54, 5.19s/it]
455
  21%|██ | 146/711 [15:46<48:39, 5.17s/it]
456
  21%|██ | 147/711 [15:52<48:22, 5.15s/it]
457
  21%|██ | 148/7
458
+ 0: {'loss': 0.4309, 'grad_norm': 1.0717708423744885, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.13}
459
+ 0: {'loss': 0.4316, 'grad_norm': 0.8573484702226136, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.13}
460
+ 0: 11 [15:57<48:11, 5.14s/it]
461
  21%|██ | 149/711 [16:02<48:06, 5.14s/it]
462
  21%|██ | 150/711 [16:07<48:06, 5.14s/it]
463
 
464
  21%|██ | 150/711 [16:07<48:06, 5.14s/it]
465
  21%|██ | 151/711 [16:12<48:00, 5.14s/it]
466
  21%|██▏ | 152/711 [16:17<48:35, 5.22s/it]
467
  22%|██▏ | 153/711 [16:23<48:21, 5.20s/it]
468
  22%|██▏ | 154/711 [16:28<48:00, 5.17s/it]
469
  22%|██▏ | 155/711 [16:33<47:54, 5.17s/it]
470
  22%|██▏ | 156/711 [16:38<47:42, 5.16s/it]
471
  22%|██▏ | 157/711 [16:43<47:36, 5.16s/it]
472
  22%|██▏ | 158/711 [16:48<47:27, 5.15s/it]
473
  22%|██▏ | 159/711 [16:54<48:09, 5.23s/it]
474
  23%|██▎ | 160/711 [16:59<47:40, 5.19s/it]
475
 
476
  23%|██▎ | 160/711 [16:59<47:40, 5.19s/it]
477
  23%|██▎ | 161/711 [17:04<47:17, 5.16s/it]
478
  23%|██▎ | 162/711 [17:09<47:17, 5.17s/it]
479
  23%|�
480
+ 0: {'loss': 0.4239, 'grad_norm': 0.8728825320101697, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.14}
481
+ 0: ��█▎ | 163/711 [17:14<46:56, 5.14s/it]
482
  23%|██▎ | 164/711 [17:20<47:49, 5.25s/it]
483
  23%|██▎ | 165/711 [17:25<47:18, 5.20s/it]
484
  23%|██▎ | 166/711 [17:30<47:54, 5.27s/it]
485
  23%|██▎ | 167/711 [17:35<47:21, 5.22s/it]
486
  24%|██▎ | 168/711 [17:41<47:10, 5.21s/it]
487
  24%|██▍ | 169/711 [17:46<46:57, 5.20s/it]
488
  24%|██▍ | 170/711 [17:51<46:38, 5.17s/it]
489
 
490
  24%|██▍ | 170/711 [17:51<46:38, 5.17s/it]
491
  24%|██▍ | 171/711 [17:56<46:21, 5.15s/it]
492
  24%|██▍ | 172/711 [18:01<46:17, 5.15s/it]
493
  24%|██▍ | 173/711 [18:06<46:18, 5.16s/it]
494
  24%|██▍ | 174/711 [18:11<46:01, 5.14s/it]
495
  25%|██▍ | 175/711 [18:16<45:50, 5.13s/it]
496
  25%|██▍ | 176/711 [18:22<45:49, 5.14s/it]
497
  25%|██▍ | 177/711 [18:27<45:49, 5.15s/it]
498
  25%|██▌ | 178/711 [18:32<46:45, 5.26s/it]
499
  25%|██▌ | 179/
500
+ 0: {'loss': 0.4173, 'grad_norm': 2.470513995230686, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.15}
501
+ 0: {'loss': 0.4151, 'grad_norm': 0.9038938137872402, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.16}
502
+ 0: 711 [18:37<46:12, 5.21s/it]
503
  25%|██▌ | 180/711 [18:43<45:49, 5.18s/it]
504
 
505
  25%|██▌ | 180/711 [18:43<45:49, 5.18s/it]
506
  25%|██▌ | 181/711 [18:48<45:33, 5.16s/it]
507
  26%|██▌ | 182/711 [18:53<45:22, 5.15s/it]
508
  26%|██▌ | 183/711 [18:58<45:10, 5.13s/it]
509
  26%|██▌ | 184/711 [19:03<45:33, 5.19s/it]
510
  26%|██▌ | 185/711 [19:08<45:10, 5.15s/it]
511
  26%|██▌ | 186/711 [19:13<44:56, 5.14s/it]
512
  26%|██▋ | 187/711 [19:18<44:44, 5.12s/it]
513
  26%|██▋ | 188/711 [19:24<45:35, 5.23s/it]
514
  27%|██▋ | 189/711 [19:29<45:10, 5.19s/it]
515
  27%|██▋ | 190/711 [19:34<44:59, 5.18s/it]
516
 
517
  27%|██▋ | 190/711 [19:34<44:59, 5.18s/it]
518
  27%|██▋ | 191/711 [19:39<44:41, 5.16s/it]
519
  27%|██▋ | 192/711 [19:44<44:29, 5.14s/it]
520
  27%|██▋ | 193/711 [19:50<44:25, 5.15s/i
521
+ 0: {'loss': 0.4194, 'grad_norm': 2.3527260378633015, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.17}
522
+ 0: t]
523
  27%|██▋ | 194/711 [19:55<44:10, 5.13s/it]
524
  27%|██▋ | 195/711 [20:00<44:25, 5.17s/it]
525
  28%|██▊ | 196/711 [20:05<44:10, 5.15s/it]
526
  28%|██▊ | 197/711 [20:10<44:27, 5.19s/it]
527
  28%|██▊ | 198/711 [20:16<45:08, 5.28s/it]
528
  28%|██▊ | 199/711 [20:21<44:34, 5.22s/it]
529
  28%|██▊ | 200/711 [20:26<44:17, 5.20s/it]
530
 
531
  28%|██▊ | 200/711 [20:26<44:17, 5.20s/it]
532
  28%|██▊ | 201/711 [20:31<43:55, 5.17s/it]
533
  28%|██▊ | 202/711 [20:36<43:35, 5.14s/it]
534
  29%|██▊ | 203/711 [20:41<43:37, 5.15s/it]
535
  29%|██▊ | 204/711 [20:47<44:14, 5.24s/it]
536
  29%|██▉ | 205/711 [20:52<44:01, 5.22s/it]
537
  29%|██▉ | 206/711 [20:57<43:43, 5.20s/it]
538
  29%|██▉ | 207/711 [21:02<43:30, 5.18s/it]
539
  29%|██▉ | 208/711 [21:07<43:14, 5.16s/it]
540
  29%|██▉ | 209/711 [21:12<43:02, 5.15s/it]
541
  30%|██▉
542
+ 0: {'loss': 0.413, 'grad_norm': 0.893185793908122, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.18}
543
+ 0: {'loss': 0.4217, 'grad_norm': 1.160958862723743, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.19}
544
+ 0: | 210/711 [21:18<42:50, 5.13s/it]
545
 
546
  30%|██▉ | 210/711 [21:18<42:50, 5.13s/it]
547
  30%|██▉ | 211/711 [21:23<42:40, 5.12s/it]
548
  30%|██▉ | 212/711 [21:28<42:38, 5.13s/it]
549
  30%|██▉ | 213/711 [21:33<42:38, 5.14s/it]
550
  30%|███ | 214/711 [21:38<42:36, 5.14s/it]
551
  30%|███ | 215/711 [21:43<42:24, 5.13s/it]
552
  30%|███ | 216/711 [21:48<42:15, 5.12s/it]
553
  31%|███ | 217/711 [21:53<42:11, 5.12s/it]
554
  31%|███ | 218/711 [21:59<42:01, 5.12s/it]
555
  31%|███ | 219/711 [22:04<42:01, 5.12s/it]
556
  31%|███ | 220/711 [22:09<41:50, 5.11s/it]
557
 
558
  31%|███ | 220/711 [22:09<41:50, 5.11s/it]
559
  31%|███ | 221/711 [22:14<42:03, 5.15s/it]
560
  31%|███ | 222/711 [22:19<42:36, 5.23s/it]
561
  31%|███▏ | 223/711 [22:25<42:39, 5.25s/it]
562
  32%|███▏ | 224/711 [22:30<42
563
+ 0: {'loss': 0.4102, 'grad_norm': 0.8461972280700218, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.19}
564
+ 0: :22, 5.22s/it]
565
  32%|███▏ | 225/711 [22:35<42:13, 5.21s/it]
566
  32%|███▏ | 226/711 [22:40<42:40, 5.28s/it]
567
  32%|███▏ | 227/711 [22:46<42:11, 5.23s/it]
568
  32%|███▏ | 228/711 [22:51<41:45, 5.19s/it]
569
  32%|███▏ | 229/711 [22:56<41:33, 5.17s/it]
570
  32%|███▏ | 230/711 [23:01<41:16, 5.15s/it]
571
 
572
  32%|███▏ | 230/711 [23:01<41:16, 5.15s/it]
573
  32%|███▏ | 231/711 [23:06<41:02, 5.13s/it]
574
  33%|███▎ | 232/711 [23:11<40:52, 5.12s/it]
575
  33%|███▎ | 233/711 [23:16<40:57, 5.14s/it]
576
  33%|███▎ | 234/711 [23:21<40:58, 5.15s/it]
577
  33%|███▎ | 235/711 [23:27<40:52, 5.15s/it]
578
  33%|███▎ | 236/711 [23:32<40:41, 5.14s/it]
579
  33%|███▎ | 237/711 [23:37<40:32, 5.13s/it]
580
  33%|███▎ | 238/711 [23:42<40:21, 5.12s/it]
581
  34%|███▎ | 239/711 [23:47<40:22, 5.13s/it]
582
  34%|███▍ | 240/
583
+ 0: {'loss': 0.4155, 'grad_norm': 0.8314605620161184, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 67.99, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.2}
584
+ 0: {'loss': 0.4045, 'grad_norm': 0.8328744939735258, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.21}
585
+ 0: 711 [23:52<40:14, 5.13s/it]
586
 
587
  34%|███▍ | 240/711 [23:52<40:14, 5.13s/it]
588
  34%|███▍ | 241/711 [23:57<40:04, 5.12s/it]
589
  34%|███▍ | 242/711 [24:03<40:30, 5.18s/it]
590
  34%|███▍ | 243/711 [24:08<40:55, 5.25s/it]
591
  34%|███▍ | 244/711 [24:13<40:38, 5.22s/it]
592
  34%|███▍ | 245/711 [24:18<40:16, 5.18s/it]
593
  35%|███▍ | 246/711 [24:23<40:06, 5.18s/it]
594
  35%|███▍ | 247/711 [24:29<40:20, 5.22s/it]
595
  35%|███▍ | 248/711 [24:34<39:58, 5.18s/it]
596
  35%|███▌ | 249/711 [24:39<39:40, 5.15s/it]
597
  35%|███▌ | 250/711 [24:44<39:34, 5.15s/it]
598
 
599
  35%|███▌ | 250/711 [24:44<39:34, 5.15s/it]
600
  35%|███▌ | 251/711 [24:49<39:23, 5.14s/it]
601
  35%|███▌ | 252/711 [24:54<39:08, 5.12s/it]
602
  36%|███▌ | 253/711 [24:59<39:13, 5.14s/it]
603
  36%|███▌
604
+ 0: {'loss': 0.4005, 'grad_norm': 0.8810433727017853, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.22}
605
+ 0: | 254/711 [25:05<39:10, 5.14s/it]
606
  36%|███▌ | 255/711 [25:10<39:07, 5.15s/it]
607
  36%|███▌ | 256/711 [25:15<39:17, 5.18s/it]
608
  36%|███▌ | 257/711 [25:20<38:57, 5.15s/it]
609
  36%|███▋ | 258/711 [25:26<39:48, 5.27s/it]
610
  36%|███▋ | 259/711 [25:31<39:17, 5.22s/it]
611
  37%|███▋ | 260/711 [25:36<38:58, 5.19s/it]
612
 
613
  37%|███▋ | 260/711 [25:36<38:58, 5.19s/it]
614
  37%|███▋ | 261/711 [25:41<38:40, 5.16s/it]
615
  37%|███▋ | 262/711 [25:46<38:26, 5.14s/it]
616
  37%|███▋ | 263/711 [25:51<38:26, 5.15s/it]
617
  37%|███▋ | 264/711 [25:56<38:28, 5.16s/it]
618
  37%|███▋ | 265/711 [26:02<38:16, 5.15s/it]
619
  37%|███▋ | 266/711 [26:07<38:12, 5.15s/it]
620
  38%|███▊ | 267/711 [26:12<38:50, 5.25s/it]
621
  38%|███▊ | 268/711 [26:18<39:17, 5.32s/it]
622
  38%|███▊ | 269/711 [26:23<38:44, 5.26s/it]
623
  38%|█�
624
+ 0: {'loss': 0.4021, 'grad_norm': 1.0060252029086465, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.23}
625
+ 0: [2025-11-24 00:38:52,072] [WARNING] [stage3.py:2150:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
626
+ 0: {'loss': 0.4124, 'grad_norm': 0.9014415482740915, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.24}
627
+ 0: �█▊ | 270/711 [26:28<38:17, 5.21s/it]
628
 
629
  38%|███▊ | 270/711 [26:28<38:17, 5.21s/it]
630
  38%|███▊ | 271/711 [26:33<37:58, 5.18s/it]
631
  38%|███▊ | 272/711 [26:38<37:42, 5.15s/it]
632
  38%|███▊ | 273/711 [26:43<37:32, 5.14s/it]
633
  39%|███▊ | 274/711 [26:49<39:58, 5.49s/it]
634
  39%|███▊ | 275/711 [26:55<39:08, 5.39s/it]
635
  39%|███▉ | 276/711 [27:00<38:28, 5.31s/it]
636
  39%|███▉ | 277/711 [27:05<38:44, 5.36s/it]
637
  39%|███▉ | 278/711 [27:10<38:16, 5.30s/it]
638
  39%|███▉ | 279/711 [27:16<37:45, 5.24s/it]
639
  39%|███▉ | 280/711 [27:21<37:26, 5.21s/it]
640
 
641
  39%|███▉ | 280/711 [27:21<37:26, 5.21s/it]
642
  40%|███▉ | 281/711 [27:26<38:16, 5.34s/it]
643
  40%|███▉ | 282/711 [27:31<37:40, 5.27s/it]
644
  40%|███▉ | 283/711 [27:37<37:37, 5.28s/it]
645
  40%
646
+ 0: {'loss': 0.3928, 'grad_norm': 1.1303634088009527, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.24}
647
+ 0: |███▉ | 284/711 [27:42<37:57, 5.33s/it]
648
  40%|████ | 285/711 [27:47<37:34, 5.29s/it]
649
  40%|████ | 286/711 [27:53<37:14, 5.26s/it]
650
  40%|████ | 287/711 [27:58<37:31, 5.31s/it]
651
  41%|████ | 288/711 [28:03<37:04, 5.26s/it]
652
  41%|████ | 289/711 [28:08<36:40, 5.21s/it]
653
  41%|████ | 290/711 [28:13<36:19, 5.18s/it]
654
 
655
  41%|████ | 290/711 [28:13<36:19, 5.18s/it]
656
  41%|████ | 291/711 [28:19<36:32, 5.22s/it]
657
  41%|████ | 292/711 [28:24<36:57, 5.29s/it]
658
  41%|████ | 293/711 [28:29<36:57, 5.31s/it]
659
  41%|████▏ | 294/711 [28:35<36:36, 5.27s/it]
660
  41%|████▏ | 295/711 [28:40<36:18, 5.24s/it]
661
  42%|████▏ | 296/711 [28:45<35:59, 5.20s/it]
662
  42%|████▏ | 297/711 [28:50<36:22, 5.27s/it]
663
  42%|████▏ | 298/711 [28:55<36:02, 5.24s/it]
664
  42%|████▏ | 299/711 [29:
665
+ 0: {'loss': 0.4025, 'grad_norm': 0.8789278025527175, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.25}
666
+ 0: {'loss': 0.4015, 'grad_norm': 0.7615557087401322, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.26}
667
+ 0: 01<35:58, 5.24s/it]
668
  42%|████▏ | 300/711 [29:06<36:01, 5.26s/it]
669
 
670
  42%|████▏ | 300/711 [29:06<36:01, 5.26s/it]
671
  42%|████▏ | 301/711 [29:11<35:42, 5.23s/it]
672
  42%|████▏ | 302/711 [29:16<35:28, 5.20s/it]
673
  43%|████▎ | 303/711 [29:21<35:11, 5.17s/it]
674
  43%|████▎ | 304/711 [29:27<35:03, 5.17s/it]
675
  43%|████▎ | 305/711 [29:32<35:16, 5.21s/it]
676
  43%|████▎ | 306/711 [29:37<34:57, 5.18s/it]
677
  43%|████▎ | 307/711 [29:42<34:42, 5.15s/it]
678
  43%|████▎ | 308/711 [29:47<34:33, 5.15s/it]
679
  43%|████▎ | 309/711 [29:53<35:23, 5.28s/it]
680
  44%|████▎ | 310/711 [29:59<36:23, 5.45s/it]
681
 
682
  44%|████▎ | 310/711 [29:59<36:23, 5.45s/it]
683
  44%|████▎ | 311/711 [30:04<35:40, 5.35s/it]
684
  44%|████▍ | 312/711 [30:09<35:47, 5.38s/it]
685
 
686
+ 0: {'loss': 0.4047, 'grad_norm': 1.0096950251075136, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.27}
687
+ 0: 44%|████▍ | 313/711 [30:14<35:07, 5.30s/it]
688
  44%|████▍ | 314/711 [30:20<35:18, 5.34s/it]
689
  44%|████▍ | 315/711 [30:25<35:09, 5.33s/it]
690
  44%|████▍ | 316/711 [30:30<34:42, 5.27s/it]
691
  45%|████▍ | 317/711 [30:35<34:17, 5.22s/it]
692
  45%|████▍ | 318/711 [30:40<34:02, 5.20s/it]
693
  45%|████▍ | 319/711 [30:46<33:48, 5.18s/it]
694
  45%|████▌ | 320/711 [30:51<33:36, 5.16s/it]
695
 
696
  45%|████▌ | 320/711 [30:51<33:36, 5.16s/it]
697
  45%|████▌ | 321/711 [30:56<33:30, 5.16s/it]
698
  45%|████▌ | 322/711 [31:01<33:19, 5.14s/it]
699
  45%|████▌ | 323/711 [31:06<33:37, 5.20s/it]
700
  46%|████▌ | 324/711 [31:11<33:25, 5.18s/it]
701
  46%|████▌ | 325/711 [31:17<33:17, 5.17s/it]
702
  46%|████▌ | 326/711 [31:22<33:09, 5.17s/it]
703
  46%|████▌ | 327/711 [31:27<33:08, 5.18s/it]
704
  46%|███
705
+ 0: {'loss': 0.4025, 'grad_norm': 0.9227721040091849, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.28}
706
+ 0: {'loss': 0.3987, 'grad_norm': 1.8038936518267323, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.29}
707
+ 0: █▌ | 328/711 [31:32<32:55, 5.16s/it]
708
  46%|████▋ | 329/711 [31:37<32:44, 5.14s/it]
709
  46%|████▋ | 330/711 [31:42<32:58, 5.19s/it]
710
 
711
  46%|████▋ | 330/711 [31:42<32:58, 5.19s/it]
712
  47%|████▋ | 331/711 [31:48<32:46, 5.18s/it]
713
  47%|████▋ | 332/711 [31:53<32:35, 5.16s/it]
714
  47%|████▋ | 333/711 [31:58<33:05, 5.25s/it]
715
  47%|████▋ | 334/711 [32:03<32:47, 5.22s/it]
716
  47%|████▋ | 335/711 [32:09<33:10, 5.29s/it]
717
  47%|████▋ | 336/711 [32:14<32:41, 5.23s/it]
718
  47%|████▋ | 337/711 [32:19<32:21, 5.19s/it]
719
  48%|████▊ | 338/711 [32:24<32:19, 5.20s/it]
720
  48%|████▊ | 339/711 [32:29<32:05, 5.18s/it]
721
  48%|████▊ | 340/711 [32:34<31:51, 5.15s/it]
722
 
723
  48%|████▊ | 340/711 [32:34<31:51, 5.15s/it]
724
  48%|████▊ | 341/711 [
725
+ 0: {'loss': 0.4004, 'grad_norm': 0.8530478906547682, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.3}
726
+ 0: 32:40<31:54, 5.17s/it]
727
  48%|████▊ | 342/711 [32:45<31:45, 5.16s/it]
728
  48%|████▊ | 343/711 [32:50<31:37, 5.16s/it]
729
  48%|████▊ | 344/711 [32:55<31:44, 5.19s/it]
730
  49%|████▊ | 345/711 [33:00<31:29, 5.16s/it]
731
  49%|████▊ | 346/711 [33:05<31:18, 5.15s/it]
732
  49%|████▉ | 347/711 [33:10<31:07, 5.13s/it]
733
  49%|████▉ | 348/711 [33:16<30:59, 5.12s/it]
734
  49%|████▉ | 349/711 [33:21<30:57, 5.13s/it]
735
  49%|████▉ | 350/711 [33:26<30:52, 5.13s/it]
736
 
737
  49%|████▉ | 350/711 [33:26<30:52, 5.13s/it]
738
  49%|████▉ | 351/711 [33:31<30:45, 5.13s/it]
739
  50%|████▉ | 352/711 [33:36<30:36, 5.12s/it]
740
  50%|████▉ | 353/711 [33:41<30:32, 5.12s/it]
741
  50%|████▉ | 354/711 [33:46<30:26, 5.12s/it]
742
  50%|████▉ | 355/711 [33:51<30:20, 5.11s/it]
743
  50%|█████ | 356/711 [33:57<30:13,
744
+ 0: {'loss': 0.4055, 'grad_norm': 0.8072887895552343, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.3}
745
+ 0: 5.11s/it]
746
  50%|█████ | 357/711 [34:02<30:10, 5.11s/it]
747
  50%|█████ | 358/711 [34:07<30:07, 5.12s/it]
748
  50%|█████ | 359/711 [34:12<30:03, 5.12s/it]
749
  51%|█████ | 360/711 [34:17<29:57, 5.12s/it]
750
 
751
  51%|█████ | 360/711 [34:17<29:57, 5.12s/it]
752
  51%|█████ | 361/711 [34:22<29:53, 5.12s/it]
753
  51%|█████ | 362/711 [34:27<29:46, 5.12s/it]
754
  51%|█████ | 363/711 [34:32<29:41, 5.12s/it]
755
  51%|█████ | 364/711 [34:38<29:39, 5.13s/it]
756
  51%|█████▏ | 365/711 [34:43<29:59, 5.20s/it]
757
  51%|█████▏ | 366/711 [34:48<29:44, 5.17s/it]
758
  52%|█████▏ | 367/711 [34:53<29:30, 5.15s/it]
759
  52%|█████▏ | 368/711 [34:58<29:27, 5.15s/it]
760
  52%|█████▏ | 369/711 [35:03<29:14, 5.13s/it]
761
  52%|█████▏ | 370/711 [35:09<29:11, 5.14s/it]
762
 
763
+ 0: {'loss': 0.4024, 'grad_norm': 0.8486839849343547, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.31}
764
+ 0: {'loss': 0.4021, 'grad_norm': 0.8529581759108179, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.32}
765
+ 0:
766
  52%|█████▏ | 370/711 [35:09<29:11, 5.14s/it]
767
  52%|█████▏ | 371/711 [35:14<29:01, 5.12s/it]
768
  52%|█████▏ | 372/711 [35:19<28:54, 5.12s/it]
769
  52%|█████▏ | 373/711 [35:24<29:01, 5.15s/it]
770
  53%|█████▎ | 374/711 [35:29<28:53, 5.14s/it]
771
  53%|█████▎ | 375/711 [35:34<28:43, 5.13s/it]
772
  53%|█████▎ | 376/711 [35:39<28:33, 5.11s/it]
773
  53%|█████▎ | 377/711 [35:44<28:25, 5.11s/it]
774
  53%|█████▎ | 378/711 [35:49<28:21, 5.11s/it]
775
  53%|█████▎ | 379/711 [35:55<28:52, 5.22s/it]
776
  53%|█████▎ | 380/711 [36:00<28:33, 5.18s/it]
777
 
778
  53%|█████▎ | 380/711 [36:00<28:33, 5.18s/it]
779
  54%|█████▎ | 381/711 [36:05<28:53, 5.25s/it]
780
  54%|█████▎ | 382/711 [36:11<28:32, 5.21s/it]
781
  54%|█████▍ | 383/711 [36:16<28:38, 5.24s/it]
782
  54%|█████▍ | 384/711 [36:2
783
+ 0: {'loss': 0.3972, 'grad_norm': 0.8357610935717936, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.33}
784
+ 0: 1<28:17, 5.19s/it]
785
  54%|█████▍ | 385/711 [36:26<28:02, 5.16s/it]
786
  54%|█████▍ | 386/711 [36:31<27:52, 5.15s/it]
787
  54%|█████▍ | 387/711 [36:36<27:46, 5.14s/it]
788
  55%|█████▍ | 388/711 [36:41<27:35, 5.12s/it]
789
  55%|█████▍ | 389/711 [36:47<27:35, 5.14s/it]
790
  55%|█████▍ | 390/711 [36:52<27:38, 5.17s/it]
791
 
792
  55%|█████▍ | 390/711 [36:52<27:38, 5.17s/it]
793
  55%|█████▍ | 391/711 [36:57<27:26, 5.14s/it]
794
  55%|█████▌ | 392/711 [37:02<27:35, 5.19s/it]
795
  55%|█████▌ | 393/711 [37:07<27:27, 5.18s/it]
796
  55%|█████▌ | 394/711 [37:12<27:12, 5.15s/it]
797
  56%|█████▌ | 395/711 [37:18<27:39, 5.25s/it]
798
  56%|█████▌ | 396/711 [37:23<27:19, 5.20s/it]
799
  56%|█████▌ | 397/711 [37:28<27:03, 5.17s/it]
800
  56%|█████▌ | 398/711 [37:33<26:52, 5.15s/it]
801
  56%|█████▌
802
+ 0: {'loss': 0.3859, 'grad_norm': 0.8058568338786659, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.34}
803
+ 0: {'loss': 0.3898, 'grad_norm': 0.7954384150397931, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.35}
804
+ 0: | 399/711 [37:38<26:46, 5.15s/it]
805
  56%|█████▋ | 400/711 [37:43<26:35, 5.13s/it]
806
 
807
  56%|█████▋ | 400/711 [37:43<26:35, 5.13s/it]
808
  56%|█████▋ | 401/711 [37:48<26:27, 5.12s/it]
809
  57%|█████▋ | 402/711 [37:54<26:18, 5.11s/it]
810
  57%|█████▋ | 403/711 [37:59<26:11, 5.10s/it]
811
  57%|█████▋ | 404/711 [38:04<26:10, 5.11s/it]
812
  57%|█████▋ | 405/711 [38:09<26:34, 5.21s/it]
813
  57%|█████▋ | 406/711 [38:14<26:25, 5.20s/it]
814
  57%|█████▋ | 407/711 [38:19<26:11, 5.17s/it]
815
  57%|█████▋ | 408/711 [38:25<26:03, 5.16s/it]
816
  58%|█████▊ | 409/711 [38:30<26:30, 5.27s/it]
817
  58%|█████▊ | 410/711 [38:35<26:14, 5.23s/it]
818
 
819
  58%|█████▊ | 410/711 [38:35<26:14, 5.23s/it]
820
  58%|█████▊ | 411/711 [38:41<26:30, 5.30s/it]
821
  58%|███�
822
+ 0: {'loss': 0.3925, 'grad_norm': 0.8145567494437453, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.35}
823
+ 0: ��█▊ | 412/711 [38:46<26:10, 5.25s/it]
824
  58%|█████▊ | 413/711 [38:51<25:55, 5.22s/it]
825
  58%|█████▊ | 414/711 [38:56<25:40, 5.19s/it]
826
  58%|█████▊ | 415/711 [39:01<25:28, 5.16s/it]
827
  59%|█████▊ | 416/711 [39:07<25:49, 5.25s/it]
828
  59%|█████▊ | 417/711 [39:12<25:31, 5.21s/it]
829
  59%|█████▉ | 418/711 [39:17<25:17, 5.18s/it]
830
  59%|█████▉ | 419/711 [39:22<25:06, 5.16s/it]
831
  59%|█████▉ | 420/711 [39:27<24:59, 5.15s/it]
832
 
833
  59%|█████▉ | 420/711 [39:27<24:59, 5.15s/it]
834
  59%|█████▉ | 421/711 [39:32<24:49, 5.14s/it]
835
  59%|█████▉ | 422/711 [39:37<24:44, 5.14s/it]
836
  59%|█████▉ | 423/711 [39:43<24:35, 5.12s/it]
837
  60%|█████▉ | 424/711 [39:48<24:42, 5.17s/it]
838
  60%|█████▉ | 425/711 [39:53<24:34, 5.16s/it]
839
  60%|█████▉ | 426/711 [39:58<24:30, 5.16s/
840
+ 0: {'loss': 0.3927, 'grad_norm': 0.8237856091804933, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.36}
841
+ 0: it]
842
  60%|██████ | 427/711 [40:03<24:40, 5.21s/it]
843
  60%|██████ | 428/711 [40:09<24:55, 5.28s/it]
844
  60%|██████ | 429/711 [40:14<24:33, 5.23s/it]
845
  60%|██████ | 430/711 [40:19<24:23, 5.21s/it]
846
 
847
  60%|██████ | 430/711 [40:19<24:23, 5.21s/it]
848
  61%|██████ | 431/711 [40:24<24:10, 5.18s/it]
849
  61%|██████ | 432/711 [40:29<23:56, 5.15s/it]
850
  61%|██████ | 433/711 [40:34<23:50, 5.14s/it]
851
  61%|██████ | 434/711 [40:40<23:39, 5.12s/it]
852
  61%|██████ | 435/711 [40:45<23:32, 5.12s/it]
853
  61%|██████▏ | 436/711 [40:50<24:14, 5.29s/it]
854
  61%|██████▏ | 437/711 [40:56<24:11, 5.30s/it]
855
  62%|██████▏ | 438/711 [41:01<23:49, 5.24s/it]
856
  62%|██████▏ | 439/711 [41:06<23:33, 5.20s/it]
857
  62%|██████▏ | 440/711 [41:11<23:37, 5.23s/it]
858
 
859
+ 0: {'loss': 0.3937, 'grad_norm': 0.8553439901672909, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.37}
860
+ 0: {'loss': 0.3873, 'grad_norm': 0.8286249080798415, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.38}
861
+ 0:
862
  62%|██████▏ | 440/711 [41:11<23:37, 5.23s/it]
863
  62%|██████▏ | 441/711 [41:16<23:23, 5.20s/it]
864
  62%|██████▏ | 442/711 [41:21<23:13, 5.18s/it]
865
  62%|██████▏ | 443/711 [41:27<23:01, 5.16s/it]
866
  62%|██████▏ | 444/711 [41:32<22:57, 5.16s/it]
867
  63%|██████▎ | 445/711 [41:37<23:14, 5.24s/it]
868
  63%|██████▎ | 446/711 [41:42<22:58, 5.20s/it]
869
  63%|██████▎ | 447/711 [41:47<22:45, 5.17s/it]
870
  63%|██████▎ | 448/711 [41:52<22:34, 5.15s/it]
871
  63%|██████▎ | 449/711 [41:58<22:43, 5.20s/it]
872
  63%|██████▎ | 450/711 [42:03<22:32, 5.18s/it]
873
 
874
  63%|██████▎ | 450/711 [42:03<22:32, 5.18s/it]
875
  63%|██████▎ | 451/711 [42:08<22:20, 5.16s/it]
876
  64%|██████▎ | 452/711 [42:13<22:14, 5.15s/it]
877
  64%|██████▎ | 453/711 [42:18<22:09, 5
878
+ 0: {'loss': 0.384, 'grad_norm': 0.8623716385758442, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.39}
879
+ 0: .15s/it]
880
  64%|██████▍ | 454/711 [42:23<22:03, 5.15s/it]
881
  64%|██████▍ | 455/711 [42:29<21:56, 5.14s/it]
882
  64%|██████▍ | 456/711 [42:34<21:51, 5.14s/it]
883
  64%|██████▍ | 457/711 [42:39<21:44, 5.14s/it]
884
  64%|██████▍ | 458/711 [42:44<21:40, 5.14s/it]
885
  65%|██████▍ | 459/711 [42:49<21:39, 5.16s/it]
886
  65%|██████▍ | 460/711 [42:54<21:46, 5.21s/it]
887
 
888
  65%|██████▍ | 460/711 [42:54<21:46, 5.21s/it]
889
  65%|██████▍ | 461/711 [43:00<21:38, 5.19s/it]
890
  65%|██████▍ | 462/711 [43:05<21:30, 5.18s/it]
891
  65%|██████▌ | 463/711 [43:10<21:25, 5.18s/it]
892
  65%|██████▌ | 464/711 [43:15<21:17, 5.17s/it]
893
  65%|██████▌ | 465/711 [43:20<21:05, 5.15s/it]
894
  66%|██████▌ | 466/711 [43:25<20:58, 5.14s/it]
895
  66%|██████▌ | 467/711 [43:30<20:53, 5.14s/it]
896
  66%
897
+ 0: {'loss': 0.3893, 'grad_norm': 0.7980262942281969, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.4}
898
+ 0: {'loss': 0.3928, 'grad_norm': 0.9024656134697462, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.4}
899
+ 0: |██████▌ | 468/711 [43:36<20:50, 5.14s/it]
900
  66%|██████▌ | 469/711 [43:41<21:10, 5.25s/it]
901
  66%|██████▌ | 470/711 [43:46<20:53, 5.20s/it]
902
 
903
  66%|██████▌ | 470/711 [43:46<20:53, 5.20s/it]
904
  66%|██████▌ | 471/711 [43:51<20:45, 5.19s/it]
905
  66%|██████▋ | 472/711 [43:57<20:40, 5.19s/it]
906
  67%|██████▋ | 473/711 [44:02<20:28, 5.16s/it]
907
  67%|██████▋ | 474/711 [44:07<20:23, 5.16s/it]
908
  67%|██████▋ | 475/711 [44:12<20:12, 5.14s/it]
909
  67%|██████▋ | 476/711 [44:18<20:43, 5.29s/it]
910
  67%|██████▋ | 477/711 [44:23<20:23, 5.23s/it]
911
  67%|██████▋ | 478/711 [44:28<20:24, 5.26s/it]
912
  67%|██████▋ | 479/711 [44:33<20:10, 5.22s/it]
913
  68%|██████▊ | 480/711 [44:38<19:59, 5.19s/it]
914
 
915
  68%|██████▊ |
916
+ 0: {'loss': 0.3747, 'grad_norm': 1.4532167164219425, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.41}
917
+ 0: 480/711 [44:38<19:59, 5.19s/it]
918
  68%|██████▊ | 481/711 [44:44<20:13, 5.28s/it]
919
  68%|██████▊ | 482/711 [44:49<20:00, 5.24s/it]
920
  68%|██████▊ | 483/711 [44:54<19:48, 5.21s/it]
921
  68%|██████▊ | 484/711 [44:59<19:39, 5.20s/it]
922
  68%|██████▊ | 485/711 [45:04<19:33, 5.19s/it]
923
  68%|██████▊ | 486/711 [45:10<19:26, 5.18s/it]
924
  68%|██████▊ | 487/711 [45:15<19:19, 5.18s/it]
925
  69%|██████▊ | 488/711 [45:20<19:09, 5.15s/it]
926
  69%|██████▉ | 489/711 [45:25<19:24, 5.25s/it]
927
  69%|██████▉ | 490/711 [45:30<19:09, 5.20s/it]
928
 
929
  69%|██████▉ | 490/711 [45:30<19:09, 5.20s/it]
930
  69%|██████▉ | 491/711 [45:36<19:16, 5.26s/it]
931
  69%|██████▉ | 492/711 [45:41<19:00, 5.21s/it]
932
  69%|██████▉ | 493/711 [45:46<18:47, 5.17s/it]
933
  69%|██████▉ | 494/711 [45:
934
+ 0: {'loss': 0.3797, 'grad_norm': 0.8355553409451639, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.42}
935
+ 0: 51<19:02, 5.27s/it]
936
  70%|██████▉ | 495/711 [45:56<18:45, 5.21s/it]
937
  70%|██████▉ | 496/711 [46:02<18:34, 5.19s/it]
938
  70%|██████▉ | 497/711 [46:07<18:38, 5.22s/it]
939
  70%|███████ | 498/711 [46:12<18:25, 5.19s/it]
940
  70%|███████ | 499/711 [46:18<18:55, 5.35s/it]
941
  70%|███████ | 500/711 [46:23<18:34, 5.28s/it]
942
 
943
  70%|███████ | 500/711 [46:23<18:34, 5.28s/it]
944
  70%|███████ | 501/711 [46:28<18:22, 5.25s/it]
945
  71%|███████ | 502/711 [46:33<18:12, 5.23s/it]
946
  71%|███████ | 503/711 [46:38<18:04, 5.21s/it]
947
  71%|███████ | 504/711 [46:44<17:53, 5.19s/it]
948
  71%|███████ | 505/711 [46:49<17:43, 5.16s/it]
949
  71%|███████ | 506/711 [46:54<17:36, 5.16s/it]
950
  71%|███████▏ | 507/711 [46:59<17:29, 5.14s/it]
951
  71%|███████▏ | 508/711 [47:04<17:21,
952
+ 0: {'loss': 0.3917, 'grad_norm': 0.8425621928447311, 'learning_rate': 2e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.43}
953
+ 0: {'loss': 0.3825, 'grad_norm': 0.7816311224212293, 'learning_rate': 1.9929032311830302e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.44}
954
+ 0: 5.13s/it]
955
  72%|███████▏ | 509/711 [47:09<17:16, 5.13s/it]
956
  72%|███████▏ | 510/711 [47:14<17:09, 5.12s/it]
957
 
958
  72%|███████▏ | 510/711 [47:14<17:09, 5.12s/it]
959
  72%|███████▏ | 511/711 [47:20<17:21, 5.21s/it]
960
  72%|███████▏ | 512/711 [47:25<17:12, 5.19s/it]
961
  72%|███████▏ | 513/711 [47:30<17:00, 5.16s/it]
962
  72%|███████▏ | 514/711 [47:35<16:51, 5.14s/it]
963
  72%|███████▏ | 515/711 [47:40<16:45, 5.13s/it]
964
  73%|███████▎ | 516/711 [47:45<16:40, 5.13s/it]
965
  73%|███████▎ | 517/711 [47:50<16:32, 5.12s/it]
966
  73%|███████▎ | 518/711 [47:55<16:30, 5.13s/it]
967
  73%|███████▎ | 519/711 [48:01<16:25, 5.13s/it]
968
  73%|███████▎ | 520/711 [48:06<16:42, 5.25s/it]
969
 
970
  73%|███████▎ | 520/711 [48:06<16:4
971
+ 0: {'loss': 0.3819, 'grad_norm': 0.8516269037345293, 'learning_rate': 1.9642643171092486e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.45}
972
+ 0: 2, 5.25s/it]
973
  73%|███████▎ | 521/711 [48:11<16:30, 5.21s/it]
974
  73%|███████▎ | 522/711 [48:16<16:18, 5.18s/it]
975
  74%|███████▎ | 523/711 [48:22<16:20, 5.22s/it]
976
  74%|███████▎ | 524/711 [48:27<16:36, 5.33s/it]
977
  74%|███████▍ | 525/711 [48:32<16:20, 5.27s/it]
978
  74%|███████▍ | 526/711 [48:37<16:07, 5.23s/it]
979
  74%|███████▍ | 527/711 [48:43<15:53, 5.18s/it]
980
  74%|███████▍ | 528/711 [48:48<15:54, 5.22s/it]
981
  74%|███████▍ | 529/711 [48:53<15:42, 5.18s/it]
982
  75%|███████▍ | 530/711 [48:58<15:38, 5.18s/it]
983
 
984
  75%|███████▍ | 530/711 [48:58<15:38, 5.18s/it]
985
  75%|███████▍ | 531/711 [49:03<15:27, 5.15s/it]
986
  75%|███████▍ | 532/711 [49:08<15:19, 5.14s/it]
987
  75%|███████▍ | 533/711 [49:13<15:11, 5.12s/it]
988
  75%|███████▌ | 53
989
+ 0: {'loss': 0.3918, 'grad_norm': 0.9451621145813273, 'learning_rate': 1.9143443472194176e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.46}
990
+ 0: 4/711 [49:18<15:05, 5.12s/it]
991
  75%|███████▌ | 535/711 [49:24<14:58, 5.11s/it]
992
  75%|███████▌ | 536/711 [49:29<15:04, 5.17s/it]
993
  76%|███████▌ | 537/711 [49:34<14:59, 5.17s/it]
994
  76%|███████▌ | 538/711 [49:39<14:48, 5.14s/it]
995
  76%|███████▌ | 539/711 [49:44<14:46, 5.16s/it]
996
  76%|███████▌ | 540/711 [49:49<14:41, 5.15s/it]
997
 
998
  76%|███████▌ | 540/711 [49:49<14:41, 5.15s/it]
999
  76%|███████▌ | 541/711 [49:55<14:33, 5.14s/it]
1000
  76%|███████▌ | 542/711 [50:00<14:26, 5.13s/it]
1001
  76%|███████▋ | 543/711 [50:05<14:23, 5.14s/it]
1002
  77%|███████▋ | 544/711 [50:10<14:17, 5.13s/it]
1003
  77%|███████▋ | 545/711 [50:15<14:10, 5.12s/it]
1004
  77%|███████▋ | 546/711 [50:20<14:04, 5.12s/it]
1005
  77%|███████▋ | 547/711 [50:25<13:59, 5.12s/it]
1006
  77%|████�
1007
+ 0: {'loss': 0.3907, 'grad_norm': 0.8481507125427856, 'learning_rate': 1.8443725168471053e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.46}
1008
+ 0: {'loss': 0.3803, 'grad_norm': 0.8683953024369212, 'learning_rate': 1.7560717646792703e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.47}
1009
+ 0: ��██▋ | 548/711 [50:30<13:52, 5.11s/it]
1010
  77%|███████▋ | 549/711 [50:35<13:47, 5.11s/it]
1011
  77%|███████▋ | 550/711 [50:41<13:42, 5.11s/it]
1012
 
1013
  77%|███████▋ | 550/711 [50:41<13:42, 5.11s/it]
1014
  77%|███████▋ | 551/711 [50:46<13:40, 5.13s/it]
1015
  78%|███████▊ | 552/711 [50:51<13:36, 5.13s/it]
1016
  78%|███████▊ | 553/711 [50:56<13:29, 5.12s/it]
1017
  78%|███████▊ | 554/711 [51:01<13:24, 5.12s/it]
1018
  78%|███████▊ | 555/711 [51:06<13:21, 5.14s/it]
1019
  78%|███████▊ | 556/711 [51:11<13:14, 5.12s/it]
1020
  78%|███████▊ | 557/711 [51:17<13:25, 5.23s/it]
1021
  78%|███████▊ | 558/711 [51:22<13:32, 5.31s/it]
1022
  79%|███████▊ | 559/711 [51:27<13:18, 5.25s/it]
1023
  79%|███████▉ | 560/711 [51:33<13:06, 5.21s/it]
1024
 
1025
  79%|███�
1026
+ 0: {'loss': 0.3904, 'grad_norm': 0.8636181194234771, 'learning_rate': 1.6516163482876789e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.48}
1027
+ 0: �███▉ | 560/711 [51:33<13:06, 5.21s/it]
1028
  79%|███████▉ | 561/711 [51:38<12:58, 5.19s/it]
1029
  79%|███████▉ | 562/711 [51:43<13:13, 5.33s/it]
1030
  79%|███████▉ | 563/711 [51:49<13:11, 5.35s/it]
1031
  79%|███████▉ | 564/711 [51:54<12:58, 5.29s/it]
1032
  79%|███████▉ | 565/711 [51:59<12:47, 5.26s/it]
1033
  80%|███████▉ | 566/711 [52:04<12:34, 5.21s/it]
1034
  80%|███████▉ | 567/711 [52:10<12:40, 5.28s/it]
1035
  80%|███████▉ | 568/711 [52:15<12:28, 5.23s/it]
1036
  80%|████████ | 569/711 [52:20<12:27, 5.26s/it]
1037
  80%|████████ | 570/711 [52:25<12:14, 5.21s/it]
1038
 
1039
  80%|████████ | 570/711 [52:25<12:14, 5.21s/it]
1040
  80%|████████ | 571/711 [52:31<12:14, 5.24s/it]
1041
  80%|████████ | 572/711 [52:36<12:04, 5.21s/it]
1042
  81%|████████ | 573/711 [52:41<12:02, 5.24s/it]
1043
+ 0: {'loss': 0.3828, 'grad_norm': 0.8879170447340103, 'learning_rate': 1.5335783066915436e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 75.77, 'epoch': 0.49}
1044
+ 0:
1045
  81%|████████ | 574/711 [52:46<11:53, 5.21s/it]
1046
  81%|████████ | 575/711 [52:51<11:44, 5.18s/it]
1047
  81%|████████ | 576/711 [52:56<11:38, 5.17s/it]
1048
  81%|████████ | 577/711 [53:02<11:46, 5.27s/it]
1049
  81%|████████▏ | 578/711 [53:07<11:49, 5.33s/it]
1050
  81%|████████▏ | 579/711 [53:12<11:34, 5.26s/it]
1051
  82%|████████▏ | 580/711 [53:18<11:25, 5.23s/it]
1052
 
1053
  82%|████████▏ | 580/711 [53:18<11:25, 5.23s/it]
1054
  82%|████████▏ | 581/711 [53:23<11:15, 5.19s/it]
1055
  82%|████████▏ | 582/711 [53:28<11:06, 5.16s/it]
1056
  82%|████████▏ | 583/711 [53:33<10:59, 5.15s/it]
1057
  82%|████████▏ | 584/711 [53:38<10:52, 5.14s/it]
1058
  82%|████████▏ | 585/711 [53:43<10:46, 5.13s/it]
1059
  82%|████████▏ | 586/711 [53:48<10:42, 5.14s/it]
1060
  83%|███████�
1061
+ 0: {'loss': 0.3763, 'grad_norm': 0.829865464468096, 'learning_rate': 1.4048641282207622e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 76.53, 'epoch': 0.5}
1062
+ 0: �▎ | 587/711 [53:54<11:03, 5.35s/it]
1063
  83%|████████▎ | 588/711 [53:59<10:55, 5.33s/it]
1064
  83%|████████▎ | 589/711 [54:05<10:40, 5.25s/it]
1065
  83%|████████▎ | 590/711 [54:10<10:43, 5.32s/it]
1066
 
1067
  83%|████████▎ | 590/711 [54:10<10:43, 5.32s/it]
1068
  83%|████████▎ | 591/711 [54:15<10:32, 5.27s/it]
1069
  83%|████████▎ | 592/711 [54:20<10:23, 5.24s/it]
1070
  83%|████████▎ | 593/711 [54:25<10:14, 5.21s/it]
1071
  84%|████████▎ | 594/711 [54:31<10:07, 5.19s/it]
1072
  84%|████████▎ | 595/711 [54:36<09:59, 5.17s/it]
1073
  84%|████████▍ | 596/711 [54:41<09:51, 5.14s/it]
1074
  84%|████████▍ | 597/711 [54:46<09:53, 5.21s/it]
1075
  84%|████████▍ | 598/711 [54:51<09:45, 5.18s/it]
1076
  84%|████████▍ | 599/711 [54:57<10:07, 5.42s/it]
1077
  84%|████████▍ | 600/711 [55:02
1078
+ 0: {'loss': 0.3895, 'grad_norm': 1.0461353857982227, 'learning_rate': 1.2686431831271522e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 76.53, 'epoch': 0.51}
1079
+ 0: {'loss': 0.3726, 'grad_norm': 0.8291006806562558, 'learning_rate': 1.1282696831703153e-06, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 76.53, 'epoch': 0.51}
1080
+ 0: <09:53, 5.35s/it]
1081
 
1082
  84%|████████▍ | 600/711 [55:02<09:53, 5.35s/it]
1083
  85%|████████▍ | 601/711 [55:08<09:48, 5.35s/it]
1084
  85%|████████▍ | 602/711 [55:13<09:35, 5.28s/it]
1085
  85%|████████▍ | 603/711 [55:18<09:24, 5.23s/it]
1086
  85%|████████▍ | 604/711 [55:23<09:22, 5.26s/it]
1087
  85%|████████▌ | 605/711 [55:28<09:12, 5.22s/it]
1088
  85%|████████▌ | 606/711 [55:34<09:27, 5.41s/it]
1089
  85%|████████▌ | 607/711 [55:40<09:29, 5.47s/it]
1090
  86%|████████▌ | 608/711 [55:45<09:17, 5.41s/it]
1091
  86%|████████▌ | 609/711 [55:50<09:03, 5.32s/it]
1092
  86%|████████▌ | 610/711 [55:56<08:56, 5.31s/it]
1093
 
1094
  86%|████████▌ | 610/711 [55:56<08:56, 5.31s/it]
1095
  86%|████████▌ | 611/711 [56:01<08:45, 5.25s/it]
1096
  86%|████�
1097
+ 0: {'loss': 0.3749, 'grad_norm': 0.8273940149239204, 'learning_rate': 9.87200089792126e-07, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 76.53, 'epoch': 0.52}
1098
+ 0: �███▌ | 612/711 [56:06<08:36, 5.22s/it]
1099
  86%|████████▌ | 613/711 [56:11<08:34, 5.24s/it]
1100
  86%|████████▋ | 614/711 [56:16<08:25, 5.22s/it]
1101
  86%|████████▋ | 615/711 [56:21<08:17, 5.19s/it]
1102
  87%|████████▋ | 616/711 [56:28<08:41, 5.48s/it]
1103
  87%|████████▋ | 617/711 [56:33<08:24, 5.37s/it]
1104
  87%|████████▋ | 618/711 [56:38<08:23, 5.41s/it]
1105
  87%|████████▋ | 619/711 [56:43<08:09, 5.33s/it]
1106
  87%|████████▋ | 620/711 [56:49<07:59, 5.27s/it]
1107
 
1108
  87%|████████▋ | 620/711 [56:49<07:59, 5.27s/it]
1109
  87%|████████▋ | 621/711 [56:54<07:51, 5.24s/it]
1110
  87%|████████▋ | 622/711 [56:59<07:43, 5.21s/it]
1111
  88%|████████▊ | 623/711 [57:04<07:44, 5.28s/it]
1112
  88%|████████▊ | 624/711 [57:09<07:35, 5.23s/it]
1113
  88%|████████▊ | 625/7
1114
+ 0: {'loss': 0.3717, 'grad_norm': 0.7609373973543032, 'learning_rate': 8.489080045646937e-07, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 76.53, 'epoch': 0.53}
1115
+ 0: 11 [57:14<07:26, 5.19s/it]
1116
  88%|████████▊ | 626/711 [57:20<07:19, 5.17s/it]
1117
  88%|████████▊ | 627/711 [57:25<07:12, 5.14s/it]
1118
  88%|████████▊ | 628/711 [57:30<07:07, 5.15s/it]
1119
  88%|████████▊ | 629/711 [57:35<07:03, 5.16s/it]
1120
  89%|████████▊ | 630/711 [57:41<07:25, 5.49s/it]
1121
 
1122
  89%|████████▊ | 630/711 [57:41<07:25, 5.49s/it]
1123
  89%|████████▊ | 631/711 [57:47<07:13, 5.42s/it]
1124
  89%|████████▉ | 632/711 [57:52<07:01, 5.33s/it]
1125
  89%|████████▉ | 633/711 [57:57<06:51, 5.28s/it]
1126
  89%|████████▉ | 634/711 [58:02<06:42, 5.23s/it]
1127
  89%|████████▉ | 635/711 [58:07<06:35, 5.21s/it]
1128
  89%|████████▉ | 636/711 [58:13<06:36, 5.28s/it]
1129
  90%|████████▉ | 637/711 [58:18<06:26, 5.23s/it]
1130
  90%|████████▉ | 638/711 [58:23<06:19, 5.2
1131
+ 0: {'loss': 0.3746, 'grad_norm': 0.8664669621801867, 'learning_rate': 7.167986375914345e-07, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 76.53, 'epoch': 0.54}
1132
+ 0: {'loss': 0.3774, 'grad_norm': 0.7890688699500655, 'learning_rate': 5.941249599330827e-07, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 76.53, 'epoch': 0.55}
1133
+ 0: 0s/it]
1134
  90%|████████▉ | 639/711 [58:28<06:19, 5.28s/it]
1135
  90%|█████████ | 640/711 [58:33<06:11, 5.24s/it]
1136
 
1137
  90%|█████████ | 640/711 [58:33<06:11, 5.24s/it]
1138
  90%|█████████ | 641/711 [58:39<06:10, 5.30s/it]
1139
  90%|█████████ | 642/711 [58:44<06:02, 5.25s/it]
1140
  90%|█████████ | 643/711 [58:49<05:53, 5.20s/it]
1141
  91%|█████████ | 644/711 [58:54<05:46, 5.17s/it]
1142
  91%|█████████ | 645/711 [59:00<05:46, 5.26s/it]
1143
  91%|█████████ | 646/711 [59:05<05:38, 5.21s/it]
1144
  91%|█████████ | 647/711 [59:10<05:31, 5.18s/it]
1145
  91%|█████████ | 648/711 [59:15<05:25, 5.17s/it]
1146
  91%|█████████▏| 649/711 [59:20<05:18, 5.14s/it]
1147
  91%|█████████▏| 650/711 [59:25<05:18, 5.23s/it]
1148
 
1149
  91%|███████�
1150
+ 0: {'loss': 0.3821, 'grad_norm': 0.8161369753902079, 'learning_rate': 4.839076046641801e-07, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 76.53, 'epoch': 0.56}
1151
+ 0: ��█▏| 650/711 [59:25<05:18, 5.23s/it]
1152
  92%|█████████▏| 651/711 [59:31<05:11, 5.20s/it]
1153
  92%|█████████▏| 652/711 [59:36<05:04, 5.16s/it]
1154
  92%|█████████▏| 653/711 [59:41<05:07, 5.31s/it]
1155
  92%|█████████▏| 654/711 [59:46<04:58, 5.24s/it]
1156
  92%|█████████▏| 655/711 [59:52<04:52, 5.22s/it]
1157
  92%|█████████▏| 656/711 [59:57<04:51, 5.30s/it]
1158
  92%|█████████▏| 657/711 [1:00:02<04:43, 5.26s/it]
1159
  93%|█████████▎| 658/711 [1:00:07<04:36, 5.22s/it]
1160
  93%|█████████▎| 659/711 [1:00:13<04:32, 5.23s/it]
1161
  93%|█████████▎| 660/711 [1:00:18<04:24, 5.19s/it]
1162
 
1163
  93%|█████████▎| 660/711 [1:00:18<04:24, 5.19s/it]
1164
  93%|█████████▎| 661/711 [1:00:23<04:23, 5.28s/it]
1165
  93%|█████████▎| 662/711 [1:00:28<04:16, 5.23s/it]
1166
  93%
1167
+ 0: {'loss': 0.3655, 'grad_norm': 0.8247018856496126, 'learning_rate': 3.888604888618786e-07, 'memory/max_mem_active(gib)': 69.44, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 76.53, 'epoch': 0.56}
1168
+ 0: |█████████▎| 663/711 [1:00:33<04:09, 5.20s/it]
1169
  93%|█████████▎| 664/711 [1:00:39<04:02, 5.17s/it]
1170
  94%|█████████▎| 665/711 [1:00:44<03:57, 5.16s/it]
1171
  94%|█████████▎| 666/711 [1:00:49<03:54, 5.21s/it]
1172
  94%|█████████▍| 667/711 [1:00:54<03:48, 5.19s/it]
1173
  94%|█████████▍| 668/711 [1:00:59<03:41, 5.16s/it]
1174
  94%|█████████▍| 669/711 [1:01:04<03:35, 5.14s/it]
1175
  94%|█████████▍| 670/711 [1:01:10<03:33, 5.22s/it]
1176
 
1177
  94%|█████████▍| 670/711 [1:01:10<03:33, 5.22s/it]
1178
  94%|█████████▍| 671/711 [1:01:15<03:27, 5.18s/it]
1179
  95%|█████████▍| 672/711 [1:01:20<03:23, 5.22s/it]
1180
  95%|█████████▍| 673/711 [1:01:25<03:17, 5.20s/it]
1181
  95%|█████████▍| 674/711 [1:01:30<03:10, 5.16s/it]
1182
  95%|█████████▍| 67
1183
+ 0: {'loss': 0.3832, 'grad_norm': 0.7788264476641573, 'learning_rate': 3.1132398796052294e-07, 'memory/max_mem_active(gib)': 69.55, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 76.53, 'epoch': 0.57}
1184
+ 0: 5/711 [1:01:36<03:06, 5.19s/it]
1185
  95%|█████████▌| 676/711 [1:01:42<03:13, 5.53s/it]
1186
  95%|█████████▌| 677/711 [1:01:48<03:11, 5.63s/it]
1187
  95%|█████████▌| 678/711 [1:01:53<03:02, 5.52s/it]
1188
  95%|█████████▌| 679/711 [1:01:58<02:52, 5.38s/it]
1189
  96%|█████████▌| 680/711 [1:02:03<02:44, 5.29s/it]
1190
 
1191
  96%|█████████▌| 680/711 [1:02:03<02:44, 5.29s/it]
1192
  96%|█████████▌| 681/711 [1:02:08<02:37, 5.25s/it]
1193
  96%|█████████▌| 682/711 [1:02:14<02:33, 5.31s/it]
1194
  96%|█████████▌| 683/711 [1:02:19<02:27, 5.25s/it]
1195
  96%|█████████▌| 684/711 [1:02:24<02:20, 5.21s/it]
1196
  96%|█████████▋| 685/711 [1:02:29<02:14, 5.18s/it]
1197
  96%|█████████▋| 686/711 [1:02:34<02:09, 5.17s/it]
1198
  97%|█████████▋| 687/711 [1:02:39<02:03, 5.17s/it]
1199
  9
1200
+ 0: {'loss': 0.3818, 'grad_norm': 0.7875994058840892, 'learning_rate': 2.532073079411971e-07, 'memory/max_mem_active(gib)': 69.55, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 76.53, 'epoch': 0.58}
1201
+ 0: 7%|█████████▋| 688/711 [1:02:45<01:58, 5.15s/it]
1202
  97%|█████████▋| 689/711 [1:02:50<01:53, 5.15s/it]
1203
  97%|█████████▋| 690/711 [1:02:55<01:47, 5.14s/it]
1204
 
1205
  97%|█████████▋| 690/711 [1:02:55<01:47, 5.14s/it]
1206
  97%|█████████▋| 691/711 [1:03:00<01:43, 5.19s/it]
1207
  97%|█████████▋| 692/711 [1:03:05<01:37, 5.16s/it]
1208
  97%|█████████▋| 693/711 [1:03:10<01:32, 5.14s/it]
1209
  98%|█████████▊| 694/711 [1:03:15<01:27, 5.13s/it]
1210
  98%|█████████▊| 695/711 [1:03:21<01:22, 5.13s/it]
1211
  98%|█████████▊| 696/711 [1:03:26<01:17, 5.14s/it]
1212
  98%|█████████▊| 697/711 [1:03:31<01:11, 5.14s/it]
1213
  98%|█████████▊| 698/711 [1:03:36<01:06, 5.14s/it]
1214
  98%|█████████▊| 699/711 [1:03:41<01:02, 5.17s/it]
1215
  98%|█████████▊|
1216
+ 0: {'loss': 0.3763, 'grad_norm': 0.7322258611860605, 'learning_rate': 2.1594147434418026e-07, 'memory/max_mem_active(gib)': 69.55, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 76.53, 'epoch': 0.59}
1217
+ 0: {'loss': 0.3734, 'grad_norm': 0.7653492019720352, 'learning_rate': 2.0044409567084156e-07, 'memory/max_mem_active(gib)': 69.55, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 76.53, 'epoch': 0.6}
1218
+ 0: 700/711 [1:03:47<00:57, 5.24s/it]
1219
 
1220
  98%|█████████▊| 700/711 [1:03:47<00:57, 5.24s/it]
1221
  99%|█████████▊| 701/711 [1:03:52<00:52, 5.26s/it]
1222
  99%|█████████▊| 702/711 [1:03:57<00:47, 5.23s/it]
1223
  99%|█████████▉| 703/711 [1:04:02<00:41, 5.20s/it]
1224
  99%|█████████▉| 704/711 [1:04:07<00:36, 5.20s/it]
1225
  99%|█████████▉| 705/711 [1:04:13<00:31, 5.30s/it]
1226
  99%|█████████▉| 706/711 [1:04:18<00:26, 5.24s/it]
1227
  99%|█████████▉| 707/711 [1:04:23<00:21, 5.27s/it]
1228
 
1229
+ 0: [2025-11-24 01:16:53,191] [INFO] [axolotl.core.trainers.base._save:613] [PID:3081979] [RANK:0] Saving model checkpoint to /lustre/fswork/projects/rech/dgo/udv55np/ift/Nemotron-Super-49B-v1_5/gemma-3-12b/0/checkpoint-711
1230
+ 0: [2025-11-24 01:17:11,725] [INFO] [axolotl.core.trainers.base._save:662] [PID:3081979] [RANK:0] Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`
1231
+ 0: {'train_runtime': 3910.4069, 'train_samples_per_second': 2.909, 'train_steps_per_second': 0.182, 'train_loss': 0.4125736778295493, 'memory/max_mem_active(gib)': 69.55, 'memory/max_mem_allocated(gib)': 67.66, 'memory/device_mem_reserved(gib)': 76.53, 'epoch': 0.6}
1232
+ 0: �█████| 711/711 [1:04:44<00:00, 5.22s/it]
1233
 
1234
+ 0: [2025-11-24 01:17:21,056] [INFO] [axolotl.train.save_trained_model:228] [PID:3081979] [RANK:0] Training completed! Saving trained model to /lustre/fswork/projects/rech/dgo/udv55np/ift/Nemotron-Super-49B-v1_5/gemma-3-12b/0.
1235
+ 0: [2025-11-24 01:17:26,694] [INFO] [axolotl.core.trainers.base._save:613] [PID:3081979] [RANK:0] Saving model checkpoint to /lustre/fswork/projects/rech/dgo/udv55np/ift/Nemotron-Super-49B-v1_5/gemma-3-12b/0
1236
+ 0: [2025-11-24 01:17:44,493] [INFO] [axolotl.core.trainers.base._save:662] [PID:3081979] [RANK:0] Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`
1237
+ 0: [2025-11-24 01:17:44,817] [INFO] [axolotl.train.save_trained_model:350] [PID:3081979] [RANK:0] Model successfully saved to /lustre/fswork/projects/rech/dgo/udv55np/ift/Nemotron-Super-49B-v1_5/gemma-3-12b/0
special_tokens_map.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "boi_token": "<start_of_image>",
3
+ "bos_token": {
4
+ "content": "<bos>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ "eoi_token": "<end_of_image>",
11
+ "eos_token": {
12
+ "content": "<eos>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "image_token": "<image_soft_token>",
19
+ "pad_token": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "unk_token": {
27
+ "content": "<unk>",
28
+ "lstrip": false,
29
+ "normalized": false,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ }
33
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
3
+ size 33384568
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
3
+ size 4689074
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d94a47440372bd382b8068468e71d951722420260c48543cdcd097f95f9ee7fb
3
+ size 10424