added a couple of models and their checkpoints
Browse files- breaking_0.1_trained/10% most difficult/out.log +451 -0
- breaking_0.1_trained/10% most difficult/params.txt +91 -0
- breaking_0.1_trained/10_most_difficult/checkpoints/epoch_5.pt +3 -0
- breaking_0.1_trained/10_most_difficult/checkpoints/epoch_latest.pt +3 -0
- breaking_0.1_trained/10_most_difficult/info.pkl +3 -0
- breaking_0.1_trained/10_most_difficult/out.log +497 -0
- breaking_0.1_trained/10_most_difficult/params.txt +91 -0
- breaking_0.5_trained/50_most_difficult/checkpoints/epoch_5.pt +3 -0
- breaking_0.5_trained/50_most_difficult/checkpoints/epoch_latest.pt +3 -0
- breaking_0.5_trained/50_most_difficult/info.pkl +3 -0
- breaking_0.5_trained/50_most_difficult/out.log +497 -0
- breaking_0.5_trained/50_most_difficult/params.txt +91 -0
- minipile_style_trained/minipile_style/checkpoints/epoch_5.pt +3 -0
- minipile_style_trained/minipile_style/checkpoints/epoch_latest.pt +3 -0
- minipile_style_trained/minipile_style/info.pkl +3 -0
- minipile_style_trained/minipile_style/out.log +497 -0
- minipile_style_trained/minipile_style/params.txt +91 -0
breaking_0.1_trained/10% most difficult/out.log
ADDED
|
@@ -0,0 +1,451 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2024-09-06,22:40:58 | INFO | No latest resume checkpoint found in /home/breaking_0.1_trained/10% most difficult/checkpoints.
|
| 2 |
+
2024-09-06,22:41:00 | INFO | Running in distributed mode with multiple processes. Device: cuda:0.Process (global: 0, local 0), total 2.
|
| 3 |
+
2024-09-06,22:41:00 | INFO | Loaded ViT-B-32 model config.
|
| 4 |
+
2024-09-06,22:41:01 | INFO | Model:
|
| 5 |
+
2024-09-06,22:41:01 | INFO | CLIP(
|
| 6 |
+
(visual): VisionTransformer(
|
| 7 |
+
(patchnorm_pre_ln): Identity()
|
| 8 |
+
(conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
|
| 9 |
+
(patch_dropout): Identity()
|
| 10 |
+
(ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 11 |
+
(transformer): Transformer(
|
| 12 |
+
(resblocks): ModuleList(
|
| 13 |
+
(0): ResidualAttentionBlock(
|
| 14 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 15 |
+
(attn): MultiheadAttention(
|
| 16 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 17 |
+
)
|
| 18 |
+
(ls_1): Identity()
|
| 19 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 20 |
+
(mlp): Sequential(
|
| 21 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 22 |
+
(gelu): GELU(approximate='none')
|
| 23 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 24 |
+
)
|
| 25 |
+
(ls_2): Identity()
|
| 26 |
+
)
|
| 27 |
+
(1): ResidualAttentionBlock(
|
| 28 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 29 |
+
(attn): MultiheadAttention(
|
| 30 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 31 |
+
)
|
| 32 |
+
(ls_1): Identity()
|
| 33 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 34 |
+
(mlp): Sequential(
|
| 35 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 36 |
+
(gelu): GELU(approximate='none')
|
| 37 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 38 |
+
)
|
| 39 |
+
(ls_2): Identity()
|
| 40 |
+
)
|
| 41 |
+
(2): ResidualAttentionBlock(
|
| 42 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 43 |
+
(attn): MultiheadAttention(
|
| 44 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 45 |
+
)
|
| 46 |
+
(ls_1): Identity()
|
| 47 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 48 |
+
(mlp): Sequential(
|
| 49 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 50 |
+
(gelu): GELU(approximate='none')
|
| 51 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 52 |
+
)
|
| 53 |
+
(ls_2): Identity()
|
| 54 |
+
)
|
| 55 |
+
(3): ResidualAttentionBlock(
|
| 56 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 57 |
+
(attn): MultiheadAttention(
|
| 58 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 59 |
+
)
|
| 60 |
+
(ls_1): Identity()
|
| 61 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 62 |
+
(mlp): Sequential(
|
| 63 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 64 |
+
(gelu): GELU(approximate='none')
|
| 65 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 66 |
+
)
|
| 67 |
+
(ls_2): Identity()
|
| 68 |
+
)
|
| 69 |
+
(4): ResidualAttentionBlock(
|
| 70 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 71 |
+
(attn): MultiheadAttention(
|
| 72 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 73 |
+
)
|
| 74 |
+
(ls_1): Identity()
|
| 75 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 76 |
+
(mlp): Sequential(
|
| 77 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 78 |
+
(gelu): GELU(approximate='none')
|
| 79 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 80 |
+
)
|
| 81 |
+
(ls_2): Identity()
|
| 82 |
+
)
|
| 83 |
+
(5): ResidualAttentionBlock(
|
| 84 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 85 |
+
(attn): MultiheadAttention(
|
| 86 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 87 |
+
)
|
| 88 |
+
(ls_1): Identity()
|
| 89 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 90 |
+
(mlp): Sequential(
|
| 91 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 92 |
+
(gelu): GELU(approximate='none')
|
| 93 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 94 |
+
)
|
| 95 |
+
(ls_2): Identity()
|
| 96 |
+
)
|
| 97 |
+
(6): ResidualAttentionBlock(
|
| 98 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 99 |
+
(attn): MultiheadAttention(
|
| 100 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 101 |
+
)
|
| 102 |
+
(ls_1): Identity()
|
| 103 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 104 |
+
(mlp): Sequential(
|
| 105 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 106 |
+
(gelu): GELU(approximate='none')
|
| 107 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 108 |
+
)
|
| 109 |
+
(ls_2): Identity()
|
| 110 |
+
)
|
| 111 |
+
(7): ResidualAttentionBlock(
|
| 112 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 113 |
+
(attn): MultiheadAttention(
|
| 114 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 115 |
+
)
|
| 116 |
+
(ls_1): Identity()
|
| 117 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 118 |
+
(mlp): Sequential(
|
| 119 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 120 |
+
(gelu): GELU(approximate='none')
|
| 121 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 122 |
+
)
|
| 123 |
+
(ls_2): Identity()
|
| 124 |
+
)
|
| 125 |
+
(8): ResidualAttentionBlock(
|
| 126 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 127 |
+
(attn): MultiheadAttention(
|
| 128 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 129 |
+
)
|
| 130 |
+
(ls_1): Identity()
|
| 131 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 132 |
+
(mlp): Sequential(
|
| 133 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 134 |
+
(gelu): GELU(approximate='none')
|
| 135 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 136 |
+
)
|
| 137 |
+
(ls_2): Identity()
|
| 138 |
+
)
|
| 139 |
+
(9): ResidualAttentionBlock(
|
| 140 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 141 |
+
(attn): MultiheadAttention(
|
| 142 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 143 |
+
)
|
| 144 |
+
(ls_1): Identity()
|
| 145 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 146 |
+
(mlp): Sequential(
|
| 147 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 148 |
+
(gelu): GELU(approximate='none')
|
| 149 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 150 |
+
)
|
| 151 |
+
(ls_2): Identity()
|
| 152 |
+
)
|
| 153 |
+
(10): ResidualAttentionBlock(
|
| 154 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 155 |
+
(attn): MultiheadAttention(
|
| 156 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 157 |
+
)
|
| 158 |
+
(ls_1): Identity()
|
| 159 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 160 |
+
(mlp): Sequential(
|
| 161 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 162 |
+
(gelu): GELU(approximate='none')
|
| 163 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 164 |
+
)
|
| 165 |
+
(ls_2): Identity()
|
| 166 |
+
)
|
| 167 |
+
(11): ResidualAttentionBlock(
|
| 168 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 169 |
+
(attn): MultiheadAttention(
|
| 170 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 171 |
+
)
|
| 172 |
+
(ls_1): Identity()
|
| 173 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 174 |
+
(mlp): Sequential(
|
| 175 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 176 |
+
(gelu): GELU(approximate='none')
|
| 177 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 178 |
+
)
|
| 179 |
+
(ls_2): Identity()
|
| 180 |
+
)
|
| 181 |
+
)
|
| 182 |
+
)
|
| 183 |
+
(ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 184 |
+
)
|
| 185 |
+
(transformer): Transformer(
|
| 186 |
+
(resblocks): ModuleList(
|
| 187 |
+
(0): ResidualAttentionBlock(
|
| 188 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 189 |
+
(attn): MultiheadAttention(
|
| 190 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 191 |
+
)
|
| 192 |
+
(ls_1): Identity()
|
| 193 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 194 |
+
(mlp): Sequential(
|
| 195 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 196 |
+
(gelu): GELU(approximate='none')
|
| 197 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 198 |
+
)
|
| 199 |
+
(ls_2): Identity()
|
| 200 |
+
)
|
| 201 |
+
(1): ResidualAttentionBlock(
|
| 202 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 203 |
+
(attn): MultiheadAttention(
|
| 204 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 205 |
+
)
|
| 206 |
+
(ls_1): Identity()
|
| 207 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 208 |
+
(mlp): Sequential(
|
| 209 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 210 |
+
(gelu): GELU(approximate='none')
|
| 211 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 212 |
+
)
|
| 213 |
+
(ls_2): Identity()
|
| 214 |
+
)
|
| 215 |
+
(2): ResidualAttentionBlock(
|
| 216 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 217 |
+
(attn): MultiheadAttention(
|
| 218 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 219 |
+
)
|
| 220 |
+
(ls_1): Identity()
|
| 221 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 222 |
+
(mlp): Sequential(
|
| 223 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 224 |
+
(gelu): GELU(approximate='none')
|
| 225 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 226 |
+
)
|
| 227 |
+
(ls_2): Identity()
|
| 228 |
+
)
|
| 229 |
+
(3): ResidualAttentionBlock(
|
| 230 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 231 |
+
(attn): MultiheadAttention(
|
| 232 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 233 |
+
)
|
| 234 |
+
(ls_1): Identity()
|
| 235 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 236 |
+
(mlp): Sequential(
|
| 237 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 238 |
+
(gelu): GELU(approximate='none')
|
| 239 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 240 |
+
)
|
| 241 |
+
(ls_2): Identity()
|
| 242 |
+
)
|
| 243 |
+
(4): ResidualAttentionBlock(
|
| 244 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 245 |
+
(attn): MultiheadAttention(
|
| 246 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 247 |
+
)
|
| 248 |
+
(ls_1): Identity()
|
| 249 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 250 |
+
(mlp): Sequential(
|
| 251 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 252 |
+
(gelu): GELU(approximate='none')
|
| 253 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 254 |
+
)
|
| 255 |
+
(ls_2): Identity()
|
| 256 |
+
)
|
| 257 |
+
(5): ResidualAttentionBlock(
|
| 258 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 259 |
+
(attn): MultiheadAttention(
|
| 260 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 261 |
+
)
|
| 262 |
+
(ls_1): Identity()
|
| 263 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 264 |
+
(mlp): Sequential(
|
| 265 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 266 |
+
(gelu): GELU(approximate='none')
|
| 267 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 268 |
+
)
|
| 269 |
+
(ls_2): Identity()
|
| 270 |
+
)
|
| 271 |
+
(6): ResidualAttentionBlock(
|
| 272 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 273 |
+
(attn): MultiheadAttention(
|
| 274 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 275 |
+
)
|
| 276 |
+
(ls_1): Identity()
|
| 277 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 278 |
+
(mlp): Sequential(
|
| 279 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 280 |
+
(gelu): GELU(approximate='none')
|
| 281 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 282 |
+
)
|
| 283 |
+
(ls_2): Identity()
|
| 284 |
+
)
|
| 285 |
+
(7): ResidualAttentionBlock(
|
| 286 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 287 |
+
(attn): MultiheadAttention(
|
| 288 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 289 |
+
)
|
| 290 |
+
(ls_1): Identity()
|
| 291 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 292 |
+
(mlp): Sequential(
|
| 293 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 294 |
+
(gelu): GELU(approximate='none')
|
| 295 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 296 |
+
)
|
| 297 |
+
(ls_2): Identity()
|
| 298 |
+
)
|
| 299 |
+
(8): ResidualAttentionBlock(
|
| 300 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 301 |
+
(attn): MultiheadAttention(
|
| 302 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 303 |
+
)
|
| 304 |
+
(ls_1): Identity()
|
| 305 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 306 |
+
(mlp): Sequential(
|
| 307 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 308 |
+
(gelu): GELU(approximate='none')
|
| 309 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 310 |
+
)
|
| 311 |
+
(ls_2): Identity()
|
| 312 |
+
)
|
| 313 |
+
(9): ResidualAttentionBlock(
|
| 314 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 315 |
+
(attn): MultiheadAttention(
|
| 316 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 317 |
+
)
|
| 318 |
+
(ls_1): Identity()
|
| 319 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 320 |
+
(mlp): Sequential(
|
| 321 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 322 |
+
(gelu): GELU(approximate='none')
|
| 323 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 324 |
+
)
|
| 325 |
+
(ls_2): Identity()
|
| 326 |
+
)
|
| 327 |
+
(10): ResidualAttentionBlock(
|
| 328 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 329 |
+
(attn): MultiheadAttention(
|
| 330 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 331 |
+
)
|
| 332 |
+
(ls_1): Identity()
|
| 333 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 334 |
+
(mlp): Sequential(
|
| 335 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 336 |
+
(gelu): GELU(approximate='none')
|
| 337 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 338 |
+
)
|
| 339 |
+
(ls_2): Identity()
|
| 340 |
+
)
|
| 341 |
+
(11): ResidualAttentionBlock(
|
| 342 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 343 |
+
(attn): MultiheadAttention(
|
| 344 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 345 |
+
)
|
| 346 |
+
(ls_1): Identity()
|
| 347 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 348 |
+
(mlp): Sequential(
|
| 349 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 350 |
+
(gelu): GELU(approximate='none')
|
| 351 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 352 |
+
)
|
| 353 |
+
(ls_2): Identity()
|
| 354 |
+
)
|
| 355 |
+
)
|
| 356 |
+
)
|
| 357 |
+
(token_embedding): Embedding(49408, 512)
|
| 358 |
+
(ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 359 |
+
)
|
| 360 |
+
2024-09-06,22:41:01 | INFO | Params:
|
| 361 |
+
2024-09-06,22:41:01 | INFO | accum_freq: 1
|
| 362 |
+
2024-09-06,22:41:01 | INFO | aug_cfg: {}
|
| 363 |
+
2024-09-06,22:41:01 | INFO | batch_size: 2048
|
| 364 |
+
2024-09-06,22:41:01 | INFO | beta1: 0.9
|
| 365 |
+
2024-09-06,22:41:01 | INFO | beta2: 0.98
|
| 366 |
+
2024-09-06,22:41:01 | INFO | checkpoint_path: /home/breaking_0.1_trained/10% most difficult/checkpoints
|
| 367 |
+
2024-09-06,22:41:01 | INFO | coca_caption_loss_weight: 2.0
|
| 368 |
+
2024-09-06,22:41:01 | INFO | coca_contrastive_loss_weight: 1.0
|
| 369 |
+
2024-09-06,22:41:01 | INFO | copy_codebase: False
|
| 370 |
+
2024-09-06,22:41:01 | INFO | csv_caption_key: title
|
| 371 |
+
2024-09-06,22:41:01 | INFO | csv_img_key: filepath
|
| 372 |
+
2024-09-06,22:41:01 | INFO | csv_separator:
|
| 373 |
+
2024-09-06,22:41:01 | INFO | dataset_resampled: True
|
| 374 |
+
2024-09-06,22:41:01 | INFO | dataset_type: webdataset
|
| 375 |
+
2024-09-06,22:41:01 | INFO | ddp_static_graph: True
|
| 376 |
+
2024-09-06,22:41:01 | INFO | debug: False
|
| 377 |
+
2024-09-06,22:41:01 | INFO | delete_previous_checkpoint: False
|
| 378 |
+
2024-09-06,22:41:01 | INFO | device: cuda:0
|
| 379 |
+
2024-09-06,22:41:01 | INFO | dist_backend: nccl
|
| 380 |
+
2024-09-06,22:41:01 | INFO | dist_url: env://
|
| 381 |
+
2024-09-06,22:41:01 | INFO | distill: False
|
| 382 |
+
2024-09-06,22:41:01 | INFO | distill_model: None
|
| 383 |
+
2024-09-06,22:41:01 | INFO | distill_pretrained: None
|
| 384 |
+
2024-09-06,22:41:01 | INFO | distributed: True
|
| 385 |
+
2024-09-06,22:41:01 | INFO | epochs: 5
|
| 386 |
+
2024-09-06,22:41:01 | INFO | epochs_cooldown: None
|
| 387 |
+
2024-09-06,22:41:01 | INFO | eps: 1e-06
|
| 388 |
+
2024-09-06,22:41:01 | INFO | force_custom_text: False
|
| 389 |
+
2024-09-06,22:41:01 | INFO | force_image_size: None
|
| 390 |
+
2024-09-06,22:41:01 | INFO | force_patch_dropout: None
|
| 391 |
+
2024-09-06,22:41:01 | INFO | force_quick_gelu: False
|
| 392 |
+
2024-09-06,22:41:01 | INFO | gather_with_grad: True
|
| 393 |
+
2024-09-06,22:41:01 | INFO | grad_checkpointing: True
|
| 394 |
+
2024-09-06,22:41:01 | INFO | grad_clip_norm: None
|
| 395 |
+
2024-09-06,22:41:01 | INFO | horovod: False
|
| 396 |
+
2024-09-06,22:41:01 | INFO | image_mean: None
|
| 397 |
+
2024-09-06,22:41:01 | INFO | image_std: None
|
| 398 |
+
2024-09-06,22:41:01 | INFO | imagenet_v2: None
|
| 399 |
+
2024-09-06,22:41:01 | INFO | imagenet_val: None
|
| 400 |
+
2024-09-06,22:41:01 | INFO | local_loss: True
|
| 401 |
+
2024-09-06,22:41:01 | INFO | local_rank: 0
|
| 402 |
+
2024-09-06,22:41:01 | INFO | lock_image: False
|
| 403 |
+
2024-09-06,22:41:01 | INFO | lock_image_freeze_bn_stats: False
|
| 404 |
+
2024-09-06,22:41:01 | INFO | lock_image_unlocked_groups: 0
|
| 405 |
+
2024-09-06,22:41:01 | INFO | lock_text: False
|
| 406 |
+
2024-09-06,22:41:01 | INFO | lock_text_freeze_layer_norm: False
|
| 407 |
+
2024-09-06,22:41:01 | INFO | lock_text_unlocked_layers: 0
|
| 408 |
+
2024-09-06,22:41:01 | INFO | log_every_n_steps: 100
|
| 409 |
+
2024-09-06,22:41:01 | INFO | log_level: 20
|
| 410 |
+
2024-09-06,22:41:01 | INFO | log_local: False
|
| 411 |
+
2024-09-06,22:41:01 | INFO | log_path: /home/breaking_0.1_trained/10% most difficult/out.log
|
| 412 |
+
2024-09-06,22:41:01 | INFO | logs: /home/breaking_0.1_trained
|
| 413 |
+
2024-09-06,22:41:01 | INFO | lr: 0.0005
|
| 414 |
+
2024-09-06,22:41:01 | INFO | lr_cooldown_end: 0.0
|
| 415 |
+
2024-09-06,22:41:01 | INFO | lr_cooldown_power: 1.0
|
| 416 |
+
2024-09-06,22:41:01 | INFO | lr_scheduler: cosine
|
| 417 |
+
2024-09-06,22:41:01 | INFO | model: ViT-B-32
|
| 418 |
+
2024-09-06,22:41:01 | INFO | name: 10% most difficult
|
| 419 |
+
2024-09-06,22:41:01 | INFO | no_set_device_rank: False
|
| 420 |
+
2024-09-06,22:41:01 | INFO | precision: amp
|
| 421 |
+
2024-09-06,22:41:01 | INFO | pretrained:
|
| 422 |
+
2024-09-06,22:41:01 | INFO | pretrained_image: False
|
| 423 |
+
2024-09-06,22:41:01 | INFO | rank: 0
|
| 424 |
+
2024-09-06,22:41:01 | INFO | remote_sync: None
|
| 425 |
+
2024-09-06,22:41:01 | INFO | remote_sync_frequency: 300
|
| 426 |
+
2024-09-06,22:41:01 | INFO | remote_sync_protocol: s3
|
| 427 |
+
2024-09-06,22:41:01 | INFO | report_to: wandb
|
| 428 |
+
2024-09-06,22:41:01 | INFO | resume: None
|
| 429 |
+
2024-09-06,22:41:01 | INFO | save_frequency: 0
|
| 430 |
+
2024-09-06,22:41:01 | INFO | save_most_recent: True
|
| 431 |
+
2024-09-06,22:41:01 | INFO | seed: 0
|
| 432 |
+
2024-09-06,22:41:01 | INFO | skip_scheduler: False
|
| 433 |
+
2024-09-06,22:41:01 | INFO | tensorboard: False
|
| 434 |
+
2024-09-06,22:41:01 | INFO | tensorboard_path:
|
| 435 |
+
2024-09-06,22:41:01 | INFO | torchscript: False
|
| 436 |
+
2024-09-06,22:41:01 | INFO | trace: False
|
| 437 |
+
2024-09-06,22:41:01 | INFO | train_data: /home/breaking_0.1/{00000000..00000127}.tar
|
| 438 |
+
2024-09-06,22:41:01 | INFO | train_data_upsampling_factors: None
|
| 439 |
+
2024-09-06,22:41:01 | INFO | train_num_samples: 2560000
|
| 440 |
+
2024-09-06,22:41:01 | INFO | use_bn_sync: False
|
| 441 |
+
2024-09-06,22:41:01 | INFO | val_data: None
|
| 442 |
+
2024-09-06,22:41:01 | INFO | val_frequency: 1
|
| 443 |
+
2024-09-06,22:41:01 | INFO | val_num_samples: None
|
| 444 |
+
2024-09-06,22:41:01 | INFO | wandb: True
|
| 445 |
+
2024-09-06,22:41:01 | INFO | wandb_notes:
|
| 446 |
+
2024-09-06,22:41:01 | INFO | wandb_project_name: clip_text_hq_clusters
|
| 447 |
+
2024-09-06,22:41:01 | INFO | warmup: 500
|
| 448 |
+
2024-09-06,22:41:01 | INFO | wd: 0.2
|
| 449 |
+
2024-09-06,22:41:01 | INFO | workers: 4
|
| 450 |
+
2024-09-06,22:41:01 | INFO | world_size: 2
|
| 451 |
+
2024-09-06,22:41:01 | INFO | zeroshot_frequency: 2
|
breaking_0.1_trained/10% most difficult/params.txt
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
accum_freq: 1
|
| 2 |
+
aug_cfg: {}
|
| 3 |
+
batch_size: 2048
|
| 4 |
+
beta1: 0.9
|
| 5 |
+
beta2: 0.98
|
| 6 |
+
checkpoint_path: /home/breaking_0.1_trained/10% most difficult/checkpoints
|
| 7 |
+
coca_caption_loss_weight: 2.0
|
| 8 |
+
coca_contrastive_loss_weight: 1.0
|
| 9 |
+
copy_codebase: False
|
| 10 |
+
csv_caption_key: title
|
| 11 |
+
csv_img_key: filepath
|
| 12 |
+
csv_separator:
|
| 13 |
+
dataset_resampled: True
|
| 14 |
+
dataset_type: webdataset
|
| 15 |
+
ddp_static_graph: True
|
| 16 |
+
debug: False
|
| 17 |
+
delete_previous_checkpoint: False
|
| 18 |
+
device: cuda:0
|
| 19 |
+
dist_backend: nccl
|
| 20 |
+
dist_url: env://
|
| 21 |
+
distill: False
|
| 22 |
+
distill_model: None
|
| 23 |
+
distill_pretrained: None
|
| 24 |
+
distributed: True
|
| 25 |
+
epochs: 5
|
| 26 |
+
epochs_cooldown: None
|
| 27 |
+
eps: 1e-06
|
| 28 |
+
force_custom_text: False
|
| 29 |
+
force_image_size: None
|
| 30 |
+
force_patch_dropout: None
|
| 31 |
+
force_quick_gelu: False
|
| 32 |
+
gather_with_grad: True
|
| 33 |
+
grad_checkpointing: True
|
| 34 |
+
grad_clip_norm: None
|
| 35 |
+
horovod: False
|
| 36 |
+
image_mean: None
|
| 37 |
+
image_std: None
|
| 38 |
+
imagenet_v2: None
|
| 39 |
+
imagenet_val: None
|
| 40 |
+
local_loss: True
|
| 41 |
+
local_rank: 0
|
| 42 |
+
lock_image: False
|
| 43 |
+
lock_image_freeze_bn_stats: False
|
| 44 |
+
lock_image_unlocked_groups: 0
|
| 45 |
+
lock_text: False
|
| 46 |
+
lock_text_freeze_layer_norm: False
|
| 47 |
+
lock_text_unlocked_layers: 0
|
| 48 |
+
log_every_n_steps: 100
|
| 49 |
+
log_level: 20
|
| 50 |
+
log_local: False
|
| 51 |
+
log_path: /home/breaking_0.1_trained/10% most difficult/out.log
|
| 52 |
+
logs: /home/breaking_0.1_trained
|
| 53 |
+
lr: 0.0005
|
| 54 |
+
lr_cooldown_end: 0.0
|
| 55 |
+
lr_cooldown_power: 1.0
|
| 56 |
+
lr_scheduler: cosine
|
| 57 |
+
model: ViT-B-32
|
| 58 |
+
name: 10% most difficult
|
| 59 |
+
no_set_device_rank: False
|
| 60 |
+
precision: amp
|
| 61 |
+
pretrained:
|
| 62 |
+
pretrained_image: False
|
| 63 |
+
rank: 0
|
| 64 |
+
remote_sync: None
|
| 65 |
+
remote_sync_frequency: 300
|
| 66 |
+
remote_sync_protocol: s3
|
| 67 |
+
report_to: wandb
|
| 68 |
+
resume: None
|
| 69 |
+
save_frequency: 0
|
| 70 |
+
save_most_recent: True
|
| 71 |
+
seed: 0
|
| 72 |
+
skip_scheduler: False
|
| 73 |
+
tensorboard: False
|
| 74 |
+
tensorboard_path:
|
| 75 |
+
torchscript: False
|
| 76 |
+
trace: False
|
| 77 |
+
train_data: /home/breaking_0.1/{00000000..00000127}.tar
|
| 78 |
+
train_data_upsampling_factors: None
|
| 79 |
+
train_num_samples: 2560000
|
| 80 |
+
use_bn_sync: False
|
| 81 |
+
val_data: None
|
| 82 |
+
val_frequency: 1
|
| 83 |
+
val_num_samples: None
|
| 84 |
+
wandb: True
|
| 85 |
+
wandb_notes:
|
| 86 |
+
wandb_project_name: clip_text_hq_clusters
|
| 87 |
+
warmup: 500
|
| 88 |
+
wd: 0.2
|
| 89 |
+
workers: 4
|
| 90 |
+
world_size: 2
|
| 91 |
+
zeroshot_frequency: 2
|
breaking_0.1_trained/10_most_difficult/checkpoints/epoch_5.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8dc3db112ee0ce9ba46da71fd45c067537628014f0c2d1463d42dda2d96c992e
|
| 3 |
+
size 1815701601
|
breaking_0.1_trained/10_most_difficult/checkpoints/epoch_latest.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bed19451924aa206e696e1e29d68d99ae1cf27e9dd19f135d97d1c95ec5c378f
|
| 3 |
+
size 1815639289
|
breaking_0.1_trained/10_most_difficult/info.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:53c7b7cf9407023e5653d0bb23cb37decb80bcebd0372f7d951380709e9b73b2
|
| 3 |
+
size 321
|
breaking_0.1_trained/10_most_difficult/out.log
ADDED
|
@@ -0,0 +1,497 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2024-09-06,22:41:22 | INFO | No latest resume checkpoint found in /home/breaking_0.1_trained/10_most_difficult/checkpoints.
|
| 2 |
+
2024-09-06,22:41:24 | INFO | Running in distributed mode with multiple processes. Device: cuda:0.Process (global: 0, local 0), total 2.
|
| 3 |
+
2024-09-06,22:41:24 | INFO | Loaded ViT-B-32 model config.
|
| 4 |
+
2024-09-06,22:41:25 | INFO | Model:
|
| 5 |
+
2024-09-06,22:41:25 | INFO | CLIP(
|
| 6 |
+
(visual): VisionTransformer(
|
| 7 |
+
(patchnorm_pre_ln): Identity()
|
| 8 |
+
(conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
|
| 9 |
+
(patch_dropout): Identity()
|
| 10 |
+
(ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 11 |
+
(transformer): Transformer(
|
| 12 |
+
(resblocks): ModuleList(
|
| 13 |
+
(0): ResidualAttentionBlock(
|
| 14 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 15 |
+
(attn): MultiheadAttention(
|
| 16 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 17 |
+
)
|
| 18 |
+
(ls_1): Identity()
|
| 19 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 20 |
+
(mlp): Sequential(
|
| 21 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 22 |
+
(gelu): GELU(approximate='none')
|
| 23 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 24 |
+
)
|
| 25 |
+
(ls_2): Identity()
|
| 26 |
+
)
|
| 27 |
+
(1): ResidualAttentionBlock(
|
| 28 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 29 |
+
(attn): MultiheadAttention(
|
| 30 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 31 |
+
)
|
| 32 |
+
(ls_1): Identity()
|
| 33 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 34 |
+
(mlp): Sequential(
|
| 35 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 36 |
+
(gelu): GELU(approximate='none')
|
| 37 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 38 |
+
)
|
| 39 |
+
(ls_2): Identity()
|
| 40 |
+
)
|
| 41 |
+
(2): ResidualAttentionBlock(
|
| 42 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 43 |
+
(attn): MultiheadAttention(
|
| 44 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 45 |
+
)
|
| 46 |
+
(ls_1): Identity()
|
| 47 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 48 |
+
(mlp): Sequential(
|
| 49 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 50 |
+
(gelu): GELU(approximate='none')
|
| 51 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 52 |
+
)
|
| 53 |
+
(ls_2): Identity()
|
| 54 |
+
)
|
| 55 |
+
(3): ResidualAttentionBlock(
|
| 56 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 57 |
+
(attn): MultiheadAttention(
|
| 58 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 59 |
+
)
|
| 60 |
+
(ls_1): Identity()
|
| 61 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 62 |
+
(mlp): Sequential(
|
| 63 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 64 |
+
(gelu): GELU(approximate='none')
|
| 65 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 66 |
+
)
|
| 67 |
+
(ls_2): Identity()
|
| 68 |
+
)
|
| 69 |
+
(4): ResidualAttentionBlock(
|
| 70 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 71 |
+
(attn): MultiheadAttention(
|
| 72 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 73 |
+
)
|
| 74 |
+
(ls_1): Identity()
|
| 75 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 76 |
+
(mlp): Sequential(
|
| 77 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 78 |
+
(gelu): GELU(approximate='none')
|
| 79 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 80 |
+
)
|
| 81 |
+
(ls_2): Identity()
|
| 82 |
+
)
|
| 83 |
+
(5): ResidualAttentionBlock(
|
| 84 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 85 |
+
(attn): MultiheadAttention(
|
| 86 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 87 |
+
)
|
| 88 |
+
(ls_1): Identity()
|
| 89 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 90 |
+
(mlp): Sequential(
|
| 91 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 92 |
+
(gelu): GELU(approximate='none')
|
| 93 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 94 |
+
)
|
| 95 |
+
(ls_2): Identity()
|
| 96 |
+
)
|
| 97 |
+
(6): ResidualAttentionBlock(
|
| 98 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 99 |
+
(attn): MultiheadAttention(
|
| 100 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 101 |
+
)
|
| 102 |
+
(ls_1): Identity()
|
| 103 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 104 |
+
(mlp): Sequential(
|
| 105 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 106 |
+
(gelu): GELU(approximate='none')
|
| 107 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 108 |
+
)
|
| 109 |
+
(ls_2): Identity()
|
| 110 |
+
)
|
| 111 |
+
(7): ResidualAttentionBlock(
|
| 112 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 113 |
+
(attn): MultiheadAttention(
|
| 114 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 115 |
+
)
|
| 116 |
+
(ls_1): Identity()
|
| 117 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 118 |
+
(mlp): Sequential(
|
| 119 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 120 |
+
(gelu): GELU(approximate='none')
|
| 121 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 122 |
+
)
|
| 123 |
+
(ls_2): Identity()
|
| 124 |
+
)
|
| 125 |
+
(8): ResidualAttentionBlock(
|
| 126 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 127 |
+
(attn): MultiheadAttention(
|
| 128 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 129 |
+
)
|
| 130 |
+
(ls_1): Identity()
|
| 131 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 132 |
+
(mlp): Sequential(
|
| 133 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 134 |
+
(gelu): GELU(approximate='none')
|
| 135 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 136 |
+
)
|
| 137 |
+
(ls_2): Identity()
|
| 138 |
+
)
|
| 139 |
+
(9): ResidualAttentionBlock(
|
| 140 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 141 |
+
(attn): MultiheadAttention(
|
| 142 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 143 |
+
)
|
| 144 |
+
(ls_1): Identity()
|
| 145 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 146 |
+
(mlp): Sequential(
|
| 147 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 148 |
+
(gelu): GELU(approximate='none')
|
| 149 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 150 |
+
)
|
| 151 |
+
(ls_2): Identity()
|
| 152 |
+
)
|
| 153 |
+
(10): ResidualAttentionBlock(
|
| 154 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 155 |
+
(attn): MultiheadAttention(
|
| 156 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 157 |
+
)
|
| 158 |
+
(ls_1): Identity()
|
| 159 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 160 |
+
(mlp): Sequential(
|
| 161 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 162 |
+
(gelu): GELU(approximate='none')
|
| 163 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 164 |
+
)
|
| 165 |
+
(ls_2): Identity()
|
| 166 |
+
)
|
| 167 |
+
(11): ResidualAttentionBlock(
|
| 168 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 169 |
+
(attn): MultiheadAttention(
|
| 170 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 171 |
+
)
|
| 172 |
+
(ls_1): Identity()
|
| 173 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 174 |
+
(mlp): Sequential(
|
| 175 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 176 |
+
(gelu): GELU(approximate='none')
|
| 177 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 178 |
+
)
|
| 179 |
+
(ls_2): Identity()
|
| 180 |
+
)
|
| 181 |
+
)
|
| 182 |
+
)
|
| 183 |
+
(ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 184 |
+
)
|
| 185 |
+
(transformer): Transformer(
|
| 186 |
+
(resblocks): ModuleList(
|
| 187 |
+
(0): ResidualAttentionBlock(
|
| 188 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 189 |
+
(attn): MultiheadAttention(
|
| 190 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 191 |
+
)
|
| 192 |
+
(ls_1): Identity()
|
| 193 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 194 |
+
(mlp): Sequential(
|
| 195 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 196 |
+
(gelu): GELU(approximate='none')
|
| 197 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 198 |
+
)
|
| 199 |
+
(ls_2): Identity()
|
| 200 |
+
)
|
| 201 |
+
(1): ResidualAttentionBlock(
|
| 202 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 203 |
+
(attn): MultiheadAttention(
|
| 204 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 205 |
+
)
|
| 206 |
+
(ls_1): Identity()
|
| 207 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 208 |
+
(mlp): Sequential(
|
| 209 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 210 |
+
(gelu): GELU(approximate='none')
|
| 211 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 212 |
+
)
|
| 213 |
+
(ls_2): Identity()
|
| 214 |
+
)
|
| 215 |
+
(2): ResidualAttentionBlock(
|
| 216 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 217 |
+
(attn): MultiheadAttention(
|
| 218 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 219 |
+
)
|
| 220 |
+
(ls_1): Identity()
|
| 221 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 222 |
+
(mlp): Sequential(
|
| 223 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 224 |
+
(gelu): GELU(approximate='none')
|
| 225 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 226 |
+
)
|
| 227 |
+
(ls_2): Identity()
|
| 228 |
+
)
|
| 229 |
+
(3): ResidualAttentionBlock(
|
| 230 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 231 |
+
(attn): MultiheadAttention(
|
| 232 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 233 |
+
)
|
| 234 |
+
(ls_1): Identity()
|
| 235 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 236 |
+
(mlp): Sequential(
|
| 237 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 238 |
+
(gelu): GELU(approximate='none')
|
| 239 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 240 |
+
)
|
| 241 |
+
(ls_2): Identity()
|
| 242 |
+
)
|
| 243 |
+
(4): ResidualAttentionBlock(
|
| 244 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 245 |
+
(attn): MultiheadAttention(
|
| 246 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 247 |
+
)
|
| 248 |
+
(ls_1): Identity()
|
| 249 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 250 |
+
(mlp): Sequential(
|
| 251 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 252 |
+
(gelu): GELU(approximate='none')
|
| 253 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 254 |
+
)
|
| 255 |
+
(ls_2): Identity()
|
| 256 |
+
)
|
| 257 |
+
(5): ResidualAttentionBlock(
|
| 258 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 259 |
+
(attn): MultiheadAttention(
|
| 260 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 261 |
+
)
|
| 262 |
+
(ls_1): Identity()
|
| 263 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 264 |
+
(mlp): Sequential(
|
| 265 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 266 |
+
(gelu): GELU(approximate='none')
|
| 267 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 268 |
+
)
|
| 269 |
+
(ls_2): Identity()
|
| 270 |
+
)
|
| 271 |
+
(6): ResidualAttentionBlock(
|
| 272 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 273 |
+
(attn): MultiheadAttention(
|
| 274 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 275 |
+
)
|
| 276 |
+
(ls_1): Identity()
|
| 277 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 278 |
+
(mlp): Sequential(
|
| 279 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 280 |
+
(gelu): GELU(approximate='none')
|
| 281 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 282 |
+
)
|
| 283 |
+
(ls_2): Identity()
|
| 284 |
+
)
|
| 285 |
+
(7): ResidualAttentionBlock(
|
| 286 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 287 |
+
(attn): MultiheadAttention(
|
| 288 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 289 |
+
)
|
| 290 |
+
(ls_1): Identity()
|
| 291 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 292 |
+
(mlp): Sequential(
|
| 293 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 294 |
+
(gelu): GELU(approximate='none')
|
| 295 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 296 |
+
)
|
| 297 |
+
(ls_2): Identity()
|
| 298 |
+
)
|
| 299 |
+
(8): ResidualAttentionBlock(
|
| 300 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 301 |
+
(attn): MultiheadAttention(
|
| 302 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 303 |
+
)
|
| 304 |
+
(ls_1): Identity()
|
| 305 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 306 |
+
(mlp): Sequential(
|
| 307 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 308 |
+
(gelu): GELU(approximate='none')
|
| 309 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 310 |
+
)
|
| 311 |
+
(ls_2): Identity()
|
| 312 |
+
)
|
| 313 |
+
(9): ResidualAttentionBlock(
|
| 314 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 315 |
+
(attn): MultiheadAttention(
|
| 316 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 317 |
+
)
|
| 318 |
+
(ls_1): Identity()
|
| 319 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 320 |
+
(mlp): Sequential(
|
| 321 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 322 |
+
(gelu): GELU(approximate='none')
|
| 323 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 324 |
+
)
|
| 325 |
+
(ls_2): Identity()
|
| 326 |
+
)
|
| 327 |
+
(10): ResidualAttentionBlock(
|
| 328 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 329 |
+
(attn): MultiheadAttention(
|
| 330 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 331 |
+
)
|
| 332 |
+
(ls_1): Identity()
|
| 333 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 334 |
+
(mlp): Sequential(
|
| 335 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 336 |
+
(gelu): GELU(approximate='none')
|
| 337 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 338 |
+
)
|
| 339 |
+
(ls_2): Identity()
|
| 340 |
+
)
|
| 341 |
+
(11): ResidualAttentionBlock(
|
| 342 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 343 |
+
(attn): MultiheadAttention(
|
| 344 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 345 |
+
)
|
| 346 |
+
(ls_1): Identity()
|
| 347 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 348 |
+
(mlp): Sequential(
|
| 349 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 350 |
+
(gelu): GELU(approximate='none')
|
| 351 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 352 |
+
)
|
| 353 |
+
(ls_2): Identity()
|
| 354 |
+
)
|
| 355 |
+
)
|
| 356 |
+
)
|
| 357 |
+
(token_embedding): Embedding(49408, 512)
|
| 358 |
+
(ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 359 |
+
)
|
| 360 |
+
2024-09-06,22:41:25 | INFO | Params:
|
| 361 |
+
2024-09-06,22:41:25 | INFO | accum_freq: 1
|
| 362 |
+
2024-09-06,22:41:25 | INFO | aug_cfg: {}
|
| 363 |
+
2024-09-06,22:41:25 | INFO | batch_size: 2048
|
| 364 |
+
2024-09-06,22:41:25 | INFO | beta1: 0.9
|
| 365 |
+
2024-09-06,22:41:25 | INFO | beta2: 0.98
|
| 366 |
+
2024-09-06,22:41:25 | INFO | checkpoint_path: /home/breaking_0.1_trained/10_most_difficult/checkpoints
|
| 367 |
+
2024-09-06,22:41:25 | INFO | coca_caption_loss_weight: 2.0
|
| 368 |
+
2024-09-06,22:41:25 | INFO | coca_contrastive_loss_weight: 1.0
|
| 369 |
+
2024-09-06,22:41:25 | INFO | copy_codebase: False
|
| 370 |
+
2024-09-06,22:41:25 | INFO | csv_caption_key: title
|
| 371 |
+
2024-09-06,22:41:25 | INFO | csv_img_key: filepath
|
| 372 |
+
2024-09-06,22:41:25 | INFO | csv_separator:
|
| 373 |
+
2024-09-06,22:41:25 | INFO | dataset_resampled: True
|
| 374 |
+
2024-09-06,22:41:25 | INFO | dataset_type: webdataset
|
| 375 |
+
2024-09-06,22:41:25 | INFO | ddp_static_graph: True
|
| 376 |
+
2024-09-06,22:41:25 | INFO | debug: False
|
| 377 |
+
2024-09-06,22:41:25 | INFO | delete_previous_checkpoint: False
|
| 378 |
+
2024-09-06,22:41:25 | INFO | device: cuda:0
|
| 379 |
+
2024-09-06,22:41:25 | INFO | dist_backend: nccl
|
| 380 |
+
2024-09-06,22:41:25 | INFO | dist_url: env://
|
| 381 |
+
2024-09-06,22:41:25 | INFO | distill: False
|
| 382 |
+
2024-09-06,22:41:25 | INFO | distill_model: None
|
| 383 |
+
2024-09-06,22:41:25 | INFO | distill_pretrained: None
|
| 384 |
+
2024-09-06,22:41:25 | INFO | distributed: True
|
| 385 |
+
2024-09-06,22:41:25 | INFO | epochs: 5
|
| 386 |
+
2024-09-06,22:41:25 | INFO | epochs_cooldown: None
|
| 387 |
+
2024-09-06,22:41:25 | INFO | eps: 1e-06
|
| 388 |
+
2024-09-06,22:41:25 | INFO | force_custom_text: False
|
| 389 |
+
2024-09-06,22:41:25 | INFO | force_image_size: None
|
| 390 |
+
2024-09-06,22:41:25 | INFO | force_patch_dropout: None
|
| 391 |
+
2024-09-06,22:41:25 | INFO | force_quick_gelu: False
|
| 392 |
+
2024-09-06,22:41:25 | INFO | gather_with_grad: True
|
| 393 |
+
2024-09-06,22:41:25 | INFO | grad_checkpointing: True
|
| 394 |
+
2024-09-06,22:41:25 | INFO | grad_clip_norm: None
|
| 395 |
+
2024-09-06,22:41:25 | INFO | horovod: False
|
| 396 |
+
2024-09-06,22:41:25 | INFO | image_mean: None
|
| 397 |
+
2024-09-06,22:41:25 | INFO | image_std: None
|
| 398 |
+
2024-09-06,22:41:25 | INFO | imagenet_v2: None
|
| 399 |
+
2024-09-06,22:41:25 | INFO | imagenet_val: None
|
| 400 |
+
2024-09-06,22:41:25 | INFO | local_loss: True
|
| 401 |
+
2024-09-06,22:41:25 | INFO | local_rank: 0
|
| 402 |
+
2024-09-06,22:41:25 | INFO | lock_image: False
|
| 403 |
+
2024-09-06,22:41:25 | INFO | lock_image_freeze_bn_stats: False
|
| 404 |
+
2024-09-06,22:41:25 | INFO | lock_image_unlocked_groups: 0
|
| 405 |
+
2024-09-06,22:41:25 | INFO | lock_text: False
|
| 406 |
+
2024-09-06,22:41:25 | INFO | lock_text_freeze_layer_norm: False
|
| 407 |
+
2024-09-06,22:41:25 | INFO | lock_text_unlocked_layers: 0
|
| 408 |
+
2024-09-06,22:41:25 | INFO | log_every_n_steps: 100
|
| 409 |
+
2024-09-06,22:41:25 | INFO | log_level: 20
|
| 410 |
+
2024-09-06,22:41:25 | INFO | log_local: False
|
| 411 |
+
2024-09-06,22:41:25 | INFO | log_path: /home/breaking_0.1_trained/10_most_difficult/out.log
|
| 412 |
+
2024-09-06,22:41:25 | INFO | logs: /home/breaking_0.1_trained
|
| 413 |
+
2024-09-06,22:41:25 | INFO | lr: 0.0005
|
| 414 |
+
2024-09-06,22:41:25 | INFO | lr_cooldown_end: 0.0
|
| 415 |
+
2024-09-06,22:41:25 | INFO | lr_cooldown_power: 1.0
|
| 416 |
+
2024-09-06,22:41:25 | INFO | lr_scheduler: cosine
|
| 417 |
+
2024-09-06,22:41:25 | INFO | model: ViT-B-32
|
| 418 |
+
2024-09-06,22:41:25 | INFO | name: 10_most_difficult
|
| 419 |
+
2024-09-06,22:41:25 | INFO | no_set_device_rank: False
|
| 420 |
+
2024-09-06,22:41:25 | INFO | precision: amp
|
| 421 |
+
2024-09-06,22:41:25 | INFO | pretrained:
|
| 422 |
+
2024-09-06,22:41:25 | INFO | pretrained_image: False
|
| 423 |
+
2024-09-06,22:41:25 | INFO | rank: 0
|
| 424 |
+
2024-09-06,22:41:25 | INFO | remote_sync: None
|
| 425 |
+
2024-09-06,22:41:25 | INFO | remote_sync_frequency: 300
|
| 426 |
+
2024-09-06,22:41:25 | INFO | remote_sync_protocol: s3
|
| 427 |
+
2024-09-06,22:41:25 | INFO | report_to: wandb
|
| 428 |
+
2024-09-06,22:41:25 | INFO | resume: None
|
| 429 |
+
2024-09-06,22:41:25 | INFO | save_frequency: 0
|
| 430 |
+
2024-09-06,22:41:25 | INFO | save_most_recent: True
|
| 431 |
+
2024-09-06,22:41:25 | INFO | seed: 0
|
| 432 |
+
2024-09-06,22:41:25 | INFO | skip_scheduler: False
|
| 433 |
+
2024-09-06,22:41:25 | INFO | tensorboard: False
|
| 434 |
+
2024-09-06,22:41:25 | INFO | tensorboard_path:
|
| 435 |
+
2024-09-06,22:41:25 | INFO | torchscript: False
|
| 436 |
+
2024-09-06,22:41:25 | INFO | trace: False
|
| 437 |
+
2024-09-06,22:41:25 | INFO | train_data: /home/breaking_0.1/{00000000..00000127}.tar
|
| 438 |
+
2024-09-06,22:41:25 | INFO | train_data_upsampling_factors: None
|
| 439 |
+
2024-09-06,22:41:25 | INFO | train_num_samples: 2560000
|
| 440 |
+
2024-09-06,22:41:25 | INFO | use_bn_sync: False
|
| 441 |
+
2024-09-06,22:41:25 | INFO | val_data: None
|
| 442 |
+
2024-09-06,22:41:25 | INFO | val_frequency: 1
|
| 443 |
+
2024-09-06,22:41:25 | INFO | val_num_samples: None
|
| 444 |
+
2024-09-06,22:41:25 | INFO | wandb: True
|
| 445 |
+
2024-09-06,22:41:25 | INFO | wandb_notes:
|
| 446 |
+
2024-09-06,22:41:25 | INFO | wandb_project_name: clip_text_hq_clusters
|
| 447 |
+
2024-09-06,22:41:25 | INFO | warmup: 500
|
| 448 |
+
2024-09-06,22:41:25 | INFO | wd: 0.2
|
| 449 |
+
2024-09-06,22:41:25 | INFO | workers: 4
|
| 450 |
+
2024-09-06,22:41:25 | INFO | world_size: 2
|
| 451 |
+
2024-09-06,22:41:25 | INFO | zeroshot_frequency: 2
|
| 452 |
+
2024-09-06,22:41:34 | INFO | Start epoch 0
|
| 453 |
+
2024-09-06,22:41:51 | INFO | Train Epoch: 0 [ 4096/2572288 (0%)] Data (t): 11.911 Batch (t): 16.649, 246.023/s, 123.011/s/gpu LR: 0.000001 Logit Scale: 14.286 Contrastive_loss: 8.3776 (8.3776) Loss: 8.3776 (8.3776)
|
| 454 |
+
2024-09-06,22:41:54 | INFO | Reducer buckets have been rebuilt in this iteration.
|
| 455 |
+
2024-09-06,22:46:11 | INFO | Train Epoch: 0 [ 413696/2572288 (16%)] Data (t): 0.555 Batch (t): 2.608, 1572.92/s, 786.459/s/gpu LR: 0.000101 Logit Scale: 14.264 Contrastive_loss: 8.2202 (8.2989) Loss: 8.2202 (8.2989)
|
| 456 |
+
2024-09-06,22:50:33 | INFO | Train Epoch: 0 [ 823296/2572288 (32%)] Data (t): 0.568 Batch (t): 2.616, 1572.28/s, 786.140/s/gpu LR: 0.000201 Logit Scale: 14.244 Contrastive_loss: 7.9768 (8.1915) Loss: 7.9768 (8.1915)
|
| 457 |
+
2024-09-06,22:54:55 | INFO | Train Epoch: 0 [1232896/2572288 (48%)] Data (t): 0.570 Batch (t): 2.618, 1560.51/s, 780.257/s/gpu LR: 0.000301 Logit Scale: 14.227 Contrastive_loss: 7.9563 (8.1327) Loss: 7.9563 (8.1327)
|
| 458 |
+
2024-09-06,22:59:17 | INFO | Train Epoch: 0 [1642496/2572288 (64%)] Data (t): 0.574 Batch (t): 2.623, 1564.50/s, 782.249/s/gpu LR: 0.000401 Logit Scale: 14.205 Contrastive_loss: 7.9317 (8.0925) Loss: 7.9317 (8.0925)
|
| 459 |
+
2024-09-06,23:03:39 | INFO | Train Epoch: 0 [2052096/2572288 (80%)] Data (t): 0.571 Batch (t): 2.620, 1561.55/s, 780.776/s/gpu LR: 0.000500 Logit Scale: 14.192 Contrastive_loss: 7.7846 (8.0412) Loss: 7.7846 (8.0412)
|
| 460 |
+
2024-09-06,23:08:01 | INFO | Train Epoch: 0 [2461696/2572288 (96%)] Data (t): 0.569 Batch (t): 2.619, 1563.18/s, 781.589/s/gpu LR: 0.000498 Logit Scale: 14.198 Contrastive_loss: 6.8264 (7.8676) Loss: 6.8264 (7.8676)
|
| 461 |
+
2024-09-06,23:09:12 | INFO | Train Epoch: 0 [2572288/2572288 (100%)] Data (t): 0.569 Batch (t): 2.618, 1573.70/s, 786.852/s/gpu LR: 0.000497 Logit Scale: 14.199 Contrastive_loss: 7.5857 (7.8324) Loss: 7.5857 (7.8324)
|
| 462 |
+
2024-09-06,23:09:14 | INFO | Start epoch 1
|
| 463 |
+
2024-09-06,23:09:26 | INFO | Train Epoch: 1 [ 4096/2572288 (0%)] Data (t): 9.742 Batch (t): 11.787, 347.508/s, 173.754/s/gpu LR: 0.000497 Logit Scale: 14.199 Contrastive_loss: 7.8159 (7.8159) Loss: 7.8159 (7.8159)
|
| 464 |
+
2024-09-06,23:13:45 | INFO | Train Epoch: 1 [ 413696/2572288 (16%)] Data (t): 0.536 Batch (t): 2.594, 1566.93/s, 783.467/s/gpu LR: 0.000491 Logit Scale: 14.223 Contrastive_loss: 6.1751 (6.9955) Loss: 6.1751 (6.9955)
|
| 465 |
+
2024-09-06,23:18:08 | INFO | Train Epoch: 1 [ 823296/2572288 (32%)] Data (t): 0.571 Batch (t): 2.622, 1565.08/s, 782.541/s/gpu LR: 0.000481 Logit Scale: 14.267 Contrastive_loss: 6.8320 (6.9410) Loss: 6.8320 (6.9410)
|
| 466 |
+
2024-09-06,23:22:30 | INFO | Train Epoch: 1 [1232896/2572288 (48%)] Data (t): 0.571 Batch (t): 2.621, 1560.41/s, 780.206/s/gpu LR: 0.000468 Logit Scale: 14.319 Contrastive_loss: 6.7536 (6.8941) Loss: 6.7536 (6.8941)
|
| 467 |
+
2024-09-06,23:26:52 | INFO | Train Epoch: 1 [1642496/2572288 (64%)] Data (t): 0.572 Batch (t): 2.625, 1564.15/s, 782.075/s/gpu LR: 0.000452 Logit Scale: 14.415 Contrastive_loss: 7.1712 (6.9495) Loss: 7.1712 (6.9495)
|
| 468 |
+
2024-09-06,23:31:14 | INFO | Train Epoch: 1 [2052096/2572288 (80%)] Data (t): 0.567 Batch (t): 2.618, 1564.96/s, 782.479/s/gpu LR: 0.000433 Logit Scale: 14.531 Contrastive_loss: 7.3828 (7.0218) Loss: 7.3828 (7.0218)
|
| 469 |
+
2024-09-06,23:35:36 | INFO | Train Epoch: 1 [2461696/2572288 (96%)] Data (t): 0.570 Batch (t): 2.619, 1559.05/s, 779.526/s/gpu LR: 0.000412 Logit Scale: 14.685 Contrastive_loss: 5.3914 (6.7888) Loss: 5.3914 (6.7888)
|
| 470 |
+
2024-09-06,23:36:47 | INFO | Train Epoch: 1 [2572288/2572288 (100%)] Data (t): 0.569 Batch (t): 2.618, 1573.02/s, 786.512/s/gpu LR: 0.000406 Logit Scale: 14.750 Contrastive_loss: 6.8457 (6.7959) Loss: 6.8457 (6.7959)
|
| 471 |
+
2024-09-06,23:36:49 | INFO | Start epoch 2
|
| 472 |
+
2024-09-06,23:37:01 | INFO | Train Epoch: 2 [ 4096/2572288 (0%)] Data (t): 9.683 Batch (t): 11.729, 349.234/s, 174.617/s/gpu LR: 0.000405 Logit Scale: 14.752 Contrastive_loss: 3.8208 (3.8208) Loss: 3.8208 (3.8208)
|
| 473 |
+
2024-09-06,23:41:21 | INFO | Train Epoch: 2 [ 413696/2572288 (16%)] Data (t): 0.539 Batch (t): 2.596, 1563.69/s, 781.844/s/gpu LR: 0.000381 Logit Scale: 14.930 Contrastive_loss: 5.3136 (4.5672) Loss: 5.3136 (4.5672)
|
| 474 |
+
2024-09-06,23:45:42 | INFO | Train Epoch: 2 [ 823296/2572288 (32%)] Data (t): 0.564 Batch (t): 2.615, 1565.33/s, 782.664/s/gpu LR: 0.000355 Logit Scale: 15.116 Contrastive_loss: 6.5801 (5.2382) Loss: 6.5801 (5.2382)
|
| 475 |
+
2024-09-06,23:50:04 | INFO | Train Epoch: 2 [1232896/2572288 (48%)] Data (t): 0.571 Batch (t): 2.621, 1569.26/s, 784.629/s/gpu LR: 0.000327 Logit Scale: 15.352 Contrastive_loss: 3.4862 (4.8002) Loss: 3.4862 (4.8002)
|
| 476 |
+
2024-09-06,23:54:26 | INFO | Train Epoch: 2 [1642496/2572288 (64%)] Data (t): 0.569 Batch (t): 2.619, 1571.61/s, 785.806/s/gpu LR: 0.000298 Logit Scale: 15.478 Contrastive_loss: 6.0303 (5.0462) Loss: 6.0303 (5.0462)
|
| 477 |
+
2024-09-06,23:58:48 | INFO | Train Epoch: 2 [2052096/2572288 (80%)] Data (t): 0.568 Batch (t): 2.618, 1564.29/s, 782.143/s/gpu LR: 0.000269 Logit Scale: 15.784 Contrastive_loss: 6.0015 (5.2054) Loss: 6.0015 (5.2054)
|
| 478 |
+
2024-09-07,00:03:10 | INFO | Train Epoch: 2 [2461696/2572288 (96%)] Data (t): 0.573 Batch (t): 2.624, 1562.22/s, 781.111/s/gpu LR: 0.000239 Logit Scale: 16.079 Contrastive_loss: 3.7522 (4.9978) Loss: 3.7522 (4.9978)
|
| 479 |
+
2024-09-07,00:04:21 | INFO | Train Epoch: 2 [2572288/2572288 (100%)] Data (t): 0.569 Batch (t): 2.618, 1581.00/s, 790.499/s/gpu LR: 0.000231 Logit Scale: 16.160 Contrastive_loss: 3.6242 (4.8261) Loss: 3.6242 (4.8261)
|
| 480 |
+
2024-09-07,00:04:24 | INFO | Start epoch 3
|
| 481 |
+
2024-09-07,00:04:35 | INFO | Train Epoch: 3 [ 4096/2572288 (0%)] Data (t): 9.688 Batch (t): 11.733, 349.111/s, 174.556/s/gpu LR: 0.000231 Logit Scale: 16.162 Contrastive_loss: 1.8643 (1.8643) Loss: 1.8643 (1.8643)
|
| 482 |
+
2024-09-07,00:08:56 | INFO | Train Epoch: 3 [ 413696/2572288 (16%)] Data (t): 0.545 Batch (t): 2.604, 1559.72/s, 779.861/s/gpu LR: 0.000202 Logit Scale: 16.406 Contrastive_loss: 4.5089 (3.1866) Loss: 4.5089 (3.1866)
|
| 483 |
+
2024-09-07,00:13:18 | INFO | Train Epoch: 3 [ 823296/2572288 (32%)] Data (t): 0.566 Batch (t): 2.618, 1566.12/s, 783.061/s/gpu LR: 0.000173 Logit Scale: 16.650 Contrastive_loss: 2.5566 (2.9766) Loss: 2.5566 (2.9766)
|
| 484 |
+
2024-09-07,00:17:40 | INFO | Train Epoch: 3 [1232896/2572288 (48%)] Data (t): 0.568 Batch (t): 2.619, 1562.05/s, 781.024/s/gpu LR: 0.000145 Logit Scale: 16.830 Contrastive_loss: 3.5824 (3.1280) Loss: 3.5824 (3.1280)
|
| 485 |
+
2024-09-07,00:22:01 | INFO | Train Epoch: 3 [1642496/2572288 (64%)] Data (t): 0.566 Batch (t): 2.618, 1560.91/s, 780.456/s/gpu LR: 0.000119 Logit Scale: 16.987 Contrastive_loss: 3.5628 (3.2150) Loss: 3.5628 (3.2150)
|
| 486 |
+
2024-09-07,00:26:23 | INFO | Train Epoch: 3 [2052096/2572288 (80%)] Data (t): 0.569 Batch (t): 2.620, 1568.81/s, 784.403/s/gpu LR: 0.000095 Logit Scale: 17.164 Contrastive_loss: 2.0013 (3.0127) Loss: 2.0013 (3.0127)
|
| 487 |
+
2024-09-07,00:30:45 | INFO | Train Epoch: 3 [2461696/2572288 (96%)] Data (t): 0.567 Batch (t): 2.619, 1560.00/s, 779.999/s/gpu LR: 0.000072 Logit Scale: 17.312 Contrastive_loss: 2.4090 (2.9265) Loss: 2.4090 (2.9265)
|
| 488 |
+
2024-09-07,00:31:56 | INFO | Train Epoch: 3 [2572288/2572288 (100%)] Data (t): 0.563 Batch (t): 2.616, 1573.45/s, 786.724/s/gpu LR: 0.000067 Logit Scale: 17.344 Contrastive_loss: 2.9960 (2.9352) Loss: 2.9960 (2.9352)
|
| 489 |
+
2024-09-07,00:31:59 | INFO | Start epoch 4
|
| 490 |
+
2024-09-07,00:32:11 | INFO | Train Epoch: 4 [ 4096/2572288 (0%)] Data (t): 9.713 Batch (t): 11.759, 348.328/s, 174.164/s/gpu LR: 0.000067 Logit Scale: 17.345 Contrastive_loss: 1.7169 (1.7169) Loss: 1.7169 (1.7169)
|
| 491 |
+
2024-09-07,00:36:31 | INFO | Train Epoch: 4 [ 413696/2572288 (16%)] Data (t): 0.543 Batch (t): 2.602, 1569.60/s, 784.799/s/gpu LR: 0.000048 Logit Scale: 17.440 Contrastive_loss: 1.7550 (1.7359) Loss: 1.7550 (1.7359)
|
| 492 |
+
2024-09-07,00:40:52 | INFO | Train Epoch: 4 [ 823296/2572288 (32%)] Data (t): 0.563 Batch (t): 2.615, 1566.77/s, 783.385/s/gpu LR: 0.000032 Logit Scale: 17.498 Contrastive_loss: 3.3456 (2.2725) Loss: 3.3456 (2.2725)
|
| 493 |
+
2024-09-07,00:45:14 | INFO | Train Epoch: 4 [1232896/2572288 (48%)] Data (t): 0.567 Batch (t): 2.618, 1564.76/s, 782.378/s/gpu LR: 0.000019 Logit Scale: 17.536 Contrastive_loss: 1.5279 (2.0863) Loss: 1.5279 (2.0863)
|
| 494 |
+
2024-09-07,00:49:36 | INFO | Train Epoch: 4 [1642496/2572288 (64%)] Data (t): 0.565 Batch (t): 2.617, 1562.39/s, 781.195/s/gpu LR: 0.000009 Logit Scale: 17.557 Contrastive_loss: 1.9162 (2.0523) Loss: 1.9162 (2.0523)
|
| 495 |
+
2024-09-07,00:53:57 | INFO | Train Epoch: 4 [2052096/2572288 (80%)] Data (t): 0.565 Batch (t): 2.617, 1566.97/s, 783.487/s/gpu LR: 0.000003 Logit Scale: 17.566 Contrastive_loss: 1.6844 (1.9910) Loss: 1.6844 (1.9910)
|
| 496 |
+
2024-09-07,00:58:19 | INFO | Train Epoch: 4 [2461696/2572288 (96%)] Data (t): 0.563 Batch (t): 2.614, 1561.38/s, 780.692/s/gpu LR: 0.000000 Logit Scale: 17.568 Contrastive_loss: 1.5111 (1.9224) Loss: 1.5111 (1.9224)
|
| 497 |
+
2024-09-07,00:59:29 | INFO | Train Epoch: 4 [2572288/2572288 (100%)] Data (t): 0.559 Batch (t): 2.610, 1575.70/s, 787.852/s/gpu LR: 0.000000 Logit Scale: 17.568 Contrastive_loss: 1.7148 (1.8965) Loss: 1.7148 (1.8965)
|
breaking_0.1_trained/10_most_difficult/params.txt
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
accum_freq: 1
|
| 2 |
+
aug_cfg: {}
|
| 3 |
+
batch_size: 2048
|
| 4 |
+
beta1: 0.9
|
| 5 |
+
beta2: 0.98
|
| 6 |
+
checkpoint_path: /home/breaking_0.1_trained/10_most_difficult/checkpoints
|
| 7 |
+
coca_caption_loss_weight: 2.0
|
| 8 |
+
coca_contrastive_loss_weight: 1.0
|
| 9 |
+
copy_codebase: False
|
| 10 |
+
csv_caption_key: title
|
| 11 |
+
csv_img_key: filepath
|
| 12 |
+
csv_separator:
|
| 13 |
+
dataset_resampled: True
|
| 14 |
+
dataset_type: webdataset
|
| 15 |
+
ddp_static_graph: True
|
| 16 |
+
debug: False
|
| 17 |
+
delete_previous_checkpoint: False
|
| 18 |
+
device: cuda:0
|
| 19 |
+
dist_backend: nccl
|
| 20 |
+
dist_url: env://
|
| 21 |
+
distill: False
|
| 22 |
+
distill_model: None
|
| 23 |
+
distill_pretrained: None
|
| 24 |
+
distributed: True
|
| 25 |
+
epochs: 5
|
| 26 |
+
epochs_cooldown: None
|
| 27 |
+
eps: 1e-06
|
| 28 |
+
force_custom_text: False
|
| 29 |
+
force_image_size: None
|
| 30 |
+
force_patch_dropout: None
|
| 31 |
+
force_quick_gelu: False
|
| 32 |
+
gather_with_grad: True
|
| 33 |
+
grad_checkpointing: True
|
| 34 |
+
grad_clip_norm: None
|
| 35 |
+
horovod: False
|
| 36 |
+
image_mean: None
|
| 37 |
+
image_std: None
|
| 38 |
+
imagenet_v2: None
|
| 39 |
+
imagenet_val: None
|
| 40 |
+
local_loss: True
|
| 41 |
+
local_rank: 0
|
| 42 |
+
lock_image: False
|
| 43 |
+
lock_image_freeze_bn_stats: False
|
| 44 |
+
lock_image_unlocked_groups: 0
|
| 45 |
+
lock_text: False
|
| 46 |
+
lock_text_freeze_layer_norm: False
|
| 47 |
+
lock_text_unlocked_layers: 0
|
| 48 |
+
log_every_n_steps: 100
|
| 49 |
+
log_level: 20
|
| 50 |
+
log_local: False
|
| 51 |
+
log_path: /home/breaking_0.1_trained/10_most_difficult/out.log
|
| 52 |
+
logs: /home/breaking_0.1_trained
|
| 53 |
+
lr: 0.0005
|
| 54 |
+
lr_cooldown_end: 0.0
|
| 55 |
+
lr_cooldown_power: 1.0
|
| 56 |
+
lr_scheduler: cosine
|
| 57 |
+
model: ViT-B-32
|
| 58 |
+
name: 10_most_difficult
|
| 59 |
+
no_set_device_rank: False
|
| 60 |
+
precision: amp
|
| 61 |
+
pretrained:
|
| 62 |
+
pretrained_image: False
|
| 63 |
+
rank: 0
|
| 64 |
+
remote_sync: None
|
| 65 |
+
remote_sync_frequency: 300
|
| 66 |
+
remote_sync_protocol: s3
|
| 67 |
+
report_to: wandb
|
| 68 |
+
resume: None
|
| 69 |
+
save_frequency: 0
|
| 70 |
+
save_most_recent: True
|
| 71 |
+
seed: 0
|
| 72 |
+
skip_scheduler: False
|
| 73 |
+
tensorboard: False
|
| 74 |
+
tensorboard_path:
|
| 75 |
+
torchscript: False
|
| 76 |
+
trace: False
|
| 77 |
+
train_data: /home/breaking_0.1/{00000000..00000127}.tar
|
| 78 |
+
train_data_upsampling_factors: None
|
| 79 |
+
train_num_samples: 2560000
|
| 80 |
+
use_bn_sync: False
|
| 81 |
+
val_data: None
|
| 82 |
+
val_frequency: 1
|
| 83 |
+
val_num_samples: None
|
| 84 |
+
wandb: True
|
| 85 |
+
wandb_notes:
|
| 86 |
+
wandb_project_name: clip_text_hq_clusters
|
| 87 |
+
warmup: 500
|
| 88 |
+
wd: 0.2
|
| 89 |
+
workers: 4
|
| 90 |
+
world_size: 2
|
| 91 |
+
zeroshot_frequency: 2
|
breaking_0.5_trained/50_most_difficult/checkpoints/epoch_5.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d6dc3fc5a1b9f2669eeeab83d7cf4012337c8c3b41a40a9cbd34adfc26b58a76
|
| 3 |
+
size 1815701601
|
breaking_0.5_trained/50_most_difficult/checkpoints/epoch_latest.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0f927368bb04f90b339f66d1e37be977ce4c4426709a53e939de9b61f7691582
|
| 3 |
+
size 1815639289
|
breaking_0.5_trained/50_most_difficult/info.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ad6c5a997437f8ce8f231394e07cecf65ee7f71485eeb6b732474a0144ea7a66
|
| 3 |
+
size 321
|
breaking_0.5_trained/50_most_difficult/out.log
ADDED
|
@@ -0,0 +1,497 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2024-09-07,04:53:20 | INFO | No latest resume checkpoint found in /home/breaking_0.5_trained/50_most_difficult/checkpoints.
|
| 2 |
+
2024-09-07,04:53:22 | INFO | Running in distributed mode with multiple processes. Device: cuda:0.Process (global: 0, local 0), total 2.
|
| 3 |
+
2024-09-07,04:53:22 | INFO | Loaded ViT-B-32 model config.
|
| 4 |
+
2024-09-07,04:53:23 | INFO | Model:
|
| 5 |
+
2024-09-07,04:53:23 | INFO | CLIP(
|
| 6 |
+
(visual): VisionTransformer(
|
| 7 |
+
(patchnorm_pre_ln): Identity()
|
| 8 |
+
(conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
|
| 9 |
+
(patch_dropout): Identity()
|
| 10 |
+
(ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 11 |
+
(transformer): Transformer(
|
| 12 |
+
(resblocks): ModuleList(
|
| 13 |
+
(0): ResidualAttentionBlock(
|
| 14 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 15 |
+
(attn): MultiheadAttention(
|
| 16 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 17 |
+
)
|
| 18 |
+
(ls_1): Identity()
|
| 19 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 20 |
+
(mlp): Sequential(
|
| 21 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 22 |
+
(gelu): GELU(approximate='none')
|
| 23 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 24 |
+
)
|
| 25 |
+
(ls_2): Identity()
|
| 26 |
+
)
|
| 27 |
+
(1): ResidualAttentionBlock(
|
| 28 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 29 |
+
(attn): MultiheadAttention(
|
| 30 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 31 |
+
)
|
| 32 |
+
(ls_1): Identity()
|
| 33 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 34 |
+
(mlp): Sequential(
|
| 35 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 36 |
+
(gelu): GELU(approximate='none')
|
| 37 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 38 |
+
)
|
| 39 |
+
(ls_2): Identity()
|
| 40 |
+
)
|
| 41 |
+
(2): ResidualAttentionBlock(
|
| 42 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 43 |
+
(attn): MultiheadAttention(
|
| 44 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 45 |
+
)
|
| 46 |
+
(ls_1): Identity()
|
| 47 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 48 |
+
(mlp): Sequential(
|
| 49 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 50 |
+
(gelu): GELU(approximate='none')
|
| 51 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 52 |
+
)
|
| 53 |
+
(ls_2): Identity()
|
| 54 |
+
)
|
| 55 |
+
(3): ResidualAttentionBlock(
|
| 56 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 57 |
+
(attn): MultiheadAttention(
|
| 58 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 59 |
+
)
|
| 60 |
+
(ls_1): Identity()
|
| 61 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 62 |
+
(mlp): Sequential(
|
| 63 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 64 |
+
(gelu): GELU(approximate='none')
|
| 65 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 66 |
+
)
|
| 67 |
+
(ls_2): Identity()
|
| 68 |
+
)
|
| 69 |
+
(4): ResidualAttentionBlock(
|
| 70 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 71 |
+
(attn): MultiheadAttention(
|
| 72 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 73 |
+
)
|
| 74 |
+
(ls_1): Identity()
|
| 75 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 76 |
+
(mlp): Sequential(
|
| 77 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 78 |
+
(gelu): GELU(approximate='none')
|
| 79 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 80 |
+
)
|
| 81 |
+
(ls_2): Identity()
|
| 82 |
+
)
|
| 83 |
+
(5): ResidualAttentionBlock(
|
| 84 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 85 |
+
(attn): MultiheadAttention(
|
| 86 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 87 |
+
)
|
| 88 |
+
(ls_1): Identity()
|
| 89 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 90 |
+
(mlp): Sequential(
|
| 91 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 92 |
+
(gelu): GELU(approximate='none')
|
| 93 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 94 |
+
)
|
| 95 |
+
(ls_2): Identity()
|
| 96 |
+
)
|
| 97 |
+
(6): ResidualAttentionBlock(
|
| 98 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 99 |
+
(attn): MultiheadAttention(
|
| 100 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 101 |
+
)
|
| 102 |
+
(ls_1): Identity()
|
| 103 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 104 |
+
(mlp): Sequential(
|
| 105 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 106 |
+
(gelu): GELU(approximate='none')
|
| 107 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 108 |
+
)
|
| 109 |
+
(ls_2): Identity()
|
| 110 |
+
)
|
| 111 |
+
(7): ResidualAttentionBlock(
|
| 112 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 113 |
+
(attn): MultiheadAttention(
|
| 114 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 115 |
+
)
|
| 116 |
+
(ls_1): Identity()
|
| 117 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 118 |
+
(mlp): Sequential(
|
| 119 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 120 |
+
(gelu): GELU(approximate='none')
|
| 121 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 122 |
+
)
|
| 123 |
+
(ls_2): Identity()
|
| 124 |
+
)
|
| 125 |
+
(8): ResidualAttentionBlock(
|
| 126 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 127 |
+
(attn): MultiheadAttention(
|
| 128 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 129 |
+
)
|
| 130 |
+
(ls_1): Identity()
|
| 131 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 132 |
+
(mlp): Sequential(
|
| 133 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 134 |
+
(gelu): GELU(approximate='none')
|
| 135 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 136 |
+
)
|
| 137 |
+
(ls_2): Identity()
|
| 138 |
+
)
|
| 139 |
+
(9): ResidualAttentionBlock(
|
| 140 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 141 |
+
(attn): MultiheadAttention(
|
| 142 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 143 |
+
)
|
| 144 |
+
(ls_1): Identity()
|
| 145 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 146 |
+
(mlp): Sequential(
|
| 147 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 148 |
+
(gelu): GELU(approximate='none')
|
| 149 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 150 |
+
)
|
| 151 |
+
(ls_2): Identity()
|
| 152 |
+
)
|
| 153 |
+
(10): ResidualAttentionBlock(
|
| 154 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 155 |
+
(attn): MultiheadAttention(
|
| 156 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 157 |
+
)
|
| 158 |
+
(ls_1): Identity()
|
| 159 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 160 |
+
(mlp): Sequential(
|
| 161 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 162 |
+
(gelu): GELU(approximate='none')
|
| 163 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 164 |
+
)
|
| 165 |
+
(ls_2): Identity()
|
| 166 |
+
)
|
| 167 |
+
(11): ResidualAttentionBlock(
|
| 168 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 169 |
+
(attn): MultiheadAttention(
|
| 170 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 171 |
+
)
|
| 172 |
+
(ls_1): Identity()
|
| 173 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 174 |
+
(mlp): Sequential(
|
| 175 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 176 |
+
(gelu): GELU(approximate='none')
|
| 177 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 178 |
+
)
|
| 179 |
+
(ls_2): Identity()
|
| 180 |
+
)
|
| 181 |
+
)
|
| 182 |
+
)
|
| 183 |
+
(ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 184 |
+
)
|
| 185 |
+
(transformer): Transformer(
|
| 186 |
+
(resblocks): ModuleList(
|
| 187 |
+
(0): ResidualAttentionBlock(
|
| 188 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 189 |
+
(attn): MultiheadAttention(
|
| 190 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 191 |
+
)
|
| 192 |
+
(ls_1): Identity()
|
| 193 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 194 |
+
(mlp): Sequential(
|
| 195 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 196 |
+
(gelu): GELU(approximate='none')
|
| 197 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 198 |
+
)
|
| 199 |
+
(ls_2): Identity()
|
| 200 |
+
)
|
| 201 |
+
(1): ResidualAttentionBlock(
|
| 202 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 203 |
+
(attn): MultiheadAttention(
|
| 204 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 205 |
+
)
|
| 206 |
+
(ls_1): Identity()
|
| 207 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 208 |
+
(mlp): Sequential(
|
| 209 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 210 |
+
(gelu): GELU(approximate='none')
|
| 211 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 212 |
+
)
|
| 213 |
+
(ls_2): Identity()
|
| 214 |
+
)
|
| 215 |
+
(2): ResidualAttentionBlock(
|
| 216 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 217 |
+
(attn): MultiheadAttention(
|
| 218 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 219 |
+
)
|
| 220 |
+
(ls_1): Identity()
|
| 221 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 222 |
+
(mlp): Sequential(
|
| 223 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 224 |
+
(gelu): GELU(approximate='none')
|
| 225 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 226 |
+
)
|
| 227 |
+
(ls_2): Identity()
|
| 228 |
+
)
|
| 229 |
+
(3): ResidualAttentionBlock(
|
| 230 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 231 |
+
(attn): MultiheadAttention(
|
| 232 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 233 |
+
)
|
| 234 |
+
(ls_1): Identity()
|
| 235 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 236 |
+
(mlp): Sequential(
|
| 237 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 238 |
+
(gelu): GELU(approximate='none')
|
| 239 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 240 |
+
)
|
| 241 |
+
(ls_2): Identity()
|
| 242 |
+
)
|
| 243 |
+
(4): ResidualAttentionBlock(
|
| 244 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 245 |
+
(attn): MultiheadAttention(
|
| 246 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 247 |
+
)
|
| 248 |
+
(ls_1): Identity()
|
| 249 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 250 |
+
(mlp): Sequential(
|
| 251 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 252 |
+
(gelu): GELU(approximate='none')
|
| 253 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 254 |
+
)
|
| 255 |
+
(ls_2): Identity()
|
| 256 |
+
)
|
| 257 |
+
(5): ResidualAttentionBlock(
|
| 258 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 259 |
+
(attn): MultiheadAttention(
|
| 260 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 261 |
+
)
|
| 262 |
+
(ls_1): Identity()
|
| 263 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 264 |
+
(mlp): Sequential(
|
| 265 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 266 |
+
(gelu): GELU(approximate='none')
|
| 267 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 268 |
+
)
|
| 269 |
+
(ls_2): Identity()
|
| 270 |
+
)
|
| 271 |
+
(6): ResidualAttentionBlock(
|
| 272 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 273 |
+
(attn): MultiheadAttention(
|
| 274 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 275 |
+
)
|
| 276 |
+
(ls_1): Identity()
|
| 277 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 278 |
+
(mlp): Sequential(
|
| 279 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 280 |
+
(gelu): GELU(approximate='none')
|
| 281 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 282 |
+
)
|
| 283 |
+
(ls_2): Identity()
|
| 284 |
+
)
|
| 285 |
+
(7): ResidualAttentionBlock(
|
| 286 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 287 |
+
(attn): MultiheadAttention(
|
| 288 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 289 |
+
)
|
| 290 |
+
(ls_1): Identity()
|
| 291 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 292 |
+
(mlp): Sequential(
|
| 293 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 294 |
+
(gelu): GELU(approximate='none')
|
| 295 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 296 |
+
)
|
| 297 |
+
(ls_2): Identity()
|
| 298 |
+
)
|
| 299 |
+
(8): ResidualAttentionBlock(
|
| 300 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 301 |
+
(attn): MultiheadAttention(
|
| 302 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 303 |
+
)
|
| 304 |
+
(ls_1): Identity()
|
| 305 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 306 |
+
(mlp): Sequential(
|
| 307 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 308 |
+
(gelu): GELU(approximate='none')
|
| 309 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 310 |
+
)
|
| 311 |
+
(ls_2): Identity()
|
| 312 |
+
)
|
| 313 |
+
(9): ResidualAttentionBlock(
|
| 314 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 315 |
+
(attn): MultiheadAttention(
|
| 316 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 317 |
+
)
|
| 318 |
+
(ls_1): Identity()
|
| 319 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 320 |
+
(mlp): Sequential(
|
| 321 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 322 |
+
(gelu): GELU(approximate='none')
|
| 323 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 324 |
+
)
|
| 325 |
+
(ls_2): Identity()
|
| 326 |
+
)
|
| 327 |
+
(10): ResidualAttentionBlock(
|
| 328 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 329 |
+
(attn): MultiheadAttention(
|
| 330 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 331 |
+
)
|
| 332 |
+
(ls_1): Identity()
|
| 333 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 334 |
+
(mlp): Sequential(
|
| 335 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 336 |
+
(gelu): GELU(approximate='none')
|
| 337 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 338 |
+
)
|
| 339 |
+
(ls_2): Identity()
|
| 340 |
+
)
|
| 341 |
+
(11): ResidualAttentionBlock(
|
| 342 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 343 |
+
(attn): MultiheadAttention(
|
| 344 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 345 |
+
)
|
| 346 |
+
(ls_1): Identity()
|
| 347 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 348 |
+
(mlp): Sequential(
|
| 349 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 350 |
+
(gelu): GELU(approximate='none')
|
| 351 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 352 |
+
)
|
| 353 |
+
(ls_2): Identity()
|
| 354 |
+
)
|
| 355 |
+
)
|
| 356 |
+
)
|
| 357 |
+
(token_embedding): Embedding(49408, 512)
|
| 358 |
+
(ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 359 |
+
)
|
| 360 |
+
2024-09-07,04:53:23 | INFO | Params:
|
| 361 |
+
2024-09-07,04:53:23 | INFO | accum_freq: 1
|
| 362 |
+
2024-09-07,04:53:23 | INFO | aug_cfg: {}
|
| 363 |
+
2024-09-07,04:53:23 | INFO | batch_size: 2048
|
| 364 |
+
2024-09-07,04:53:23 | INFO | beta1: 0.9
|
| 365 |
+
2024-09-07,04:53:23 | INFO | beta2: 0.98
|
| 366 |
+
2024-09-07,04:53:23 | INFO | checkpoint_path: /home/breaking_0.5_trained/50_most_difficult/checkpoints
|
| 367 |
+
2024-09-07,04:53:23 | INFO | coca_caption_loss_weight: 2.0
|
| 368 |
+
2024-09-07,04:53:23 | INFO | coca_contrastive_loss_weight: 1.0
|
| 369 |
+
2024-09-07,04:53:23 | INFO | copy_codebase: False
|
| 370 |
+
2024-09-07,04:53:23 | INFO | csv_caption_key: title
|
| 371 |
+
2024-09-07,04:53:23 | INFO | csv_img_key: filepath
|
| 372 |
+
2024-09-07,04:53:23 | INFO | csv_separator:
|
| 373 |
+
2024-09-07,04:53:23 | INFO | dataset_resampled: True
|
| 374 |
+
2024-09-07,04:53:23 | INFO | dataset_type: webdataset
|
| 375 |
+
2024-09-07,04:53:23 | INFO | ddp_static_graph: True
|
| 376 |
+
2024-09-07,04:53:23 | INFO | debug: False
|
| 377 |
+
2024-09-07,04:53:23 | INFO | delete_previous_checkpoint: False
|
| 378 |
+
2024-09-07,04:53:23 | INFO | device: cuda:0
|
| 379 |
+
2024-09-07,04:53:23 | INFO | dist_backend: nccl
|
| 380 |
+
2024-09-07,04:53:23 | INFO | dist_url: env://
|
| 381 |
+
2024-09-07,04:53:23 | INFO | distill: False
|
| 382 |
+
2024-09-07,04:53:23 | INFO | distill_model: None
|
| 383 |
+
2024-09-07,04:53:23 | INFO | distill_pretrained: None
|
| 384 |
+
2024-09-07,04:53:23 | INFO | distributed: True
|
| 385 |
+
2024-09-07,04:53:23 | INFO | epochs: 5
|
| 386 |
+
2024-09-07,04:53:23 | INFO | epochs_cooldown: None
|
| 387 |
+
2024-09-07,04:53:23 | INFO | eps: 1e-06
|
| 388 |
+
2024-09-07,04:53:23 | INFO | force_custom_text: False
|
| 389 |
+
2024-09-07,04:53:23 | INFO | force_image_size: None
|
| 390 |
+
2024-09-07,04:53:23 | INFO | force_patch_dropout: None
|
| 391 |
+
2024-09-07,04:53:23 | INFO | force_quick_gelu: False
|
| 392 |
+
2024-09-07,04:53:23 | INFO | gather_with_grad: True
|
| 393 |
+
2024-09-07,04:53:23 | INFO | grad_checkpointing: True
|
| 394 |
+
2024-09-07,04:53:23 | INFO | grad_clip_norm: None
|
| 395 |
+
2024-09-07,04:53:23 | INFO | horovod: False
|
| 396 |
+
2024-09-07,04:53:23 | INFO | image_mean: None
|
| 397 |
+
2024-09-07,04:53:23 | INFO | image_std: None
|
| 398 |
+
2024-09-07,04:53:23 | INFO | imagenet_v2: None
|
| 399 |
+
2024-09-07,04:53:23 | INFO | imagenet_val: None
|
| 400 |
+
2024-09-07,04:53:23 | INFO | local_loss: True
|
| 401 |
+
2024-09-07,04:53:23 | INFO | local_rank: 0
|
| 402 |
+
2024-09-07,04:53:23 | INFO | lock_image: False
|
| 403 |
+
2024-09-07,04:53:23 | INFO | lock_image_freeze_bn_stats: False
|
| 404 |
+
2024-09-07,04:53:23 | INFO | lock_image_unlocked_groups: 0
|
| 405 |
+
2024-09-07,04:53:23 | INFO | lock_text: False
|
| 406 |
+
2024-09-07,04:53:23 | INFO | lock_text_freeze_layer_norm: False
|
| 407 |
+
2024-09-07,04:53:23 | INFO | lock_text_unlocked_layers: 0
|
| 408 |
+
2024-09-07,04:53:23 | INFO | log_every_n_steps: 100
|
| 409 |
+
2024-09-07,04:53:23 | INFO | log_level: 20
|
| 410 |
+
2024-09-07,04:53:23 | INFO | log_local: False
|
| 411 |
+
2024-09-07,04:53:23 | INFO | log_path: /home/breaking_0.5_trained/50_most_difficult/out.log
|
| 412 |
+
2024-09-07,04:53:23 | INFO | logs: /home/breaking_0.5_trained
|
| 413 |
+
2024-09-07,04:53:23 | INFO | lr: 0.0005
|
| 414 |
+
2024-09-07,04:53:23 | INFO | lr_cooldown_end: 0.0
|
| 415 |
+
2024-09-07,04:53:23 | INFO | lr_cooldown_power: 1.0
|
| 416 |
+
2024-09-07,04:53:23 | INFO | lr_scheduler: cosine
|
| 417 |
+
2024-09-07,04:53:23 | INFO | model: ViT-B-32
|
| 418 |
+
2024-09-07,04:53:23 | INFO | name: 50_most_difficult
|
| 419 |
+
2024-09-07,04:53:23 | INFO | no_set_device_rank: False
|
| 420 |
+
2024-09-07,04:53:23 | INFO | precision: amp
|
| 421 |
+
2024-09-07,04:53:23 | INFO | pretrained:
|
| 422 |
+
2024-09-07,04:53:23 | INFO | pretrained_image: False
|
| 423 |
+
2024-09-07,04:53:23 | INFO | rank: 0
|
| 424 |
+
2024-09-07,04:53:23 | INFO | remote_sync: None
|
| 425 |
+
2024-09-07,04:53:23 | INFO | remote_sync_frequency: 300
|
| 426 |
+
2024-09-07,04:53:23 | INFO | remote_sync_protocol: s3
|
| 427 |
+
2024-09-07,04:53:23 | INFO | report_to: wandb
|
| 428 |
+
2024-09-07,04:53:23 | INFO | resume: None
|
| 429 |
+
2024-09-07,04:53:23 | INFO | save_frequency: 0
|
| 430 |
+
2024-09-07,04:53:23 | INFO | save_most_recent: True
|
| 431 |
+
2024-09-07,04:53:23 | INFO | seed: 0
|
| 432 |
+
2024-09-07,04:53:23 | INFO | skip_scheduler: False
|
| 433 |
+
2024-09-07,04:53:23 | INFO | tensorboard: False
|
| 434 |
+
2024-09-07,04:53:23 | INFO | tensorboard_path:
|
| 435 |
+
2024-09-07,04:53:23 | INFO | torchscript: False
|
| 436 |
+
2024-09-07,04:53:23 | INFO | trace: False
|
| 437 |
+
2024-09-07,04:53:23 | INFO | train_data: /home/breaking_0.5/{00000000..00000531}.tar
|
| 438 |
+
2024-09-07,04:53:23 | INFO | train_data_upsampling_factors: None
|
| 439 |
+
2024-09-07,04:53:23 | INFO | train_num_samples: 2560000
|
| 440 |
+
2024-09-07,04:53:23 | INFO | use_bn_sync: False
|
| 441 |
+
2024-09-07,04:53:23 | INFO | val_data: None
|
| 442 |
+
2024-09-07,04:53:23 | INFO | val_frequency: 1
|
| 443 |
+
2024-09-07,04:53:23 | INFO | val_num_samples: None
|
| 444 |
+
2024-09-07,04:53:23 | INFO | wandb: True
|
| 445 |
+
2024-09-07,04:53:23 | INFO | wandb_notes:
|
| 446 |
+
2024-09-07,04:53:23 | INFO | wandb_project_name: clip_text_hq_clusters
|
| 447 |
+
2024-09-07,04:53:23 | INFO | warmup: 500
|
| 448 |
+
2024-09-07,04:53:23 | INFO | wd: 0.2
|
| 449 |
+
2024-09-07,04:53:23 | INFO | workers: 4
|
| 450 |
+
2024-09-07,04:53:23 | INFO | world_size: 2
|
| 451 |
+
2024-09-07,04:53:23 | INFO | zeroshot_frequency: 2
|
| 452 |
+
2024-09-07,04:53:30 | INFO | Start epoch 0
|
| 453 |
+
2024-09-07,04:53:47 | INFO | Train Epoch: 0 [ 4096/2572288 (0%)] Data (t): 12.292 Batch (t): 17.044, 240.320/s, 120.160/s/gpu LR: 0.000001 Logit Scale: 14.286 Contrastive_loss: 8.3793 (8.3793) Loss: 8.3793 (8.3793)
|
| 454 |
+
2024-09-07,04:53:50 | INFO | Reducer buckets have been rebuilt in this iteration.
|
| 455 |
+
2024-09-07,04:58:08 | INFO | Train Epoch: 0 [ 413696/2572288 (16%)] Data (t): 0.555 Batch (t): 2.608, 1570.24/s, 785.122/s/gpu LR: 0.000101 Logit Scale: 14.263 Contrastive_loss: 8.2098 (8.2945) Loss: 8.2098 (8.2945)
|
| 456 |
+
2024-09-07,05:02:30 | INFO | Train Epoch: 0 [ 823296/2572288 (32%)] Data (t): 0.566 Batch (t): 2.619, 1567.59/s, 783.795/s/gpu LR: 0.000201 Logit Scale: 14.236 Contrastive_loss: 8.1342 (8.2411) Loss: 8.1342 (8.2411)
|
| 457 |
+
2024-09-07,05:06:52 | INFO | Train Epoch: 0 [1232896/2572288 (48%)] Data (t): 0.570 Batch (t): 2.621, 1561.56/s, 780.780/s/gpu LR: 0.000301 Logit Scale: 14.197 Contrastive_loss: 7.9712 (8.1736) Loss: 7.9712 (8.1736)
|
| 458 |
+
2024-09-07,05:11:14 | INFO | Train Epoch: 0 [1642496/2572288 (64%)] Data (t): 0.572 Batch (t): 2.623, 1564.54/s, 782.270/s/gpu LR: 0.000401 Logit Scale: 14.159 Contrastive_loss: 7.8644 (8.1118) Loss: 7.8644 (8.1118)
|
| 459 |
+
2024-09-07,05:15:36 | INFO | Train Epoch: 0 [2052096/2572288 (80%)] Data (t): 0.569 Batch (t): 2.619, 1566.43/s, 783.215/s/gpu LR: 0.000500 Logit Scale: 14.119 Contrastive_loss: 7.8428 (8.0670) Loss: 7.8428 (8.0670)
|
| 460 |
+
2024-09-07,05:19:58 | INFO | Train Epoch: 0 [2461696/2572288 (96%)] Data (t): 0.568 Batch (t): 2.620, 1557.15/s, 778.573/s/gpu LR: 0.000498 Logit Scale: 14.100 Contrastive_loss: 7.7296 (8.0188) Loss: 7.7296 (8.0188)
|
| 461 |
+
2024-09-07,05:21:08 | INFO | Train Epoch: 0 [2572288/2572288 (100%)] Data (t): 0.565 Batch (t): 2.617, 1576.44/s, 788.218/s/gpu LR: 0.000497 Logit Scale: 14.099 Contrastive_loss: 7.7862 (7.9897) Loss: 7.7862 (7.9897)
|
| 462 |
+
2024-09-07,05:21:11 | INFO | Start epoch 1
|
| 463 |
+
2024-09-07,05:21:23 | INFO | Train Epoch: 1 [ 4096/2572288 (0%)] Data (t): 9.800 Batch (t): 11.848, 345.708/s, 172.854/s/gpu LR: 0.000497 Logit Scale: 14.099 Contrastive_loss: 7.7746 (7.7746) Loss: 7.7746 (7.7746)
|
| 464 |
+
2024-09-07,05:25:43 | INFO | Train Epoch: 1 [ 413696/2572288 (16%)] Data (t): 0.548 Batch (t): 2.607, 1561.69/s, 780.843/s/gpu LR: 0.000491 Logit Scale: 14.097 Contrastive_loss: 7.6379 (7.7063) Loss: 7.6379 (7.7063)
|
| 465 |
+
2024-09-07,05:30:05 | INFO | Train Epoch: 1 [ 823296/2572288 (32%)] Data (t): 0.566 Batch (t): 2.617, 1566.22/s, 783.108/s/gpu LR: 0.000481 Logit Scale: 14.123 Contrastive_loss: 7.5494 (7.6540) Loss: 7.5494 (7.6540)
|
| 466 |
+
2024-09-07,05:34:27 | INFO | Train Epoch: 1 [1232896/2572288 (48%)] Data (t): 0.564 Batch (t): 2.618, 1569.73/s, 784.866/s/gpu LR: 0.000468 Logit Scale: 14.166 Contrastive_loss: 7.4297 (7.5979) Loss: 7.4297 (7.5979)
|
| 467 |
+
2024-09-07,05:38:49 | INFO | Train Epoch: 1 [1642496/2572288 (64%)] Data (t): 0.564 Batch (t): 2.616, 1566.22/s, 783.112/s/gpu LR: 0.000452 Logit Scale: 14.227 Contrastive_loss: 7.2636 (7.5310) Loss: 7.2636 (7.5310)
|
| 468 |
+
2024-09-07,05:43:10 | INFO | Train Epoch: 1 [2052096/2572288 (80%)] Data (t): 0.565 Batch (t): 2.617, 1560.62/s, 780.309/s/gpu LR: 0.000433 Logit Scale: 14.298 Contrastive_loss: 7.1316 (7.4645) Loss: 7.1316 (7.4645)
|
| 469 |
+
2024-09-07,05:47:32 | INFO | Train Epoch: 1 [2461696/2572288 (96%)] Data (t): 0.571 Batch (t): 2.622, 1567.17/s, 783.585/s/gpu LR: 0.000412 Logit Scale: 14.398 Contrastive_loss: 7.2612 (7.4354) Loss: 7.2612 (7.4354)
|
| 470 |
+
2024-09-07,05:48:43 | INFO | Train Epoch: 1 [2572288/2572288 (100%)] Data (t): 0.560 Batch (t): 2.611, 1575.74/s, 787.872/s/gpu LR: 0.000406 Logit Scale: 14.425 Contrastive_loss: 7.1283 (7.3970) Loss: 7.1283 (7.3970)
|
| 471 |
+
2024-09-07,05:48:46 | INFO | Start epoch 2
|
| 472 |
+
2024-09-07,05:48:57 | INFO | Train Epoch: 2 [ 4096/2572288 (0%)] Data (t): 9.681 Batch (t): 11.731, 349.173/s, 174.586/s/gpu LR: 0.000405 Logit Scale: 14.426 Contrastive_loss: 7.0068 (7.0068) Loss: 7.0068 (7.0068)
|
| 473 |
+
2024-09-07,05:53:18 | INFO | Train Epoch: 2 [ 413696/2572288 (16%)] Data (t): 0.548 Batch (t): 2.607, 1567.72/s, 783.858/s/gpu LR: 0.000381 Logit Scale: 14.555 Contrastive_loss: 7.1138 (7.0603) Loss: 7.1138 (7.0603)
|
| 474 |
+
2024-09-07,05:57:40 | INFO | Train Epoch: 2 [ 823296/2572288 (32%)] Data (t): 0.565 Batch (t): 2.618, 1552.55/s, 776.277/s/gpu LR: 0.000355 Logit Scale: 14.689 Contrastive_loss: 6.8805 (7.0004) Loss: 6.8805 (7.0004)
|
| 475 |
+
2024-09-07,06:02:02 | INFO | Train Epoch: 2 [1232896/2572288 (48%)] Data (t): 0.564 Batch (t): 2.616, 1568.97/s, 784.485/s/gpu LR: 0.000327 Logit Scale: 14.818 Contrastive_loss: 6.9158 (6.9792) Loss: 6.9158 (6.9792)
|
| 476 |
+
2024-09-07,06:06:23 | INFO | Train Epoch: 2 [1642496/2572288 (64%)] Data (t): 0.565 Batch (t): 2.618, 1568.84/s, 784.420/s/gpu LR: 0.000298 Logit Scale: 14.968 Contrastive_loss: 6.8714 (6.9577) Loss: 6.8714 (6.9577)
|
| 477 |
+
2024-09-07,06:10:45 | INFO | Train Epoch: 2 [2052096/2572288 (80%)] Data (t): 0.566 Batch (t): 2.617, 1560.16/s, 780.081/s/gpu LR: 0.000269 Logit Scale: 15.097 Contrastive_loss: 6.7470 (6.9226) Loss: 6.7470 (6.9226)
|
| 478 |
+
2024-09-07,06:15:06 | INFO | Train Epoch: 2 [2461696/2572288 (96%)] Data (t): 0.563 Batch (t): 2.615, 1572.55/s, 786.275/s/gpu LR: 0.000239 Logit Scale: 15.244 Contrastive_loss: 6.7248 (6.8943) Loss: 6.7248 (6.8943)
|
| 479 |
+
2024-09-07,06:16:17 | INFO | Train Epoch: 2 [2572288/2572288 (100%)] Data (t): 0.557 Batch (t): 2.609, 1580.67/s, 790.337/s/gpu LR: 0.000231 Logit Scale: 15.290 Contrastive_loss: 6.6894 (6.8687) Loss: 6.6894 (6.8687)
|
| 480 |
+
2024-09-07,06:16:20 | INFO | Start epoch 3
|
| 481 |
+
2024-09-07,06:16:31 | INFO | Train Epoch: 3 [ 4096/2572288 (0%)] Data (t): 9.568 Batch (t): 11.615, 352.634/s, 176.317/s/gpu LR: 0.000231 Logit Scale: 15.292 Contrastive_loss: 6.5913 (6.5913) Loss: 6.5913 (6.5913)
|
| 482 |
+
2024-09-07,06:20:52 | INFO | Train Epoch: 3 [ 413696/2572288 (16%)] Data (t): 0.551 Batch (t): 2.609, 1564.82/s, 782.409/s/gpu LR: 0.000202 Logit Scale: 15.435 Contrastive_loss: 6.6123 (6.6018) Loss: 6.6123 (6.6018)
|
| 483 |
+
2024-09-07,06:25:14 | INFO | Train Epoch: 3 [ 823296/2572288 (32%)] Data (t): 0.559 Batch (t): 2.612, 1570.74/s, 785.370/s/gpu LR: 0.000173 Logit Scale: 15.541 Contrastive_loss: 6.5566 (6.5867) Loss: 6.5566 (6.5867)
|
| 484 |
+
2024-09-07,06:29:35 | INFO | Train Epoch: 3 [1232896/2572288 (48%)] Data (t): 0.561 Batch (t): 2.614, 1565.41/s, 782.705/s/gpu LR: 0.000145 Logit Scale: 15.648 Contrastive_loss: 6.1246 (6.4712) Loss: 6.1246 (6.4712)
|
| 485 |
+
2024-09-07,06:33:57 | INFO | Train Epoch: 3 [1642496/2572288 (64%)] Data (t): 0.564 Batch (t): 2.618, 1561.09/s, 780.547/s/gpu LR: 0.000119 Logit Scale: 15.762 Contrastive_loss: 6.1003 (6.3970) Loss: 6.1003 (6.3970)
|
| 486 |
+
2024-09-07,06:38:19 | INFO | Train Epoch: 3 [2052096/2572288 (80%)] Data (t): 0.563 Batch (t): 2.618, 1567.95/s, 783.976/s/gpu LR: 0.000095 Logit Scale: 15.849 Contrastive_loss: 6.1111 (6.3494) Loss: 6.1111 (6.3494)
|
| 487 |
+
2024-09-07,06:42:40 | INFO | Train Epoch: 3 [2461696/2572288 (96%)] Data (t): 0.564 Batch (t): 2.617, 1561.15/s, 780.577/s/gpu LR: 0.000072 Logit Scale: 15.918 Contrastive_loss: 6.1647 (6.3230) Loss: 6.1647 (6.3230)
|
| 488 |
+
2024-09-07,06:43:51 | INFO | Train Epoch: 3 [2572288/2572288 (100%)] Data (t): 0.564 Batch (t): 2.617, 1572.59/s, 786.296/s/gpu LR: 0.000067 Logit Scale: 15.938 Contrastive_loss: 6.2637 (6.3156) Loss: 6.2637 (6.3156)
|
| 489 |
+
2024-09-07,06:43:54 | INFO | Start epoch 4
|
| 490 |
+
2024-09-07,06:44:05 | INFO | Train Epoch: 4 [ 4096/2572288 (0%)] Data (t): 9.638 Batch (t): 11.688, 350.454/s, 175.227/s/gpu LR: 0.000067 Logit Scale: 15.939 Contrastive_loss: 6.2286 (6.2286) Loss: 6.2286 (6.2286)
|
| 491 |
+
2024-09-07,06:48:26 | INFO | Train Epoch: 4 [ 413696/2572288 (16%)] Data (t): 0.548 Batch (t): 2.610, 1567.01/s, 783.507/s/gpu LR: 0.000048 Logit Scale: 15.991 Contrastive_loss: 6.1032 (6.1659) Loss: 6.1032 (6.1659)
|
| 492 |
+
2024-09-07,06:52:48 | INFO | Train Epoch: 4 [ 823296/2572288 (32%)] Data (t): 0.565 Batch (t): 2.618, 1562.44/s, 781.222/s/gpu LR: 0.000032 Logit Scale: 16.023 Contrastive_loss: 6.1207 (6.1508) Loss: 6.1207 (6.1508)
|
| 493 |
+
2024-09-07,06:57:10 | INFO | Train Epoch: 4 [1232896/2572288 (48%)] Data (t): 0.562 Batch (t): 2.616, 1566.88/s, 783.440/s/gpu LR: 0.000019 Logit Scale: 16.048 Contrastive_loss: 5.5887 (6.0103) Loss: 5.5887 (6.0103)
|
| 494 |
+
2024-09-07,07:01:31 | INFO | Train Epoch: 4 [1642496/2572288 (64%)] Data (t): 0.564 Batch (t): 2.617, 1570.10/s, 785.052/s/gpu LR: 0.000009 Logit Scale: 16.061 Contrastive_loss: 5.9363 (5.9955) Loss: 5.9363 (5.9955)
|
| 495 |
+
2024-09-07,07:05:53 | INFO | Train Epoch: 4 [2052096/2572288 (80%)] Data (t): 0.558 Batch (t): 2.611, 1569.54/s, 784.770/s/gpu LR: 0.000003 Logit Scale: 16.067 Contrastive_loss: 5.9174 (5.9825) Loss: 5.9174 (5.9825)
|
| 496 |
+
2024-09-07,07:10:14 | INFO | Train Epoch: 4 [2461696/2572288 (96%)] Data (t): 0.559 Batch (t): 2.611, 1568.58/s, 784.289/s/gpu LR: 0.000000 Logit Scale: 16.068 Contrastive_loss: 6.1942 (6.0127) Loss: 6.1942 (6.0127)
|
| 497 |
+
2024-09-07,07:11:24 | INFO | Train Epoch: 4 [2572288/2572288 (100%)] Data (t): 0.549 Batch (t): 2.604, 1583.04/s, 791.520/s/gpu LR: 0.000000 Logit Scale: 16.068 Contrastive_loss: 5.7131 (5.9753) Loss: 5.7131 (5.9753)
|
breaking_0.5_trained/50_most_difficult/params.txt
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
accum_freq: 1
|
| 2 |
+
aug_cfg: {}
|
| 3 |
+
batch_size: 2048
|
| 4 |
+
beta1: 0.9
|
| 5 |
+
beta2: 0.98
|
| 6 |
+
checkpoint_path: /home/breaking_0.5_trained/50_most_difficult/checkpoints
|
| 7 |
+
coca_caption_loss_weight: 2.0
|
| 8 |
+
coca_contrastive_loss_weight: 1.0
|
| 9 |
+
copy_codebase: False
|
| 10 |
+
csv_caption_key: title
|
| 11 |
+
csv_img_key: filepath
|
| 12 |
+
csv_separator:
|
| 13 |
+
dataset_resampled: True
|
| 14 |
+
dataset_type: webdataset
|
| 15 |
+
ddp_static_graph: True
|
| 16 |
+
debug: False
|
| 17 |
+
delete_previous_checkpoint: False
|
| 18 |
+
device: cuda:0
|
| 19 |
+
dist_backend: nccl
|
| 20 |
+
dist_url: env://
|
| 21 |
+
distill: False
|
| 22 |
+
distill_model: None
|
| 23 |
+
distill_pretrained: None
|
| 24 |
+
distributed: True
|
| 25 |
+
epochs: 5
|
| 26 |
+
epochs_cooldown: None
|
| 27 |
+
eps: 1e-06
|
| 28 |
+
force_custom_text: False
|
| 29 |
+
force_image_size: None
|
| 30 |
+
force_patch_dropout: None
|
| 31 |
+
force_quick_gelu: False
|
| 32 |
+
gather_with_grad: True
|
| 33 |
+
grad_checkpointing: True
|
| 34 |
+
grad_clip_norm: None
|
| 35 |
+
horovod: False
|
| 36 |
+
image_mean: None
|
| 37 |
+
image_std: None
|
| 38 |
+
imagenet_v2: None
|
| 39 |
+
imagenet_val: None
|
| 40 |
+
local_loss: True
|
| 41 |
+
local_rank: 0
|
| 42 |
+
lock_image: False
|
| 43 |
+
lock_image_freeze_bn_stats: False
|
| 44 |
+
lock_image_unlocked_groups: 0
|
| 45 |
+
lock_text: False
|
| 46 |
+
lock_text_freeze_layer_norm: False
|
| 47 |
+
lock_text_unlocked_layers: 0
|
| 48 |
+
log_every_n_steps: 100
|
| 49 |
+
log_level: 20
|
| 50 |
+
log_local: False
|
| 51 |
+
log_path: /home/breaking_0.5_trained/50_most_difficult/out.log
|
| 52 |
+
logs: /home/breaking_0.5_trained
|
| 53 |
+
lr: 0.0005
|
| 54 |
+
lr_cooldown_end: 0.0
|
| 55 |
+
lr_cooldown_power: 1.0
|
| 56 |
+
lr_scheduler: cosine
|
| 57 |
+
model: ViT-B-32
|
| 58 |
+
name: 50_most_difficult
|
| 59 |
+
no_set_device_rank: False
|
| 60 |
+
precision: amp
|
| 61 |
+
pretrained:
|
| 62 |
+
pretrained_image: False
|
| 63 |
+
rank: 0
|
| 64 |
+
remote_sync: None
|
| 65 |
+
remote_sync_frequency: 300
|
| 66 |
+
remote_sync_protocol: s3
|
| 67 |
+
report_to: wandb
|
| 68 |
+
resume: None
|
| 69 |
+
save_frequency: 0
|
| 70 |
+
save_most_recent: True
|
| 71 |
+
seed: 0
|
| 72 |
+
skip_scheduler: False
|
| 73 |
+
tensorboard: False
|
| 74 |
+
tensorboard_path:
|
| 75 |
+
torchscript: False
|
| 76 |
+
trace: False
|
| 77 |
+
train_data: /home/breaking_0.5/{00000000..00000531}.tar
|
| 78 |
+
train_data_upsampling_factors: None
|
| 79 |
+
train_num_samples: 2560000
|
| 80 |
+
use_bn_sync: False
|
| 81 |
+
val_data: None
|
| 82 |
+
val_frequency: 1
|
| 83 |
+
val_num_samples: None
|
| 84 |
+
wandb: True
|
| 85 |
+
wandb_notes:
|
| 86 |
+
wandb_project_name: clip_text_hq_clusters
|
| 87 |
+
warmup: 500
|
| 88 |
+
wd: 0.2
|
| 89 |
+
workers: 4
|
| 90 |
+
world_size: 2
|
| 91 |
+
zeroshot_frequency: 2
|
minipile_style_trained/minipile_style/checkpoints/epoch_5.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c33e4d08cc8f61ae1e38c3b96a44bb3d23274a2ee09eeb63422efb074219fb94
|
| 3 |
+
size 1815701601
|
minipile_style_trained/minipile_style/checkpoints/epoch_latest.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:335eec8e1e6fda984a6ce38720821667784813c986fbdda3dbc353b57c6b78f3
|
| 3 |
+
size 1815639289
|
minipile_style_trained/minipile_style/info.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:05b638b253299eab95f5d720ce204dda31d22d40812cbd37d729a62b31b2632e
|
| 3 |
+
size 307
|
minipile_style_trained/minipile_style/out.log
ADDED
|
@@ -0,0 +1,497 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2024-09-07,01:04:23 | INFO | No latest resume checkpoint found in /home/minipile_style_trained/minipile_style/checkpoints.
|
| 2 |
+
2024-09-07,01:04:25 | INFO | Running in distributed mode with multiple processes. Device: cuda:0.Process (global: 0, local 0), total 2.
|
| 3 |
+
2024-09-07,01:04:25 | INFO | Loaded ViT-B-32 model config.
|
| 4 |
+
2024-09-07,01:04:26 | INFO | Model:
|
| 5 |
+
2024-09-07,01:04:26 | INFO | CLIP(
|
| 6 |
+
(visual): VisionTransformer(
|
| 7 |
+
(patchnorm_pre_ln): Identity()
|
| 8 |
+
(conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
|
| 9 |
+
(patch_dropout): Identity()
|
| 10 |
+
(ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 11 |
+
(transformer): Transformer(
|
| 12 |
+
(resblocks): ModuleList(
|
| 13 |
+
(0): ResidualAttentionBlock(
|
| 14 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 15 |
+
(attn): MultiheadAttention(
|
| 16 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 17 |
+
)
|
| 18 |
+
(ls_1): Identity()
|
| 19 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 20 |
+
(mlp): Sequential(
|
| 21 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 22 |
+
(gelu): GELU(approximate='none')
|
| 23 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 24 |
+
)
|
| 25 |
+
(ls_2): Identity()
|
| 26 |
+
)
|
| 27 |
+
(1): ResidualAttentionBlock(
|
| 28 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 29 |
+
(attn): MultiheadAttention(
|
| 30 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 31 |
+
)
|
| 32 |
+
(ls_1): Identity()
|
| 33 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 34 |
+
(mlp): Sequential(
|
| 35 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 36 |
+
(gelu): GELU(approximate='none')
|
| 37 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 38 |
+
)
|
| 39 |
+
(ls_2): Identity()
|
| 40 |
+
)
|
| 41 |
+
(2): ResidualAttentionBlock(
|
| 42 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 43 |
+
(attn): MultiheadAttention(
|
| 44 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 45 |
+
)
|
| 46 |
+
(ls_1): Identity()
|
| 47 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 48 |
+
(mlp): Sequential(
|
| 49 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 50 |
+
(gelu): GELU(approximate='none')
|
| 51 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 52 |
+
)
|
| 53 |
+
(ls_2): Identity()
|
| 54 |
+
)
|
| 55 |
+
(3): ResidualAttentionBlock(
|
| 56 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 57 |
+
(attn): MultiheadAttention(
|
| 58 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 59 |
+
)
|
| 60 |
+
(ls_1): Identity()
|
| 61 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 62 |
+
(mlp): Sequential(
|
| 63 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 64 |
+
(gelu): GELU(approximate='none')
|
| 65 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 66 |
+
)
|
| 67 |
+
(ls_2): Identity()
|
| 68 |
+
)
|
| 69 |
+
(4): ResidualAttentionBlock(
|
| 70 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 71 |
+
(attn): MultiheadAttention(
|
| 72 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 73 |
+
)
|
| 74 |
+
(ls_1): Identity()
|
| 75 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 76 |
+
(mlp): Sequential(
|
| 77 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 78 |
+
(gelu): GELU(approximate='none')
|
| 79 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 80 |
+
)
|
| 81 |
+
(ls_2): Identity()
|
| 82 |
+
)
|
| 83 |
+
(5): ResidualAttentionBlock(
|
| 84 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 85 |
+
(attn): MultiheadAttention(
|
| 86 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 87 |
+
)
|
| 88 |
+
(ls_1): Identity()
|
| 89 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 90 |
+
(mlp): Sequential(
|
| 91 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 92 |
+
(gelu): GELU(approximate='none')
|
| 93 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 94 |
+
)
|
| 95 |
+
(ls_2): Identity()
|
| 96 |
+
)
|
| 97 |
+
(6): ResidualAttentionBlock(
|
| 98 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 99 |
+
(attn): MultiheadAttention(
|
| 100 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 101 |
+
)
|
| 102 |
+
(ls_1): Identity()
|
| 103 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 104 |
+
(mlp): Sequential(
|
| 105 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 106 |
+
(gelu): GELU(approximate='none')
|
| 107 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 108 |
+
)
|
| 109 |
+
(ls_2): Identity()
|
| 110 |
+
)
|
| 111 |
+
(7): ResidualAttentionBlock(
|
| 112 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 113 |
+
(attn): MultiheadAttention(
|
| 114 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 115 |
+
)
|
| 116 |
+
(ls_1): Identity()
|
| 117 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 118 |
+
(mlp): Sequential(
|
| 119 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 120 |
+
(gelu): GELU(approximate='none')
|
| 121 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 122 |
+
)
|
| 123 |
+
(ls_2): Identity()
|
| 124 |
+
)
|
| 125 |
+
(8): ResidualAttentionBlock(
|
| 126 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 127 |
+
(attn): MultiheadAttention(
|
| 128 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 129 |
+
)
|
| 130 |
+
(ls_1): Identity()
|
| 131 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 132 |
+
(mlp): Sequential(
|
| 133 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 134 |
+
(gelu): GELU(approximate='none')
|
| 135 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 136 |
+
)
|
| 137 |
+
(ls_2): Identity()
|
| 138 |
+
)
|
| 139 |
+
(9): ResidualAttentionBlock(
|
| 140 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 141 |
+
(attn): MultiheadAttention(
|
| 142 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 143 |
+
)
|
| 144 |
+
(ls_1): Identity()
|
| 145 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 146 |
+
(mlp): Sequential(
|
| 147 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 148 |
+
(gelu): GELU(approximate='none')
|
| 149 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 150 |
+
)
|
| 151 |
+
(ls_2): Identity()
|
| 152 |
+
)
|
| 153 |
+
(10): ResidualAttentionBlock(
|
| 154 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 155 |
+
(attn): MultiheadAttention(
|
| 156 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 157 |
+
)
|
| 158 |
+
(ls_1): Identity()
|
| 159 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 160 |
+
(mlp): Sequential(
|
| 161 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 162 |
+
(gelu): GELU(approximate='none')
|
| 163 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 164 |
+
)
|
| 165 |
+
(ls_2): Identity()
|
| 166 |
+
)
|
| 167 |
+
(11): ResidualAttentionBlock(
|
| 168 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 169 |
+
(attn): MultiheadAttention(
|
| 170 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 171 |
+
)
|
| 172 |
+
(ls_1): Identity()
|
| 173 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 174 |
+
(mlp): Sequential(
|
| 175 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 176 |
+
(gelu): GELU(approximate='none')
|
| 177 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 178 |
+
)
|
| 179 |
+
(ls_2): Identity()
|
| 180 |
+
)
|
| 181 |
+
)
|
| 182 |
+
)
|
| 183 |
+
(ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 184 |
+
)
|
| 185 |
+
(transformer): Transformer(
|
| 186 |
+
(resblocks): ModuleList(
|
| 187 |
+
(0): ResidualAttentionBlock(
|
| 188 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 189 |
+
(attn): MultiheadAttention(
|
| 190 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 191 |
+
)
|
| 192 |
+
(ls_1): Identity()
|
| 193 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 194 |
+
(mlp): Sequential(
|
| 195 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 196 |
+
(gelu): GELU(approximate='none')
|
| 197 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 198 |
+
)
|
| 199 |
+
(ls_2): Identity()
|
| 200 |
+
)
|
| 201 |
+
(1): ResidualAttentionBlock(
|
| 202 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 203 |
+
(attn): MultiheadAttention(
|
| 204 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 205 |
+
)
|
| 206 |
+
(ls_1): Identity()
|
| 207 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 208 |
+
(mlp): Sequential(
|
| 209 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 210 |
+
(gelu): GELU(approximate='none')
|
| 211 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 212 |
+
)
|
| 213 |
+
(ls_2): Identity()
|
| 214 |
+
)
|
| 215 |
+
(2): ResidualAttentionBlock(
|
| 216 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 217 |
+
(attn): MultiheadAttention(
|
| 218 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 219 |
+
)
|
| 220 |
+
(ls_1): Identity()
|
| 221 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 222 |
+
(mlp): Sequential(
|
| 223 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 224 |
+
(gelu): GELU(approximate='none')
|
| 225 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 226 |
+
)
|
| 227 |
+
(ls_2): Identity()
|
| 228 |
+
)
|
| 229 |
+
(3): ResidualAttentionBlock(
|
| 230 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 231 |
+
(attn): MultiheadAttention(
|
| 232 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 233 |
+
)
|
| 234 |
+
(ls_1): Identity()
|
| 235 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 236 |
+
(mlp): Sequential(
|
| 237 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 238 |
+
(gelu): GELU(approximate='none')
|
| 239 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 240 |
+
)
|
| 241 |
+
(ls_2): Identity()
|
| 242 |
+
)
|
| 243 |
+
(4): ResidualAttentionBlock(
|
| 244 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 245 |
+
(attn): MultiheadAttention(
|
| 246 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 247 |
+
)
|
| 248 |
+
(ls_1): Identity()
|
| 249 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 250 |
+
(mlp): Sequential(
|
| 251 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 252 |
+
(gelu): GELU(approximate='none')
|
| 253 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 254 |
+
)
|
| 255 |
+
(ls_2): Identity()
|
| 256 |
+
)
|
| 257 |
+
(5): ResidualAttentionBlock(
|
| 258 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 259 |
+
(attn): MultiheadAttention(
|
| 260 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 261 |
+
)
|
| 262 |
+
(ls_1): Identity()
|
| 263 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 264 |
+
(mlp): Sequential(
|
| 265 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 266 |
+
(gelu): GELU(approximate='none')
|
| 267 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 268 |
+
)
|
| 269 |
+
(ls_2): Identity()
|
| 270 |
+
)
|
| 271 |
+
(6): ResidualAttentionBlock(
|
| 272 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 273 |
+
(attn): MultiheadAttention(
|
| 274 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 275 |
+
)
|
| 276 |
+
(ls_1): Identity()
|
| 277 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 278 |
+
(mlp): Sequential(
|
| 279 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 280 |
+
(gelu): GELU(approximate='none')
|
| 281 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 282 |
+
)
|
| 283 |
+
(ls_2): Identity()
|
| 284 |
+
)
|
| 285 |
+
(7): ResidualAttentionBlock(
|
| 286 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 287 |
+
(attn): MultiheadAttention(
|
| 288 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 289 |
+
)
|
| 290 |
+
(ls_1): Identity()
|
| 291 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 292 |
+
(mlp): Sequential(
|
| 293 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 294 |
+
(gelu): GELU(approximate='none')
|
| 295 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 296 |
+
)
|
| 297 |
+
(ls_2): Identity()
|
| 298 |
+
)
|
| 299 |
+
(8): ResidualAttentionBlock(
|
| 300 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 301 |
+
(attn): MultiheadAttention(
|
| 302 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 303 |
+
)
|
| 304 |
+
(ls_1): Identity()
|
| 305 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 306 |
+
(mlp): Sequential(
|
| 307 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 308 |
+
(gelu): GELU(approximate='none')
|
| 309 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 310 |
+
)
|
| 311 |
+
(ls_2): Identity()
|
| 312 |
+
)
|
| 313 |
+
(9): ResidualAttentionBlock(
|
| 314 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 315 |
+
(attn): MultiheadAttention(
|
| 316 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 317 |
+
)
|
| 318 |
+
(ls_1): Identity()
|
| 319 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 320 |
+
(mlp): Sequential(
|
| 321 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 322 |
+
(gelu): GELU(approximate='none')
|
| 323 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 324 |
+
)
|
| 325 |
+
(ls_2): Identity()
|
| 326 |
+
)
|
| 327 |
+
(10): ResidualAttentionBlock(
|
| 328 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 329 |
+
(attn): MultiheadAttention(
|
| 330 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 331 |
+
)
|
| 332 |
+
(ls_1): Identity()
|
| 333 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 334 |
+
(mlp): Sequential(
|
| 335 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 336 |
+
(gelu): GELU(approximate='none')
|
| 337 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 338 |
+
)
|
| 339 |
+
(ls_2): Identity()
|
| 340 |
+
)
|
| 341 |
+
(11): ResidualAttentionBlock(
|
| 342 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 343 |
+
(attn): MultiheadAttention(
|
| 344 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 345 |
+
)
|
| 346 |
+
(ls_1): Identity()
|
| 347 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 348 |
+
(mlp): Sequential(
|
| 349 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 350 |
+
(gelu): GELU(approximate='none')
|
| 351 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 352 |
+
)
|
| 353 |
+
(ls_2): Identity()
|
| 354 |
+
)
|
| 355 |
+
)
|
| 356 |
+
)
|
| 357 |
+
(token_embedding): Embedding(49408, 512)
|
| 358 |
+
(ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 359 |
+
)
|
| 360 |
+
2024-09-07,01:04:26 | INFO | Params:
|
| 361 |
+
2024-09-07,01:04:26 | INFO | accum_freq: 1
|
| 362 |
+
2024-09-07,01:04:26 | INFO | aug_cfg: {}
|
| 363 |
+
2024-09-07,01:04:26 | INFO | batch_size: 2048
|
| 364 |
+
2024-09-07,01:04:26 | INFO | beta1: 0.9
|
| 365 |
+
2024-09-07,01:04:26 | INFO | beta2: 0.98
|
| 366 |
+
2024-09-07,01:04:26 | INFO | checkpoint_path: /home/minipile_style_trained/minipile_style/checkpoints
|
| 367 |
+
2024-09-07,01:04:26 | INFO | coca_caption_loss_weight: 2.0
|
| 368 |
+
2024-09-07,01:04:26 | INFO | coca_contrastive_loss_weight: 1.0
|
| 369 |
+
2024-09-07,01:04:26 | INFO | copy_codebase: False
|
| 370 |
+
2024-09-07,01:04:26 | INFO | csv_caption_key: title
|
| 371 |
+
2024-09-07,01:04:26 | INFO | csv_img_key: filepath
|
| 372 |
+
2024-09-07,01:04:26 | INFO | csv_separator:
|
| 373 |
+
2024-09-07,01:04:26 | INFO | dataset_resampled: True
|
| 374 |
+
2024-09-07,01:04:26 | INFO | dataset_type: webdataset
|
| 375 |
+
2024-09-07,01:04:26 | INFO | ddp_static_graph: True
|
| 376 |
+
2024-09-07,01:04:26 | INFO | debug: False
|
| 377 |
+
2024-09-07,01:04:26 | INFO | delete_previous_checkpoint: False
|
| 378 |
+
2024-09-07,01:04:26 | INFO | device: cuda:0
|
| 379 |
+
2024-09-07,01:04:26 | INFO | dist_backend: nccl
|
| 380 |
+
2024-09-07,01:04:26 | INFO | dist_url: env://
|
| 381 |
+
2024-09-07,01:04:26 | INFO | distill: False
|
| 382 |
+
2024-09-07,01:04:26 | INFO | distill_model: None
|
| 383 |
+
2024-09-07,01:04:26 | INFO | distill_pretrained: None
|
| 384 |
+
2024-09-07,01:04:26 | INFO | distributed: True
|
| 385 |
+
2024-09-07,01:04:26 | INFO | epochs: 5
|
| 386 |
+
2024-09-07,01:04:26 | INFO | epochs_cooldown: None
|
| 387 |
+
2024-09-07,01:04:26 | INFO | eps: 1e-06
|
| 388 |
+
2024-09-07,01:04:26 | INFO | force_custom_text: False
|
| 389 |
+
2024-09-07,01:04:26 | INFO | force_image_size: None
|
| 390 |
+
2024-09-07,01:04:26 | INFO | force_patch_dropout: None
|
| 391 |
+
2024-09-07,01:04:26 | INFO | force_quick_gelu: False
|
| 392 |
+
2024-09-07,01:04:26 | INFO | gather_with_grad: True
|
| 393 |
+
2024-09-07,01:04:26 | INFO | grad_checkpointing: True
|
| 394 |
+
2024-09-07,01:04:26 | INFO | grad_clip_norm: None
|
| 395 |
+
2024-09-07,01:04:26 | INFO | horovod: False
|
| 396 |
+
2024-09-07,01:04:26 | INFO | image_mean: None
|
| 397 |
+
2024-09-07,01:04:26 | INFO | image_std: None
|
| 398 |
+
2024-09-07,01:04:26 | INFO | imagenet_v2: None
|
| 399 |
+
2024-09-07,01:04:26 | INFO | imagenet_val: None
|
| 400 |
+
2024-09-07,01:04:26 | INFO | local_loss: True
|
| 401 |
+
2024-09-07,01:04:26 | INFO | local_rank: 0
|
| 402 |
+
2024-09-07,01:04:26 | INFO | lock_image: False
|
| 403 |
+
2024-09-07,01:04:26 | INFO | lock_image_freeze_bn_stats: False
|
| 404 |
+
2024-09-07,01:04:26 | INFO | lock_image_unlocked_groups: 0
|
| 405 |
+
2024-09-07,01:04:26 | INFO | lock_text: False
|
| 406 |
+
2024-09-07,01:04:26 | INFO | lock_text_freeze_layer_norm: False
|
| 407 |
+
2024-09-07,01:04:26 | INFO | lock_text_unlocked_layers: 0
|
| 408 |
+
2024-09-07,01:04:26 | INFO | log_every_n_steps: 100
|
| 409 |
+
2024-09-07,01:04:26 | INFO | log_level: 20
|
| 410 |
+
2024-09-07,01:04:26 | INFO | log_local: False
|
| 411 |
+
2024-09-07,01:04:26 | INFO | log_path: /home/minipile_style_trained/minipile_style/out.log
|
| 412 |
+
2024-09-07,01:04:26 | INFO | logs: /home/minipile_style_trained
|
| 413 |
+
2024-09-07,01:04:26 | INFO | lr: 0.0005
|
| 414 |
+
2024-09-07,01:04:26 | INFO | lr_cooldown_end: 0.0
|
| 415 |
+
2024-09-07,01:04:26 | INFO | lr_cooldown_power: 1.0
|
| 416 |
+
2024-09-07,01:04:26 | INFO | lr_scheduler: cosine
|
| 417 |
+
2024-09-07,01:04:26 | INFO | model: ViT-B-32
|
| 418 |
+
2024-09-07,01:04:26 | INFO | name: minipile_style
|
| 419 |
+
2024-09-07,01:04:26 | INFO | no_set_device_rank: False
|
| 420 |
+
2024-09-07,01:04:26 | INFO | precision: amp
|
| 421 |
+
2024-09-07,01:04:26 | INFO | pretrained:
|
| 422 |
+
2024-09-07,01:04:26 | INFO | pretrained_image: False
|
| 423 |
+
2024-09-07,01:04:26 | INFO | rank: 0
|
| 424 |
+
2024-09-07,01:04:26 | INFO | remote_sync: None
|
| 425 |
+
2024-09-07,01:04:26 | INFO | remote_sync_frequency: 300
|
| 426 |
+
2024-09-07,01:04:26 | INFO | remote_sync_protocol: s3
|
| 427 |
+
2024-09-07,01:04:26 | INFO | report_to: wandb
|
| 428 |
+
2024-09-07,01:04:26 | INFO | resume: None
|
| 429 |
+
2024-09-07,01:04:26 | INFO | save_frequency: 0
|
| 430 |
+
2024-09-07,01:04:26 | INFO | save_most_recent: True
|
| 431 |
+
2024-09-07,01:04:26 | INFO | seed: 0
|
| 432 |
+
2024-09-07,01:04:26 | INFO | skip_scheduler: False
|
| 433 |
+
2024-09-07,01:04:26 | INFO | tensorboard: False
|
| 434 |
+
2024-09-07,01:04:26 | INFO | tensorboard_path:
|
| 435 |
+
2024-09-07,01:04:26 | INFO | torchscript: False
|
| 436 |
+
2024-09-07,01:04:26 | INFO | trace: False
|
| 437 |
+
2024-09-07,01:04:26 | INFO | train_data: /home/minipile_style/{00000000..00000143}.tar
|
| 438 |
+
2024-09-07,01:04:26 | INFO | train_data_upsampling_factors: None
|
| 439 |
+
2024-09-07,01:04:26 | INFO | train_num_samples: 2560000
|
| 440 |
+
2024-09-07,01:04:26 | INFO | use_bn_sync: False
|
| 441 |
+
2024-09-07,01:04:26 | INFO | val_data: None
|
| 442 |
+
2024-09-07,01:04:26 | INFO | val_frequency: 1
|
| 443 |
+
2024-09-07,01:04:26 | INFO | val_num_samples: None
|
| 444 |
+
2024-09-07,01:04:26 | INFO | wandb: True
|
| 445 |
+
2024-09-07,01:04:26 | INFO | wandb_notes:
|
| 446 |
+
2024-09-07,01:04:26 | INFO | wandb_project_name: clip_text_hq_clusters
|
| 447 |
+
2024-09-07,01:04:26 | INFO | warmup: 500
|
| 448 |
+
2024-09-07,01:04:26 | INFO | wd: 0.2
|
| 449 |
+
2024-09-07,01:04:26 | INFO | workers: 4
|
| 450 |
+
2024-09-07,01:04:26 | INFO | world_size: 2
|
| 451 |
+
2024-09-07,01:04:26 | INFO | zeroshot_frequency: 2
|
| 452 |
+
2024-09-07,01:04:34 | INFO | Start epoch 0
|
| 453 |
+
2024-09-07,01:04:51 | INFO | Train Epoch: 0 [ 4096/2572288 (0%)] Data (t): 12.628 Batch (t): 17.150, 238.834/s, 119.417/s/gpu LR: 0.000001 Logit Scale: 14.286 Contrastive_loss: 8.3862 (8.3862) Loss: 8.3862 (8.3862)
|
| 454 |
+
2024-09-07,01:04:54 | INFO | Reducer buckets have been rebuilt in this iteration.
|
| 455 |
+
2024-09-07,01:09:11 | INFO | Train Epoch: 0 [ 413696/2572288 (16%)] Data (t): 0.549 Batch (t): 2.604, 1566.52/s, 783.260/s/gpu LR: 0.000101 Logit Scale: 14.263 Contrastive_loss: 8.0384 (8.2123) Loss: 8.0384 (8.2123)
|
| 456 |
+
2024-09-07,01:13:33 | INFO | Train Epoch: 0 [ 823296/2572288 (32%)] Data (t): 0.562 Batch (t): 2.612, 1564.35/s, 782.174/s/gpu LR: 0.000201 Logit Scale: 14.235 Contrastive_loss: 7.7396 (8.0547) Loss: 7.7396 (8.0547)
|
| 457 |
+
2024-09-07,01:17:54 | INFO | Train Epoch: 0 [1232896/2572288 (48%)] Data (t): 0.568 Batch (t): 2.618, 1563.14/s, 781.569/s/gpu LR: 0.000301 Logit Scale: 14.216 Contrastive_loss: 7.5079 (7.9180) Loss: 7.5079 (7.9180)
|
| 458 |
+
2024-09-07,01:22:15 | INFO | Train Epoch: 0 [1642496/2572288 (64%)] Data (t): 0.563 Batch (t): 2.612, 1567.33/s, 783.663/s/gpu LR: 0.000401 Logit Scale: 14.207 Contrastive_loss: 7.3567 (7.8057) Loss: 7.3567 (7.8057)
|
| 459 |
+
2024-09-07,01:26:37 | INFO | Train Epoch: 0 [2052096/2572288 (80%)] Data (t): 0.563 Batch (t): 2.613, 1568.18/s, 784.091/s/gpu LR: 0.000500 Logit Scale: 14.204 Contrastive_loss: 7.2498 (7.7131) Loss: 7.2498 (7.7131)
|
| 460 |
+
2024-09-07,01:30:58 | INFO | Train Epoch: 0 [2461696/2572288 (96%)] Data (t): 0.562 Batch (t): 2.614, 1566.25/s, 783.123/s/gpu LR: 0.000498 Logit Scale: 14.229 Contrastive_loss: 7.1177 (7.6280) Loss: 7.1177 (7.6280)
|
| 461 |
+
2024-09-07,01:32:09 | INFO | Train Epoch: 0 [2572288/2572288 (100%)] Data (t): 0.564 Batch (t): 2.615, 1576.49/s, 788.246/s/gpu LR: 0.000497 Logit Scale: 14.241 Contrastive_loss: 6.9284 (7.5406) Loss: 6.9284 (7.5406)
|
| 462 |
+
2024-09-07,01:32:11 | INFO | Start epoch 1
|
| 463 |
+
2024-09-07,01:32:23 | INFO | Train Epoch: 1 [ 4096/2572288 (0%)] Data (t): 9.632 Batch (t): 11.682, 350.622/s, 175.311/s/gpu LR: 0.000497 Logit Scale: 14.241 Contrastive_loss: 7.1128 (7.1128) Loss: 7.1128 (7.1128)
|
| 464 |
+
2024-09-07,01:36:43 | INFO | Train Epoch: 1 [ 413696/2572288 (16%)] Data (t): 0.542 Batch (t): 2.600, 1574.71/s, 787.354/s/gpu LR: 0.000491 Logit Scale: 14.325 Contrastive_loss: 6.2617 (6.6872) Loss: 6.2617 (6.6872)
|
| 465 |
+
2024-09-07,01:41:04 | INFO | Train Epoch: 1 [ 823296/2572288 (32%)] Data (t): 0.560 Batch (t): 2.612, 1567.28/s, 783.640/s/gpu LR: 0.000481 Logit Scale: 14.435 Contrastive_loss: 6.3944 (6.5896) Loss: 6.3944 (6.5896)
|
| 466 |
+
2024-09-07,01:45:25 | INFO | Train Epoch: 1 [1232896/2572288 (48%)] Data (t): 0.561 Batch (t): 2.613, 1569.31/s, 784.656/s/gpu LR: 0.000468 Logit Scale: 14.554 Contrastive_loss: 6.2874 (6.5141) Loss: 6.2874 (6.5141)
|
| 467 |
+
2024-09-07,01:49:48 | INFO | Train Epoch: 1 [1642496/2572288 (64%)] Data (t): 0.571 Batch (t): 2.623, 1561.57/s, 780.787/s/gpu LR: 0.000452 Logit Scale: 14.717 Contrastive_loss: 6.4188 (6.4950) Loss: 6.4188 (6.4950)
|
| 468 |
+
2024-09-07,01:54:09 | INFO | Train Epoch: 1 [2052096/2572288 (80%)] Data (t): 0.565 Batch (t): 2.617, 1568.23/s, 784.117/s/gpu LR: 0.000433 Logit Scale: 14.903 Contrastive_loss: 6.1368 (6.4353) Loss: 6.1368 (6.4353)
|
| 469 |
+
2024-09-07,01:58:31 | INFO | Train Epoch: 1 [2461696/2572288 (96%)] Data (t): 0.562 Batch (t): 2.615, 1563.86/s, 781.931/s/gpu LR: 0.000412 Logit Scale: 15.119 Contrastive_loss: 6.3245 (6.4195) Loss: 6.3245 (6.4195)
|
| 470 |
+
2024-09-07,01:59:41 | INFO | Train Epoch: 1 [2572288/2572288 (100%)] Data (t): 0.557 Batch (t): 2.609, 1570.89/s, 785.444/s/gpu LR: 0.000406 Logit Scale: 15.184 Contrastive_loss: 6.0716 (6.3760) Loss: 6.0716 (6.3760)
|
| 471 |
+
2024-09-07,01:59:44 | INFO | Start epoch 2
|
| 472 |
+
2024-09-07,01:59:56 | INFO | Train Epoch: 2 [ 4096/2572288 (0%)] Data (t): 9.464 Batch (t): 11.513, 355.787/s, 177.893/s/gpu LR: 0.000405 Logit Scale: 15.186 Contrastive_loss: 5.2756 (5.2756) Loss: 5.2756 (5.2756)
|
| 473 |
+
2024-09-07,02:04:16 | INFO | Train Epoch: 2 [ 413696/2572288 (16%)] Data (t): 0.543 Batch (t): 2.604, 1565.58/s, 782.788/s/gpu LR: 0.000381 Logit Scale: 15.457 Contrastive_loss: 5.4310 (5.3533) Loss: 5.4310 (5.3533)
|
| 474 |
+
2024-09-07,02:08:37 | INFO | Train Epoch: 2 [ 823296/2572288 (32%)] Data (t): 0.561 Batch (t): 2.614, 1565.61/s, 782.803/s/gpu LR: 0.000355 Logit Scale: 15.792 Contrastive_loss: 5.1490 (5.2852) Loss: 5.1490 (5.2852)
|
| 475 |
+
2024-09-07,02:12:58 | INFO | Train Epoch: 2 [1232896/2572288 (48%)] Data (t): 0.557 Batch (t): 2.611, 1570.96/s, 785.478/s/gpu LR: 0.000327 Logit Scale: 16.006 Contrastive_loss: 4.7893 (5.1612) Loss: 4.7893 (5.1612)
|
| 476 |
+
2024-09-07,02:17:20 | INFO | Train Epoch: 2 [1642496/2572288 (64%)] Data (t): 0.560 Batch (t): 2.614, 1571.90/s, 785.949/s/gpu LR: 0.000298 Logit Scale: 16.299 Contrastive_loss: 4.1633 (4.9616) Loss: 4.1633 (4.9616)
|
| 477 |
+
2024-09-07,02:21:41 | INFO | Train Epoch: 2 [2052096/2572288 (80%)] Data (t): 0.558 Batch (t): 2.612, 1572.91/s, 786.453/s/gpu LR: 0.000269 Logit Scale: 16.531 Contrastive_loss: 5.3621 (5.0284) Loss: 5.3621 (5.0284)
|
| 478 |
+
2024-09-07,02:26:02 | INFO | Train Epoch: 2 [2461696/2572288 (96%)] Data (t): 0.558 Batch (t): 2.610, 1562.81/s, 781.406/s/gpu LR: 0.000239 Logit Scale: 16.809 Contrastive_loss: 4.7398 (4.9871) Loss: 4.7398 (4.9871)
|
| 479 |
+
2024-09-07,02:27:13 | INFO | Train Epoch: 2 [2572288/2572288 (100%)] Data (t): 0.558 Batch (t): 2.611, 1580.04/s, 790.018/s/gpu LR: 0.000231 Logit Scale: 16.883 Contrastive_loss: 5.2913 (5.0252) Loss: 5.2913 (5.0252)
|
| 480 |
+
2024-09-07,02:27:15 | INFO | Start epoch 3
|
| 481 |
+
2024-09-07,02:27:27 | INFO | Train Epoch: 3 [ 4096/2572288 (0%)] Data (t): 9.452 Batch (t): 11.500, 356.177/s, 178.089/s/gpu LR: 0.000231 Logit Scale: 16.886 Contrastive_loss: 2.5502 (2.5502) Loss: 2.5502 (2.5502)
|
| 482 |
+
2024-09-07,02:31:47 | INFO | Train Epoch: 3 [ 413696/2572288 (16%)] Data (t): 0.541 Batch (t): 2.601, 1571.87/s, 785.933/s/gpu LR: 0.000202 Logit Scale: 17.181 Contrastive_loss: 3.5267 (3.0384) Loss: 3.5267 (3.0384)
|
| 483 |
+
2024-09-07,02:36:08 | INFO | Train Epoch: 3 [ 823296/2572288 (32%)] Data (t): 0.560 Batch (t): 2.613, 1565.80/s, 782.899/s/gpu LR: 0.000173 Logit Scale: 17.462 Contrastive_loss: 3.9774 (3.3514) Loss: 3.9774 (3.3514)
|
| 484 |
+
2024-09-07,02:40:30 | INFO | Train Epoch: 3 [1232896/2572288 (48%)] Data (t): 0.562 Batch (t): 2.614, 1569.59/s, 784.797/s/gpu LR: 0.000145 Logit Scale: 17.663 Contrastive_loss: 3.9409 (3.4988) Loss: 3.9409 (3.4988)
|
| 485 |
+
2024-09-07,02:44:51 | INFO | Train Epoch: 3 [1642496/2572288 (64%)] Data (t): 0.559 Batch (t): 2.614, 1568.53/s, 784.265/s/gpu LR: 0.000119 Logit Scale: 17.841 Contrastive_loss: 4.1620 (3.6314) Loss: 4.1620 (3.6314)
|
| 486 |
+
2024-09-07,02:49:13 | INFO | Train Epoch: 3 [2052096/2572288 (80%)] Data (t): 0.562 Batch (t): 2.616, 1564.29/s, 782.143/s/gpu LR: 0.000095 Logit Scale: 18.007 Contrastive_loss: 2.9186 (3.5126) Loss: 2.9186 (3.5126)
|
| 487 |
+
2024-09-07,02:53:34 | INFO | Train Epoch: 3 [2461696/2572288 (96%)] Data (t): 0.561 Batch (t): 2.614, 1567.47/s, 783.735/s/gpu LR: 0.000072 Logit Scale: 18.160 Contrastive_loss: 2.7363 (3.4017) Loss: 2.7363 (3.4017)
|
| 488 |
+
2024-09-07,02:54:45 | INFO | Train Epoch: 3 [2572288/2572288 (100%)] Data (t): 0.560 Batch (t): 2.612, 1572.99/s, 786.494/s/gpu LR: 0.000067 Logit Scale: 18.194 Contrastive_loss: 2.7893 (3.3252) Loss: 2.7893 (3.3252)
|
| 489 |
+
2024-09-07,02:54:47 | INFO | Start epoch 4
|
| 490 |
+
2024-09-07,02:54:59 | INFO | Train Epoch: 4 [ 4096/2572288 (0%)] Data (t): 9.426 Batch (t): 11.475, 356.950/s, 178.475/s/gpu LR: 0.000067 Logit Scale: 18.195 Contrastive_loss: 2.1587 (2.1587) Loss: 2.1587 (2.1587)
|
| 491 |
+
2024-09-07,02:59:19 | INFO | Train Epoch: 4 [ 413696/2572288 (16%)] Data (t): 0.543 Batch (t): 2.604, 1570.96/s, 785.480/s/gpu LR: 0.000048 Logit Scale: 18.291 Contrastive_loss: 3.3741 (2.7664) Loss: 3.3741 (2.7664)
|
| 492 |
+
2024-09-07,03:03:40 | INFO | Train Epoch: 4 [ 823296/2572288 (32%)] Data (t): 0.560 Batch (t): 2.613, 1563.73/s, 781.867/s/gpu LR: 0.000032 Logit Scale: 18.364 Contrastive_loss: 3.4906 (3.0078) Loss: 3.4906 (3.0078)
|
| 493 |
+
2024-09-07,03:08:02 | INFO | Train Epoch: 4 [1232896/2572288 (48%)] Data (t): 0.562 Batch (t): 2.615, 1566.58/s, 783.290/s/gpu LR: 0.000019 Logit Scale: 18.406 Contrastive_loss: 2.0411 (2.7662) Loss: 2.0411 (2.7662)
|
| 494 |
+
2024-09-07,03:12:23 | INFO | Train Epoch: 4 [1642496/2572288 (64%)] Data (t): 0.559 Batch (t): 2.613, 1569.87/s, 784.935/s/gpu LR: 0.000009 Logit Scale: 18.431 Contrastive_loss: 2.7578 (2.7645) Loss: 2.7578 (2.7645)
|
| 495 |
+
2024-09-07,03:16:44 | INFO | Train Epoch: 4 [2052096/2572288 (80%)] Data (t): 0.557 Batch (t): 2.613, 1566.22/s, 783.109/s/gpu LR: 0.000003 Logit Scale: 18.441 Contrastive_loss: 2.2815 (2.6840) Loss: 2.2815 (2.6840)
|
| 496 |
+
2024-09-07,03:21:06 | INFO | Train Epoch: 4 [2461696/2572288 (96%)] Data (t): 0.559 Batch (t): 2.614, 1570.17/s, 785.084/s/gpu LR: 0.000000 Logit Scale: 18.443 Contrastive_loss: 2.7785 (2.6975) Loss: 2.7785 (2.6975)
|
| 497 |
+
2024-09-07,03:22:16 | INFO | Train Epoch: 4 [2572288/2572288 (100%)] Data (t): 0.555 Batch (t): 2.609, 1577.35/s, 788.676/s/gpu LR: 0.000000 Logit Scale: 18.444 Contrastive_loss: 1.7127 (2.5744) Loss: 1.7127 (2.5744)
|
minipile_style_trained/minipile_style/params.txt
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
accum_freq: 1
|
| 2 |
+
aug_cfg: {}
|
| 3 |
+
batch_size: 2048
|
| 4 |
+
beta1: 0.9
|
| 5 |
+
beta2: 0.98
|
| 6 |
+
checkpoint_path: /home/minipile_style_trained/minipile_style/checkpoints
|
| 7 |
+
coca_caption_loss_weight: 2.0
|
| 8 |
+
coca_contrastive_loss_weight: 1.0
|
| 9 |
+
copy_codebase: False
|
| 10 |
+
csv_caption_key: title
|
| 11 |
+
csv_img_key: filepath
|
| 12 |
+
csv_separator:
|
| 13 |
+
dataset_resampled: True
|
| 14 |
+
dataset_type: webdataset
|
| 15 |
+
ddp_static_graph: True
|
| 16 |
+
debug: False
|
| 17 |
+
delete_previous_checkpoint: False
|
| 18 |
+
device: cuda:0
|
| 19 |
+
dist_backend: nccl
|
| 20 |
+
dist_url: env://
|
| 21 |
+
distill: False
|
| 22 |
+
distill_model: None
|
| 23 |
+
distill_pretrained: None
|
| 24 |
+
distributed: True
|
| 25 |
+
epochs: 5
|
| 26 |
+
epochs_cooldown: None
|
| 27 |
+
eps: 1e-06
|
| 28 |
+
force_custom_text: False
|
| 29 |
+
force_image_size: None
|
| 30 |
+
force_patch_dropout: None
|
| 31 |
+
force_quick_gelu: False
|
| 32 |
+
gather_with_grad: True
|
| 33 |
+
grad_checkpointing: True
|
| 34 |
+
grad_clip_norm: None
|
| 35 |
+
horovod: False
|
| 36 |
+
image_mean: None
|
| 37 |
+
image_std: None
|
| 38 |
+
imagenet_v2: None
|
| 39 |
+
imagenet_val: None
|
| 40 |
+
local_loss: True
|
| 41 |
+
local_rank: 0
|
| 42 |
+
lock_image: False
|
| 43 |
+
lock_image_freeze_bn_stats: False
|
| 44 |
+
lock_image_unlocked_groups: 0
|
| 45 |
+
lock_text: False
|
| 46 |
+
lock_text_freeze_layer_norm: False
|
| 47 |
+
lock_text_unlocked_layers: 0
|
| 48 |
+
log_every_n_steps: 100
|
| 49 |
+
log_level: 20
|
| 50 |
+
log_local: False
|
| 51 |
+
log_path: /home/minipile_style_trained/minipile_style/out.log
|
| 52 |
+
logs: /home/minipile_style_trained
|
| 53 |
+
lr: 0.0005
|
| 54 |
+
lr_cooldown_end: 0.0
|
| 55 |
+
lr_cooldown_power: 1.0
|
| 56 |
+
lr_scheduler: cosine
|
| 57 |
+
model: ViT-B-32
|
| 58 |
+
name: minipile_style
|
| 59 |
+
no_set_device_rank: False
|
| 60 |
+
precision: amp
|
| 61 |
+
pretrained:
|
| 62 |
+
pretrained_image: False
|
| 63 |
+
rank: 0
|
| 64 |
+
remote_sync: None
|
| 65 |
+
remote_sync_frequency: 300
|
| 66 |
+
remote_sync_protocol: s3
|
| 67 |
+
report_to: wandb
|
| 68 |
+
resume: None
|
| 69 |
+
save_frequency: 0
|
| 70 |
+
save_most_recent: True
|
| 71 |
+
seed: 0
|
| 72 |
+
skip_scheduler: False
|
| 73 |
+
tensorboard: False
|
| 74 |
+
tensorboard_path:
|
| 75 |
+
torchscript: False
|
| 76 |
+
trace: False
|
| 77 |
+
train_data: /home/minipile_style/{00000000..00000143}.tar
|
| 78 |
+
train_data_upsampling_factors: None
|
| 79 |
+
train_num_samples: 2560000
|
| 80 |
+
use_bn_sync: False
|
| 81 |
+
val_data: None
|
| 82 |
+
val_frequency: 1
|
| 83 |
+
val_num_samples: None
|
| 84 |
+
wandb: True
|
| 85 |
+
wandb_notes:
|
| 86 |
+
wandb_project_name: clip_text_hq_clusters
|
| 87 |
+
warmup: 500
|
| 88 |
+
wd: 0.2
|
| 89 |
+
workers: 4
|
| 90 |
+
world_size: 2
|
| 91 |
+
zeroshot_frequency: 2
|