Add vision_config
#2 by pcuenq (HF Staff)
- opened
- config.json +62 -1
config.json
CHANGED
|
@@ -82,7 +82,7 @@
|
|
| 82 |
"mm_vision_select_feature": "patch",
|
| 83 |
"mm_vision_select_layer": -2,
|
| 84 |
"mm_vision_tower": "mobileclip_l_1024",
|
| 85 |
-
"model_type": "llava_qwen2",
|
| 86 |
"no_repeat_ngram_size": 0,
|
| 87 |
"num_attention_heads": 14,
|
| 88 |
"num_beam_groups": 1,
|
|
@@ -126,5 +126,66 @@
|
|
| 126 |
"use_cache": true,
|
| 127 |
"use_mm_proj": true,
|
| 128 |
"use_sliding_window": false,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
"vocab_size": 151936
|
| 130 |
}
|
|
|
|
| 82 |
"mm_vision_select_feature": "patch",
|
| 83 |
"mm_vision_select_layer": -2,
|
| 84 |
"mm_vision_tower": "mobileclip_l_1024",
|
| 85 |
+
"model_type": "fastvlm",
|
| 86 |
"no_repeat_ngram_size": 0,
|
| 87 |
"num_attention_heads": 14,
|
| 88 |
"num_beam_groups": 1,
|
|
|
|
| 126 |
"use_cache": true,
|
| 127 |
"use_mm_proj": true,
|
| 128 |
"use_sliding_window": false,
|
| 129 |
+
"vision_config": {
|
| 130 |
+
"cls_ratio": 2.0,
|
| 131 |
+
"down_patch_size": 7,
|
| 132 |
+
"down_stride": 2,
|
| 133 |
+
"downsamples": [
|
| 134 |
+
true,
|
| 135 |
+
true,
|
| 136 |
+
true,
|
| 137 |
+
true,
|
| 138 |
+
true
|
| 139 |
+
],
|
| 140 |
+
"embed_dims": [
|
| 141 |
+
96,
|
| 142 |
+
192,
|
| 143 |
+
384,
|
| 144 |
+
768,
|
| 145 |
+
1536
|
| 146 |
+
],
|
| 147 |
+
"hidden_size": 1024,
|
| 148 |
+
"image_size": 1024,
|
| 149 |
+
"intermediate_size": 3072,
|
| 150 |
+
"layer_scale_init_value": 1e-05,
|
| 151 |
+
"layers": [
|
| 152 |
+
2,
|
| 153 |
+
12,
|
| 154 |
+
24,
|
| 155 |
+
4,
|
| 156 |
+
2
|
| 157 |
+
],
|
| 158 |
+
"mlp_ratios": [
|
| 159 |
+
4,
|
| 160 |
+
4,
|
| 161 |
+
4,
|
| 162 |
+
4,
|
| 163 |
+
4
|
| 164 |
+
],
|
| 165 |
+
"num_classes": 1000,
|
| 166 |
+
"patch_size": 64,
|
| 167 |
+
"pos_embs_shapes": [
|
| 168 |
+
null,
|
| 169 |
+
null,
|
| 170 |
+
null,
|
| 171 |
+
[
|
| 172 |
+
7,
|
| 173 |
+
7
|
| 174 |
+
],
|
| 175 |
+
[
|
| 176 |
+
7,
|
| 177 |
+
7
|
| 178 |
+
]
|
| 179 |
+
],
|
| 180 |
+
"projection_dim": 768,
|
| 181 |
+
"repmixer_kernel_size": 3,
|
| 182 |
+
"token_mixers": [
|
| 183 |
+
"repmixer",
|
| 184 |
+
"repmixer",
|
| 185 |
+
"repmixer",
|
| 186 |
+
"attention",
|
| 187 |
+
"attention"
|
| 188 |
+
]
|
| 189 |
+
},
|
| 190 |
"vocab_size": 151936
|
| 191 |
}
|