hao9610 committed on Aug 6

Commit

6289af9

verified ·

1 Parent(s): a21008f

Upload folder using huggingface_hub

Browse files

Files changed (34) hide show

.gitattributes +2 -0
README.md +1 -1
s1_seg_finetune/xsam_sam_large_m2f_e36_gpu16_seg_finetune/pytorch_model.bin +3 -0
s2_align_pretrain/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_e1_gpu16_align_pretrain/pytorch_model.bin +3 -0
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/pytorch_model.bin +3 -0
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/added_tokens.json +13 -0
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/config.json +36 -0
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/generation_config.json +11 -0
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/pytorch_model-00001-of-00004.bin +3 -0
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/pytorch_model-00002-of-00004.bin +3 -0
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/pytorch_model-00003-of-00004.bin +3 -0
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/pytorch_model-00004-of-00004.bin +3 -0
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/pytorch_model.bin.index.json +202 -0
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/special_tokens_map.json +30 -0
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/tokenizer.json +0 -0
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/tokenizer.model +3 -0
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/tokenizer_config.json +132 -0
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/segmentor_encoder/config.json +33 -0
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/segmentor_encoder/preprocessor_config.json +44 -0
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/segmentor_encoder/pytorch_model.bin +3 -0
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/segmentor_projector/config.json +18 -0
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/segmentor_projector/configuration_projector.py +25 -0
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/segmentor_projector/modeling_projector.py +48 -0
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/segmentor_projector/pytorch_model.bin +3 -0
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/visual_encoder/config.json +19 -0
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/visual_encoder/preprocessor_config.json +24 -0
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/visual_encoder/pytorch_model.bin +3 -0
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/visual_projector/config.json +18 -0
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/visual_projector/configuration_projector.py +25 -0
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/visual_projector/modeling_projector.py +48 -0
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/visual_projector/pytorch_model.bin +3 -0
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/xtuner_config.py +703 -0
vgdseg_annotations/coco_vgdseg_train.json +3 -0
vgdseg_annotations/coco_vgdseg_val.json +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+vgdseg_annotations/coco_vgdseg_train.json filter=lfs diff=lfs merge=lfs -text
+vgdseg_annotations/coco_vgdseg_val.json filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -16,7 +16,7 @@ tags:
 <sup>1</sup> Sun Yat-sen University, <sup>2</sup> Peng Cheng Laboratory, <sup>3</sup> Meituan Inc.
-<sup>📧</sup> corresponding author.
 </div>
 <div align="center" style="display: flex; justify-content: center; align-items: center;">

 <sup>1</sup> Sun Yat-sen University, <sup>2</sup> Peng Cheng Laboratory, <sup>3</sup> Meituan Inc.
+<sup>📧</sup> Corresponding author
 </div>
 <div align="center" style="display: flex; justify-content: center; align-items: center;">

s1_seg_finetune/xsam_sam_large_m2f_e36_gpu16_seg_finetune/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c8f0ea6912951c1a31e409e331be35471953190f31f38efd519c10f02e9b11da
+size 679089406

s2_align_pretrain/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_e1_gpu16_align_pretrain/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:202d131ebb09b6d068c56c668bcb95780059021bbf5b506a13c3785261c66389
+size 70019002

s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f1036e611890bd8af8e73e368b97bddd3bcbfc8abdcbb257f05226ac4e8ff1ec
+size 9248986190

s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/added_tokens.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "<|assistant|>": 32001,
+  "<|endoftext|>": 32000,
+  "<|end|>": 32007,
+  "<|placeholder1|>": 32002,
+  "<|placeholder2|>": 32003,
+  "<|placeholder3|>": 32004,
+  "<|placeholder4|>": 32005,
+  "<|placeholder5|>": 32008,
+  "<|placeholder6|>": 32009,
+  "<|system|>": 32006,
+  "<|user|>": 32010
+}

s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/config.json ADDED Viewed

	@@ -0,0 +1,36 @@

+{
+  "_name_or_path": "Phi-3-mini-4k-instruct",
+  "architectures": [
+    "Phi3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_phi3.Phi3Config",
+    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
+  },
+  "bos_token_id": 1,
+  "embd_pdrop": 0.0,
+  "eos_token_id": 32000,
+  "hidden_act": "silu",
+  "hidden_size": 3072,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 4096,
+  "model_type": "phi3",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 32,
+  "original_max_position_embeddings": 4096,
+  "pad_token_id": 32000,
+  "resid_pdrop": 0.0,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "sliding_window": 2047,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float16",
+  "transformers_version": "4.48.0",
+  "use_cache": true,
+  "vocab_size": 32014
+}

s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/generation_config.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": [
+    32000,
+    32001,
+    32007
+  ],
+  "pad_token_id": 32000,
+  "transformers_version": "4.48.0"
+}

s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/pytorch_model-00001-of-00004.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f1398fdd95dd16597366a564b7fdef4f928faa12fe2ac4b80b7f4511502abbc0
+size 1958403558

s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/pytorch_model-00002-of-00004.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3ff085f4f962fe5ca14769b7d8a8235cfc2a58c80a2532da2ce31f21838141ac
+size 1937897316

s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/pytorch_model-00003-of-00004.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:69b499b2a723e8ed2bdfb0213f75d5c2ece2ddb2830b26f8856b4dfe74968138
+size 1981937508

s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/pytorch_model-00004-of-00004.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ac56ea2ec260c0f149361d5ffd1eced953727eee36b94094b47b376e606f249e
+size 1763373552

s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/pytorch_model.bin.index.json ADDED Viewed

	@@ -0,0 +1,202 @@

+{
+  "metadata": {
+    "total_size": 7641544704
+  },
+  "weight_map": {
+    "lm_head.weight": "pytorch_model-00004-of-00004.bin",
+    "model.embed_tokens.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.0.mlp.gate_up_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.0.self_attn.qkv_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.1.mlp.down_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.1.mlp.gate_up_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.1.self_attn.qkv_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.10.input_layernorm.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.10.mlp.down_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.10.mlp.gate_up_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.10.self_attn.qkv_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.11.input_layernorm.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.11.mlp.down_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.11.mlp.gate_up_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.11.self_attn.qkv_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.12.input_layernorm.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.12.mlp.down_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.12.mlp.gate_up_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.12.self_attn.qkv_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.13.input_layernorm.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.13.mlp.down_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.13.mlp.gate_up_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.13.self_attn.qkv_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.14.input_layernorm.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.14.mlp.down_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.14.mlp.gate_up_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.14.self_attn.qkv_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.15.input_layernorm.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.15.mlp.down_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.15.mlp.gate_up_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.15.self_attn.qkv_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.16.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.16.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.16.mlp.gate_up_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.16.self_attn.qkv_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.17.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.17.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.17.mlp.gate_up_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.17.self_attn.qkv_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.18.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.18.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.18.mlp.gate_up_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.18.self_attn.qkv_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.19.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.19.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.19.mlp.gate_up_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.19.self_attn.qkv_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.2.mlp.down_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.2.mlp.gate_up_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.2.self_attn.qkv_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.20.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.20.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.20.mlp.gate_up_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.20.self_attn.qkv_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.21.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.21.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.21.mlp.gate_up_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.21.self_attn.qkv_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.22.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.22.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.22.mlp.gate_up_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.22.self_attn.qkv_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.23.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.23.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.23.mlp.gate_up_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.23.self_attn.qkv_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.24.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.24.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.24.mlp.gate_up_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.24.self_attn.qkv_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.25.input_layernorm.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.25.mlp.down_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.25.mlp.gate_up_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "model.layers.25.self_attn.qkv_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.26.input_layernorm.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.26.mlp.down_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.26.mlp.gate_up_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.26.self_attn.qkv_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.27.input_layernorm.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.27.mlp.down_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.27.mlp.gate_up_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.27.self_attn.qkv_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.28.input_layernorm.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.28.mlp.down_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.28.mlp.gate_up_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.28.self_attn.qkv_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.29.input_layernorm.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.29.mlp.down_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.29.mlp.gate_up_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.29.self_attn.qkv_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.3.mlp.down_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.3.mlp.gate_up_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.3.self_attn.qkv_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.30.input_layernorm.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.30.mlp.down_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.30.mlp.gate_up_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.30.self_attn.qkv_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.31.input_layernorm.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.31.mlp.down_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.31.mlp.gate_up_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.31.self_attn.qkv_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "model.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.4.mlp.down_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.4.mlp.gate_up_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.4.self_attn.qkv_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.5.mlp.down_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.5.mlp.gate_up_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.5.self_attn.qkv_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.6.mlp.down_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.6.mlp.gate_up_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.6.self_attn.qkv_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.7.input_layernorm.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.7.mlp.down_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.7.mlp.gate_up_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.7.self_attn.qkv_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "model.layers.8.input_layernorm.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.8.mlp.down_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.8.mlp.gate_up_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.8.self_attn.qkv_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.9.input_layernorm.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.9.mlp.down_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.9.mlp.gate_up_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "model.layers.9.self_attn.qkv_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "model.norm.weight": "pytorch_model-00004-of-00004.bin"
+  }
+}

s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,132 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": false
+    },
+    "32000": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32001": {
+      "content": "<|assistant|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32002": {
+      "content": "<|placeholder1|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32003": {
+      "content": "<|placeholder2|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32004": {
+      "content": "<|placeholder3|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32005": {
+      "content": "<|placeholder4|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32006": {
+      "content": "<|system|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "<|end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "<|placeholder5|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "<|placeholder6|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "<|user|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "legacy": false,
+  "model_max_length": 4096,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/segmentor_encoder/config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "_name_or_path": "sam-vit-large",
+  "architectures": [
+    "XSegmentor"
+  ],
+  "initializer_range": 0.02,
+  "mask_decoder_config": {
+    "model_type": ""
+  },
+  "model_type": "sam",
+  "prompt_encoder_config": {
+    "model_type": ""
+  },
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.48.0",
+  "vision_config": {
+    "dropout": 0.0,
+    "global_attn_indexes": [
+      5,
+      11,
+      17,
+      23
+    ],
+    "hidden_size": 1024,
+    "initializer_factor": 1.0,
+    "intermediate_size": 6144,
+    "mlp_dim": 4096,
+    "model_type": "",
+    "num_attention_heads": 16,
+    "num_hidden_layers": 24,
+    "projection_dim": 512
+  }
+}

s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/segmentor_encoder/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,44 @@

+{
+  "crop_size": {
+    "height": 1024,
+    "width": 1024
+  },
+  "do_convert_rgb": true,
+  "do_crop": false,
+  "do_flip": false,
+  "do_normalize": true,
+  "do_pad": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "flip_direction": "horizontal",
+  "flip_ratio": 0.5,
+  "ignore_index": 0,
+  "image_mean": [
+    0.485,
+    0.456,
+    0.406
+  ],
+  "image_processor_type": "SamImageProcessor",
+  "image_std": [
+    0.229,
+    0.224,
+    0.225
+  ],
+  "mask_pad_size": {
+    "height": 1024,
+    "width": 1024
+  },
+  "mask_size": {
+    "longest_edge": 1024
+  },
+  "pad_size": {
+    "height": 1024,
+    "width": 1024
+  },
+  "processor_class": "SamProcessor",
+  "resample": 2,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "longest_edge": 1024
+  }
+}

s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/segmentor_encoder/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:73ee1f35874aba42b79cf385e9e7f8bbbf619e3bb8f3ad27955c41cbf3e8dcb3
+size 616667758

s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/segmentor_projector/config.json ADDED Viewed

	@@ -0,0 +1,18 @@

+{
+  "architectures": [
+    "DynamicProjectorModel"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_projector.DynamicProjectorConfig",
+    "AutoModel": "modeling_projector.DynamicProjectorModel"
+  },
+  "bias": true,
+  "depth": 2,
+  "downsample_ratio": 0.5,
+  "hidden_act": "gelu",
+  "llm_hidden_size": 3072,
+  "model_type": "dynamic_projector",
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.48.0",
+  "visual_hidden_size": 1024
+}

s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/segmentor_projector/configuration_projector.py ADDED Viewed

	@@ -0,0 +1,25 @@

+# Copyright (c) OpenMMLab. All rights reserved.
+from transformers import PretrainedConfig
+class DynamicProjectorConfig(PretrainedConfig):
+    model_type = "dynamic_projector"
+    _auto_class = "AutoConfig"
+    def __init__(
+        self,
+        visual_hidden_size=4096,
+        llm_hidden_size=4096,
+        downsample_ratio=1.0,
+        depth=2,
+        hidden_act="gelu",
+        bias=True,
+        **kwargs,
+    ):
+        self.visual_hidden_size = visual_hidden_size
+        self.llm_hidden_size = llm_hidden_size
+        self.downsample_ratio = downsample_ratio
+        self.depth = depth
+        self.hidden_act = hidden_act
+        self.bias = bias
+        super().__init__(**kwargs)

s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/segmentor_projector/modeling_projector.py ADDED Viewed

	@@ -0,0 +1,48 @@

+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+from transformers import PreTrainedModel
+from transformers.activations import ACT2FN
+from xsam.model.utils import pixel_shuffle
+from .configuration_projector import DynamicProjectorConfig
+class DynamicProjectorModel(PreTrainedModel):
+    _auto_class = "AutoModel"
+    config_class = DynamicProjectorConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["model"]
+    def __init__(self, config: DynamicProjectorConfig) -> None:
+        super().__init__(config)
+        self.gradient_checkpointing = False
+        visual_hidden_size = config.visual_hidden_size * int(1 / config.downsample_ratio) ** 2
+        modules = [
+            nn.Linear(visual_hidden_size, config.llm_hidden_size, bias=config.bias),
+        ]
+        for _ in range(1, config.depth):
+            modules.append(ACT2FN[config.hidden_act])
+            modules.append(nn.Linear(config.llm_hidden_size, config.llm_hidden_size, bias=config.bias))
+        self.model = nn.Sequential(*modules)
+    def enable_input_require_grads(self):
+        def make_inputs_require_grad(module, input, output):
+            output.requires_grad_(True)
+        self.model.register_forward_hook(make_inputs_require_grad)
+    def forward(self, x):
+        if x.ndim == 4:
+            if self.config.downsample_ratio != 1:
+                x = pixel_shuffle(x, self.config.downsample_ratio)
+            x = x.view(x.shape[0], -1, x.shape[-1])
+        if self.gradient_checkpointing and self.training:
+            layer_outputs = self._gradient_checkpointing_func(self.model, x)
+        else:
+            layer_outputs = self.model(x)
+        return layer_outputs

s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/segmentor_projector/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:51a2a66f2a0cd1b54c160916a628821da09f366256b7d5c9f73a05b261c9f71e
+size 44054528

s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/visual_encoder/config.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "_name_or_path": "siglip2-so400m-patch14-384",
+  "architectures": [
+    "SiglipVisionModel"
+  ],
+  "attention_dropout": 0.0,
+  "hidden_act": "gelu_pytorch_tanh",
+  "hidden_size": 1152,
+  "image_size": 384,
+  "intermediate_size": 4304,
+  "layer_norm_eps": 1e-06,
+  "model_type": "siglip_vision_model",
+  "num_attention_heads": 16,
+  "num_channels": 3,
+  "num_hidden_layers": 27,
+  "patch_size": 14,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.48.0"
+}

s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/visual_encoder/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "do_convert_rgb": null,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_processor_type": "SiglipImageProcessor",
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "processor_class": "SiglipProcessor",
+  "resample": 2,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 384,
+    "width": 384
+  }
+}

s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/visual_encoder/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7d20f0e3b88fb7a553165f32ec37684da2d51f36e87ded7420d7ea3375b015e3
+size 856600842

s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/visual_projector/config.json ADDED Viewed

	@@ -0,0 +1,18 @@

+{
+  "architectures": [
+    "DynamicProjectorModel"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_projector.DynamicProjectorConfig",
+    "AutoModel": "modeling_projector.DynamicProjectorModel"
+  },
+  "bias": true,
+  "depth": 2,
+  "downsample_ratio": 1.0,
+  "hidden_act": "gelu",
+  "llm_hidden_size": 3072,
+  "model_type": "dynamic_projector",
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.48.0",
+  "visual_hidden_size": 1152
+}

s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/visual_projector/configuration_projector.py ADDED Viewed

	@@ -0,0 +1,25 @@

+# Copyright (c) OpenMMLab. All rights reserved.
+from transformers import PretrainedConfig
+class DynamicProjectorConfig(PretrainedConfig):
+    model_type = "dynamic_projector"
+    _auto_class = "AutoConfig"
+    def __init__(
+        self,
+        visual_hidden_size=4096,
+        llm_hidden_size=4096,
+        downsample_ratio=1.0,
+        depth=2,
+        hidden_act="gelu",
+        bias=True,
+        **kwargs,
+    ):
+        self.visual_hidden_size = visual_hidden_size
+        self.llm_hidden_size = llm_hidden_size
+        self.downsample_ratio = downsample_ratio
+        self.depth = depth
+        self.hidden_act = hidden_act
+        self.bias = bias
+        super().__init__(**kwargs)

s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/visual_projector/modeling_projector.py ADDED Viewed

	@@ -0,0 +1,48 @@

+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+from transformers import PreTrainedModel
+from transformers.activations import ACT2FN
+from xsam.model.utils import pixel_shuffle
+from .configuration_projector import DynamicProjectorConfig
+class DynamicProjectorModel(PreTrainedModel):
+    _auto_class = "AutoModel"
+    config_class = DynamicProjectorConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["model"]
+    def __init__(self, config: DynamicProjectorConfig) -> None:
+        super().__init__(config)
+        self.gradient_checkpointing = False
+        visual_hidden_size = config.visual_hidden_size * int(1 / config.downsample_ratio) ** 2
+        modules = [
+            nn.Linear(visual_hidden_size, config.llm_hidden_size, bias=config.bias),
+        ]
+        for _ in range(1, config.depth):
+            modules.append(ACT2FN[config.hidden_act])
+            modules.append(nn.Linear(config.llm_hidden_size, config.llm_hidden_size, bias=config.bias))
+        self.model = nn.Sequential(*modules)
+    def enable_input_require_grads(self):
+        def make_inputs_require_grad(module, input, output):
+            output.requires_grad_(True)
+        self.model.register_forward_hook(make_inputs_require_grad)
+    def forward(self, x):
+        if x.ndim == 4:
+            if self.config.downsample_ratio != 1:
+                x = pixel_shuffle(x, self.config.downsample_ratio)
+            x = x.view(x.shape[0], -1, x.shape[-1])
+        if self.gradient_checkpointing and self.training:
+            layer_outputs = self._gradient_checkpointing_func(self.model, x)
+        else:
+            layer_outputs = self.model(x)
+        return layer_outputs

s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/visual_projector/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:509d21776557ab6566a4b2163e29df9de3299e5c7af9b2e906f6fdf447d91795
+size 25966592

s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/xtuner_config.py ADDED Viewed

	@@ -0,0 +1,703 @@

+# Copyright (c) OpenMMLab. All rights reserved.
+from copy import deepcopy
+from os import getenv
+import torch
+from mmengine.hooks import CheckpointHook, DistSamplerSeedHook, IterTimerHook, LoggerHook, ParamSchedulerHook
+from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
+from torch.optim import AdamW
+from transformers import AutoModelForCausalLM, AutoTokenizer, SiglipImageProcessor, SiglipVisionModel
+from xsam.dataset import GenericSegDataset, VGDSegDataset
+from xsam.dataset.map_fns import dataset_map_fn_factory, generic_seg_map_fn, template_map_fn_factory, vgd_seg_map_fn
+from xsam.dataset.process_fns import (
+    gcg_seg_postprocess_fn,
+    generic_seg_postprocess_fn,
+    inter_seg_postprocess_fn,
+    process_map_fn_factory,
+    reason_seg_postprocess_fn,
+    refer_seg_postprocess_fn,
+    vgd_seg_postprocess_fn,
+)
+from xsam.dataset.processors import SamImageProcessor
+from xsam.engine.hooks import DatasetInfoHook, EvaluateChatHook, ModelInfoHook, PTCheckpointHook
+from xsam.engine.runners import TrainLoop
+from xsam.evaluation.evaluators import GenericSegEvaluator, VGDSegEvaluator
+from xsam.model import XSamModel
+from xsam.model.segmentors import XSegmentor
+from xsam.model.segmentors.mask2former import Mask2FormerConfig, Mask2FormerModel
+from xsam.model.segmentors.sam import SamModel
+from xsam.utils.visualizer import Visualizer
+from xtuner.utils import PROMPT_TEMPLATE
+#######################################################################
+#                          PART 1  Settings                           #
+#######################################################################
+# Directories
+code_dir = getenv("CODE_DIR", "./xsam/")
+data_dir = getenv("DATA_DIR", "./datas/")
+init_dir = getenv("INIT_DIR", "./inits/")
+work_dir = getenv("WORK_DIR", "./wkdrs/")
+# Model
+llm_name_or_path = init_dir + "Phi-3-mini-4k-instruct"
+visual_encoder_name_or_path = init_dir + "siglip2-so400m-patch14-384"
+seg_encoder_name_or_path = init_dir + "sam-vit-large"
+seg_decoder_name_or_path = init_dir + "mask2former-swin-large-coco-panoptic"
+# Specify the pretrained pth
+s1_pretrained_pth = work_dir + "s1_seg_finetune/xsam_sam_large_m2f_e36_gpu16_seg_finetune/pytorch_model.bin"
+s2_pretrained_pth = (
+    work_dir
+    + "s2_align_pretrain/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_e1_gpu16_align_pretrain/pytorch_model.bin"
+)  # noqa: E501
+# Prompt
+prompt_template = PROMPT_TEMPLATE.phi3_chat
+max_length = int(4096 - (384 / 14) ** 2 - 1024)
+# Scheduler & Optimizer
+batch_size = 4  # per_device
+accumulative_counts = 1
+dataloader_num_workers = 4
+max_epochs = 1
+optim_type = AdamW
+lr = 4e-5
+betas = (0.9, 0.999)
+weight_decay = 0.05
+max_norm = 1  # grad clip
+warmup_ratio = 0.03
+# Save
+save_steps = 2000
+save_total_limit = 2  # Maximum checkpoints to keep (-1 means unlimited)
+# Logging
+logging_interval = 10
+# Evaluate the generation performance during the training
+evaluation_freq = 2000
+SYSTEM = ""
+evaluation_images = [
+    code_dir + "xsam/configs/xsam/images/llava_imgconv.jpg",
+    code_dir + "xsam/configs/xsam/images/panoptic_genseg.jpg",
+    code_dir + "xsam/configs/xsam/images/refcoco_refseg.jpg",
+    code_dir + "xsam/configs/xsam/images/lisa_reaseg.jpg",
+    code_dir + "xsam/configs/xsam/images/refcocog_gcgseg.jpg",
+    code_dir + "xsam/configs/xsam/images/coco_interseg.jpg",
+    code_dir + "xsam/configs/xsam/images/coco_interseg.jpg",
+    code_dir + "xsam/configs/xsam/images/coco_interseg.jpg",
+    code_dir + "xsam/configs/xsam/images/coco_interseg.jpg",
+    code_dir + "xsam/configs/xsam/images/coco_vgdseg.jpg",
+    code_dir + "xsam/configs/xsam/images/coco_vgdseg.jpg",
+    code_dir + "xsam/configs/xsam/images/coco_vgdseg.jpg",
+    code_dir + "xsam/configs/xsam/images/coco_vgdseg.jpg",
+    code_dir + "xsam/configs/xsam/images/coco_vgdseg.jpg",
+]
+evaluation_inputs = [
+    "Can you describe this image in detail? Please elaborate in your response.",
+    "Can you generate segmentation masks for this image based on the specified categories: <p>person</p>, <p>bicycle</p>, <p>car</p>, <p>motorcycle</p>, <p>airplane</p>, <p>bus</p>, <p>train</p>, <p>truck</p>, <p>boat</p>, <p>traffic light</p>, <p>fire hydrant</p>, <p>stop sign</p>, <p>parking meter</p>, <p>bench</p>, <p>bird</p>, <p>cat</p>, <p>dog</p>, <p>horse</p>, <p>sheep</p>, <p>cow</p>, <p>elephant</p>, <p>bear</p>, <p>zebra</p>, <p>giraffe</p>, <p>backpack</p>, <p>umbrella</p>, <p>handbag</p>, <p>tie</p>, <p>suitcase</p>, <p>frisbee</p>, <p>skis</p>, <p>snowboard</p>, <p>sports ball</p>, <p>kite</p>, <p>baseball bat</p>, <p>baseball glove</p>, <p>skateboard</p>, <p>surfboard</p>, <p>tennis racket</p>, <p>bottle</p>, <p>wine glass</p>, <p>cup</p>, <p>fork</p>, <p>knife</p>, <p>spoon</p>, <p>bowl</p>, <p>banana</p>, <p>apple</p>, <p>sandwich</p>, <p>orange</p>, <p>broccoli</p>, <p>carrot</p>, <p>hot dog</p>, <p>pizza</p>, <p>donut</p>, <p>cake</p>, <p>chair</p>, <p>couch</p>, <p>potted plant</p>, <p>bed</p>, <p>dining table</p>, <p>toilet</p>, <p>tv</p>, <p>laptop</p>, <p>mouse</p>, <p>remote</p>, <p>keyboard</p>, <p>cell phone</p>, <p>microwave</p>, <p>oven</p>, <p>toaster</p>, <p>sink</p>, <p>refrigerator</p>, <p>book</p>, <p>clock</p>, <p>vase</p>, <p>scissors</p>, <p>teddy bear</p>, <p>hair drier</p>, <p>toothbrush</p>, <p>banner</p>, <p>blanket</p>, <p>bridge</p>, <p>cardboard</p>, <p>counter</p>, <p>curtain</p>, <p>door</p>, <p>floor wood</p>, <p>flower</p>, <p>fruit</p>, <p>gravel</p>, <p>house</p>, <p>light</p>, <p>mirror</p>, <p>net</p>, <p>pillow</p>, <p>platform</p>, <p>playingfield</p>, <p>railroad</p>, <p>river</p>, <p>road</p>, <p>roof</p>, <p>sand</p>, <p>sea</p>, <p>shelf</p>, <p>snow</p>, <p>stairs</p>, <p>tent</p>, <p>towel</p>, <p>wall brick</p>, <p>wall stone</p>, <p>wall tile</p>, <p>wall wood</p>, <p>water</p>, <p>window blind</p>, <p>window</p>, <p>tree</p>, <p>fence</p>, <p>ceiling</p>, <p>sky</p>, <p>cabinet</p>, <p>table</p>, <p>floor</p>, <p>pavement</p>, <p>mountain</p>, <p>grass</p>, <p>dirt</p>, <p>paper</p>, <p>food</p>, <p>building</p>, <p>rock</p>, <p>wall</p>, <p>rug</p>? Please output the segmentation mask.",
+    "Can you segment <p>the women with red coat</p> in this image? Please output the corresponding segmentation mask.",
+    "<p>when enjoying an ice cream sundae, what can we use to scoop up the whipped cream and place it on top of the ice cream?</p> Please output the corresponding segmentation mask.",
+    "Can you provide a brief description of the this image? Respond with interleaved segmentation masks for the corresponding phrases.",
+    "Can you segment the <p><region></p> in this image? Please output the corresponding segmentation mask.",
+    "Can you segment the <p><region></p> in this image? Please output the corresponding segmentation mask.",
+    "Can you segment the <p><region></p> in this image? Please output the corresponding segmentation mask.",
+    "Can you segment the <p><region></p> in this image? Please output the corresponding segmentation mask.",
+    "Can you segment the image based on the following regions: <p><region></p>? Please output the segmentation mask.",
+    "Can you segment the image based on the following regions: <p><region></p>? Please output the segmentation mask.",
+    "Can you segment the image based on the following regions: <p><region></p>? Please output the segmentation mask.",
+    "Can you segment the image based on the following regions: <p><region></p>, <p><region></p>? Please output the segmentation mask.",
+    "Can you segment the image based on the following regions: <p><region></p>, <p><region></p>? Please output the segmentation mask.",
+]
+vprompt_masks = [
+    (None,),
+    (None,),
+    (None,),
+    (None,),
+    (None,),
+    (code_dir + "xsam/configs/xsam/images/vprompt_masks/coco_interseg_point0.png",),
+    (code_dir + "xsam/configs/xsam/images/vprompt_masks/coco_interseg_scribble1.png",),
+    (code_dir + "xsam/configs/xsam/images/vprompt_masks/coco_interseg_box0.png",),
+    (code_dir + "xsam/configs/xsam/images/vprompt_masks/coco_interseg_mask1.png",),
+    (code_dir + "xsam/configs/xsam/images/vprompt_masks/coco_vgdseg_point0.png",),
+    (code_dir + "xsam/configs/xsam/images/vprompt_masks/coco_vgdseg_scribble1.png",),
+    (code_dir + "xsam/configs/xsam/images/vprompt_masks/coco_vgdseg_box0.png",),
+    (
+        code_dir + "xsam/configs/xsam/images/vprompt_masks/coco_vgdseg_point0.png",
+        code_dir + "xsam/configs/xsam/images/vprompt_masks/coco_vgdseg_scribble1.png",
+    ),
+    (
+        code_dir + "xsam/configs/xsam/images/vprompt_masks/coco_vgdseg_box0.png",
+        code_dir + "xsam/configs/xsam/images/vprompt_masks/coco_vgdseg_point1.png",
+    ),
+]
+#######################################################################
+#            PART 2  Model & Tokenizer & Image Processor              #
+#######################################################################
+# TODO: add special tokens via import from xsam.utils
+special_tokens = ["<SEG>", "<p>", "</p>"]
+cond_type = "phrase"  # "phrase" "cls" "all"
+ignore_label = 255
+tokenizer = dict(
+    type=AutoTokenizer.from_pretrained,
+    pretrained_model_name_or_path=llm_name_or_path,
+    trust_remote_code=True,
+    padding_side="right",
+)
+image_processor = dict(
+    type=SiglipImageProcessor.from_pretrained,
+    pretrained_model_name_or_path=visual_encoder_name_or_path,
+    trust_remote_code=True,
+)
+extra_image_processor = dict(
+    type=SamImageProcessor.from_pretrained,
+    pretrained_model_name_or_path=seg_encoder_name_or_path,
+    trust_remote_code=True,
+    ignore_index=0,
+)
+model = dict(
+    type=XSamModel,
+    freeze_llm=False,
+    freeze_visual_encoder=False,
+    freeze_segmentor_encoder=False,
+    use_dual_encoder=True,
+    use_vision_sampler=True,
+    connector_type="conv",
+    cond_type=cond_type,
+    seg_select_layers=[6, 12, 18, 24],
+    connector_hidden_dim=512,
+    connector_scale_factor=[4, 2, 1, 0.5],
+    sampler_input_feat="seg_pixel_values",
+    special_tokens=special_tokens,
+    s1_pretrained_pth=s1_pretrained_pth,
+    s2_pretrained_pth=s2_pretrained_pth,
+    tokenizer=tokenizer,
+    postprocess_fn=generic_seg_postprocess_fn,
+    llm=dict(
+        type=AutoModelForCausalLM.from_pretrained,
+        pretrained_model_name_or_path=llm_name_or_path,
+        trust_remote_code=False,
+        torch_dtype=torch.bfloat16,
+        attn_implementation="flash_attention_2",
+    ),
+    visual_encoder=dict(
+        type=SiglipVisionModel.from_pretrained,
+        pretrained_model_name_or_path=visual_encoder_name_or_path,
+        torch_dtype=torch.bfloat16,
+    ),
+    segmentor=dict(
+        type=XSegmentor,
+        encoder=dict(
+            type=SamModel.from_pretrained,
+            pretrained_model_name_or_path=seg_encoder_name_or_path,
+            trust_remote_code=True,
+            torch_dtype=torch.bfloat16,
+        ),
+        decoder=dict(
+            type=Mask2FormerModel._from_config,
+            config=dict(
+                type=Mask2FormerConfig.from_pretrained,
+                pretrained_model_name_or_path=seg_decoder_name_or_path,
+                use_backbone=False,
+                feature_channels=[512, 1024, 2048],
+                num_feature_levels=3,
+                trust_remote_code=True,
+            ),
+            torch_dtype=torch.bfloat16,
+        ),
+        torch_dtype=torch.bfloat16,
+        reinit_decoder=True,
+        open_cls=True,
+    ),
+)
+#######################################################################
+#                      PART 3  Dataset & Dataloader                   #
+#######################################################################
+imgconv_data_root = data_dir + "llava_data/"
+genseg_data_root = data_dir + "generic_seg_data/"
+ovseg_data_root = data_dir + "ov_seg_data/"
+refseg_data_root = data_dir + "refer_seg_data/"
+reaseg_data_root = data_dir + "reason_seg_data/"
+gcgseg_data_root = data_dir + "gcg_seg_data/"
+vgdseg_data_root = data_dir + "vgd_seg_data/"
+interseg_data_root = data_dir + "inter_seg_data/"
+pannoptic_genseg_dataset = dict(
+    type=GenericSegDataset,
+    data_path=genseg_data_root + "coco/annotations/panoptic_train2017.json",
+    image_folder=genseg_data_root + "coco/train2017",
+    panseg_map_folder=genseg_data_root + "coco/panoptic_train2017",
+    tokenizer=tokenizer,
+    task_name="genseg",
+    data_name="panoptic_genseg",
+    cond_type=cond_type,
+    special_tokens=special_tokens,
+    extra_image_processor=extra_image_processor,
+    image_processor=image_processor,
+    dataset_map_fn=dict(
+        type=dataset_map_fn_factory,
+        fn=generic_seg_map_fn,
+        cond_type=cond_type,
+    ),
+    template_map_fn=dict(type=template_map_fn_factory, template=prompt_template),
+    max_length=max_length,
+    use_variant_cat=True,
+    pad_image_to_square=True,
+)
+coco_vgdseg_dataset = dict(
+    type=VGDSegDataset,
+    source_data_path=vgdseg_data_root + "coco/annotations/instances_train2017.json",
+    data_path=vgdseg_data_root + "annotations/coco_vgdseg_train.json",
+    image_folder=vgdseg_data_root + "coco/train2017",
+    tokenizer=tokenizer,
+    data_mode="train",
+    task_name="vgdseg",
+    data_name="coco_vgdseg",
+    cond_type=cond_type,
+    special_tokens=special_tokens,
+    extra_image_processor=extra_image_processor,
+    image_processor=image_processor,
+    dataset_map_fn=dict(
+        type=dataset_map_fn_factory,
+        fn=vgd_seg_map_fn,
+        cond_type=cond_type,
+    ),
+    template_map_fn=dict(type=template_map_fn_factory, template=prompt_template),
+    use_negative_sample=True,
+    sample_num=5,
+    max_length=max_length,
+    pad_image_to_square=True,
+)
+# False for predict mode, True for tensor mode
+output_ids_with_output = True
+val_datasets = [
+    dict(
+        type=GenericSegDataset,
+        data_path=genseg_data_root + "coco/annotations/panoptic_val2017.json",
+        image_folder=genseg_data_root + "coco/val2017",
+        panseg_map_folder=genseg_data_root + "coco/panoptic_val2017",
+        semseg_map_folder=genseg_data_root + "coco/panoptic_semseg_val2017",
+        data_mode="eval",
+        tokenizer=tokenizer,
+        task_name="genseg",
+        data_name="panoptic_genseg",
+        cond_type=cond_type,
+        special_tokens=special_tokens,
+        extra_image_processor=extra_image_processor,
+        image_processor=image_processor,
+        output_ids_with_output=output_ids_with_output,
+        postprocess_fn=dict(
+            type=process_map_fn_factory,
+            fn=generic_seg_postprocess_fn,
+            task_name="panoptic_genseg",
+            threshold=0.0,
+        ),
+        dataset_map_fn=dict(
+            type=dataset_map_fn_factory,
+            fn=generic_seg_map_fn,
+            cond_type=cond_type,
+        ),
+        template_map_fn=dict(
+            type=template_map_fn_factory,
+            template=prompt_template,
+            output_suffix=output_ids_with_output,
+        ),
+        max_length=max_length,
+        pad_image_to_square=True,
+    ),
+    dict(
+        type=GenericSegDataset,
+        data_path=genseg_data_root + "coco/annotations/panoptic_val2017.json",
+        image_folder=genseg_data_root + "coco/val2017",
+        panseg_map_folder=genseg_data_root + "coco/panoptic_val2017",
+        semseg_map_folder=genseg_data_root + "coco/panoptic_semseg_val2017",
+        data_mode="eval",
+        tokenizer=tokenizer,
+        task_name="genseg",
+        data_name="panoptic_genseg",
+        output_ids_with_output=output_ids_with_output,
+        cond_type=cond_type,
+        special_tokens=special_tokens,
+        image_processor=image_processor,
+        extra_image_processor=extra_image_processor,
+        dataset_map_fn=dict(
+            type=dataset_map_fn_factory,
+            fn=generic_seg_map_fn,
+            cond_type=cond_type,
+        ),
+        postprocess_fn=dict(
+            type=process_map_fn_factory,
+            fn=generic_seg_postprocess_fn,
+            task_name="semantic_genseg",
+        ),
+        template_map_fn=dict(
+            type=template_map_fn_factory,
+            template=prompt_template,
+            output_suffix=output_ids_with_output,
+        ),
+        max_length=max_length,
+        pad_image_to_square=True,
+    ),
+    dict(
+        type=GenericSegDataset,
+        data_path=genseg_data_root + "coco/annotations/instances_val2017.json",
+        image_folder=genseg_data_root + "coco/val2017",
+        task_name="genseg",
+        data_name="instance_genseg",
+        data_mode="eval",
+        tokenizer=tokenizer,
+        output_ids_with_output=output_ids_with_output,
+        cond_type=cond_type,
+        special_tokens=special_tokens,
+        image_processor=image_processor,
+        extra_image_processor=extra_image_processor,
+        postprocess_fn=dict(
+            type=process_map_fn_factory,
+            fn=generic_seg_postprocess_fn,
+            task_name="instance_genseg",
+            threshold=0.0,
+        ),
+        dataset_map_fn=dict(
+            type=dataset_map_fn_factory,
+            fn=generic_seg_map_fn,
+            cond_type=cond_type,
+        ),
+        template_map_fn=dict(
+            type=template_map_fn_factory,
+            template=prompt_template,
+            output_suffix=output_ids_with_output,
+        ),
+        max_length=max_length,
+        pad_image_to_square=True,
+    ),
+    dict(
+        type=VGDSegDataset,
+        source_data_path=vgdseg_data_root + "coco/annotations/instances_val2017.json",
+        data_path=vgdseg_data_root + "annotations/coco_vgdseg_val.json",
+        image_folder=vgdseg_data_root + "coco/val2017",
+        tokenizer=tokenizer,
+        task_name="vgdseg",
+        data_name="coco_vgdseg_point",
+        data_mode="eval",
+        visual_prompt_type="point_visual_prompt",
+        output_ids_with_output=output_ids_with_output,
+        cond_type=cond_type,
+        special_tokens=special_tokens,
+        extra_image_processor=extra_image_processor,
+        image_processor=image_processor,
+        postprocess_fn=dict(
+            type=process_map_fn_factory,
+            fn=vgd_seg_postprocess_fn,
+            threshold=0.0,
+            return_contiguous_labels=True,
+        ),
+        dataset_map_fn=dict(
+            type=dataset_map_fn_factory,
+            fn=vgd_seg_map_fn,
+            cond_type=cond_type,
+        ),
+        template_map_fn=dict(
+            type=template_map_fn_factory, template=prompt_template, output_suffix=output_ids_with_output
+        ),
+        use_negative_sample=False,
+        sample_num=5,
+        max_length=max_length,
+        pad_image_to_square=True,
+    ),
+    dict(
+        type=VGDSegDataset,
+        source_data_path=vgdseg_data_root + "coco/annotations/instances_val2017.json",
+        data_path=vgdseg_data_root + "annotations/coco_vgdseg_val.json",
+        image_folder=vgdseg_data_root + "coco/val2017",
+        tokenizer=tokenizer,
+        task_name="vgdseg",
+        data_name="coco_vgdseg_scribble",
+        data_mode="eval",
+        visual_prompt_type="scribble_visual_prompt",
+        output_ids_with_output=output_ids_with_output,
+        cond_type=cond_type,
+        special_tokens=special_tokens,
+        extra_image_processor=extra_image_processor,
+        image_processor=image_processor,
+        postprocess_fn=dict(
+            type=process_map_fn_factory,
+            fn=vgd_seg_postprocess_fn,
+            threshold=0.0,
+            return_contiguous_labels=True,
+        ),
+        dataset_map_fn=dict(
+            type=dataset_map_fn_factory,
+            fn=vgd_seg_map_fn,
+            cond_type=cond_type,
+        ),
+        template_map_fn=dict(
+            type=template_map_fn_factory, template=prompt_template, output_suffix=output_ids_with_output
+        ),
+        use_negative_sample=False,
+        sample_num=5,
+        max_length=max_length,
+        pad_image_to_square=True,
+    ),
+    dict(
+        type=VGDSegDataset,
+        source_data_path=vgdseg_data_root + "coco/annotations/instances_val2017.json",
+        data_path=vgdseg_data_root + "annotations/coco_vgdseg_val.json",
+        image_folder=vgdseg_data_root + "coco/val2017",
+        tokenizer=tokenizer,
+        task_name="vgdseg",
+        data_name="coco_vgdseg_box",
+        data_mode="eval",
+        visual_prompt_type="box_visual_prompt",
+        output_ids_with_output=output_ids_with_output,
+        cond_type=cond_type,
+        special_tokens=special_tokens,
+        extra_image_processor=extra_image_processor,
+        image_processor=image_processor,
+        postprocess_fn=dict(
+            type=process_map_fn_factory,
+            fn=vgd_seg_postprocess_fn,
+            threshold=0.0,
+            return_contiguous_labels=True,
+        ),
+        dataset_map_fn=dict(
+            type=dataset_map_fn_factory,
+            fn=vgd_seg_map_fn,
+            cond_type=cond_type,
+        ),
+        template_map_fn=dict(
+            type=template_map_fn_factory, template=prompt_template, output_suffix=output_ids_with_output
+        ),
+        use_negative_sample=False,
+        sample_num=5,
+        max_length=max_length,
+        pad_image_to_square=True,
+    ),
+    dict(
+        type=VGDSegDataset,
+        source_data_path=vgdseg_data_root + "coco/annotations/instances_val2017.json",
+        data_path=vgdseg_data_root + "annotations/coco_vgdseg_val.json",
+        image_folder=vgdseg_data_root + "coco/val2017",
+        tokenizer=tokenizer,
+        task_name="vgdseg",
+        data_name="coco_vgdseg_mask",
+        data_mode="eval",
+        visual_prompt_type="mask_visual_prompt",
+        output_ids_with_output=output_ids_with_output,
+        cond_type=cond_type,
+        special_tokens=special_tokens,
+        extra_image_processor=extra_image_processor,
+        image_processor=image_processor,
+        postprocess_fn=dict(
+            type=process_map_fn_factory,
+            fn=vgd_seg_postprocess_fn,
+            threshold=0.0,
+            return_contiguous_labels=True,
+        ),
+        dataset_map_fn=dict(
+            type=dataset_map_fn_factory,
+            fn=vgd_seg_map_fn,
+            cond_type=cond_type,
+        ),
+        template_map_fn=dict(
+            type=template_map_fn_factory, template=prompt_template, output_suffix=output_ids_with_output
+        ),
+        use_negative_sample=False,
+        sample_num=5,
+        max_length=max_length,
+        pad_image_to_square=True,
+    ),
+]
+val_evaluators = [
+    dict(
+        type=GenericSegEvaluator,
+        distributed=True,
+        data_name="panoptic_genseg",
+    ),
+    dict(
+        type=GenericSegEvaluator,
+        data_name="semantic_genseg",
+        distributed=True,
+    ),
+    dict(
+        type=GenericSegEvaluator,
+        data_name="instance_genseg",
+        distributed=True,
+    ),
+    dict(
+        type=VGDSegEvaluator,
+        data_name="coco_vgdseg_point",
+        distributed=True,
+    ),
+    dict(
+        type=VGDSegEvaluator,
+        data_name="coco_vgdseg_scribble",
+        distributed=True,
+    ),
+    dict(
+        type=VGDSegEvaluator,
+        data_name="coco_vgdseg_box",
+        distributed=True,
+    ),
+    dict(
+        type=VGDSegEvaluator,
+        data_name="coco_vgdseg_mask",
+        distributed=True,
+    ),
+]
+vis_datasets = val_datasets
+vis_datasets = deepcopy(val_datasets)
+for dataset in vis_datasets:
+    if dataset["task_name"] in ["genseg", "ovseg", "vgdseg", "interseg"]:
+        dataset["postprocess_fn"]["threshold"] = 0.5  # type: ignore
+#######################################################################
+#                    PART 4  Scheduler & Optimizer                    #
+#######################################################################
+# optimizer
+optim_wrapper = dict(
+    type=AmpOptimWrapper,
+    optimizer=dict(type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
+    clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
+    accumulative_counts=accumulative_counts,
+    loss_scale="dynamic",
+    dtype="float16",
+    paramwise_cfg=dict(
+        custom_keys={
+            "segmentor.encoder": dict(lr_mult=0.1, decay_mult=1.0),
+            "visual_encoder": dict(lr_mult=0.1, decay_mult=1.0),
+        },
+    ),
+)
+# learning policy
+# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md  # noqa: E501
+param_scheduler = [
+    dict(
+        type=LinearLR,
+        start_factor=1e-5,
+        by_epoch=True,
+        begin=0,
+        end=warmup_ratio * max_epochs,
+        convert_to_iter_based=True,
+    ),
+    dict(
+        type=CosineAnnealingLR,
+        eta_min=0.0,
+        by_epoch=True,
+        begin=warmup_ratio * max_epochs,
+        end=max_epochs,
+        convert_to_iter_based=True,
+    ),
+]
+# train, val, test setting
+train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)
+#######################################################################
+#                           PART 5  Runtime                           #
+#######################################################################
+# set visualizer
+visualizer = dict(
+    type=Visualizer,
+    scale=1.0,
+    font_size_scale=1.0,
+)
+# Log the dialogue periodically during the training process, optional
+custom_hooks = [
+    dict(
+        type=ModelInfoHook,
+        module_names=["llm", "visual_encoder", "projector", "connector", "segmentor"],
+        display_params=True,
+    ),
+    dict(type=DatasetInfoHook, tokenizer=tokenizer, special_tokens=special_tokens),
+    dict(
+        type=EvaluateChatHook,
+        tokenizer=tokenizer,
+        special_tokens=special_tokens,
+        image_processor=image_processor,
+        postprocess_fns=[
+            None,
+            generic_seg_postprocess_fn,
+            refer_seg_postprocess_fn,
+            reason_seg_postprocess_fn,
+            gcg_seg_postprocess_fn,
+            inter_seg_postprocess_fn,
+            inter_seg_postprocess_fn,
+            inter_seg_postprocess_fn,
+            inter_seg_postprocess_fn,
+            vgd_seg_postprocess_fn,
+            vgd_seg_postprocess_fn,
+            vgd_seg_postprocess_fn,
+            vgd_seg_postprocess_fn,
+            vgd_seg_postprocess_fn,
+        ],
+        extra_image_processor=extra_image_processor,
+        visualizer=visualizer,
+        every_n_iters=evaluation_freq,
+        evaluation_inputs=evaluation_inputs,
+        evaluation_images=evaluation_images,
+        vprompt_masks=vprompt_masks,
+        system=SYSTEM,
+        prompt_template=prompt_template,
+    ),
+    dict(type=PTCheckpointHook),
+]
+# configure default hooks
+default_hooks = dict(
+    # record the time of every iteration.
+    timer=dict(type=IterTimerHook),
+    # print log every 10 iterations.
+    logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=logging_interval),
+    # enable the parameter scheduler.
+    param_scheduler=dict(type=ParamSchedulerHook),
+    # save checkpoint per `save_steps`.
+    checkpoint=dict(
+        type=CheckpointHook,
+        by_epoch=False,
+        interval=save_steps,
+        max_keep_ckpts=save_total_limit,
+    ),
+    # set sampler seed in distributed environment.
+    sampler_seed=dict(type=DistSamplerSeedHook),
+)
+# configure environment
+env_cfg = dict(
+    # whether to enable cudnn benchmark
+    cudnn_benchmark=False,
+    # set multi process parameters
+    mp_cfg=dict(mp_start_method="fork", opencv_num_threads=0),
+    # set distributed parameters
+    dist_cfg=dict(backend="nccl"),
+)
+# set log level
+log_level = "INFO"
+# load from which checkpoint
+load_from = None
+# whether to resume training from the loaded checkpoint
+resume = False
+# Defaults to use random seed and disable `deterministic`
+randomness = dict(seed=None, deterministic=False)
+# set log processor
+log_processor = dict(
+    by_epoch=False,
+    window_size=1,
+    mean_pattern=r".*(loss|time|data_time|grad_norm|tflops).*",
+)

vgdseg_annotations/coco_vgdseg_train.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2cd3675dff40773835bb8bcc0af2a33855f5bda6e15f873320a5667147934a92
+size 1388731793

vgdseg_annotations/coco_vgdseg_val.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:39089126330dc2e72fd03f472e37ffab6273ce605b9c6415a4e6edd53a645f21
+size 58943447