Update configs to the new config format used by the transformers and sglang libraries

Files changed (3) hide show

README.md +6 -8
config.json +36 -32
generation_config.json +1 -1

README.md CHANGED Viewed

@@ -4,7 +4,7 @@ language:
 - en
 - zh
 base_model:
-- THUDM/GLM-4-9B-0414
 pipeline_tag: image-text-to-text
 library_name: transformers
 tags:
@@ -14,13 +14,11 @@ tags:
 # GLM-4.1V-9B-Base
 <div align="center">
-<img src=https://raw.githubusercontent.com/THUDM/GLM-4.1V-Thinking/99c5eb6563236f0ff43605d91d107544da9863b2/resources/logo.svg width="40%"/>
 </div>
 <p align="center">
     📖 View the GLM-4.1V-9B-Thinking <a href="https://arxiv.org/abs/2507.01006" target="_blank">paper</a>.
     <br>
-    💡 Try the <a href="https://huggingface.co/spaces/THUDM/GLM-4.1V-9B-Thinking-Demo" target="_blank">Hugging Face</a> or <a href="https://modelscope.cn/studios/ZhipuAI/GLM-4.1V-9B-Thinking-Demo" target="_blank">ModelScope</a> online demo for GLM-4.1V-9B-Thinking.
-    <br>
     📍 Use the GLM-4.1V-9B-Thinking API on the <a href="https://www.bigmodel.cn/dev/api/visual-reasoning-model/GLM-4.1V-Thinking">Zhipu Foundation Model Open Platform</a>.
 </p>
@@ -31,14 +29,14 @@ increasingly complex, VLMs must evolve beyond basic multimodal perception to enh
 complex tasks. This involves improving accuracy, comprehensiveness, and intelligence, enabling applications such as
 complex problem solving, long-context understanding, and multimodal agents.
-Based on the [GLM-4-9B-0414](https://github.com/THUDM/GLM-4) foundation model, we present the new open-source VLM model
 **GLM-4.1V-9B-Thinking**, designed to explore the upper limits of reasoning in vision-language models. By introducing
 a "thinking paradigm" and leveraging reinforcement learning, the model significantly enhances its capabilities. It
 achieves state-of-the-art performance among 10B-parameter VLMs, matching or even surpassing the 72B-parameter
 Qwen-2.5-VL-72B on 18 benchmark tasks. We are also open-sourcing the base model GLM-4.1V-9B-Base to
 support further research into the boundaries of VLM capabilities.
-![rl](https://raw.githubusercontent.com/THUDM/GLM-4.1V-Thinking/refs/heads/main/resources/rl.jpeg)
 Compared to the previous generation models CogVLM2 and the GLM-4V series, **GLM-4.1V-Thinking** offers the
 following improvements:
@@ -56,7 +54,7 @@ richness, and interpretability. It comprehensively surpasses traditional non-rea
 Out of 28 benchmark tasks, it achieved the best performance among 10B-level models on 23 tasks,
 and even outperformed the 72B-parameter Qwen-2.5-VL-72B on 18 tasks.
-![bench](https://raw.githubusercontent.com/THUDM/GLM-4.1V-Thinking/refs/heads/main/resources/bench.jpeg)
-For video reasoning, web demo deployment, and more code, please check our [GitHub](https://github.com/THUDM/GLM-4.1V-Thinking).

 - en
 - zh
 base_model:
+- zai-org/GLM-4-9B-0414
 pipeline_tag: image-text-to-text
 library_name: transformers
 tags:
 # GLM-4.1V-9B-Base
 <div align="center">
+<img src=https://raw.githubusercontent.com/zai-org/GLM-4.1V-Thinking/99c5eb6563236f0ff43605d91d107544da9863b2/resources/logo.svg width="40%"/>
 </div>
 <p align="center">
     📖 View the GLM-4.1V-9B-Thinking <a href="https://arxiv.org/abs/2507.01006" target="_blank">paper</a>.
     <br>
     📍 Use the GLM-4.1V-9B-Thinking API on the <a href="https://www.bigmodel.cn/dev/api/visual-reasoning-model/GLM-4.1V-Thinking">Zhipu Foundation Model Open Platform</a>.
 </p>
 complex tasks. This involves improving accuracy, comprehensiveness, and intelligence, enabling applications such as
 complex problem solving, long-context understanding, and multimodal agents.
+Based on the [GLM-4-9B-0414](https://github.com/zai-org/GLM-4) foundation model, we present the new open-source VLM model
 **GLM-4.1V-9B-Thinking**, designed to explore the upper limits of reasoning in vision-language models. By introducing
 a "thinking paradigm" and leveraging reinforcement learning, the model significantly enhances its capabilities. It
 achieves state-of-the-art performance among 10B-parameter VLMs, matching or even surpassing the 72B-parameter
 Qwen-2.5-VL-72B on 18 benchmark tasks. We are also open-sourcing the base model GLM-4.1V-9B-Base to
 support further research into the boundaries of VLM capabilities.
+![rl](https://raw.githubusercontent.com/zai-org/GLM-4.1V-Thinking/refs/heads/main/resources/rl.jpeg)
 Compared to the previous generation models CogVLM2 and the GLM-4V series, **GLM-4.1V-Thinking** offers the
 following improvements:
 Out of 28 benchmark tasks, it achieved the best performance among 10B-level models on 23 tasks,
 and even outperformed the 72B-parameter Qwen-2.5-VL-72B on 18 tasks.
+![bench](https://raw.githubusercontent.com/zai-org/GLM-4.1V-Thinking/refs/heads/main/resources/bench.jpeg)
+For video reasoning, web demo deployment, and more code, please check our [GitHub](https://github.com/zai-org/GLM-V).

config.json CHANGED Viewed

@@ -3,38 +3,50 @@
     "Glm4vForConditionalGeneration"
   ],
   "model_type": "glm4v",
-  "attention_bias": true,
-  "attention_dropout": 0.0,
-  "pad_token_id": 151329,
-  "eos_token_id": [
-    151329,
-    151336,
-    151338,
-    151348
-  ],
   "image_start_token_id": 151339,
   "image_end_token_id": 151340,
   "video_start_token_id": 151341,
   "video_end_token_id": 151342,
   "image_token_id": 151343,
   "video_token_id": 151344,
-  "hidden_act": "silu",
-  "hidden_size": 4096,
-  "initializer_range": 0.02,
-  "intermediate_size": 13696,
-  "max_position_embeddings": 65536,
-  "num_attention_heads": 32,
-  "num_hidden_layers": 40,
-  "num_key_value_heads": 2,
-  "rms_norm_eps": 1e-05,
-  "rope_theta": 10000.0,
   "tie_word_embeddings": false,
-  "torch_dtype": "bfloat16",
-  "transformers_version": "4.53.0dev",
-  "use_cache": true,
-  "vocab_size": 151552,
-  "partial_rotary_factor": 0.5,
   "vision_config": {
     "hidden_size": 1536,
     "depth": 24,
     "num_heads": 12,
@@ -49,13 +61,5 @@
     "rms_norm_eps": 1e-05,
     "spatial_merge_size": 2,
     "temporal_patch_size": 2
-  },
-  "rope_scaling": {
-    "type": "default",
-    "mrope_section": [
-      8,
-      12,
-      12
-    ]
   }
 }

     "Glm4vForConditionalGeneration"
   ],
   "model_type": "glm4v",
   "image_start_token_id": 151339,
   "image_end_token_id": 151340,
   "video_start_token_id": 151341,
   "video_end_token_id": 151342,
   "image_token_id": 151343,
   "video_token_id": 151344,
   "tie_word_embeddings": false,
+  "transformers_version": "4.57.1",
+  "text_config": {
+    "model_type": "glm4v_text",
+    "attention_bias": true,
+    "attention_dropout": 0.0,
+    "pad_token_id": 151329,
+    "eos_token_id": [
+      151329,
+      151336,
+      151338,
+      151348
+    ],
+    "hidden_act": "silu",
+    "hidden_size": 4096,
+    "initializer_range": 0.02,
+    "intermediate_size": 13696,
+    "max_position_embeddings": 65536,
+    "num_attention_heads": 32,
+    "num_hidden_layers": 40,
+    "num_key_value_heads": 2,
+    "rms_norm_eps": 1e-05,
+    "dtype": "bfloat16",
+    "use_cache": true,
+    "vocab_size": 151552,
+    "partial_rotary_factor": 0.5,
+    "rope_theta": 10000,
+    "rope_scaling": {
+      "rope_type": "default",
+      "mrope_section": [
+        8,
+        12,
+        12
+      ]
+    }
+  },
   "vision_config": {
+    "model_type": "glm4v",
     "hidden_size": 1536,
     "depth": 24,
     "num_heads": 12,
     "rms_norm_eps": 1e-05,
     "spatial_merge_size": 2,
     "temporal_patch_size": 2
   }
 }

generation_config.json CHANGED Viewed

@@ -11,5 +11,5 @@
   "top_p": 0.6,
   "temperature": 0.8,
   "top_k": 2,
-  "transformers_version": "4.53.0dev"
 }

   "top_p": 0.6,
   "temperature": 0.8,
   "top_k": 2,
+  "transformers_version": "4.57.1"
 }