feat: support batch infer and optimize processor
Files changed:
- modeling_valley.py  +43 -7
- preprocessor_config.json  +0 -20
- processing_valley.py  +9 -4
modeling_valley.py  (CHANGED)

@@ -17,7 +17,7 @@ import numpy as np
 from torch import nn
 from torch.nn import CrossEntropyLoss
 from abc import ABC, abstractmethod
-from typing import List, Optional, Tuple, Union
+from typing import List, Optional, Tuple, Union, Dict, Any
 from transformers.modeling_outputs import CausalLMOutputWithPast
 from transformers import AutoConfig, AutoModelForCausalLM, Qwen2Config, Qwen2ForCausalLM, Qwen2Model
 
@@ -39,7 +39,7 @@ class ValleyMetaModel:
         else:
             self.vision_tower = build_vision_tower(config, delay_load=False)
         # Build Projector
-        if hasattr(config, "mm_projector_type"):
+        if hasattr(config, "mm_projector_type") and not getattr(config, "only_navit", False):
             self.mm_projector = build_vision_projector(config)
 
     def get_vision_tower(self):
@@ -114,6 +114,15 @@ class ValleyMetaForCausalLM(ABC):
 
         return image_features
 
+    def get_padding_method(self):
+        right_padding = getattr(self, 'right_padding', None)
+        # If the right_padding flag is set, ignore the training flag.
+        if right_padding is not None:
+            return 'right' if right_padding else 'left'
+        # Otherwise, use the training flag to determine the padding method.
+        method = 'right' if self.training else 'left'
+
+        return method
 
     def prepare_inputs_labels_for_multimodal(
         self, input_ids, position_ids, attention_mask, past_key_values, labels, images,
@@ -128,7 +137,6 @@ class ValleyMetaForCausalLM(ABC):
                     dtype=attention_mask.dtype,
                     device=attention_mask.device
                 )), dim=1)
-                position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
             return input_ids, position_ids, attention_mask, past_key_values, None, labels
 
         # Step1: Get image embedings
@@ -355,8 +363,7 @@ class ValleyMetaForCausalLM(ABC):
 
         for i, (cur_new_embed, cur_new_labels, cur_attention_mask) in enumerate(zip(new_input_embeds, new_labels, new_attention_mask)):
             cur_len = cur_new_embed.shape[0]
-
-            if not self.training and not getattr(self, "right_padding", None):
+            if self.get_padding_method() == 'left':
                 new_input_embeds_padded.append(torch.cat((
                     torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device),
                     cur_new_embed
@@ -366,7 +373,6 @@ class ValleyMetaForCausalLM(ABC):
                 new_attention_mask_padded[i, -cur_len:] = cur_attention_mask
                 position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
 
-            # Left padding while training
             else:
                 new_input_embeds_padded.append(torch.cat((
                     cur_new_embed,
@@ -404,6 +410,33 @@ class ValleyQwen2ForCausalLM(Qwen2ForCausalLM, ValleyMetaForCausalLM):
     def get_model(self):
         return self.model
 
+    def _update_model_kwargs_for_generation(
+        self,
+        outputs: CausalLMOutputWithPast,
+        model_kwargs: Dict[str, Any],
+        is_encoder_decoder: bool = False,
+        num_new_tokens: int = 1,
+    ) -> Dict[str, Any]:
+        new_model_kwargs = super()._update_model_kwargs_for_generation(
+            outputs,
+            model_kwargs,
+            is_encoder_decoder,
+            num_new_tokens
+        )
+        """
+        Set model_kwargs["attention_mask"] to the expanded `attention_mask` in
+        the `prepare_inputs_labels_for_multimodal` function to ensure the
+        correctness of the generate behavior when `use_cache` is enabled.
+        """
+        if not is_encoder_decoder:
+            if "attention_mask" in new_model_kwargs:
+                attention_mask = outputs.attention_mask
+                new_model_kwargs["attention_mask"] = torch.cat(
+                    [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
+                )
+        return new_model_kwargs
+
+
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -481,7 +514,7 @@ class ValleyQwen2ForCausalLM(Qwen2ForCausalLM, ValleyMetaForCausalLM):
             output = (logits,) + outputs[1:]
             return (loss,) + output if loss is not None else output
 
-        return CausalLMOutputWithPast(
+        res = CausalLMOutputWithPast(
             loss=loss,
             logits=logits,
             past_key_values=outputs.past_key_values,
@@ -489,6 +522,9 @@ class ValleyQwen2ForCausalLM(Qwen2ForCausalLM, ValleyMetaForCausalLM):
             attentions=outputs.attentions,
         )
 
+        res.attention_mask = attention_mask
+        return res
+
     def prepare_inputs_for_generation(
         self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
     ):
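The model-side changes make batched decoding work with left padding: at inference time get_padding_method() returns 'left', the expanded multimodal attention_mask is attached to the forward output, and _update_model_kwargs_for_generation carries that expanded mask forward, appending one column of ones per generated token. The sketch below is not part of the repository; it only illustrates, with made-up lengths, the padding and mask-growth conventions these hunks implement.

import torch

# Two samples whose multimodal embeddings expanded to different lengths.
lengths = [5, 7]
max_len = max(lengths)

# Left padding: pad positions on the left, real positions flush right,
# with position ids restarting at 0 on the first real position of each row.
attention_mask = torch.zeros(len(lengths), max_len, dtype=torch.long)
position_ids = torch.zeros(len(lengths), max_len, dtype=torch.long)
for i, cur_len in enumerate(lengths):
    attention_mask[i, -cur_len:] = 1
    position_ids[i, -cur_len:] = torch.arange(0, cur_len)

# During cached decoding, every new token appends a column of ones to the
# expanded mask, as in _update_model_kwargs_for_generation above.
for _ in range(3):
    attention_mask = torch.cat(
        [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
    )

print(attention_mask)
# tensor([[0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
#         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])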
preprocessor_config.json  (CHANGED)

@@ -2,25 +2,5 @@
   "processor_class": "ValleyProcessor",
   "auto_map": {
     "AutoProcessor": "processing_valley.ValleyProcessor"
-  },
-  "min_pixels": 1,
-  "qwen2vl_processor_config": {
-    "min_pixels": 3136,
-    "max_pixels": 12845056,
-    "patch_size": 14,
-    "temporal_patch_size": 2,
-    "merge_size": 2,
-    "image_mean": [
-      0.48145466,
-      0.4578275,
-      0.40821073
-    ],
-    "image_std": [
-      0.26862954,
-      0.26130258,
-      0.27577711
-    ],
-    "image_processor_type": "Qwen2VLImageProcessor",
-    "processor_class": "Qwen2VLProcessor"
   }
 }
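With the fixed qwen2vl_processor_config block removed, the image processor falls back to its own defaults, and max_pixels/min_pixels are only applied when passed explicitly to the processor (see the processing_valley.py change below). For reference, the removed pixel bounds are plain multiples of the 28x28 merged-patch area used by the Qwen2-VL image processor; the short check below is standalone arithmetic, not repository code.

# Standalone arithmetic check (not repository code).
patch_size, merge_size = 14, 2
unit = (patch_size * merge_size) ** 2      # 784 pixels per merged 28x28 patch
assert 3136 == 4 * unit                    # removed min_pixels = 4 merged patches
assert 12845056 == 16384 * unit            # removed max_pixels = 16384 merged patches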
processing_valley.py  (CHANGED)

@@ -88,10 +88,15 @@ class ValleyProcessor(ProcessorMixin):
         self.siglip_image_processor = SiglipImageProcessor.from_dict(siglip_processor_config)
         self.qwen2vl_image_processor = Qwen2VLImageProcessor.from_dict(
             qwen2vl_processor_config,
-            max_pixels=kwargs.get("max_pixels", 1280*28*28),
-            min_pixels=kwargs.get("min_pixels", 4*28*28)
         )
-
+
+        max_pixels = kwargs.get("max_pixels", None)
+        min_pixels = kwargs.get("min_pixels", None)
+        if max_pixels:
+            self.qwen2vl_image_processor.max_pixels = max_pixels
+        if min_pixels:
+            self.qwen2vl_image_processor.min_pixels = min_pixels
+
         self.anyres = kwargs.get("anyres", True)
         self.grid_pinpoints = kwargs.get("grid_pinpoints", "(1x1),...,(3x3)")
         self.only_crop_single_image = kwargs.get("only_crop_single_image", True)
@@ -259,7 +264,7 @@ class ValleyProcessor(ProcessorMixin):
         return input_ids
 
 
-    def __call__(self, messages, inference=True) -> BatchFeature:
+    def __call__(self, messages, inference=True, **kwargs) -> BatchFeature:
         # Deal with images
         if "images" not in messages or not messages["images"] or not messages["images"][0]:
             images = [self.black_img]
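__call__ now accepts extra keyword arguments, which fits the batch-inference goal of the commit: each sample can be run through the processor on its own and the resulting input_ids padded to a common length before generate, while the model left-pads the expanded multimodal embeddings itself (see get_padding_method above). A minimal collation sketch follows; it assumes each per-sample BatchFeature carries a single input_ids sequence and that pad_token_id comes from the tokenizer, so treat the names and shapes as illustrative rather than the repository's actual batching API.

import torch

def collate_input_ids(per_sample_features, pad_token_id):
    # per_sample_features: list of BatchFeature objects, one per processor call.
    seqs = [feat["input_ids"].reshape(-1) for feat in per_sample_features]
    max_len = max(seq.shape[0] for seq in seqs)
    input_ids, attention_mask = [], []
    for seq in seqs:
        pad = seq.new_full((max_len - seq.shape[0],), pad_token_id)
        input_ids.append(torch.cat([pad, seq]))                       # left padding
        attention_mask.append(torch.cat([torch.zeros_like(pad), torch.ones_like(seq)]))
    return torch.stack(input_ids), torch.stack(attention_mask)

The stacked tensors would then go to model.generate along with the per-sample image tensors; how images are batched depends on the rest of the processor output and is not covered by this sketch.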