Add supports_gradient_checkpointing
Browse files
- configuration_internvl_chat.py +2 -0
- modeling_intern_vit.py +1 -0
- modeling_internvl_chat.py +11 -0
    	
        configuration_internvl_chat.py
    CHANGED
    
    | @@ -64,6 +64,8 @@ class InternVLChatConfig(PretrainedConfig): | |
| 64 | 
             
                    self.ps_version = ps_version  # pixel shuffle version
         | 
| 65 | 
             
                    self.min_dynamic_patch = min_dynamic_patch
         | 
| 66 | 
             
                    self.max_dynamic_patch = max_dynamic_patch
         | 
|  | |
|  | |
| 67 |  | 
| 68 | 
             
                    logger.info(f'vision_select_layer: {self.select_layer}')
         | 
| 69 | 
             
                    logger.info(f'ps_version: {self.ps_version}')
         | 
|  | |
| 64 | 
             
                    self.ps_version = ps_version  # pixel shuffle version
         | 
| 65 | 
             
                    self.min_dynamic_patch = min_dynamic_patch
         | 
| 66 | 
             
                    self.max_dynamic_patch = max_dynamic_patch
         | 
| 67 | 
            +
                    # By default, we use tie_word_embeddings=False for models of all sizes.
         | 
| 68 | 
            +
                    self.tie_word_embeddings = self.llm_config.tie_word_embeddings
         | 
| 69 |  | 
| 70 | 
             
                    logger.info(f'vision_select_layer: {self.select_layer}')
         | 
| 71 | 
             
                    logger.info(f'ps_version: {self.ps_version}')
         | 
    	
        modeling_intern_vit.py
    CHANGED
    
    | @@ -364,6 +364,7 @@ class InternVisionEncoder(nn.Module): | |
| 364 | 
             
            class InternVisionModel(PreTrainedModel):
         | 
| 365 | 
             
                main_input_name = 'pixel_values'
         | 
| 366 | 
             
                _supports_flash_attn_2 = True
         | 
|  | |
| 367 | 
             
                config_class = InternVisionConfig
         | 
| 368 | 
             
                _no_split_modules = ['InternVisionEncoderLayer']
         | 
| 369 |  | 
|  | |
| 364 | 
             
            class InternVisionModel(PreTrainedModel):
         | 
| 365 | 
             
                main_input_name = 'pixel_values'
         | 
| 366 | 
             
                _supports_flash_attn_2 = True
         | 
| 367 | 
            +
                supports_gradient_checkpointing = True
         | 
| 368 | 
             
                config_class = InternVisionConfig
         | 
| 369 | 
             
                _no_split_modules = ['InternVisionEncoderLayer']
         | 
| 370 |  | 
    	
        modeling_internvl_chat.py
    CHANGED
    
    | @@ -38,6 +38,7 @@ class InternVLChatModel(PreTrainedModel): | |
| 38 | 
             
                main_input_name = 'pixel_values'
         | 
| 39 | 
             
                base_model_prefix = 'language_model'
         | 
| 40 | 
             
                _supports_flash_attn_2 = True
         | 
|  | |
| 41 | 
             
                _no_split_modules = ['InternVisionModel', 'LlamaDecoderLayer', 'InternLM2DecoderLayer']
         | 
| 42 |  | 
| 43 | 
             
                def __init__(self, config: InternVLChatConfig, vision_model=None, language_model=None, use_flash_attn=True):
         | 
| @@ -347,3 +348,13 @@ class InternVLChatModel(PreTrainedModel): | |
| 347 | 
             
                    )
         | 
| 348 |  | 
| 349 | 
             
                    return outputs
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 38 | 
             
                main_input_name = 'pixel_values'
         | 
| 39 | 
             
                base_model_prefix = 'language_model'
         | 
| 40 | 
             
                _supports_flash_attn_2 = True
         | 
| 41 | 
            +
                supports_gradient_checkpointing = True
         | 
| 42 | 
             
                _no_split_modules = ['InternVisionModel', 'LlamaDecoderLayer', 'InternLM2DecoderLayer']
         | 
| 43 |  | 
| 44 | 
             
                def __init__(self, config: InternVLChatConfig, vision_model=None, language_model=None, use_flash_attn=True):
         | 
|  | |
| 348 | 
             
                    )
         | 
| 349 |  | 
| 350 | 
             
                    return outputs
         | 
| 351 | 
            +
             | 
| 352 | 
            +
                @property
         | 
| 353 | 
            +
                def lm_head(self):
         | 
| 354 | 
            +
                    return self.language_model.get_output_embeddings()
         | 
| 355 | 
            +
             | 
| 356 | 
            +
                def get_input_embeddings(self):
         | 
| 357 | 
            +
                    return self.language_model.get_input_embeddings()
         | 
| 358 | 
            +
             | 
| 359 | 
            +
                def get_output_embeddings(self):
         | 
| 360 | 
            +
                    return self.language_model.get_output_embeddings()
         | 
