Fix: AttributeError when `input_ids` is None during multimodal LLM training
When training a multimodal language model such as MiniGPT-4, the model is driven by `inputs_embeds` rather than `input_ids`: the multimodal embeddings are aligned to the LLM's text embedding space and concatenated with the text embeddings, so `input_ids` is never built and is passed as `None`.
This leads to the following error:
```
AttributeError: 'NoneType' object has no attribute 'shape'
```
This commit handles the `input_ids is None` case: `get_masks` now derives `batch_size`, `seq_length`, and the device from `padding_mask` when no token ids are given, and `ChatGLMModel.forward` falls back to `inputs_embeds.shape[:2]`, so the model can process the provided `inputs_embeds` without relying on `input_ids`.
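For context, a minimal sketch of the call path that hits this code during multimodal training. The checkpoint path, tensor sizes, and the random tensors standing in for the projected vision features and the embedded text prompt are placeholders, not part of this repository; the relevant part is the `input_ids=None` / `inputs_embeds` / `attention_mask` combination.

```python
import torch
from transformers import AutoModel

# Placeholder checkpoint path: any ChatGLM checkpoint that ships this modeling_chatglm.py.
model = AutoModel.from_pretrained("path/to/chatglm", trust_remote_code=True)

batch_size, img_tokens, txt_tokens = 2, 32, 16
hidden = model.config.hidden_size

# Stand-ins for the real multimodal pipeline: projected vision features
# (vision encoder + projection into the LLM's text space) and the embedded
# text prompt. Shapes are [batch, seq, hidden].
image_embeds = torch.randn(batch_size, img_tokens, hidden, dtype=model.dtype)
text_embeds = torch.randn(batch_size, txt_tokens, hidden, dtype=model.dtype)

# The visual prefix has no token ids, so the concatenated sequence is passed
# purely as embeddings and input_ids stays None.
inputs_embeds = torch.cat([image_embeds, text_embeds], dim=1)
attention_mask = torch.ones(batch_size, img_tokens + txt_tokens, dtype=torch.long)

# Before this fix, `batch_size, seq_length = input_ids.shape` raised
# AttributeError: 'NoneType' object has no attribute 'shape'.
outputs = model(input_ids=None, inputs_embeds=inputs_embeds, attention_mask=attention_mask)
```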
- `modeling_chatglm.py` (+5, -4)
```diff
@@ -771,15 +771,16 @@ class ChatGLMPreTrainedModel(PreTrainedModel):
             if padding_mask is not None and not padding_mask.all():
                 return padding_mask
             return None
-        batch_size, seq_length = input_ids.shape
-        full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_ids.device)
+        batch_size, seq_length = input_ids.shape if input_ids is not None else padding_mask.shape
+        device = input_ids.device if input_ids is not None else padding_mask.device
+        full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=device)
         full_attention_mask.tril_()
         past_length = 0
         if past_key_values:
             past_length = past_key_values[0][0].shape[2]
         if past_length:
             full_attention_mask = torch.cat((torch.ones(batch_size, seq_length, past_length,
-                                                        device=input_ids.device), full_attention_mask), dim=-1)
+                                                        device=device), full_attention_mask), dim=-1)
         if padding_mask is not None:
             full_attention_mask = full_attention_mask * padding_mask.unsqueeze(1)
         if not past_length and padding_mask is not None:
@@ -872,7 +873,7 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
-        batch_size, seq_length = input_ids.shape
+        batch_size, seq_length = (input_ids.shape if input_ids is not None else inputs_embeds.shape[:2] if inputs_embeds is not None else (None, None))
 
         if inputs_embeds is None:
             inputs_embeds = self.embedding(input_ids)
```
