Upload modeling_ernie_45t_vl.py
#3
by liaojc · opened

modeling_ernie_45t_vl.py  +23 -24  CHANGED
@@ -27,6 +27,7 @@ import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from torch.distributed.tensor import DTensor
 from torch.nn.attention import SDPBackend, sdpa_kernel
 
 from transformers.activations import ACT2FN

@@ -2072,6 +2073,10 @@ class MOEAllGatherLayerV2(MOELayer):
         top_k = self.k
         num_expert_per_rank_per_modality = gate_logits_lm.shape[-1]
         group_size = gate_logits_lm.shape[-1] // top_k
+        if self.use_correction_bias and isinstance(self.moe_statics.e_score_correction_bias, DTensor):
+            correction_bias = self.moe_statics.e_score_correction_bias.to_local()
+        elif self.use_correction_bias:
+            correction_bias = self.moe_statics.e_score_correction_bias
         if self.group_experts:
             assert not self.use_correction_bias
             gate_logits_lm = gate_logits_lm.reshape(

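Context for the lines added in the hunk above: after sharded loading (for example with tensor parallelism or FSDP2), `moe_statics.e_score_correction_bias` can arrive as a `torch.distributed.tensor.DTensor`, and `.to_local()` unwraps it to the plain tensor shard owned by the current rank before it enters the gating math. Below is a minimal sketch of that unwrapping pattern, assuming only the public DTensor API; the helper name `to_local_if_dtensor` is illustrative and not part of this file.

```python
# Illustrative sketch, not code from this PR: unwrap a parameter that may be a
# DTensor into the local torch.Tensor shard before doing plain tensor math.
import torch

try:
    from torch.distributed.tensor import DTensor
except ImportError:  # older PyTorch builds do not expose the public DTensor module
    DTensor = None


def to_local_if_dtensor(t: torch.Tensor) -> torch.Tensor:
    # DTensor.to_local() returns the shard held by the current rank;
    # ordinary tensors pass through unchanged.
    if DTensor is not None and isinstance(t, DTensor):
        return t.to_local()
    return t
```

The explicit `isinstance` branch mirrors the diff, which presumably avoids mixing a `DTensor` with the plain local tensors used later in the gating computation.
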
@@ -3457,33 +3462,27 @@ class VisionAttention(nn.Module):
         k = apply_rotary_pos_emb_vision(k.unsqueeze(dim=0), rotary_pos_emb).squeeze(
             dim=0
         )
-
-        max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
-
-        attention_mask = torch.full(
-            [1, seq_length, seq_length],
-            torch.finfo(q.dtype).min,
-            device=q.device,
-            dtype=q.dtype,
-        )
-        for i in range(1, len(cu_seqlens)):
-            attention_mask[
-                ...,
-                cu_seqlens[i - 1] : cu_seqlens[i],
-                cu_seqlens[i - 1] : cu_seqlens[i],
-            ] = 0
-
+
         q = q.transpose(0, 1)
         k = k.transpose(0, 1)
         v = v.transpose(0, 1)
-
-
-
-
-
-
-        attn_output =
-
+
+        lengths = cu_seqlens[1:] - cu_seqlens[:-1]
+        splits = [
+            torch.split(tensor, lengths.tolist(), dim=1) for tensor in (q, k, v)
+        ]
+
+        attn_output = []
+        for q, k, v in zip(*splits):
+            attn_weights = torch.matmul(q, k.transpose(1, 2)) / math.sqrt(self.head_dim)
+            attn_weights = nn.functional.softmax(
+                attn_weights, dim=-1, dtype=torch.float32
+            ).to(q.dtype)
+            attn_output_splited = torch.matmul(attn_weights, v)
+            attn_output_splited = attn_output_splited.transpose(0, 1)
+            attn_output.append(attn_output_splited)
+        attn_output = torch.cat(attn_output, dim=0)
+        attn_output = attn_output.reshape(seq_length, -1).contiguous()
         attn_output = self.proj(attn_output)
         return attn_output

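The rewritten attention above expresses the same block-diagonal attention as the deleted mask-based code, computed segment by segment: rather than materialising a `(1, seq_length, seq_length)` additive mask filled with `torch.finfo(q.dtype).min` outside the per-image blocks, `q`, `k` and `v` are split at the `cu_seqlens` boundaries and ordinary softmax attention runs on each slice, so tokens only ever attend within their own segment. Below is a small self-contained sketch of the pattern with toy shapes; the function name `segmented_attention` and the concrete sizes are illustrative, not taken from the file, and shapes are assumed to be `(num_heads, seq_length, head_dim)` as in the diff.

```python
# Illustrative sketch, not code from this PR: block-diagonal attention computed
# by splitting q/k/v at cu_seqlens boundaries instead of building a full mask.
import math
import torch


def segmented_attention(q, k, v, cu_seqlens):
    # q, k, v: (num_heads, seq_length, head_dim); cu_seqlens: cumulative segment ends, e.g. [0, 4, 7]
    head_dim = q.shape[-1]
    lengths = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
    outputs = []
    # iterate over per-segment slices of q, k and v in lockstep
    for qi, ki, vi in zip(*(torch.split(t, lengths, dim=1) for t in (q, k, v))):
        scores = torch.matmul(qi, ki.transpose(1, 2)) / math.sqrt(head_dim)
        probs = torch.softmax(scores, dim=-1, dtype=torch.float32).to(qi.dtype)
        outputs.append(torch.matmul(probs, vi).transpose(0, 1))  # (seg_len, num_heads, head_dim)
    return torch.cat(outputs, dim=0)  # (seq_length, num_heads, head_dim)


# Toy check: two segments of lengths 4 and 3, with 2 heads and head_dim 8.
cu_seqlens = torch.tensor([0, 4, 7])
q, k, v = (torch.randn(2, 7, 8) for _ in range(3))
print(segmented_attention(q, k, v, cu_seqlens).shape)  # torch.Size([7, 2, 8])
```

The Python loop over segments trades a few extra kernel launches for never allocating the full `seq_length x seq_length` mask, which grows quickly for high-resolution inputs with many vision tokens.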