dav1dliu committed
Commit 70f2069 · verified · 1 Parent(s): 15cdedf

Add kernel to accelerate

Files changed (1):
  1. modeling_sdar.py  +19 -55
modeling_sdar.py CHANGED
@@ -46,13 +46,7 @@ from transformers.processing_utils import Unpack
 from transformers.utils import LossKwargs, auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
 from .configuration_sdar import SDARConfig

-from fla.modules.activations import swiglu_linear
-from fla.modules import (
-    FusedCrossEntropyLoss, FusedLinearCrossEntropyLoss,
-    FusedLinearUnreducedCrossEntropyLoss,
-    FusedLinearDiffusionCrossEntropyLoss)
 from flash_attn.ops.triton.layer_norm import rms_norm_fn as flash_rms_norm
-from torch.distributed.tensor import DTensor

 import torch.nn.functional as F
 try:
@@ -61,12 +55,11 @@ try:
 except:
     pass

-
-def dtensor2local(dtensor):
-    if isinstance(dtensor, DTensor):
-        return dtensor.to_local()
-    else:
-        return dtensor
+try:
+    from liger_kernel.ops.swiglu import LigerSiLUMulFunction  # noqa: F401
+    liger_kernel_is_available = True
+except ImportError:
+    liger_kernel_is_available = False


 if is_torch_flex_attn_available():
@@ -77,10 +70,6 @@ if is_torch_flex_attn_available():
 logger = logging.get_logger(__name__)


-@torch.compile(fullgraph=True, mode="max-autotune-no-cudagraphs")
-def fused_flex_attention(query, key, value, attention_mask=None, **kwargs):
-    return flex_attention(query, key, value, block_mask=attention_mask, **kwargs)
-

 @use_kernel_forward_from_hub("RMSNorm")
 class SDARRMSNorm(nn.Module):
@@ -93,16 +82,16 @@ class SDARRMSNorm(nn.Module):
         self.variance_epsilon = eps

     def forward(self, hidden_states):
-        weight = dtensor2local(self.weight)
-        '''
-        return flash_rms_norm(hidden_states, weight=weight, bias=None, eps=self.variance_epsilon)
+        return flash_rms_norm(
+            hidden_states, weight=self.weight, bias=None, eps=self.variance_epsilon)
         '''
         input_dtype = hidden_states.dtype
         hidden_states = hidden_states.to(torch.float32)
         variance = hidden_states.pow(2).mean(-1, keepdim=True)
         hidden_states = hidden_states * \
             torch.rsqrt(variance + self.variance_epsilon)
-        return weight * hidden_states.to(input_dtype)
+        return self.weight * hidden_states.to(input_dtype)
+        '''


     def extra_repr(self):
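The hunk above replaces the eager RMSNorm (kept in the docstring as a reference) with the flash-attn Triton kernel. Below is a minimal sanity-check sketch, not part of the commit, comparing the two paths; it assumes flash-attn with its Triton layer-norm ops is installed and a CUDA device is available, and the shapes, dtype, and tolerances are illustrative.

# Sanity-check sketch: flash-attn Triton RMSNorm vs. the eager reference path.
import torch
from flash_attn.ops.triton.layer_norm import rms_norm_fn as flash_rms_norm


def eager_rms_norm(hidden_states, weight, eps):
    # Same math as the reference path kept in SDARRMSNorm.forward's docstring.
    input_dtype = hidden_states.dtype
    hidden_states = hidden_states.to(torch.float32)
    variance = hidden_states.pow(2).mean(-1, keepdim=True)
    hidden_states = hidden_states * torch.rsqrt(variance + eps)
    return weight * hidden_states.to(input_dtype)


if __name__ == "__main__":
    torch.manual_seed(0)
    x = torch.randn(2, 16, 1024, dtype=torch.bfloat16, device="cuda")
    w = torch.randn(1024, dtype=torch.bfloat16, device="cuda")
    eps = 1e-6

    ref = eager_rms_norm(x, w, eps)
    out = flash_rms_norm(x, weight=w, bias=None, eps=eps)

    # The kernel and the fp32 eager path round differently in bf16, so compare loosely.
    print(torch.allclose(out.float(), ref.float(), atol=1e-2, rtol=1e-2))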
@@ -124,12 +113,11 @@ class SDARMLP(nn.Module):
         self.act_fn = ACT2FN[config.hidden_act]

     def forward(self, x):
-        down_proj_weight = dtensor2local(self.down_proj.weight)
-        down_proj_bias = dtensor2local(self.down_proj.bias)
-        # down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
-        down_proj = swiglu_linear(self.gate_proj(x), self.up_proj(x),
-                                  down_proj_weight, down_proj_bias)
-        return down_proj
+        if liger_kernel_is_available:
+            return self.down_proj(LigerSiLUMulFunction.apply(self.gate_proj(x), self.up_proj(x)))
+        else:
+            down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+            return down_proj


 def rotate_half(x):
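The SDARMLP hunk above routes the SwiGLU through liger_kernel's fused SiLU-mul when it is available and otherwise falls back to the eager act_fn(gate) * up product. Below is a minimal sketch, not part of the commit, comparing the two paths; it assumes liger-kernel is installed, a CUDA device is available, and hidden_act is "silu", with illustrative shapes.

# Sanity-check sketch: Liger fused SiLU*mul vs. the eager SwiGLU fallback.
import torch
import torch.nn.functional as F
from liger_kernel.ops.swiglu import LigerSiLUMulFunction

if __name__ == "__main__":
    torch.manual_seed(0)
    gate = torch.randn(2, 16, 4096, dtype=torch.bfloat16, device="cuda")
    up = torch.randn(2, 16, 4096, dtype=torch.bfloat16, device="cuda")

    fused = LigerSiLUMulFunction.apply(gate, up)  # fast path used in SDARMLP.forward
    eager = F.silu(gate) * up                     # fallback path, assuming act_fn is SiLU

    print(torch.allclose(fused, eager, atol=1e-2, rtol=1e-2))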
@@ -856,35 +844,11 @@ class SDARForCausalLM(SDARPreTrainedModel, GenerationMixin):

         loss = None
         if labels is not None:
-            if self.config.fuse_cross_entropy:
-                if fuse_linear_and_cross_entropy:
-                    # Note: We use reduction='sum'
-                    # For 'mean' reduction, gradients are normalized by number of *non-ignored* elements
-                    # mean_loss = sum_loss / num_non_ignored_tokens, instead of all tokens (labels != -100)
-                    loss_fct = FusedLinearDiffusionCrossEntropyLoss(
-                        reduction='sum')
-                else:
-                    loss_fct = FusedCrossEntropyLoss(
-                        reduction='sum', inplace_backward=True)
-            else:
-                loss_fct = nn.CrossEntropyLoss()  # nn.CE
-
-            if fuse_linear_and_cross_entropy:
-                p_mask = kwargs.get('p_mask', None)
-                # loss: tuple of (sum_loss, unreduced_loss)
-                lm_head_weight = dtensor2local(self.lm_head.weight)
-                lm_head_bias = dtensor2local(self.lm_head.bias)
-                loss = loss_fct(
-                    x=hidden_states,  # `view(-1, V)` inside the kernel
-                    target=labels,
-                    weight=lm_head_weight,
-                    bias=lm_head_bias,
-                    p_mask=p_mask,
-                )
-            else:
-                raise RuntimeError("Do not support yet!")
-                loss = loss_fct(
-                    logits.view(-1, self.config.vocab_size), labels.view(-1))
+            # FusedLinearCrossEntropyLoss will be implemented by monkey patch when training
+            # We don't use it when inferencing
+            loss_fct = nn.CrossEntropyLoss()  # nn.CE
+            loss = loss_fct(
+                logits.view(-1, self.config.vocab_size), labels.view(-1))

         return CausalLMOutputWithPast(
             loss=loss,
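The loss hunk above drops the fused fla.modules losses and keeps a plain nn.CrossEntropyLoss for inference, deferring any fused linear cross-entropy to a training-time monkey patch. Below is a minimal sketch, not part of the commit, of the retained path on dummy tensors; shapes and values are illustrative, and it relies only on the default ignore_index of -100.

# Sketch of the inference-time loss path added above (illustrative shapes only).
import torch
import torch.nn as nn

if __name__ == "__main__":
    batch_size, seq_len, vocab_size = 2, 8, 32
    logits = torch.randn(batch_size, seq_len, vocab_size)
    labels = torch.randint(0, vocab_size, (batch_size, seq_len))
    labels[:, :3] = -100  # positions set to ignore_index are skipped by the loss

    loss_fct = nn.CrossEntropyLoss()  # defaults: ignore_index=-100, reduction="mean"
    loss = loss_fct(logits.view(-1, vocab_size), labels.view(-1))
    print(loss.item())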
 