ltg
/

norbert4-large

@@ -121,60 +121,6 @@ class GeGLU(nn.Module):
         return x * gelu_new(gate)
-class Encoder(nn.Module):
-    def __init__(self, config: GptBertConfig):
-        super().__init__()
-        self.layers = nn.ModuleList([Layer(config, i) for i in range(config.num_layers)])
-        self.short_long_ratio = config.short_long_ratio
-    def set_window_length(self, config: GptBertConfig):
-        for i, layer in enumerate(self.layers):
-            if (i + 1) % self.local_global_ratio == 0:
-                layer.set_window_length(config.global_window_length)
-            else:
-                layer.set_window_length(config.local_window_length)
-    def forward(self, hidden_layer: torch.Tensor, padding_info, output_hidden_states=False, checkpoint_activations=False):
-        hidden_layers = [hidden_layer] if output_hidden_states else None
-        v1 = None
-        embeddings = hidden_layer
-        for layer in self.layers:
-            if checkpoint_activations:
-                hidden_layer, v1 = torch.utils.checkpoint.checkpoint(layers, hidden_layer, embeddings, v1, padding_info, use_reentrant=True)
-            else:
-                hidden_layer, v1 = layer(hidden_layer, embeddings, v1, padding_info)
-            if output_hidden_states:
-                hidden_layers.append(hidden_layer)
-        return hidden_layer, hidden_layers
-class Layer(nn.Module):
-    def __init__(self, config: GptBertConfig, layer_idx: int):
-        super().__init__()
-        self.attention = SelfAttention(config, layer_idx)
-        self.mlp = FeedForward(config)
-        self.lambdas = nn.Parameter(torch.tensor([0., 0., 1., 0., 1., 0.]))
-    def set_window_length(self, window_length: int):
-        self.attention.set_window_length(window_length)
-    def forward(self, hidden_layer: torch.Tensor, embeddings: torch.Tensor, v1: torch.Tensor | None, padding_info):
-        attention_output = (1 - self.lambdas[0]) * hidden_layer + self.lambdas[0] * embeddings
-        qk_layer = (1 - self.lambdas[1]) * hidden_layer + self.lambdas[1] * embeddings
-        mlp_layer = F.softplus(self.lambdas[2]) * ((1 - self.lambdas[3]) * hidden_layer + self.lambdas[3] * embeddings)
-        attention_output, v1 = self.attention(attention_output, qk_layer, v1, padding_info)
-        mlp_layer = mlp_layer + attention_output
-        hidden_layer = F.softplus(self.lambdas[4]) * ((1 - self.lambdas[5]) * hidden_layer + self.lambdas[5] * embeddings)
-        output = hidden_layer + attention_output + self.mlp(mlp_layer)
-        return output, v1
 class Embedding(nn.Module):
     def __init__(self, config: GptBertConfig):
         super().__init__()
@@ -246,6 +192,110 @@ def flash_attention_forward(qkv: torch.Tensor, rotary_emb: UnpaddedRotaryEmbeddi
     return attn
 class SelfAttention(nn.Module):
     def __init__(self, config: GptBertConfig, layer_idx: int):
         super().__init__()
@@ -280,7 +330,7 @@ class SelfAttention(nn.Module):
         theta = 160_000 if (layer_idx + 1) % config.short_long_ratio == 0 else 10_000
         # Initialize rotary embeddings based on whether FlashAttention is available
-        if self.config._attn_implementation == "flash_attention_2":
             self.rope_embedding = UnpaddedRotaryEmbedding(dim=self.d_qk, base=theta, max_seqlen=config.max_sequence_length)
         else:
             self.rope_embedding = RotaryPositionalEmbeddings(config, theta)
@@ -331,7 +381,7 @@ class SelfAttention(nn.Module):
     def forward(self, hidden_layer: torch.Tensor, qk_layer: torch.Tensor, v1: torch.Tensor | None, padding_info):
         # Get original shape info
-        if self.config._attn_implementation == "flash_attention_2":
             # Unpadded case
             indices, cu_seqlens, max_seqlen = padding_info
             total_seqlen = hidden_layer.size(0)
@@ -346,7 +396,7 @@ class SelfAttention(nn.Module):
         query, key = self.qk_proj(qk_layer).tensor_split([self.q_out_dim], dim=-1)
         value = self.v_proj(hidden_layer)
-        if self.config._attn_implementation == "flash_attention_2":
             # Reshape for FlashAttention: (total_seqlen, num_heads, head_dim)
             query = query.view(total_seqlen, self.num_attention_heads, self.d_qk)
             key = key.view(total_seqlen, self.num_kv_heads, self.d_qk)
@@ -437,108 +487,58 @@ class FeedForward(nn.Module):
         return x
-# from https://github.com/huggingface/transformers/blob/main/src/transformers/models/modernbert/modeling_modernbert.py
-class ApplyRotaryEmbUnpad(torch.autograd.Function):
-    @staticmethod
-    def forward(ctx, qkv, cos, sin, cu_seqlens: Optional[torch.Tensor] = None, max_seqlen: Optional[int] = None):
-        # (total_nnz, 3, nheads, headdim)
-        qkv = qkv.contiguous()
-        total_nnz, _three, _nheads, headdim = qkv.shape
-        # We need qkv to be contiguous so that when we reshape to combine (3, nheads) dimensions,
-        # we get the same tensor
-        # qk = rearrange(qkv[:, :2], "b_s t h d -> b_s (t h) d")
-        qk = qkv[:, :2].view(total_nnz, -1, headdim)
-        apply_rotary(qk, cos, sin, seqlen_offsets=0, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen, interleaved=False, inplace=True)
-        ctx.save_for_backward(cos, sin, cu_seqlens)
-        ctx.max_seqlen = max_seqlen
-        return qkv
-    @staticmethod
-    def backward(ctx, do):
-        cos, sin, cu_seqlens = ctx.saved_tensors
-        do = do.contiguous()
-        total_nnz, _three, _nheads, headdim = do.shape
-        # We need dqkv to be contiguous so that when we reshape to combine (3, nheads) dimensions,
-        # we get the same tensor
-        dqk = do[:, :2].view(total_nnz, -1, headdim)
-        apply_rotary(
-            dqk,
-            cos,
-            sin,
-            seqlen_offsets=0,
-            cu_seqlens=cu_seqlens,
-            max_seqlen=ctx.max_seqlen,
-            interleaved=False,
-            inplace=True,
-            conjugate=True,
-        )
-        return do, None, None, None, None, None, None
-# from https://github.com/huggingface/transformers/blob/main/src/transformers/models/modernbert/modeling_modernbert.py
-def apply_rotary_unpadded(qkv, cos, sin, cu_seqlens: Optional[torch.Tensor] = None, max_seqlen: Optional[int] = None):
-    return ApplyRotaryEmbUnpad.apply(qkv, cos, sin, cu_seqlens, max_seqlen)
-# from https://github.com/huggingface/transformers/blob/main/src/transformers/models/modernbert/modeling_modernbert.py
-class UnpaddedRotaryEmbedding(RotaryEmbedding):
-    def __init__(self, dim: int, base: float = 10000.0, max_seqlen: Optional[int] = None):
-        super().__init__(dim=dim, base=base, pos_idx_in_fp32=True, device=None, interleaved=False)
-        self.max_seqlen = max_seqlen
-        if max_seqlen is not None and device is not None and dtype is not None:
-            self._update_cos_sin_cache(max_seqlen, device=device, dtype=None)
-    def forward(self, qkv: torch.Tensor, cu_seqlens: torch.Tensor, max_seqlen: Optional[int] = None) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
-        if max_seqlen is not None:
-            self._update_cos_sin_cache(max_seqlen, device=qkv.device, dtype=qkv.dtype)
-        qkv = apply_rotary_unpadded(
-            qkv,
-            self._cos_cached,
-            self._sin_cached,
-            cu_seqlens=cu_seqlens,
-            max_seqlen=max_seqlen,
-        )
-        return qkv
-class RotaryPositionalEmbeddings(nn.Module):
-    def __init__(self, config, theta: int):
         super().__init__()
-        head_size = config.query_key_head_size
-        assert head_size % 2 == 0
-        max_seq_len = config.max_sequence_length
-        inv_freq = 1.0 / (theta ** (torch.arange(0, head_size, 2, dtype=torch.float32) / head_size))
-        pos = torch.arange(max_seq_len, dtype=torch.float32)
-        embedding = torch.einsum('n, d -> nd', pos, inv_freq)
-        embedding = torch.cat([embedding, embedding], dim=-1).unsqueeze(0)
-        self.register_buffer("cos_matrix", embedding.cos(), persistent=False)
-        self.register_buffer("sin_matrix", embedding.sin(), persistent=False)
-    def forward(self, x: torch.Tensor):
-        hidden_layer = x.float()
-        seq_len = x.shape[2]
-        cos_matrix = self.cos_matrix[:, None, :seq_len, :]
-        sin_matrix = self.sin_matrix[:, None, :seq_len, :]
-        x_rotate_half = torch.cat(
-            [
-                -hidden_layer[:, :, :, x.size(-1) // 2:],
-                hidden_layer[:, :, :, :x.size(-1) // 2]
-            ],
-            dim=-1
-        )
-        out = hidden_layer * cos_matrix + x_rotate_half * sin_matrix
-        return out.type_as(x)
 #
@@ -565,33 +565,6 @@ class GptBertPreTrainedModel(PreTrainedModel):
             module.bias.data.zero_()
             module.weight.data.fill_(1.0)
-    @classmethod
-    def _autoset_attn_implementation(
-        cls,
-        config,
-        torch_dtype: Optional[torch.dtype] = None,
-        device_map: Optional[Union[str, Dict[str, int]]] = None,
-        check_device_map: bool = True,
-    ):
-        if config._attn_implementation_internal is None:
-            config._attn_implementation_internal = "flash_attention_2"
-            try:
-                return cls._check_and_enable_flash_attn_2(
-                    config,
-                    torch_dtype=torch.float16,
-                    device_map=device_map,
-                    hard_check_only=False,
-                    check_device_map=check_device_map,
-                )
-            except (ValueError, ImportError):
-                config._attn_implementation_internal = None
-        return super()._autoset_attn_implementation(
-            config,
-            torch_dtype=torch_dtype,
-            device_map=device_map,
-            check_device_map=check_device_map,
-        )
 class GptBertModel(GptBertPreTrainedModel):
     def __init__(self, config: GptBertConfig, add_mlm_layer=False, **kwargs):
@@ -634,7 +607,7 @@ class GptBertModel(GptBertPreTrainedModel):
         else:
             attention_mask = attention_mask.bool()
-        if self.config._attn_implementation == "flash_attention_2":
             if len(attention_mask.size()) != 2:
                 raise ValueError("Bare `attention_mask` med to dimensjoner støttes nå for FlashAttention.")
             with torch.no_grad():
@@ -665,7 +638,7 @@ class GptBertModel(GptBertPreTrainedModel):
             contextualized_embeddings = [layer.to(original_dtype) for layer in contextualized_embeddings]
         # Pad output if using FlashAttention
-        if self.config._attn_implementation == "flash_attention_2":
             last_layer = _pad_output(last_layer, indices, batch_size, seq_length)
             if output_hidden_states:
                 contextualized_embeddings = [_pad_output(layer, indices, batch_size, seq_length) for layer in contextualized_embeddings]

         return x * gelu_new(gate)
 class Embedding(nn.Module):
     def __init__(self, config: GptBertConfig):
         super().__init__()
     return attn
+# from https://github.com/huggingface/transformers/blob/main/src/transformers/models/modernbert/modeling_modernbert.py
+class ApplyRotaryEmbUnpad(torch.autograd.Function):
+    """Autograd function that rotates the query/key slices of a packed `qkv` tensor in place.
+
+    `apply_rotary` is defined outside this view (presumably the flash-attn rotary kernel; TODO confirm).
+    """
+    @staticmethod
+    def forward(ctx, qkv, cos, sin, cu_seqlens: Optional[torch.Tensor] = None, max_seqlen: Optional[int] = None):
+        """Apply the rotation to `qkv[:, :2]` in place and return the contiguous `qkv`.
+
+        Args:
+            qkv: 4-D tensor unpacked as (total_nnz, 3, nheads, headdim).
+            cos, sin: tables handed unchanged to `apply_rotary`.
+            cu_seqlens, max_seqlen: forwarded to `apply_rotary`; presumably describe the packed
+                sequence boundaries (TODO confirm against `apply_rotary`).
+        """
+        # (total_nnz, 3, nheads, headdim)
+        qkv = qkv.contiguous()
+        total_nnz, _three, _nheads, headdim = qkv.shape
+        # We need qkv to be contiguous so that when we reshape to combine (3, nheads) dimensions,
+        # we get the same tensor
+        # qk = rearrange(qkv[:, :2], "b_s t h d -> b_s (t h) d")
+        # Only the first two entries on dim 1 are viewed; the third entry is never passed to apply_rotary.
+        qk = qkv[:, :2].view(total_nnz, -1, headdim)
+        # inplace=True: the result is written through the `qk` view, which is why `qkv` itself is returned.
+        apply_rotary(qk, cos, sin, seqlen_offsets=0, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen, interleaved=False, inplace=True)
+        # Keep everything backward needs to repeat the call with conjugate=True.
+        ctx.save_for_backward(cos, sin, cu_seqlens)
+        ctx.max_seqlen = max_seqlen
+        return qkv
+    @staticmethod
+    def backward(ctx, do):
+        """Re-run `apply_rotary` with `conjugate=True` on the q/k slices of the incoming gradient `do`.
+
+        Returns `do` as the gradient for `qkv`, followed by `None` entries for the other inputs.
+        """
+        cos, sin, cu_seqlens = ctx.saved_tensors
+        do = do.contiguous()
+        total_nnz, _three, _nheads, headdim = do.shape
+        # We need dqkv to be contiguous so that when we reshape to combine (3, nheads) dimensions,
+        # we get the same tensor
+        dqk = do[:, :2].view(total_nnz, -1, headdim)
+        apply_rotary(
+            dqk,
+            cos,
+            sin,
+            seqlen_offsets=0,
+            cu_seqlens=cu_seqlens,
+            max_seqlen=ctx.max_seqlen,
+            interleaved=False,
+            inplace=True,
+            conjugate=True,
+        )
+        # NOTE(review): seven values are returned although forward takes five inputs after `ctx`;
+        # confirm that autograd tolerates the two surplus trailing None entries.
+        return do, None, None, None, None, None, None
+# from https://github.com/huggingface/transformers/blob/main/src/transformers/models/modernbert/modeling_modernbert.py
+def apply_rotary_unpadded(qkv, cos, sin, cu_seqlens: Optional[torch.Tensor] = None, max_seqlen: Optional[int] = None):
+    """Functional entry point: run `ApplyRotaryEmbUnpad` on a packed `qkv` tensor and return its result.
+
+    All five arguments are passed positionally and unchanged to `ApplyRotaryEmbUnpad.apply`.
+    """
+    rotary_args = (qkv, cos, sin, cu_seqlens, max_seqlen)
+    return ApplyRotaryEmbUnpad.apply(*rotary_args)
+# from https://github.com/huggingface/transformers/blob/main/src/transformers/models/modernbert/modeling_modernbert.py
+class UnpaddedRotaryEmbedding(RotaryEmbedding):
+    """Rotary embedding for packed (unpadded) `qkv` tensors, built on the `RotaryEmbedding` base class.
+
+    The base class is defined outside this view; it is expected to provide `_update_cos_sin_cache`,
+    `_cos_cached` and `_sin_cached` (all three are used below).
+    """
+    def __init__(self, dim: int, base: float = 10000.0, max_seqlen: Optional[int] = None, device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None):
+        """Configure the embedding and optionally pre-compute the cos/sin cache.
+
+        Args:
+            dim: passed to the base class as `dim`.
+            base: passed to the base class as `base`.
+            max_seqlen: stored on the instance; also the length used for the optional pre-computation.
+            device: passed to the base class and used for the optional pre-computation.
+            dtype: dtype used for the optional pre-computation.
+
+        The cache is only pre-computed when `max_seqlen`, `device` and `dtype` are all given.
+        `device` and `dtype` were previously read without being defined in this scope; they are
+        now keyword parameters defaulting to `None`, so existing three-argument callers keep working.
+        """
+        super().__init__(dim=dim, base=base, pos_idx_in_fp32=True, device=device, interleaved=False)
+        self.max_seqlen = max_seqlen
+        if max_seqlen is not None and device is not None and dtype is not None:
+            self._update_cos_sin_cache(max_seqlen, device=device, dtype=dtype)
+    def forward(self, qkv: torch.Tensor, cu_seqlens: torch.Tensor, max_seqlen: Optional[int] = None) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        """Apply rotary embeddings to `qkv` via `apply_rotary_unpadded` and return the result.
+
+        When `max_seqlen` is given, the cos/sin cache is first updated for that length on the
+        device and dtype of `qkv`.
+        """
+        if max_seqlen is not None:
+            self._update_cos_sin_cache(max_seqlen, device=qkv.device, dtype=qkv.dtype)
+        qkv = apply_rotary_unpadded(
+            qkv,
+            self._cos_cached,
+            self._sin_cached,
+            cu_seqlens=cu_seqlens,
+            max_seqlen=max_seqlen,
+        )
+        return qkv
+class RotaryPositionalEmbeddings(nn.Module):
+    """Rotary position embedding with cos/sin tables pre-computed for `config.max_sequence_length` positions."""
+    def __init__(self, config, theta: int):
+        """Build the tables from `config.query_key_head_size` (must be even) and the frequency base `theta`."""
+        super().__init__()
+        dim = config.query_key_head_size
+        assert dim % 2 == 0
+        num_positions = config.max_sequence_length
+        exponents = torch.arange(0, dim, 2, dtype=torch.float32) / dim
+        inv_freq = 1.0 / (theta ** exponents)
+        positions = torch.arange(num_positions, dtype=torch.float32)
+        # (positions, dim / 2) angle table, duplicated along the last axis and given a leading axis of size 1.
+        angles = torch.einsum('n, d -> nd', positions, inv_freq)
+        angles = torch.cat([angles, angles], dim=-1).unsqueeze(0)
+        # Non-persistent buffers: they follow the module across devices but are not written to the state dict.
+        for buffer_name, table in (("cos_matrix", angles.cos()), ("sin_matrix", angles.sin())):
+            self.register_buffer(buffer_name, table, persistent=False)
+    def forward(self, x: torch.Tensor):
+        """Rotate a 4-D tensor whose axis 2 indexes positions; computed in float32 and cast back to `x`'s type."""
+        x_fp32 = x.float()
+        length = x.shape[2]
+        half = x.size(-1) // 2
+        cos = self.cos_matrix[:, None, :length, :]
+        sin = self.sin_matrix[:, None, :length, :]
+        front = x_fp32[:, :, :, :half]
+        back = x_fp32[:, :, :, half:]
+        # Swap the two halves of the last axis, negating the one that moves to the front.
+        swapped = torch.cat([-back, front], dim=-1)
+        rotated = x_fp32 * cos + swapped * sin
+        return rotated.type_as(x)
 class SelfAttention(nn.Module):
     def __init__(self, config: GptBertConfig, layer_idx: int):
         super().__init__()
         theta = 160_000 if (layer_idx + 1) % config.short_long_ratio == 0 else 10_000
         # Initialize rotary embeddings based on whether FlashAttention is available
+        if is_flash_attn_2_available():
             self.rope_embedding = UnpaddedRotaryEmbedding(dim=self.d_qk, base=theta, max_seqlen=config.max_sequence_length)
         else:
             self.rope_embedding = RotaryPositionalEmbeddings(config, theta)
     def forward(self, hidden_layer: torch.Tensor, qk_layer: torch.Tensor, v1: torch.Tensor | None, padding_info):
         # Get original shape info
+        if is_flash_attn_2_available():
             # Unpadded case
             indices, cu_seqlens, max_seqlen = padding_info
             total_seqlen = hidden_layer.size(0)
         query, key = self.qk_proj(qk_layer).tensor_split([self.q_out_dim], dim=-1)
         value = self.v_proj(hidden_layer)
+        if is_flash_attn_2_available():
             # Reshape for FlashAttention: (total_seqlen, num_heads, head_dim)
             query = query.view(total_seqlen, self.num_attention_heads, self.d_qk)
             key = key.view(total_seqlen, self.num_kv_heads, self.d_qk)
         return x
+class Layer(nn.Module):
+    """One encoder block: self-attention and a feed-forward module, with six learned scalars
+    (`lambdas`) that blend the running hidden state with the initial embeddings."""
+    def __init__(self, config: GptBertConfig, layer_idx: int):
+        """Create the attention and feed-forward sub-modules and the six mixing scalars."""
+        super().__init__()
+        self.attention = SelfAttention(config, layer_idx)
+        self.mlp = FeedForward(config)
+        self.lambdas = nn.Parameter(torch.tensor([0.0, 0.0, 1.0, 0.0, 1.0, 0.0]))
+    def set_window_length(self, window_length: int):
+        """Forward `window_length` to the attention sub-module."""
+        self.attention.set_window_length(window_length)
+    def forward(self, hidden_layer: torch.Tensor, embeddings: torch.Tensor, v1: torch.Tensor | None, padding_info):
+        """Run attention and the feed-forward module; return `(output, v1)` with `v1` as returned by attention."""
+        gates = self.lambdas
+        def blend(idx):
+            # Interpolate between the hidden state and the embeddings using scalar `idx`.
+            return (1 - gates[idx]) * hidden_layer + gates[idx] * embeddings
+        value_input = blend(0)
+        qk_input = blend(1)
+        mlp_input = F.softplus(gates[2]) * blend(3)
+        attended, v1 = self.attention(value_input, qk_input, v1, padding_info)
+        mlp_input = mlp_input + attended
+        residual = F.softplus(gates[4]) * blend(5)
+        return residual + attended + self.mlp(mlp_input), v1
+class Encoder(nn.Module):
+    """Stack of `config.num_layers` `Layer` modules applied one after another."""
+    def __init__(self, config: GptBertConfig):
+        # Builds the layer stack and remembers `config.short_long_ratio` for `set_window_length`.
         super().__init__()
+        self.layers = nn.ModuleList([Layer(config, i) for i in range(config.num_layers)])
+        self.short_long_ratio = config.short_long_ratio
+    def set_window_length(self, config: GptBertConfig):
+        """Assign a window length to every layer.
+
+        Each layer whose 1-based index is a multiple of `short_long_ratio` gets
+        `config.global_window_length`; all others get `config.local_window_length`.
+        """
+        for i, layer in enumerate(self.layers):
+            # Uses the ratio stored in __init__; `local_global_ratio` was never set on this class.
+            if (i + 1) % self.short_long_ratio == 0:
+                layer.set_window_length(config.global_window_length)
+            else:
+                layer.set_window_length(config.local_window_length)
+    def forward(self, hidden_layer: torch.Tensor, padding_info, output_hidden_states=False, checkpoint_activations=False):
+        """Run all layers in order.
+
+        Args:
+            hidden_layer: input to the first layer; also passed to every layer as `embeddings`.
+            padding_info: passed unchanged to every layer.
+            output_hidden_states: when true, collect the input and every layer output in a list.
+            checkpoint_activations: when true, call each layer through `torch.utils.checkpoint.checkpoint`.
+
+        Returns:
+            `(last_hidden_layer, hidden_layers)`; `hidden_layers` is `None` unless
+            `output_hidden_states` is true.
+        """
+        hidden_layers = [hidden_layer] if output_hidden_states else None
+        v1 = None
+        embeddings = hidden_layer
+        for layer in self.layers:
+            if checkpoint_activations:
+                # Checkpoint the current layer; the undefined name `layers` raised NameError here.
+                hidden_layer, v1 = torch.utils.checkpoint.checkpoint(layer, hidden_layer, embeddings, v1, padding_info, use_reentrant=True)
+            else:
+                hidden_layer, v1 = layer(hidden_layer, embeddings, v1, padding_info)
+            if output_hidden_states:
+                hidden_layers.append(hidden_layer)
+        return hidden_layer, hidden_layers
 #
             module.bias.data.zero_()
             module.weight.data.fill_(1.0)
 class GptBertModel(GptBertPreTrainedModel):
     def __init__(self, config: GptBertConfig, add_mlm_layer=False, **kwargs):
         else:
             attention_mask = attention_mask.bool()
+        if is_flash_attn_2_available():
             if len(attention_mask.size()) != 2:
                 raise ValueError("Bare `attention_mask` med to dimensjoner støttes nå for FlashAttention.")
             with torch.no_grad():
             contextualized_embeddings = [layer.to(original_dtype) for layer in contextualized_embeddings]
         # Pad output if using FlashAttention
+        if is_flash_attn_2_available():
             last_layer = _pad_output(last_layer, indices, batch_size, seq_length)
             if output_hidden_states:
                 contextualized_embeddings = [_pad_output(layer, indices, batch_size, seq_length) for layer in contextualized_embeddings]