davda54 committed · Commit 7ee002c · verified · 1 Parent(s): 4d4d26d

Files changed (1):
  1. modeling_gptbert.py (+2 -5)
modeling_gptbert.py CHANGED

@@ -10,7 +10,7 @@ from functools import partial, lru_cache
 from .configuration_gptbert import GptBertConfig
 from transformers.modeling_utils import PreTrainedModel
 from transformers.activations import gelu_new
-from transformers.utils import is_flash_attn_2_available, is_flax_available
+from transformers.utils import is_flash_attn_2_available
 from transformers.modeling_outputs import (
     MaskedLMOutput,
     MultipleChoiceModelOutput,
@@ -352,8 +352,6 @@ class SelfAttention(nn.Module):
         self.rope_embedding = RotaryPositionalEmbeddings(config, theta)
 
         self.scale = 1.0 / math.sqrt(self.d_qk)
-        self.dropout = nn.Dropout(config.attention_dropout if hasattr(config, "attention_dropout") else 0.0)
-
         self.lambdas = nn.Parameter(torch.tensor([0.5]))
 
         self.sequence_length = config.max_sequence_length
@@ -399,7 +397,7 @@ class SelfAttention(nn.Module):
 
         return value, attention_probabilities.detach()
 
-    def forward(self, hidden_layer: torch.Tensor, qk_layer: torch.Tensor, v1: torch.Tensor | None, padding_info)]:
+    def forward(self, hidden_layer: torch.Tensor, qk_layer: torch.Tensor, v1: torch.Tensor | None, padding_info):
         # Get original shape info
         if is_flash_attn_2_available() and isinstance(padding_info, tuple):
             # Unpadded case
@@ -412,7 +410,6 @@ class SelfAttention(nn.Module):
         hidden_layer = hidden_layer.transpose(0, 1) # [seq_len, batch_size, hidden_size]
         qk_layer = qk_layer.transpose(0, 1)
 
-
         hidden_layer = self.pre_v_norm(hidden_layer)
         qk_layer = self.pre_qk_norm(qk_layer)
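A minimal smoke test one might run after this fix, assuming the checkpoint that ships this modeling_gptbert.py is exposed through the AutoModelForMaskedLM mapping; the repo id below is a placeholder, not taken from this commit.

# Sketch: load the remote modeling code and run one forward pass,
# which exercises SelfAttention.forward with the corrected signature.
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

repo_id = "user/gpt-bert-checkpoint"  # hypothetical repo id, substitute the real one

tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(repo_id, trust_remote_code=True)

inputs = tokenizer(f"Paris is the {tokenizer.mask_token} of France.", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

print(outputs.logits.shape)  # [batch_size, seq_len, vocab_size]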