Fix error: add the missing colon to the SelfAttention.forward signature and remove the self.dropout attribute
modeling_gptbert.py (+2, -5)
@@ -10,7 +10,7 @@ from functools import partial, lru_cache
 from .configuration_gptbert import GptBertConfig
 from transformers.modeling_utils import PreTrainedModel
 from transformers.activations import gelu_new
-from transformers.utils import is_flash_attn_2_available
+from transformers.utils import is_flash_attn_2_available
 from transformers.modeling_outputs import (
     MaskedLMOutput,
     MultipleChoiceModelOutput,
@@ -352,8 +352,6 @@ class SelfAttention(nn.Module):
         self.rope_embedding = RotaryPositionalEmbeddings(config, theta)
 
         self.scale = 1.0 / math.sqrt(self.d_qk)
-        self.dropout = nn.Dropout(config.attention_dropout if hasattr(config, "attention_dropout") else 0.0)
-
         self.lambdas = nn.Parameter(torch.tensor([0.5]))
 
         self.sequence_length = config.max_sequence_length
@@ -399,7 +397,7 @@ class SelfAttention(nn.Module):
 
         return value, attention_probabilities.detach()
 
-    def forward(self, hidden_layer: torch.Tensor, qk_layer: torch.Tensor, v1: torch.Tensor | None, padding_info)
+    def forward(self, hidden_layer: torch.Tensor, qk_layer: torch.Tensor, v1: torch.Tensor | None, padding_info):
         # Get original shape info
         if is_flash_attn_2_available() and isinstance(padding_info, tuple):
             # Unpadded case
@@ -412,7 +410,6 @@ class SelfAttention(nn.Module):
         hidden_layer = hidden_layer.transpose(0, 1)  # [seq_len, batch_size, hidden_size]
         qk_layer = qk_layer.transpose(0, 1)
 
-
         hidden_layer = self.pre_v_norm(hidden_layer)
         qk_layer = self.pre_qk_norm(qk_layer)
 
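For context, below is a minimal sketch of the padding dispatch that the corrected forward signature feeds into, based only on the hunks above. It is not the repository's implementation: the function name prepare_attention_inputs and the generic nn.Module parameters standing in for pre_v_norm / pre_qk_norm are illustrative assumptions; only is_flash_attn_2_available, the tuple check on padding_info, and the transpose plus pre-norm steps come from the diff.

import torch
import torch.nn as nn
from transformers.utils import is_flash_attn_2_available


def prepare_attention_inputs(
    hidden_layer: torch.Tensor,
    qk_layer: torch.Tensor,
    padding_info,
    pre_v_norm: nn.Module,
    pre_qk_norm: nn.Module,
):
    # Unpadded case: when FlashAttention-2 is installed, padding_info arrives as a
    # tuple of variable-length metadata and the tensors are passed through unchanged
    # for the flash path (this mirrors the isinstance check in the diff).
    if is_flash_attn_2_available() and isinstance(padding_info, tuple):
        return hidden_layer, qk_layer, True

    # Padded case: move to [seq_len, batch_size, hidden_size] and apply the
    # pre-V / pre-QK norms, mirroring the hunk around new line 410.
    hidden_layer = pre_v_norm(hidden_layer.transpose(0, 1))
    qk_layer = pre_qk_norm(qk_layer.transpose(0, 1))
    return hidden_layer, qk_layer, False

A toy call with batch-first tensors and two LayerNorm modules exercises the padded branch; the unpadded branch only triggers when flash-attn is installed and the caller passes the tuple form of padding_info, which is why the isinstance check guards it in the fixed forward.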