Update modelling_hat.py
modelling_hat.py  CHANGED  (+113 −110)
@@ -319,116 +319,119 @@ class SentenceClassifierOutput(ModelOutput):
     sentence_attentions: Optional[Tuple[torch.FloatTensor]] = None


-class HATConfig(PretrainedConfig):
-    [old lines 323-431: the inline HATConfig docstring, __init__, and attribute assignments,
-     identical to the commented-out block added below]
+# class HATConfig(PretrainedConfig):
+#     r"""
+#     This is the configuration class to store the configuration of a :class:`~transformers.HAT`.
+#     It is used to instantiate a HAT model according to the specified arguments,
+#     defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration
+#     to that of the HAT `kiddothe2b/hat-base-4096 <https://huggingface.co/kiddothe2b/hat-base-4096>`__ architecture.
+
+#     Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
+#     outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+
+
+#     Args:
+#         vocab_size (:obj:`int`, `optional`, defaults to 30522):
+#             Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the
+#             :obj:`inputs_ids` passed when calling :class:`~transformers.BertModel` or
+#             :class:`~transformers.TFBertModel`.
+#         max_sentences (:obj:`int`, `optional`, defaults to 64):
+#             The maximum number of sentences that this model might ever be used with.
+#         max_sentence_size (:obj:`int`, `optional`, defaults to 128):
+#             The maximum sentence length that this model might ever be used with.
+#         model_max_length (:obj:`int`, `optional`, defaults to 8192):
+#             The maximum sequence length (max_sentences * max_sentence_size) that this model might ever be used with
+#         encoder_layout (:obj:`Dict`):
+#             The sentence/document encoder layout.
+#         hidden_size (:obj:`int`, `optional`, defaults to 768):
+#             Dimensionality of the encoder layers and the pooler layer.
+#         num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+#             Number of hidden layers in the Transformer encoder.
+#         num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+#             Number of attention heads for each attention layer in the Transformer encoder.
+#         intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+#             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+#         hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
+#             The non-linear activation function (function or string) in the encoder and pooler. If string,
+#             :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
+#         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+#             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+#         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+#             The dropout ratio for the attention probabilities.
+#         max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+#             The maximum sequence length that this model might ever be used with. Typically set this to something large
+#             just in case (e.g., 512 or 1024 or 2048).
+#         type_vocab_size (:obj:`int`, `optional`, defaults to 2):
+#             The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.BertModel` or
+#             :class:`~transformers.TFBertModel`.
+#         initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+#             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+#         layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+#             The epsilon used by the layer normalization layers.
+#         position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`):
+#             Type of position embedding. Choose one of :obj:`"absolute"`, :obj:`"relative_key"`,
+#             :obj:`"relative_key_query"`. For positional embeddings use :obj:`"absolute"`. For more information on
+#             :obj:`"relative_key"`, please refer to `Self-Attention with Relative Position Representations (Shaw et al.)
+#             <https://arxiv.org/abs/1803.02155>`__. For more information on :obj:`"relative_key_query"`, please refer to
+#             `Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)
+#             <https://arxiv.org/abs/2009.13658>`__.
+#         use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+#             Whether or not the model should return the last key/values attentions (not used by all models). Only
+#             relevant if ``config.is_decoder=True``.
+#         classifier_dropout (:obj:`float`, `optional`):
+#             The dropout ratio for the classification head.
+#     """
+#     model_type = "hierarchical-transformer"
+
+#     def __init__(
+#         self,
+#         vocab_size=30522,
+#         hidden_size=768,
+#         max_sentences=64,
+#         max_sentence_size=128,
+#         model_max_length=8192,
+#         num_hidden_layers=12,
+#         num_attention_heads=12,
+#         intermediate_size=3072,
+#         hidden_act="gelu",
+#         hidden_dropout_prob=0.1,
+#         attention_probs_dropout_prob=0.1,
+#         max_position_embeddings=512,
+#         type_vocab_size=2,
+#         initializer_range=0.02,
+#         layer_norm_eps=1e-12,
+#         pad_token_id=0,
+#         position_embedding_type="absolute",
+#         encoder_layout=None,
+#         use_cache=True,
+#         classifier_dropout=None,
+#         **kwargs
+#     ):
+#         super().__init__(pad_token_id=pad_token_id, **kwargs)
+
+#         self.vocab_size = vocab_size
+#         self.hidden_size = hidden_size
+#         self.max_sentences = max_sentences
+#         self.max_sentence_size = max_sentence_size
+#         self.model_max_length = model_max_length
+#         self.encoder_layout = encoder_layout
+#         self.num_hidden_layers = num_hidden_layers
+#         self.num_attention_heads = num_attention_heads
+#         self.hidden_act = hidden_act
+#         self.intermediate_size = intermediate_size
+#         self.hidden_dropout_prob = hidden_dropout_prob
+#         self.attention_probs_dropout_prob = attention_probs_dropout_prob
+#         self.max_position_embeddings = max_position_embeddings
+#         self.type_vocab_size = type_vocab_size
+#         self.initializer_range = initializer_range
+#         self.layer_norm_eps = layer_norm_eps
+#         self.position_embedding_type = position_embedding_type
+#         self.use_cache = use_cache
+#         self.classifier_dropout = classifier_dropout
+
+
+
+from configuration_hat import HATConfig

 class HATEmbeddings(nn.Module):
     """
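The commit swaps the inline HATConfig class for an import from configuration_hat.py. As a rough illustration of how the configuration described in the commented-out docstring fits together, here is a minimal sketch of constructing a HATConfig, assuming configuration_hat.py exposes the same __init__ signature as the commented-out class above. The encoder_layout dict below is a hypothetical placeholder for illustration only; the real schema comes from configuration_hat.py and the released checkpoints.

from configuration_hat import HATConfig

# Hypothetical layer layout: which of the 12 layers run the sentence-level
# encoder and which also run the document-level encoder. The exact schema is
# defined by configuration_hat.py / the released checkpoints, not here.
encoder_layout = {
    str(i): {"sentence_encoder": True, "document_encoder": i % 2 == 1}
    for i in range(12)
}

config = HATConfig(
    vocab_size=30522,
    hidden_size=768,
    max_sentences=64,            # documents of up to 64 sentences
    max_sentence_size=128,       # up to 128 tokens per sentence
    model_max_length=64 * 128,   # 8192 tokens = max_sentences * max_sentence_size
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    encoder_layout=encoder_layout,
)

# The docstring defines model_max_length as max_sentences * max_sentence_size.
assert config.model_max_length == config.max_sentences * config.max_sentence_size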
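Since the docstring points at the kiddothe2b/hat-base-4096 checkpoint, a hedged usage sketch for loading the configuration and weights from the Hub follows. It assumes the Hub repo registers configuration_hat.py and modelling_hat.py as custom (remote) code, which is why trust_remote_code=True is passed; the exact requirement may differ for your copy of the repo.

from transformers import AutoConfig, AutoModel

# Assumption: the Hub repo ships these modules as custom code, so loading it
# requires opting in with trust_remote_code=True.
config = AutoConfig.from_pretrained("kiddothe2b/hat-base-4096", trust_remote_code=True)
model = AutoModel.from_pretrained("kiddothe2b/hat-base-4096", trust_remote_code=True)

# Inspect the hierarchical limits exposed by the loaded configuration.
print(config.max_sentences, config.max_sentence_size, config.model_max_length)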