yihongLiu committed
Commit b265df7 · 1 parent: 9606e2e

Upload modeling_xlmr_extra.py

Files changed (1)
  1. modeling_xlmr_extra.py +951 -0
modeling_xlmr_extra.py ADDED
# coding=utf-8
# Copyright 2019 Facebook AI Research and the HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch XLM-RoBERTa assembled model."""

import math
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.activations import ACT2FN, gelu
from transformers.models.roberta.modeling_roberta import (
    BaseModelOutputWithPoolingAndCrossAttentions,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)

# Import everything that does not need to change from the stock XLM-RoBERTa implementation.
from transformers.models.xlm_roberta.modeling_xlm_roberta import (
    XLMRobertaEncoder,
    XLMRobertaPooler,
    XLMRobertaPreTrainedModel,
    XLMRobertaClassificationHead,
)


from transformers import PreTrainedModel
from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from transformers.utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from transformers.models.xlm_roberta.configuration_xlm_roberta import XLMRobertaConfig
from torch.nn import functional as F


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "xlm-roberta-base"
_CONFIG_FOR_DOC = "XLMRobertaConfig"

XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "xlm-roberta-base",
    "xlm-roberta-large",
    "xlm-roberta-large-finetuned-conll02-dutch",
    "xlm-roberta-large-finetuned-conll02-spanish",
    "xlm-roberta-large-finetuned-conll03-english",
    "xlm-roberta-large-finetuned-conll03-german",
    # See all XLM-RoBERTa models at https://huggingface.co/models?filter=xlm-roberta
]


class LinearTranspose(nn.Module):
    """A linear map whose single weight matrix can be applied as-is (in_features -> out_features) or
    transposed (out_features -> in_features), so the up- and down-projections share parameters."""

    def __init__(self, in_features, out_features):
        super().__init__()
        self.weight = nn.Parameter(torch.empty((out_features, in_features)))

    def forward(self, x, transpose=False):
        if transpose:
            return F.linear(x, self.weight.t())
        else:
            return F.linear(x, self.weight)


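# A minimal shape sketch of how LinearTranspose reuses one weight for both directions. The sizes below
# (in_features=4, out_features=8) and the helper name are made up purely for illustration.
def _linear_transpose_shape_sketch():
    proj = LinearTranspose(in_features=4, out_features=8)
    nn.init.normal_(proj.weight, std=0.02)   # the module itself leaves initialization to the checkpoint
    coords = torch.randn(2, 5, 4)            # (batch, seq, in_features)
    hidden = proj(coords)                    # up-projection with W    -> (2, 5, 8)
    back = proj(hidden, transpose=True)      # down-projection with W^T -> (2, 5, 4)
    return hidden.shape, back.shape

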
# adapted from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->XLMRoberta
class PrimitiveEmbeddings(nn.Module):
    def __init__(self, config):
        super().__init__()

        self.primitive_embeddings = LinearTranspose(in_features=config.num_primitive, out_features=config.hidden_size)
        self.target_coordinates = nn.Embedding(num_embeddings=config.vocab_size,
                                               embedding_dim=config.num_primitive,
                                               padding_idx=config.pad_token_id)

        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

        # End copy
        self.padding_idx = config.pad_token_id
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
        )

    def forward(self, input_ids=None, token_type_ids=None,
                position_ids=None, inputs_embeds=None, past_key_values_length=0):

        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        # if inputs_embeds is given, it should match the original model dimension
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            # use primitive_embeddings and coordinates
            inputs_embeds = self.target_coordinates(input_ids)
            inputs_embeds = self.primitive_embeddings.forward(inputs_embeds)
            # inputs_embeds will be mapped to the same dimension as the hidden state

        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape)


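# A minimal sketch of the factorized embedding path above: token ids are looked up in `target_coordinates`
# (vocab_size x num_primitive) and then mapped into the hidden space by the shared `primitive_embeddings`
# basis. The tiny config values and the helper name are hypothetical and only meant to show the shapes.
def _primitive_embeddings_shape_sketch():
    config = XLMRobertaConfig(
        vocab_size=32,
        hidden_size=16,
        max_position_embeddings=20,
        type_vocab_size=1,
        pad_token_id=1,
    )
    config.num_primitive = 4                                        # extra attribute this module expects
    emb = PrimitiveEmbeddings(config)
    nn.init.normal_(emb.primitive_embeddings.weight, std=0.02)      # LinearTranspose is not self-initializing
    input_ids = torch.tensor([[0, 5, 6, 2, 1]])                     # (batch=1, seq=5); 1 is the padding id
    out = emb(input_ids=input_ids)
    return out.shape                                                # torch.Size([1, 5, 16])

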
XLM_ROBERTA_START_DOCSTRING = r"""

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
    and behavior.

    Parameters:
        config ([`XLMRobertaConfig`]): Model configuration class with all the parameters of the
            model. Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

XLM_ROBERTA_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare XLM-RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
    XLM_ROBERTA_START_DOCSTRING,
)
# Copied from transformers.models.roberta.modeling_roberta.RobertaModel with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA
class XLMRobertaAssembledModel(XLMRobertaPreTrainedModel):
    """

    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both the `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762

    """

    # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->XLMRoberta
    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.config = config

        self.embeddings = PrimitiveEmbeddings(config)
        self.encoder = XLMRobertaEncoder(config)

        self.pooler = XLMRobertaPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        # this returns both primitive_embeddings and target_coordinates
        return self.embeddings.primitive_embeddings.weight, self.embeddings.target_coordinates

    def set_input_embeddings(self, value1, value2):
        self.embeddings.primitive_embeddings.weight = value1
        self.embeddings.target_coordinates = value2

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPoolingAndCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    # Copied from transformers.models.bert.modeling_bert.BertModel.forward
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
        r"""
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if self.config.is_decoder:
            use_cache = use_cache if use_cache is not None else self.config.use_cache
        else:
            use_cache = False

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            # self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        # past_key_values_length
        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device)

        if token_type_ids is None:
            if hasattr(self.embeddings, "token_type_ids"):
                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)

        # If a 2D or 3D attention mask is provided for the cross-attention
        # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.config.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed
        # 1.0 in head_mask indicates we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
        )
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )


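# A minimal end-to-end usage sketch for the assembled backbone. The tiny config values and the helper name
# are hypothetical; a real run would load trained weights (e.g. via `from_pretrained`) rather than rely on
# random initialization.
def _assembled_model_usage_sketch():
    config = XLMRobertaConfig(
        vocab_size=32,
        hidden_size=16,
        num_hidden_layers=2,
        num_attention_heads=2,
        intermediate_size=32,
        max_position_embeddings=20,
        type_vocab_size=1,
        pad_token_id=1,
    )
    config.num_primitive = 4
    model = XLMRobertaAssembledModel(config, add_pooling_layer=True)
    # LinearTranspose is not covered by _init_weights, so give it values for the sketch
    nn.init.normal_(model.embeddings.primitive_embeddings.weight, std=config.initializer_range)
    input_ids = torch.tensor([[0, 5, 6, 2, 1]])
    attention_mask = (input_ids != config.pad_token_id).long()
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    return outputs.last_hidden_state.shape, outputs.pooler_output.shape   # (1, 5, 16), (1, 16)

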
@add_start_docstrings(
    """XLM-RoBERTa Model with a `language modeling` head on top.""",
    XLM_ROBERTA_START_DOCSTRING,
)
# Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA
class XLMRobertaAssembledForMaskedLM(XLMRobertaPreTrainedModel):
    # _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

    def __init__(self, config):
        super().__init__(config)

        if config.is_decoder:
            logger.warning(
                "If you want to use `XLMRobertaForMaskedLM` make sure `config.is_decoder=False` for "
                "bi-directional self-attention."
            )

        self.roberta = XLMRobertaAssembledModel(config, add_pooling_layer=False)
        self.lm_head = XLMRobertaAssembledLMHead(config)

        # tie the weights
        self.lm_head.down_project.weight = self.roberta.embeddings.primitive_embeddings.weight
        self.lm_head.vocab_project.weight = self.roberta.embeddings.target_coordinates.weight

    # def get_output_embeddings(self):
    #     return self.lm_head.decoder

    # def set_output_embeddings(self, new_embeddings):
    #     self.lm_head.decoder = new_embeddings

    @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
        mask="<mask>",
        expected_output="' Paris'",
        expected_loss=0.1,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        kwargs (`Dict[str, any]`, optional, defaults to *{}*):
            Used to hide legacy arguments that have been deprecated.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(prediction_scores.device)
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


# Adapted from transformers.models.roberta.modeling_roberta.RobertaLMHead
class XLMRobertaAssembledLMHead(nn.Module):
    """Roberta Head for masked language modeling."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        self.down_project = LinearTranspose(in_features=config.num_primitive, out_features=config.hidden_size)
        self.vocab_project = nn.Linear(in_features=config.num_primitive, out_features=config.vocab_size, bias=True)

    def forward(self, features, **kwargs):
        x = self.dense(features)
        x = gelu(x)
        x = self.layer_norm(x)
        # project back to size of vocabulary with bias
        x = self.vocab_project(self.down_project.forward(x, transpose=True))

        return x


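# A small sketch of the weight tying done in `XLMRobertaAssembledForMaskedLM.__init__`: the LM head reuses
# the embedding factorization, projecting hidden states back to primitive coordinates with the transposed
# primitive basis and then to vocabulary logits with the coordinate matrix. Config values and the helper
# name are hypothetical.
def _tied_lm_head_sketch():
    config = XLMRobertaConfig(
        vocab_size=32,
        hidden_size=16,
        num_hidden_layers=2,
        num_attention_heads=2,
        intermediate_size=32,
        max_position_embeddings=20,
        type_vocab_size=1,
        pad_token_id=1,
    )
    config.num_primitive = 4
    model = XLMRobertaAssembledForMaskedLM(config)
    # the embeddings and the head share the very same Parameter objects
    assert model.lm_head.down_project.weight is model.roberta.embeddings.primitive_embeddings.weight
    assert model.lm_head.vocab_project.weight is model.roberta.embeddings.target_coordinates.weight
    # initialize the shared primitive basis (not handled by _init_weights) before a forward pass
    nn.init.normal_(model.roberta.embeddings.primitive_embeddings.weight, std=config.initializer_range)
    logits = model(input_ids=torch.tensor([[0, 5, 6, 2, 1]])).logits
    return logits.shape                                             # (1, 5, config.vocab_size)

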
@add_start_docstrings(
    """
    XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    """,
    XLM_ROBERTA_START_DOCSTRING,
)
# Copied from transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA
class XLMRobertaAssembledForSequenceClassification(XLMRobertaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.roberta = XLMRobertaAssembledModel(config, add_pooling_layer=False)
        self.classifier = XLMRobertaClassificationHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="cardiffnlp/twitter-roberta-base-emotion",
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output="'optimism'",
        expected_loss=0.08,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


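# A minimal sketch of the loss dispatch used in the forward pass above: with `problem_type` unset, integer
# labels select single-label classification (cross-entropy), while float labels select regression (one
# label) or multi-label classification. The tensors and the helper name are hypothetical.
def _sequence_classification_loss_sketch(logits, labels, num_labels):
    if num_labels == 1:
        return MSELoss()(logits.squeeze(), labels.squeeze())                     # regression
    if labels.dtype in (torch.long, torch.int):
        return CrossEntropyLoss()(logits.view(-1, num_labels), labels.view(-1))  # single-label classification
    return BCEWithLogitsLoss()(logits, labels)                                   # multi-label classification

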
@add_start_docstrings(
    """
    XLM-RoBERTa Model with a multiple choice classification head on top (a linear layer on top of the pooled output and
    a softmax) e.g. for RocStories/SWAG tasks.
    """,
    XLM_ROBERTA_START_DOCSTRING,
)
# Copied from transformers.models.roberta.modeling_roberta.RobertaForMultipleChoice with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA
class XLMRobertaAssembledForMultipleChoice(XLMRobertaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.roberta = XLMRobertaAssembledModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(
        XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
    )
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MultipleChoiceModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        flat_inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        outputs = self.roberta(
            flat_input_ids,
            position_ids=flat_position_ids,
            token_type_ids=flat_token_type_ids,
            attention_mask=flat_attention_mask,
            head_mask=head_mask,
            inputs_embeds=flat_inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(reshaped_logits.device)
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """
    XLM-RoBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
    for Named-Entity-Recognition (NER) tasks.
    """,
    XLM_ROBERTA_START_DOCSTRING,
)
# Copied from transformers.models.roberta.modeling_roberta.RobertaForTokenClassification with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA
class XLMRobertaAssembledForTokenClassification(XLMRobertaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = XLMRobertaAssembledModel(config, add_pooling_layer=False)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="Jean-Baptiste/roberta-large-ner-english",
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output="['O', 'ORG', 'ORG', 'O', 'O', 'O', 'O', 'O', 'LOC', 'O', 'LOC', 'LOC']",
        expected_loss=0.01,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(logits.device)
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """
    XLM-RoBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    XLM_ROBERTA_START_DOCSTRING,
)
# Copied from transformers.models.roberta.modeling_roberta.RobertaForQuestionAnswering with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA
class XLMRobertaAssembledForQuestionAnswering(XLMRobertaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = XLMRobertaAssembledModel(config, add_pooling_layer=False)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="deepset/roberta-base-squad2",
        output_type=QuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output="' puppet'",
        expected_loss=0.86,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
            are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


# Copied from transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        input_ids: torch.Tensor
        padding_idx: int
        past_key_values_length: int

    Returns: torch.Tensor
    """
    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx
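

# A small worked example of the position-id scheme above (hypothetical ids, padding_idx=1): non-padding
# tokens are numbered from padding_idx + 1 onward, while padding positions stay at padding_idx.
def _position_ids_sketch():
    input_ids = torch.tensor([[0, 17, 35, 2, 1, 1]])                  # 1 is the padding id
    position_ids = create_position_ids_from_input_ids(input_ids, padding_idx=1)
    return position_ids                                               # tensor([[2, 3, 4, 5, 1, 1]])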