# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Classes to support Vision-Encoder-Text-Decoder architectures."""

from typing import Optional

import torch
from torch.nn import CrossEntropyLoss

from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_outputs import BaseModelOutput, Seq2SeqLMOutput
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import logging
from transformers.models.auto.configuration_auto import AutoConfig
from transformers.models.auto.modeling_auto import AutoModel, AutoModelForCausalLM
from transformers.models.vision_encoder_decoder.configuration_vision_encoder_decoder import VisionEncoderDecoderConfig

from .gpt2 import ThisGPT2Config, ThisGPT2LMHeadModel
from .xglm import ThisXGLMConfig, ThisXGLMForCausalLM
from .opt import ThisOPTConfig, ThisOPTForCausalLM


# Copied from transformers.models.encoder_decoder.modeling_encoder_decoder.shift_tokens_right
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
    Shift input ids one token to the right.
    """
    shifted_input_ids = input_ids.new_zeros(input_ids.shape)
    shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
    if decoder_start_token_id is None:
        raise ValueError("Make sure to set the decoder_start_token_id attribute of the model's configuration.")
    shifted_input_ids[:, 0] = decoder_start_token_id

    if pad_token_id is None:
        raise ValueError("Make sure to set the pad_token_id attribute of the model's configuration.")
    # replace possible -100 values in labels by `pad_token_id`
    shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

    return shifted_input_ids
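
# Illustrative behaviour of `shift_tokens_right`: with `pad_token_id=0` and
# `decoder_start_token_id=1`, labels `[[5, 6, -100]]` become decoder inputs
# `[[1, 5, 6]]`: every token moves one position to the right, the start token is
# prepended, and any `-100` ignore-index values still present after the shift are
# replaced by the pad token id.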

logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "SmallCapConfig"

VISION_ENCODER_DECODER_START_DOCSTRING = r"""
    This class can be used to initialize an image-to-text-sequence model with any pretrained vision autoencoding model
    as the encoder and any pretrained text autoregressive model as the decoder. The encoder is loaded via the
    [`~AutoModel.from_pretrained`] function and the decoder is loaded via the [`~AutoModelForCausalLM.from_pretrained`]
    function. Cross-attention layers are automatically added to the decoder and should be fine-tuned on a downstream
    generative task, like image captioning.

    The effectiveness of initializing sequence-to-sequence models with pretrained checkpoints for sequence generation
    tasks was shown in [Leveraging Pre-trained Checkpoints for Sequence Generation
    Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, and Aliaksei Severyn.

    Additionally, [TrOCR: Transformer-based Optical Character Recognition with Pre-trained
    Models](https://arxiv.org/abs/2109.10282) shows how leveraging large pretrained vision models for optical
    character recognition (OCR) yields a significant performance improvement.

    After such a Vision-Encoder-Text-Decoder model has been trained/fine-tuned, it can be saved/loaded just like any
    other model (see the examples for more information).

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning
    heads, etc.).

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
    and behavior.

    Parameters:
        config ([`VisionEncoderDecoderConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

VISION_ENCODER_DECODER_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using a feature extractor (e.g. if you use ViT as the encoder,
            you should use [`ViTFeatureExtractor`]). See [`ViTFeatureExtractor.__call__`] for details.
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)

            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            For training, `decoder_input_ids` are automatically created by the model by shifting the `labels` to the
            right, replacing -100 by the `pad_token_id` and prepending them with the `decoder_start_token_id`.
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        encoder_outputs (`tuple(torch.FloatTensor)`, *optional*):
            This tuple must consist of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`).
            `last_hidden_state` (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`) is a tensor
            of hidden states at the output of the last layer of the encoder. Used in the cross-attention of the
            decoder.
        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
            representation. This is useful if you want more control over how to convert `decoder_input_ids` indices
            into associated vectors than the model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss for the decoder. Indices should be in `[-100, 0,
            ..., config.vocab_size]` (see the `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked); the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            If set to `True`, the model will return a [`~utils.Seq2SeqLMOutput`] instead of a plain tuple.
        kwargs (*optional*): Remaining dictionary of keyword arguments. Keyword arguments come in two flavors:

            - Without a prefix, which will be input as `**encoder_kwargs` for the encoder forward function.
            - With a *decoder_* prefix, which will be input as `**decoder_kwargs` for the decoder forward function.
"""
class SmallCapConfig(VisionEncoderDecoderConfig):
    model_type = "smallcap"

    def __init__(
        self,
        **kwargs,
    ):
        super().__init__(**kwargs)


class SmallCap(PreTrainedModel):
    r"""
    [`SmallCap`] is a generic model class that will be instantiated as a transformer architecture with one of the base
    vision model classes of the library as encoder and another one as decoder, when created with the
    [`~AutoModel.from_pretrained`] class method for the encoder and the [`~AutoModelForCausalLM.from_pretrained`]
    class method for the decoder.
    """

    config_class = SmallCapConfig
    base_model_prefix = "smallcap"
    main_input_name = "pixel_values"

    def __init__(
        self,
        config: Optional[PretrainedConfig] = None,
        encoder: Optional[PreTrainedModel] = None,
        decoder: Optional[PreTrainedModel] = None,
    ):
        if config is None and (encoder is None or decoder is None):
            raise ValueError("Either a configuration or an encoder and a decoder has to be provided.")
        if config is None:
            config = SmallCapConfig.from_encoder_decoder_configs(encoder.config, decoder.config)
        else:
            if not isinstance(config, self.config_class):
                raise ValueError(f"Config: {config} has to be of type {self.config_class}")

        if config.decoder.cross_attention_hidden_size is not None:
            if config.decoder.cross_attention_hidden_size != config.encoder.hidden_size:
                raise ValueError(
                    "If `cross_attention_hidden_size` is specified in the decoder's configuration, it has to be equal"
                    f" to the encoder's `hidden_size`. Got {config.decoder.cross_attention_hidden_size} for"
                    f" `config.decoder.cross_attention_hidden_size` and {config.encoder.hidden_size} for"
                    " `config.encoder.hidden_size`."
                )

        # initialize with config
        # make sure input & output embeddings are not tied
        config.tie_word_embeddings = False
        super().__init__(config)

        if encoder is None:
            encoder = AutoModel.from_config(config.encoder)

        if decoder is None:
            decoder = AutoModelForCausalLM.from_config(config.decoder)

        self.encoder = encoder.vision_model
        self.encoder.main_input_name = "pixel_values"
        self.decoder = decoder

        # make sure that the individual model's config refers to the shared config
        # so that updates to the config will be synced
        self.encoder.config = self.config.encoder
        self.decoder.config = self.config.decoder

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    def get_output_embeddings(self):
        return self.decoder.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        return self.decoder.set_output_embeddings(new_embeddings)

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        # At the moment fast initialization is not supported for composite models
        if kwargs.get("_fast_init", False):
            logger.warning(
                "Fast initialization is currently not supported for VisionEncoderDecoderModel. "
                "Falling back to slow initialization..."
            )
        kwargs["_fast_init"] = False
        return super().from_pretrained(*args, **kwargs)

    @classmethod
    def from_encoder_decoder_pretrained(
        cls,
        encoder_pretrained_model_name_or_path: str = None,
        decoder_pretrained_model_name_or_path: str = None,
        cross_attention_reduce_factor: int = None,
        *model_args,
        **kwargs,
    ) -> PreTrainedModel:
        r"""
        Instantiate an encoder and a decoder from one or two base classes of the library from pretrained model
        checkpoints.

        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To
        train the model, you need to first set it back in training mode with `model.train()`.

        Params:
            encoder_pretrained_model_name_or_path (`str`, *optional*):
                Information necessary to initiate the image encoder. Can be either:

                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. An
                      example is `google/vit-base-patch16-224-in21k`.
                    - A path to a *directory* containing model weights saved using
                      [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                    - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
                      this case, `from_tf` should be set to `True` and a configuration object should be provided as
                      `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a
                      PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.

            decoder_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`):
                Information necessary to initiate the text decoder. Can be either:

                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under
                      a user or organization name, like `dbmdz/bert-base-german-cased`.
                    - A path to a *directory* containing model weights saved using
                      [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                    - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
                      this case, `from_tf` should be set to `True` and a configuration object should be provided as
                      `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a
                      PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.

            cross_attention_reduce_factor (`int`, *optional*):
                Reduction factor stored on the decoder configuration before the decoder is instantiated; the custom
                decoder implementations (`ThisGPT2LMHeadModel`, `ThisXGLMForCausalLM`, `ThisOPTForCausalLM`) use it to
                shrink their cross-attention layers.

            model_args (remaining positional arguments, *optional*):
                All remaining positional arguments will be passed to the underlying model's `__init__` method.

            kwargs (remaining dictionary of keyword arguments, *optional*):
                Can be used to update the configuration object (after it has been loaded) and to initiate the model
                (e.g., `output_attentions=True`).

                - To update the encoder configuration, use the prefix *encoder_* for each configuration parameter.
                - To update the decoder configuration, use the prefix *decoder_* for each configuration parameter.
                - To update the parent model configuration, do not use a prefix for each configuration parameter.

                Behaves differently depending on whether a `config` is provided or automatically loaded.

        Example:

        ```python
        >>> from transformers import VisionEncoderDecoderModel

        >>> # initialize a vit-bert from a pretrained ViT and a pretrained BERT model. Note that the cross-attention layers will be randomly initialized
        >>> model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
        ...     "google/vit-base-patch16-224-in21k", "bert-base-uncased"
        ... )
        >>> # saving model after fine-tuning
        >>> model.save_pretrained("./vit-bert")
        >>> # load fine-tuned model
        >>> model = VisionEncoderDecoderModel.from_pretrained("./vit-bert")
        ```"""
        kwargs_encoder = {
            argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_")
        }
        kwargs_decoder = {
            argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")
        }

        # remove encoder, decoder kwargs from kwargs
        for key in kwargs_encoder.keys():
            del kwargs["encoder_" + key]
        for key in kwargs_decoder.keys():
            del kwargs["decoder_" + key]

        # Load and initialize the encoder and decoder
        # The distinction between encoder and decoder at the model level is made
        # by the value of the flag `is_decoder` that we need to set correctly.
        encoder = kwargs_encoder.pop("model", None)
        if encoder is None:
            if encoder_pretrained_model_name_or_path is None:
                raise ValueError(
                    "If `encoder_model` is not defined as an argument, an `encoder_pretrained_model_name_or_path` has "
                    "to be defined."
                )

            if "config" not in kwargs_encoder:
                encoder_config, kwargs_encoder = AutoConfig.from_pretrained(
                    encoder_pretrained_model_name_or_path, **kwargs_encoder, return_unused_kwargs=True
                )

                if encoder_config.is_decoder is True or encoder_config.add_cross_attention is True:
                    logger.info(
                        f"Initializing {encoder_pretrained_model_name_or_path} as an encoder model "
                        "from a decoder model. Cross-attention and causal mask are disabled."
                    )
                    encoder_config.is_decoder = False
                    encoder_config.add_cross_attention = False

                kwargs_encoder["config"] = encoder_config

            encoder = AutoModel.from_pretrained(encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder)

        decoder = kwargs_decoder.pop("model", None)
        if decoder is None:
            if decoder_pretrained_model_name_or_path is None:
                raise ValueError(
                    "If `decoder_model` is not defined as an argument, a `decoder_pretrained_model_name_or_path` has "
                    "to be defined."
                )

            if "config" not in kwargs_decoder:
                # pick the matching custom decoder config based on the checkpoint name
                if "xglm" in decoder_pretrained_model_name_or_path:
                    decoder_config, kwargs_decoder = ThisXGLMConfig.from_pretrained(
                        decoder_pretrained_model_name_or_path, **kwargs_decoder, return_unused_kwargs=True
                    )
                elif "opt" in decoder_pretrained_model_name_or_path:
                    decoder_config, kwargs_decoder = ThisOPTConfig.from_pretrained(
                        decoder_pretrained_model_name_or_path, **kwargs_decoder, return_unused_kwargs=True
                    )
                else:
                    decoder_config, kwargs_decoder = ThisGPT2Config.from_pretrained(
                        decoder_pretrained_model_name_or_path, **kwargs_decoder, return_unused_kwargs=True
                    )

                if decoder_config.is_decoder is False or decoder_config.add_cross_attention is False:
                    logger.info(
                        f"Initializing {decoder_pretrained_model_name_or_path} as a decoder model. Cross attention"
                        f" layers are added to {decoder_pretrained_model_name_or_path} and randomly initialized if"
                        f" {decoder_pretrained_model_name_or_path}'s architecture allows for cross attention layers."
                    )
                    decoder_config.is_decoder = True
                    decoder_config.add_cross_attention = True

                decoder_config.encoder_hidden_size = encoder.config.vision_config.hidden_size
                decoder_config.cross_attention_reduce_factor = cross_attention_reduce_factor
                kwargs_decoder["config"] = decoder_config

            if kwargs_decoder["config"].is_decoder is False or kwargs_decoder["config"].add_cross_attention is False:
                logger.warning(
                    f"Decoder model {decoder_pretrained_model_name_or_path} is not initialized as a decoder. "
                    f"In order to initialize {decoder_pretrained_model_name_or_path} as a decoder, "
                    "make sure that the attributes `is_decoder` and `add_cross_attention` of `decoder_config` "
                    "passed to `.from_encoder_decoder_pretrained(...)` are set to `True` or do not pass a "
                    "`decoder_config` to `.from_encoder_decoder_pretrained(...)`"
                )

            # pick the matching custom decoder implementation based on the checkpoint name
            if "xglm" in decoder_pretrained_model_name_or_path:
                decoder = ThisXGLMForCausalLM.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder)
            elif "opt" in decoder_pretrained_model_name_or_path:
                decoder = ThisOPTForCausalLM.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder)
            else:
                decoder = ThisGPT2LMHeadModel.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder)

        # instantiate config with corresponding kwargs
        config = SmallCapConfig.from_encoder_decoder_configs(encoder.config, decoder.config, **kwargs)

        # make sure input & output embeddings are not tied
        config.tie_word_embeddings = False
        return cls(encoder=encoder, decoder=decoder, config=config)

    def forward(
        self,
        pixel_values=None,
        decoder_input_ids=None,
        decoder_attention_mask=None,
        encoder_outputs=None,
        past_key_values=None,
        decoder_inputs_embeds=None,
        labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs,
    ):
        r"""
        Returns:

        Examples:

        ```python
        >>> from transformers import TrOCRProcessor, VisionEncoderDecoderModel
        >>> import requests
        >>> from PIL import Image
        >>> import torch

        >>> processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
        >>> model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

        >>> # load image from the IAM dataset
        >>> url = "https://fki.tic.heia-fr.ch/static/img/a01-122-02.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB")

        >>> # training
        >>> model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
        >>> model.config.pad_token_id = processor.tokenizer.pad_token_id
        >>> model.config.vocab_size = model.config.decoder.vocab_size

        >>> pixel_values = processor(image, return_tensors="pt").pixel_values
        >>> text = "hello world"
        >>> labels = processor.tokenizer(text, return_tensors="pt").input_ids
        >>> outputs = model(pixel_values=pixel_values, labels=labels)
        >>> loss = outputs.loss

        >>> # inference (generation)
        >>> generated_ids = model.generate(pixel_values)
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        kwargs_encoder = {argument: value for argument, value in kwargs.items() if not argument.startswith("decoder_")}
        kwargs_decoder = {
            argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")
        }

        if encoder_outputs is None:
            if pixel_values is None:
                raise ValueError("You have to specify pixel_values")
            encoder_outputs = self.encoder(
                pixel_values=pixel_values,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                **kwargs_encoder,
            )
        elif isinstance(encoder_outputs, tuple):
            encoder_outputs = BaseModelOutput(*encoder_outputs)
        else:
            encoder_outputs = BaseModelOutput(encoder_outputs, None)

        encoder_hidden_states = encoder_outputs[0]
        encoder_attention_mask = None

        if (labels is not None) and (decoder_input_ids is None and decoder_inputs_embeds is None):
            decoder_input_ids = shift_tokens_right(
                labels, self.config.pad_token_id, self.config.decoder_start_token_id
            )

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            use_cache=use_cache,
            past_key_values=past_key_values,
            return_dict=return_dict,
            **kwargs_decoder,
        )

        # Compute the loss independently of the decoder (as some decoders shift the logits internally)
        loss = None
        if labels is not None:
            logits = decoder_outputs.logits if return_dict else decoder_outputs[0]
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.reshape(-1, self.decoder.config.vocab_size), labels.view(-1))

        if not return_dict:
            if loss is not None:
                return (loss,) + decoder_outputs + encoder_outputs
            else:
                return decoder_outputs + encoder_outputs

        return Seq2SeqLMOutput(
            loss=loss,
            logits=decoder_outputs.logits,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )

    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)

    def prepare_inputs_for_generation(
        self, input_ids, past=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs
    ):
        decoder_inputs = self.decoder.prepare_inputs_for_generation(input_ids, past=past)
        decoder_attention_mask = decoder_inputs["attention_mask"] if "attention_mask" in decoder_inputs else None
        input_dict = {
            "attention_mask": attention_mask,
            "decoder_attention_mask": decoder_attention_mask,
            "decoder_input_ids": decoder_inputs["input_ids"],
            "encoder_outputs": encoder_outputs,
            "past_key_values": decoder_inputs["past_key_values"],
            "use_cache": use_cache,
        }
        return input_dict
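
    # Note on generation: `generate()` runs the vision encoder once (via `get_encoder()`)
    # and then calls `prepare_inputs_for_generation` at every decoding step, delegating to
    # the wrapped decoder to build `decoder_input_ids` and to reuse its `past_key_values`
    # cache; `_reorder_cache` below likewise defers to the decoder so that beam search can
    # reorder that cache.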

    def resize_token_embeddings(self, *args, **kwargs):
        raise NotImplementedError(
            "Resizing the embedding layers via the VisionEncoderDecoderModel directly is not supported. Please use"
            " the respective methods of the wrapped decoder object (model.decoder.resize_token_embeddings(...))."
        )

    def _reorder_cache(self, past, beam_idx):
        # apply decoder cache reordering here
        return self.decoder._reorder_cache(past, beam_idx)
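

if __name__ == "__main__":
    # Minimal smoke-test sketch: build a SmallCap model from a CLIP vision encoder and a
    # GPT-2 decoder and run a single forward pass to obtain a loss. The checkpoint names
    # and the reduce factor below are illustrative assumptions, not fixed choices of this
    # module; adjust them to your setup. Run as a module (python -m ...) because of the
    # relative imports above.
    from transformers import AutoTokenizer

    model = SmallCap.from_encoder_decoder_pretrained(
        "openai/clip-vit-base-patch32",  # assumed CLIP-style encoder (must expose `vision_model`)
        "gpt2",                          # assumed GPT-2-style decoder checkpoint
        cross_attention_reduce_factor=1,  # illustrative value: no cross-attention reduction
    )
    tokenizer = AutoTokenizer.from_pretrained("gpt2")

    # `shift_tokens_right` needs these ids on the top-level config
    model.config.decoder_start_token_id = tokenizer.bos_token_id
    model.config.pad_token_id = tokenizer.eos_token_id

    pixel_values = torch.randn(1, 3, 224, 224)  # dummy image batch
    labels = tokenizer("a photo of a cat", return_tensors="pt").input_ids
    outputs = model(pixel_values=pixel_values, labels=labels)
    print("loss:", float(outputs.loss))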