import json
import os

from transformers import ProcessorMixin, AutoProcessor
from transformers.models.auto.processing_auto import AutoProcessor
from transformers.processing_utils import ProcessorMixin
from transformers.tokenization_utils_base import BatchEncoding


class GITProcessor(ProcessorMixin):
    """
    Custom processor that combines an image processor and a tokenizer.

    Text is handled by ``self.tokenizer`` and images by
    ``self.image_processor``; the two results are merged into a single
    ``BatchEncoding``.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(self, image_processor, tokenizer):
        super().__init__(image_processor, tokenizer)

    def __call__(self, text=None, images=None, **kwargs):
        """
        Main processing method that handles both text and images.

        Args:
            text: Text input(s) to tokenize.
            images: Image input(s) to process.
            **kwargs: Additional arguments. The same keyword arguments are
                forwarded unchanged to BOTH the tokenizer and the image
                processor, so only pass options that every component being
                invoked accepts.

        Returns:
            BatchEncoding holding the tokenizer outputs (if ``text`` was
            given) plus the image processor outputs (if ``images`` was
            given). The image key ``pixel_values`` is kept as-is; every
            other image key is stored as ``image_<key>`` to avoid clashing
            with tokenizer keys.

        Raises:
            ValueError: If neither ``text`` nor ``images`` is provided.
        """
        if text is None and images is None:
            raise ValueError("You need to specify either text or images")

        if text is not None:
            text_encoding = self.tokenizer(text, **kwargs)
            if isinstance(text_encoding, BatchEncoding):
                # Build on the tokenizer's own BatchEncoding rather than
                # copying its items into a new one, so any extra state it
                # carries beyond the key/value data is not discarded.
                encoding = text_encoding
            else:
                encoding = BatchEncoding(dict(text_encoding))
        else:
            encoding = BatchEncoding({})

        if images is not None:
            image_encoding = self.image_processor(images, **kwargs)
            for key, value in image_encoding.items():
                # Prefix everything except pixel_values to avoid key conflicts.
                target_key = key if key == "pixel_values" else f"image_{key}"
                encoding[target_key] = value

        return encoding

    def batch_decode(self, *args, **kwargs):
        """
        Delegate batch decoding to the tokenizer.

        All positional and keyword arguments are passed through to
        ``self.tokenizer.batch_decode`` and its result is returned.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        Delegate decoding to the tokenizer.

        All positional and keyword arguments are passed through to
        ``self.tokenizer.decode`` and its result is returned.
        """
        return self.tokenizer.decode(*args, **kwargs)