| from transformers import SmolVLMProcessor | |
| from typing import TYPE_CHECKING, Dict, List, Optional, Union | |
class SmolVLMQwen3Processor(SmolVLMProcessor):
    """SmolVLM processor declared for use with a ``Qwen2TokenizerFast``.

    Construction is delegated to ``SmolVLMProcessor``. Once the parent
    constructor has run, the special-token attributes and the chat template
    are replaced with the values assigned in ``__init__``.
    """

    attributes = ["image_processor", "tokenizer"]
    valid_kwargs = ["image_seq_len", "chat_template"]
    image_processor_class = "SmolVLMImageProcessor"
    tokenizer_class = "Qwen2TokenizerFast"

    def __init__(
        self,
        image_processor,
        tokenizer=None,
        image_seq_len: int = 169,
        chat_template: Optional[str] = None,
        **kwargs,
    ):
        """Build the parent processor, then install the token overrides.

        Args:
            image_processor: Forwarded unchanged to the parent constructor.
            tokenizer: Forwarded unchanged to the parent constructor; its
                ``chat_template`` is read afterwards via ``self.tokenizer``.
            image_seq_len: Forwarded positionally to the parent constructor.
            chat_template: Forwarded to the parent constructor. The final
                ``self.chat_template`` is nevertheless always the
                tokenizer's template (see the last statement).
            **kwargs: Forwarded unchanged to the parent constructor.
        """
        super().__init__(
            image_processor,
            tokenizer,
            image_seq_len,
            chat_template=chat_template,
            **kwargs,
        )

        # Fixed special-token values that replace whatever the parent set.
        # NOTE(review): "<vision_start>" and "<im_end>" lack the "|"
        # delimiters used by the other tokens here -- confirm intentional.
        # NOTE(review): image_token_id is hard-coded; presumably it is the
        # id of "<|image_pad|>" -- verify against the tokenizer vocabulary.
        token_overrides = (
            ("fake_image_token", "<vision_start>"),
            ("image_token", "<|image_pad|>"),
            ("image_token_id", 151655),
            ("end_of_utterance_token", "<im_end>"),
            ("global_image_token", "<|vision_pad|>"),
            ("video_token", "<|vision_pad|>"),
        )
        for attr_name, attr_value in token_overrides:
            setattr(self, attr_name, attr_value)

        # The tokenizer's template always wins over the ``chat_template``
        # argument.
        # NOTE(review): confirm that discarding an explicitly passed
        # template is the intended behaviour.
        self.chat_template = self.tokenizer.chat_template