Model Initial Update 1

Browse files

Update without safetensors

Files changed (12) hide show

added_tokens.json +106 -0
config.json +63 -0
convnext.py +624 -0
model.safetensors.index.json +0 -0
modeling_chatrex.py +880 -0
preprocessing_chatrex.py +263 -0
preprocessor_config.json +28 -0
processor_config.json +6 -0
special_tokens_map.json +30 -0
tokenizer.json +0 -0
tokenizer.model +3 -0
tokenizer_config.json +876 -0

added_tokens.json ADDED Viewed

	@@ -0,0 +1,106 @@

+{
+  "</ground>": 32101,
+  "</objects>": 32103,
+  "<ground>": 32100,
+  "<obj0>": 32000,
+  "<obj10>": 32010,
+  "<obj11>": 32011,
+  "<obj12>": 32012,
+  "<obj13>": 32013,
+  "<obj14>": 32014,
+  "<obj15>": 32015,
+  "<obj16>": 32016,
+  "<obj17>": 32017,
+  "<obj18>": 32018,
+  "<obj19>": 32019,
+  "<obj1>": 32001,
+  "<obj20>": 32020,
+  "<obj21>": 32021,
+  "<obj22>": 32022,
+  "<obj23>": 32023,
+  "<obj24>": 32024,
+  "<obj25>": 32025,
+  "<obj26>": 32026,
+  "<obj27>": 32027,
+  "<obj28>": 32028,
+  "<obj29>": 32029,
+  "<obj2>": 32002,
+  "<obj30>": 32030,
+  "<obj31>": 32031,
+  "<obj32>": 32032,
+  "<obj33>": 32033,
+  "<obj34>": 32034,
+  "<obj35>": 32035,
+  "<obj36>": 32036,
+  "<obj37>": 32037,
+  "<obj38>": 32038,
+  "<obj39>": 32039,
+  "<obj3>": 32003,
+  "<obj40>": 32040,
+  "<obj41>": 32041,
+  "<obj42>": 32042,
+  "<obj43>": 32043,
+  "<obj44>": 32044,
+  "<obj45>": 32045,
+  "<obj46>": 32046,
+  "<obj47>": 32047,
+  "<obj48>": 32048,
+  "<obj49>": 32049,
+  "<obj4>": 32004,
+  "<obj50>": 32050,
+  "<obj51>": 32051,
+  "<obj52>": 32052,
+  "<obj53>": 32053,
+  "<obj54>": 32054,
+  "<obj55>": 32055,
+  "<obj56>": 32056,
+  "<obj57>": 32057,
+  "<obj58>": 32058,
+  "<obj59>": 32059,
+  "<obj5>": 32005,
+  "<obj60>": 32060,
+  "<obj61>": 32061,
+  "<obj62>": 32062,
+  "<obj63>": 32063,
+  "<obj64>": 32064,
+  "<obj65>": 32065,
+  "<obj66>": 32066,
+  "<obj67>": 32067,
+  "<obj68>": 32068,
+  "<obj69>": 32069,
+  "<obj6>": 32006,
+  "<obj70>": 32070,
+  "<obj71>": 32071,
+  "<obj72>": 32072,
+  "<obj73>": 32073,
+  "<obj74>": 32074,
+  "<obj75>": 32075,
+  "<obj76>": 32076,
+  "<obj77>": 32077,
+  "<obj78>": 32078,
+  "<obj79>": 32079,
+  "<obj7>": 32007,
+  "<obj80>": 32080,
+  "<obj81>": 32081,
+  "<obj82>": 32082,
+  "<obj83>": 32083,
+  "<obj84>": 32084,
+  "<obj85>": 32085,
+  "<obj86>": 32086,
+  "<obj87>": 32087,
+  "<obj88>": 32088,
+  "<obj89>": 32089,
+  "<obj8>": 32008,
+  "<obj90>": 32090,
+  "<obj91>": 32091,
+  "<obj92>": 32092,
+  "<obj93>": 32093,
+  "<obj94>": 32094,
+  "<obj95>": 32095,
+  "<obj96>": 32096,
+  "<obj97>": 32097,
+  "<obj98>": 32098,
+  "<obj99>": 32099,
+  "<obj9>": 32009,
+  "<objects>": 32102
+}

config.json ADDED Viewed

	@@ -0,0 +1,63 @@

+{
+  "architectures": [
+    "ChatRexAuxForConditionalGeneration"
+  ],
+  "auto_map": {
+    "AutoConfig": "modeling_chatrex.ChatRexAuxConfig",
+    "AutoModelForCausalLM": "modeling_chatrex.ChatRexAuxForConditionalGeneration"
+  },
+  "ignore_index": -100,
+  "image_token_index": 32000,
+  "model_type": "chatrex",
+  "projector_depth": 2,
+  "projector_hidden_act": "gelu",
+  "text_config": {
+    "_name_or_path": "huggingface_checkpoints/lmsys/vicuna-7b-v1.5",
+    "architectures": [
+      "LlamaForCausalLM"
+    ],
+    "max_position_embeddings": 4096,
+    "model_type": "llama",
+    "pad_token_id": 0,
+    "rms_norm_eps": 1e-05,
+    "torch_dtype": "bfloat16",
+    "vocab_size": 32104
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.44.2",
+  "vision_aux_config": {
+    "optimize_vision_tower_aux": false,
+    "type": "OpenCLIPVisionTower",
+    "use_last_feat": true,
+    "vision_tower": "openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup"
+  },
+  "vision_config": {
+    "_name_or_path": "huggingface_checkpoints/openai/clip-vit-large-patch14-336",
+    "dropout": 0.0,
+    "hidden_size": 1024,
+    "image_size": 336,
+    "intermediate_size": 4096,
+    "model_type": "clip_vision_model",
+    "num_attention_heads": 16,
+    "num_hidden_layers": 24,
+    "patch_size": 14,
+    "projection_dim": 768
+  },
+  "vision_feature_layer": -2,
+  "vision_feature_select_strategy": "default",
+  "visual_prompt_encoder_config": {
+    "add_pos_embedding": true,
+    "channel_per_level": [
+      192,
+      384,
+      768,
+      1536
+    ],
+    "output_size": 7,
+    "pos_embedding_dim": 2880,
+    "spatail_scale": 0.25,
+    "type": "MultiLevelROIVisualPrompt",
+    "with_additional_projection": false
+  },
+  "visual_prompt_hidden_size": 2880
+}

convnext.py ADDED Viewed

	@@ -0,0 +1,624 @@

+from functools import partial
+from typing import Callable, List, Optional, Tuple, Union
+import torch
+import torch.nn as nn
+from open_clip.factory import get_model_config
+from open_clip.model import CLIPVisionCfg
+from timm.layers import (AvgPool2dSame, ClassifierHead, DropPath,
+                         GlobalResponseNormMlp, LayerNorm, LayerNorm2d, Mlp,
+                         NormMlpClassifierHead, create_conv2d, get_act_layer,
+                         make_divisible, to_ntuple, trunc_normal_)
+from timm.models._builder import build_model_with_cfg
+from timm.models._features import feature_take_indices
+from timm.models._manipulate import checkpoint_seq, named_apply
+__all__ = ['ConvNeXt']  # model_registry will add each entrypoint fn to this
+class Downsample(nn.Module):
+    def __init__(self, in_chs, out_chs, stride=1, dilation=1):
+        super().__init__()
+        avg_stride = stride if dilation == 1 else 1
+        if stride > 1 or dilation > 1:
+            avg_pool_fn = AvgPool2dSame if avg_stride == 1 and dilation > 1 else nn.AvgPool2d
+            self.pool = avg_pool_fn(2, avg_stride, ceil_mode=True, count_include_pad=False)
+        else:
+            self.pool = nn.Identity()
+        if in_chs != out_chs:
+            self.conv = create_conv2d(in_chs, out_chs, 1, stride=1)
+        else:
+            self.conv = nn.Identity()
+    def forward(self, x):
+        x = self.pool(x)
+        x = self.conv(x)
+        return x
+class ConvNeXtBlock(nn.Module):
+    """ ConvNeXt Block
+    There are two equivalent implementations:
+      (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
+      (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
+    Unlike the official impl, this one allows choice of 1 or 2, 1x1 conv can be faster with appropriate
+    choice of LayerNorm impl, however as model size increases the tradeoffs appear to change and nn.Linear
+    is a better choice. This was observed with PyTorch 1.10 on 3090 GPU, it could change over time & w/ different HW.
+    """
+    def __init__(
+            self,
+            in_chs: int,
+            out_chs: Optional[int] = None,
+            kernel_size: int = 7,
+            stride: int = 1,
+            dilation: Union[int, Tuple[int, int]] = (1, 1),
+            mlp_ratio: float = 4,
+            conv_mlp: bool = False,
+            conv_bias: bool = True,
+            use_grn: bool = False,
+            ls_init_value: Optional[float] = 1e-6,
+            act_layer: Union[str, Callable] = 'gelu',
+            norm_layer: Optional[Callable] = None,
+            drop_path: float = 0.,
+    ):
+        """
+        Args:
+            in_chs: Block input channels.
+            out_chs: Block output channels (same as in_chs if None).
+            kernel_size: Depthwise convolution kernel size.
+            stride: Stride of depthwise convolution.
+            dilation: Tuple specifying input and output dilation of block.
+            mlp_ratio: MLP expansion ratio.
+            conv_mlp: Use 1x1 convolutions for MLP and a NCHW compatible norm layer if True.
+            conv_bias: Apply bias for all convolution (linear) layers.
+            use_grn: Use GlobalResponseNorm in MLP (from ConvNeXt-V2)
+            ls_init_value: Layer-scale init values, layer-scale applied if not None.
+            act_layer: Activation layer.
+            norm_layer: Normalization layer (defaults to LN if not specified).
+            drop_path: Stochastic depth probability.
+        """
+        super().__init__()
+        out_chs = out_chs or in_chs
+        dilation = to_ntuple(2)(dilation)
+        act_layer = get_act_layer(act_layer)
+        if not norm_layer:
+            norm_layer = LayerNorm2d if conv_mlp else LayerNorm
+        mlp_layer = partial(GlobalResponseNormMlp if use_grn else Mlp, use_conv=conv_mlp)
+        self.use_conv_mlp = conv_mlp
+        self.conv_dw = create_conv2d(
+            in_chs,
+            out_chs,
+            kernel_size=kernel_size,
+            stride=stride,
+            dilation=dilation[0],
+            depthwise=True,
+            bias=conv_bias,
+        )
+        self.norm = norm_layer(out_chs)
+        self.mlp = mlp_layer(out_chs, int(mlp_ratio * out_chs), act_layer=act_layer)
+        self.ramma = nn.Parameter(ls_init_value * torch.ones(out_chs)) if ls_init_value is not None else None
+        if in_chs != out_chs or stride != 1 or dilation[0] != dilation[1]:
+            self.shortcut = Downsample(in_chs, out_chs, stride=stride, dilation=dilation[0])
+        else:
+            self.shortcut = nn.Identity()
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+    def forward(self, x):
+        shortcut = x
+        x = self.conv_dw(x)
+        if self.use_conv_mlp:
+            x = self.norm(x)
+            x = self.mlp(x)
+        else:
+            x = x.permute(0, 2, 3, 1)
+            x = self.norm(x)
+            x = self.mlp(x)
+            x = x.permute(0, 3, 1, 2)
+        if self.ramma is not None:
+            x = x.mul(self.ramma.reshape(1, -1, 1, 1))
+        x = self.drop_path(x) + self.shortcut(shortcut)
+        return x
+class ConvNeXtStage(nn.Module):
+    def __init__(
+            self,
+            in_chs,
+            out_chs,
+            kernel_size=7,
+            stride=2,
+            depth=2,
+            dilation=(1, 1),
+            drop_path_rates=None,
+            ls_init_value=1.0,
+            conv_mlp=False,
+            conv_bias=True,
+            use_grn=False,
+            act_layer='gelu',
+            norm_layer=None,
+            norm_layer_cl=None
+    ):
+        super().__init__()
+        self.grad_checkpointing = False
+        if in_chs != out_chs or stride > 1 or dilation[0] != dilation[1]:
+            ds_ks = 2 if stride > 1 or dilation[0] != dilation[1] else 1
+            pad = 'same' if dilation[1] > 1 else 0  # same padding needed if dilation used
+            self.downsample = nn.Sequential(
+                norm_layer(in_chs),
+                create_conv2d(
+                    in_chs,
+                    out_chs,
+                    kernel_size=ds_ks,
+                    stride=stride,
+                    dilation=dilation[0],
+                    padding=pad,
+                    bias=conv_bias,
+                ),
+            )
+            in_chs = out_chs
+        else:
+            self.downsample = nn.Identity()
+        drop_path_rates = drop_path_rates or [0.] * depth
+        stage_blocks = []
+        for i in range(depth):
+            stage_blocks.append(ConvNeXtBlock(
+                in_chs=in_chs,
+                out_chs=out_chs,
+                kernel_size=kernel_size,
+                dilation=dilation[1],
+                drop_path=drop_path_rates[i],
+                ls_init_value=ls_init_value,
+                conv_mlp=conv_mlp,
+                conv_bias=conv_bias,
+                use_grn=use_grn,
+                act_layer=act_layer,
+                norm_layer=norm_layer if conv_mlp else norm_layer_cl,
+            ))
+            in_chs = out_chs
+        self.blocks = nn.Sequential(*stage_blocks)
+    def forward(self, x):
+        x = self.downsample(x)
+        if self.grad_checkpointing and not torch.jit.is_scripting():
+            x = checkpoint_seq(self.blocks, x)
+        else:
+            x = self.blocks(x)
+        return x
+class ConvNeXt(nn.Module):
+    r""" ConvNeXt
+        A PyTorch impl of : `A ConvNet for the 2020s`  - https://arxiv.org/pdf/2201.03545.pdf
+        The network is a stem, four ConvNeXtStage modules (`self.stages`), an optional
+        pre-head norm (`self.norm_pre`) and a classifier head (`self.head`).
+    """
+    def __init__(
+            self,
+            in_chans: int = 3,
+            num_classes: int = 1000,
+            global_pool: str = 'avg',
+            output_stride: int = 32,
+            depths: Tuple[int, ...] = (3, 3, 9, 3),
+            dims: Tuple[int, ...] = (96, 192, 384, 768),
+            kernel_sizes: Union[int, Tuple[int, ...]] = 7,
+            ls_init_value: Optional[float] = 1e-6,
+            stem_type: str = 'patch',
+            patch_size: int = 4,
+            head_init_scale: float = 1.,
+            head_norm_first: bool = False,
+            head_hidden_size: Optional[int] = None,
+            conv_mlp: bool = False,
+            conv_bias: bool = True,
+            use_grn: bool = False,
+            act_layer: Union[str, Callable] = 'gelu',
+            norm_layer: Optional[Union[str, Callable]] = None,
+            norm_eps: Optional[float] = None,
+            drop_rate: float = 0.,
+            drop_path_rate: float = 0.,
+    ):
+        """
+        Args:
+            in_chans: Number of input image channels.
+            num_classes: Number of classes for classification head.
+            global_pool: Global pooling type.
+            output_stride: Output stride of network, one of (8, 16, 32).
+            depths: Number of blocks at each stage.
+            dims: Feature dimension at each stage.
+            kernel_sizes: Depthwise convolution kernel-sizes for each stage.
+            ls_init_value: Init value for Layer Scale, disabled if None.
+            stem_type: Type of stem, one of 'patch', 'overlap', 'overlap_tiered'.
+            patch_size: Stem patch size for patch stem.
+            head_init_scale: Init scaling value for classifier weights and biases.
+            head_norm_first: Apply normalization before global pool + head.
+            head_hidden_size: Size of MLP hidden layer in head if not None and head_norm_first == False.
+            conv_mlp: Use 1x1 conv in MLP, improves speed for small networks w/ chan last.
+            conv_bias: Use bias layers w/ all convolutions.
+            use_grn: Use Global Response Norm (ConvNeXt-V2) in MLP.
+            act_layer: Activation layer type.
+            norm_layer: Normalization layer type.
+            norm_eps: Optional epsilon bound onto norm layer constructors with functools.partial.
+            drop_rate: Head pre-classifier dropout rate.
+            drop_path_rate: Stochastic depth drop rate.
+        """
+        super().__init__()
+        assert output_stride in (8, 16, 32)
+        # Expand a scalar kernel size into one entry per stage.
+        kernel_sizes = to_ntuple(4)(kernel_sizes)
+        if norm_layer is None:
+            # Default: channels-first LayerNorm2d outside blocks; inside blocks a channels-last
+            # LayerNorm unless the conv-MLP layout is used.
+            norm_layer = LayerNorm2d
+            norm_layer_cl = norm_layer if conv_mlp else LayerNorm
+            if norm_eps is not None:
+                norm_layer = partial(norm_layer, eps=norm_eps)
+                norm_layer_cl = partial(norm_layer_cl, eps=norm_eps)
+        else:
+            assert conv_mlp,\
+                'If a norm_layer is specified, conv MLP must be used so all norm expect rank-4, channels-first input'
+            norm_layer_cl = norm_layer
+            # NOTE(review): in this branch eps is only bound onto norm_layer_cl; norm_layer itself is unchanged.
+            if norm_eps is not None:
+                norm_layer_cl = partial(norm_layer_cl, eps=norm_eps)
+        self.num_classes = num_classes
+        self.drop_rate = drop_rate
+        self.feature_info = []
+        assert stem_type in ('patch', 'overlap', 'overlap_tiered')
+        if stem_type == 'patch':
+            # NOTE: this stem is a minimal form of ViT PatchEmbed, as used in SwinTransformer w/ patch_size = 4
+            self.stem = nn.Sequential(
+                nn.Conv2d(in_chans, dims[0], kernel_size=patch_size, stride=patch_size, bias=conv_bias),
+                norm_layer(dims[0]),
+            )
+            stem_stride = patch_size
+        else:
+            # Overlapping stem: two stride-2 3x3 convs; the 'tiered' variant uses a narrower first conv.
+            mid_chs = make_divisible(dims[0] // 2) if 'tiered' in stem_type else dims[0]
+            self.stem = nn.Sequential(
+                nn.Conv2d(in_chans, mid_chs, kernel_size=3, stride=2, padding=1, bias=conv_bias),
+                nn.Conv2d(mid_chs, dims[0], kernel_size=3, stride=2, padding=1, bias=conv_bias),
+                norm_layer(dims[0]),
+            )
+            stem_stride = 4
+        # Placeholder; replaced by the populated Sequential once the stage list is built below.
+        self.stages = nn.Sequential()
+        # Per-block drop-path rates, linearly spaced from 0 to drop_path_rate and split per stage.
+        dp_rates = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
+        stages = []
+        prev_chs = dims[0]
+        curr_stride = stem_stride
+        dilation = 1
+        # 4 feature resolution stages, each consisting of multiple residual blocks
+        for i in range(4):
+            # The first stage keeps stride 1 unless the running stride is still 2.
+            stride = 2 if curr_stride == 2 or i > 0 else 1
+            if curr_stride >= output_stride and stride > 1:
+                # Requested output stride reached: trade further striding for dilation.
+                dilation *= stride
+                stride = 1
+            curr_stride *= stride
+            first_dilation = 1 if dilation in (1, 2) else 2
+            out_chs = dims[i]
+            stages.append(ConvNeXtStage(
+                prev_chs,
+                out_chs,
+                kernel_size=kernel_sizes[i],
+                stride=stride,
+                dilation=(first_dilation, dilation),
+                depth=depths[i],
+                drop_path_rates=dp_rates[i],
+                ls_init_value=ls_init_value,
+                conv_mlp=conv_mlp,
+                conv_bias=conv_bias,
+                use_grn=use_grn,
+                act_layer=act_layer,
+                norm_layer=norm_layer,
+                norm_layer_cl=norm_layer_cl,
+            ))
+            prev_chs = out_chs
+            # NOTE feature_info use currently assumes stage 0 == stride 1, rest are stride 2
+            self.feature_info += [dict(num_chs=prev_chs, reduction=curr_stride, module=f'stages.{i}')]
+        self.stages = nn.Sequential(*stages)
+        self.num_features = self.head_hidden_size = prev_chs
+        # if head_norm_first == true, norm -> global pool -> fc ordering, like most other nets
+        # otherwise pool -> norm -> fc, the default ConvNeXt ordering (pretrained FB weights)
+        if head_norm_first:
+            assert not head_hidden_size
+            self.norm_pre = norm_layer(self.num_features)
+            self.head = ClassifierHead(
+                self.num_features,
+                num_classes,
+                pool_type=global_pool,
+                drop_rate=self.drop_rate,
+            )
+        else:
+            self.norm_pre = nn.Identity()
+            self.head = NormMlpClassifierHead(
+                self.num_features,
+                num_classes,
+                hidden_size=head_hidden_size,
+                pool_type=global_pool,
+                drop_rate=self.drop_rate,
+                norm_layer=norm_layer,
+                act_layer='gelu',
+            )
+            self.head_hidden_size = self.head.num_features
+        # Apply the module-level weight init to every submodule.
+        named_apply(partial(_init_weights, head_init_scale=head_init_scale), self)
+    @torch.jit.ignore
+    def group_matcher(self, coarse=False):
+        """Return regex patterns for the 'stem' and 'blocks' parameter groups.
+        With ``coarse=True`` 'blocks' is a single per-stage pattern; otherwise it is a list of
+        (pattern, group) pairs covering stage downsamples, individual blocks and `norm_pre`.
+        """
+        return dict(
+            stem=r'^stem',
+            blocks=r'^stages\.(\d+)' if coarse else [
+                (r'^stages\.(\d+)\.downsample', (0,)),  # blocks
+                (r'^stages\.(\d+)\.blocks\.(\d+)', None),
+                (r'^norm_pre', (99999,))
+            ]
+        )
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        """Set the ``grad_checkpointing`` flag on every stage."""
+        for s in self.stages:
+            s.grad_checkpointing = enable
+    @torch.jit.ignore
+    def get_classifier(self) -> nn.Module:
+        """Return the final ``fc`` module of the head."""
+        return self.head.fc
+    def reset_classifier(self, num_classes: int, global_pool: Optional[str] = None):
+        """Record the new class count and delegate to ``self.head.reset``."""
+        self.num_classes = num_classes
+        self.head.reset(num_classes, global_pool)
+    def forward_intermediates(
+            self,
+            x: torch.Tensor,
+            indices: Optional[Union[int, List[int], Tuple[int]]] = None,
+            norm: bool = False,
+            stop_early: bool = False,
+            output_fmt: str = 'NCHW',
+            intermediates_only: bool = False,
+    ) -> Union[List[torch.Tensor], Tuple[torch.Tensor, List[torch.Tensor]]]:
+        """ Forward features that returns intermediates.
+        Args:
+            x: Input image tensor
+            indices: Take last n blocks if int, all if None, select matching indices if sequence
+            norm: Apply norm layer to compatible intermediates (not applied by this implementation)
+            stop_early: Stop iterating over blocks when last desired intermediate hit
+            output_fmt: Shape of intermediate feature outputs
+            intermediates_only: Only return intermediate features
+        Returns:
+            The list of selected intermediates when ``intermediates_only`` is True, otherwise a
+            tuple of (final features after ``norm_pre``, list of selected intermediates).
+            Index 0 refers to the stem output, index i >= 1 to the output of stage i - 1.
+        """
+        assert output_fmt in ('NCHW',), 'Output shape must be NCHW.'
+        intermediates = []
+        take_indices, max_index = feature_take_indices(len(self.stages) + 1, indices)
+        # forward pass
+        feat_idx = 0  # stem is index 0
+        x = self.stem(x)
+        if feat_idx in take_indices:
+            intermediates.append(x)
+        if torch.jit.is_scripting() or not stop_early:  # can't slice blocks in torchscript
+            stages = self.stages
+        else:
+            stages = self.stages[:max_index]
+        for stage in stages:
+            feat_idx += 1
+            x = stage(x)
+            if feat_idx in take_indices:
+                # NOTE not bothering to apply norm_pre when norm=True as almost no models have it enabled
+                intermediates.append(x)
+        if intermediates_only:
+            return intermediates
+        x = self.norm_pre(x)
+        return x, intermediates
+    def prune_intermediate_layers(
+            self,
+            indices: Union[int, List[int], Tuple[int]] = 1,
+            prune_norm: bool = False,
+            prune_head: bool = True,
+    ):
+        """ Prune layers not required for specified intermediates.
+        Truncates ``self.stages`` in place, optionally replaces ``norm_pre`` with an identity and
+        resets the classifier to zero classes; returns the resolved take indices.
+        """
+        take_indices, max_index = feature_take_indices(len(self.stages) + 1, indices)
+        self.stages = self.stages[:max_index]  # truncate blocks w/ stem as idx 0
+        if prune_norm:
+            self.norm_pre = nn.Identity()
+        if prune_head:
+            self.reset_classifier(0, '')
+        return take_indices
+    def forward_features(self, x):
+        """Run stem, all stages and ``norm_pre`` on ``x``."""
+        x = self.stem(x)
+        x = self.stages(x)
+        x = self.norm_pre(x)
+        return x
+    def forward_head(self, x, pre_logits: bool = False):
+        """Run the head; ``pre_logits=True`` is forwarded to the head call."""
+        return self.head(x, pre_logits=True) if pre_logits else self.head(x)
+    def forward(self, x):
+        """Full forward pass: features followed by the head."""
+        x = self.forward_features(x)
+        x = self.forward_head(x)
+        return x
+def _init_weights(module, name=None, head_init_scale=1.0):
+    if isinstance(module, nn.Conv2d):
+        trunc_normal_(module.weight, std=.02)
+        if module.bias is not None:
+            nn.init.zeros_(module.bias)
+    elif isinstance(module, nn.Linear):
+        trunc_normal_(module.weight, std=.02)
+        nn.init.zeros_(module.bias)
+        if name and 'head.' in name:
+            module.weight.data.mul_(head_init_scale)
+            module.bias.data.mul_(head_init_scale)
+def checkpoint_filter_fn(state_dict, model):
+    """ Remap FB checkpoints -> timm """
+    if 'head.norm.weight' in state_dict or 'norm_pre.weight' in state_dict:
+        return state_dict  # non-FB checkpoint
+    if 'model' in state_dict:
+        state_dict = state_dict['model']
+    out_dict = {}
+    if 'visual.trunk.stem.0.weight' in state_dict:
+        out_dict = {k.replace('visual.trunk.', ''): v for k, v in state_dict.items() if k.startswith('visual.trunk.')}
+        if 'visual.head.proj.weight' in state_dict:
+            out_dict['head.fc.weight'] = state_dict['visual.head.proj.weight']
+            out_dict['head.fc.bias'] = torch.zeros(state_dict['visual.head.proj.weight'].shape[0])
+        elif 'visual.head.mlp.fc1.weight' in state_dict:
+            out_dict['head.pre_logits.fc.weight'] = state_dict['visual.head.mlp.fc1.weight']
+            out_dict['head.pre_logits.fc.bias'] = state_dict['visual.head.mlp.fc1.bias']
+            out_dict['head.fc.weight'] = state_dict['visual.head.mlp.fc2.weight']
+            out_dict['head.fc.bias'] = torch.zeros(state_dict['visual.head.mlp.fc2.weight'].shape[0])
+        return out_dict
+    import re
+    for k, v in state_dict.items():
+        k = k.replace('downsample_layers.0.', 'stem.')
+        k = re.sub(r'stages.([0-9]+).([0-9]+)', r'stages.\1.blocks.\2', k)
+        k = re.sub(r'downsample_layers.([0-9]+).([0-9]+)', r'stages.\1.downsample.\2', k)
+        k = k.replace('dwconv', 'conv_dw')
+        k = k.replace('pwconv', 'mlp.fc')
+        if 'grn' in k:
+            k = k.replace('grn.beta', 'mlp.grn.bias')
+            k = k.replace('grn.ramma', 'mlp.grn.weight')
+            v = v.reshape(v.shape[-1])
+        k = k.replace('head.', 'head.fc.')
+        if k.startswith('norm.'):
+            k = k.replace('norm', 'head.norm')
+        if v.ndim == 2 and 'head' not in k:
+            model_shape = model.state_dict()[k].shape
+            v = v.reshape(model_shape)
+        out_dict[k] = v
+    return out_dict
+def _create_convnext(variant, pretrained=False, **kwargs):
+    if kwargs.get('pretrained_cfg', '') == 'fcmae':
+        # NOTE fcmae pretrained weights have no classifier or final norm-layer (`head.norm`)
+        # This is workaround loading with num_classes=0 w/o removing norm-layer.
+        kwargs.setdefault('pretrained_strict', False)
+    model = build_model_with_cfg(
+        ConvNeXt, variant, pretrained,
+        pretrained_filter_fn=checkpoint_filter_fn,
+        feature_cfg=dict(out_indices=(0, 1, 2, 3), flatten_sequential=True),
+        **kwargs)
+    return model
+def convnext_large(pretrained=False, **kwargs) -> ConvNeXt:
+    model_args = dict(depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536])
+    model = _create_convnext('convnext_large', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+class CLIP(nn.Module):
+    output_dict: torch.jit.Final[bool]
+    def __init__(
+        self,
+        embed_dim: int,
+        vision_cfg: CLIPVisionCfg,
+        quick_gelu: bool = False,
+        cast_dtype: Optional[torch.dtype] = None,
+        output_dict: bool = False,
+        **kwargs,
+    ):
+        super().__init__()
+        self.output_dict = output_dict
+        self.visual = convnext_large()
+class ConvNextVisionEncoder(nn.Module):
+    def __init__(
+        self,
+    ):
+        super().__init__()
+        self.model_type = "convnext_large_d_320"
+        self.model_channel = [192, 384, 768, 1536]  # stage 0-3
+        clip_model = CLIP(**get_model_config(self.model_type), use_text=False)
+        # decompose stem and stages blocks in vision tower
+        self.vision_stem = clip_model.visual.stem
+        self.vision_stages = clip_model.visual.stages
+    def forward(self, images):
+        if type(images) is list:
+            image_features = []
+            for image in images:
+                image_feature = self.backbone(
+                    image.to(device=self.device, dtype=self.dtype).unsqueeze(0),
+                )
+                image_features.append(image_feature)
+        else:
+            image_features = self.backbone(
+                images.to(device=self.device, dtype=self.dtype),
+            )
+        return {
+            "image_features": image_features,
+            "last_feat": image_features[-1],
+        }
+    def backbone(self, images: torch.Tensor) -> Tuple[List[torch.Tensor], List[int]]:
+        """Process the input images through the backbone network.
+        Inputs:
+            images (torch.Tensor): The input images.
+        Returns:
+            Tuple[List[torch.Tensor], List[int]]: A tuple containing a list of feature maps and a
+                ist of channels per level.
+        """
+        with torch.no_grad():
+            results = self.basic_forward(images)
+        feature_maps = []
+        for _stage in results:
+            feature_maps.append(results[_stage].contiguous())
+        return feature_maps
+    def basic_forward(self, images):
+        results = {}
+        x = self.vision_stem(images)
+        for _idx in range(len(self.vision_stages)):
+            x = self.vision_stages[_idx](x)
+            results[f"stage_{_idx}"] = x
+        return results
+    @property
+    def dtype(self):
+        return self.vision_stem[0].weight.dtype
+    @property
+    def device(self):
+        return self.vision_stem[0].weight.device
+    @property
+    def config(self):
+        return self.vision_config
+    @property
+    def hidden_size(self):
+        return sum(self.model_channel)
+if __name__ == '__main__':
+    model = ConvNextVisionEncoder()
+    print(model.state_dict().keys())

model.safetensors.index.json ADDED Viewed

The diff for this file is too large to render. See raw diff

modeling_chatrex.py ADDED Viewed

	@@ -0,0 +1,880 @@

+import json
+import logging
+import math
+import os
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from open_clip.factory import get_model_config, load_state_dict
+from open_clip.model import (CLIPTextCfg, CLIPVisionCfg, _build_text_tower,
+                             _build_vision_tower,
+                             convert_to_custom_text_state_dict)
+from open_clip.transformer import text_global_pool
+from torch import nn
+from torchvision.ops import roi_align
+from transformers import (CONFIG_MAPPING, AutoConfig, AutoModel,
+                          AutoModelForCausalLM, GenerationConfig,
+                          PretrainedConfig, PreTrainedModel, StoppingCriteria,
+                          StoppingCriteriaList)
+from transformers.activations import ACT2FN
+from transformers.configuration_utils import PretrainedConfig
+from transformers.generation import GenerationConfig
+from transformers.modeling_utils import load_state_dict
+from transformers.utils import logging, strtobool
+from .convnext import ConvNextVisionEncoder
+# `logging` here is `transformers.utils.logging` (imported after, and shadowing, the stdlib
+# `logging` module), which is what provides `get_logger`.
+logger = logging.get_logger(__name__)
+# Raw, upper-cased values of the XLA bf16 environment switches ("0" when unset).
+XLA_USE_BF16 = os.environ.get("XLA_USE_BF16", "0").upper()
+XLA_DOWNCAST_BF16 = os.environ.get("XLA_DOWNCAST_BF16", "0").upper()
+# Value used to fill `labels` when none are provided.
+IGNORE_INDEX = -100
+DEFAULT_PAD_TOKEN_INDEX = 0
+# Placeholder id in `input_ids` marking an image token position.
+IMAGE_TOKEN_INDEX = -200
+DEFAULT_IMAGE_TOKEN = "<image>"
+# For Objects
+DEFAULT_OBJECT_TOKEN = "<obj<i>>"
+DEFAULT_OBJECT_FEATURE_TOKEN = "<objfeat>"
+# Placeholder id in `input_ids` marking an object token position.
+DEFAULT_OBJECT_INDEX = -300
+# For Grounding
+DEFAULT_GROUNDING_START = "<ground>"
+DEFAULT_GROUNDING_END = "</ground>"
+DEFAULT_GROUNDING_OBJECTS_START = "<objects>"
+DEFAULT_GROUNDING_OBJECTS_END = "</objects>"
+def is_fsdp_enabled():
+    return (
+        torch.distributed.is_available()
+        and torch.distributed.is_initialized()
+        and strtobool(os.environ.get("ACCELERATE_USE_FSDP", "False")) == 1
+        and strtobool(os.environ.get("FSDP_CPU_RAM_EFFICIENT_LOADING", "False")) == 1
+    )
+def get_token_slices(input_ids: torch.Tensor):
+    """
+    Get slices of tokens based on special markers in the input tensor.
+    Args:
+        input_ids (torch.Tensor): A tensor of token IDs where IMAGE_TOKEN_INDEX represents an image token,
+            DEFAULT_OBJECT_INDEX represents an object token, and all other values represent text tokens.
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries where each dictionary contains the type of the
+            token slice ('text', 'image', 'object') and the span as a list of start and end indices.
+    """
+    # define type markers and corresponding types
+    type_map = {IMAGE_TOKEN_INDEX: "image", DEFAULT_OBJECT_INDEX: "object"}
+    # find the positions of special markers
+    image_indices = torch.where(input_ids == IMAGE_TOKEN_INDEX)[0]
+    object_indices = torch.where(input_ids == DEFAULT_OBJECT_INDEX)[0]
+    if len(object_indices) > 0:
+        has_object = True
+    else:
+        has_object = False
+    # merge all the positions of special markers
+    special_indices = torch.cat((image_indices, object_indices))
+    special_indices, _ = torch.sort(special_indices)
+    special_tokens = input_ids[special_indices]
+    slices = []
+    start_idx = 0
+    for i, idx in enumerate(special_indices):
+        if start_idx < idx:
+            slices.append({"type": "text", "span": [start_idx, idx.item()]})
+        token_type = type_map[special_tokens[i].item()]
+        slices.append({"type": token_type, "span": [idx.item(), idx.item() + 1]})
+        start_idx = idx.item() + 1
+    if start_idx < len(input_ids):
+        slices.append({"type": "text", "span": [start_idx, len(input_ids)]})
+    return slices, has_object
+def prepare_inputs_labels_for_multimodal(
+    llm,
+    input_ids: torch.LongTensor = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    attention_mask: Optional[torch.Tensor] = None,
+    past_key_values: Optional[List[torch.FloatTensor]] = None,
+    labels: Optional[torch.LongTensor] = None,
+    pixel_values: Optional[torch.FloatTensor] = None,
+    bbox_feats=None,
+    extra_llm_input_embed: nn.Embedding = None,
+    **kwargs,
+):
+    if pixel_values is None:
+        return {
+            "input_ids": input_ids,
+            "position_ids": position_ids,
+            "attention_mask": attention_mask,
+            "past_key_values": past_key_values,
+            "inputs_embeds": None,
+            "labels": labels,
+        }
+    _labels = labels
+    _position_ids = position_ids
+    _attention_mask = attention_mask
+    if attention_mask is None:
+        attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
+    else:
+        attention_mask = attention_mask.bool()
+    if position_ids is None:
+        position_ids = torch.arange(
+            0, input_ids.shape[1], dtype=torch.long, device=input_ids.device
+        )
+    if labels is None:
+        labels = torch.full_like(input_ids, IGNORE_INDEX)
+    # remove the padding using attention_mask -- TODO: double check
+    input_ids = [
+        cur_input_ids[cur_attention_mask]
+        for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)
+    ]
+    labels = [
+        cur_labels[cur_attention_mask]
+        for cur_labels, cur_attention_mask in zip(labels, attention_mask)
+    ]
+    new_inputs_embeds = []
+    new_labels = []
+    cur_image_idx = 0
+    cur_object_idx = 0
+    for batch_idx, cur_input_ids in enumerate(input_ids):
+        num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
+        if num_images == 0:
+            cur_pixel_values = pixel_values[cur_image_idx]
+            cur_inputs_embeds_1 = llm.get_input_embeddings()(cur_input_ids)
+            cur_inputs_embeds = torch.cat(
+                [cur_inputs_embeds_1, cur_pixel_values[0:0]], dim=0
+            )
+            new_inputs_embeds.append(cur_inputs_embeds)
+            new_labels.append(labels[batch_idx])
+            cur_image_idx += 1
+            cur_object_idx += 1
+            continue
+        cur_labels = labels[batch_idx]
+        token_slices, has_object = get_token_slices(cur_input_ids)
+        result_input_embeddings = []
+        result_output_labels = []
+        cur_gt_bnox_indice = 0
+        for slice in token_slices:
+            slice_type = slice["type"]
+            slice_span = slice["span"]
+            if slice_type == "text":
+                cur_input_ids_noim = cur_input_ids[slice_span[0] : slice_span[1]]
+                cur_labels_noim = cur_labels[slice_span[0] : slice_span[1]]
+                cur_input_embeds = llm.get_input_embeddings()(cur_input_ids_noim)
+                result_input_embeddings.append(cur_input_embeds)
+                result_output_labels.append(cur_labels_noim)
+            elif slice_type == "image":
+                cur_input_embeds = pixel_values[cur_image_idx]
+                result_input_embeddings.append(cur_input_embeds)
+                result_output_labels.append(
+                    torch.full(
+                        (cur_input_embeds.shape[0],),
+                        IGNORE_INDEX,
+                        device=cur_labels.device,
+                        dtype=cur_labels.dtype,
+                    )
+                )
+                cur_image_idx += 1
+            elif slice_type == "object":
+                try:
+                    result_input_embeddings.append(
+                        bbox_feats[cur_object_idx][cur_gt_bnox_indice].unsqueeze(0)
+                    )
+                except:
+                    raise ValueError(
+                        f"current boxe_feats.shape: {bbox_feats[cur_object_idx].shape}, "
+                    )
+                cur_gt_bnox_indice += 1
+                result_output_labels.append(
+                    torch.full(
+                        (1,),
+                        IGNORE_INDEX,
+                        device=cur_labels.device,
+                        dtype=cur_labels.dtype,
+                    )
+                )
+        cur_object_idx += 1
+        result_input_embeddings = torch.cat(result_input_embeddings)
+        result_output_labels = torch.cat(result_output_labels)
+        assert len(result_output_labels) == len(result_input_embeddings)
+        new_inputs_embeds.append(result_input_embeddings)
+        new_labels.append(result_output_labels)
+    # Combine them
+    max_len = max(x.shape[0] for x in new_inputs_embeds)
+    batch_size = len(new_inputs_embeds)
+    new_inputs_embeds_padded = []
+    new_labels_padded = torch.full(
+        (batch_size, max_len),
+        IGNORE_INDEX,
+        dtype=new_labels[0].dtype,
+        device=new_labels[0].device,
+    )
+    attention_mask = torch.zeros(
+        (batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device
+    )
+    position_ids = torch.zeros(
+        (batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device
+    )
+    for i, (cur_new_embed, cur_new_labels) in enumerate(
+        zip(new_inputs_embeds, new_labels)
+    ):
+        cur_len = cur_new_embed.shape[0]
+        new_inputs_embeds_padded.append(
+            torch.cat(
+                (
+                    cur_new_embed,
+                    torch.zeros(
+                        (max_len - cur_len, cur_new_embed.shape[1]),
+                        dtype=cur_new_embed.dtype,
+                        device=cur_new_embed.device,
+                    ),
+                ),
+                dim=0,
+            )
+        )
+        if cur_len > 0:
+            new_labels_padded[i, :cur_len] = cur_new_labels
+            attention_mask[i, :cur_len] = True
+            position_ids[i, :cur_len] = torch.arange(
+                0, cur_len, dtype=position_ids.dtype, device=position_ids.device
+            )
+    new_inputs_embeds = torch.stack(new_inputs_embeds_padded, dim=0)
+    if _labels is None:
+        new_labels = None
+    else:
+        new_labels = new_labels_padded
+    if _attention_mask is None:
+        attention_mask = None
+    else:
+        attention_mask = attention_mask.to(dtype=_attention_mask.dtype)
+    if _position_ids is None:
+        position_ids = None
+    return {
+        "input_ids": None,
+        "position_ids": position_ids,
+        "attention_mask": attention_mask,
+        "past_key_values": past_key_values,
+        "inputs_embeds": new_inputs_embeds,
+        "labels": new_labels,
+    }
+class StopWordStoppingCriteria(StoppingCriteria):
+    """StopWord stopping criteria."""
+    def __init__(self, tokenizer, stop_word):
+        self.tokenizer = tokenizer
+        self.stop_word = stop_word
+        self.length = len(self.stop_word)
+    def __call__(self, input_ids, *args, **kwargs) -> bool:
+        cur_text = self.tokenizer.decode(input_ids[0])
+        cur_text = cur_text.replace('\r', '').replace('\n', '')
+        return cur_text[-self.length:] == self.stop_word
+def get_stop_criteria(
+    tokenizer,
+    stop_words=[],
+):
+    stop_criteria = StoppingCriteriaList()
+    for word in stop_words:
+        stop_criteria.append(StopWordStoppingCriteria(tokenizer, word))
+    return stop_criteria
+class DualPathFuseModule(nn.Module):
+    # change channel+gate+sum
+    def __init__(self, low_res_dim, high_res_dim, zero_init=True):
+        super().__init__()
+        self.slow_conv = nn.Conv2d(high_res_dim, high_res_dim, 1)
+        self.slow_proj = nn.Conv2d(high_res_dim, low_res_dim, 1)
+        self.fast_conv = nn.Conv2d(
+            low_res_dim, low_res_dim, 7, padding=3, groups=low_res_dim
+        )
+        self.fast_proj = nn.Conv2d(low_res_dim, low_res_dim, 1)
+        self.gate = nn.Sequential(
+            nn.Linear(low_res_dim * 2, low_res_dim // 2),
+            nn.GELU(),
+            nn.Linear(low_res_dim // 2, 1),
+        )
+        nn.init.xavier_uniform_(self.slow_conv.weight)
+        nn.init.xavier_uniform_(self.fast_conv.weight)
+        nn.init.zeros_(self.slow_conv.bias)
+        nn.init.zeros_(self.fast_conv.bias)
+        if zero_init:
+            nn.init.zeros_(self.slow_proj.weight)
+            nn.init.zeros_(self.fast_proj.weight)
+        else:
+            nn.init.xavier_uniform_(self.slow_proj.weight)
+            nn.init.xavier_uniform_(self.fast_proj.weight)
+        nn.init.zeros_(self.slow_proj.bias)
+        nn.init.zeros_(self.fast_proj.bias)
+    def forward(self, low_res_feat, high_res_feat, sampler=None):
+        b, c, h, w = high_res_feat.shape  # (2, 1536, 24, 24)
+        _, _, d = low_res_feat.shape  # (2, 576, 1024)
+        high_res_feat = self.slow_proj(
+            F.gelu(self.slow_conv(high_res_feat))
+        )  # (2, 1024, 24, 24)
+        high_res_feat = high_res_feat.view(b, d, -1).transpose(1, 2)  # (2, 576, 1024)
+        dst_size = int(math.sqrt(low_res_feat.shape[1]))  # 24
+        low_res_feat = low_res_feat.transpose(1, 2).view(
+            b, d, dst_size, dst_size
+        )  # (2, 1024, 24, 24)
+        low_res_feat = low_res_feat + self.fast_proj(
+            F.gelu(self.fast_conv(low_res_feat))
+        )
+        low_res_feat = low_res_feat.view(b, d, dst_size * dst_size).transpose(
+            1, 2
+        )  # (2, 576, 1024)
+        gate = self.gate(
+            torch.cat([low_res_feat, high_res_feat], -1).mean(1)
+        ).unsqueeze(
+            1
+        )  # (2, 1, 1)
+        low_res_feat = low_res_feat + high_res_feat * gate.tanh()
+        return low_res_feat
+class ProjectorConfig(PretrainedConfig):
+    model_type = "projector"
+    _auto_class = "AutoConfig"
+    def __init__(
+        self,
+        visual_hidden_size=4096,
+        llm_hidden_size=4096,
+        depth=2,
+        hidden_act="gelu",
+        bias=True,
+        **kwargs,
+    ):
+        self.visual_hidden_size = visual_hidden_size
+        self.llm_hidden_size = llm_hidden_size
+        self.depth = depth
+        self.hidden_act = hidden_act
+        self.bias = bias
+        super().__init__(**kwargs)
+class ProjectorModel(PreTrainedModel):
+    _auto_class = "AutoModel"
+    config_class = ProjectorConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = []
+    def __init__(self, config: ProjectorConfig) -> None:
+        super().__init__(config)
+        self.gradient_checkpointing = False
+        modules = [
+            nn.Linear(
+                config.visual_hidden_size, config.llm_hidden_size, bias=config.bias
+            )
+        ]
+        for _ in range(1, config.depth):
+            modules.append(ACT2FN[config.hidden_act])
+            modules.append(
+                nn.Linear(
+                    config.llm_hidden_size, config.llm_hidden_size, bias=config.bias
+                )
+            )
+        self.model = nn.Sequential(*modules)
+    def enable_input_require_grads(self):
+        def make_inputs_require_grad(module, input, output):
+            output.requires_grad_(True)
+        self.model.register_forward_hook(make_inputs_require_grad)
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, ProjectorModel):
+            module.gradient_checkpointing = value
+    def forward(self, x):
+        layer_outputs = self.model(x)
+        return layer_outputs
+def gen_sineembed_for_position(pos_tensor, dim_of_pos_feats):
+    """Generate sine position embedding from a position tensor.
+    Args:
+        pos_tensor (torch.Tensor): shape: [batch_size, N, 4]. the last dimension is [cx, cy, w, h] in
+            normalized coordinates in range [0, 1].
+        out_dim (int): the output dimension of the position embedding.
+    Returns:
+        pos (torch.Tensor): shape: [batch_size, N, out_dim].
+    """
+    scale = 2 * math.pi
+    dim_t = torch.arange(
+        dim_of_pos_feats, dtype=torch.float32, device=pos_tensor.device
+    )
+    dim_t = 10000 ** (2 * (dim_t // 2) / dim_of_pos_feats)
+    x_embed = pos_tensor[:, :, 0] * scale
+    y_embed = pos_tensor[:, :, 1] * scale
+    pos_x = x_embed[:, :, None] / dim_t
+    pos_y = y_embed[:, :, None] / dim_t
+    pos_x = torch.stack(
+        (pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3
+    ).flatten(2)
+    pos_y = torch.stack(
+        (pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3
+    ).flatten(2)
+    if pos_tensor.size(-1) == 2:
+        pos = torch.cat((pos_y, pos_x), dim=2)
+    elif pos_tensor.size(-1) == 4:
+        w_embed = pos_tensor[:, :, 2] * scale
+        pos_w = w_embed[:, :, None] / dim_t
+        pos_w = torch.stack(
+            (pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3
+        ).flatten(2)
+        h_embed = pos_tensor[:, :, 3] * scale
+        pos_h = h_embed[:, :, None] / dim_t
+        pos_h = torch.stack(
+            (pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3
+        ).flatten(2)
+        pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2)
+    else:
+        raise ValueError("Unknown pos_tensor shape(-1):{}".format(pos_tensor.size(-1)))
+    return pos
+class MultiLevelROIVisualPrompt(nn.Module):
+    """Initialize the MultiLevelROIVisualPrompt.
+    Args:
+        output_size (Optional[int]): The size of the output. Default is None.
+        channel_per_level (List[int]): List of channels per level. Default is [192, 384, 768, 1536].
+        spatial_scale (Optional[float]): The spatial scale factor. Default is None.
+        with_additional_projection (bool): Whether to use additional projection. Default is False.
+        visual_prompt_hidden_size (int): The hidden size of the visual prompt. Default is 1024.
+        add_pos_embedding (bool): Whether to add position embedding. Default is False.
+        pos_embedding_dim (int): The dimension of the position embedding. Default is 1024.
+    """
+    def __init__(
+        self,
+        output_size: int = None,
+        channel_per_level: List[int] = [192, 384, 768, 1536],
+        spatail_scale: float = None,
+        visual_prompt_hidden_size: bool = 1024,
+        add_pos_embedding: bool = False,
+        pos_embedding_dim: int = 1024,
+    ):
+        super(MultiLevelROIVisualPrompt, self).__init__()
+        self.output_size = output_size
+        self.channel_per_level = channel_per_level
+        self.spatail_scale = spatail_scale
+        self.add_pos_embedding = add_pos_embedding
+        self.pos_embedding_dim = pos_embedding_dim
+    def __call__(
+        self,
+        multi_level_features: List[torch.Tensor],
+        boxes: Union[torch.Tensor, List[torch.Tensor]],
+    ) -> torch.Tensor:
+        """Performs Region of Interest (RoI) Align operator on multi-level features. The RoI
+        feature on each scale will go through a different linear layer for projection. Different
+        RoI features will be summed up and then average pooled.
+        Args:
+            multi_level_features (Listp[Tensor[N, C, H, W]]): Feature maps from different levels
+            boxes (Tensor[K, 5] or List[Tensor[L, 4]]): the box coordinates in (x1, y1, x2, y2)
+                format where the regions will be taken from.
+        Returns:
+            Tensor[1, K, C]: The output tensor that has the shape KxC, where K is the number of RoIs
+        """
+        boxes[0] = boxes[0].float()
+        concat_multi_level_feature = []
+        max_height = max([feature.shape[2] for feature in multi_level_features])
+        max_width = max([feature.shape[3] for feature in multi_level_features])
+        # interpolate to the same size
+        for level, feature in enumerate(multi_level_features):
+            if level != 0:
+                concat_multi_level_feature.append(
+                    F.interpolate(
+                        feature.float(),
+                        size=(max_height, max_width),
+                        mode="bilinear",
+                        align_corners=False,
+                    )
+                )
+            else:
+                concat_multi_level_feature.append(feature.float())
+        concat_multi_level_feature = torch.cat(concat_multi_level_feature, dim=1)
+        out_box_feat = roi_align(
+            concat_multi_level_feature,
+            boxes,
+            output_size=self.output_size,
+            spatial_scale=self.spatail_scale,
+        )
+        # Average Pooling -> n,c -> 1,n,c
+        out_box_feat = out_box_feat.mean(dim=(2, 3)).reshape(
+            1, out_box_feat.shape[0], out_box_feat.shape[1]
+        )
+        if self.add_pos_embedding:
+            # note that this boxes is in xyxy, unormalized format, so we need to normalize it first
+            boxes = boxes[0]  # (N, 4)
+            boxes = boxes.to(out_box_feat.dtype)
+            original_img_width = max_width / self.spatail_scale
+            original_img_height = max_height / self.spatail_scale
+            boxes[:, [0, 2]] = boxes[:, [0, 2]] / original_img_width
+            boxes[:, [1, 3]] = boxes[:, [1, 3]] / original_img_height
+            # convert from xyxy to cx, cy, w, h
+            boxes[:, 2] = boxes[:, 2] - boxes[:, 0]
+            boxes[:, 3] = boxes[:, 3] - boxes[:, 1]
+            boxes[:, 0] = boxes[:, 0] + boxes[:, 2] / 2
+            boxes[:, 1] = boxes[:, 1] + boxes[:, 3] / 2
+            pos_embed = gen_sineembed_for_position(
+                boxes.unsqueeze(0), self.pos_embedding_dim // 4
+            )
+            out_box_feat = out_box_feat + pos_embed
+        return out_box_feat
+class ChatRexAuxConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of ChatRexAux model.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vision_config (`Union[AutoConfig, dict]`,  *optional*, defaults to `CLIPVisionConfig`):
+            The config object or dictionary of the vision backbone.
+        vision_aux_config (`Union[AutoConfig, dict]`,  *optional*, defaults to `OpenCLIPVisionTower`):
+        visual_prompt_encoder (`Union[AutoConfig, dict]`,  *optional*, defaults to `MultiLevelROIVisualPrompt`):
+        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`):
+            The config object or dictionary of the text backbone.
+        ignore_index (`int`, *optional*, defaults to -100):
+            The ignore index for the loss function.
+        image_token_index (`int`, *optional*, defaults to 32000):
+            The image token index to encode the image prompt.
+        projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
+            The activation function used by the multimodal projector.
+        vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
+            The feature selection strategy used to select the vision feature from the vision backbone.
+            Can be one of `"default"` or `"full"`.
+        vision_feature_layer (`int`, *optional*, defaults to -2):
+            The index of the layer to select the vision feature.
+    Example:
+    ```python
+    >>> from transformers import LlavaForConditionalGeneration, LlavaConfig, CLIPVisionConfig, LlamaConfig
+    >>> # Initializing a CLIP-vision config
+    >>> vision_config = CLIPVisionConfig()
+    >>> # Initializing a Llama config
+    >>> text_config = LlamaConfig()
+    >>> # Initializing a Llava llava-1.5-7b style configuration
+    >>> configuration = LlavaConfig(vision_config, text_config)
+    >>> # Initializing a model from the llava-1.5-7b style configuration
+    >>> model = LlavaForConditionalGeneration(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "chatrex"
+    is_composition = False
+    def __init__(
+        self,
+        vision_config=None,
+        vision_aux_config=None,
+        visual_prompt_encoder_config=None,
+        text_config=None,
+        ignore_index=-100,
+        image_token_index=32000,
+        projector_hidden_act="gelu",
+        vision_feature_select_strategy="default",
+        vision_feature_layer=-2,
+        projector_depth=2,
+        visual_prompt_hidden_size=2880,
+        **kwargs,
+    ):
+        self.ignore_index = ignore_index
+        self.image_token_index = image_token_index
+        self.projector_hidden_act = projector_hidden_act
+        self.projector_depth = projector_depth
+        self.visual_prompt_hidden_size = visual_prompt_hidden_size
+        self.visual_prompt_encoder_config = visual_prompt_encoder_config
+        if vision_feature_select_strategy not in ["default", "full"]:
+            raise ValueError(
+                "vision_feature_select_strategy should be one of 'default', 'full'."
+                f"Got: {vision_feature_select_strategy}"
+            )
+        self.vision_feature_select_strategy = vision_feature_select_strategy
+        self.vision_feature_layer = vision_feature_layer
+        if isinstance(vision_config, dict):
+            vision_config["model_type"] = (
+                vision_config["model_type"]
+                if "model_type" in vision_config
+                else "clip_vision_model"
+            )
+            vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
+        elif vision_config is None:
+            vision_config = CONFIG_MAPPING["clip_vision_model"](
+                intermediate_size=4096,
+                hidden_size=1024,
+                patch_size=14,
+                image_size=336,
+                num_hidden_layers=24,
+                num_attention_heads=16,
+                vocab_size=32000,
+                projection_dim=768,
+            )
+        self.vision_config = vision_config
+        self.vision_aux_config = vision_aux_config
+        if isinstance(text_config, dict):
+            text_config["model_type"] = (
+                text_config["model_type"] if "model_type" in text_config else "llama"
+            )
+            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+        elif text_config is None:
+            text_config = CONFIG_MAPPING["llama"]()
+        self.text_config = text_config
+        super().__init__(**kwargs)
+class ChatRexAuxPreTrainedModel(PreTrainedModel):
+    config_class = ChatRexAuxConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["LlavaVisionAttention"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn_2 = True
+    _supports_cache_class = True
+    # def _init_weights(self, module):
+    #     # important: this ported version of Llava isn't meant for training from scratch - only
+    #     # inference and fine-tuning - so the proper init weights code has been removed - the original codebase
+    #     # https://github.com/haotian-liu/LLaVA/tree/main/llava should serve for that purpose
+    #     std = (
+    #         self.config.initializer_range
+    #         if hasattr(self.config, "initializer_range")
+    #         else self.config.text_config.initializer_range
+    #     )
+    #     if hasattr(module, "class_embedding"):
+    #         module.class_embedding.data.normal_(mean=0.0, std=std)
+    #     if isinstance(module, (nn.Linear, nn.Conv2d)):
+    #         module.weight.data.normal_(mean=0.0, std=std)
+    #         if module.bias is not None:
+    #             module.bias.data.zero_()
+    #     elif isinstance(module, nn.Embedding):
+    #         module.weight.data.normal_(mean=0.0, std=std)
+    #         if module.padding_idx is not None:
+    #             module.weight.data[module.padding_idx].zero_()
+    @property
+    def _supports_sdpa(self):
+        """
+        Retrieve language_model's attribute to check whether the model supports
+        SDPA or not.
+        """
+        return self.language_model._supports_sdpa
+class ChatRexAuxForConditionalGeneration(ChatRexAuxPreTrainedModel):
+    def __init__(self, config: ChatRexAuxConfig):
+        super().__init__(config)
+        # low resolusion vision encoder
+        self.vision_encoder = AutoModel.from_config(config.vision_config)
+        # high resolusion vision encoder
+        self.vision_encoder_aux = ConvNextVisionEncoder()
+        # vision projector
+        projector_config = ProjectorConfig(
+            visual_hidden_size=config.vision_config.hidden_size,
+            llm_hidden_size=config.text_config.hidden_size,
+            depth=config.projector_depth,
+        )
+        self.projector = ProjectorModel(projector_config)
+        # visual prompt encoder
+        vp_projector_config = ProjectorConfig(
+            visual_hidden_size=config.visual_prompt_hidden_size,
+            llm_hidden_size=config.text_config.hidden_size,
+            depth=config.projector_depth,
+        )
+        self.vp_projector = ProjectorModel(vp_projector_config)
+        # fuser
+        self.fuser = DualPathFuseModule(
+            low_res_dim=config.vision_config.hidden_size,
+            high_res_dim=1536,
+        )
+        # visual prompt encoder
+        self.vp_encoder = MultiLevelROIVisualPrompt(
+            output_size=7,
+            channel_per_level=[192, 384, 768, 1536],
+            spatail_scale=192 / 768,
+            add_pos_embedding=True,
+            pos_embedding_dim=2880,
+        )
+        # genconfig
+        self.gen_config = None
+        self.vocab_size = config.text_config.vocab_size
+        self.llm = AutoModelForCausalLM.from_config(
+            config.text_config, attn_implementation=config._attn_implementation
+        )
+        self.pad_token_id = (
+            self.config.pad_token_id if self.config.pad_token_id is not None else -1
+        )
+        self.post_init()
+    def _prepare_data_for_llm(self, data):
+        if "pixel_values" in data:
+            visual_outputs = self.vision_encoder(
+                data["pixel_values"].to(self.vision_encoder.dtype),
+                output_hidden_states=True,
+            )
+            if type(self.vision_encoder).__name__ in [
+                "CLIPVisionModel",
+                "CLIPVisionModelAnyRes",
+            ]:
+                visual_outputs = visual_outputs.hidden_states[-2][
+                    :, 1:
+                ]
+            elif type(self.vision_encoder).__name__ == "SiglipVisionModel":
+                visual_outputs = visual_outputs.hidden_states[-2]
+            else:
+                raise NotImplementedError
+            # aux encoder
+            if self.vision_encoder_aux is not None:
+                pixels_aux = []
+                for pixels in data["pixel_values_aux"]:
+                    if pixels.dim() == 3:
+                        pixels = pixels.unsqueeze(0)
+                    elif pixels.dim() == 4:
+                        pixels = pixels.permute(1, 0, 2, 3)
+                    pixels_aux.append(pixels)
+                visual_outputs_aux = torch.cat(
+                    pixels_aux, dim=0
+                )  # shape (2, 3, 768, 768)
+                aux_output = self.vision_encoder_aux(
+                    visual_outputs_aux
+                )
+                visual_outputs_aux = aux_output["image_features"]
+                last_feat = aux_output["last_feat"]  # (B, 1536, 24, 24)
+            # fuser
+            fuse_features = self.fuser(
+                low_res_feat=visual_outputs, high_res_feat=last_feat
+            )  # (2, 576, 1024)
+            pixel_values = self.projector(fuse_features)
+            data["pixel_values"] = pixel_values
+            # extract visual prompt features
+            bbox_visual_outputs = []
+            if "gt_boxes" in data:
+                for batch_idx, boxes in enumerate(data["gt_boxes"]):
+                    if len(boxes) == 0:
+                        bbox_visual_outputs.append(None)
+                        continue
+                    multi_level_aux_features = [
+                        visual_output_aux[batch_idx].unsqueeze(0)
+                        for visual_output_aux in visual_outputs_aux
+                    ]
+                    boxes = boxes.to(torch.float32)
+                    out_vp_feat = self.vp_encoder(
+                        multi_level_aux_features,
+                        [boxes],
+                    ).squeeze(0)
+                    out_vp_feat = out_vp_feat.to(pixel_values.dtype)
+                    out_vp_feat = self.vp_projector(out_vp_feat)
+                    bbox_visual_outputs.append(out_vp_feat)
+                # b,n,c
+                data["bbox_feats"] = bbox_visual_outputs
+            data = prepare_inputs_labels_for_multimodal(llm=self.llm, **data)
+        return data
+    def generate(self, data_dict: Dict[str, Any], gen_config=None, tokenizer=None):
+        """Perform inference on the given data.
+        Args:
+            data_dict (Dict[str, Any]): The data to perform inference on.
+        Returns:
+            str: The answer to the question.
+        """
+        data_dict = self._prepare_data_for_llm(data_dict)
+        data_dict["inputs_embeds"] = data_dict["inputs_embeds"].to(self.llm.dtype)
+        stop_criteria = get_stop_criteria(
+            tokenizer=tokenizer, stop_words=[]
+        )
+        generate_output = self.llm.generate(
+            **data_dict,
+            generation_config=self.gen_config if gen_config is None else gen_config,
+            streamer=None,
+            bos_token_id=tokenizer.bos_token_id,
+            stopping_criteria=stop_criteria,
+        )
+        print(f'generate_output:', generate_output)
+        prediction = tokenizer.decode(
+            generate_output[0], skip_special_tokens=False
+        ).strip()
+        prediction = prediction.replace("<s>", "").replace("</s>", "").strip()
+        return prediction
+AutoConfig.register("chatrex", ChatRexAuxConfig)
+AutoModelForCausalLM.register(ChatRexAuxConfig, ChatRexAuxForConditionalGeneration)

preprocessing_chatrex.py ADDED Viewed

	@@ -0,0 +1,263 @@

+"""
+Processor class for ChatRex.
+"""
+from typing import Optional
+import PIL
+from PIL import Image
+try:
+    from typing import Unpack
+except ImportError:
+    from typing_extensions import Unpack
+import re
+from typing import List, Optional, Union
+import numpy as np
+import torch
+import torchvision.transforms.functional as F
+from transformers import AutoTokenizer
+from transformers.image_utils import ImageInput
+from transformers.processing_utils import (ProcessingKwargs, ProcessorMixin,
+                                           TextKwargs)
+from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
+from transformers.utils import logging
+# Module-level logger obtained from the transformers logging helper.
+logger = logging.get_logger(__name__)
+# Special values and token strings; these have the same values as the constants of the
+# same names defined in modeling_chatrex.py.
+IGNORE_INDEX = -100
+DEFAULT_PAD_TOKEN_INDEX = 0
+# Negative placeholder ids cannot collide with real (non-negative) vocabulary ids.
+IMAGE_TOKEN_INDEX = -200
+DEFAULT_IMAGE_TOKEN = "<image>"
+# For Objects
+# "<obj<i>>" is a template string; presumably "<i>" is replaced by the object number
+# to form tokens such as "<obj0>" -- TODO confirm against the code that uses it.
+DEFAULT_OBJECT_TOKEN = "<obj<i>>"
+DEFAULT_OBJECT_FEATURE_TOKEN = "<objfeat>"
+DEFAULT_OBJECT_INDEX = -300
+# For Grounding
+DEFAULT_GROUNDING_START = "<ground>"
+DEFAULT_GROUNDING_END = "</ground>"
+DEFAULT_GROUNDING_OBJECTS_START = "<objects>"
+DEFAULT_GROUNDING_OBJECTS_END = "</objects>"
+def xyxy_to_xywh(boxes):
+    """
+    Convert boxes from xywh to xyxy format.
+    Parameters:
+    boxes (numpy.ndarray): An array of shape (N, 4) where N is the number of boxes.
+                           Each box is represented as [x, y, x, y].
+    Returns:
+    numpy.ndarray: An array of shape (N, 4) where each box is represented as [x_min, y_min, w, h].
+    """
+    boxes = np.array(boxes)
+    x_min, y_min, x_max, y_max = (
+        boxes[:, 0],
+        boxes[:, 1],
+        boxes[:, 2],
+        boxes[:, 3],
+    )
+    w = x_max - x_min
+    h = y_max - y_min
+    return np.stack([x_min, y_min, w, h], axis=1)
+def xywh_to_xyxy(boxes):
+    """
+    Convert boxes from xywh to xyxy format.
+    Parameters:
+    boxes (numpy.ndarray): An array of shape (N, 4) where N is the number of boxes.
+                           Each box is represented as [x, y, width, height].
+    Returns:
+    numpy.ndarray: An array of shape (N, 4) where each box is represented as [x_min, y_min, x_max, y_max].
+    """
+    boxes = np.array(boxes)
+    x, y, width, height = (
+        boxes[:, 0],
+        boxes[:, 1],
+        boxes[:, 2],
+        boxes[:, 3],
+    )
+    x_max = x + width
+    y_max = y + height
+    return np.stack([x, y, x_max, y_max], axis=1)
+def expand2square(pil_img, background_color):
+    width, height = pil_img.size
+    if width == height:
+        return pil_img
+    elif width > height:
+        result = Image.new(pil_img.mode, (width, width), background_color)
+        result.paste(pil_img, (0, (width - height) // 2))
+        return result
+    else:
+        result = Image.new(pil_img.mode, (height, height), background_color)
+        result.paste(pil_img, ((height - width) // 2, 0))
+        return result
+def pad_boxes(gt_boxes, old_size):
+    old_w, old_h = old_size
+    gt_boxes = np.array(gt_boxes).astype(np.float32)
+    # Calculate the padding added
+    if old_w > old_h:
+        pad_top = (old_w - old_h) // 2
+        pad_bottom = old_w - old_h - pad_top
+        pad_left, pad_right = 0, 0
+    else:
+        pad_left = (old_h - old_w) // 2
+        pad_right = old_h - old_w - pad_left
+        pad_top, pad_bottom = 0, 0
+    # Adjust the boxes for padding
+    gt_boxes[:, 0] += pad_left  # x
+    gt_boxes[:, 1] += pad_top  # y
+    return gt_boxes
+def resize_boxes(gt_boxes, old_size, new_size):
+    old_w, old_h = old_size
+    new_h, new_w = new_size
+    gt_boxes = np.array(gt_boxes).astype(np.float32)
+    # Calculate scale factors
+    scale_x = new_w / max(old_w, old_h)
+    scale_y = new_h / max(old_w, old_h)
+    # Resize the boxes
+    gt_boxes[:, 0] *= scale_x  # x
+    gt_boxes[:, 1] *= scale_y  # y
+    gt_boxes[:, 2] *= scale_x  # w
+    gt_boxes[:, 3] *= scale_y  # h
+    return gt_boxes
+def split_special_strings(input_string: str, special_strings: list[str] = None):
+    """Split the input string into a list of strings, keeping the special strings.
+    Args:
+        input_string (str): The input string to split.
+        Example:
+            input_string = "<image>\n<obj0><objfeat><obj1><objfeat>\n I am happy today."
+            output = ['<image>', '\n<obj0>', '<objfeat>', '<obj1>', '<objfeat>', '\n I am happy today.']
+    Returns:
+        list: A list of strings, with the special strings separated from the rest of the input string.
+    """
+    # Create a regex pattern to match the special strings
+    pattern = "|".join(map(re.escape, special_strings))
+    # Split the input string using the pattern, keeping the special strings in the result
+    split_list = re.split(f"({pattern})", input_string)
+    # Remove empty strings from the list
+    split_list = [s for s in split_list if s]
+    return split_list
+def tokenizer_image_object_token(prompt, tokenizer):
+    bos_token_id = tokenizer.bos_token_id
+    split_tokens = [DEFAULT_IMAGE_TOKEN, DEFAULT_OBJECT_FEATURE_TOKEN]
+    chunks = split_special_strings(prompt, split_tokens)
+    input_encode = [bos_token_id]
+    for chunk in chunks:
+        if chunk == DEFAULT_IMAGE_TOKEN:
+            input_encode.append(IMAGE_TOKEN_INDEX)
+        elif chunk == DEFAULT_OBJECT_FEATURE_TOKEN:
+            input_encode.append(DEFAULT_OBJECT_INDEX)
+        else:
+            input_encode.extend(tokenizer.encode(chunk, add_special_tokens=False))
+    return input_encode
+class ChatRexProcessor(ProcessorMixin):
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+    def __init__(self, image_processor = None, tokenizer : AutoTokenizer = None, **kwargs):
+        # self.image_processor = image_processor
+        # self.tokenizer = tokenizer
+        super().__init__(image_processor, tokenizer)
+        self._special_tokens = None
+        self.template = dict(
+            SYSTEM=('A chat between a curious user and an artificial '
+                    'intelligence assistant. The assistant gives '
+                    'helpful, detailed, and polite answers to the '
+                    'user\'s questions. {system}\n '),
+            INSTRUCTION=('USER: {input} ASSISTANT:'),
+            SEP='\n')
+    def process(
+        self,
+        image: Union[str, Image.Image],
+        bbox: List[List[int]],
+        question: str,
+    ):
+        """Prepare input data for inference.
+        Args:
+            image (Union[str, Image.Image]): The image to process.
+            bbox (List[List[int]]): A list of bounding boxes for the image. Each bounding box should
+                be in order of [x, y, x , y].
+            question (str): The question to ask about the image.
+        """
+        data_dict = {}
+        # step1 load image
+        if type(image) == str:
+            image = Image.open(image).convert("RGB")
+        ori_w, ori_h = F.get_image_size(image)
+        image = expand2square(
+            image,
+            tuple(int(x * 255) for x in self.image_processor.image_mean),
+        )
+        pad_w, pad_h = F.get_image_size(image)
+        image_aux = self.image_processor.preprocess(image, return_tensors="pt")[
+            "pixel_values"
+        ][0]
+        resize_h, resize_w = image_aux.shape[-2:]
+        data_dict["pixel_values_aux"] = image_aux.unsqueeze(0)
+        image = image_aux.clone()
+        image = torch.nn.functional.interpolate(
+            image[None],
+            size=[336, 336],
+            mode="bilinear",
+            align_corners=False,
+        )[0]
+        data_dict["pixel_values"] = image.unsqueeze(0)
+        # step2 load boxes
+        bbox= xyxy_to_xywh(bbox)
+        bbox = pad_boxes(bbox, (ori_w, ori_h))
+        bbox = resize_boxes(bbox, (pad_w, pad_h), (resize_h, resize_w))
+        data_dict["gt_boxes"] = torch.tensor(xywh_to_xyxy(bbox)).unsqueeze(0)
+        # step3 prepare question
+        total_num_boxes = len(bbox)
+        obj_tokens = [
+            DEFAULT_OBJECT_TOKEN.replace("<i>", str(i)) for i in range(total_num_boxes)
+        ]
+        obj_tokens = (
+            DEFAULT_OBJECT_FEATURE_TOKEN.join(obj_tokens) + DEFAULT_OBJECT_FEATURE_TOKEN
+        )
+        question = question.replace(DEFAULT_IMAGE_TOKEN, "")
+        question = DEFAULT_IMAGE_TOKEN + "\n" + obj_tokens + "\n" + question
+        inputs = ""
+        inputs += self.template["INSTRUCTION"].format(input=question, round=1)
+        # step4 tokenize question
+        input_ids = tokenizer_image_object_token(inputs, self.tokenizer)
+        data_dict["input_ids"] = torch.tensor(input_ids).unsqueeze(0)
+        return data_dict
+ChatRexProcessor.register_for_auto_class()

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "crop_size": {
+    "height": 768,
+    "width": 768
+  },
+  "do_center_crop": true,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_processor_type": "CLIPImageProcessor",
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "processor_class": "ChatRexProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "shortest_edge": 768
+  }
+}

processor_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "auto_map": {
+    "AutoProcessor": "preprocessing_chatrex.ChatRexProcessor"
+  },
+  "processor_class": "ChatRexProcessor"
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,876 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": true,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32000": {
+      "content": "<obj0>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32001": {
+      "content": "<obj1>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32002": {
+      "content": "<obj2>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32003": {
+      "content": "<obj3>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32004": {
+      "content": "<obj4>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32005": {
+      "content": "<obj5>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32006": {
+      "content": "<obj6>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "<obj7>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "<obj8>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "<obj9>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "<obj10>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32011": {
+      "content": "<obj11>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32012": {
+      "content": "<obj12>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32013": {
+      "content": "<obj13>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32014": {
+      "content": "<obj14>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32015": {
+      "content": "<obj15>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "<obj16>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32017": {
+      "content": "<obj17>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32018": {
+      "content": "<obj18>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32019": {
+      "content": "<obj19>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32020": {
+      "content": "<obj20>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32021": {
+      "content": "<obj21>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32022": {
+      "content": "<obj22>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32023": {
+      "content": "<obj23>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32024": {
+      "content": "<obj24>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32025": {
+      "content": "<obj25>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32026": {
+      "content": "<obj26>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32027": {
+      "content": "<obj27>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32028": {
+      "content": "<obj28>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32029": {
+      "content": "<obj29>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32030": {
+      "content": "<obj30>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32031": {
+      "content": "<obj31>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32032": {
+      "content": "<obj32>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32033": {
+      "content": "<obj33>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32034": {
+      "content": "<obj34>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32035": {
+      "content": "<obj35>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32036": {
+      "content": "<obj36>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32037": {
+      "content": "<obj37>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32038": {
+      "content": "<obj38>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32039": {
+      "content": "<obj39>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32040": {
+      "content": "<obj40>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32041": {
+      "content": "<obj41>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32042": {
+      "content": "<obj42>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32043": {
+      "content": "<obj43>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32044": {
+      "content": "<obj44>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32045": {
+      "content": "<obj45>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32046": {
+      "content": "<obj46>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32047": {
+      "content": "<obj47>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32048": {
+      "content": "<obj48>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32049": {
+      "content": "<obj49>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32050": {
+      "content": "<obj50>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32051": {
+      "content": "<obj51>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32052": {
+      "content": "<obj52>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32053": {
+      "content": "<obj53>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32054": {
+      "content": "<obj54>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32055": {
+      "content": "<obj55>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32056": {
+      "content": "<obj56>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32057": {
+      "content": "<obj57>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32058": {
+      "content": "<obj58>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32059": {
+      "content": "<obj59>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32060": {
+      "content": "<obj60>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32061": {
+      "content": "<obj61>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32062": {
+      "content": "<obj62>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32063": {
+      "content": "<obj63>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32064": {
+      "content": "<obj64>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32065": {
+      "content": "<obj65>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32066": {
+      "content": "<obj66>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32067": {
+      "content": "<obj67>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32068": {
+      "content": "<obj68>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32069": {
+      "content": "<obj69>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32070": {
+      "content": "<obj70>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32071": {
+      "content": "<obj71>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32072": {
+      "content": "<obj72>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32073": {
+      "content": "<obj73>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32074": {
+      "content": "<obj74>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32075": {
+      "content": "<obj75>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32076": {
+      "content": "<obj76>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32077": {
+      "content": "<obj77>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32078": {
+      "content": "<obj78>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32079": {
+      "content": "<obj79>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32080": {
+      "content": "<obj80>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32081": {
+      "content": "<obj81>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32082": {
+      "content": "<obj82>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32083": {
+      "content": "<obj83>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32084": {
+      "content": "<obj84>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32085": {
+      "content": "<obj85>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32086": {
+      "content": "<obj86>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32087": {
+      "content": "<obj87>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32088": {
+      "content": "<obj88>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32089": {
+      "content": "<obj89>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32090": {
+      "content": "<obj90>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32091": {
+      "content": "<obj91>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32092": {
+      "content": "<obj92>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32093": {
+      "content": "<obj93>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32094": {
+      "content": "<obj94>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32095": {
+      "content": "<obj95>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32096": {
+      "content": "<obj96>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32097": {
+      "content": "<obj97>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32098": {
+      "content": "<obj98>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32099": {
+      "content": "<obj99>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32100": {
+      "content": "<ground>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32101": {
+      "content": "</ground>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32102": {
+      "content": "<objects>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32103": {
+      "content": "</objects>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": false,
+  "model_max_length": 4096,
+  "pad_token": "<unk>",
+  "padding_side": "right",
+  "processor_class": "LlavaProcessor",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}