Spaces:

aleo1
/

LuojiaHOG-demo

Running

App Files Files Community

aleo1 committed on May 20, 2024

Commit

9f3352f

verified ·

1 Parent(s): 4072e7b

Upload 23 files

Browse files

Files changed (23) hide show

cisen/config/cisen_r0.9_fpn.yaml +76 -0
cisen/engine/__init__.py +0 -0
cisen/engine/__pycache__/__init__.cpython-38.pyc +0 -0
cisen/engine/__pycache__/engine.cpython-38.pyc +0 -0
cisen/engine/demo.py +0 -0
cisen/engine/engine.py +0 -0
cisen/model/__init__.py +354 -0
cisen/model/__pycache__/__init__.cpython-38.pyc +0 -0
cisen/model/__pycache__/clip.cpython-38.pyc +0 -0
cisen/model/__pycache__/layers.cpython-38.pyc +0 -0
cisen/model/__pycache__/segmenter.cpython-38.pyc +0 -0
cisen/model/builder.py +25 -0
cisen/model/clip.py +1207 -0
cisen/model/layers.py +633 -0
cisen/model/segmenter.py +2045 -0
cisen/utils/__pycache__/config.cpython-38.pyc +0 -0
cisen/utils/__pycache__/dataset.cpython-38.pyc +0 -0
cisen/utils/bpe_simple_vocab_16e6.txt.gz +3 -0
cisen/utils/config.py +157 -0
cisen/utils/dataset.py +478 -0
cisen/utils/hash.py +314 -0
cisen/utils/misc.py +444 -0
cisen/utils/simple_tokenizer.py +132 -0

cisen/config/cisen_r0.9_fpn.yaml ADDED Viewed

	@@ -0,0 +1,76 @@

+DATA:
+  dataset: classification
+  dataset_json_file: /data02/xy/dataEngine/json_data/LuojiaHOG(test)_.json
+#  dataset_json_file: /data02/xy/dataEngine/json_data/merged_output_combined_9w_resplit.json
+  # dataset_json_file: /data02/xy/dataEngine/json_data/merged_output_combined_9w_resplit.json
+  exp_name: classifi
+  ratio: 0
+  dataset_train_split: 0.6
+  dataset_query_split: 0.2
+  imgs_folder: /data02/xy/Clip-hash/datasets/image/
+  label_path: /data02/xy/Clip-hash/labels.txt
+  num_classes: 10
+  # num_classes: 131
+TRAIN:
+  # Base Arch
+  # clip_pretrain: /data02/xy/Clip-hash/pretrain/RS5M_ViT-B-32.pt
+  clip_pretrain: ./cisen/pretrain/RS5M_ViT-B-32.pt
+  model_name: ViT-B-32
+  ckpt_path: /data02/xy/GeoRSCLIP/codebase/inference/pretrain/RS5M_ViT-B-32.pt
+  input_size: 224
+  word_len: 328
+  word_dim: 1024
+  vis_dim: 512
+  fpn_in: [ 512, 768, 768 ]
+  fpn_out: [ 768, 768, 768, 512 ]
+  sync_bn: True
+  # Decoder
+  num_layers: 3
+  num_head: 8
+  dim_ffn: 2048
+  dropout: 0.1
+  intermediate: False
+  # Training Setting
+  workers: 32  # data loader workers
+  workers_val: 16
+  epochs: 50
+  milestones: [50]
+  start_epoch: 0
+  batch_size: 256 # batch size for training
+  batch_size_val: 256 # batch size for validation during training (memory/speed trade-off)
+  base_lr: 0.0001
+  min_lr: 0.00000001
+  lr_decay: 0.5
+  lr_multi: 0.1
+  weight_decay: 0.
+  max_norm: 0.
+  manual_seed: 0
+  print_freq: 1
+  lamda1: 0.5
+  lamda2: 0.5
+  beta1: 0.5
+  beta2: 0.5
+  eta: 0.2
+  warmup_epochs: 0
+  contrastive: [0.4, 0.3, 0.3]
+  # Resume & Save
+  output_folder: /data02/xy/Clip-hash/exp/
+  save_freq: 1
+  weight:  # path to initial weight (default: none)
+  resume: False # path to latest checkpoint (default: none)
+  evaluate: True  # evaluate on the validation set; needs extra GPU memory, so a small batch_size_val is recommended
+Distributed:
+  dist_url: tcp://localhost:3693
+  dist_backend: 'nccl'
+  multiprocessing_distributed: True
+  world_size: 1
+  rank: 0
+TEST:
+  test_split: val-test
+  gpu : [0]
+  test_lmdb: /data02/xy/Clip-hash/datasets/lmdb/refcoco/val.lmdb
+  visualize: False
+  topk: 5
+  test_batch_size: 256 #1111111
+  val_batch_size: 1

cisen/engine/__init__.py ADDED Viewed

File without changes

cisen/engine/__pycache__/__init__.cpython-38.pyc ADDED Viewed

Binary file (133 Bytes). View file

cisen/engine/__pycache__/engine.cpython-38.pyc ADDED Viewed

Binary file (7.95 kB). View file

cisen/engine/demo.py ADDED Viewed

File without changes

cisen/engine/engine.py ADDED Viewed

The diff for this file is too large to render. See raw diff

cisen/model/__init__.py ADDED Viewed

	@@ -0,0 +1,354 @@

+from .segmenter import CRIS, CISEN, Clip_hash_model, zh_clip, poi_clip, Clip_model, CISEN_vit, CISEN_rsvit, CISEN_new, CISEN_rsvit_classification, CISEN_lclip
+from .segmenter import *
+from loguru import logger
+from transformers import AlignProcessor, AlignModel
+# def build_segmenter(args):
+#     model = CRIS(args)
+#     backbone = []
+#     backbone_no_decay = []
+#     head = []
+#     for k, v in model.named_parameters():
+#         if k.startswith('backbone') and 'positional_embedding' not in k:
+#             backbone.append(v)
+#         elif 'positional_embedding' in k:
+#             backbone_no_decay.append(v)
+#         else:
+#             head.append(v)
+#     print('Backbone with decay: {}, Backbone without decay: {}, Head: {}'.format(
+#         len(backbone), len(backbone_no_decay), len(head)))
+#     param_list = [{
+#         'params': backbone,
+#         'initial_lr': args.lr_multi * args.base_lr
+#     }, {
+#         'params': backbone_no_decay,
+#         'initial_lr': args.lr_multi * args.base_lr,
+#         'weight_decay': 0
+#     }, {
+#         'params': head,
+#         'initial_lr': args.base_lr
+#     }]
+#     return model, param_list
+def build_CISEN(args, stage):
+    model = CISEN_new(args)
+    backbone = []
+    head = []
+    ADP = []
+    ADP_t = []
+    fuse = []
+    name = []
+    for k, v in model.named_parameters():
+        if k.startswith('backbone') and 'backbone.positional_embedding' not in k:
+        # if k.startswith('backbone'):
+            v.requires_grad = False
+            backbone.append(v)
+        elif k.startswith('ADP'):
+            # v.requires_grad = False
+            ADP.append(v)
+        elif k.startswith('FPN'):
+            fuse.append(v)
+        elif k.startswith('gap'):
+            fuse.append(v)
+        elif k.startswith('ADP_t'):
+            ADP_t.append(v)
+        else:
+            head.append(v)
+            name.append(k)
+    # logger.info('Backbone with decay={}, Head={}'.format(len(backbone), len(head)))
+    # param_list = [{
+    #     'params': backbone,
+    #     'initial_lr': args.lr_multi * float(args.base_lr)
+    # }, {
+    #     'params': head,
+    #     'initial_lr': args.base_lr
+    # }, {
+    #     'params': proj,
+    #     'initial_lr': args.base_lr
+    # }]
+    if stage == '1st':
+        param_list = [{
+            'params': ADP,
+            'initial_lr': args.base_lr
+        },{
+            'params': head,
+            'initial_lr': args.base_lr
+        }]
+    elif stage == '2nd':
+        param_list = [{
+            'params': fuse,
+            'initial_lr': args.base_lr
+        }]
+    elif stage == '4th':
+        param_list = [{
+            'params': fuse,
+            'initial_lr': args.base_lr
+        }]
+    elif stage == '5th':
+        param_list = [{
+        #     'params': ADP,
+        #     'initial_lr': args.base_lr
+        # },{
+        #     'params': ADP_t,
+        #     'initial_lr': args.base_lr
+        # },{
+            'params': fuse,
+            'initial_lr': args.base_lr
+        }]
+    else:
+        print('stage should be either 1st or 2nd')
+    return model, param_list
+def build_CISEN_lclip(args, stage):
+    model = CISEN_lclip(args)
+    backbone = []
+    head = []
+    ADP = []
+    ADP_t = []
+    fuse = []
+    name = []
+    for k, v in model.named_parameters():
+        # if k.startswith('backbone') and 'backbone.positional_embedding' not in k:
+        if k.startswith('backbone'):
+            v.requires_grad = False
+            backbone.append(v)
+        elif k.startswith('ADP'):
+            # v.requires_grad = False
+            ADP.append(v)
+        elif k.startswith('FPN'):
+            fuse.append(v)
+        elif k.startswith('gap'):
+            fuse.append(v)
+        elif k.startswith('ADP_t'):
+            ADP_t.append(v)
+        else:
+            head.append(v)
+            name.append(k)
+    # logger.info('Backbone with decay={}, Head={}'.format(len(backbone), len(head)))
+    # param_list = [{
+    #     'params': backbone,
+    #     'initial_lr': args.lr_multi * float(args.base_lr)
+    # }, {
+    #     'params': head,
+    #     'initial_lr': args.base_lr
+    # }, {
+    #     'params': proj,
+    #     'initial_lr': args.base_lr
+    # }]
+    if stage == '1st':
+        param_list = [{
+            'params': ADP,
+            'initial_lr': args.base_lr
+        },{
+            'params': head,
+            'initial_lr': args.base_lr
+        }]
+    elif stage == '2nd':
+        param_list = [{
+            'params': fuse,
+            'initial_lr': args.base_lr
+        }]
+    elif stage == '4th':
+        param_list = [{
+            'params': fuse,
+            'initial_lr': args.base_lr
+        }]
+    elif stage == '5th':
+        param_list = [{
+        #     'params': ADP,
+        #     'initial_lr': args.base_lr
+        # },{
+        #     'params': ADP_t,
+        #     'initial_lr': args.base_lr
+        # },{
+            'params': fuse,
+            'initial_lr': args.base_lr
+        }]
+    else:
+        print('stage should be either 1st or 2nd')
+    return model, param_list
+def build_CISEN_vit(args, stage):
+    model = CISEN_rsvit(args)
+    backbone = []
+    head = []
+    ADP = []
+    ADP_t = []
+    fuse = []
+    name = []
+    for k, v in model.named_parameters():
+        # if k.startswith('backbone') and 'backbone.positional_embedding' not in k:
+        if k.startswith('backbone'):
+            v.requires_grad = False
+            backbone.append(v)
+        elif k.startswith('ADP'):
+            v.requires_grad = False
+            ADP.append(v)
+        elif k.startswith('FPN'):
+            # v.requires_grad = False
+            fuse.append(v)
+        elif k.startswith('ms_adaptor'):
+            # v.requires_grad = False
+            fuse.append(v)
+        else:
+            head.append(v)
+            name.append(k)
+    # logger.info('Backbone with decay={}, Head={}'.format(len(backbone), len(head)))
+    # param_list = [{
+    #     'params': backbone,
+    #     'initial_lr': args.lr_multi * float(args.base_lr)
+    # }, {
+    #     'params': head,
+    #     'initial_lr': args.base_lr
+    # }, {
+    #     'params': proj,
+    #     'initial_lr': args.base_lr
+    # }]
+    if stage == '1st':
+        param_list = [{
+            'params': ADP,
+            'initial_lr': args.base_lr
+        },{
+            'params': head,
+            'initial_lr': args.base_lr
+        }]
+    elif stage == '2nd':
+        param_list = [{
+            'params': fuse,
+            'initial_lr': args.base_lr
+        }]
+    elif stage == '4th':
+        param_list = [{
+            'params': fuse,
+            'initial_lr': args.base_lr
+        }]
+    elif stage == '5th':
+        param_list = [{
+        #     'params': ADP,
+        #     'initial_lr': args.base_lr
+        # },{
+        #     'params': ADP_t,
+        #     'initial_lr': args.base_lr
+        # },{
+            'params': fuse,
+            'initial_lr': args.base_lr
+        }]
+    else:
+        print('stage should be either 1st or 2nd')
+    return model, param_list
+def build_CISEN_vit_classification(args, stage):
+    model = CISEN_rsvit_classification(args)
+    # logger.info('Backbone with decay={}, Head={}'.format(len(backbone), len(head)))
+    # param_list = [{
+    #     'params': backbone,
+    #     'initial_lr': args.lr_multi * float(args.base_lr)
+    # }, {
+    #     'params': head,
+    #     'initial_lr': args.base_lr
+    # }, {
+    #     'params': proj,
+    #     'initial_lr': args.base_lr
+    # }]
+    return model
+def build_segmenter(args):
+    model = CRIS(args)
+    backbone = []
+    head = []
+    for k, v in model.named_parameters():
+        if k.startswith('backbone') and 'positional_embedding' not in k:
+            backbone.append(v)
+        elif k.startswith('Label_encoder') and "token_embedding" not in k:
+            v.requires_grad = False
+        else:
+            head.append(v)
+    logger.info('Backbone with decay={}, Head={}'.format(len(backbone), len(head)))
+    param_list = [{
+        'params': backbone,
+        'initial_lr': args.lr_multi * float(args.base_lr)
+    }, {
+        'params': head,
+        'initial_lr': args.base_lr
+    }]
+    return model, param_list
+def build_hash(args):
+    model = Clip_hash_model(args)
+    backbone = []
+    head = []
+    for k, v in model.named_parameters():
+        if k.startswith('backbone') and 'positional_embedding' not in k:
+            backbone.append(v)
+        else:
+            head.append(v)
+    logger.info('Backbone with decay={}, Head={}'.format(len(backbone), len(head)))
+    param_list = [{
+        'params': backbone,
+        'initial_lr': args.lr_multi * args.base_lr
+    }, {
+        'params': head,
+        'initial_lr': args.base_lr
+    }]
+    return model, param_list
+def build_zh_segmenter(args):
+    model = zh_clip(args)
+    backbone = []
+    head = []
+    for k, v in model.named_parameters():
+        if k.startswith('backbone') and 'positional_embedding' not in k:
+            backbone.append(v)
+        else:
+            head.append(v)
+    logger.info('Backbone with decay={}, Head={}'.format(len(backbone), len(head)))
+    param_list = [{
+        'params': backbone,
+        'initial_lr': args.lr_multi * args.base_lr
+    }, {
+        'params': head,
+        'initial_lr': args.base_lr
+    }]
+    return model, param_list
+def build_poi_segmenter(args):
+    model = poi_clip(args)
+    backbone = []
+    head = []
+    for k, v in model.named_parameters():
+        if k.startswith('backbone') and 'positional_embedding' not in k:
+            backbone.append(v)
+        else:
+            head.append(v)
+    logger.info('Backbone with decay={}, Head={}'.format(len(backbone), len(head)))
+    param_list = [{
+        'params': backbone,
+        'initial_lr': args.lr_multi * args.base_lr
+    }, {
+        'params': head,
+        'initial_lr': args.base_lr
+    }]
+    return model, param_list
+def build_clip(args):
+    model = Clip_model(args)
+    backbone = []
+    head = []
+    for k, v in model.named_parameters():
+        if k.startswith('backbone') and 'positional_embedding' not in k:
+            backbone.append(v)
+        else:
+            head.append(v)
+    logger.info('Backbone with decay={}, Head={}'.format(len(backbone), len(head)))
+    param_list = [{
+        'params': backbone,
+        'initial_lr': args.lr_multi * args.base_lr
+    }, {
+        'params': head,
+        'initial_lr': args.base_lr
+    }]
+    return model, param_list

cisen/model/__pycache__/__init__.cpython-38.pyc ADDED Viewed

Binary file (695 Bytes). View file

cisen/model/__pycache__/clip.cpython-38.pyc ADDED Viewed

Binary file (16.7 kB). View file

cisen/model/__pycache__/layers.cpython-38.pyc ADDED Viewed

Binary file (9.07 kB). View file

cisen/model/__pycache__/segmenter.cpython-38.pyc ADDED Viewed

Binary file (1.66 kB). View file

cisen/model/builder.py ADDED Viewed

	@@ -0,0 +1,25 @@

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (c) 2022, Huawei Technologies Co., Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from mmcv import Registry
+from mmcv import build_from_cfg
+MODELS = Registry('model')
+def build_model(config):
+    return build_from_cfg(config, MODELS)

cisen/model/clip.py ADDED Viewed

	@@ -0,0 +1,1207 @@

+from collections import OrderedDict
+from typing import Tuple, Union
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+from ..utils.dataset import tokenize
+from ..utils.simple_tokenizer import SimpleTokenizer as _Tokenizer
+_tokenizer = _Tokenizer()
+class Bottleneck(nn.Module):
+    expansion = 4
+    def __init__(self, inplanes, planes, stride=1):
+        super().__init__()
+        # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
+        self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()
+        self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
+        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = None
+        self.stride = stride
+        if stride > 1 or inplanes != planes * Bottleneck.expansion:
+            # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
+            self.downsample = nn.Sequential(
+                OrderedDict([("-1", nn.AvgPool2d(stride)),
+                             ("0",
+                              nn.Conv2d(inplanes,
+                                        planes * self.expansion,
+                                        1,
+                                        stride=1,
+                                        bias=False)),
+                             ("1", nn.BatchNorm2d(planes * self.expansion))]))
+    def forward(self, x: torch.Tensor):
+        identity = x
+        out = self.relu(self.bn1(self.conv1(x)))
+        out = self.relu(self.bn2(self.conv2(out)))
+        out = self.avgpool(out)
+        out = self.bn3(self.conv3(out))
+        if self.downsample is not None:
+            identity = self.downsample(x)
+        out += identity
+        out = self.relu(out)
+        return out
+"""
+    attenpool used in CRIS (output: C1/C2/C3  3 deiffent feature maps)
+"""
+class ModifiedAttentionPool2d(nn.Module):
+    def __init__(self,
+                 spacial_dim: int,
+                 embed_dim: int,
+                 num_heads: int,
+                 output_dim: int = None):
+        super().__init__()
+        self.spacial_dim = spacial_dim
+        self.positional_embedding = nn.Parameter(
+            torch.randn(spacial_dim**2 + 1, embed_dim) / embed_dim**0.5)
+        self.k_proj = nn.Linear(embed_dim, embed_dim)
+        self.q_proj = nn.Linear(embed_dim, embed_dim)
+        self.v_proj = nn.Linear(embed_dim, embed_dim)
+        self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
+        self.num_heads = num_heads
+        # residual
+        self.connect = nn.Sequential(
+            nn.Conv2d(embed_dim, output_dim, 1, stride=1, bias=False),
+            nn.BatchNorm2d(output_dim))
+    def resize_pos_embed(self, pos_embed, input_shpae):
+        """Resize pos_embed weights.
+        Resize pos_embed using bicubic interpolate method.
+        Args:
+            pos_embed (torch.Tensor): Position embedding weights.
+            input_shpae (tuple): Tuple for (downsampled input image height,
+                downsampled input image width).
+            pos_shape (tuple): The resolution of downsampled origin training
+                image.
+            mode (str): Algorithm used for upsampling:
+                ``'nearest'`` | ``'linear'`` | ``'bilinear'`` | ``'bicubic'`` |
+                ``'trilinear'``. Default: ``'nearest'``
+        Return:
+            torch.Tensor: The resized pos_embed of shape [B, C, L_new]
+        """
+        assert pos_embed.ndim == 3, 'shape of pos_embed must be [B, L, C]'
+        pos_h = pos_w = self.spacial_dim
+        cls_token_weight = pos_embed[:, 0]
+        pos_embed_weight = pos_embed[:, (-1 * pos_h * pos_w):]
+        pos_embed_weight = pos_embed_weight.reshape(
+            1, pos_h, pos_w, pos_embed.shape[2]).permute(0, 3, 1, 2)
+        pos_embed_weight = F.interpolate(pos_embed_weight,
+                                         size=input_shpae,
+                                         align_corners=False,
+                                         mode='bicubic')
+        cls_token_weight = cls_token_weight.unsqueeze(1)
+        pos_embed_weight = torch.flatten(pos_embed_weight, 2).transpose(1, 2)
+        # pos_embed = torch.cat((cls_token_weight, pos_embed_weight), dim=1)
+        return pos_embed_weight.transpose(-2, -1)
+    def forward(self, x):
+        B, C, H, W = x.size()
+        res = self.connect(x)
+        x = x.reshape(B, C, -1)  # NC(HW)
+        # x = torch.cat([x.mean(dim=-1, keepdim=True), x], dim=-1)  # NC(1+HW)
+        pos_embed = self.positional_embedding.unsqueeze(0)
+        pos_embed = self.resize_pos_embed(pos_embed, (H, W))  # NC(HW)
+        x = x + pos_embed.to(x.dtype)  # NC(HW)
+        x = x.permute(2, 0, 1)  # (HW)NC
+        x, _ = F.multi_head_attention_forward(
+            query=x,
+            key=x,
+            value=x,
+            embed_dim_to_check=x.shape[-1],
+            num_heads=self.num_heads,
+            q_proj_weight=self.q_proj.weight,
+            k_proj_weight=self.k_proj.weight,
+            v_proj_weight=self.v_proj.weight,
+            in_proj_weight=None,
+            in_proj_bias=torch.cat(
+                [self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
+            bias_k=None,
+            bias_v=None,
+            add_zero_attn=False,
+            dropout_p=0,
+            out_proj_weight=self.c_proj.weight,
+            out_proj_bias=self.c_proj.bias,
+            use_separate_proj_weight=True,
+            training=self.training,
+            need_weights=False)
+        xt = x[0]
+        x = x.permute(1, 2, 0).reshape(B, -1, H, W)
+        x = x + res
+        x = F.relu(x, True)
+        return x, xt
+"""
+    attenpool used in Clip (output: a tensor (b, dim) image encoding)
+"""
+class AttentionPool2d(nn.Module):
+    def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
+        super().__init__()
+        self.positional_embedding = nn.Parameter(torch.randn(spacial_dim ** 2 + 1, embed_dim) / embed_dim ** 0.5)
+        self.k_proj = nn.Linear(embed_dim, embed_dim)
+        self.q_proj = nn.Linear(embed_dim, embed_dim)
+        self.v_proj = nn.Linear(embed_dim, embed_dim)
+        self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
+        self.num_heads = num_heads
+    def forward(self, x):
+        x = x.flatten(start_dim=2).permute(2, 0, 1)  # NCHW -> (HW)NC
+        x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0)  # (HW+1)NC
+        x = x + self.positional_embedding[:, None, :].to(x.dtype)  # (HW+1)NC
+        x, _ = F.multi_head_attention_forward(
+            query=x[:1], key=x, value=x,
+            embed_dim_to_check=x.shape[-1],
+            num_heads=self.num_heads,
+            q_proj_weight=self.q_proj.weight,
+            k_proj_weight=self.k_proj.weight,
+            v_proj_weight=self.v_proj.weight,
+            in_proj_weight=None,
+            in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
+            bias_k=None,
+            bias_v=None,
+            add_zero_attn=False,
+            dropout_p=0,
+            out_proj_weight=self.c_proj.weight,
+            out_proj_bias=self.c_proj.bias,
+            use_separate_proj_weight=True,
+            training=self.training,
+            need_weights=False
+        )
+        return x.squeeze(0)
+class ModifiedResNet(nn.Module):
+    """
+    A ResNet class that is similar to torchvision's but contains the following changes:
+    - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
+    - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
+    - The final pooling layer is a QKV attention instead of an average pool
+    """
+    def __init__(self,
+                 layers,
+                 output_dim,
+                 heads,
+                 input_resolution=224,
+                 width=64):
+        super().__init__()
+        self.output_dim = output_dim
+        self.input_resolution = input_resolution
+        # the 3-layer stem
+        self.conv1 = nn.Conv2d(3,
+                               width // 2,
+                               kernel_size=3,
+                               stride=2,
+                               padding=1,
+                               bias=False)
+        self.bn1 = nn.BatchNorm2d(width // 2)
+        self.conv2 = nn.Conv2d(width // 2,
+                               width // 2,
+                               kernel_size=3,
+                               padding=1,
+                               bias=False)
+        self.bn2 = nn.BatchNorm2d(width // 2)
+        self.conv3 = nn.Conv2d(width // 2,
+                               width,
+                               kernel_size=3,
+                               padding=1,
+                               bias=False)
+        self.bn3 = nn.BatchNorm2d(width)
+        self.avgpool = nn.AvgPool2d(2)
+        self.relu = nn.ReLU(inplace=True)
+        # residual layers
+        self._inplanes = width  # this is a *mutable* variable used during construction
+        self.layer1 = self._make_layer(width, layers[0])
+        self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
+        self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
+        self.layer4 = self._make_layer(width * 8, layers[3], stride=2)
+        embed_dim = width * 32  # the ResNet feature dimension
+        self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim,
+                                        heads, output_dim)
+        # self.modifiedattnpool = ModifiedAttentionPool2d(input_resolution // 32, embed_dim,
+        #                                 heads, output_dim)
+    def _make_layer(self, planes, blocks, stride=1):
+        layers = [Bottleneck(self._inplanes, planes, stride)]
+        self._inplanes = planes * Bottleneck.expansion
+        for _ in range(1, blocks):
+            layers.append(Bottleneck(self._inplanes, planes))
+        return nn.Sequential(*layers)
+    def forward(self, x):
+        def stem(x):
+            for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2),
+                             (self.conv3, self.bn3)]:
+                x = self.relu(bn(conv(x)))
+            x = self.avgpool(x)
+            return x
+        x = x.type(self.conv1.weight.dtype)
+        x = stem(x)
+        x = self.layer1(x)
+        x2 = self.layer2(x)
+        x3 = self.layer3(x2)
+        x4 = self.layer4(x3)
+        x5 = self.attnpool(x4)
+        # x4 = self.modifiedattnpool(x4)
+        return (x2, x3, x4), x5
+class LayerNorm(nn.LayerNorm):
+    """Subclass torch's LayerNorm to handle fp16."""
+    def forward(self, x: torch.Tensor):
+        orig_type = x.dtype
+        ret = super().forward(x.type(torch.float32))
+        return ret.type(orig_type)
+class QuickGELU(nn.Module):
+    def forward(self, x: torch.Tensor):
+        return x * torch.sigmoid(1.702 * x)
+class ResidualAttentionBlock(nn.Module):
+    def __init__(self,
+                 d_model: int,
+                 n_head: int,
+                 attn_mask: torch.Tensor = None):
+        super().__init__()
+        # print(n_head)
+        self.attn = nn.MultiheadAttention(d_model, n_head)
+        self.ln_1 = LayerNorm(d_model)
+        self.mlp = nn.Sequential(
+            OrderedDict([("c_fc", nn.Linear(d_model, d_model * 4)),
+                         ("gelu", QuickGELU()),
+                         ("c_proj", nn.Linear(d_model * 4, d_model))]))
+        self.ln_2 = LayerNorm(d_model)
+        self.attn_mask = attn_mask
+    def attention(self, x: torch.Tensor):
+        self.attn_mask = self.attn_mask.to(
+            dtype=x.dtype,
+            device=x.device) if self.attn_mask is not None else None
+        res = self.attn(x, x, x, need_weights=False,
+                         attn_mask=self.attn_mask)[0]
+        # print(res)
+        return res
+    def forward(self, x: torch.Tensor):
+        # a = self.attention(self.ln_1(x))
+        x = x + self.attention(self.ln_1(x))
+        x = x + self.mlp(self.ln_2(x))
+        return x
+class Transformer(nn.Module):
+    def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None):
+        super().__init__()
+        self.width = width
+        self.layers = layers
+        self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)])
+    def forward(self, x: torch.Tensor):
+        return self.resblocks(x)
+class ViTTransformer(nn.Module):
+    def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None):
+        super().__init__()
+        self.width = width
+        self.layers = layers
+        self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)])
+    def forward(self, x: torch.Tensor):
+        outputs = []
+        i = 1
+        for block in self.resblocks:
+            x = block(x)
+            if i > 7:
+                outputs.append(x)
+            i = i + 1
+        return outputs
+class VisionTransformer(nn.Module):
+    def __init__(self, input_resolution: int, patch_size: int, width: int,
+                 layers: int, heads: int, output_dim: int):
+        super().__init__()
+        self.input_resolution = input_resolution
+        self.output_dim = output_dim
+        self.conv1 = nn.Conv2d(in_channels=3,
+                               out_channels=width,
+                               kernel_size=patch_size,
+                               stride=patch_size,
+                               bias=False)
+        scale = width ** -0.5
+        self.class_embedding = nn.Parameter(scale * torch.randn(width))
+        self.positional_embedding = nn.Parameter(scale * torch.randn(
+            (input_resolution // patch_size) ** 2 + 1, width))
+        self.ln_pre = LayerNorm(width)
+        self.transformer = ViTTransformer(width, layers, heads)
+        self.ln_post = LayerNorm(width)
+        self.proj = nn.Parameter(scale * torch.randn(width, output_dim))
+    def forward(self, x: torch.Tensor):
+        # input: batch, 3, 224, 224
+        # batch, 1024, 16, 16
+        x = self.conv1(x)  # shape = [*, width, grid, grid]
+        # batch, 1024, 256
+        x = x.reshape(x.shape[0], x.shape[1],
+                      -1)  # shape = [*, width, grid ** 2]
+        # batch, 256, 1024
+        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
+        # batch, 257, 1024
+        x = torch.cat([
+            self.class_embedding.to(x.dtype) + torch.zeros(
+                x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x
+        ],
+            dim=1)  # shape = [*, grid ** 2 + 1, width]
+        x = x + self.positional_embedding.to(x.dtype)
+        x = self.ln_pre(x)
+        # 257, batch, 1024
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        out = self.transformer(x)
+        # batch, 257, 1024
+        x1, x2 ,x3, x4 = out[0], out[1], out[2], out[3]
+        x1 = x1.permute(1, 0, 2)
+        x2 = x2.permute(1, 0, 2)
+        x3 = x3.permute(1, 0, 2)
+        x4 = x4.permute(1, 0, 2)  # LND -> NLD
+        # 用于分类
+        x = self.ln_post(x4[:, 0, :])
+        #feature
+        # x_f = self.ln_post(x[:, 1:, :])
+        if self.proj is not None:
+            x = x @ self.proj
+        return (x1[:, 1:, :], x2[:, 1:, :], x3[:, 1:, :], x4[:, 1:, :]), x
+class ModifiedVisionTransformer(nn.Module):
+    """Two-branch vision transformer that patchifies the same image at two patch sizes.
+    Branch 0 uses ``patch_size`` patches with ``width`` channels; branch 1 uses
+    ``patch_size // 2`` patches with ``width // 2`` channels. Each branch owns its
+    class embedding, positional embedding, pre/post layer norms, transformer and
+    projection matrix. The ``*_layers`` lists are plain Python lists that alias the
+    attributes registered just before them, so ``forward`` can index by branch.
+    """
+    def __init__(self, input_resolution: int, patch_size: int, width: int,
+                 layers: int, heads: int, output_dim: int):
+        """Create both branches.
+        Args:
+            input_resolution: image side length used to size the positional embeddings.
+            patch_size: kernel/stride of the branch-0 patch embedding.
+            width: channel width of branch 0 (branch 1 uses ``width // 2``).
+            layers: layer count passed to both ``Transformer`` instances.
+            heads: head count passed to both ``Transformer`` instances.
+            output_dim: branch 1 projects to ``output_dim``, branch 0 to ``output_dim * 2``.
+        """
+        super().__init__()
+        self.input_resolution = input_resolution
+        self.output_dim = output_dim
+        # Branch 0 patch embedding.
+        self.conv1 = nn.Conv2d(in_channels=3,
+                               out_channels=width,
+                               kernel_size=patch_size,
+                               stride=patch_size,
+                               bias=False)
+        # Branch 1 patch embedding: half the patch size, half the channels.
+        self.conv2 = nn.Conv2d(in_channels=3,
+                               out_channels=width // 2,
+                               kernel_size=patch_size // 2,
+                               stride=patch_size // 2,
+                               bias=False)
+        # NOTE(review): conv3 is registered (so it appears in the state dict) but is
+        # not part of conv_layers and is not referenced by forward() in this class.
+        self.conv3 = nn.Conv2d(in_channels=3,
+                               out_channels=width,
+                               kernel_size=patch_size * 2,
+                               stride=patch_size * 2,
+                               bias=False)
+        self.conv_layers = [self.conv1, self.conv2]
+        # The same scale (derived from the full width) is used for both branches.
+        scale = width**-0.5
+        self.class_embedding1 = nn.Parameter(scale * torch.randn(width))
+        self.class_embedding2 = nn.Parameter(scale * torch.randn(width // 2))
+        self.cls_layers = [self.class_embedding1, self.class_embedding2]
+        # One position per patch plus one for the class token.
+        self.positional_embedding1 = nn.Parameter(scale * torch.randn(
+            (input_resolution // patch_size)**2 + 1, width))
+        self.positional_embedding2 = nn.Parameter(scale * torch.randn(
+            (input_resolution // (patch_size // 2)) ** 2 + 1, width // 2))
+        self.pos_layers = [self.positional_embedding1, self.positional_embedding2]
+        self.ln_pre1 = LayerNorm(width)
+        self.ln_pre2 = LayerNorm(width // 2)
+        self.pre_layers = [self.ln_pre1, self.ln_pre2]
+        self.transformer1 = Transformer(width, layers, heads)
+        self.transformer2 = Transformer(width // 2, layers, heads)
+        self.tran_layers = [self.transformer1, self.transformer2]
+        self.ln_post1 = LayerNorm(width)
+        self.ln_post2 = LayerNorm(width // 2)
+        self.post_layers = [self.ln_post1, self.ln_post2]
+        self.proj1 = nn.Parameter(scale * torch.randn(width, output_dim * 2))
+        self.proj2 = nn.Parameter(scale * torch.randn(width // 2, output_dim))
+        self.proj_layers = [self.proj1, self.proj2]
+    def forward(self, x: torch.Tensor):
+        """Run both branches on the same input image batch.
+        Args:
+            x: image batch fed to both patch-embedding convolutions.
+        Returns:
+            ``(out, f, cl)``, three lists with one entry per branch: ``out`` holds the
+            projected patch tokens (batch, tokens, dim), ``f`` the same tokens reshaped
+            to a 4-D feature map, and ``cl`` the projected second output of the
+            branch transformer.
+        """
+        # Keep the original image; `x` is overwritten inside the loop.
+        input = x
+        out = []
+        f = []
+        cl = []
+        for i in range(2):
+            x = self.conv_layers[i](input)  # shape = [*, width, grid, grid]
+            # NOTE: nn.Conv2d outputs (N, C, H, W), so `w` holds the height and `h`
+            # the width; the reshape below uses them in the same order, so the
+            # spatial layout is preserved.
+            b, c, w, h = x.shape
+            x = x.reshape(x.shape[0], x.shape[1],
+                          -1)  # shape = [*, width, grid ** 2]
+            x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
+            # Prepend the branch's class embedding, broadcast over the batch.
+            x = torch.cat([
+                self.cls_layers[i].to(x.dtype) + torch.zeros(
+                    x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x
+            ],
+                          dim=1)  # shape = [*, grid ** 2 + 1, width]
+            x = x + self.pos_layers[i].to(x.dtype)
+            x = self.pre_layers[i](x)
+            x = x.permute(1, 0, 2)  # NLD -> LND
+            # The transformer result is unpacked into two values here; `cls` is
+            # iterated below, so it is presumably a list of tensors -- TODO confirm
+            # against the Transformer definition.
+            x, cls = self.tran_layers[i](x)
+            x = x.permute(1, 0, 2)  # LND -> NLD
+            # for classification
+            # x = self.ln_post(x[:, 0, :])
+            # feature: drop the class token and normalise only the patch tokens
+            x = self.post_layers[i](x[:, 1:, :])
+            if self.proj_layers[i] is not None:
+                x = x @ self.proj_layers[i]
+                cls = [j @ self.proj_layers[i] for j in cls]
+            # Fold the token axis back into the conv grid: (b, tokens, dim) -> (b, dim, w, h).
+            feat = x.permute(0,2,1).reshape(b, x.shape[2] , w, h)
+            out.append(x)
+            f.append(feat)
+            cl.append(cls)
+        return out, f, cl
+"""
+    Long CLIP
+"""
+class LCLIP(nn.Module):
+    def __init__(self,
+                 embed_dim: int,
+                 # vision
+                 image_resolution: int,
+                 vision_layers: Union[Tuple[int, int, int, int], int],
+                 vision_width: int,
+                 vision_patch_size: int,
+                 # text
+                 context_length: int,
+                 vocab_size: int,
+                 transformer_width: int,
+                 transformer_heads: int,
+                 transformer_layers: int,
+                 load_from_clip: bool
+                 ):
+        super().__init__()
+        self.context_length = 248
+        if isinstance(vision_layers, (tuple, list)):
+            vision_heads = vision_width * 32 // 64
+            self.visual = ModifiedResNet(
+                layers=vision_layers,
+                output_dim=embed_dim,
+                heads=vision_heads,
+                input_resolution=image_resolution,
+                width=vision_width
+            )
+        else:
+            vision_heads = vision_width // 64
+            self.visual = VisionTransformer(
+                input_resolution=image_resolution,
+                patch_size=vision_patch_size,
+                width=vision_width,
+                layers=vision_layers,
+                heads=vision_heads,
+                output_dim=embed_dim
+            )
+        self.transformer = Transformer(
+            width=transformer_width,
+            layers=transformer_layers,
+            heads=transformer_heads,
+            attn_mask=self.build_attention_mask()
+        )
+        self.vocab_size = vocab_size
+        self.token_embedding = nn.Embedding(vocab_size, transformer_width)
+        # self.positional_embedding = nn.Parameter(torch.empty(248, transformer_width))
+        if load_from_clip == False:
+            self.positional_embedding = nn.Parameter(torch.empty(248, transformer_width))
+            self.positional_embedding_res = nn.Parameter(torch.empty(248, transformer_width))
+        else:
+            self.positional_embedding = nn.Parameter(torch.empty(248, transformer_width))
+        self.ln_final = LayerNorm(transformer_width)
+        self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim))
+        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
+        self.initialize_parameters()
+        self.mask1 = torch.zeros([248, 1])
+        self.mask1[:20, :] = 1
+        self.mask2 = torch.zeros([248, 1])
+        self.mask2[20:, :] = 1
+    def initialize_parameters(self):
+        nn.init.normal_(self.token_embedding.weight, std=0.02)
+        nn.init.normal_(self.positional_embedding, std=0.01)
+        if isinstance(self.visual, ModifiedResNet):
+            if self.visual.attnpool is not None:
+                std = self.visual.attnpool.c_proj.in_features ** -0.5
+                nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std)
+                nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std)
+                nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std)
+                nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std)
+            for resnet_block in [self.visual.layer1, self.visual.layer2, self.visual.layer3, self.visual.layer4]:
+                for name, param in resnet_block.named_parameters():
+                    if name.endswith("bn3.weight"):
+                        nn.init.zeros_(param)
+        proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5)
+        attn_std = self.transformer.width ** -0.5
+        fc_std = (2 * self.transformer.width) ** -0.5
+        for block in self.transformer.resblocks:
+            nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
+            nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
+            nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
+            nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)
+        if self.text_projection is not None:
+            nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5)
+    def build_attention_mask(self):
+        # lazily create causal attention mask, with full attention between the vision tokens
+        # pytorch uses additive attention mask; fill with -inf
+        mask = torch.empty(self.context_length, self.context_length)
+        mask.fill_(float("-inf"))
+        mask.triu_(1)  # zero out the lower diagonal
+        return mask
+    @property
+    def dtype(self):
+        return self.visual.conv1.weight.dtype
+    def encode_image(self, image):
+        return self.visual(image.type(self.dtype))
+    def encode_text(self, text):
+        x = self.token_embedding(text).type(self.dtype)  # [batch_size, n_ctx, d_model]
+        # x = x + (self.positional_embedding.to(x.device) * self.mask1.to(x.device)).type(self.dtype).to(x.device) + (self.positional_embedding_res.to(x.device) * self.mask2.to(x.device)).type(self.dtype).to(x.device)
+        x = x + (self.positional_embedding.to(x.device) * self.mask1.to(x.device)).type(self.dtype).to(x.device)
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.transformer(x)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+        x = self.ln_final(x).type(self.dtype)
+        # x.shape = [batch_size, n_ctx, transformer.width]
+        # take features from the eot embedding (eot_token is the highest number in each sequence)
+        x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection
+        return x
+    def encode_text_full(self, text):
+        x = self.token_embedding(text).type(self.dtype)  # [batch_size, n_ctx, d_model]
+        x = x + (self.positional_embedding.to(x.device) * self.mask1.to(x.device)).type(self.dtype).to(x.device) + (self.positional_embedding_res.to(x.device) * self.mask2.to(x.device)).type(self.dtype).to(x.device)
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.transformer(x)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+        x = self.ln_final(x).type(self.dtype)
+        # x.shape = [batch_size, n_ctx, transformer.width]
+        # take features from the eot embedding (eot_token is the highest number in each sequence)
+        #x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection
+        return x
+    def forward(self, image, text):
+        image_features = self.encode_image(image)
+        text_features, _ = self.encode_text(text)
+        # normalized features
+        image_features = image_features / image_features.norm(dim=1, keepdim=True)
+        text_features = text_features / text_features.norm(dim=1, keepdim=True)
+        # cosine similarity as logits
+        logit_scale = self.logit_scale.exp()
+        logits_per_image = logit_scale * image_features @ text_features.t()
+        logits_per_text = logits_per_image.t()
+        # shape = [global_batch_size, global_batch_size]
+        return logits_per_image, logits_per_text
+"""
+    original CLIP
+"""
+class CLIP(nn.Module):
+    def __init__(
+            self,
+            embed_dim: int,
+            # vision
+            image_resolution: int,
+            vision_layers: Union[Tuple[int, int, int, int], int],
+            vision_width: int,
+            vision_patch_size: int,
+            # text
+            context_length: int,
+            txt_length: int,
+            vocab_size: int,
+            transformer_width: int,
+            transformer_heads: int,
+            transformer_layers: int):
+        super().__init__()
+        self.context_length = context_length
+        if isinstance(vision_layers, (tuple, list)):
+            vision_heads = vision_width * 32 // 64
+            self.visual = ModifiedResNet(layers=vision_layers,
+                                         output_dim=embed_dim,
+                                         heads=vision_heads,
+                                         input_resolution=image_resolution,
+                                         width=vision_width)
+            # self.fq_attnpool = AttentionPool2d(image_resolution // 32, vision_width* 32,
+            #                                    vision_heads, embed_dim)
+        else:
+            vision_heads = vision_width // 64
+            self.visual = VisionTransformer(input_resolution=image_resolution,
+                                            patch_size=vision_patch_size,
+                                            width=vision_width,
+                                            layers=vision_layers,
+                                            heads=vision_heads,
+                                            output_dim=embed_dim)
+        self.transformer = Transformer(
+            width=transformer_width,
+            layers=transformer_layers,
+            heads=transformer_heads,
+            attn_mask=self.build_attention_mask(txt_length))
+        self.vocab_size = vocab_size
+        self.token_embedding = nn.Embedding(vocab_size, transformer_width)
+        self.positional_embedding = nn.Parameter(
+            torch.empty(self.context_length, transformer_width))
+        self.ln_final = LayerNorm(transformer_width)
+        self.text_projection = nn.Parameter(
+            torch.empty(transformer_width, embed_dim))
+        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
+        self.token_embedding.requires_grad_ = False
+        self.initialize_parameters()
+    def initialize_parameters(self):
+        nn.init.normal_(self.token_embedding.weight, std=0.02)
+        nn.init.normal_(self.positional_embedding, std=0.01)
+        if isinstance(self.visual, ModifiedResNet):
+            if self.visual.attnpool is not None:
+                std = self.visual.attnpool.c_proj.in_features**-0.5
+                nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std)
+                nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std)
+                nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std)
+                nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std)
+            for resnet_block in [
+                    self.visual.layer1, self.visual.layer2, self.visual.layer3,
+                    self.visual.layer4
+            ]:
+                for name, param in resnet_block.named_parameters():
+                    if name.endswith("bn3.weight"):
+                        nn.init.zeros_(param)
+        proj_std = (self.transformer.width**-0.5) * (
+            (2 * self.transformer.layers)**-0.5)
+        attn_std = self.transformer.width**-0.5
+        fc_std = (2 * self.transformer.width)**-0.5
+        for block in self.transformer.resblocks:
+            nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
+            nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
+            nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
+            nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)
+        if self.text_projection is not None:
+            nn.init.normal_(self.text_projection,
+                            std=self.transformer.width**-0.5)
+    def build_attention_mask(self, context_length):
+        # lazily create causal attention mask, with full attention between the vision tokens
+        # pytorch uses additive attention mask; fill with -inf
+        mask = torch.empty(context_length, context_length)
+        mask.fill_(float("-inf"))
+        mask.triu_(1)  # zero out the lower diagonal
+        return mask
+    @property
+    def dtype(self):
+        return self.visual.conv1.weight.dtype
+    def encode_image(self, image):
+        return self.visual(image.type(self.dtype))
+    def encode_fq(self, image):
+        return self.fq_attnpool(image.type(self.dtype))
+    def encode_text(self, text):
+        a = self.token_embedding
+        x = self.token_embedding(text).type(
+            self.dtype)  # [batch_size, n_ctx, d_model]
+        x = x + self.positional_embedding.type(self.dtype)[:x.size(1)]
+        # print(x.shape)
+        # print(x)
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.transformer(x)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+        x = self.ln_final(x).type(self.dtype)
+        # print(text[0])
+        # x.shape = [batch_size, n_ctx, transformer.width]
+        # take features from the eot embedding (eot_token is the highest number in each sequence)
+        state = x[torch.arange(x.shape[0]),
+                  text.argmax(dim=-1)] @ self.text_projection
+        # x = x @ self.text_projection
+        # state = x[torch.arange(x.shape[0]), text.argmax(dim=-1)]
+        return x, state
+    def forward(self, image, text):
+        image_features = self.encode_image(image)
+        text_features = self.encode_text(text)
+        # normalized features
+        image_features = image_features / image_features.norm(dim=-1,
+                                                              keepdim=True)
+        text_features = text_features / text_features.norm(dim=-1,
+                                                           keepdim=True)
+        # cosine similarity as logits
+        logit_scale = self.logit_scale.exp()
+        logits_per_image = logit_scale * image_features @ text_features.t()
+        logits_per_text = logits_per_image.t()
+        # shape = [global_batch_size, global_batch_size]
+        return logits_per_image, logits_per_text
+"""
+    modified CLIP : without text encoder
+"""
+class zhCLIP(nn.Module):
+    """CLIP variant that owns only a vision tower (no text encoder).
+    The tower is a ``ModifiedResNet`` (plus an extra ``fq_attnpool``) when
+    ``vision_layers`` is a tuple/list, and a ``ModifiedVisionTransformer`` otherwise.
+    """
+    def __init__(self,
+            embed_dim,
+            # vision
+            image_resolution: int,
+            vision_layers: Union[Tuple[int, int, int, int], int],
+            vision_width: int,
+            vision_patch_size: int):
+        """Build the vision tower and the logit scale.
+        Args:
+            embed_dim: output dimension of the vision tower / attention pool.
+            image_resolution: input resolution handed to the vision tower.
+            vision_layers: tuple/list selects the ResNet tower, an int the ViT tower.
+            vision_width: base width of the vision tower.
+            vision_patch_size: ViT patch size (only used by the ViT tower).
+        """
+        super().__init__()
+        if isinstance(vision_layers, (tuple, list)):
+            vision_heads = vision_width * 32 // 64
+            self.visual = ModifiedResNet(layers=vision_layers,
+                                         output_dim=embed_dim,
+                                         heads=vision_heads,
+                                         input_resolution=image_resolution,
+                                         width=vision_width)
+            # Extra attention pool, only created for the ResNet tower.
+            self.fq_attnpool = AttentionPool2d(image_resolution // 32, vision_width* 32,
+                                               vision_heads, embed_dim)
+        else:
+            vision_heads = vision_width // 64
+            self.visual = ModifiedVisionTransformer(input_resolution=image_resolution,
+                                            patch_size=vision_patch_size,
+                                            width=vision_width,
+                                            layers=vision_layers,
+                                            heads=vision_heads,
+                                            output_dim=embed_dim)
+        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
+        self.initialize_parameters()
+    def initialize_parameters(self):
+        """Initialise the ResNet attention pool and zero the ``bn3`` weights; no-op for the ViT tower."""
+        if isinstance(self.visual, ModifiedResNet):
+            if self.visual.attnpool is not None:
+                std = self.visual.attnpool.c_proj.in_features**-0.5
+                nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std)
+                nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std)
+                nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std)
+                nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std)
+            for resnet_block in [
+                    self.visual.layer1, self.visual.layer2, self.visual.layer3,
+                    self.visual.layer4
+            ]:
+                for name, param in resnet_block.named_parameters():
+                    if name.endswith("bn3.weight"):
+                        nn.init.zeros_(param)
+    def build_attention_mask(self, context_length):
+        """Return the additive causal mask of side ``context_length`` (not used inside this class)."""
+        # pytorch uses additive attention mask; fill with -inf
+        mask = torch.empty(context_length, context_length)
+        mask.fill_(float("-inf"))
+        mask.triu_(1)  # zero out the diagonal and everything below it
+        return mask
+    @property
+    def dtype(self):
+        """Dtype of the vision tower's first convolution, used as the model dtype."""
+        return self.visual.conv1.weight.dtype
+    def encode_image(self, image):
+        """Cast ``image`` to the model dtype and return the vision tower's output."""
+        return self.visual(image.type(self.dtype))
+    def encode_fq(self, image):
+        """Apply ``self.fq_attnpool`` to ``image``.
+        NOTE(review): ``fq_attnpool`` only exists when the ResNet tower was
+        selected; with the ViT tower this raises AttributeError.
+        """
+        return self.fq_attnpool(image.type(self.dtype))
+    def forward(self, image, text):
+        """Return ``(logits_per_image, logits_per_text)`` scaled cosine similarities.
+        NOTE(review): this calls ``self.encode_text``, which this class does not
+        define, so the method raises AttributeError as written unless a subclass
+        or caller supplies it -- confirm the intended text path.
+        """
+        image_features = self.encode_image(image)
+        text_features = self.encode_text(text)
+        # normalized features
+        image_features = image_features / image_features.norm(dim=-1,
+                                                              keepdim=True)
+        text_features = text_features / text_features.norm(dim=-1,
+                                                           keepdim=True)
+        # cosine similarity as logits
+        logit_scale = self.logit_scale.exp()
+        logits_per_image = logit_scale * image_features @ text_features.t()
+        logits_per_text = logits_per_image.t()
+        # shape = [global_batch_size, global_batch_size]
+        return logits_per_image, logits_per_text
+def convert_weights(model: nn.Module):
+    """Convert applicable model parameters to fp16"""
+    def _convert_weights_to_fp16(l):
+        if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
+            l.weight.data = l.weight.data.half()
+            if l.bias is not None:
+                l.bias.data = l.bias.data.half()
+        if isinstance(l, nn.MultiheadAttention):
+            for attr in [
+                    *[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]],
+                    "in_proj_bias", "bias_k", "bias_v"
+            ]:
+                tensor = getattr(l, attr)
+                if tensor is not None:
+                    tensor.data = tensor.data.half()
+        for name in ["text_projection", "proj"]:
+            if hasattr(l, name):
+                attr = getattr(l, name)
+                if attr is not None:
+                    attr.data = attr.data.half()
+    model.apply(_convert_weights_to_fp16)
+class PromptLearner(nn.Module):
+    def __init__(self, transformer_width, context_length, vocab_size,
+                 transformer_layers, transformer_heads, bert_embed_dim):
+        super().__init__()
+        self.transformer_width = transformer_width
+        self.context_length = context_length
+        self.vocab_size = vocab_size
+        self.token_embedding = nn.Embedding(self.vocab_size, self.transformer_width)
+        self.transformer = Transformer(
+            width=transformer_width,
+            layers=transformer_layers,
+            heads=transformer_heads,
+            attn_mask=self.build_attention_mask()
+        )
+        self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width))
+        self.ln_final = LayerNorm(transformer_width)
+        self.text_projection = nn.Parameter(torch.empty(transformer_width, bert_embed_dim))
+        # self.load_from_openai_model(pretrained_model=clip_pretrain)
+    def build_attention_mask(self):
+        # lazily create causal attention mask, with full attention between the vision tokens
+        # pytorch uses additive attention mask; fill with -inf
+        mask = torch.empty(self.context_length, self.context_length)
+        mask.fill_(float("-inf"))
+        mask.triu_(1)  # zero out the lower diagonal
+        return mask
+    def init_label_emb(self, labels_path):
+        label = open(labels_path, 'r').readlines()
+        # label81 = open(unseen_labels_path, 'r').readlines()
+        # label1006 = label925 + label81
+        self.name_lens = [len(_tokenizer.encode(name)) for name in label]
+        self.label_token = torch.zeros((len(self.name_lens), self.context_length), dtype=torch.long)
+        for i, c in enumerate(label):
+            self.label_token[i] = tokenize(f"There is a {c.strip()} in the scene")
+        self.label_emb = torch.zeros((len(self.name_lens), max(self.name_lens), self.transformer_width))
+        for i, embed in enumerate(self.token_embedding(self.label_token)):
+            self.label_emb[i][:self.name_lens[i]] = embed[4:4 + self.name_lens[i]].clone().detach()
+    # def load_from_openai_model(self, pretrained_model):
+    #     state_dict = clip.load(pretrained_model, jit=False)[0].state_dict()
+    #     load_dict = {}
+    #     for k, v in state_dict.items():
+    #         if not k.startswith("visual") and (
+    #                 k not in ["logit_scale", "input_resolution", "context_length", "vocab_size"]):
+    #             load_dict[k] = v
+    #     msg = self.load_state_dict(load_dict)
+    def load_label_emb(self, label=None):
+        self.name_lens = [len(_tokenizer.encode(name.split("\t")[-1])) for name in label]
+        self.label_token = torch.zeros((len(self.name_lens), self.context_length), dtype=torch.long).cuda()
+        for i, c in enumerate(label):
+            name = c.split("\t")[-1]
+            self.label_token[i] = tokenize(f"There is a {name.strip()} in the scene")
+        self.label_emb = torch.zeros((len(self.name_lens), max(self.name_lens), self.transformer_width)).cuda()
+        for i, embed in enumerate(self.token_embedding(self.label_token)):
+            self.label_emb[i][:self.name_lens[i]] = embed[4:4 + self.name_lens[i]].clone().detach()
+    def forward(self, device):
+        label_embeds = self.token_embedding(self.label_token.to(device))
+        for i in range(label_embeds.shape[0]):
+            label_embeds[i, 4:4 + self.name_lens[i], :] = self.label_emb[i][:self.name_lens[i]]
+        x = label_embeds + self.positional_embedding
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.transformer(x)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+        x = self.ln_final(x)
+        res = x[torch.arange(x.shape[0]), self.label_token.argmax(dim=-1)] @ self.text_projection
+        return res
+def build_promptlearner(state_dict: dict):
+    embed_dim = state_dict["text_projection"].shape[1]
+    context_length = state_dict["positional_embedding"].shape[0]
+    vocab_size = state_dict["token_embedding.weight"].shape[0]
+    transformer_width = state_dict["ln_final.weight"].shape[0]
+    transformer_heads = transformer_width // 64
+    transformer_layers = len(
+        set(
+            k.split(".")[2] for k in state_dict
+            if k.startswith(f"transformer.resblocks")))
+    model = PromptLearner(transformer_width, context_length, vocab_size,
+                          transformer_layers, transformer_heads, embed_dim)
+    # model = PromptLearner(embed_dim, vision_patch_size, context_length, txt_length, vocab_size,
+    #              transformer_width, transformer_heads, transformer_layers)
+    load_dict = {}
+    for k, v in state_dict.items():
+        if not k.startswith("visual") and (
+                k not in ["logit_scale", "input_resolution", "context_length", "vocab_size"]):
+            load_dict[k] = v
+    convert_weights(model)
+    model.load_state_dict(load_dict, False)
+    return model
+def build_model(state_dict: dict, txt_length: int):
+    vit = "visual.proj" in state_dict
+    if vit:
+        vision_width = state_dict["visual.conv1.weight"].shape[0]
+        vision_layers = len([
+            k for k in state_dict.keys()
+            if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")
+        ])
+        vision_patch_size = state_dict["visual.conv1.weight"].shape[-1]
+        grid_size = round(
+            (state_dict["visual.positional_embedding"].shape[0] - 1)**0.5)
+        image_resolution = vision_patch_size * grid_size
+    else:
+        counts: list = [
+            len(
+                set(
+                    k.split(".")[2] for k in state_dict
+                    if k.startswith(f"visual.layer{b}")))
+            for b in [1, 2, 3, 4]
+        ]
+        vision_layers = tuple(counts)
+        vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0]
+        output_width = round(
+            (state_dict["visual.attnpool.positional_embedding"].shape[0] -
+             1)**0.5)
+        vision_patch_size = None
+        assert output_width**2 + 1 == state_dict[
+            "visual.attnpool.positional_embedding"].shape[0]
+        image_resolution = output_width * 32
+    vision_heads = vision_width * 32 // 64
+    embed_dim = state_dict["text_projection"].shape[1]
+    # context_length = state_dict["positional_embedding"].shape[0]
+    context_length = txt_length
+    vocab_size = state_dict["token_embedding.weight"].shape[0]
+    transformer_width = state_dict["ln_final.weight"].shape[0]
+    transformer_heads = transformer_width // 64
+    transformer_layers = len(
+        set(
+            k.split(".")[2] for k in state_dict
+            if k.startswith(f"transformer.resblocks")))
+    model = CLIP(embed_dim, image_resolution, vision_layers, vision_width,
+                 vision_patch_size, context_length, txt_length, vocab_size,
+                 transformer_width, transformer_heads, transformer_layers)
+    for key in ["input_resolution", "context_length", "vocab_size", 'positional_embedding']:
+        if key in state_dict:
+            del state_dict[key]
+    convert_weights(model)
+    model.load_state_dict(state_dict, False)
+    return model.eval(), image_resolution, vision_heads, embed_dim, vision_width, vision_patch_size
+def build_lclip_model(state_dict: dict, load_from_clip: bool):
+    vit = "visual.proj" in state_dict
+    if vit:
+        vision_width = state_dict["visual.conv1.weight"].shape[0]
+        vision_layers = len([k for k in state_dict.keys() if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")])
+        vision_patch_size = state_dict["visual.conv1.weight"].shape[-1]
+        grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5)
+        image_resolution = vision_patch_size * grid_size
+    else:
+        counts: list = [len(set(k.split(".")[2] for k in state_dict if k.startswith(f"visual.layer{b}"))) for b in [1, 2, 3, 4]]
+        vision_layers = tuple(counts)
+        vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0]
+        output_width = round((state_dict["visual.attnpool.positional_embedding"].shape[0] - 1) ** 0.5)
+        vision_patch_size = None
+        assert output_width ** 2 + 1 == state_dict["visual.attnpool.positional_embedding"].shape[0]
+        image_resolution = output_width * 32
+    embed_dim = state_dict["text_projection"].shape[1]
+    # print(embed_dim)
+    context_length = state_dict["positional_embedding"].shape[0]
+    vocab_size = state_dict["token_embedding.weight"].shape[0]
+    transformer_width = state_dict["ln_final.weight"].shape[0]
+    transformer_heads = transformer_width // 64
+    transformer_layers = len(set(k.split(".")[2] for k in state_dict if k.startswith("transformer.resblocks")))
+    model = LCLIP(
+        embed_dim,
+        image_resolution, vision_layers, vision_width, vision_patch_size,
+        context_length, vocab_size, transformer_width, transformer_heads, transformer_layers, load_from_clip
+    )
+    for key in ["input_resolution", "context_length", "vocab_size"]:
+        if key in state_dict:
+            del state_dict[key]
+    convert_weights(model)
+    # model.load_state_dict(state_dict)
+    model.load_state_dict(state_dict, strict=False)
+    vision_heads = vision_width // 64
+    # print(vision_heads)
+    return model.eval(), image_resolution, vision_heads, embed_dim, vision_width, vision_patch_size
+def build_modified_model(state_dict: dict, txt_length: int):
+    vit = "visual.proj" in state_dict
+    if vit:
+        vision_width = state_dict["visual.conv1.weight"].shape[0]
+        vision_layers = len([
+            k for k in state_dict.keys()
+            if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")
+        ])
+        vision_patch_size = state_dict["visual.conv1.weight"].shape[-1]
+        grid_size = round(
+            (state_dict["visual.positional_embedding"].shape[0] - 1)**0.5)
+        image_resolution = vision_patch_size * grid_size
+    else:
+        counts: list = [
+            len(
+                set(
+                    k.split(".")[2] for k in state_dict
+                    if k.startswith(f"visual.layer{b}")))
+            for b in [1, 2, 3, 4]
+        ]
+        vision_layers = tuple(counts)
+        vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0]
+        output_width = round(
+            (state_dict["visual.attnpool.positional_embedding"].shape[0] -
+             1)**0.5)
+        vision_patch_size = None
+        assert output_width**2 + 1 == state_dict[
+            "visual.attnpool.positional_embedding"].shape[0]
+        image_resolution = output_width * 32
+    embed_dim = state_dict["text_projection"].shape[1]
+    model = zhCLIP(embed_dim, image_resolution, vision_layers, vision_width,
+                 vision_patch_size)
+    for key in ["input_resolution", "context_length", "vocab_size"]:
+        if key in state_dict:
+            del state_dict[key]
+    convert_weights(model)
+    model.load_state_dict(state_dict, False)
+    return model.eval()

cisen/model/layers.py ADDED Viewed

	@@ -0,0 +1,633 @@

+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+# import open_clip
+def conv_layer(in_dim, out_dim, kernel_size=1, padding=0, stride=1):
+    return nn.Sequential(
+        nn.Conv2d(in_dim, out_dim, kernel_size, stride, padding, bias=False),
+        nn.BatchNorm2d(out_dim), nn.ReLU(True))
+    # return nn.Sequential(
+    #     nn.Conv2d(in_dim, out_dim, kernel_size, stride, padding, bias=False),
+    #     nn.LayerNorm(out_dim), nn.ReLU(True))
+# def conv_layer_1(in_dim, out_dim, kernel_size=1, padding=0, stride=1):
+#     return nn.Sequential(
+#         nn.Conv2d(in_dim, out_dim, kernel_size, stride, padding, bias=False),
+#         nn.LayerNorm(out_dim), nn.ReLU(True))
+def linear_layer(in_dim, out_dim,bias=False):
+    return nn.Sequential(nn.Linear(in_dim, out_dim, bias),
+                         nn.BatchNorm1d(out_dim), nn.ReLU(True))
+    # return nn.Sequential(nn.Linear(in_dim, out_dim, bias),
+    #                      nn.LayerNorm(out_dim), nn.ReLU(True))
+class AttentionPool2d(nn.Module):
+    def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
+        super().__init__()
+        self.positional_embedding = nn.Parameter(torch.randn(spacial_dim ** 2 + 1, embed_dim) / embed_dim ** 0.5)
+        self.k_proj = nn.Linear(embed_dim, embed_dim)
+        self.q_proj = nn.Linear(embed_dim, embed_dim)
+        self.v_proj = nn.Linear(embed_dim, embed_dim)
+        self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
+        self.num_heads = num_heads
+    def forward(self, x):
+        x = x.flatten(start_dim=2).permute(2, 0, 1)  # NCHW -> (HW)NC
+        x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0)  # (HW+1)NC
+        x = x + self.positional_embedding[:, None, :].to(x.dtype)  # (HW+1)NC
+        x, _ = F.multi_head_attention_forward(
+            query=x[:1], key=x, value=x,
+            embed_dim_to_check=x.shape[-1],
+            num_heads=self.num_heads,
+            q_proj_weight=self.q_proj.weight,
+            k_proj_weight=self.k_proj.weight,
+            v_proj_weight=self.v_proj.weight,
+            in_proj_weight=None,
+            in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
+            bias_k=None,
+            bias_v=None,
+            add_zero_attn=False,
+            dropout_p=0,
+            out_proj_weight=self.c_proj.weight,
+            out_proj_bias=self.c_proj.bias,
+            use_separate_proj_weight=True,
+            training=self.training,
+            need_weights=False
+        )
+        return x.squeeze(0)
+# class AttentionPool2d(nn.Module):
+#     def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
+#         super().__init__()
+#         self.positional_embedding = nn.Parameter(torch.randn(spacial_dim ** 2 + 1, embed_dim) / embed_dim ** 0.5)
+#         self.k_proj = nn.Linear(embed_dim, embed_dim)
+#         self.q_proj = nn.Linear(embed_dim, embed_dim)
+#         self.v_proj = nn.Linear(embed_dim, embed_dim)
+#         self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
+#         self.num_heads = num_heads
+#
+#     def forward(self, x):
+#         x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3]).permute(2, 0, 1)  # NCHW -> (HW)NC
+#         x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0)  # (HW+1)NC
+#         x = x + self.positional_embedding[:, None, :].to(x.dtype)  # (HW+1)NC
+#         x, _ = F.multi_head_attention_forward(
+#             query=x, key=x, value=x,
+#             embed_dim_to_check=x.shape[-1],
+#             num_heads=self.num_heads,
+#             q_proj_weight=self.q_proj.weight,
+#             k_proj_weight=self.k_proj.weight,
+#             v_proj_weight=self.v_proj.weight,
+#             in_proj_weight=None,
+#             in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
+#             bias_k=None,
+#             bias_v=None,
+#             add_zero_attn=False,
+#             dropout_p=0,
+#             out_proj_weight=self.c_proj.weight,
+#             out_proj_bias=self.c_proj.bias,
+#             use_separate_proj_weight=True,
+#             training=self.training,
+#             need_weights=False
+#         )
+#
+#         return x[0]
+class CoordConv(nn.Module):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size=3,
+                 padding=1,
+                 stride=1):
+        super().__init__()
+        self.conv1 = conv_layer(in_channels + 2, out_channels, kernel_size,
+                                padding, stride)
+    def add_coord(self, input):
+        b, _, h, w = input.size()
+        x_range = torch.linspace(-1, 1, w, device=input.device)
+        y_range = torch.linspace(-1, 1, h, device=input.device)
+        y, x = torch.meshgrid(y_range, x_range)
+        y = y.expand([b, 1, -1, -1])
+        x = x.expand([b, 1, -1, -1])
+        coord_feat = torch.cat([x, y], 1)
+        input = torch.cat([input, coord_feat], 1)
+        return input
+    def forward(self, x):
+        x = self.add_coord(x)
+        x = self.conv1(x)
+        return x
+class TransformerDecoder(nn.Module):
+    def __init__(self,
+                 num_layers,
+                 d_model,
+                 nhead,
+                 dim_ffn,
+                 dropout,
+                 return_intermediate=False):
+        super().__init__()
+        self.layers = nn.ModuleList([
+            TransformerDecoderLayer(d_model=d_model,
+                                    nhead=nhead,
+                                    dim_feedforward=dim_ffn,
+                                    dropout=dropout) for _ in range(num_layers)
+        ])
+        self.num_layers = num_layers
+        self.norm = nn.LayerNorm(d_model)
+        self.return_intermediate = return_intermediate
+    @staticmethod
+    def pos1d(d_model, length):
+        """
+        :param d_model: dimension of the model
+        :param length: length of positions
+        :return: length*d_model position matrix
+        """
+        if d_model % 2 != 0:
+            raise ValueError("Cannot use sin/cos positional encoding with "
+                             "odd dim (got dim={:d})".format(d_model))
+        pe = torch.zeros(length, d_model)
+        position = torch.arange(0, length).unsqueeze(1)
+        div_term = torch.exp((torch.arange(0, d_model, 2, dtype=torch.float) *
+                              -(math.log(10000.0) / d_model)))
+        pe[:, 0::2] = torch.sin(position.float() * div_term)
+        pe[:, 1::2] = torch.cos(position.float() * div_term)
+        return pe.unsqueeze(1)  # n, 1, 512
+    @staticmethod
+    def pos2d(d_model, height, width):
+        """
+        :param d_model: dimension of the model
+        :param height: height of the positions
+        :param width: width of the positions
+        :return: d_model*height*width position matrix
+        """
+        if d_model % 4 != 0:
+            raise ValueError("Cannot use sin/cos positional encoding with "
+                             "odd dimension (got dim={:d})".format(d_model))
+        pe = torch.zeros(d_model, height, width)
+        # Each dimension use half of d_model
+        d_model = int(d_model / 2)
+        div_term = torch.exp(
+            torch.arange(0., d_model, 2) * -(math.log(10000.0) / d_model))
+        pos_w = torch.arange(0., width).unsqueeze(1)
+        pos_h = torch.arange(0., height).unsqueeze(1)
+        pe[0:d_model:2, :, :] = torch.sin(pos_w * div_term).transpose(
+            0, 1).unsqueeze(1).repeat(1, height, 1)
+        pe[1:d_model:2, :, :] = torch.cos(pos_w * div_term).transpose(
+            0, 1).unsqueeze(1).repeat(1, height, 1)
+        pe[d_model::2, :, :] = torch.sin(pos_h * div_term).transpose(
+            0, 1).unsqueeze(2).repeat(1, 1, width)
+        pe[d_model + 1::2, :, :] = torch.cos(pos_h * div_term).transpose(
+            0, 1).unsqueeze(2).repeat(1, 1, width)
+        return pe.reshape(-1, 1, height * width).permute(2, 1, 0)  # hw, 1, 512
+    def forward(self, vis, txt, pad_mask):
+        '''
+            vis: b, 512, h, w
+            txt: b, L, 512
+            pad_mask: b, L
+        '''
+        B, C, H, W = vis.size()
+        _, L, D = txt.size()
+        # position encoding
+        vis_pos = self.pos2d(C, H, W)
+        txt_pos = self.pos1d(D, L)
+        # reshape & permute
+        vis = vis.reshape(B, C, -1).permute(2, 0, 1)
+        txt = txt.permute(1, 0, 2)
+        # forward
+        output = vis
+        intermediate = []
+        for layer in self.layers:
+            output = layer(output, txt, vis_pos, txt_pos, pad_mask)
+            if self.return_intermediate:
+                # HW, b, 512 -> b, 512, HW
+                intermediate.append(self.norm(output).permute(1, 2, 0))
+        if self.norm is not None:
+            # HW, b, 512 -> b, 512, HW
+            output = self.norm(output).permute(1, 2, 0)
+            if self.return_intermediate:
+                intermediate.pop()
+                intermediate.append(output)
+                # [output1, output2, ..., output_n]
+                return intermediate
+            else:
+                # b, 512, HW
+                return output
+        return output
+class TransformerDecoderLayer(nn.Module):
+    def __init__(self,
+                 d_model=512,
+                 nhead=9,
+                 dim_feedforward=2048,
+                 dropout=0.1):
+        super().__init__()
+        # Normalization Layer
+        self.self_attn_norm = nn.LayerNorm(d_model)
+        self.cross_attn_norm = nn.LayerNorm(d_model)
+        # Attention Layer
+        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
+        self.multihead_attn = nn.MultiheadAttention(d_model,
+                                                    nhead,
+                                                    dropout=dropout,
+                                                    kdim=d_model,
+                                                    vdim=d_model)
+        # FFN
+        self.ffn = nn.Sequential(nn.Linear(d_model, dim_feedforward),
+                                 nn.ReLU(True), nn.Dropout(dropout),
+                                 nn.LayerNorm(dim_feedforward),
+                                 nn.Linear(dim_feedforward, d_model))
+        # LayerNorm & Dropout
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.norm3 = nn.LayerNorm(d_model)
+        self.dropout1 = nn.Dropout(dropout)
+        self.dropout2 = nn.Dropout(dropout)
+        self.dropout3 = nn.Dropout(dropout)
+    def with_pos_embed(self, tensor, pos):
+        return tensor if pos is None else tensor + pos.to(tensor.device)
+    def forward(self, vis, txt, vis_pos, txt_pos, pad_mask):
+        '''
+            vis: 26*26, b, 512
+            txt: L, b, 512
+            vis_pos: 26*26, 1, 512
+            txt_pos: L, 1, 512
+            pad_mask: b, L
+        '''
+        # Self-Attention
+        vis2 = self.norm1(vis)
+        q = k = self.with_pos_embed(vis2, vis_pos)
+        vis2 = self.self_attn(q, k, value=vis2)[0]
+        vis2 = self.self_attn_norm(vis2)
+        vis = vis + self.dropout1(vis2)
+        # Cross-Attention
+        vis2 = self.norm2(vis)
+        vis2 = self.multihead_attn(query=self.with_pos_embed(vis2, vis_pos),
+                                   key=self.with_pos_embed(txt, txt_pos),
+                                   value=txt,
+                                   key_padding_mask=pad_mask)[0]
+        vis2 = self.cross_attn_norm(vis2)
+        vis = vis + self.dropout2(vis2)
+        # FFN
+        vis2 = self.norm3(vis)
+        vis2 = self.ffn(vis2)
+        vis = vis + self.dropout3(vis2)
+        return vis
+class Text_Projector(nn.Module):
+    def __init__(self, args, in_channels=[512, 1024, 1024],
+                 out_channels=[256, 512, 1024]):
+        super(Text_Projector, self).__init__()
+        self.proj = linear_layer(args, in_channels[2], out_channels[2])
+        self.ReLU = nn.ReLU(True)
+    def forward(self, text):
+        text = self.ReLU(text + self.proj(text))
+        return text
+class Image_Projector(nn.Module):
+    def __init__(self, args, in_channels=[512, 1024, 1024],
+                 out_channels=[256, 512, 1024]):
+        super(Image_Projector, self).__init__()
+        self.proj = linear_layer(args, in_channels[0], out_channels[2])
+        self.ReLU = nn.ReLU(True)
+    def forward(self, image):
+        image = self.ReLU(image + self.proj(image))
+        return image
+class Adapter(nn.Module):
+    def __init__(self, c_in, reduction=4):
+        super(Adapter, self).__init__()
+        self.fc = nn.Sequential(
+            nn.Linear(c_in, c_in // reduction, bias=False),
+            nn.ReLU(inplace=True),
+            nn.Linear(c_in // reduction, c_in, bias=False),
+            nn.ReLU(inplace=True)
+        )
+    def forward(self, x):
+        x = self.fc(x)
+        return x
+class GAP(nn.Module):
+    def __init__(self, kernel):
+        super(GAP, self).__init__()
+        self.k = kernel
+        # self.fc = nn.Linear(512, 1024)
+    def forward(self, x):
+        x = F.adaptive_avg_pool2d(x, self.k)
+        return x.squeeze(-1).squeeze(-1)
+class AdaptiveSpatialFeatureFusion(nn.Module):
+    def __init__(self, args, in_channels=[512, 1024, 1024],
+                 out_channels=[256, 512, 1024]):
+        super(AdaptiveSpatialFeatureFusion, self).__init__()
+        self.weight = nn.LayerNorm(out_channels[2])
+        self.proj = linear_layer(args, in_channels[0], out_channels[2])
+    def forward(self, feature_map1, feature_map2):
+        # feature_map1 : b, 1024, 1, 1
+        # feature_map2 : b, 512, 1, 1
+        feature_map2 = self.proj(feature_map2.squeeze(-1).squeeze(-1))
+        feature_map1 = feature_map1.squeeze(-1).squeeze(-1)
+        weights1 = torch.norm(feature_map1, dim=1).unsqueeze(-1)
+        weights2 = torch.norm(feature_map2, dim=1).unsqueeze(-1)
+        weights1 = weights1 / (weights1 + weights2)
+        weights2 = 1 - weights1
+        fused_feature_map = weights1 * feature_map1 + weights2 * feature_map2
+        # b, 1024
+        return fused_feature_map
+class ModifiedAttentionPool2d(nn.Module):
+    def __init__(self,
+                 spacial_dim: int,
+                 embed_dim: int,
+                 num_heads: int,
+                 output_dim: int = None):
+        super().__init__()
+        self.spacial_dim = spacial_dim
+        self.positional_embedding = nn.Parameter(
+            torch.randn(spacial_dim**2 + 1, embed_dim) / embed_dim**0.5)
+        self.k_proj = nn.Linear(embed_dim, embed_dim)
+        self.q_proj = nn.Linear(embed_dim, embed_dim)
+        self.v_proj = nn.Linear(embed_dim, embed_dim)
+        self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
+        self.num_heads = num_heads
+        # residual
+        self.connect = nn.Sequential(
+            nn.Conv2d(embed_dim, output_dim, 1, stride=1, bias=False),
+            nn.BatchNorm2d(output_dim))
+    def resize_pos_embed(self, pos_embed, input_shpae):
+        """Resize pos_embed weights.
+        Resize pos_embed using bicubic interpolate method.
+        Args:
+            pos_embed (torch.Tensor): Position embedding weights.
+            input_shpae (tuple): Tuple for (downsampled input image height,
+                downsampled input image width).
+            pos_shape (tuple): The resolution of downsampled origin training
+                image.
+            mode (str): Algorithm used for upsampling:
+                ``'nearest'`` | ``'linear'`` | ``'bilinear'`` | ``'bicubic'`` |
+                ``'trilinear'``. Default: ``'nearest'``
+        Return:
+            torch.Tensor: The resized pos_embed of shape [B, C, L_new]
+        """
+        assert pos_embed.ndim == 3, 'shape of pos_embed must be [B, L, C]'
+        pos_h = pos_w = self.spacial_dim
+        cls_token_weight = pos_embed[:, 0]
+        pos_embed_weight = pos_embed[:, (-1 * pos_h * pos_w):]
+        pos_embed_weight = pos_embed_weight.reshape(
+            1, pos_h, pos_w, pos_embed.shape[2]).permute(0, 3, 1, 2)
+        pos_embed_weight = F.interpolate(pos_embed_weight,
+                                         size=input_shpae,
+                                         align_corners=False,
+                                         mode='bicubic')
+        cls_token_weight = cls_token_weight.unsqueeze(1)
+        pos_embed_weight = torch.flatten(pos_embed_weight, 2).transpose(1, 2)
+        # pos_embed = torch.cat((cls_token_weight, pos_embed_weight), dim=1)
+        return pos_embed_weight.transpose(-2, -1)
+    def forward(self, x):
+        B, C, H, W = x.size()
+        res = self.connect(x)
+        x = x.reshape(B, C, -1)  # NC(HW)
+        # x = torch.cat([x.mean(dim=-1, keepdim=True), x], dim=-1)  # NC(1+HW)
+        pos_embed = self.positional_embedding.unsqueeze(0)
+        pos_embed = self.resize_pos_embed(pos_embed, (H, W))  # NC(HW)
+        x = x + pos_embed.to(x.dtype)  # NC(HW)
+        x = x.permute(2, 0, 1)  # (HW)NC
+        x, _ = F.multi_head_attention_forward(
+            query=x,
+            key=x,
+            value=x,
+            embed_dim_to_check=x.shape[-1],
+            num_heads=self.num_heads,
+            q_proj_weight=self.q_proj.weight,
+            k_proj_weight=self.k_proj.weight,
+            v_proj_weight=self.v_proj.weight,
+            in_proj_weight=None,
+            in_proj_bias=torch.cat(
+                [self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
+            bias_k=None,
+            bias_v=None,
+            add_zero_attn=False,
+            dropout_p=0,
+            out_proj_weight=self.c_proj.weight,
+            out_proj_bias=self.c_proj.bias,
+            use_separate_proj_weight=True,
+            training=self.training,
+            need_weights=False)
+        xt = x[0]
+        x = x.permute(1, 2, 0).reshape(B, -1, H, W)
+        x = x + res
+        x = F.relu(x, True)
+        return x, xt
+# modified
+class FPN(nn.Module):
+    def __init__(self, args,
+                 in_channels=[512, 1024, 1024],
+                 out_channels=[256, 512, 1024, 1024]):
+        super(FPN, self).__init__()
+        input_resolution = args.input_size
+        heads = args.heads
+        output_dim = args.output_dim
+        embed_dim = args.emb_dim
+        # image projection
+        self.attn = ModifiedAttentionPool2d(input_resolution // 32, embed_dim, heads, output_dim)
+        # text projection
+        self.txt_proj = linear_layer(args, in_channels[2], out_channels[2])
+        # fusion 1: v5 & seq -> f_5: b, 1024, 13, 13
+        self.f1_v_proj = conv_layer(in_channels[2], out_channels[2], 1, 0)
+        self.norm_layer = nn.Sequential(nn.BatchNorm2d(out_channels[2]),
+                                        nn.ReLU(True))
+        # fusion 2: v4 & fm -> f_4: b, 512, 26, 26
+        self.f2_v_proj = conv_layer(in_channels[1], out_channels[1], 3, 1)
+        self.f2_cat = conv_layer(out_channels[2] + out_channels[1],
+                                 out_channels[1], 1, 0)
+        # fusion 3: v3 & fm_mid -> f_3: b, 512, 52, 52
+        self.f3_v_proj = conv_layer(in_channels[0], out_channels[0], 3, 1)
+        self.f3_cat = conv_layer(out_channels[0] + out_channels[1],
+                                 out_channels[1], 1, 0)
+        # fusion 4: f_3 & f_4 & f_5 -> fq: b, 256, 26, 26
+        self.f4_proj5 = conv_layer(out_channels[2], out_channels[1], 3, 1)
+        self.f4_proj4 = conv_layer(out_channels[1], out_channels[1], 3, 1)
+        self.f4_proj3 = conv_layer(out_channels[1], out_channels[1], 3, 1)
+        # aggregation
+        self.aggr = conv_layer(3 * out_channels[1], out_channels[1], 1, 0)
+        self.coordconv = nn.Sequential(
+            CoordConv(out_channels[1], out_channels[1], 3, 1),
+            conv_layer(out_channels[1], out_channels[3], 3, 1))
+    def forward(self, imgs, text):
+        # v3, v4, v5: 256, 52, 52 / 512, 26, 26 / 1024, 13, 13
+        v3, v4, v5 = imgs
+        # fusion 1: b, 1024, 13, 13
+        # text projection: b, 1024 -> b, 1024
+        v5, _ = self.attn(v5)
+        text_ = self.txt_proj(text)
+        state = text_.unsqueeze(-1).unsqueeze(
+            -1)# b, 1024, 1, 1
+        f5 = self.f1_v_proj(v5) # b, 1024, 7, 7
+        f5 = self.norm_layer(f5 * state)
+        # fusion 2: b, 512, 26, 26
+        f4 = self.f2_v_proj(v4)
+        # f4 = f4.repeat(w2,1,1,1)
+        f5_ = F.interpolate(f5, scale_factor=2, mode='bilinear')
+        f4 = self.f2_cat(torch.cat([f4, f5_], dim=1))
+        # fusion 3: b, 256, 26, 26
+        f3 = self.f3_v_proj(v3)
+        f3 = F.avg_pool2d(f3, 2, 2)
+        # f3 = f3.repeat(w2, 1, 1, 1)
+        f3 = self.f3_cat(torch.cat([f3, f4], dim=1))
+        # fusion 4: b, 512, 13, 13 / b, 512, 26, 26 / b, 512, 26, 26
+        fq5 = self.f4_proj5(f5)
+        fq4 = self.f4_proj4(f4)
+        fq3 = self.f4_proj3(f3)
+        # query
+        fq5 = F.interpolate(fq5, scale_factor=2, mode='bilinear')
+        fq = torch.cat([fq3, fq4, fq5], dim=1)
+        fq = self.aggr(fq)
+        fq = self.coordconv(fq)
+            # fqq = fq.reshape(w1, w2, fq.shape[1], fq.shape[2], fq.shape[3])
+            # b, 512, 26, 26
+        # elif text.shape[0] != v3.shape[0]:
+        #
+        #     text = self.txt_proj(text)
+        #     state = text.unsqueeze(-1).unsqueeze(
+        #         -1)  # b, 1024, 1, 1
+        #     state = state.view(v5.shape[0], int(text.shape[0] / v5.shape[0]), state.shape[1], state.shape[2], state.shape[3])
+        #
+        #     f5 = self.f1_v_proj(v5)  # b, 1024, 7, 7
+        #     f5 = f5.unsqueeze(1)
+        #     f5_ = f5 * state
+        #     f5_ = f5_.view(-1, f5.shape[2], f5.shape[3], f5.shape[4])
+        #     f5 = self.norm_layer(f5_)
+        #     # fusion 2: b, 512, 26, 26
+        #     f4 = self.f2_v_proj(v4)
+        #     # f4 = f4.repeat(w2,1,1,1)
+        #
+        #     f5_ = F.interpolate(f5, scale_factor=2, mode='bilinear')
+        #     f4 = f4.repeat(int(f5_.shape[0] / f4.shape[0]), 1, 1, 1)
+        #     f4 = self.f2_cat(torch.cat([f4, f5_], dim=1))
+        #
+        #     # fusion 3: b, 256, 26, 26
+        #     f3 = self.f3_v_proj(v3)
+        #     f3 = F.avg_pool2d(f3, 2, 2)
+        #     # f3 = f3.repeat(w2, 1, 1, 1)
+        #     f3 = f3.repeat(int(f5_.shape[0] / f3.shape[0]), 1, 1, 1)
+        #     f3 = self.f3_cat(torch.cat([f3, f4], dim=1))
+        #     # fusion 4: b, 512, 13, 13 / b, 512, 26, 26 / b, 512, 26, 26
+        #     fq5 = self.f4_proj5(f5)
+        #     fq4 = self.f4_proj4(f4)
+        #     fq3 = self.f4_proj3(f3)
+        #     # query
+        #     fq5 = F.interpolate(fq5, scale_factor=2, mode='bilinear')
+        #     fq = torch.cat([fq3, fq4, fq5], dim=1)
+        #     fq = self.aggr(fq)
+        #     fq = self.coordconv(fq)
+        return fq
+class ViTFPN(nn.Module):
+    def __init__(self, image_resolution,
+                 in_channels=[512, 768, 768],
+                 out_channels=[768, 768, 768, 512]):
+        super(ViTFPN, self).__init__()
+        # text projection
+        self.txt_proj = linear_layer(in_channels[0], out_channels[1])
+        # fusion 1: v5 & seq -> f_5: b, 1024, 13, 13
+        self.f1_v_proj = conv_layer(in_channels[1], out_channels[1], 1, 0)
+        self.norm_layer = nn.Sequential(nn.BatchNorm2d(out_channels[1]),
+                                        nn.ReLU(True))
+        # fusion 2: v4 & fm -> f_4: b, 512, 26, 26
+        self.f2_v_proj = conv_layer(in_channels[1], out_channels[1], 3, 1)
+        self.f2_cat = conv_layer(out_channels[0] + out_channels[0],
+                                 out_channels[0], 1, 0)
+        # fusion 3: v3 & fm_mid -> f_3: b, 512, 52, 52
+        self.f3_v_proj = conv_layer(in_channels[1], out_channels[1], 3, 1)
+        self.f3_cat = conv_layer(out_channels[0] + out_channels[1],
+                                 out_channels[1], 1, 0)
+        # fusion 4: f_3 & f_4 & f_5 -> fq: b, 256, 26, 26
+        self.f4_proj5 = conv_layer(out_channels[1], out_channels[0], 3, 1)
+        self.f4_proj4 = conv_layer(out_channels[0], out_channels[0], 3, 1)
+        self.f4_proj3 = conv_layer(out_channels[1], out_channels[1], 3, 1)
+        # aggregation
+        self.aggr = conv_layer(3 * out_channels[0], out_channels[0], 1, 0)
+        self.coordconv = nn.Sequential(
+            CoordConv(out_channels[0], out_channels[0], 3, 1),
+            conv_layer(out_channels[0], out_channels[-1], 3, 1))
+        self.attnpool = AttentionPool2d(image_resolution // 32, out_channels[-1],
+                                    8, out_channels[-1])
+    def forward(self, imgs, state, vis):
+        # v1 / v2 / b, 49, 1024/ b, 196, 512
+        v3, v4, v5 = imgs
+        # fusion 1: b, 1024, 13, 13
+        # text projection: b, 1024 -> b, 1024
+        state = self.txt_proj(state)
+        state = state.unsqueeze(-1).unsqueeze(
+            -1)# b, 1024, 1, 1
+        f5 = self.f1_v_proj(v5)
+        f5 = self.norm_layer(f5 * state)
+        # fusion 2: b, 512, 26, 26
+        f4 = self.f2_v_proj(v4)
+        b, c, h, w = f4.size()
+        f5_ = F.interpolate(f5, (h, w), mode='bilinear')
+        f4 = self.f2_cat(torch.cat([f4, f5_], dim=1))
+        # fusion 3: b, 256, 26, 26
+        f3 = self.f3_v_proj(v3)
+        f3 = F.avg_pool2d(f3, 2, 2)
+        # f3 = f3.repeat(w2, 1, 1, 1)
+        f3 = self.f3_cat(torch.cat([f3, f4], dim=1))
+        # fusion 4: b, 512, 13, 13 / b, 512, 26, 26 / b, 512, 26, 26
+        fq5 = self.f4_proj5(f5)
+        fq4 = self.f4_proj4(f4)
+        fq3 = self.f4_proj3(f3)
+        # query
+        fq5 = F.interpolate(fq5, (h, w), mode='bilinear')
+        fq = torch.cat([fq3, fq4, fq5], dim=1)
+        fq = self.aggr(fq)
+        if not vis:
+            fq = self.coordconv(fq)
+            fq = self.attnpool(fq)
+        # b, 512, 26, 26
+        return fq

cisen/model/segmenter.py ADDED Viewed

	@@ -0,0 +1,2045 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from .clip import build_model, build_promptlearner, build_modified_model, PromptLearner, build_lclip_model
+from torch.cuda.amp import autocast as autocast
+from timm.models.layers import trunc_normal_ as __call_trunc_normal_
+from timm.models.layers import variance_scaling_
+from einops import rearrange, repeat
+from loguru import logger
+from transformers import AlignProcessor, AlignModel
+from sklearn.metrics import classification_report
+from huggingface_hub import PyTorchModelHubMixin
+from .layers import FPN, TransformerDecoder, ViTFPN, AdaptiveSpatialFeatureFusion, Text_Projector, Image_Projector, Adapter, GAP
+from cisen.model.clip import CLIP
+def lecun_normal_(tensor):
+    variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")
+def trunc_normal_(tensor, mean=0.0, std=1.0):
+    __call_trunc_normal_(tensor, mean=mean, std=std, a=-std, b=std)
+class CISEN_vit(nn.Module, PyTorchModelHubMixin):
+    def __init__(self, cfg):
+        """Build the CLIP backbone, the ViT FPN neck, the adapter and the
+        multi-scale adaptor from ``cfg``.
+        Reads ``cfg.clip_pretrain``, ``cfg.word_len``, ``cfg.fpn_in``,
+        ``cfg.fpn_out`` and ``cfg.ratio``; writes ``cfg.image_resolution``,
+        ``cfg.input_size``, ``cfg.heads``, ``cfg.emb_dim`` and
+        ``cfg.output_dim`` (the config object is mutated in place).
+        """
+        super().__init__()
+        # Vision & Text Encoder & Label Encoder
+        # The checkpoint is loaded as a TorchScript archive on the CPU; only
+        # its state_dict is handed to build_model.
+        clip_model = torch.jit.load(cfg.clip_pretrain,
+                                    map_location="cpu").eval()
+        backbone, image_resolution, vision_heads, embed_dim, vision_width, patch_size  = build_model(clip_model.state_dict(), cfg.word_len)
+        self.backbone = backbone.float()
+        # Number of patches along one image side.
+        self.patch_emb = image_resolution // patch_size
+        cfg.image_resolution = image_resolution
+        cfg.input_size = image_resolution
+        # NOTE(review): integer division by 32 yields 0 whenever
+        # vision_heads < 32 -- confirm this is the intended head count.
+        cfg.heads = vision_heads // 32
+        cfg.emb_dim = vision_width
+        cfg.output_dim = embed_dim
+        # multi-scale adapter
+        # Multi-Modal FPN
+        self.FPN = ViTFPN(image_resolution, in_channels=cfg.fpn_in, out_channels=cfg.fpn_out)
+        # Fined-grained Fusion
+        # self.FGFusion = TransformerDecoder(num_layers=cfg.num_layers,
+        #                                   d_model=cfg.vis_dim,
+        #                                   nhead=cfg.num_head,
+        #                                   dim_ffn=cfg.dim_ffn,
+        #                                   dropout=cfg.dropout,
+        #                                   return_intermediate=cfg.intermediate)
+        # image-text transformer
+        # self.trans = nn.Linear(1024, 1024)
+        self.ADP = Adapter(cfg.output_dim, 4)
+        # parameter
+        self.ratio = cfg.ratio
+        # Learnable log-temperature, initialised to log(1 / 0.07).
+        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
+        self.share_temperature = True
+        self.ce = nn.CrossEntropyLoss()
+        # Four branches that rescale a feature map spatially: x4 (two
+        # stride-2 transposed convs), x2, identity, and /2 (max-pool).
+        # GroupNorm(32, ...) requires cfg.emb_dim to be divisible by 32.
+        self.ms_adaptor = nn.ModuleList(
+            [
+                nn.Sequential(
+                    nn.ConvTranspose2d(cfg.emb_dim, cfg.emb_dim, 2, 2),
+                    nn.GroupNorm(32, cfg.emb_dim),
+                    nn.GELU(),
+                    nn.ConvTranspose2d(cfg.emb_dim, cfg.emb_dim, 2, 2),
+                ),
+                nn.Sequential(
+                    nn.ConvTranspose2d(cfg.emb_dim, cfg.emb_dim, 2, 2),
+                ),
+                nn.Sequential(
+                    nn.Identity(),
+                ),
+                nn.Sequential(
+                    nn.MaxPool2d(2),
+                ),
+            ]
+        )
+        # Re-initialise every conv / norm module inside the adaptor.
+        self.ms_adaptor.apply(self.init_adaptor)
+    def init_adaptor(self, m):
+        if isinstance(m, nn.Conv2d):
+            lecun_normal_(m.weight)
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.GroupNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+        elif isinstance(m, nn.ConvTranspose2d):
+            lecun_normal_(m.weight)
+            if m.bias is not None:
+                nn.init.zeros_(m.bias)
+        # self.fc = nn.Linear(512, cfg.num_classes)
+    def IT_loss(self, image_features, text_features):
+        # b, 1024 / b, 1024
+        batch = image_features.shape[0]
+        # # normalized features
+        image_features = image_features / image_features.norm(dim=-1,
+                                                              keepdim=True)
+        text_features = text_features / text_features.norm(dim=-1,
+                                                              keepdim=True)
+        # cosine similarity as logits
+        logit_scale = self.logit_scale.exp()
+        logits_per_image = logit_scale * image_features @ text_features.t()
+        logits_per_text = logits_per_image.t()
+        # shape = [global_batch_size, global_batch_size]
+        contrastive_labels = torch.arange(batch).to(logits_per_image.device)
+        contrastive_loss = (self.ce(logits_per_image, contrastive_labels) + self.ce(logits_per_text, contrastive_labels)) * 0.5
+        return contrastive_loss
+    def forward(self, img, txt, stage):
+        if stage == '1st':
+            '''
+                img: b, 3, h, w
+                word: b, words
+                word_mask: b, words
+                mask: b, 1, h, w
+                stage: 1st or 2nd stage
+            '''
+            # padding mask used in decoder
+            pad_mask = torch.zeros_like(txt).masked_fill_(txt == 0, 1).bool()
+            # vis: C3 / C4 / C5 / b, 512, 28, 28/ b, 1024, 14, 14/ b, 1024, 7, 7
+            # word: b, length, 512
+            # text: b, 1024
+            # image: b, 1024
+            vis, image = self.backbone.encode_image(img)
+            word, text = self.backbone.encode_text(txt)
+            x = self.ADP(image)
+            x = self.ratio * x + (1-self.ratio) * image
+            # b, 1024
+            # fq_t = self.FPN(vis, x)
+            #
+            # fv_t = self.gap(fq_t)
+            loss1 = self.IT_loss(x, text)
+            loss = loss1
+            ft = text
+            fi = x
+            fv = None
+        elif stage == '2nd':
+            '''
+                img: b, 3, h, w
+                word: b, words
+                word_mask: b, words
+                mask: b, 1, h, w
+                stage: 1st or 2nd stage
+            '''
+            # padding mask used in decoder
+            pad_mask = torch.zeros_like(txt).masked_fill_(txt == 0, 1).bool()
+            # vis: C3 / C4 / C5 / b, 512, 28, 28/ b, 1024, 14, 14/ b, 1024, 7, 7
+            # word: b, length, 512
+            # text: b, 1024
+            # image: b, 1024
+            vis, image = self.backbone.encode_image(img)
+            word, text = self.backbone.encode_text(txt)
+            x = self.ADP(image)
+            x = self.ratio * x + (1 - self.ratio) * image
+            # Construct multi-scale feats
+            vis_trans = []
+            for i in range(len(self.ms_adaptor)):
+                x_ = rearrange(
+                    vis[i],
+                    "b (h w) c -> b c h w",
+                    h=self.patch_emb,
+                    w=self.patch_emb,
+                ).contiguous()
+                feats = self.ms_adaptor[i](x_)
+                vis_trans.append(feats)
+            # fq = self.FPN(vis, x_t)
+            fv_t = self.FPN(vis_trans[1:], x, False)
+            # fv_t = self.gap(fq_t)
+            # b, 1024
+            loss2 = self.IT_loss(fv_t, text)
+            loss = (loss2)
+            fv = fv_t
+            ft = text
+            fi = x
+        return loss, fv, fi, ft
+    def visualize(self, img, txt):
+        vis, image = self.backbone.encode_image(img)
+        word, text = self.backbone.encode_text(txt)
+        x = self.ADP(image)
+        x = self.ratio * x + (1 - self.ratio) * image
+        # Construct multi-scale feats
+        vis_trans = []
+        for i in range(len(self.ms_adaptor)):
+            x_ = rearrange(
+                vis[i],
+                "b (h w) c -> b c h w",
+                h=self.patch_emb,
+                w=self.patch_emb,
+            ).contiguous()
+            feats = self.ms_adaptor[i](x_)
+            vis_trans.append(feats)
+        # fq = self.FPN(vis, x_t)
+        fv_t = self.FPN(vis_trans[1:], x, True)
+        ft_t = self.FPN(vis_trans[1:], text, True)
+        return vis, fv_t, ft_t
+class CISEN_rsvit(nn.Module, PyTorchModelHubMixin):
+    def __init__(self, cfg):
+        super().__init__()
+        # Vision & Text Encoder & Label Encoder
+        clip_model = torch.load(cfg.clip_pretrain,
+                                    map_location="cpu")
+        backbone, image_resolution, vision_heads, embed_dim, vision_width, patch_size  = build_model(clip_model, cfg.word_len)
+        self.backbone = backbone.float()
+        self.patch_emb = image_resolution // patch_size
+        cfg.image_resolution = image_resolution
+        cfg.input_size = image_resolution
+        cfg.heads = vision_heads // 32
+        cfg.emb_dim = vision_width
+        cfg.output_dim = embed_dim
+        # multi-scale adapter
+        # Multi-Modal FPN
+        self.FPN = ViTFPN(image_resolution, in_channels=cfg.fpn_in, out_channels=cfg.fpn_out)
+        # Fined-grained Fusion
+        # self.FGFusion = TransformerDecoder(num_layers=cfg.num_layers,
+        #                                   d_model=cfg.vis_dim,
+        #                                   nhead=cfg.num_head,
+        #                                   dim_ffn=cfg.dim_ffn,
+        #                                   dropout=cfg.dropout,
+        #                                   return_intermediate=cfg.intermediate)
+        # image-text transformer
+        # self.trans = nn.Linear(1024, 1024)
+        self.ADP = Adapter(cfg.output_dim, 4)
+        # parameter
+        self.ratio = cfg.ratio
+        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
+        self.share_temperature = True
+        self.ce = nn.CrossEntropyLoss()
+        self.ms_adaptor = nn.ModuleList(
+            [
+                nn.Sequential(
+                    nn.ConvTranspose2d(cfg.emb_dim, cfg.emb_dim, 2, 2),
+                    nn.GroupNorm(32, cfg.emb_dim),
+                    nn.GELU(),
+                    nn.ConvTranspose2d(cfg.emb_dim, cfg.emb_dim, 2, 2),
+                ),
+                nn.Sequential(
+                    nn.ConvTranspose2d(cfg.emb_dim, cfg.emb_dim, 2, 2),
+                ),
+                nn.Sequential(
+                    nn.Identity(),
+                ),
+                nn.Sequential(
+                    nn.MaxPool2d(2),
+                ),
+            ]
+        )
+        self.ms_adaptor.apply(self.init_adaptor)
+    def init_adaptor(self, m):
+        if isinstance(m, nn.Conv2d):
+            lecun_normal_(m.weight)
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.GroupNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+        elif isinstance(m, nn.ConvTranspose2d):
+            lecun_normal_(m.weight)
+            if m.bias is not None:
+                nn.init.zeros_(m.bias)
+        # self.fc = nn.Linear(512, cfg.num_classes)
+    def IT_loss(self, image_features, text_features):
+        # b, 1024 / b, 1024
+        batch = image_features.shape[0]
+        # # normalized features
+        image_features = image_features / image_features.norm(dim=-1,
+                                                              keepdim=True)
+        text_features = text_features / text_features.norm(dim=-1,
+                                                              keepdim=True)
+        # cosine similarity as logits
+        logit_scale = self.logit_scale.exp()
+        logits_per_image = logit_scale * image_features @ text_features.t()
+        logits_per_text = logits_per_image.t()
+        # shape = [global_batch_size, global_batch_size]
+        contrastive_labels = torch.arange(batch).to(logits_per_image.device)
+        contrastive_loss = (self.ce(logits_per_image, contrastive_labels) + self.ce(logits_per_text, contrastive_labels)) * 0.5
+        return contrastive_loss
+    def image_encode(self, img):
+        vis, image = self.backbone.encode_image(img)
+        x = self.ADP(image)
+        x = self.ratio * x + (1 - self.ratio) * image
+        return x
+    def text_encode(self, txt):
+        word, text = self.backbone.encode_text(txt)
+        return text
+    def forward(self, img, txt, stage):
+        if stage == '1st':
+            '''
+                img: b, 3, h, w
+                word: b, words
+                word_mask: b, words
+                mask: b, 1, h, w
+                stage: 1st or 2nd stage
+            '''
+            # padding mask used in decoder
+            pad_mask = torch.zeros_like(txt).masked_fill_(txt == 0, 1).bool()
+            # vis: C3 / C4 / C5 / b, 512, 28, 28/ b, 1024, 14, 14/ b, 1024, 7, 7
+            # word: b, length, 512
+            # text: b, 1024
+            # image: b, 1024
+            vis, image = self.backbone.encode_image(img)
+            word, text = self.backbone.encode_text(txt)
+            x = self.ADP(image)
+            x = self.ratio * x + (1-self.ratio) * image
+            # b, 1024
+            # fq_t = self.FPN(vis, x)
+            #
+            # fv_t = self.gap(fq_t)
+            loss1 = self.IT_loss(x, text)
+            loss = loss1
+            ft = text
+            fi = x
+            fv = None
+        elif stage == '2nd':
+            '''
+                img: b, 3, h, w
+                word: b, words
+                word_mask: b, words
+                mask: b, 1, h, w
+                stage: 1st or 2nd stage
+            '''
+            # padding mask used in decoder
+            pad_mask = torch.zeros_like(txt).masked_fill_(txt == 0, 1).bool()
+            # vis: C3 / C4 / C5 / b, 512, 28, 28/ b, 1024, 14, 14/ b, 1024, 7, 7
+            # word: b, length, 512
+            # text: b, 1024
+            # image: b, 1024
+            vis, image = self.backbone.encode_image(img)
+            word, text = self.backbone.encode_text(txt)
+            x = self.ADP(image)
+            x = self.ratio * x + (1 - self.ratio) * image
+            # Construct multi-scale feats
+            vis_trans = []
+            for i in range(len(self.ms_adaptor)):
+                x_ = rearrange(
+                    vis[i],
+                    "b (h w) c -> b c h w",
+                    h=self.patch_emb,
+                    w=self.patch_emb,
+                ).contiguous()
+                feats = self.ms_adaptor[i](x_)
+                vis_trans.append(feats)
+            # fq = self.FPN(vis, x_t)
+            fv_t = self.FPN(vis_trans[1:], x, False)
+            # fv_t = self.gap(fq_t)
+            # b, 1024
+            loss2 = self.IT_loss(fv_t, text)
+            loss = (loss2)
+            fv = fv_t
+            ft = text
+            fi = x
+        return loss, fv, fi, ft
+    def visualize(self, img):
+        vis, image = self.backbone.encode_image(img)
+        x = self.ADP(image)
+        x = self.ratio * x + (1 - self.ratio) * image
+        # Construct multi-scale feats
+        vis_trans = []
+        for i in range(len(self.ms_adaptor)):
+            x_ = rearrange(
+                vis[i],
+                "b (h w) c -> b c h w",
+                h=self.patch_emb,
+                w=self.patch_emb,
+            ).contiguous()
+            feats = self.ms_adaptor[i](x_)
+            vis_trans.append(feats)
+        fv_t = self.FPN(vis_trans[1:], x, True)
+        return vis, fv_t
+class CISEN_vit(nn.Module, PyTorchModelHubMixin):
+    def __init__(self, cfg):
+        super().__init__()
+        # Vision & Text Encoder & Label Encoder
+        clip_model = torch.jit.load(cfg.clip_pretrain,
+                                    map_location="cpu").eval()
+        backbone, image_resolution, vision_heads, embed_dim, vision_width, patch_size  = build_model(clip_model.state_dict(), cfg.word_len)
+        self.backbone = backbone.float()
+        self.patch_emb = image_resolution // patch_size
+        cfg.image_resolution = image_resolution
+        cfg.input_size = image_resolution
+        cfg.heads = vision_heads // 32
+        cfg.emb_dim = vision_width
+        cfg.output_dim = embed_dim
+        # multi-scale adapter
+        # Multi-Modal FPN
+        self.FPN = ViTFPN(cfg, in_channels=cfg.fpn_in, out_channels=cfg.fpn_out)
+        # Fined-grained Fusion
+        # self.FGFusion = TransformerDecoder(num_layers=cfg.num_layers,
+        #                                   d_model=cfg.vis_dim,
+        #                                   nhead=cfg.num_head,
+        #                                   dim_ffn=cfg.dim_ffn,
+        #                                   dropout=cfg.dropout,
+        #                                   return_intermediate=cfg.intermediate)
+        # image-text transformer
+        # self.trans = nn.Linear(1024, 1024)
+        self.ADP = Adapter(cfg.output_dim, 4)
+        # parameter
+        self.ratio = cfg.ratio
+        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
+        self.share_temperature = True
+        self.ce = nn.CrossEntropyLoss()
+        self.ms_adaptor = nn.ModuleList(
+            [
+                nn.Sequential(
+                    nn.ConvTranspose2d(cfg.emb_dim, cfg.emb_dim, 2, 2),
+                    nn.GroupNorm(32, cfg.emb_dim),
+                    nn.GELU(),
+                    nn.ConvTranspose2d(cfg.emb_dim, cfg.emb_dim, 2, 2),
+                ),
+                nn.Sequential(
+                    nn.ConvTranspose2d(cfg.emb_dim, cfg.emb_dim, 2, 2),
+                ),
+                nn.Sequential(
+                    nn.Identity(),
+                ),
+                nn.Sequential(
+                    nn.MaxPool2d(2),
+                ),
+            ]
+        )
+        self.ms_adaptor.apply(self.init_adaptor)
+    def init_adaptor(self, m):
+        if isinstance(m, nn.Conv2d):
+            lecun_normal_(m.weight)
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.GroupNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+        elif isinstance(m, nn.ConvTranspose2d):
+            lecun_normal_(m.weight)
+            if m.bias is not None:
+                nn.init.zeros_(m.bias)
+        # self.fc = nn.Linear(512, cfg.num_classes)
+    def IT_loss(self, image_features, text_features):
+        # b, 1024 / b, 1024
+        batch = image_features.shape[0]
+        # # normalized features
+        image_features = image_features / image_features.norm(dim=-1,
+                                                              keepdim=True)
+        text_features = text_features / text_features.norm(dim=-1,
+                                                              keepdim=True)
+        # cosine similarity as logits
+        logit_scale = self.logit_scale.exp()
+        logits_per_image = logit_scale * image_features @ text_features.t()
+        logits_per_text = logits_per_image.t()
+        # shape = [global_batch_size, global_batch_size]
+        contrastive_labels = torch.arange(batch).to(logits_per_image.device)
+        contrastive_loss = (self.ce(logits_per_image, contrastive_labels) + self.ce(logits_per_text, contrastive_labels)) * 0.5
+        return contrastive_loss
+    def forward(self, img, txt, stage):
+        if stage == '1st':
+            '''
+                img: b, 3, h, w
+                word: b, words
+                word_mask: b, words
+                mask: b, 1, h, w
+                stage: 1st or 2nd stage
+            '''
+            # padding mask used in decoder
+            pad_mask = torch.zeros_like(txt).masked_fill_(txt == 0, 1).bool()
+            # vis: C3 / C4 / C5 / b, 512, 28, 28/ b, 1024, 14, 14/ b, 1024, 7, 7
+            # word: b, length, 512
+            # text: b, 1024
+            # image: b, 1024
+            vis, image = self.backbone.encode_image(img)
+            word, text = self.backbone.encode_text(txt)
+            x = self.ADP(image)
+            x = self.ratio * x + (1-self.ratio) * image
+            # b, 1024
+            # fq_t = self.FPN(vis, x)
+            #
+            # fv_t = self.gap(fq_t)
+            loss1 = self.IT_loss(x, text)
+            loss = loss1
+            ft = text
+            fi = x
+            fv = None
+        elif stage == '2nd':
+            '''
+                img: b, 3, h, w
+                word: b, words
+                word_mask: b, words
+                mask: b, 1, h, w
+                stage: 1st or 2nd stage
+            '''
+            # padding mask used in decoder
+            pad_mask = torch.zeros_like(txt).masked_fill_(txt == 0, 1).bool()
+            # vis: C3 / C4 / C5 / b, 512, 28, 28/ b, 1024, 14, 14/ b, 1024, 7, 7
+            # word: b, length, 512
+            # text: b, 1024
+            # image: b, 1024
+            vis, image = self.backbone.encode_image(img)
+            word, text = self.backbone.encode_text(txt)
+            x = self.ADP(image)
+            x = self.ratio * x + (1 - self.ratio) * image
+            # Construct multi-scale feats
+            vis_trans = []
+            for i in range(len(self.ms_adaptor)):
+                x_ = rearrange(
+                    vis[i],
+                    "b (h w) c -> b c h w",
+                    h=self.patch_emb,
+                    w=self.patch_emb,
+                ).contiguous()
+                feats = self.ms_adaptor[i](x_)
+                vis_trans.append(feats)
+            # fq = self.FPN(vis, x_t)
+            fv_t = self.FPN(vis_trans[1:], x, False)
+            # fv_t = self.gap(fq_t)
+            # b, 1024
+            loss2 = self.IT_loss(fv_t, text)
+            loss = (loss2)
+            fv = fv_t
+            ft = text
+            fi = x
+        return loss, fv, fi, ft
+    def visualize(self, img, txt):
+        vis, image = self.backbone.encode_image(img)
+        word, text = self.backbone.encode_text(txt)
+        x = self.ADP(image)
+        x = self.ratio * x + (1 - self.ratio) * image
+        # Construct multi-scale feats
+        vis_trans = []
+        for i in range(len(self.ms_adaptor)):
+            x_ = rearrange(
+                vis[i],
+                "b (h w) c -> b c h w",
+                h=self.patch_emb,
+                w=self.patch_emb,
+            ).contiguous()
+            feats = self.ms_adaptor[i](x_)
+            vis_trans.append(feats)
+        # fq = self.FPN(vis, x_t)
+        fv_t = self.FPN(vis_trans[1:], x, True)
+        ft_t = self.FPN(vis_trans[1:], text, True)
+        return vis, fv_t, ft_t
+class CISEN_rsvit_classification(nn.Module):
+    def __init__(self, cfg):
+        super().__init__()
+        # Vision & Text Encoder & Label Encoder
+        clip_model = torch.load(cfg.clip_pretrain,
+                                    map_location="cpu")
+        backbone, image_resolution, vision_heads, embed_dim, vision_width, patch_size  = build_model(clip_model, cfg.word_len)
+        self.backbone = backbone.float()
+        self.patch_emb = image_resolution // patch_size
+        num_classes_fc = 512
+        num_classes_output = 10
+        self.num_classes_fc = num_classes_fc  # Number of classes for fully connected layer
+        self.num_classes_output = num_classes_output  # Number of classes for output layer
+        # Add a fully connected layer
+        self.fc = nn.Linear(in_features=cfg.vis_dim, out_features=num_classes_fc)
+        # Add an output layer for multi-label classification
+        self.output_layer = nn.Linear(in_features=num_classes_fc, out_features=num_classes_output)
+        self.criterion = nn.BCEWithLogitsLoss()
+        cfg.image_resolution = image_resolution
+        cfg.input_size = image_resolution
+        cfg.heads = vision_heads // 32
+        cfg.emb_dim = vision_width
+        cfg.output_dim = embed_dim
+    def IT_loss(self, labels, labels_pre):
+        labels = labels.squeeze(1)
+        loss = self.criterion(labels_pre, labels)
+        return loss
+    def forward(self, img, labels):
+        _, image_features = self.backbone.encode_image(img)
+        # Fully connected layer
+        fc_output = self.fc(image_features)
+        # Apply ReLU activation function
+        fc_output = F.relu(fc_output)
+        # Output layer for multi-label classification
+        labels_pre = self.output_layer(fc_output)
+        loss2 = self.IT_loss(labels, labels_pre)
+        return labels_pre, loss2
+class CISEN_new(nn.Module):
+    def __init__(self, cfg):
+        super().__init__()
+        # Vision & Text Encoder & Label Encoder
+        clip_model = torch.jit.load(cfg.clip_pretrain,
+                                    map_location="cpu").eval()
+        backbone, image_resolution, vision_heads, embed_dim, vision_width, _ = build_model(clip_model.state_dict(), cfg.word_len)
+        self.backbone = backbone.float()
+        cfg.input_size = image_resolution
+        cfg.heads = vision_heads
+        cfg.emb_dim = vision_width * 32
+        cfg.output_dim = embed_dim
+        # Multi-Modal FPN
+        self.FPN = FPN(cfg, in_channels=cfg.fpn_in, out_channels=cfg.fpn_out)
+        # Fined-grained Fusion
+        # self.FGFusion = TransformerDecoder(num_layers=cfg.num_layers,
+        #                                   d_model=cfg.vis_dim,
+        #                                   nhead=cfg.num_head,
+        #                                   dim_ffn=cfg.dim_ffn,
+        #                                   dropout=cfg.dropout,
+        #                                   return_intermediate=cfg.intermediate)
+        # image-text transformer
+        # self.trans = nn.Linear(1024, 1024)
+        self.ADP = Adapter(cfg.output_dim, 4)
+        self.gap = GAP((1,1))
+        # parameter
+        self.ratio = cfg.ratio
+        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
+        self.share_temperature = True
+        self.margin = 1
+        self.eps = 1e-3
+        self.ce = nn.CrossEntropyLoss()
+        #1st stage
+        self.lamda1 = cfg.lamda1
+        self.lamda2 = cfg.lamda2
+        self.avg = nn.AdaptiveAvgPool2d((1,1))
+        # self.fc = nn.Linear(512, cfg.num_classes)
+    def IT_loss(self, image_features, text_features):
+        # b, 1024 / b, 1024
+        batch = image_features.shape[0]
+        # # normalized features
+        image_features = image_features / image_features.norm(dim=-1,
+                                                              keepdim=True)
+        text_features = text_features / text_features.norm(dim=-1,
+                                                              keepdim=True)
+        # cosine similarity as logits
+        logit_scale = self.logit_scale.exp()
+        logits_per_image = logit_scale * image_features @ text_features.t()
+        logits_per_text = logits_per_image.t()
+        # shape = [global_batch_size, global_batch_size]
+        contrastive_labels = torch.arange(batch).to(logits_per_image.device)
+        contrastive_loss = (self.ce(logits_per_image, contrastive_labels) + self.ce(logits_per_text, contrastive_labels)) * 0.5
+        return contrastive_loss
+    def forward(self, img, txt, stage):
+        if stage == '1st':
+            '''
+                img: b, 3, h, w
+                word: b, words
+                word_mask: b, words
+                mask: b, 1, h, w
+                stage: 1st or 2nd stage
+            '''
+            # padding mask used in decoder
+            pad_mask = torch.zeros_like(txt).masked_fill_(txt == 0, 1).bool()
+            # vis: C3 / C4 / C5 / b, 512, 28, 28/ b, 1024, 14, 14/ b, 1024, 7, 7
+            # word: b, length, 512
+            # text: b, 1024
+            # image: b, 1024
+            vis, image = self.backbone.encode_image(img)
+            word, text = self.backbone.encode_text(txt)
+            x = self.ADP(image)
+            x = self.ratio * x + (1-self.ratio) * image
+            # b, 1024
+            # fq_t = self.FPN(vis, x)
+            #
+            # fv_t = self.gap(fq_t)
+            loss1 = self.IT_loss(x, text)
+            loss = loss1
+            ft = text
+            fi = x
+            fv = None
+        elif stage == '2nd':
+            '''
+                img: b, 3, h, w
+                word: b, words
+                word_mask: b, words
+                mask: b, 1, h, w
+                stage: 1st or 2nd stage
+            '''
+            # padding mask used in decoder
+            pad_mask = torch.zeros_like(txt).masked_fill_(txt == 0, 1).bool()
+            # vis: C3 / C4 / C5 / b, 512, 28, 28/ b, 1024, 14, 14/ b, 1024, 7, 7
+            # word: b, length, 512
+            # text: b, 1024
+            # image: b, 1024
+            vis, image = self.backbone.encode_image(img)
+            word, text = self.backbone.encode_text(txt)
+            x = self.ADP(image)
+            x = self.ratio * x + (1 - self.ratio) * image
+            # x_t = self.trans(x)
+            # fq = self.FPN(vis, x_t)
+            fq_t = self.FPN(vis, x)
+            fv_t = self.gap(fq_t)
+            # b, 1024
+            loss2 = self.IT_loss(fv_t, text)
+            loss = (loss2)
+            fv = fv_t
+            ft = text
+            fi = x
+        elif stage == '3rd':
+            '''
+                img: b, 3, h, w
+                word: b, words
+                word_mask: b, words
+                mask: b, 1, h, w
+                stage: 1st or 2nd stage
+            '''
+            # padding mask used in decoder
+            pad_mask = torch.zeros_like(txt).masked_fill_(txt == 0, 1).bool()
+            # vis: C3 / C4 / C5 / b, 512, 28, 28/ b, 1024, 14, 14/ b, 1024, 7, 7
+            # word: b, length, 512
+            # text: b, 1024
+            # image: b, 1024
+            vis, image = self.backbone.encode_image(img)
+            word, text = self.backbone.encode_text(txt)
+            x = self.ADP(text)
+            ratio = 0.2
+            x = ratio * x + (1 - ratio) * text
+            # x_t = self.trans(x)
+            # fq = self.FPN(vis, x_t)
+            # b, 1024
+            loss1 = self.IT_loss(image, x)
+            loss = loss1
+            fv = None
+            ft = x
+            fi = image
+        elif stage == '4th':
+            '''
+                img: b, 3, h, w
+                word: b, words
+                word_mask: b, words
+                mask: b, 1, h, w
+                stage: 1st or 2nd stage
+            '''
+            # padding mask used in decoder
+            pad_mask = torch.zeros_like(txt).masked_fill_(txt == 0, 1).bool()
+            # vis: C3 / C4 / C5 / b, 512, 28, 28/ b, 1024, 14, 14/ b, 1024, 7, 7
+            # word: b, length, 512
+            # text: b, 1024
+            # image: b, 1024
+            vis, image = self.backbone.encode_image(img)
+            word, text = self.backbone.encode_text(txt)
+            # x = self.ADP(image)
+            # ratio = 0.2
+            # x = ratio * x + (1 - ratio) * text
+            fq_t = self.FPN(vis, image)
+            fv_t = self.gap(fq_t)
+            ratio_1 = 0.2
+            # b, 1024
+            loss2 = self.IT_loss(fv_t, text)
+            loss = loss2
+            fv = fv_t
+            fi = None
+            ft = text
+        elif stage == '5th':
+            '''
+                img: b, 3, h, w
+                word: b, words
+                word_mask: b, words
+                mask: b, 1, h, w
+                stage: 1st or 2nd stage
+            '''
+            # padding mask used in decoder
+            pad_mask = torch.zeros_like(txt).masked_fill_(txt == 0, 1).bool()
+            # vis: C3 / C4 / C5 / b, 512, 28, 28/ b, 1024, 14, 14/ b, 1024, 7, 7
+            # word: b, length, 512
+            # text: b, 1024
+            # image: b, 1024
+            vis, image = self.backbone.encode_image(img)
+            word, text = self.backbone.encode_text(txt)
+            x = self.ADP(image)
+            ratio = 0.2
+            x = ratio * x + (1 - ratio) * image
+            y = self.ADP_t(text)
+            ratio_1 = 0.2
+            y = ratio * y + (1 - ratio_1) * text
+            fq_t = self.FPN(vis, image)
+            fv_t = self.gap(fq_t)
+            # b, 1024
+            loss2 = self.IT_loss(fv_t, y)
+            loss = loss2
+            fv = fv_t
+            fi = x
+            ft = y
+        return loss, fv, fi, ft
+class CISEN_lclip(nn.Module):
+    def __init__(self, cfg):
+        super().__init__()
+        # Vision & Text Encoder & Label Encoder
+        clip_model = torch.load(cfg.clip_pretrain,
+                                    map_location="cpu")
+        # print(type(clip_model))
+        backbone, image_resolution, vision_heads, embed_dim, vision_width, _ = build_lclip_model(clip_model, load_from_clip=True)
+        self.backbone = backbone.float()
+        cfg.input_size = image_resolution
+        cfg.heads = vision_heads // 32
+        cfg.emb_dim = vision_width
+        cfg.output_dim = embed_dim
+        # Multi-Modal FPN
+        self.FPN = FPN(cfg, in_channels=cfg.fpn_in, out_channels=cfg.fpn_out)
+        # Fined-grained Fusion
+        # self.FGFusion = TransformerDecoder(num_layers=cfg.num_layers,
+        #                                   d_model=cfg.vis_dim,
+        #                                   nhead=cfg.num_head,
+        #                                   dim_ffn=cfg.dim_ffn,
+        #                                   dropout=cfg.dropout,
+        #                                   return_intermediate=cfg.intermediate)
+        # image-text transformer
+        # self.trans = nn.Linear(1024, 1024)
+        self.ADP = Adapter(cfg.output_dim, 4)
+        self.gap = GAP((1,1))
+        # parameter
+        self.ratio = cfg.ratio
+        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
+        self.share_temperature = True
+        self.margin = 1
+        self.eps = 1e-3
+        self.ce = nn.CrossEntropyLoss()
+        #1st stage
+        self.lamda1 = cfg.lamda1
+        self.lamda2 = cfg.lamda2
+        self.avg = nn.AdaptiveAvgPool2d((1,1))
+        # self.fc = nn.Linear(512, cfg.num_classes)
+    def IT_loss(self, image_features, text_features):
+        # b, 1024 / b, 1024
+        batch = image_features.shape[0]
+        # # normalized features
+        image_features = image_features / image_features.norm(dim=-1,
+                                                              keepdim=True)
+        text_features = text_features / text_features.norm(dim=-1,
+                                                              keepdim=True)
+        # cosine similarity as logits
+        logit_scale = self.logit_scale.exp()
+        logits_per_image = logit_scale * image_features @ text_features.t()
+        logits_per_text = logits_per_image.t()
+        # shape = [global_batch_size, global_batch_size]
+        contrastive_labels = torch.arange(batch).to(logits_per_image.device)
+        contrastive_loss = (self.ce(logits_per_image, contrastive_labels) + self.ce(logits_per_text, contrastive_labels)) * 0.5
+        return contrastive_loss
+    def forward(self, img, txt, stage):
+        if stage == '1st':
+            '''
+                img: b, 3, h, w
+                word: b, words
+                word_mask: b, words
+                mask: b, 1, h, w
+                stage: 1st or 2nd stage
+            '''
+            # padding mask used in decoder
+            pad_mask = torch.zeros_like(txt).masked_fill_(txt == 0, 1).bool()
+            # vis: C3 / C4 / C5 / b, 512, 28, 28/ b, 1024, 14, 14/ b, 1024, 7, 7
+            # word: b, length, 512
+            # text: b, 1024
+            # image: b, 1024
+            vis, image = self.backbone.encode_image(img)
+            text = self.backbone.encode_text(txt)
+            x = self.ADP(image)
+            x = self.ratio * x + (1-self.ratio) * image
+            # b, 1024
+            # fq_t = self.FPN(vis, x)
+            #
+            # fv_t = self.gap(fq_t)
+            loss1 = self.IT_loss(x, text)
+            loss = loss1
+            ft = text
+            fi = x
+            fv = None
+        elif stage == '2nd':
+            '''
+                img: b, 3, h, w
+                word: b, words
+                word_mask: b, words
+                mask: b, 1, h, w
+                stage: 1st or 2nd stage
+            '''
+            # padding mask used in decoder
+            pad_mask = torch.zeros_like(txt).masked_fill_(txt == 0, 1).bool()
+            # vis: C3 / C4 / C5 / b, 512, 28, 28/ b, 1024, 14, 14/ b, 1024, 7, 7
+            # word: b, length, 512
+            # text: b, 1024
+            # image: b, 1024
+            vis, image = self.backbone.encode_image(img)
+            word, text = self.backbone.encode_text(txt)
+            x = self.ADP(image)
+            x = self.ratio * x + (1 - self.ratio) * image
+            # x_t = self.trans(x)
+            # fq = self.FPN(vis, x_t)
+            fq_t = self.FPN(vis, x)
+            fv_t = self.gap(fq_t)
+            # b, 1024
+            loss2 = self.IT_loss(fv_t, text)
+            loss = (loss2)
+            fv = fv_t
+            ft = text
+            fi = x
+        elif stage == '3rd':
+            '''
+                img: b, 3, h, w
+                word: b, words
+                word_mask: b, words
+                mask: b, 1, h, w
+                stage: 1st or 2nd stage
+            '''
+            # padding mask used in decoder
+            pad_mask = torch.zeros_like(txt).masked_fill_(txt == 0, 1).bool()
+            # vis: C3 / C4 / C5 / b, 512, 28, 28/ b, 1024, 14, 14/ b, 1024, 7, 7
+            # word: b, length, 512
+            # text: b, 1024
+            # image: b, 1024
+            vis, image = self.backbone.encode_image(img)
+            text = self.backbone.encode_text(txt)
+            x = self.ADP(text)
+            ratio = 0.2
+            x = ratio * x + (1 - ratio) * text
+            # x_t = self.trans(x)
+            # fq = self.FPN(vis, x_t)
+            # b, 1024
+            loss1 = self.IT_loss(image, x)
+            loss = loss1
+            fv = None
+            ft = x
+            fi = image
+        elif stage == '4th':
+            '''
+                img: b, 3, h, w
+                word: b, words
+                word_mask: b, words
+                mask: b, 1, h, w
+                stage: 1st or 2nd stage
+            '''
+            # padding mask used in decoder
+            pad_mask = torch.zeros_like(txt).masked_fill_(txt == 0, 1).bool()
+            # vis: C3 / C4 / C5 / b, 512, 28, 28/ b, 1024, 14, 14/ b, 1024, 7, 7
+            # word: b, length, 512
+            # text: b, 1024
+            # image: b, 1024
+            vis, image = self.backbone.encode_image(img)
+            word, text = self.backbone.encode_text(txt)
+            # x = self.ADP(image)
+            # ratio = 0.2
+            # x = ratio * x + (1 - ratio) * text
+            fq_t = self.FPN(vis, image)
+            fv_t = self.gap(fq_t)
+            ratio_1 = 0.2
+            # b, 1024
+            loss2 = self.IT_loss(fv_t, text)
+            loss = loss2
+            fv = fv_t
+            fi = None
+            ft = text
+        elif stage == '5th':
+            '''
+                img: b, 3, h, w
+                word: b, words
+                word_mask: b, words
+                mask: b, 1, h, w
+                stage: 1st or 2nd stage
+            '''
+            # padding mask used in decoder
+            pad_mask = torch.zeros_like(txt).masked_fill_(txt == 0, 1).bool()
+            # vis: C3 / C4 / C5 / b, 512, 28, 28/ b, 1024, 14, 14/ b, 1024, 7, 7
+            # word: b, length, 512
+            # text: b, 1024
+            # image: b, 1024
+            vis, image = self.backbone.encode_image(img)
+            word, text = self.backbone.encode_text(txt)
+            x = self.ADP(image)
+            ratio = 0.2
+            x = ratio * x + (1 - ratio) * image
+            y = self.ADP_t(text)
+            ratio_1 = 0.2
+            y = ratio * y + (1 - ratio_1) * text
+            fq_t = self.FPN(vis, image)
+            fv_t = self.gap(fq_t)
+            # b, 1024
+            loss2 = self.IT_loss(fv_t, y)
+            loss = loss2
+            fv = fv_t
+            fi = x
+            ft = y
+        return loss, fv, fi, ft
+class GeoRSCLIP(nn.Module):
+    def __init__(self, cfg):
+        super().__init__()
+        # Vision & Text Encoder & Label Encoder
+        clip_model = torch.load(cfg.clip_pretrain,
+                                    map_location="cpu")
+        backbone, image_resolution, vision_heads, embed_dim, vision_width, patch_size  = build_model(clip_model, cfg.word_len)
+        self.backbone = backbone.float()
+    def forward(self, img, txt, stage):
+        pad_mask = torch.zeros_like(txt).masked_fill_(txt == 0, 1).bool()
+        # vis: C3 / C4 / C5 / b, 512, 28, 28/ b, 1024, 14, 14/ b, 1024, 7, 7
+        # word: b, length, 512
+        # text: b, 1024
+        # image: b, 1024
+        vis, image = self.backbone.encode_image(img)
+        word, text = self.backbone.encode_text(txt)
+        loss = None
+        ft = text
+        fi = image
+        fv = None
+        return loss, fv, fi, ft
+class CISEN(nn.Module):
+    def __init__(self, cfg):
+        super().__init__()
+        # Vision & Text Encoder & Label Encoder
+        clip_model = torch.jit.load(cfg.clip_pretrain,
+                                    map_location="cpu").eval()
+        self.backbone = build_model(clip_model.state_dict(), cfg.word_len).float()
+        # Multi-Modal FPN
+        self.FPN = FPN(cfg, in_channels=cfg.fpn_in, out_channels=cfg.fpn_out)
+        # Fined-grained Fusion
+        self.FGFusion = TransformerDecoder(num_layers=cfg.num_layers,
+                                          d_model=cfg.vis_dim,
+                                          nhead=cfg.num_head,
+                                          dim_ffn=cfg.dim_ffn,
+                                          dropout=cfg.dropout,
+                                          return_intermediate=cfg.intermediate)
+        # adaptively aggretation
+        self.ASFF = AdaptiveSpatialFeatureFusion(cfg, in_channels=cfg.fpn_in, out_channels=cfg.fpn_out)
+        # text projector
+        self.projT = Text_Projector(cfg, in_channels=cfg.fpn_in, out_channels=cfg.fpn_out)
+        # image projector
+        # self.projI = Image_Projector(in_channels=cfg.fpn_in, out_channels=cfg.fpn_out)
+        # parameter
+        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
+        self.multi_label_logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
+        self.share_temperature = True
+        self.margin = 1
+        self.eps = 1e-3
+        self.ce = nn.CrossEntropyLoss()
+        #1st stage
+        self.lamda1 = cfg.lamda1
+        self.lamda2 = cfg.lamda2
+        self.beta1 = cfg.beta1
+        self.beta2 = cfg.beta2
+        self.avg = nn.AdaptiveAvgPool2d((1,1))
+        # self.fc = nn.Linear(512, cfg.num_classes)
+        #2nd stage
+        self.pos_samples = cfg.pos_samples
+        self.neg_samples = cfg.neg_samples
+    def IT_loss(self, image_features, text_features):
+        # b, 1024 / b, 1024
+        batch = image_features.shape[0]
+        # # normalized features
+        image_features = image_features / image_features.norm(dim=-1,
+                                                              keepdim=True)
+        text_features = text_features / text_features.norm(dim=-1,
+                                                              keepdim=True)
+        # cosine similarity as logits
+        logit_scale = self.logit_scale.exp()
+        logits_per_image = logit_scale * image_features @ text_features.t()
+        logits_per_text = logits_per_image.t()
+        # shape = [global_batch_size, global_batch_size]
+        contrastive_labels = torch.arange(batch).to(logits_per_image.device)
+        contrastive_loss = (self.ce(logits_per_image, contrastive_labels) + self.ce(logits_per_text, contrastive_labels)) * 0.5
+        return contrastive_loss
+    def IET_loss(self, image_features, text_features, pos_samples, beta):
+        # b, 1024 / b, 1024
+        # # normalized features
+        image_features = [image_feature / image_feature.norm(dim=-1,
+                                                              keepdim=True) for image_feature in image_features]
+        text_features = text_features / text_features.norm(dim=-1,
+                                                              keepdim=True)
+        # cosine similarity as logits
+        logit_scale = self.logit_scale.exp()
+        # logits_per_image = [logit_scale * image_feature @ text_features.t() for image_feature in image_features]
+        logits_per_image = [logit_scale * torch.sum(torch.mul(image_feature, text_features),1) for image_feature in image_features]
+        logits_per_image = torch.stack(logits_per_image).t()
+        b = logits_per_image.shape[0]
+        loss1 = torch.norm(text_features - image_features[0])
+        positive_tagsT = torch.zeros(b,len(image_features)).to(text_features.device)
+        negative_tagsT = torch.zeros(b,len(image_features)).to(text_features.device)
+        positive_tagsT[:, 0 : pos_samples + 1] = 1
+        negative_tagsT[:, pos_samples + 1 : -1] = 1
+        maskT = positive_tagsT.unsqueeze(1) * negative_tagsT.unsqueeze(-1)
+        pos_score_matT = logits_per_image * positive_tagsT
+        neg_score_matT = logits_per_image * negative_tagsT
+        IW_pos3T = pos_score_matT.unsqueeze(1)
+        IW_neg3T = neg_score_matT.unsqueeze(-1)
+        OT = 1 + IW_neg3T - IW_pos3T
+        O_maskT = maskT * OT
+        diffT = torch.clamp(O_maskT, 0)
+        violationT = torch.sign(diffT).sum(1).sum(1)
+        diffT = diffT.sum(1).sum(1)
+        lossT = torch.mean(diffT / (violationT + self.eps))
+        loss = beta * loss1 + lossT
+        return loss
+    def test_IET_loss(self, image_features, text_features, pos_samples, beta1, beta2):
+        # text_features: enhanced_features
+        # b, 1024 / b, 1024
+        # # normalized features
+        image_features = image_features / image_features.norm(dim=-1,
+                                                              keepdim=True)
+        text_features = text_features / text_features.norm(dim=-1,
+                                                              keepdim=True)
+        image_features = image_features.unsqueeze(1)
+        # cosine similarity as logits
+        logit_scale = self.logit_scale.exp()
+        # image_features = image_features.expand(-1, text_features.shape[1], -1)
+        logits_per_image = logit_scale * torch.matmul(image_features, text_features.transpose(1, 2))
+        logits_per_image = logits_per_image.squeeze(1)
+        # logits_per_image = logit_scale * image_features @ text_features.t()
+        # logits_per_image = [logit_scale * image_feature @ text_features.t() for image_feature in image_features]
+        b = logits_per_image.shape[0]
+        # loss1 = torch.norm(text_features[:, 0, :] - image_features.squeeze(1))
+        positive_tagsT = torch.zeros(b, text_features.shape[1]).to(text_features.device)
+        negative_tagsT = torch.zeros(b, text_features.shape[1]).to(text_features.device)
+        positive_tagsT[:, 0 : pos_samples + 1] = 1
+        negative_tagsT[:, pos_samples + 1 : -1] = 1
+        maskT = positive_tagsT.unsqueeze(1) * negative_tagsT.unsqueeze(-1)
+        pos_score_matT = logits_per_image * positive_tagsT
+        neg_score_matT = logits_per_image * negative_tagsT
+        IW_pos3T = pos_score_matT.unsqueeze(1)
+        IW_neg3T = neg_score_matT.unsqueeze(-1)
+        OT = 1 + IW_neg3T - IW_pos3T
+        O_maskT = maskT * OT
+        diffT = torch.clamp(O_maskT, 0)
+        violationT = torch.sign(diffT).sum(1).sum(1)
+        diffT = diffT.sum(1).sum(1)
+        lossT = torch.mean(diffT / (violationT + self.eps))
+        # loss = beta1 * loss1 + beta2 * lossT
+        loss = lossT
+        return loss
+    def test_IT_loss(self, image_features, text_features):
+        # b, 1024 / b, 1024
+        batch = image_features.shape[0]
+        # # normalized features
+        image_features = image_features / image_features.norm(dim=-1,
+                                                              keepdim=True)
+        text_features = text_features / text_features.norm(dim=-1,
+                                                              keepdim=True)
+        image_features = image_features.unsqueeze(1)
+        # cosine similarity as logits
+        logit_scale = self.logit_scale.exp()
+        logits_per_image = logit_scale * torch.matmul(image_features, text_features.transpose(1, 2))
+        logits_per_image = logits_per_image.squeeze(1)
+        # shape = [global_batch_size, global_batch_size]
+        contrastive_labels = torch.arange(batch).to(logits_per_image.device)
+        contrastive_loss = self.ce(logits_per_image, contrastive_labels)
+        return contrastive_loss
+    def test_forward(self, img, txt):
+        '''
+            img: b, 3, h, w
+            word: b, words
+            word_mask: b, words
+            mask: b, 1, h, w
+            stage: 1st or 2nd stage
+        '''
+        # padding mask used in decoder
+        pad_mask = torch.zeros_like(txt).masked_fill_(txt == 0, 1).bool()
+        # vis: C3 / C4 / C5 / b, 512, 28, 28/ b, 1024, 14, 14/ b, 1024, 7, 7
+        # word: b, length, 512
+        # state: b, 1024
+        # image: b, 512
+        vis, image = self.backbone.encode_image(img)
+        word, text = self.backbone.encode_text(txt)
+        fq = self.FPN(vis, text)
+        b, c, h, w = fq.size()
+        # b, 512, 14, 14
+        ff = self.FGFusion(fq, word, pad_mask)
+        ff = ff.reshape(b, c, h, w)
+        f2 = self.avg(ff)
+        fi = image.unsqueeze(-1).unsqueeze(-1)
+        fv = self.ASFF(fi, f2)
+        fi = fi.squeeze(-1).squeeze(-1)
+        # b, 1024
+        ft = self.projT(text)
+        loss1 = self.IT_loss(fi, ft)
+        loss2 = self.IT_loss(fv, ft)
+        loss = self.lamda1 * loss1 + self.lamda2 * loss2
+        return loss, fv, ft, fi
+    def forward(self, img, txt, stage):
+        if stage == '1st':
+            '''
+                img: b, 3, h, w
+                word: b, words
+                word_mask: b, words
+                mask: b, 1, h, w
+                stage: 1st or 2nd stage
+            '''
+            # padding mask used in decoder
+            pad_mask = torch.zeros_like(txt).masked_fill_(txt == 0, 1).bool()
+            # vis: C3 / C4 / C5 / b, 512, 28, 28/ b, 1024, 14, 14/ b, 1024, 7, 7
+            # word: b, length, 512
+            # state: b, 1024
+            # image: b, 512
+            vis, image = self.backbone.encode_image(img)
+            word, text = self.backbone.encode_text(txt)
+            fq = self.FPN(vis, text)
+            b, c, h, w = fq.size()
+            # b, 512, 14, 14
+            ff = self.FGFusion(fq, word, pad_mask)
+            ff = ff.reshape(b, c, h, w)
+            f2 = self.avg(ff)
+            fi = image.unsqueeze(-1).unsqueeze(-1)
+            fv = self.ASFF(fi, f2)
+            fi = fi.squeeze(-1).squeeze(-1)
+            # b, 1024
+            ft = self.projT(text)
+            loss1 = self.IT_loss(fi, ft)
+            loss2 = self.IT_loss(fv, ft)
+            loss = self.lamda1 * loss1 + self.lamda2 * loss2
+        elif stage == '2nd':
+            """
+                txt: b, num, words
+                img: b, 3, h, w
+            """
+            # txt = b * num, word
+            b, num, l = txt.shape[0], txt.shape[1], txt.shape[2]
+            txt = txt.view(-1, txt.size(-1))
+            # padding mask used in decoder
+            pad_mask = torch.zeros_like(txt).masked_fill_(txt == 0, 1).bool()
+            b = img.shape[0]
+            vis, image = self.backbone.encode_image(img)
+            word, text = self.backbone.encode_text(txt)
+            fq = self.FPN(vis, text)
+            # b, 512, 14, 14 (C4)
+            b, c, h, w = fq.size()
+            # b, 512, 14, 14
+            ff = self.FGFusion(fq, word, pad_mask)
+            ff = ff.reshape(b, c, h, w)
+            f2 = self.avg(ff)
+            fi = image.unsqueeze(-1).unsqueeze(-1)
+            fi_ = fi.repeat(int(f2.shape[0] / fi.shape[0]), 1, 1, 1)
+            fv = self.ASFF(fi_, f2)
+            fi = fi.squeeze(-1).squeeze(-1)
+            # fi_ = fi_.squeeze(-1).squeeze(-1)
+            # b, 1024
+            ft = text.view(img.shape[0], int(text.shape[0] / img.shape[0]), -1)[:, 0, :]
+            fv = fv.view(ft.shape[0], int(text.shape[0] / ft.shape[0]), fv.shape[1])
+            loss = self.test_IET_loss(fi, fv, self.pos_samples, self.beta1, self.beta2)
+        elif stage == 'test':
+            """
+                            txt: b, num, words
+                            img: b, 3, h, w
+                        """
+            txt = txt.permute(1, 0, 2)
+            # txt = b * num, word
+            # txt = txt.view(-1, txt.size(-1))
+            # padding mask used in decoder
+            pad_mask = torch.zeros_like(txt).masked_fill_(txt == 0, 1).bool()
+            # vis: C3 / C4 / C5 / b, 512, 28, 28/ b, 1024, 14, 14/ b, 1024, 7, 7
+            # word: b, length, 512
+            # state: b, 1024
+            # image: b, 512
+            b = img.shape[0]
+            words = []
+            texts = []
+            vis, image = self.backbone.encode_image(img)
+            for i in range(txt.shape[0]):
+                word, text = self.backbone.encode_text(txt[i])
+                words.append(word)
+                texts.append(text)
+            fvn = []
+            # b, 512, 14, 14 (C4)
+            for i in range(txt.shape[0]):
+                fq = self.FPN(vis, texts[i])
+                b, c, h, w = fq.size()
+                # b, 512, 14, 14
+                ff = self.FGFusion(fq, words[i], pad_mask[i, :, :])
+                ff = ff.reshape(b, c, h, w)
+                f2 = self.avg(ff)
+                fi = image.unsqueeze(-1).unsqueeze(-1)
+                fv = self.ASFF(fi, f2)
+                fi = fi.squeeze(-1).squeeze(-1)
+                fvn.append(fv)
+            # b, 1024
+            ft = self.projT(texts[0])
+            loss = self.IET_loss(fvn, ft, self.pos_samples, self.beta)
+            fv = fvn
+        else:
+            print('stage should be either 1st or 2nd or test')
+        # labels = torch.ones(image.shape[0], image.shape[0]).to(image.device)
+        # labels[:,-1] = 0
+        # labels[3, :] = 0
+        # out = self.avg(fq)
+        # out = out.squeeze(-1).squeeze(-1)
+        # out = self.fc(out)
+        return loss, fv, fi, ft
+class CRIS(nn.Module):
+    def __init__(self, cfg):
+        super().__init__()
+        # Vision & Text Encoder & Label Encoder
+        clip_model = torch.jit.load(cfg.clip_pretrain,
+                                    map_location="cpu").eval()
+        self.backbone, _, _, _, _ = build_model(clip_model.state_dict(), cfg.word_len)
+        self.backbone = self.backbone.float()
+        self.Label_encoder = build_promptlearner(clip_model.state_dict()).float()
+        self.Label_encoder.init_label_emb(cfg.label_path)
+        # Multi-Modal FPN
+        self.FPN = FPN(in_channels=cfg.fpn_in, out_channels=cfg.fpn_out)
+        # Fined-grained Fusion
+        self.FGFusion = TransformerDecoder(num_layers=cfg.num_layers,
+                                          d_model=cfg.vis_dim,
+                                          nhead=cfg.num_head,
+                                          dim_ffn=cfg.dim_ffn,
+                                          dropout=cfg.dropout,
+                                          return_intermediate=cfg.intermediate)
+        # adaptively aggretation
+        self.ASFF = AdaptiveSpatialFeatureFusion(in_channels=cfg.fpn_in, out_channels=cfg.fpn_out)
+        # text projector
+        self.projT = Text_Projector(in_channels=cfg.fpn_in, out_channels=cfg.fpn_out)
+        # parameter
+        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
+        self.multi_label_logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
+        self.share_temperature = True
+        self.margin = 1
+        self.eps = 1e-3
+        self.ce = nn.CrossEntropyLoss()
+        self.avg = nn.AdaptiveAvgPool2d((1,1))
+        self.fc = nn.Linear(512, cfg.num_classes)
+    def IT_loss(self, image_features, text_features):
+        # b, 1024 / b, 1024
+        batch = image_features.shape[0]
+        # # normalized features
+        image_features = image_features / image_features.norm(dim=-1,
+                                                              keepdim=True)
+        text_features = text_features / text_features.norm(dim=-1,
+                                                              keepdim=True)
+        # cosine similarity as logits
+        logit_scale = self.logit_scale.exp()
+        logits_per_image = logit_scale * image_features @ text_features.t()
+        logits_per_text = logits_per_image.t()
+        # shape = [global_batch_size, global_batch_size]
+        contrastive_labels = torch.arange(batch).to(logits_per_image.device)
+        contrastive_loss = (self.ce(logits_per_image, contrastive_labels) + self.ce(logits_per_text, contrastive_labels)) * 0.5
+        return contrastive_loss
+    def IL_loss(self, image_features, label_features, labels):
+        # b, 1024 / K, 1024/ b, K
+        positive_tagsT = torch.clamp(labels,0.,1.)
+        negative_tagsT = torch.clamp(-labels,0.,1.)
+        maskT = positive_tagsT.unsqueeze(1) * negative_tagsT.unsqueeze(-1)
+        # normalized features
+        image_features = image_features / image_features.norm(dim=-1,
+                                                              keepdim=True)
+        label_features = label_features / label_features.norm(dim=-1,
+                                                           keepdim=True)
+        # cosine similarity as logits
+        logit_scale = self.multi_label_logit_scale.exp()
+        logits_per_image = logit_scale * image_features @ label_features.t()
+        # logits_per_label = logit_scale * label_features @ image_features.t()
+        pos_score_matT = logits_per_image * positive_tagsT
+        neg_score_matT = logits_per_image * negative_tagsT
+        IW_pos3T = pos_score_matT.unsqueeze(1)
+        IW_neg3T = neg_score_matT.unsqueeze(-1)
+        OT = self.margin + IW_neg3T - IW_pos3T
+        O_maskT = maskT * OT
+        diffT = torch.clamp(O_maskT, 0)
+        violationT = torch.sign(diffT).sum(1).sum(1)
+        diffT = diffT.sum(1).sum(1)
+        lossT = torch.mean(diffT / (violationT + self.eps))
+        return lossT
+    def margin_loss(self, image_features, label_features, labels):
+        # b, 1024 / K, 1024/ b, K
+        # normalized features
+        image_features = image_features / image_features.norm(dim=-1,
+                                                              keepdim=True)
+        label_features = label_features / label_features.norm(dim=-1,
+                                                           keepdim=True)
+        # cosine similarity as logits
+        logit_scale = self.multi_label_logit_scale.exp()
+        logits_per_image = logit_scale * image_features @ label_features.t()
+        # logits_per_label = logit_scale * label_features @ image_features.t()
+        image_label_positive_pairs = logits_per_image * labels
+        image_label_mean_positive = image_label_positive_pairs.sum() / labels.sum()
+        image_label_negative_pairs = logits_per_image * (1 - labels)
+        image_label_mean_negative = image_label_negative_pairs.sum() / (logits_per_image.numel() - labels.sum() + self.eps)
+        contrastive_loss = torch.relu(self.margin - image_label_mean_positive + image_label_mean_negative)
+        return contrastive_loss
+    def forward(self, img, txt, target=None):
+        '''
+            img: b, 3, h, w
+            word: b, words
+            word_mask: b, words
+            mask: b, 1, h, w
+        '''
+        # padding mask used in decoder
+        pad_mask = torch.zeros_like(txt).masked_fill_(txt == 0, 1).bool()
+        # vis: C3 / C4 / C5 / b, 512, 28, 28/ b, 1024, 14, 14/ b, 1024, 7, 7
+        # word: b, length, 512
+        # state: b, 1024
+        # image: b, 512
+        vis, image = self.backbone.encode_image(img)
+        word, text = self.backbone.encode_text(txt)
+        fl = self.Label_encoder(image.device)
+        # b, 512, 14, 14 (C4)
+        fq = self.FPN(vis, text)
+        b, c, h, w = fq.size()
+        # b, 512, 14, 14
+        ff = self.FGFusion(fq, word, pad_mask)
+        # b, 512, 196
+        ff = ff.reshape(b, c, h, w)
+        f2 = self.avg(ff)
+        # b, 1024
+        f1 = image.unsqueeze(-1).unsqueeze(-1)
+        fv = self.ASFF(f1, f2)
+        # b, 1024
+        ft = self.projT(text)
+        # labels = torch.ones(image.shape[0], image.shape[0]).to(image.device)
+        # labels[:,-1] = 0
+        # labels[3, :] = 0
+        loss1 = self.IT_loss(fv, ft)
+        loss2 = self.IL_loss(fv, fl, target)
+        loss = loss1 + loss2
+        # out = self.avg(fq)
+        # out = out.squeeze(-1).squeeze(-1)
+        # out = self.fc(out)
+        return loss, fv, ft, fl
+class zh_clip(nn.Module):
+    def __init__(self, cfg):
+        super().__init__()
+        # Vision & Text Encoder
+        clip_model = torch.jit.load(cfg.clip_pretrain,
+                                    map_location="cpu").eval()
+        self.backbone = build_modified_model(clip_model.state_dict(), cfg.word_len).float()
+        self.text_encoder = AutoModelForSequenceClassification.from_pretrained(cfg.chinese)
+        self.text_lin = nn.Linear(512, 1024)
+        # Multi-Modal FPN
+        self.neck = ViTFPN(in_channels=cfg.fpn_in, out_channels=cfg.fpn_out)
+        # Decoder
+        self.avg = nn.AdaptiveAvgPool2d((1,1))
+        self.fc = nn.Linear(512, cfg.num_classes)
+    def forward(self, img, word):
+        '''
+            img: b, 3, h, w
+            word: b, words
+            word_mask: b, words
+            mask: b, 1, h, w
+        '''
+        # padding mask used in decoder
+        # vis:  v1 / v2 / b, 49, 1024/ b, 196, 512
+        # state: b, 1024
+        # feat: f1 / f2 / b, 1024, 7, 7/ b, 1024, 7, 7
+        # cls: c1 / c2 / b, 1024/ b, 512
+        vis, feat, cls = self.backbone.encode_image(img)
+        state = self.text_encoder(word.squeeze(1)).logits
+        state = self.text_lin(state)
+        # b, 1024, 7, 7 (C5)
+        fq = self.neck(feat, state)
+        out = self.avg(fq)
+        out = out.squeeze(-1).squeeze(-1)
+        out = self.fc(out)
+        return out
+class poi_clip(nn.Module):
+    def __init__(self, cfg):
+        super().__init__()
+        # Vision & Text Encoder
+        clip_model = torch.jit.load(cfg.clip_pretrain,
+                                    map_location="cpu").eval()
+        self.backbone = build_modified_model(clip_model.state_dict(), cfg.word_len).float()
+        self.text_encoder = AutoModelForSequenceClassification.from_pretrained(cfg.chinese)
+        self.text_lin = nn.Linear(512, 1024)
+        # Multi-Modal FPN
+        self.neck = ViTFPN(in_channels=cfg.fpn_in, out_channels=cfg.fpn_out)
+        # Decoder
+        self.avg = nn.AdaptiveAvgPool2d((1,1))
+        self.fc = nn.Linear(512, cfg.num_classes)
+    def forward(self, img, word):
+        '''
+            img: b, 3, h, w
+            word: b, words
+            word_mask: b, words
+            mask: b, 1, h, w
+        '''
+        # padding mask used in decoder
+        # vis:  v1 / v2 / b, 49, 1024/ b, 196, 512
+        # state: b, 1024
+        # feat: f1 / f2 / b, 1024, 7, 7/ b, 1024, 7, 7
+        # cls: c1 / c2 / b, 1024/ b, 512
+        vis, feat, cls = self.backbone.encode_image(img)
+        state = self.text_encoder(word.squeeze(1)).logits
+        state = self.text_lin(state)
+        # b, 1024, 7, 7 (C5)
+        fq = self.neck(feat, state)
+        out = self.avg(fq)
+        out = out.squeeze(-1).squeeze(-1)
+        out = self.fc(out)
+        return out
+class Clip_hash_model(nn.Module):
+    def __init__(self, cfg):
+        super().__init__()
+        # Vision & Text Encoder
+        clip_model = torch.jit.load(cfg.clip_pretrain,
+                                    map_location="cpu").eval()
+        self.backbone = build_model(clip_model.state_dict(), cfg.word_len).float()
+        # Multi-Modal FPN
+        self.neck = FPN(in_channels=cfg.fpn_in, out_channels=cfg.fpn_out)
+        # Decoder
+        self.avg = nn.AdaptiveAvgPool2d((1, 1))
+        self.classifier = nn.Sequential(
+            nn.Linear(cfg.fpn_out[1], cfg.hash_dim, bias=True),
+            nn.Tanh(),
+        )
+        self.classifier2 = nn.Sequential(
+            nn.Linear(cfg.hash_dim, cfg.num_classes)
+        )
+        # Hash Module
+        self.image_module = nn.Sequential(
+            nn.Linear(cfg.img_dim, cfg.hidden_dim, bias=True),
+            nn.BatchNorm1d(cfg.hidden_dim),
+            nn.ReLU(True),
+            nn.Linear(cfg.hidden_dim, cfg.hash_dim, bias=True),
+            nn.Tanh()
+        )
+        self.text_module = nn.Sequential(
+            nn.Linear(cfg.txt_dim, cfg.hidden_dim, bias=True),
+            nn.BatchNorm1d(cfg.hidden_dim),
+            nn.ReLU(True),
+            nn.Linear(cfg.hidden_dim, cfg.hash_dim, bias=True),
+            nn.Tanh()
+        )
+    def forward(self, img, word, mask=None):
+        '''
+            img: b, 3, h, w
+            word: b, words
+            word_mask: b, words
+        '''
+        pad_mask = torch.zeros_like(word).masked_fill_(word == 0, 1).bool()
+        # vis: C3 / C4 / C5
+        # word: b, length, 512
+        # state: b, 1024
+        vis, image = self.backbone.encode_image(img)
+        word, state = self.backbone.encode_text(word)
+        # b, 512, 26, 26 (C4)
+        fq = self.neck(vis, state)
+        # out_hash: b, code_length
+        # res: b, classes
+        out = self.avg(fq)
+        out = out.squeeze(-1).squeeze(-1)
+        out_hash = self.classifier(out)
+        res = self.classifier2(out_hash)
+        # img_hash: b, code_length
+        # txt_hash: b, code_length
+        img_hash = self.image_module(image)
+        txt_hash = self.text_module(state)
+        return img_hash, txt_hash, out_hash, res
+class Clip_model(nn.Module):
+    def __init__(self, cfg):
+        super().__init__()
+        # Vision & Text Encoder
+        clip_model = torch.jit.load(cfg.clip_pretrain,
+                                    map_location="cpu").eval()
+        self.neck = FPN(in_channels=cfg.fpn_in, out_channels=cfg.fpn_out)
+        self.avg = nn.AdaptiveAvgPool2d((1, 1))
+        self.backbone = build_model(clip_model.state_dict(), cfg.word_len).float()
+    def forward(self, img, word, mask=None):
+        '''
+            img: b, 3, h, w
+            word: b, words
+            word_mask: b, words
+        '''
+        # vis: C3 / C4 / C5
+        # word: b, length, 512
+        # state: b, 1024
+        pad_mask = torch.zeros_like(word).masked_fill_(word == 0, 1).bool()
+        vis, image = self.backbone.encode_image(img)
+        word, state = self.backbone.encode_text(word)
+        f = self.neck(vis, state)
+        out = self.avg(f)
+        out = out.squeeze(-1).squeeze(-1)
+        image_features = image / image.norm(dim=-1, keepdim=True)
+        text_features = state / state.norm(dim=-1, keepdim=True)
+        # cosine similarity as logits
+        logit_scale = self.backbone.logit_scale.exp()
+        logits_per_image = logit_scale * image_features @ text_features.t()
+        logits_per_text = logits_per_image.t()
+        # shape = [global_batch_size, global_batch_size]
+        return logits_per_image, logits_per_text
+class CISEN_rsvit_hug(nn.Module, PyTorchModelHubMixin):
+    def __init__(self, embed_dim, image_resolution, vision_layers, vision_width,
+                 vision_patch_size, context_length, txt_length, vocab_size,
+                 transformer_width, transformer_heads, transformer_layers, patch_size,
+                 output_dim, ratio, emb_dim, fpn_in, fpn_out):
+        super().__init__()
+        # Vision & Text Encoder & Label Encoder
+        vision_heads = vision_width * 32 // 64
+        backbone = CLIP(embed_dim, image_resolution, vision_layers, vision_width,
+                 vision_patch_size, context_length, txt_length, vocab_size,
+                 transformer_width, transformer_heads, transformer_layers)
+        self.backbone = backbone.float()
+        self.patch_emb = image_resolution // patch_size
+        self.FPN = ViTFPN(image_resolution, in_channels=fpn_in, out_channels=fpn_out)
+        self.ADP = Adapter(output_dim, 4)
+        # parameter
+        self.ratio = ratio
+        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
+        self.share_temperature = True
+        self.ce = nn.CrossEntropyLoss()
+        self.ms_adaptor = nn.ModuleList(
+            [
+                nn.Sequential(
+                    nn.ConvTranspose2d(emb_dim, emb_dim, 2, 2),
+                    nn.GroupNorm(32, emb_dim),
+                    nn.GELU(),
+                    nn.ConvTranspose2d(emb_dim, emb_dim, 2, 2),
+                ),
+                nn.Sequential(
+                    nn.ConvTranspose2d(emb_dim, emb_dim, 2, 2),
+                ),
+                nn.Sequential(
+                    nn.Identity(),
+                ),
+                nn.Sequential(
+                    nn.MaxPool2d(2),
+                ),
+            ]
+        )
+        self.ms_adaptor.apply(self.init_adaptor)
+    def init_adaptor(self, m):
+        if isinstance(m, nn.Conv2d):
+            lecun_normal_(m.weight)
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.GroupNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+        elif isinstance(m, nn.ConvTranspose2d):
+            lecun_normal_(m.weight)
+            if m.bias is not None:
+                nn.init.zeros_(m.bias)
+        # self.fc = nn.Linear(512, cfg.num_classes)
+    def image_encode(self, img):
+        vis, image = self.backbone.encode_image(img)
+        x = self.ADP(image)
+        x = self.ratio * x + (1 - self.ratio) * image
+        return x
+    def text_encode(self, txt):
+        word, text = self.backbone.encode_text(txt)
+        return text
+    def forward(self, img, txt):
+        '''
+            img: b, 3, h, w
+            word: b, words
+            word_mask: b, words
+            mask: b, 1, h, w
+            stage: 1st or 2nd stage
+        '''
+        # padding mask used in decoder
+        pad_mask = torch.zeros_like(txt).masked_fill_(txt == 0, 1).bool()
+        # vis: C3 / C4 / C5 / b, 512, 28, 28/ b, 1024, 14, 14/ b, 1024, 7, 7
+        # word: b, length, 512
+        # text: b, 1024
+        # image: b, 1024
+        vis, image = self.backbone.encode_image(img)
+        word, text = self.backbone.encode_text(txt)
+        x = self.ADP(image)
+        x = self.ratio * x + (1 - self.ratio) * image
+        # Construct multi-scale feats
+        vis_trans = []
+        for i in range(len(self.ms_adaptor)):
+            x_ = rearrange(
+                vis[i],
+                "b (h w) c -> b c h w",
+                h=self.patch_emb,
+                w=self.patch_emb,
+            ).contiguous()
+            feats = self.ms_adaptor[i](x_)
+            vis_trans.append(feats)
+        # fq = self.FPN(vis, x_t)
+        fv_t = self.FPN(vis_trans[1:], x, False)
+        # fv_t = self.gap(fq_t)
+        # b, 1024
+        fv = fv_t
+        ft = text
+        fi = x
+        return fv, fi, ft

cisen/utils/__pycache__/config.cpython-38.pyc ADDED Viewed

Binary file (4.38 kB). View file

cisen/utils/__pycache__/dataset.cpython-38.pyc ADDED Viewed

Binary file (12.9 kB). View file

cisen/utils/bpe_simple_vocab_16e6.txt.gz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a
+size 1356917

cisen/utils/config.py ADDED Viewed

	@@ -0,0 +1,157 @@

+# -----------------------------------------------------------------------------
+# Functions for parsing args
+# -----------------------------------------------------------------------------
+import copy
+import os
+from ast import literal_eval
+import yaml
+class CfgNode(dict):
+    """
+    CfgNode represents an internal node in the configuration tree. It's a simple
+    dict-like container that allows for attribute-based access to keys.
+    """
+    def __init__(self, init_dict=None, key_list=None, new_allowed=False):
+        # Recursively convert nested dictionaries in init_dict into CfgNodes
+        init_dict = {} if init_dict is None else init_dict
+        key_list = [] if key_list is None else key_list
+        for k, v in init_dict.items():
+            if type(v) is dict:
+                # Convert dict to CfgNode
+                init_dict[k] = CfgNode(v, key_list=key_list + [k])
+        super(CfgNode, self).__init__(init_dict)
+    def __getattr__(self, name):
+        if name in self:
+            return self[name]
+        else:
+            raise AttributeError(name)
+    def __setattr__(self, name, value):
+        self[name] = value
+    def __str__(self):
+        def _indent(s_, num_spaces):
+            s = s_.split("\n")
+            if len(s) == 1:
+                return s_
+            first = s.pop(0)
+            s = [(num_spaces * " ") + line for line in s]
+            s = "\n".join(s)
+            s = first + "\n" + s
+            return s
+        r = ""
+        s = []
+        for k, v in sorted(self.items()):
+            seperator = "\n" if isinstance(v, CfgNode) else " "
+            attr_str = "{}:{}{}".format(str(k), seperator, str(v))
+            attr_str = _indent(attr_str, 2)
+            s.append(attr_str)
+        r += "\n".join(s)
+        return r
+    def __repr__(self):
+        return "{}({})".format(self.__class__.__name__,
+                               super(CfgNode, self).__repr__())
+def load_cfg_from_cfg_file(file):
+    cfg = {}
+    assert os.path.isfile(file) and file.endswith('.yaml'), \
+        '{} is not a yaml file'.format(file)
+    with open(file, 'r') as f:
+        cfg_from_file = yaml.safe_load(f)
+    for key in cfg_from_file:
+        for k, v in cfg_from_file[key].items():
+            cfg[k] = v
+    cfg = CfgNode(cfg)
+    return cfg
+def merge_cfg_from_list(cfg, cfg_list):
+    new_cfg = copy.deepcopy(cfg)
+    assert len(cfg_list) % 2 == 0
+    for full_key, v in zip(cfg_list[0::2], cfg_list[1::2]):
+        subkey = full_key.split('.')[-1]
+        assert subkey in cfg, 'Non-existent key: {}'.format(full_key)
+        value = _decode_cfg_value(v)
+        value = _check_and_coerce_cfg_value_type(value, cfg[subkey], subkey,
+                                                 full_key)
+        setattr(new_cfg, subkey, value)
+    return new_cfg
+def _decode_cfg_value(v):
+    """Decodes a raw config value (e.g., from a yaml config files or command
+    line argument) into a Python object.
+    """
+    # All remaining processing is only applied to strings
+    if not isinstance(v, str):
+        return v
+    # Try to interpret `v` as a:
+    #   string, number, tuple, list, dict, boolean, or None
+    try:
+        v = literal_eval(v)
+    # The following two excepts allow v to pass through when it represents a
+    # string.
+    #
+    # Longer explanation:
+    # The type of v is always a string (before calling literal_eval), but
+    # sometimes it *represents* a string and other times a data structure, like
+    # a list. In the case that v represents a string, what we got back from the
+    # yaml parser is 'foo' *without quotes* (so, not '"foo"'). literal_eval is
+    # ok with '"foo"', but will raise a ValueError if given 'foo'. In other
+    # cases, like paths (v = 'foo/bar' and not v = '"foo/bar"'), literal_eval
+    # will raise a SyntaxError.
+    except ValueError:
+        pass
+    except SyntaxError:
+        pass
+    return v
+def _check_and_coerce_cfg_value_type(replacement, original, key, full_key):
+    """Checks that `replacement`, which is intended to replace `original` is of
+    the right type. The type is correct if it matches exactly or is one of a few
+    cases in which the type can be easily coerced.
+    """
+    original_type = type(original)
+    replacement_type = type(replacement)
+    # The types must match (with some exceptions)
+    if replacement_type == original_type:
+        return replacement
+    # Cast replacement from from_type to to_type if the replacement and original
+    # types match from_type and to_type
+    def conditional_cast(from_type, to_type):
+        if replacement_type == from_type and original_type == to_type:
+            return True, to_type(replacement)
+        else:
+            return False, None
+    # Conditionally casts
+    # list <-> tuple
+    casts = [(tuple, list), (list, tuple)]
+    # For py2: allow converting from str (bytes) to a unicode string
+    try:
+        casts.append((str, unicode))  # noqa: F821
+    except Exception:
+        pass
+    for (from_type, to_type) in casts:
+        converted, converted_value = conditional_cast(from_type, to_type)
+        if converted:
+            return converted_value
+    raise ValueError(
+        "Type mismatch ({} vs. {}) with values ({} vs. {}) for config "
+        "key: {}".format(original_type, replacement_type, original,
+                         replacement, full_key))

cisen/utils/dataset.py ADDED Viewed

	@@ -0,0 +1,478 @@

+import os
+from typing import List, Union
+import random
+import json
+import numpy as np
+from PIL import Image
+import torch
+from torch.utils.data import Dataset
+from torchvision import transforms
+from loguru import logger
+from .simple_tokenizer import SimpleTokenizer as _Tokenizer
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+# Module-wide SimpleTokenizer instance used by tokenize() below.
+_tokenizer = _Tokenizer()
+# NOTE(review): `text_tokenize` is only created by the commented-out line below, yet
+# ModifiedDatasetDuplet.__getitem__ in this module still calls it.
+# text_tokenize = AutoTokenizer.from_pretrained("./Taiyi-CLIP-s", model_max_length=512)
+def tokenize(texts: Union[str, List[str]],
+             context_length: int = 77,
+             truncate: bool = False) -> torch.LongTensor:
+    """
+    Returns the tokenized representation of given input string(s)
+    Parameters
+    ----------
+    texts : Union[str, List[str]]
+        An input string or a list of input strings to tokenize
+    context_length : int
+        The context length to use; all CLIP models use 77 as the context length
+    truncate: bool
+        Whether to truncate the text in case its encoding is longer than the context length
+    Returns
+    -------
+    A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
+    """
+    if isinstance(texts, str):
+        texts = [texts]
+    sot_token = _tokenizer.encoder["<|startoftext|>"]
+    eot_token = _tokenizer.encoder["<|endoftext|>"]
+    all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token]
+                  for text in texts]
+    result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
+    for i, tokens in enumerate(all_tokens):
+        if len(tokens) > context_length:
+            if truncate:
+                tokens = tokens[:context_length]
+                tokens[-1] = eot_token
+            else:
+                raise RuntimeError(
+                    f"Input {texts[i]} is too long for context length {context_length}"
+                )
+        result[i, :len(tokens)] = torch.tensor(tokens)
+    return result
+def select_idxs(seq_length, n_to_select, n_from_select, seed=42):
+    """
+    Select n_to_select indexes from each consequent n_from_select indexes from range with length seq_length, split
+    selected indexes to separate arrays
+    Example:
+    seq_length = 20
+    n_from_select = 5
+    n_to_select = 2
+    input, range of length seq_length:
+    range = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
+    sequences of length n_from_select:
+    sequences = [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9], [10, 11, 12, 13, 14], [15, 16, 17, 18, 19]]
+    selected n_to_select elements from each sequence
+    selected = [[0, 4], [7, 9], [13, 14], [16, 18]]
+    output, n_to_select lists of length seq_length / n_from_select:
+    output = [[0, 7, 13, 16], [4, 9, 14, 18]]
+    :param seq_length: length of sequence, say 10
+    :param n_to_select: number of elements to select
+    :param n_from_select: number of consequent elements
+    :return:
+    """
+    random.seed(seed)
+    idxs = [[] for _ in range(n_to_select)]
+    for i in range(seq_length // n_from_select):
+        ints = random.sample(range(n_from_select), n_to_select)
+        for j in range(n_to_select):
+            idxs[j].append(i * n_from_select + ints[j])
+    return idxs
+def read_json(file_name, suppress_console_info=False):
+    """
+    Read JSON
+    :param file_name: input JSON path
+    :param suppress_console_info: toggle console printing
+    :return: dictionary from JSON
+    """
+    with open(file_name, 'r') as f:
+        data = json.load(f)
+        if not suppress_console_info:
+            print("Read from:", file_name)
+    return data
+def get_image_file_names(data, suppress_console_info=False):# ok
+    """
+    Get list of image file names
+    :param data: original data from JSON
+    :param suppress_console_info: toggle console printing
+    :return: list of strings (file names)
+    """
+    file_names = []
+    for img in data['images']:
+        image_name = img["image_name"]
+        sample_id = img["sample_id"]
+        path_data = f'{sample_id}/{image_name}'
+        file_names.append(path_data)
+    if not suppress_console_info:
+        print("Total number of files:", len(file_names))
+    return file_names
+def get_images(file_names, args):
+    transform = transforms.Compose([
+        transforms.Resize(224),
+        transforms.CenterCrop(224),
+        transforms.ToTensor(),
+        transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
+    ])
+    imgs = []
+    for i in range(len(file_names)):
+        img = np.array(transform(Image.open(os.path.join(args.imgs_folder, file_names[i]))))
+        imgs.append(img)
+    return np.array(imgs)
+def get_captions(data, suppress_console_info=False):
+    """
+    Get list of formatted captions
+    :param data: original data from JSON
+    :return: list of strings (captions)
+    """
+    def format_caption(string):
+        return string.replace('.', '').replace(',', '').replace('!', '').replace('?', '').lower()
+    captions = []
+    augmented_captions_rb = []
+    augmented_captions_bt_prob = []
+    augmented_captions_bt_chain = []
+    for img in data['images']:
+        for sent in img['sentences']:
+            captions.append(format_caption(sent['raw']))
+            try:
+                augmented_captions_rb.append(format_caption(sent['aug_rb']))
+            except:
+                pass
+            try:
+                augmented_captions_bt_prob.append(format_caption(sent['aug_bt_prob']))
+            except:
+                pass
+            try:
+                augmented_captions_bt_chain.append(format_caption(sent['aug_bt_chain']))
+            except:
+                pass
+    if not suppress_console_info:
+        logger.info("Total number of captions:{}", len(captions))
+        logger.info("Total number of augmented captions RB:{}", len(augmented_captions_rb))
+        logger.info("Total number of augmented captions BT (prob):{}", len(augmented_captions_bt_prob))
+        logger.info("Total number of augmented captions BT (chain):{}", len(augmented_captions_bt_chain))
+    return captions, augmented_captions_rb, augmented_captions_bt_prob, augmented_captions_bt_chain
+def get_labels(data, suppress_console_info=False):
+    """
+    Get list of labels
+    :param data: original data from JSON
+    :param suppress_console_info: toggle console printing
+    :return: list ints (labels)
+    """
+    labels = []
+    for img in data['images']:
+        labels.append(img["classcode"])
+    if not suppress_console_info:
+        print("Total number of labels:", len(labels))
+    return labels
+def remove_tokens(data):
+    """
+    Removes 'tokens' key from caption record, if exists; halves the size of the file
+    :param data: original data
+    :return: data without tokens
+    """
+    for img in data['images']:
+        for sent in img['sentences']:
+            try:
+                sent.pop("tokens")
+            except:
+                pass
+    return data
+def write_json(file_name, data):
+    """
+    Write dictionary to JSON file
+    :param file_name: output path
+    :param data: dictionary
+    :return: None
+    """
+    bn = os.path.basename(file_name)
+    dn = os.path.dirname(file_name)
+    name, ext = os.path.splitext(bn)
+    file_name = os.path.join(dn, name + '.json')
+    with open(file_name, 'w') as f:
+        f.write(json.dumps(data, indent='\t'))
+    print("Written to:", file_name)
+def get_split_idxs(arr_len, args):
+    """
+    Get indexes for training, query and db subsets
+    :param: arr_len: array length
+    :return: indexes for training, query and db subsets
+    """
+    idx_all = list(range(arr_len))
+    idx_train, idx_eval = split_indexes(idx_all, args.dataset_train_split)
+    idx_query, idx_db = split_indexes(idx_eval, args.dataset_query_split)
+    return idx_train, idx_eval, idx_query, idx_db
+def split_indexes(idx_all, split):
+    """
+    Splits list in two parts.
+    :param idx_all: array to split
+    :param split: portion to split
+    :return: splitted lists
+    """
+    idx_length = len(idx_all)
+    selection_length = int(idx_length * split)
+    idx_selection = sorted(random.sample(idx_all, selection_length))
+    idx_rest = sorted(list(set(idx_all).difference(set(idx_selection))))
+    return idx_selection, idx_rest
+def get_caption_idxs(idx_train, idx_query, idx_db):
+    """
+    Get caption indexes.
+    :param: idx_train: train image (and label) indexes
+    :param: idx_query: query image (and label) indexes
+    :param: idx_db: db image (and label) indexes
+    :return: caption indexes for corresponding index sets
+    """
+    idx_train_cap = get_caption_idxs_from_img_idxs(idx_train, num=5)
+    idx_query_cap = get_caption_idxs_from_img_idxs(idx_query, num=5)
+    idx_db_cap = get_caption_idxs_from_img_idxs(idx_db)
+    return idx_train_cap, idx_query_cap, idx_db_cap
+def get_caption_idxs_from_img_idxs(img_idxs, num=5):
+    """
+    Get caption indexes. There are 5 captions for each image (and label).
+    Say, img indexes - [0, 10, 100]
+    Then, caption indexes - [0, 1, 2, 3, 4, 50, 51, 52, 53, 54, 100, 501, 502, 503, 504]
+    :param: img_idxs: image (and label) indexes
+    :return: caption indexes
+    """
+    caption_idxs = []
+    for idx in img_idxs:
+        for i in range(num):  # each image has 5 captions
+            caption_idxs.append(idx * num + i)
+    return caption_idxs
+def split_data(images, captions, labels, captions_aug, images_aug, args):
+    """
+    Split dataset to get training, query and db subsets
+    :param: images: image embeddings array
+    :param: captions: caption embeddings array
+    :param: labels: labels array
+    :param: captions_aug: augmented caption embeddings
+    :param: images_aug: augmented image embeddings
+    :return: tuples of (images, captions, labels), each element is array
+    """
+    idx_tr, idx_q, idx_db = get_split_idxs(len(images), args)
+    idx_tr_cap, idx_q_cap, idx_db_cap = get_caption_idxs(idx_tr, idx_q, idx_db)
+    train = images[idx_tr], captions[idx_tr_cap], labels[idx_tr], (idx_tr, idx_tr_cap), captions_aug[idx_tr_cap], \
+                images_aug[idx_tr]
+    query = images[idx_q], captions[idx_q_cap], labels[idx_q], (idx_q, idx_q_cap), captions_aug[idx_q_cap], \
+                images_aug[idx_q]
+    db = images[idx_db], captions[idx_db_cap], labels[idx_db], (idx_db, idx_db_cap), captions_aug[idx_db_cap], \
+             images_aug[idx_db]
+    return train, query, db
+def select_idxs(seq_length, n_to_select, n_from_select, seed=42):
+    """
+    Select n_to_select indexes from each consequent n_from_select indexes from range with length seq_length, split
+    selected indexes to separate arrays
+    Example:
+    seq_length = 20
+    n_from_select = 5
+    n_to_select = 2
+    input, range of length seq_length:
+    range = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
+    sequences of length n_from_select:
+    sequences = [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9], [10, 11, 12, 13, 14], [15, 16, 17, 18, 19]]
+    selected n_to_select elements from each sequence
+    selected = [[0, 4], [7, 9], [13, 14], [16, 18]]
+    output, n_to_select lists of length seq_length / n_from_select:
+    output = [[0, 7, 13, 16], [4, 9, 14, 18]]
+    :param seq_length: length of sequence, say 10
+    :param n_to_select: number of elements to select
+    :param n_from_select: number of consequent elements
+    :return:
+    """
+    random.seed(seed)
+    idxs = [[] for _ in range(n_to_select)]
+    for i in range(seq_length // n_from_select):
+        ints = random.sample(range(n_from_select), n_to_select)
+        for j in range(n_to_select):
+            idxs[j].append(i * n_from_select + ints[j])
+    return idxs
+class AbstractDataset(torch.utils.data.Dataset):
+    def __init__(self, images, captions, labels, targets, idxs):
+        self.image_replication_factor = 1  # default value, how many times we need to replicate image
+        self.images = images
+        self.captions = captions
+        self.labels = labels
+        self.targets = targets
+        self.idxs = np.array(idxs[0])
+    def __getitem__(self, index):
+        return
+    def __len__(self):
+        return
+class CISENDataset(torch.utils.data.Dataset):
+    """
+    Class for dataset representation.
+    Each image has 5 corresponding captions
+    Duplet dataset sample - img-txt (image and corresponding caption)
+    """
+    def __init__(self, images, captions, args):
+        """
+        Initialization.
+        :param images: image embeddings vector
+        :param captions: captions embeddings vector
+        :param labels: labels vector
+        """
+        super().__init__()
+        self.images = images
+        self.captions = captions
+        # self.targets = targets
+        # self.labels = labels
+        self.word_len = args.word_len
+    def __getitem__(self, index):
+        """
+        Returns a tuple (img, txt, label) - image and corresponding caption
+        :param index: index of sample
+        :return: tuple (img, txt, label)
+        """
+        return (
+            torch.from_numpy(self.images[index].astype('float32')),
+            torch.from_numpy(np.array(tokenize(self.captions[index], self.word_len).squeeze(0)).astype('int64'))
+            # ,torch.from_numpy(self.targets[index])
+        )
+    def __len__(self):
+        return len(self.images)
+class DatasetDuplet(AbstractDataset):
+    """
+    Class for dataset representation.
+    Each image has 5 corresponding captions
+    Duplet dataset sample - img-txt (image and corresponding caption)
+    """
+    def __init__(self, images, captions, labels, targets, idxs, args):
+        """
+        Initialization.
+        :param images: image embeddings vector
+        :param captions: captions embeddings vector
+        :param labels: labels vector
+        """
+        super().__init__(images, captions, labels, targets, idxs)
+        self.word_len = args.word_len
+    def __getitem__(self, index):
+        """
+        Returns a tuple (img, txt, label) - image and corresponding caption
+        :param index: index of sample
+        :return: tuple (img, txt, label)
+        """
+        return (
+            index,
+            torch.from_numpy(self.images[index].astype('float32')),
+            torch.from_numpy(np.array(tokenize(self.captions[index] + self.captions[index], self.word_len).squeeze(0)).astype('int64')),
+            self.labels[index],
+            self.targets[index]
+        )
+    def __len__(self):
+        return len(self.images)
+class ModifiedDatasetDuplet(AbstractDataset):
+    """
+    Class for dataset representation.
+    Each image has 5 corresponding captions
+    Duplet dataset sample - img-txt (image and corresponding caption)
+    """
+    def __init__(self, images, captions, labels, targets, idxs, args):
+        """
+        Initialization.
+        :param images: image embeddings vector
+        :param captions: captions embeddings vector
+        :param labels: labels vector
+        """
+        super().__init__(images, captions, labels, targets, idxs)
+    def __getitem__(self, index):
+        """
+        Returns a tuple (img, txt, label) - image and corresponding caption
+        :param index: index of sample
+        :return: tuple (img, txt, label)
+        """
+        text = text_tokenize(self.captions[index], return_tensors='pt', padding='max_length', truncation='longest_first')['input_ids']
+        return (
+            index,
+            torch.from_numpy(self.images[index].astype('float32')),
+            torch.from_numpy(np.array(text_tokenize(self.captions[index], return_tensors='pt', padding='max_length', truncation='longest_first')['input_ids']).astype('int64')),
+            self.labels[index],
+            self.targets[index]
+        )
+    def __len__(self):
+        return len(self.images)

cisen/utils/hash.py ADDED Viewed

	@@ -0,0 +1,314 @@

+import torch
+from torch.autograd import Variable
+import numpy as np
+import torch.nn as nn
+from torch.nn import functional as F
+import math
+def init_hash(dataloader, args):
+    dataset_size = len(dataloader.dataset)
+    B = torch.randn(dataset_size, args.hash_dim).sign().cuda(non_blocking=True)
+    H = torch.zeros(dataset_size, args.hash_dim).sign().cuda(non_blocking=True)
+    Hi = torch.zeros(dataset_size, args.hash_dim).sign().cuda(non_blocking=True)
+    Ht = torch.zeros(dataset_size, args.hash_dim).sign().cuda(non_blocking=True)
+    return B, H, Hi, Ht
+def GenerateCode(model, data_loader, args):
+    num_data = len(data_loader.dataset)
+    B = np.zeros([num_data, args.hash_dim], dtype=np.float32)
+    Bi = np.zeros([num_data, args.hash_dim], dtype=np.float32)
+    Bt = np.zeros([num_data, args.hash_dim], dtype=np.float32)
+    for i, (idx, image, text, label, target) in enumerate(data_loader, 0):
+        image = image.cuda(non_blocking = True)
+        text = text.cuda(non_blocking = True)
+        img_hash, txt_hash, output, output_s = model(image, text)
+        B[idx, :] = torch.sign(output.detach().cpu()).numpy()
+        Bi[idx, :] = torch.sign(img_hash.detach().cpu()).numpy()
+        Bt[idx, :] = torch.sign(txt_hash.detach().cpu()).numpy()
+    return B, Bi, Bt
+def CalcSim(batch_label, train_label):
+    S = (batch_label.mm(train_label.t()) > 0)
+    return S
+# loss
+def Logtrick(x):
+    lt = torch.log(1+torch.exp(-torch.abs(x))).cuda() + torch.max(x, Variable(torch.FloatTensor([0.]).cuda()))
+    return lt
+class NTXentLoss(nn.Module):
+    """
+    Normalized Temperature-scaled Cross-entropy Loss (NTXent Loss).
+    Contains single-modal and cross-modal implementations.
+    """
+    def __init__(self, temperature=1, eps=1e-6):
+        super(NTXentLoss, self).__init__()
+        self.temperature = temperature
+        self.eps = eps
+    def forward(self, *args, type='orig'):
+        if type == 'cross':
+            return self.forward_cross_modal(*args)
+        if type == 'orig':
+            return self.forward_orig(*args)
+        if type == 'both':
+            return self.forward_orig(*args), self.forward_cross_modal(*args)
+        else:
+            raise Exception("Wrong NTXent loss type, must be: 'cross', 'orig' or 'both'")
+    def forward_cross_modal(self, mod1, mod2):
+        """
+        Cross-modal case:
+        p - positive pair
+        n - negative pair
+        sim - cosine similarity
+        ix - image modality feature number x
+        tx - text modality feature number x
+        Cross-modal case of NTXent doesn't consider similarities inside of the same modality
+                        Similarities matrix: exp(sim(i, y))
+                             +--+--+--+--+--+--+--+
+                             |  |i1|i2|i3|t1|t2|t3|
+         Modality            +--+--+--+--+--+--+--+
+         Features            |i1|0 |0 |0 |p |n |n |
+        +--+  +--+           +--+--+--+--+--+--+--+
+        |i1|  |t1|           |i2|0 |0 |0 |n |p |n |
+        +--+  +--+           +--+--+--+--+--+--+--+
+        |i2|  |t2|  ------>  |i3|0 |0 |0 |n |n |p |
+        +--+  +--+           +--+--+--+--+--+--+--+
+        |i3|  |t3|           |t1|p |n |n |0 |0 |0 |
+        +--+  +--+           +--+--+--+--+--+--+--+
+                             |t2|n |p |n |0 |0 |0 |
+                             +--+--+--+--+--+--+--+
+                             |t3|n |n |p |0 |0 |0 |
+                             +--+--+--+--+--+--+--+
+        :param: mod1: features of the 1st modality
+        :param: mod1: features of the 2nd modality
+        :return: NTXent loss
+        """
+        # normalize for numerical stability
+        mod1 = F.normalize(mod1)
+        mod2 = F.normalize(mod2)
+        out = torch.cat([mod1, mod2], dim=0)
+        # cov and sim: [2 * batch_size, 2 * batch_size * world_size]
+        cov = torch.mm(out, out.t().contiguous())  # cosine similarities matrix
+        sim = torch.exp(cov / self.temperature)
+        # mask for cross-modal case, nullifies certain regions (see docstring)
+        zeros = torch.zeros(mod1.shape[0], mod1.shape[0]).to(sim.device)
+        ones = torch.ones(mod1.shape[0], mod1.shape[0]).to(sim.device)
+        mask = torch.hstack([torch.vstack([zeros, ones]), torch.vstack([ones, zeros])]).to(sim.device)
+        sim = sim * mask
+        # neg: [2 * batch_size]
+        # negative pairs sum
+        neg = sim.sum(dim=1)
+        # Positive similarity, pos becomes [2 * batch_size]
+        pos = torch.exp(torch.sum(mod1 * mod2, dim=-1) / self.temperature)
+        pos = torch.cat([pos, pos], dim=0)
+        loss = -torch.log(pos / (neg + self.eps)).sum()
+        return loss
+    def forward_orig(self, out_1, out_2):
+        """
+        Implementation taken from:
+        https://github.com/PyTorchLightning/lightning-bolts/blob/master/pl_bolts/models/self_supervised/simclr/simclr_module.py
+        p - positive pair
+        n - negative pair
+        sim - cosine similarity
+        e - Euler's number
+        ix - value x of input feature vector i
+        tx - value x of input feature vector t
+                        Similarities matrix: exp(sim(i, y))
+                             +--+--+--+--+--+--+--+
+                             |  |i1|i2|i3|t1|t2|t3|
+         Modality            +--+--+--+--+--+--+--+
+         Features            |i1|e |n |n |p |n |n |
+        +--+  +--+           +--+--+--+--+--+--+--+
+        |i1|  |t1|           |i2|n |e |n |n |p |n |
+        +--+  +--+           +--+--+--+--+--+--+--+
+        |i2|  |t2|  ------>  |i3|n |n |e |n |n |p |
+        +--+  +--+           +--+--+--+--+--+--+--+
+        |i3|  |t3|           |t1|p |n |n |e |n |n |
+        +--+  +--+           +--+--+--+--+--+--+--+
+                             |t2|n |p |n |n |e |n |
+                             +--+--+--+--+--+--+--+
+                             |t3|n |n |p |n |n |e |
+                             +--+--+--+--+--+--+--+
+        :param out_1: input feature vector i
+        :param out_2: input feature vector t
+        :return: NTXent loss
+        """
+        out_1 = F.normalize(out_1)
+        out_2 = F.normalize(out_2)
+        out = torch.cat([out_1, out_2], dim=0)
+        # cov and sim: [2 * batch_size, 2 * batch_size * world_size]
+        # neg: [2 * batch_size]
+        cov = torch.mm(out, out.t().contiguous())
+        sim = torch.exp(cov / self.temperature)
+        neg = sim.sum(dim=-1)
+        # from each row, subtract e^1 to remove similarity measure for x1.x1
+        row_sub = torch.Tensor(neg.shape).fill_(math.e).to(neg.device)
+        neg = torch.clamp(neg - row_sub, min=self.eps)  # clamp for numerical stability
+        # Positive similarity, pos becomes [2 * batch_size]
+        o = out_1 * out_2
+        pos = torch.exp(torch.sum(out_1 * out_2, dim=-1) / self.temperature)
+        pos = torch.cat([pos, pos], dim=0)
+        loss = -torch.log(pos / (neg + self.eps)).mean()
+        return loss
+"""
+    out_hash: real-valued codes of the current batch
+    H: real-valued codes of the whole training set
+    Bbatch: hash codes of the current batch
+    S: similarity matrix
+    num_train: number of training samples
+    num_batch: batch size
+"""
+def Calcloss(out_hash, H, Bbatch, S, num_train, num_batch, args):
+    theta_x = out_hash.float().mm(Variable(H.cuda()).t()) / 2
+    logloss = (Variable(S.cuda()) * theta_x - Logtrick(theta_x)).sum() \
+              / (num_train * num_batch)
+    regterm = (Bbatch - out_hash).pow(2).sum() / (num_train * num_batch)
+    loss_p = - logloss + args.lamda * regterm
+    return logloss, regterm, loss_p
+def CalcNTXentLoss(img_hash, txt_hash, out_hash, Criterion, args):
+    """
+        Calculate NTXent Loss
+        :param: h_img1: batch of image hashes #1 (original)
+        :param: h_img2: batch of image hashes #2 (augmented)
+        :param: h_txt1: batch of text hashes #1 (original)
+        :param: h_txt2: batch of text hashes #2 (augmented)
+        :returns: NTXent Loss
+        """
+    loss_ntxent_inter1 = Criterion(img_hash, txt_hash, type='cross')
+    loss_ntxent_inter2 = Criterion(img_hash, out_hash, type='orig')
+    loss_ntxent_inter3 = Criterion(out_hash, txt_hash, type='orig')
+    # loss_ntxent_intra = Criterion(out_hash, out_hash, type='orig') * args.contrastive_weights[1]
+    loss_ntxent = loss_ntxent_inter1 * args.contrastive[0] + loss_ntxent_inter2 * args.contrastive[1] + loss_ntxent_inter3 * args.contrastive[2]
+    return loss_ntxent
+def Calc_total_loss(H, B, S, num_train, args):
+    theta = H.mm(H.t()) / 2
+    t1 = (theta*theta).sum() / (num_train * num_train)
+    logloss = (- theta * S + Logtrick(Variable(theta)).data).sum()
+    regterm = (H - B).pow(2).sum()
+    loss_p = logloss + args.lamda * regterm
+    return logloss, regterm, loss_p
+def CalcHammingDist(B1, B2):
+    q = B2.shape[1]
+    distH = 0.5 * (q - np.dot(B1, B2.transpose()))
+    return distH
+def CalcMap(qB, rB, queryL, retrievalL):
+    # qB: m, q
+    # rB: n, q
+    # queryL: {0,1}^{mxl}
+    # retrievalL: {0,1}^{nxl}
+    num_query = queryL.shape[0]
+    map = 0
+    # print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
+    for iter in range(num_query):
+        # 标签匹配
+        gnd = (np.dot(queryL[iter, :], retrievalL.transpose()) > 0).astype(np.float32)
+        tsum = np.sum(gnd)
+        if tsum == 0:
+            continue
+        # 计算query 与 database之间的汉明距离
+        hamm = CalcHammingDist(qB[iter, :], rB)
+        # 排序
+        ind = np.argsort(hamm)
+        # 汉明距离与标签对应
+        gnd = gnd[ind]
+        count = np.linspace(1, int(tsum), int(tsum))
+        # 按照结果排序比对是否标签一致，并返回一致的坐标
+        tindex = np.asarray(np.where(gnd == 1)) + 1.0
+        map_ = np.mean(count / (tindex))
+        # print(map_)
+        map = map + map_
+    map = map / num_query
+    # print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
+    return map
+def CalcTopMap(qB, rB, queryL, retrievalL, topk = 20):
+    # qB: {-1,+1}^{mxq}
+    # rB: {-1,+1}^{nxq}
+    # queryL: {0,1}^{mxl}
+    # retrievalL: {0,1}^{nxl}
+    num_query = queryL.shape[0]
+    topkmap = 0
+    # print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
+    for iter in range(num_query):
+        gnd = (np.dot(queryL[iter, :], retrievalL.transpose()) > 0).astype(np.float32)
+        hamm = CalcHammingDist(qB[iter, :], rB)
+        ind = np.argsort(hamm)
+        gnd = gnd[ind]
+        tgnd = gnd[0:topk]
+        tsum = np.sum(tgnd)
+        if tsum == 0:
+            continue
+        count = np.linspace(1, int(tsum), int(tsum))
+        tindex = np.asarray(np.where(tgnd == 1)) + 1.0
+        topkmap_ = np.mean(count / (tindex))
+        # print(topkmap_)
+        topkmap = topkmap + topkmap_
+    topkmap = topkmap / num_query
+    # print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
+    return topkmap

cisen/utils/misc.py ADDED Viewed

	@@ -0,0 +1,444 @@

+import os
+import random
+import numpy as np
+from PIL import Image
+from loguru import logger
+import sys
+import inspect
+import math
+import torch
+import torch.distributed as dist
+from collections import OrderedDict
+from torch import nn
+def init_random_seed(seed=None, device='cuda', rank=0, world_size=1):
+    """Initialize random seed."""
+    if seed is not None:
+        return seed
+    # Make sure all ranks share the same random seed to prevent
+    # some potential bugs. Please refer to
+    # https://github.com/open-mmlab/mmdetection/issues/6339
+    seed = np.random.randint(2**31)
+    if world_size == 1:
+        return seed
+    if rank == 0:
+        random_num = torch.tensor(seed, dtype=torch.int32, device=device)
+    else:
+        random_num = torch.tensor(0, dtype=torch.int32, device=device)
+    dist.broadcast(random_num, src=0)
+    return random_num.item()
+def set_random_seed(seed, deterministic=False):
+    """Set random seed."""
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    if deterministic:
+        torch.backends.cudnn.deterministic = True
+        torch.backends.cudnn.benchmark = False
+def worker_init_fn(worker_id, num_workers, rank, seed):
+    # The seed of each worker equals to
+    # num_worker * rank + worker_id + user_seed
+    worker_seed = num_workers * rank + worker_id + seed
+    np.random.seed(worker_seed)
+    random.seed(worker_seed)
+class AverageMeter(object):
+    """Computes and stores the average and current value"""
+    def __init__(self, name, fmt=":f"):
+        self.name = name
+        self.fmt = fmt
+        self.reset()
+    def reset(self):
+        self.val = 0
+        self.avg = 0
+        self.sum = 0
+        self.count = 0
+    def update(self, val, n=1):
+        self.val = val
+        self.sum += val * n
+        self.count += n
+        self.avg = self.sum / self.count
+    def __str__(self):
+        if self.name == "Lr":
+            fmtstr = "{name}={val" + self.fmt + "}"
+        else:
+            fmtstr = "{name}={val" + self.fmt + "} ({avg" + self.fmt + "})"
+        return fmtstr.format(**self.__dict__)
+class ProgressMeter(object):
+    def __init__(self, num_batches, meters, prefix=""):
+        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
+        self.meters = meters
+        self.prefix = prefix
+    def display(self, batch):
+        entries = [self.prefix + self.batch_fmtstr.format(batch)]
+        entries += [str(meter) for meter in self.meters]
+        logger.info("  ".join(entries))
+    def _get_batch_fmtstr(self, num_batches):
+        num_digits = len(str(num_batches // 1))
+        fmt = "{:" + str(num_digits) + "d}"
+        return "[" + fmt + "/" + fmt.format(num_batches) + "]"
+def get_caller_name(depth=0):
+    """
+    Args:
+        depth (int): Depth of caller conext, use 0 for caller depth.
+        Default value: 0.
+    Returns:
+        str: module name of the caller
+    """
+    # the following logic is a little bit faster than inspect.stack() logic
+    frame = inspect.currentframe().f_back
+    for _ in range(depth):
+        frame = frame.f_back
+    return frame.f_globals["__name__"]
+class StreamToLoguru:
+    """
+    stream object that redirects writes to a logger instance.
+    """
+    def __init__(self, level="INFO", caller_names=("apex", "pycocotools")):
+        """
+        Args:
+            level(str): log level string of loguru. Default value: "INFO".
+            caller_names(tuple): caller names of redirected module.
+                Default value: (apex, pycocotools).
+        """
+        self.level = level
+        self.linebuf = ""
+        self.caller_names = caller_names
+    def write(self, buf):
+        full_name = get_caller_name(depth=1)
+        module_name = full_name.rsplit(".", maxsplit=-1)[0]
+        if module_name in self.caller_names:
+            for line in buf.rstrip().splitlines():
+                # use caller level log
+                logger.opt(depth=2).log(self.level, line.rstrip())
+        else:
+            sys.__stdout__.write(buf)
+    def flush(self):
+        pass
+def redirect_sys_output(log_level="INFO"):
+    redirect_logger = StreamToLoguru(log_level)
+    sys.stderr = redirect_logger
+    sys.stdout = redirect_logger
+def setup_logger(save_dir, filename="log.txt", mode="a"):
+    """setup logger for training and testing.
+    Args:
+        save_dir(str): location to save log file
+        distributed_rank(int): device rank when multi-gpu environment
+        filename (string): log save name.
+        mode(str): log file write mode, `append` or `override`. default is `a`.
+    Return:
+        logger instance.
+    """
+    loguru_format = (
+        "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
+        "<level>{level: <8}</level> | "
+        "<cyan>{name}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>")
+    logger.remove()
+    save_file = os.path.join(save_dir, filename)
+    if mode == "o" and os.path.exists(save_file):
+        os.remove(save_file)
+    # only keep logger in rank0 process
+    logger.add(
+        sys.stderr,
+        format=loguru_format,
+        level="INFO",
+        enqueue=True,
+    )
+    logger.add(save_file)
+    # redirect stdout/stderr to loguru
+    redirect_sys_output("INFO")
+def trainMetric(pred, label):
+    pred = torch.argmax(pred,dim = 1)
+    prec = torch.sum(pred == label)
+    return prec
+# def compute_AP(predicted_probs, true_labels):
+#     num_samples, num_classes = true_labels.shape
+#
+#     # Initialize the list that stores the AP of each class
+#     aps = []
+#
+#     for class_idx in range(num_classes):
+#         class_true_labels = true_labels[:, class_idx]
+#         class_similarity_scores = predicted_probs[:, class_idx]
+#
+#         # Get the sample indices sorted by similarity score
+#         sorted_indices = torch.argsort(class_similarity_scores, descending=True)
+#
+#         # Compute the cumulative precision and recall
+#         tp = 0
+#         fp = 0
+#         precision_at_rank = []
+#         recall_at_rank = []
+#
+#         for rank, idx in enumerate(sorted_indices):
+#             if class_true_labels[idx] == 1:
+#                 tp += 1
+#             else:
+#                 fp += 1
+#             precision = tp / (tp + fp)
+#             recall = tp / torch.sum(class_true_labels)
+#             precision_at_rank.append(precision)
+#             recall_at_rank.append(recall)
+#
+#         # Compute the average precision (AP) as the area under the curve
+#         precision_at_rank = torch.tensor(precision_at_rank)
+#         recall_at_rank = torch.tensor(recall_at_rank)
+#         ap = torch.trapz(precision_at_rank, recall_at_rank)
+#
+#         aps.append(ap)
+#
+#
+#     return aps
+def token_wise_similarity(rep1, rep2, mask=None, chunk_size=1024):
+    batch_size1, n_token1, feat_dim = rep1.shape
+    batch_size2, n_token2, _ = rep2.shape
+    num_folds = math.ceil(batch_size2 / chunk_size)
+    output = []
+    for i in range(num_folds):
+        rep2_seg = rep2[i * chunk_size:(i + 1) * chunk_size]
+        out_i = rep1.reshape(-1, feat_dim) @ rep2_seg.reshape(-1, feat_dim).T
+        out_i = out_i.reshape(batch_size1, n_token1, -1, n_token2).max(3)[0]
+        if mask is None:
+            out_i = out_i.mean(1)
+        else:
+            out_i = out_i.sum(1)
+        output.append(out_i)
+    output = torch.cat(output, dim=1)
+    if mask is not None:
+        output = output / mask.sum(1, keepdim=True).clamp_(min=1)
+    return output
+def compute_acc(logits, targets, topk=5):
+    """Compare the top-k indices of logits with targets and return two hit counts.
+    Args:
+        logits: score tensor; topk is taken along dim 1.
+        targets: tensor whose dim 1 is squeezed away before use.
+        topk: number of highest-scoring indices kept per row.
+    Returns:
+        Tuple (acc_1, acc_k): sums over dim 0 of the first row and of the first
+        topk rows of the boolean match matrix.
+    NOTE(review): the intermediates p, gt, a and c are never read afterwards.
+    NOTE(review): pred is not transposed before the comparison, so correct[:1]
+    slices along dim 0 of a (rows, topk) matrix rather than along the rank axis
+    as in the usual top-k accuracy recipe - confirm the intended layout.
+    """
+    targets = targets.squeeze(1)
+    # duplicate of pred below; unused
+    p = logits.topk(topk, 1, True, True)[1]
+    # indices of the topk largest scores in every row, sorted by score
+    pred = logits.topk(topk, 1, True, True)[1]
+    # rows of targets selected by the top-k indices; requires targets to be at least 2-D
+    gt = targets[pred,:]
+    a = gt.view(1, -1)
+    # b = a.expand_as(pred)
+    c = gt.eq(targets)
+    # expand_as only succeeds when targets, flattened to one row, broadcasts to pred's shape
+    correct = pred.eq(targets.view(1, -1).expand_as(pred)).contiguous()
+    acc_1 = correct[:1].sum(0)
+    acc_k = correct[:topk].sum(0)
+    return acc_1, acc_k
+def compute_mAP(predicted_probs, true_labels):
+    aps = compute_AP(predicted_probs, true_labels)
+    aps = [ap for ap in aps if not torch.isnan(ap)]
+    mAP = torch.mean(torch.tensor(aps))
+    return mAP
+def compute_F1(predictions, labels, k_val=5):
+    labels = labels.squeeze(1)
+    idx = predictions.topk(dim=1, k=k_val)[1]
+    predictions.fill_(0)
+    predictions.scatter_(dim=1, index=idx, src=torch.ones(predictions.size(0), k_val).to(predictions.device))
+    mask = predictions == 1
+    TP = (labels[mask] == 1).sum().float()
+    tpfp = mask.sum().float()
+    tpfn = (labels == 1).sum().float()
+    p = TP / tpfp
+    r = TP/tpfn
+    f1 = 2*p*r/(p+r)
+    return f1, p, r
+def compute_AP(predictions, labels):
+    num_class = predictions.size(1)
+    ap = torch.zeros(num_class).to(predictions.device)
+    empty_class = 0
+    for idx_cls in range(num_class):
+        prediction = predictions[:, idx_cls]
+        label = labels[:, idx_cls]
+        mask = label.abs() == 1
+        if (label > 0).sum() == 0:
+            empty_class += 1
+            continue
+        binary_label = torch.clamp(label[mask], min=0, max=1)
+        sorted_pred, sort_idx = prediction[mask].sort(descending=True)
+        sorted_label = binary_label[sort_idx]
+        tmp = (sorted_label == 1).float()
+        tp = tmp.cumsum(0)
+        fp = (sorted_label != 1).float().cumsum(0)
+        num_pos = binary_label.sum()
+        rec = tp/num_pos
+        prec = tp/(tp+fp)
+        ap_cls = (tmp*prec).sum()/num_pos
+        ap[idx_cls].copy_(ap_cls)
+    return ap, empty_class
+def compute_ACG(predictions, labels, k_val=5):
+    """Ranking metrics over the top-k retrieved items of every query.
+    Args:
+        predictions: score tensor; the k_val best column indices per row are
+            used to pick rows of the squeezed labels, so columns presumably
+            index the same items as labels - TODO confirm.
+        labels: label tensor whose dim 1 is squeezed away to look up the
+            retrieved items; the unsqueezed tensor is compared against them.
+        k_val: number of retrieved items per query.
+    Returns:
+        Tuple (acg, ndcg, map, wmap), each with one value per query.
+    NOTE(review): labels.eq(pred) marks every position where the two label
+    vectors hold the same value, including positions where both are 0.
+    """
+    gt = labels.squeeze(1)
+    idx = predictions.topk(dim=1, k=k_val)[1]
+    # label vectors of the retrieved items; advanced indexing yields a copy
+    pred = gt[idx, :]
+    # labels of -1 are treated as 0 on the retrieved side only
+    pred[pred == -1] = 0
+    c = labels.eq(pred)  # positions where query and retrieved labels agree
+    r = c.sum(-1) # similarity level: number of agreeing positions per retrieved item
+    # acg: agreeing positions summed over the top-k list, divided by k
+    acg = c.sum(-1).sum(-1) / k_val
+    # rank discount log(1 + rank) for ranks 1..k (natural logarithm)
+    lg = torch.log1p(torch.arange(1, k_val+1, 1) ).to(r.device)
+    # dcg: gain 2**r - 1 discounted by rank
+    dcg = (torch.pow(2, r) - 1) / lg
+    # ideal ordering: the same similarity levels sorted from high to low
+    ir, _ = r.sort(-1, descending=True)
+    idcg = (torch.pow(2, ir) - 1) / lg
+    # replace exact zeros so the normalization below does not divide by zero
+    idcg[idcg == 0] = 1e-6
+    ndcg = dcg.sum(-1) / idcg.sum(-1)
+    # map: an item counts as relevant when its similarity level is non-zero
+    pos = r.clone()
+    pos[pos != 0] = 1
+    j = torch.arange(1, k_val + 1, 1).to(pos.device)
+    # precision at every rank
+    P = torch.cumsum(pos, 1) / j
+    Npos = torch.sum(pos, 1)
+    # queries without any relevant item divide by 1 instead of 0
+    Npos[Npos == 0] = 1
+    # NOTE(review): AP is not used below; map recomputes the same sum
+    AP = torch.sum(P * pos, 1)
+    map = torch.sum(P * pos, 1) / Npos
+    # wmap: like map, with the running mean similarity level in place of precision
+    acgj = torch.cumsum(r, 1) / j
+    wmap = torch.sum(acgj * pos, 1) / Npos
+    return acg, ndcg, map, wmap
+def compute_mAPw(predictions, labels, k_val=5):
+    gt = labels.squeeze(1)
+    idx = predictions.topk(dim=1, k=k_val)[1]
+    pred = gt[idx, :]
+    pred[pred == -1] = 0
+    c = labels.eq(pred)
+    r = c.sum(-1)
+    pos = r.clone()
+    pos[pos != 0] = 1
+    P = torch.cumsum(pos) / torch.arange(1, k_val+1, 1)
+def adjust_learning_rate(optimizer, epoch, args):
+    """Decay the learning rate with half-cycle cosine after warmup"""
+    if epoch < args.warmup_epochs:
+        lr = args.base_lr * epoch / args.warmup_epochs
+    else:
+        lr = args.min_lr + (args.base_lr - args.min_lr) * 0.5 * \
+            (1. + math.cos(math.pi * (epoch - args.warmup_epochs) / (args.epochs - args.warmup_epochs)))
+    for param_group in optimizer.param_groups:
+        if "lr_scale" in param_group:
+            param_group["lr"] = lr * param_group["lr_scale"]
+        else:
+            param_group["lr"] = lr
+    return lr
+def load_ckpt(weight_dir, model, map_location, args):
+    checkpoint = torch.load(weight_dir, map_location=map_location)
+    if args.resume:
+        resume_epoch = checkpoint['epoch']
+    else:
+        resume_epoch = 0
+    pre_weight = checkpoint['state_dict']
+    new_pre_weight = OrderedDict()
+    # pre_weight =torch.jit.load(resume)
+    model_dict = model.state_dict()
+    new_model_dict = OrderedDict()
+    for k, v in pre_weight.items():
+        new_k = k.replace('module.', '') if 'module' in k else k
+        # 针对batch_size=1
+        # new_k = new_k.replace('1','2') if 'proj.1' in new_k else new_k
+        new_pre_weight[new_k] = v
+    # for k, v in model_dict.items():
+    #     new_k = k.replace('module.', '') if 'module' in k else k
+    #     new_model_dict[new_k] = v
+    pre_weight = new_pre_weight  # ["model_state"]
+    # pretrained_dict = {}
+    # t_n = 0
+    # v_n = 0
+    # for k, v in pre_weight.items():
+    #     t_n += 1
+    #     if k in new_model_dict:
+    #         k = 'module.' + k if 'module' not in k else k
+    #         v_n += 1
+    #         pretrained_dict[k] = v
+            # print(k)
+    # os._exit()
+    # print(f'{v_n}/{t_n} weights have been loaded!')
+    model_dict.update(pre_weight)
+    model.load_state_dict(model_dict, strict=False)
+    return model, resume_epoch
+def load_ckpt_fpn(weight_dir, model, map_location):
+    pre_weight = torch.load(weight_dir, map_location=map_location)['state_dict']
+    epoch = torch.load(weight_dir, map_location=map_location)['epoch']
+    new_pre_weight = OrderedDict()
+    # pre_weight =torch.jit.load(resume)
+    model_dict = model.state_dict()
+    for k, v in pre_weight.items():
+        new_k = k.replace('module.', '') if 'module' in k else k
+        # if not (new_k.startswith('FPN') or new_k.startswith('gap')):
+        new_pre_weight[new_k] = v
+    pre_weight = new_pre_weight
+    # ["model_state"]
+    model_dict.update(pre_weight)
+    model.load_state_dict(model_dict, strict=True)
+    return model, epoch
+def load_ckpt_old(weight_dir, model, map_location):
+    pre_weight = torch.load(weight_dir, map_location=map_location)['state_dict']
+    epoch = torch.load(weight_dir, map_location=map_location)['epoch']
+    new_pre_weight = OrderedDict()
+    # pre_weight =torch.jit.load(resume)
+    model_dict = model.state_dict()
+    for k, v in pre_weight.items():
+        new_k = k.replace('module.', '') if 'module' in k else k
+        if not (new_k.startswith('FPN') or new_k.startswith('gap')):
+            new_pre_weight[new_k] = v
+    pre_weight = new_pre_weight
+    # ["model_state"]
+    model_dict.update(pre_weight)
+    model.load_state_dict(model_dict, strict=False)
+    return model, epoch
+def compare_ckpt(model1, model2):
+    V = dict()
+    for k, v in model1.items():
+        if k.startswith('projT'):
+            V[k] = v
+    for k, v in model2.items():
+        if k in sorted(V.keys()):
+            model2[k] = V[k]
+    return model2

cisen/utils/simple_tokenizer.py ADDED Viewed

	@@ -0,0 +1,132 @@

+import gzip
+import html
+import os
+from functools import lru_cache
+import ftfy
+import regex as re
+@lru_cache()
+def default_bpe():
+    return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz")
+@lru_cache()
+def bytes_to_unicode():
+    """
+    Returns list of utf-8 byte and a corresponding list of unicode strings.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a signficant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    And avoids mapping to whitespace/control characters the bpe code barfs on.
+    """
+    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8+n)
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(bs, cs))
+def get_pairs(word):
+    """Return set of symbol pairs in a word.
+    Word is represented as tuple of symbols (symbols being variable-length strings).
+    """
+    pairs = set()
+    prev_char = word[0]
+    for char in word[1:]:
+        pairs.add((prev_char, char))
+        prev_char = char
+    return pairs
+def basic_clean(text):
+    text = ftfy.fix_text(text)
+    text = html.unescape(html.unescape(text))
+    return text.strip()
+def whitespace_clean(text):
+    text = re.sub(r'\s+', ' ', text)
+    text = text.strip()
+    return text
+class SimpleTokenizer(object):
+    def __init__(self, bpe_path: str = default_bpe()):
+        self.byte_encoder = bytes_to_unicode()
+        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+        merges = gzip.open(bpe_path).read().decode("utf-8").split('\n')
+        merges = merges[1:49152-256-2+1]
+        merges = [tuple(merge.split()) for merge in merges]
+        vocab = list(bytes_to_unicode().values())
+        vocab = vocab + [v+'</w>' for v in vocab]
+        for merge in merges:
+            vocab.append(''.join(merge))
+        vocab.extend(['<|startoftext|>', '<|endoftext|>'])
+        self.encoder = dict(zip(vocab, range(len(vocab))))
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        self.bpe_ranks = dict(zip(merges, range(len(merges))))
+        self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'}
+        self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)
+    def bpe(self, token):
+        if token in self.cache:
+            return self.cache[token]
+        word = tuple(token[:-1]) + ( token[-1] + '</w>',)
+        pairs = get_pairs(word)
+        if not pairs:
+            return token+'</w>'
+        while True:
+            bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                    new_word.extend(word[i:j])
+                    i = j
+                except:
+                    new_word.extend(word[i:])
+                    break
+                if word[i] == first and i < len(word)-1 and word[i+1] == second:
+                    new_word.append(first+second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = ' '.join(word)
+        self.cache[token] = word
+        return word
+    def encode(self, text):
+        bpe_tokens = []
+        text = whitespace_clean(basic_clean(text)).lower()
+        for token in re.findall(self.pat, text):
+            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
+            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
+        return bpe_tokens
+    def decode(self, tokens):
+        text = ''.join([self.decoder[token] for token in tokens])
+        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
+        return text