End of training
- README.md +53 -0
- all_results.json +7 -0
- config.json +52 -0
- configuration_vae.py +57 -0
- image_processing_vae.py +117 -0
- model.safetensors +3 -0
- modeling_vae.py +239 -0
- module_layers.py +95 -0
- module_layers_attn.py +335 -0
- my_config.json +85 -0
- preprocessor_config.json +24 -0
- train_results.json +7 -0
- trainer_state.json +57 -0
- training_args.bin +3 -0
README.md
ADDED
@@ -0,0 +1,53 @@
+---
+library_name: transformers
+tags:
+- image-classification
+- generated_from_trainer
+model-index:
+- name: vae_test
+  results: []
+---
+
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+
+# vae_test
+
+This model is a fine-tuned version of [](https://huggingface.co/) on the train_file=/home/pj24002027/ku50001104/data/mutual_dataset/few_data/train.jsonl, validation_file=/home/pj24002027/ku50001104/data/mutual_dataset/few_data/test.jsonl, max_train_samples=2048, max_eval_samples=2048, use_sensor_keys=CAM_FRONT dataset.
+
+## Model description
+
+More information needed
+
+## Intended uses & limitations
+
+More information needed
+
+## Training and evaluation data
+
+More information needed
+
+## Training procedure
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- learning_rate: 0.0002
+- train_batch_size: 128
+- eval_batch_size: 32
+- seed: 42
+- optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.5,0.9) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
+- lr_scheduler_type: cosine_with_min_lr
+- lr_scheduler_warmup_steps: 1000
+- num_epochs: 0.1
+
+### Training results
+
+
+
+### Framework versions
+
+- Transformers 4.51.3
+- Pytorch 2.6.0+cu126
+- Datasets 3.5.1
+- Tokenizers 0.21.1
all_results.json
ADDED
@@ -0,0 +1,7 @@
+{
+    "epoch": 0.125,
+    "train_loss": 1.1237460374832153,
+    "train_runtime": 20.6388,
+    "train_samples_per_second": 9.923,
+    "train_steps_per_second": 0.097
+}
config.json
ADDED
@@ -0,0 +1,52 @@
+{
+  "architectures": [
+    "VAEModel"
+  ],
+  "attn_resolutions": [],
+  "auto_map": {
+    "AutoConfig": "configuration_vae.VAEConfig",
+    "AutoModel": "modeling_vae.VAEModel"
+  },
+  "channels": 128,
+  "channels_mult": [
+    1,
+    1,
+    1,
+    2,
+    2
+  ],
+  "codebook_dim": 0,
+  "codebook_size": 0,
+  "decoder_type": "Simple",
+  "drop_out": 0,
+  "dropout": 0.0,
+  "encoder_type": "Simple",
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "in_channels": 3,
+  "model_type": "vae",
+  "num_res_blocks": 2,
+  "out_channels": 3,
+  "quantizer_type": "VQ",
+  "resolution": [
+    64,
+    64
+  ],
+  "torch_dtype": "float32",
+  "transformers_version": "4.51.3",
+  "w_commit": 0,
+  "w_dino": 0,
+  "w_kl": 1,
+  "w_l1": 0.2,
+  "w_mse": 2,
+  "w_perceptual": 0,
+  "z_channels": 64
+}
configuration_vae.py
ADDED
@@ -0,0 +1,57 @@
+from enum import Enum
+from transformers import PretrainedConfig
+
+from .module_layers import Encoder, Decoder
+from .module_layers_attn import Encoder as AttnEncoder, Decoder as AttnDecoder
+# from .module_quantizers import VectorQuantizer
+
+
+class EncoderType(Enum):
+    Simple = Encoder
+    Attn = AttnEncoder
+
+
+class DecoderType(Enum):
+    Simple = Decoder
+    Attn = AttnDecoder
+
+
+# class QuantizerType(Enum):
+#     VQ = VectorQuantizer
+
+
+class VAEConfig(PretrainedConfig):
+    model_type = "vae"
+
+    def __init__(self, **kwargs):
+        # ref ./modules/__init__.py
+        self.encoder_type = kwargs.get("encoder_type", EncoderType.Simple.name)
+        self.decoder_type = kwargs.get("decoder_type", DecoderType.Simple.name)
+        # self.quantizer_type = kwargs.get("quantizer_type", QuantizerType.VQ.name)
+        # in_ch -> channels * channels_mult -> z_channels -> codebook_dim -> z_channels -> channels * channels_mult -> out_ch
+        self.in_channels = kwargs.get("in_channels", 3)
+        self.out_channels = kwargs.get("out_channels", 3)
+        self.z_channels = kwargs.get("z_channels", 256)  # embedding dim
+        self.channels = kwargs.get("channels", 128)
+        # features = [channels * mult for mult in channels_mult]
+        # res -> res // 2**(len(channels_mult)-1)
+        self.channels_mult = kwargs.get("channels_mult", [1, 1, 2, 2])
+        self.codebook_dim = kwargs.get("codebook_dim", 8)
+        self.codebook_size = kwargs.get("codebook_size", 1024)
+        # if res = 128 and ch_mult = [1, 1, 2, 2], select any from [128/1, 128/2, 128/2**2, 128/2**3]
+        # in taming-transformers use attn_resolutions = [res/2**(len(ch_mult)-1)]
+        self.attn_resolutions = kwargs.get("attn_resolutions", [])
+        self.num_res_blocks = kwargs.get("num_res_blocks", 2)
+        self.resolution = kwargs.get("resolution", [64, 64])
+        self.dropout = kwargs.get("dropout", 0.)
+        # imagenet mean [0.1616, 0.1646, 0.1618], std [0.2206, 0.2233, 0.2214]
+        # nusc mean [0.3814, 0.3861, 0.3778], std [0.2219, 0.2188, 0.2248]
+        self.image_mean = kwargs.get('image_mean', [0.1616, 0.1646, 0.1618])
+        self.image_std = kwargs.get("image_std", [0.2206, 0.2233, 0.2214])
+        self.w_mse = kwargs.get("w_mse", 2)
+        self.w_l1 = kwargs.get("w_l1", 0.2)
+        self.w_perceptual = kwargs.get("w_perceptual", 0.1)
+        self.w_commit = kwargs.get("w_commit", 1)
+        self.w_dino = kwargs.get("w_dino", 0.1)
+        self.w_kl = kwargs.get("w_kl", 0.1)
+        super().__init__(**kwargs)
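The resolution comments above reduce to simple arithmetic; a quick sketch with the values stored in config.json for this checkpoint:

# Five channels_mult entries -> four downsampling stages -> 2**4 = 16x reduction.
resolution = 64
channels_mult = [1, 1, 1, 2, 2]
z_channels = 64

latent_hw = resolution // 2 ** (len(channels_mult) - 1)  # 64 // 16 = 4
enc_out_dim = z_channels * latent_hw ** 2                # flattened encoder output used in modeling_vae.py: 1024
print(latent_hw, enc_out_dim)                            # 4 1024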
image_processing_vae.py
ADDED
@@ -0,0 +1,117 @@
+from typing import List, Optional, Union, Tuple
+import PIL
+import torch
+from torchvision.transforms.v2 import (
+    Compose,
+    Lambda,
+    Resize,
+    Normalize,
+    InterpolationMode,
+)
+import numpy as np
+
+from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
+from transformers.image_utils import ChannelDimension, to_numpy_array
+from transformers.utils import TensorType, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class VAEImageProcessor(BaseImageProcessor):
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        image_size: Tuple[int, int] = [64, 64],
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[List[float]] = [0.5, 0.5, 0.5],
+        image_std: Optional[List[float]] = [0.5, 0.5, 0.5],
+        *args,
+        **kwargs
+    ):
+        super().__init__(*args, **kwargs)
+        self.do_resize = do_resize
+        self.image_size = image_size
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean
+        self.image_std = image_std
+
+    def preprocess(
+        self,
+        images: Union["PIL.Image.Image", np.ndarray, List["PIL.Image.Image"], List[np.ndarray]],
+        is_video: bool = False,
+        return_tensors: Optional[Union[str, TensorType]] = "pt",
+        input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST,
+        **kwargs
+    ):
+        if isinstance(images, list):
+            images = [to_numpy_array(image) for image in images]
+            images = torch.from_numpy(np.stack(images, axis=0)).float()
+        else:
+            images = to_numpy_array(images)
+            images = torch.from_numpy(images).float()
+
+        if is_video:
+            if images.dim() == 4:
+                images = images.unsqueeze(0)
+            if input_data_format == ChannelDimension.LAST:
+                images = images.permute(0, 1, 4, 2, 3)
+        else:
+            if images.dim() == 3:
+                images = images.unsqueeze(0)
+            if input_data_format == ChannelDimension.LAST:
+                images = images.permute(0, 3, 1, 2)
+        compose_tf = Compose(
+            [
+                Resize(self.image_size, interpolation=InterpolationMode.BICUBIC) if self.do_resize else Lambda(lambda x: x),
+                Lambda(lambda x: x / 255.0) if self.do_rescale else Lambda(lambda x: x),
+                Normalize(self.image_mean, self.image_std) if self.do_normalize else Lambda(lambda x: x),
+            ]
+        )
+        images = compose_tf(images)
+
+        return BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
+
+    def postprocess(
+        self,
+        images: "torch.Tensor",
+        is_video: bool = False,
+        return_tensors: Optional[Union[str, TensorType]] = "np",
+        input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST,
+        **kwargs
+    ):
+        if isinstance(images, np.ndarray):
+            images = torch.from_numpy(images).float()
+        if isinstance(images, list):
+            images = torch.stack(images, dim=0)
+        if not isinstance(images, torch.Tensor):
+            raise ValueError("images must be a torch.Tensor")
+
+        if is_video:
+            if images.dim() == 4:
+                images = images.unsqueeze(0)
+            if input_data_format == ChannelDimension.FIRST:
+                images = images.permute(0, 1, 3, 4, 2)
+        else:
+            if images.dim() == 3:
+                images = images.unsqueeze(0)
+            if input_data_format == ChannelDimension.FIRST:
+                images = images.permute(0, 2, 3, 1)
+
+        if self.do_normalize:
+            images = (images * torch.tensor(self.image_std)) + torch.tensor(self.image_mean)
+        if self.do_rescale:
+            images = torch.clamp(images, 0, 1)
+            images = (images * 255).type(torch.uint8)
+
+        if return_tensors == TensorType.NUMPY:
+            images = images.numpy()
+
+        return BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
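A round-trip sketch for this processor, assuming the module is importable from the working directory; the 64x64 RGB frame is random test data:

import numpy as np
from image_processing_vae import VAEImageProcessor  # or AutoImageProcessor with trust_remote_code

processor = VAEImageProcessor()
frame = np.random.randint(0, 256, size=(64, 64, 3), dtype=np.uint8)

inputs = processor.preprocess([frame])                    # pixel_values: (1, 3, 64, 64) float, roughly in [-1, 1]
restored = processor.postprocess(inputs["pixel_values"])  # pixel_values: (1, 64, 64, 3) uint8
print(inputs["pixel_values"].shape, restored["pixel_values"].shape)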
model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3385c46b833dccf98c5e3ceb6b4a9b174795a85374120706fa0a0c9a42f2197
+size 31338740
modeling_vae.py
ADDED
@@ -0,0 +1,239 @@
+from typing import Optional, Union, Tuple
+from dataclasses import dataclass
+
+import torch
+from torch import nn
+from torch import Tensor
+from transformers import PreTrainedModel
+from transformers.utils import logging, ModelOutput
+
+from torchvision.models import vgg16, VGG16_Weights
+import torch.nn.functional as F
+
+from einops import rearrange
+
+from .configuration_vae import VAEConfig, EncoderType, DecoderType
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+class VAEOutput(ModelOutput):
+    loss: Optional[torch.FloatTensor] = None
+    reconstruction: torch.FloatTensor = None
+    mse_loss: Optional[torch.FloatTensor] = None
+    l1_loss: Optional[torch.FloatTensor] = None
+    perceptual_loss: Optional[torch.FloatTensor] = None
+    dino_loss: Optional[torch.FloatTensor] = None
+    kl_loss: Optional[torch.FloatTensor] = None
+
+
+class Vgg16(nn.Module):
+    # ref https://github.com/dxyang/StyleTransfer/blob/master/vgg.py
+    def __init__(self, layers):
+        super().__init__()
+        features = vgg16(weights=VGG16_Weights.DEFAULT).features
+        self.to_relu_1_2 = nn.Sequential()
+        self.to_relu_2_2 = nn.Sequential()
+        self.to_relu_3_3 = nn.Sequential()
+        self.to_relu_4_3 = nn.Sequential()
+
+        for x in range(4):
+            self.to_relu_1_2.add_module(str(x), features[x])
+        for x in range(4, 9):
+            self.to_relu_2_2.add_module(str(x), features[x])
+        for x in range(9, 16):
+            self.to_relu_3_3.add_module(str(x), features[x])
+        for x in range(16, 23):
+            self.to_relu_4_3.add_module(str(x), features[x])
+
+        # don't need the gradients, just want the features
+        for param in self.parameters():
+            param.requires_grad = False
+
+    def forward(self, x):
+        h = self.to_relu_1_2(x)
+        h_relu_1_2 = h
+        h = self.to_relu_2_2(h)
+        h_relu_2_2 = h
+        h = self.to_relu_3_3(h)
+        h_relu_3_3 = h
+        h = self.to_relu_4_3(h)
+        h_relu_4_3 = h
+        out = (h_relu_1_2, h_relu_2_2, h_relu_3_3, h_relu_4_3)
+        return out
+
+
+class PerceptualLoss(nn.Module):
+    def __init__(self, layers=(3, 8, 15, 22), unnorm_mean=None, unnorm_std=None, weights=None):
+        super().__init__()
+        self.vgg = Vgg16(layers=layers)
+        self.layers = layers
+        self.weights = weights or [1.0 / len(layers)] * len(layers)
+
+    def forward(self, x, y):
+        x_vgg = self.vgg(x)
+        y_vgg = self.vgg(y)
+        loss = 0.0
+        for x_vgg_layer, y_vgg_layer in zip(x_vgg, y_vgg):
+            loss += F.mse_loss(x_vgg_layer, y_vgg_layer)
+        return loss
+
+
+class DinoLoss(nn.Module):
+    def __init__(self, patch_size, use_large=False):
+        super().__init__()
+        size = 'b' if use_large else 's'
+        dino = f'dino_vit{size}{patch_size}'
+        self.vit = torch.hub.load('facebookresearch/dino:main', dino)
+        print('use ', dino)
+        self.vit.eval()
+        for param in self.vit.parameters():
+            param.requires_grad = False
+
+    def forward(self, gt, embed):
+        with torch.no_grad():
+            dino_features = self.vit.prepare_tokens(gt)
+            for blk in self.vit.blocks:
+                dino_features = blk(dino_features)
+            dino_features = self.vit.norm(dino_features)
+            dino_features = dino_features[:, 1:]
+        embed_features = rearrange(embed, 'b c h w -> b (h w) c').contiguous()
+        dtype = embed.dtype
+        dino_loss = 1 - F.cosine_similarity(dino_features.to(torch.float32), embed_features.to(torch.float32), dim=2)
+        dino_loss = dino_loss.mean()
+        dino_loss = dino_loss.to(dtype)
+        return dino_loss
+
+
+class VAEModel(PreTrainedModel):
+    config_class = VAEConfig
+    main_input_name = "s0_img"
+
+    def __init__(self, config: VAEConfig):
+        super().__init__(config)
+        dict_config = config.to_dict()
+        self.encoder = EncoderType[config.encoder_type].value(**dict_config)
+        enc_out_dim = self.config.z_channels * (self.config.resolution[0] // (2 ** (len(self.config.channels_mult) - 1))) ** 2
+        latent_dim = 64
+        self.cond_mlp = nn.Sequential(
+            nn.Linear(enc_out_dim * 2, config.z_channels),
+            nn.ReLU(),
+            nn.Linear(config.z_channels, config.z_channels),
+            nn.ReLU(),
+            nn.Linear(config.z_channels, latent_dim * 2),
+        )
+        self.in_mlp = nn.Sequential(
+            nn.Linear(enc_out_dim, config.z_channels),
+            nn.ReLU(),
+            nn.Linear(config.z_channels, config.z_channels),
+            nn.ReLU(),
+            nn.Linear(config.z_channels, latent_dim * 2),
+        )
+        self.cond_mlp_out = nn.Sequential(
+            nn.Linear(latent_dim + enc_out_dim, config.z_channels),
+            nn.ReLU(),
+            nn.Linear(config.z_channels, config.z_channels),
+            nn.ReLU(),
+            nn.Linear(config.z_channels, enc_out_dim),
+        )
+        self.out_mlp = nn.Sequential(
+            nn.Linear(latent_dim, config.z_channels),
+            nn.ReLU(),
+            nn.Linear(config.z_channels, config.z_channels),
+            nn.ReLU(),
+            nn.Linear(config.z_channels, enc_out_dim),
+        )
+        self.decoder = DecoderType[config.decoder_type].value(**dict_config)
+        if config.w_perceptual > 0:
+            self.perceptual_loss = PerceptualLoss(
+                unnorm_mean=config.image_mean,
+                unnorm_std=config.image_std
+            )
+        if config.w_dino > 0:
+            assert config.z_channels in [384, 768]
+            patch_size = 2 ** (len(config.channels_mult) - 1)
+            self.dino_loss = DinoLoss(patch_size=patch_size)
+        self.log_state = {
+            "loss": None,
+            "mse_loss": None,
+            "l1_loss": None,
+            "perceptual_loss": None,
+            "dino_loss": None,
+            "gt": None,
+            "recon": None,
+        }
+        self.post_init()
+
+    def encode(self, s0_img: Tensor, s1_img: Tensor, a0: Tensor) -> tuple[Tensor, Tensor, Tensor, Tensor]:
+        # s0 = self.encoder(s0_img).reshape(s0_img.shape[0], -1)
+        s0 = None
+        s1 = self.encoder(s1_img).reshape(s1_img.shape[0], -1)
+        # s1_mean_var = self.cond_mlp(torch.cat([s0, s1], dim=1))
+        s1_mean_var = self.in_mlp(s1)
+        s1_mean, s1_logvar = s1_mean_var.chunk(2, dim=1)
+        s1_stddev = torch.exp(s1_logvar * 0.5)
+        s1_latent = s1_mean + s1_stddev * torch.randn_like(s1_mean)
+        return s1_latent, s0, s1_mean, s1_logvar
+
+    def decode(self, s1_latent: Tensor, s0: Tensor) -> Tensor:
+        quant_h = int(self.config.resolution[0] / (2 ** (len(self.config.channels_mult) - 1)))
+        quant_w = int(self.config.resolution[1] / (2 ** (len(self.config.channels_mult) - 1)))
+        # s1_latent = self.cond_mlp_out(torch.cat([s1_latent, s0], dim=1)).reshape(s1_latent.shape[0], self.config.z_channels, quant_h, quant_w)
+        s1_latent = self.out_mlp(s1_latent).reshape(s1_latent.shape[0], self.config.z_channels, quant_h, quant_w)
+        return self.decoder(s1_latent)
+
+    def forward(self,
+                s0_img: Tensor,
+                s1_img: Tensor,
+                action: Tensor,
+                return_loss: bool = True,
+                return_dict: Optional[bool] = None,
+                ) -> Union[Tuple, VAEOutput]:
+        return_dict = return_dict if return_dict is not None else False
+        s1_latent, s0, s1_mean, s1_logvar = self.encode(s0_img, s1_img, action)
+        recon = self.decode(s1_latent, s0)
+
+        loss = None
+        if return_loss:
+            # recon loss
+            mse_loss = F.mse_loss(recon, s1_img)
+            l1_loss = F.l1_loss(recon, s1_img)
+            if self.config.w_perceptual > 0:
+                perceptual_loss = self.perceptual_loss(recon, s1_img)
+            else:
+                perceptual_loss = torch.zeros_like(mse_loss).to(mse_loss.device)
+            if self.config.w_dino > 0:
+                dino_loss = self.dino_loss(s1_img, None)
+            else:
+                dino_loss = torch.zeros_like(mse_loss).to(mse_loss.device)
+            # kl loss
+            kl_loss = torch.mean(-0.5 * torch.sum(1 + s1_logvar - s1_mean**2 - s1_logvar.exp(), dim=1))
+
+            loss = self.config.w_mse * mse_loss + \
+                self.config.w_l1 * l1_loss + \
+                self.config.w_perceptual * perceptual_loss + \
+                self.config.w_dino * dino_loss + \
+                self.config.w_kl * kl_loss
+            if not return_dict:
+                self.log_state["loss"] = loss.item()
+                self.log_state["mse_loss"] = mse_loss.item()
+                self.log_state["l1_loss"] = l1_loss.item()
+                self.log_state["perceptual_loss"] = perceptual_loss.item()
+                self.log_state["dino_loss"] = dino_loss.item()
+                self.log_state["kl_loss"] = kl_loss.item()
+                self.log_state["gt"] = s0_img.clone().detach().cpu()[:4].to(torch.float32)
+                self.log_state["recon"] = recon.clone().detach().cpu()[:4].to(torch.float32)
+                return ((loss,) + (recon,)) if loss is not None else recon
+        return VAEOutput(
+            loss=loss,
+            reconstruction=recon,
+            mse_loss=mse_loss,
+            l1_loss=l1_loss,
+            perceptual_loss=perceptual_loss,
+            dino_loss=dino_loss,
+        )
+
+    def get_last_layer(self):
+        raise NotImplementedError
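A shape smoke-test sketch, continuing from the loading example after the README; the batch size, random images, and zero action tensor are arbitrary placeholders (the action input is accepted by the signature but not consumed on the active encode/decode path):

import torch

batch = {
    "s0_img": torch.randn(2, 3, 64, 64),  # kept for the signature; unused while the cond_mlp path is commented out
    "s1_img": torch.randn(2, 3, 64, 64),  # the frame that gets encoded and reconstructed
    "action": torch.zeros(2, 1),          # placeholder
}
with torch.no_grad():
    loss, recon = model(**batch)          # return_dict defaults to False -> (loss, reconstruction) tuple
print(loss.item(), recon.shape)           # scalar loss, torch.Size([2, 3, 64, 64])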
module_layers.py
ADDED
@@ -0,0 +1,95 @@
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+
+class DoubleConv(nn.Module):
+    def __init__(self, in_channels: int, out_channels: int, mid_channels: int = None):
+        super().__init__()
+        if mid_channels is None:
+            mid_channels = out_channels
+        self.conv = nn.Sequential(
+            nn.Conv2d(in_channels, mid_channels, kernel_size=3, padding=1, bias=False),
+            nn.BatchNorm2d(mid_channels),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(mid_channels, out_channels, kernel_size=3, padding=1, bias=False),
+            nn.BatchNorm2d(out_channels),
+            nn.ReLU(inplace=True)
+        )
+
+    def forward(self, x: Tensor) -> Tensor:
+        return self.conv(x)
+
+
+class Down(nn.Module):
+    def __init__(self, in_channels: int, out_channels: int):
+        super().__init__()
+        self.maxpool_conv = nn.Sequential(
+            nn.MaxPool2d(2),
+            DoubleConv(in_channels, out_channels)
+        )
+
+    def forward(self, x: torch.FloatTensor) -> torch.FloatTensor:
+        return self.maxpool_conv(x)
+
+
+class Up(nn.Module):
+    def __init__(self, in_channels: int, out_channels: int, bilinear: bool = False):
+        super().__init__()
+        if bilinear:
+            self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
+            self.conv = DoubleConv(in_channels, out_channels, in_channels // 2)
+        else:
+            self.up = nn.ConvTranspose2d(in_channels, in_channels, kernel_size=2, stride=2)
+            self.conv = DoubleConv(in_channels, out_channels)
+
+    def forward(self, x):
+        x = self.up(x)
+        return self.conv(x)
+
+
+class Encoder(nn.Module):
+    def __init__(self, z_channels: int, in_channels: int, channels: int, channels_mult: list[int], **ignore_kwargs):
+        super().__init__()
+        self.encoder = nn.ModuleList()
+        num_resolutions = len(channels_mult)
+        in_ch_mult = (1,) + tuple(channels_mult)
+
+        self.encoder.append(DoubleConv(in_channels, channels))
+        for i_level in range(num_resolutions):
+            block_in = channels * in_ch_mult[i_level]
+            block_out = channels * channels_mult[i_level]
+            if i_level != num_resolutions - 1:
+                self.encoder.append(Down(block_in, block_out))
+            else:
+                self.encoder.append(DoubleConv(block_in, block_out))
+            block_in = block_out
+        self.encoder.append(nn.Conv2d(block_in, z_channels, kernel_size=(1, 1)))
+
+    def forward(self, x: torch.FloatTensor) -> torch.FloatTensor:
+        for layer in self.encoder:
+            x = layer(x)
+        return x
+
+
+class Decoder(nn.Module):
+    def __init__(self, z_channels: int, out_channels: int, channels: int, channels_mult: list[int], **ignore_kwargs):
+        super().__init__()
+        self.decoder = nn.ModuleList()
+        num_resolutions = len(channels_mult)
+
+        block_in = channels * channels_mult[num_resolutions - 1]
+        self.decoder.append(nn.Conv2d(z_channels, block_in, kernel_size=(1, 1)))
+        for i_level in reversed(range(num_resolutions)):
+            block_out = channels * channels_mult[i_level]
+            if i_level != 0:
+                self.decoder.append(Up(block_in, block_out))
+            else:
+                self.decoder.append(DoubleConv(block_in, block_out))
+            block_in = block_out
+        self.final_conv = nn.Conv2d(block_in, out_channels, kernel_size=1)
+
+    def forward(self, x):
+        for layer in self.decoder:
+            x = layer(x)
+        return self.final_conv(x)
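A shape-check sketch for this Simple encoder/decoder pair under the config.json values (four Down stages take 64x64 to 4x4 before the 1x1 projection to z_channels, and the decoder mirrors it back), assuming the module is importable directly:

import torch
from module_layers import Encoder, Decoder

enc = Encoder(z_channels=64, in_channels=3, channels=128, channels_mult=[1, 1, 1, 2, 2])
dec = Decoder(z_channels=64, out_channels=3, channels=128, channels_mult=[1, 1, 1, 2, 2])

x = torch.randn(1, 3, 64, 64)
z = enc(x)
print(z.shape)       # torch.Size([1, 64, 4, 4])
print(dec(z).shape)  # torch.Size([1, 3, 64, 64])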
module_layers_attn.py
ADDED
@@ -0,0 +1,335 @@
+# pytorch_diffusion + derived encoder decoder
+# Ref [https://github.com/CompVis/taming-transformers/blob/master/taming/modules/diffusionmodules/model.py]
+import torch
+import torch.nn as nn
+import numpy as np
+
+
+def nonlinearity(x):
+    # swish
+    return x*torch.sigmoid(x)
+
+
+def Normalize(in_channels):
+    return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
+
+
+class Upsample(nn.Module):
+    def __init__(self, in_channels, with_conv):
+        super().__init__()
+        self.with_conv = with_conv
+        if self.with_conv:
+            self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
+
+    def forward(self, x):
+        x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
+        if self.with_conv:
+            x = self.conv(x)
+        return x
+
+
+class Downsample(nn.Module):
+    def __init__(self, in_channels, with_conv):
+        super().__init__()
+        self.with_conv = with_conv
+        if self.with_conv:
+            # no asymmetric padding in torch conv, must do it ourselves
+            self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
+
+    def forward(self, x):
+        if self.with_conv:
+            pad = (0, 1, 0, 1)
+            x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
+            x = self.conv(x)
+        else:
+            x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
+        return x
+
+
+class ResnetBlock(nn.Module):
+    def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False,
+                 dropout, temb_channels=512):
+        super().__init__()
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+        self.use_conv_shortcut = conv_shortcut
+
+        self.norm1 = Normalize(in_channels)
+        self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        if temb_channels > 0:
+            self.temb_proj = torch.nn.Linear(temb_channels, out_channels)
+        self.norm2 = Normalize(out_channels)
+        self.dropout = torch.nn.Dropout(dropout)
+        self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        if self.in_channels != self.out_channels:
+            if self.use_conv_shortcut:
+                self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+            else:
+                self.nin_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
+
+    def forward(self, x, temb):
+        h = x
+        h = self.norm1(h)
+        h = nonlinearity(h)
+        h = self.conv1(h)
+
+        if temb is not None:
+            h = h + self.temb_proj(nonlinearity(temb))[:,:,None,None]
+
+        h = self.norm2(h)
+        h = nonlinearity(h)
+        h = self.dropout(h)
+        h = self.conv2(h)
+
+        if self.in_channels != self.out_channels:
+            if self.use_conv_shortcut:
+                x = self.conv_shortcut(x)
+            else:
+                x = self.nin_shortcut(x)
+
+        return x + h
+
+
+class AttnBlock(nn.Module):
+    def __init__(self, in_channels):
+        super().__init__()
+
+        self.norm = Normalize(in_channels)
+        self.q = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.k = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.v = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.proj_out = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+
+    def forward(self, x):
+        h_ = x
+        h_ = self.norm(h_)
+        q = self.q(h_)
+        k = self.k(h_)
+        v = self.v(h_)
+
+        # compute attention
+        b,c,h,w = q.shape
+        q = q.reshape(b,c,h*w)
+        q = q.permute(0,2,1)   # b,hw,c
+        k = k.reshape(b,c,h*w)  # b,c,hw
+        w_ = torch.bmm(q,k)     # b,hw,hw    w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
+        w_ = w_ * (int(c)**(-0.5))
+        w_ = torch.nn.functional.softmax(w_, dim=2)
+
+        # attend to values
+        v = v.reshape(b,c,h*w)
+        w_ = w_.permute(0,2,1)  # b,hw,hw (first hw of k, second of q)
+        h_ = torch.bmm(v,w_)    # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
+        h_ = h_.reshape(b,c,h,w)
+
+        h_ = self.proj_out(h_)
+
+        return x + h_
+
+
+class Encoder(nn.Module):
+    def __init__(self,
+                 in_channels: int,
+                 channels: int,
+                 channels_mult: list[int],
+                 num_res_blocks: int,
+                 attn_resolutions: int,
+                 dropout: float,
+                 resolution: list[int],
+                 z_channels: int,
+                 **ignore_kwargs):
+        super().__init__()
+        self.ch = channels
+        self.temb_ch = 0
+        self.num_resolutions = len(channels_mult)
+        self.num_res_blocks = num_res_blocks
+        self.resolution = resolution
+        self.in_channels = in_channels
+
+        # downsampling
+        self.conv_in = torch.nn.Conv2d(in_channels,
+                                       self.ch,
+                                       kernel_size=3,
+                                       stride=1,
+                                       padding=1)
+
+        curr_res = resolution if isinstance(resolution, int) else resolution[0]
+        in_ch_mult = (1,)+tuple(channels_mult)
+        self.down = nn.ModuleList()
+        for i_level in range(self.num_resolutions):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_in = channels*in_ch_mult[i_level]
+            block_out = channels*channels_mult[i_level]
+            for i_block in range(self.num_res_blocks):
+                block.append(ResnetBlock(in_channels=block_in,
+                                         out_channels=block_out,
+                                         temb_channels=self.temb_ch,
+                                         dropout=dropout))
+                block_in = block_out
+                if curr_res in attn_resolutions:
+                    attn.append(AttnBlock(block_in))
+            down = nn.Module()
+            down.block = block
+            down.attn = attn
+            if i_level != self.num_resolutions-1:
+                down.downsample = Downsample(block_in, True)
+                curr_res = curr_res // 2
+            self.down.append(down)
+
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = ResnetBlock(in_channels=block_in,
+                                       out_channels=block_in,
+                                       temb_channels=self.temb_ch,
+                                       dropout=dropout)
+        self.mid.attn_1 = AttnBlock(block_in)
+        self.mid.block_2 = ResnetBlock(in_channels=block_in,
+                                       out_channels=block_in,
+                                       temb_channels=self.temb_ch,
+                                       dropout=dropout)
+
+        # end
+        self.norm_out = Normalize(block_in)
+        self.conv_out = torch.nn.Conv2d(block_in,
+                                        z_channels,
+                                        kernel_size=3,
+                                        stride=1,
+                                        padding=1)
+
+    def forward(self, x):
+        # assert x.shape[2] == x.shape[3] == self.resolution, "{}, {}, {}".format(x.shape[2], x.shape[3], self.resolution)
+
+        # timestep embedding
+        temb = None
+
+        # downsampling
+        hs = [self.conv_in(x)]
+        for i_level in range(self.num_resolutions):
+            for i_block in range(self.num_res_blocks):
+                h = self.down[i_level].block[i_block](hs[-1], temb)
+                if len(self.down[i_level].attn) > 0:
+                    h = self.down[i_level].attn[i_block](h)
+                hs.append(h)
+            if i_level != self.num_resolutions-1:
+                hs.append(self.down[i_level].downsample(hs[-1]))
+
+        # middle
+        h = hs[-1]
+        h = self.mid.block_1(h, temb)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h, temb)
+
+        # end
+        h = self.norm_out(h)
+        h = nonlinearity(h)
+        h = self.conv_out(h)
+        return h
+
+
+class Decoder(nn.Module):
+    def __init__(self,
+                 out_channels: int,
+                 channels: int,
+                 channels_mult: list[int],
+                 num_res_blocks: int,
+                 attn_resolutions: list[int],
+                 dropout: float,
+                 resolution: list[int],
+                 z_channels: int,
+                 **ignorekwargs):
+        super().__init__()
+        self.ch = channels
+        self.temb_ch = 0
+        self.num_resolutions = len(channels_mult)
+        self.num_res_blocks = num_res_blocks
+        self.resolution = resolution
+
+        # compute in_ch_mult, block_in and curr_res at lowest res
+        in_ch_mult = (1,)+tuple(channels_mult)
+        block_in = channels*channels_mult[self.num_resolutions-1]
+        curr_res = resolution if isinstance(resolution, int) else resolution[0]
+        curr_res = curr_res // 2**(self.num_resolutions-1)
+        self.z_shape = (1,z_channels,curr_res,curr_res)
+        # print("Working with z of shape {} = {} dimensions.".format(
+        #     self.z_shape, np.prod(self.z_shape)))
+
+        # z to block_in
+        self.conv_in = torch.nn.Conv2d(z_channels,
+                                       block_in,
+                                       kernel_size=3,
+                                       stride=1,
+                                       padding=1)
+
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = ResnetBlock(in_channels=block_in,
+                                       out_channels=block_in,
+                                       temb_channels=self.temb_ch,
+                                       dropout=dropout)
+        self.mid.attn_1 = AttnBlock(block_in)
+        self.mid.block_2 = ResnetBlock(in_channels=block_in,
+                                       out_channels=block_in,
+                                       temb_channels=self.temb_ch,
+                                       dropout=dropout)
+
+        # upsampling
+        self.up = nn.ModuleList()
+        for i_level in reversed(range(self.num_resolutions)):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_out = channels*channels_mult[i_level]
+            for i_block in range(self.num_res_blocks+1):
+                block.append(ResnetBlock(in_channels=block_in,
+                                         out_channels=block_out,
+                                         temb_channels=self.temb_ch,
+                                         dropout=dropout))
+                block_in = block_out
+                if curr_res in attn_resolutions:
+                    attn.append(AttnBlock(block_in))
+            up = nn.Module()
+            up.block = block
+            up.attn = attn
+            if i_level != 0:
+                up.upsample = Upsample(block_in, True)
+                curr_res = curr_res * 2
+            self.up.insert(0, up)  # prepend to get consistent order
+
+        # end
+        self.norm_out = Normalize(block_in)
+        self.conv_out = torch.nn.Conv2d(block_in,
+                                        out_channels,
+                                        kernel_size=3,
+                                        stride=1,
+                                        padding=1)
+
+    def forward(self, z):
+        # assert z.shape[1:] == self.z_shape[1:]
+        self.last_z_shape = z.shape
+
+        # timestep embedding
+        temb = None
+
+        # z to block_in
+        h = self.conv_in(z)
+
+        # middle
+        h = self.mid.block_1(h, temb)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h, temb)
+
+        # upsampling
+        for i_level in reversed(range(self.num_resolutions)):
+            for i_block in range(self.num_res_blocks+1):
+                h = self.up[i_level].block[i_block](h, temb)
+                if len(self.up[i_level].attn) > 0:
+                    h = self.up[i_level].attn[i_block](h)
+            if i_level != 0:
+                h = self.up[i_level].upsample(h)
+
+        h = self.norm_out(h)
+        h = nonlinearity(h)
+        h = self.conv_out(h)
+        return h
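The bmm/softmax sequence in AttnBlock.forward is plain single-head self-attention over the h*w spatial positions, so it matches PyTorch's fused scaled_dot_product_attention up to floating-point error; a standalone equivalence sketch with random tensors (not part of the repo code):

import torch
import torch.nn.functional as F

b, c, h, w = 2, 128, 8, 8
q, k, v = (torch.randn(b, c, h, w) for _ in range(3))

# explicit formulation, as in AttnBlock.forward
w_ = torch.bmm(q.reshape(b, c, h * w).permute(0, 2, 1), k.reshape(b, c, h * w)) * (c ** -0.5)
w_ = torch.softmax(w_, dim=2)
ref = torch.bmm(v.reshape(b, c, h * w), w_.permute(0, 2, 1)).reshape(b, c, h, w)

# fused kernel, treating each spatial position as a token of dimension c
qs, ks, vs = (t.reshape(b, c, h * w).permute(0, 2, 1) for t in (q, k, v))
out = F.scaled_dot_product_attention(qs, ks, vs).permute(0, 2, 1).reshape(b, c, h, w)

print(torch.allclose(ref, out, atol=1e-5))  # True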
my_config.json
ADDED
@@ -0,0 +1,85 @@
+{
+    "output_dir": "logs/vae_test",
+    "overwrite_output_dir": true,
+    "model_type": "vae",
+    "report_to": [
+        "wandb"
+    ],
+    "wandb_project_name": "train_vae",
+    "run_name": "train_vae",
+    "num_train_epochs": 0.1,
+    "logging_strategy": "steps",
+    "logging_steps": 0.01,
+    "save_strategy": "epoch",
+    "save_steps": 1,
+    "eval_strategy": "no",
+    "eval_steps": 0.1,
+    "do_train": true,
+    "do_eval": false,
+    "resume_from_checkpoint": null,
+    "remove_unused_columns": false,
+    "per_device_train_batch_size": 128,
+    "per_device_eval_batch_size": 32,
+    "gradient_accumulation_steps": 1,
+    "max_grad_norm": 5.0,
+    "bf16": true,
+    "fp16": false,
+    "use_cpu": false,
+    "save_only_model": false,
+    "adam_beta1": 0.5,
+    "adam_beta2": 0.9,
+    "learning_rate": 0.0002,
+    "weight_decay": 0.01,
+    "warmup_steps": 1000,
+    "lr_scheduler_type": "cosine_with_min_lr",
+    "lr_scheduler_kwargs": {
+        "min_lr": 1e-05
+    },
+    "train_file": "/home/pj24002027/ku50001104/data/mutual_dataset/few_data/train.jsonl",
+    "validation_file": "/home/pj24002027/ku50001104/data/mutual_dataset/few_data/test.jsonl",
+    "max_eval_samples": 2048,
+    "max_train_samples": 2048,
+    "exp_setup": null,
+    "use_sensor_keys": "CAM_FRONT",
+    "dataloader_num_workers": 1,
+    "disable_tqdm": true,
+    "config_overrides": {
+        "encoder_type": "Simple",
+        "decoder_type": "Simple",
+        "quantizer_type": "VQ",
+        "resolution": [
+            64,
+            64
+        ],
+        "z_channels": 64,
+        "codebook_dim": 0,
+        "codebook_size": 0,
+        "num_res_blocks": 2,
+        "channels": 128,
+        "channels_mult": [
+            1,
+            1,
+            1,
+            2,
+            2
+        ],
+        "attn_resolutions": [],
+        "drop_out": 0,
+        "image_mean": [
+            0.5,
+            0.5,
+            0.5
+        ],
+        "image_std": [
+            0.5,
+            0.5,
+            0.5
+        ],
+        "w_mse": 2,
+        "w_l1": 0.2,
+        "w_perceptual": 0,
+        "w_commit": 0,
+        "w_dino": 0,
+        "w_kl": 1
+    }
+}
preprocessor_config.json
ADDED
@@ -0,0 +1,24 @@
+{
+  "auto_map": {
+    "AutoImageProcessor": "image_processing_vae.VAEImageProcessor"
+  },
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_processor_type": "VAEImageProcessor",
+  "image_size": [
+    64,
+    64
+  ],
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "rescale_factor": 0.00392156862745098
+}
train_results.json
ADDED
@@ -0,0 +1,7 @@
+{
+    "epoch": 0.125,
+    "train_loss": 1.1237460374832153,
+    "train_runtime": 20.6388,
+    "train_samples_per_second": 9.923,
+    "train_steps_per_second": 0.097
+}
trainer_state.json
ADDED
@@ -0,0 +1,57 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.125,
+  "eval_steps": 1,
+  "global_step": 2,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0625,
+      "grad_norm": 4.648393154144287,
+      "learning_rate": 0.0,
+      "loss": 1.1228,
+      "step": 1
+    },
+    {
+      "epoch": 0.125,
+      "grad_norm": 4.56954288482666,
+      "learning_rate": 2.0000000000000002e-07,
+      "loss": 1.1247,
+      "step": 2
+    },
+    {
+      "epoch": 0.125,
+      "step": 2,
+      "total_flos": 147645328785408.0,
+      "train_loss": 1.1237460374832153,
+      "train_runtime": 20.6388,
+      "train_samples_per_second": 9.923,
+      "train_steps_per_second": 0.097
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 2,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 1,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 147645328785408.0,
+  "train_batch_size": 128,
+  "trial_name": null,
+  "trial_params": null
+}
training_args.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:494f49e8ffe42f196e661babfa0f4516d40ccf2f9a923a613986c80ce7a70477
+size 5368