root committed · Commit 310ac15 · Parent(s): 98dcaae

first upload

Files changed:
- README.md +30 -3
- model_index.json +24 -0
- scheduler/scheduler.py +175 -0
- scheduler/scheduler_config.json +7 -0
- text_encoder/config.json +24 -0
- text_encoder/model.safetensors +3 -0
- tokenizer/merges.txt +0 -0
- tokenizer/special_tokens_map.json +24 -0
- tokenizer/tokenizer_config.json +38 -0
- tokenizer/vocab.json +0 -0
- transformer/config.json +22 -0
- transformer/diffusion_pytorch_model.safetensors +3 -0
- vqvae/config.json +39 -0
- vqvae/diffusion_pytorch_model.safetensors +3 -0
README.md CHANGED

````markdown
---
pipeline_tag: text-to-image
license: apache-2.0
tags:
- Non-Autoregressive
---

# Monetico: An Efficient Reproduction of Meissonic for Text-to-Image Synthesis

## Introduction

Similar to Meissonic, Monetico is a non-autoregressive masked image modeling text-to-image synthesis model capable of generating high-resolution images. It is designed to run efficiently on consumer-grade graphics cards.

Monetico is an efficient reproduction of Meissonic. Trained on 8 H100 GPUs for approximately one week, Monetico generates high-quality 512x512 images comparable to those produced by Meissonic and SDXL.

Monetico was developed by Collov Labs. We extend our gratitude to @MeissonFlow and @viiika for their valuable advice on efficient training.

## Usage

For detailed usage instructions, please refer to the [GitHub repository](https://github.com/viiika/Meissonic).

## Citation

If you find this work helpful, please consider citing:

```bibtex
@article{bai2024meissonic,
  title={Meissonic: Revitalizing Masked Generative Transformers for Efficient High-Resolution Text-to-Image Synthesis},
  author={Bai, Jinbin and Ye, Tian and Chow, Wei and Song, Enxin and Chen, Qing-Guo and Li, Xiangtai and Dong, Zhen and Zhu, Lei and Yan, Shuicheng},
  journal={arXiv preprint arXiv:2410.08261},
  year={2024}
}
```
````
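The snippet below is only a shape-of-the-API sketch, not the repository's confirmed interface: the `Pipeline` and `Transformer2DModel` class names come from `model_index.json` below, but their import paths, the `Collov-Labs/Monetico` repo id, and the call signature are assumptions; follow the GitHub instructions above for the real entry point.

```python
import torch
# Hypothetical import paths; the actual modules ship with the GitHub repo above.
from transformer import Transformer2DModel
from pipeline import Pipeline

repo = "Collov-Labs/Monetico"  # assumed repo id; substitute the actual checkpoint path
transformer = Transformer2DModel.from_pretrained(repo, subfolder="transformer")
pipe = Pipeline.from_pretrained(repo, transformer=transformer).to("cuda")

image = pipe(
    prompt="a minimalist living room with walnut furniture",
    height=512, width=512,          # Monetico is trained at 512x512
    num_inference_steps=48,
).images[0]
image.save("monetico_sample.png")
```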
model_index.json ADDED

```json
{
  "_class_name": "Pipeline",
  "_diffusers_version": "0.30.2",
  "scheduler": [
    "scheduler",
    "Scheduler"
  ],
  "text_encoder": [
    "transformers",
    "CLIPTextModelWithProjection"
  ],
  "tokenizer": [
    "transformers",
    "CLIPTokenizer"
  ],
  "transformer": [
    "transformer",
    "Transformer2DModel"
  ],
  "vqvae": [
    "diffusers",
    "VQModel"
  ]
}
```
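Each non-underscore entry maps a pipeline component to a `[source, class]` pair: `transformers` and `diffusers` name installable libraries, while `scheduler` and `transformer` name subfolder modules shipped with this checkpoint. A simplified sketch of how such an index is read (not diffusers' actual loader):

```python
import json

with open("model_index.json") as f:
    index = json.load(f)

for component, spec in index.items():
    if component.startswith("_"):  # "_class_name" and "_diffusers_version" are metadata
        continue
    source, cls = spec
    print(f"{component}: class {cls} from {source}")
```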
scheduler/scheduler.py ADDED

```python
# Copyright 2024 The HuggingFace Team and The MeissonFlow Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import torch

from diffusers.configuration_utils import ConfigMixin, register_to_config
from diffusers.utils import BaseOutput
from diffusers.schedulers.scheduling_utils import SchedulerMixin


def gumbel_noise(t, generator=None):
    device = generator.device if generator is not None else t.device
    noise = torch.zeros_like(t, device=device).uniform_(0, 1, generator=generator).to(t.device)
    return -torch.log((-torch.log(noise.clamp(1e-20))).clamp(1e-20))


def mask_by_random_topk(mask_len, probs, temperature=1.0, generator=None):
    confidence = torch.log(probs.clamp(1e-20)) + temperature * gumbel_noise(probs, generator=generator)
    sorted_confidence = torch.sort(confidence, dim=-1).values
    cut_off = torch.gather(sorted_confidence, 1, mask_len.long())
    masking = confidence < cut_off
    return masking


@dataclass
class SchedulerOutput(BaseOutput):
    """
    Output class for the scheduler's `step` function output.

    Args:
        prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
            denoising loop.
        pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
            The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
            `pred_original_sample` can be used to preview progress or for guidance.
    """

    prev_sample: torch.Tensor
    pred_original_sample: torch.Tensor = None


class Scheduler(SchedulerMixin, ConfigMixin):
    order = 1

    temperatures: torch.Tensor

    @register_to_config
    def __init__(
        self,
        mask_token_id: int,
        masking_schedule: str = "cosine",
    ):
        self.temperatures = None
        self.timesteps = None

    def set_timesteps(
        self,
        num_inference_steps: int,
        temperature: Union[int, Tuple[int, int], List[int]] = (2, 0),
        device: Union[str, torch.device] = None,
    ):
        self.timesteps = torch.arange(num_inference_steps, device=device).flip(0)

        if isinstance(temperature, (tuple, list)):
            self.temperatures = torch.linspace(temperature[0], temperature[1], num_inference_steps, device=device)
        else:
            self.temperatures = torch.linspace(temperature, 0.01, num_inference_steps, device=device)

    def step(
        self,
        model_output: torch.Tensor,
        timestep: torch.long,
        sample: torch.LongTensor,
        starting_mask_ratio: int = 1,
        generator: Optional[torch.Generator] = None,
        return_dict: bool = True,
    ) -> Union[SchedulerOutput, Tuple]:
        two_dim_input = sample.ndim == 3 and model_output.ndim == 4

        if two_dim_input:
            batch_size, codebook_size, height, width = model_output.shape
            sample = sample.reshape(batch_size, height * width)
            model_output = model_output.reshape(batch_size, codebook_size, height * width).permute(0, 2, 1)

        unknown_map = sample == self.config.mask_token_id

        probs = model_output.softmax(dim=-1)

        device = probs.device
        probs_ = probs.to(generator.device) if generator is not None else probs  # handles when generator is on CPU
        if probs_.device.type == "cpu" and probs_.dtype != torch.float32:
            probs_ = probs_.float()  # multinomial is not implemented for cpu half precision
        probs_ = probs_.reshape(-1, probs.size(-1))
        pred_original_sample = torch.multinomial(probs_, 1, generator=generator).to(device=device)
        pred_original_sample = pred_original_sample[:, 0].view(*probs.shape[:-1])
        pred_original_sample = torch.where(unknown_map, pred_original_sample, sample)

        if timestep == 0:
            prev_sample = pred_original_sample
        else:
            seq_len = sample.shape[1]
            step_idx = (self.timesteps == timestep).nonzero()
            ratio = (step_idx + 1) / len(self.timesteps)

            if self.config.masking_schedule == "cosine":
                mask_ratio = torch.cos(ratio * math.pi / 2)
            elif self.config.masking_schedule == "linear":
                mask_ratio = 1 - ratio
            else:
                raise ValueError(f"unknown masking schedule {self.config.masking_schedule}")

            mask_ratio = starting_mask_ratio * mask_ratio

            mask_len = (seq_len * mask_ratio).floor()
            # do not mask more than amount previously masked
            mask_len = torch.min(unknown_map.sum(dim=-1, keepdim=True) - 1, mask_len)
            # mask at least one
            mask_len = torch.max(torch.tensor([1], device=model_output.device), mask_len)

            selected_probs = torch.gather(probs, -1, pred_original_sample[:, :, None])[:, :, 0]
            # Ignores the tokens given in the input by overwriting their confidence.
            selected_probs = torch.where(unknown_map, selected_probs, torch.finfo(selected_probs.dtype).max)

            masking = mask_by_random_topk(mask_len, selected_probs, self.temperatures[step_idx], generator)

            # Masks tokens with lower confidence.
            prev_sample = torch.where(masking, self.config.mask_token_id, pred_original_sample)

        if two_dim_input:
            prev_sample = prev_sample.reshape(batch_size, height, width)
            pred_original_sample = pred_original_sample.reshape(batch_size, height, width)

        if not return_dict:
            return (prev_sample, pred_original_sample)

        return SchedulerOutput(prev_sample, pred_original_sample)

    def add_noise(self, sample, timesteps, generator=None):
        step_idx = (self.timesteps == timesteps).nonzero()
        ratio = (step_idx + 1) / len(self.timesteps)

        if self.config.masking_schedule == "cosine":
            mask_ratio = torch.cos(ratio * math.pi / 2)
        elif self.config.masking_schedule == "linear":
            mask_ratio = 1 - ratio
        else:
            raise ValueError(f"unknown masking schedule {self.config.masking_schedule}")

        mask_indices = (
            torch.rand(
                sample.shape, device=generator.device if generator is not None else sample.device, generator=generator
            ).to(sample.device)
            < mask_ratio
        )

        masked_sample = sample.clone()

        masked_sample[mask_indices] = self.config.mask_token_id

        return masked_sample
```
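Below is a minimal, self-contained sketch of the masked-decoding loop this scheduler drives: start from a fully masked token grid, and at each step sample token predictions, keep the most confident ones, and re-mask the rest. The random logits stand in for the transformer's output; `mask_token_id=8255` matches `scheduler_config.json` below, and the import assumes the file above sits at `scheduler/scheduler.py` under the working directory.

```python
import torch
from scheduler.scheduler import Scheduler  # the file shown above (path assumed importable)

scheduler = Scheduler(mask_token_id=8255, masking_schedule="cosine")
scheduler.set_timesteps(num_inference_steps=16, temperature=(2, 0))

batch, seq_len, vocab = 1, 32 * 32, 8256       # 32x32 token grid; see vqvae/config.json
sample = torch.full((batch, seq_len), 8255, dtype=torch.long)  # start fully masked

for t in scheduler.timesteps:
    logits = torch.randn(batch, seq_len, vocab)            # stand-in for model(sample, ...)
    sample = scheduler.step(logits, t, sample).prev_sample  # unmask high-confidence tokens

print((sample == 8255).sum().item())  # 0: every position is decoded after the final step
```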
scheduler/scheduler_config.json ADDED

```json
{
  "_class_name": "Scheduler",
  "_diffusers_version": "0.30.2",
  "mask_token_id": 8255,
  "masking_schedule": "cosine"
}
```
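Per the `step` method above, the `cosine` schedule leaves a fraction cos(((i+1)/N) * π/2) of tokens masked after step i of N, so early steps commit only a few confident tokens and the later steps commit most of them:

```python
import math

N = 12  # num_inference_steps
for i in range(N):
    mask_ratio = math.cos((i + 1) / N * math.pi / 2)
    print(f"after step {i + 1:2d}: {mask_ratio:5.1%} of tokens still masked")
```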
text_encoder/config.json ADDED

```json
{
  "architectures": [
    "CLIPTextModelWithProjection"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "dropout": 0.0,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_size": 1024,
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 77,
  "model_type": "clip_text_model",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "projection_dim": 1024,
  "torch_dtype": "float32",
  "transformers_version": "4.44.2",
  "vocab_size": 49408
}
```
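This is a stock CLIP text model (24 layers, hidden size 1024, projection dim 1024), so it loads with standard `transformers` classes. A small sketch, with the repo id being an assumption:

```python
from transformers import CLIPTextModelWithProjection, CLIPTokenizer

repo = "Collov-Labs/Monetico"  # assumed repo id; substitute the actual checkpoint path
text_encoder = CLIPTextModelWithProjection.from_pretrained(repo, subfolder="text_encoder")
tokenizer = CLIPTokenizer.from_pretrained(repo, subfolder="tokenizer")

tokens = tokenizer(
    "a photo of an astronaut riding a horse",
    padding="max_length", max_length=77, return_tensors="pt",
)
out = text_encoder(**tokens)
print(out.text_embeds.shape)        # torch.Size([1, 1024]): pooled, projected embedding
print(out.last_hidden_state.shape)  # torch.Size([1, 77, 1024]): per-token hidden states
```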
text_encoder/model.safetensors ADDED

```
version https://git-lfs.github.com/spec/v1
oid sha256:5ed02ba1546554a152c5e1f4920ba14466e3749e7feb42d8111857a8ed510574
size 1416177568
```
tokenizer/merges.txt ADDED

The diff for this file is too large to render. See raw diff.
tokenizer/special_tokens_map.json ADDED

```json
{
  "bos_token": {
    "content": "<|startoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": "!",
  "unk_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  }
}
```
tokenizer/tokenizer_config.json ADDED

```json
{
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "0": {
      "content": "!",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "49406": {
      "content": "<|startoftext|>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "49407": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<|startoftext|>",
  "clean_up_tokenization_spaces": true,
  "do_lower_case": true,
  "eos_token": "<|endoftext|>",
  "errors": "replace",
  "model_max_length": 77,
  "pad_token": "!",
  "tokenizer_class": "CLIPTokenizer",
  "unk_token": "<|endoftext|>"
}
```
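Note the padding setup: CLIP's vocabulary has no dedicated pad token, so token id 0 ("!") is registered as `pad_token`. A quick check (repo id assumed):

```python
from transformers import CLIPTokenizer

tokenizer = CLIPTokenizer.from_pretrained("Collov-Labs/Monetico", subfolder="tokenizer")  # assumed repo id
print(tokenizer.pad_token, tokenizer.pad_token_id)  # ! 0
print(tokenizer("hello world", padding="max_length", max_length=8).input_ids)
# e.g. [49406, ..., 49407, 0, 0, ...]: bos, prompt tokens, eos, then id-0 padding
```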
tokenizer/vocab.json ADDED

The diff for this file is too large to render. See raw diff.
transformer/config.json ADDED

```json
{
  "_class_name": "Transformer2DModel",
  "_diffusers_version": "0.30.2",
  "attention_head_dim": 128,
  "axes_dims_rope": [
    16,
    56,
    56
  ],
  "codebook_size": 8192,
  "downsample": true,
  "guidance_embeds": false,
  "in_channels": 64,
  "joint_attention_dim": 1024,
  "num_attention_heads": 8,
  "num_layers": 14,
  "num_single_layers": 28,
  "patch_size": 1,
  "pooled_projection_dim": 1024,
  "upsample": true,
  "vocab_size": 8256
}
```
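A few plain-arithmetic consistency checks tie this config to the others in the repo (the interpretation of the 64 extra ids as special tokens is an inference from the configs, not documented here):

```python
# Consistency checks between the configs in this checkpoint:
attention_head_dim, num_attention_heads = 128, 8
print(attention_head_dim * num_attention_heads)  # 1024: matches joint_attention_dim and pooled_projection_dim
print(sum([16, 56, 56]))                         # 128: axes_dims_rope adds up to attention_head_dim
print(8192 + 64)                                 # 8256: vocab_size = codebook_size + 64 extra ids
print(8256 - 1)                                  # 8255: the scheduler's mask_token_id is the last vocab id
```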
transformer/diffusion_pytorch_model.safetensors ADDED

```
version https://git-lfs.github.com/spec/v1
oid sha256:1f6db36e88e25b7cf8f9a7c90f0084a760e81147324c3a33b079766f8d2eec9d
size 3994323336
```
vqvae/config.json ADDED

```json
{
  "_class_name": "VQModel",
  "_diffusers_version": "0.30.2",
  "act_fn": "silu",
  "block_out_channels": [
    128,
    256,
    256,
    512,
    768
  ],
  "down_block_types": [
    "DownEncoderBlock2D",
    "DownEncoderBlock2D",
    "DownEncoderBlock2D",
    "DownEncoderBlock2D",
    "DownEncoderBlock2D"
  ],
  "in_channels": 3,
  "latent_channels": 64,
  "layers_per_block": 2,
  "lookup_from_codebook": true,
  "mid_block_add_attention": false,
  "norm_num_groups": 32,
  "norm_type": "group",
  "num_vq_embeddings": 8192,
  "out_channels": 3,
  "sample_size": 32,
  "scaling_factor": 0.18215,
  "up_block_types": [
    "UpDecoderBlock2D",
    "UpDecoderBlock2D",
    "UpDecoderBlock2D",
    "UpDecoderBlock2D",
    "UpDecoderBlock2D"
  ],
  "vq_embed_dim": null,
  "force_upcast": true
}
```
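Five encoder blocks give four 2x downsampling stages, so a 512x512 image maps to a 32x32 latent grid (the `sample_size` above), i.e. 1024 discrete tokens for the transformer. This component is diffusers' stock `VQModel`; a minimal sketch with an assumed repo id:

```python
import torch
from diffusers import VQModel

vqvae = VQModel.from_pretrained("Collov-Labs/Monetico", subfolder="vqvae")  # assumed repo id

with torch.no_grad():
    image = torch.randn(1, 3, 512, 512)   # stand-in for a real image normalized to [-1, 1]
    latents = vqvae.encode(image).latents
print(latents.shape)  # torch.Size([1, 64, 32, 32]): 32 * 32 = 1024 token positions
```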
vqvae/diffusion_pytorch_model.safetensors ADDED

```
version https://git-lfs.github.com/spec/v1
oid sha256:1241a5c88b635af4f8cfb268e388ccaa70f55a458a473d68943e5c28d7b7f762
size 585009980
```