Spaces:

Alpha-VLLM
/

Lumina-Next-T2I

Running on Zero

App Files Files Community

PommesPeter commited on May 16, 2024

Commit

db5524c

verified ·

1 Parent(s): be34a3d

Update models/model.py

Browse files

Files changed (1) hide show

models/model.py +318 -198

models/model.py CHANGED Viewed

@@ -1,25 +1,9 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-# --------------------------------------------------------
-# References:
-# GLIDE: https://github.com/openai/glide-text2im
-# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
-# --------------------------------------------------------
 import functools
 import logging
 import math
 from typing import Optional, Tuple, List
-# from apex.normalization import FusedRMSNorm as RMSNorm
 from .components import RMSNorm
-import fairscale.nn.model_parallel.initialize as fs_init
-from fairscale.nn.model_parallel.layers import (
-    ColumnParallelLinear, RowParallelLinear, ParallelEmbedding,
-)
 from flash_attn import flash_attn_varlen_func
 from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
 import torch
@@ -38,22 +22,25 @@ def modulate(x, shift, scale):
 #             Embedding Layers for Timesteps and Class Labels               #
 #############################################################################
 class ParallelTimestepEmbedder(nn.Module):
     """
     Embeds scalar timesteps into vector representations.
     """
     def __init__(self, hidden_size, frequency_embedding_size=256):
         super().__init__()
         self.mlp = nn.Sequential(
-            ColumnParallelLinear(
-                frequency_embedding_size, hidden_size, bias=True,
-                gather_output=False,
-                init_method=functools.partial(nn.init.normal_, std=0.02),
             ),
             nn.SiLU(),
-            RowParallelLinear(
-                hidden_size, hidden_size, bias=True, input_is_parallel=True,
-                init_method=functools.partial(nn.init.normal_, std=0.02),
             ),
         )
         self.frequency_embedding_size = frequency_embedding_size
@@ -71,16 +58,16 @@ class ParallelTimestepEmbedder(nn.Module):
         # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
         half = dim // 2
         freqs = torch.exp(
-            -math.log(max_period) * torch.arange(
-                start=0, end=half, dtype=torch.float32
-            ) / half
         ).to(device=t.device)
         args = t[:, None].float() * freqs[None]
         embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
         if dim % 2:
-            embedding = torch.cat([
-                embedding, torch.zeros_like(embedding[:, :1])
-            ], dim=-1)
         return embedding
     def forward(self, t):
@@ -93,12 +80,13 @@ class ParallelLabelEmbedder(nn.Module):
     r"""Embeds class labels into vector representations. Also handles label
     dropout for classifier-free guidance.
     """
     def __init__(self, num_classes, hidden_size, dropout_prob):
         super().__init__()
         use_cfg_embedding = int(dropout_prob > 0)
-        self.embedding_table = ParallelEmbedding(
-            num_classes + use_cfg_embedding, hidden_size,
-            init_method=functools.partial(nn.init.normal_, std=0.02),
         )
         self.num_classes = num_classes
         self.dropout_prob = dropout_prob
@@ -108,15 +96,10 @@ class ParallelLabelEmbedder(nn.Module):
         Drops labels to enable classifier-free guidance.
         """
         if force_drop_ids is None:
-            drop_ids = torch.rand(
-                labels.shape[0], device=labels.device
-            ) < self.dropout_prob
-            drop_ids = drop_ids.cuda()
-            dist.broadcast(
-                drop_ids,
-                fs_init.get_model_parallel_src_rank(),
-                fs_init.get_model_parallel_group(),
             )
             drop_ids = drop_ids.to(labels.device)
         else:
             drop_ids = force_drop_ids == 1
@@ -132,13 +115,21 @@ class ParallelLabelEmbedder(nn.Module):
 #############################################################################
-#                               Core DiT Model                              #
 #############################################################################
 class Attention(nn.Module):
     """Multi-head attention module."""
-    def __init__(self, dim: int, n_heads: int, n_kv_heads: Optional[int], qk_norm: bool, y_dim: int):
         """
         Initialize the Attention module.
@@ -150,38 +141,44 @@ class Attention(nn.Module):
         """
         super().__init__()
         self.n_kv_heads = n_heads if n_kv_heads is None else n_kv_heads
-        model_parallel_size = fs_init.get_model_parallel_world_size()
         self.n_local_heads = n_heads // model_parallel_size
         self.n_local_kv_heads = self.n_kv_heads // model_parallel_size
         self.n_rep = self.n_local_heads // self.n_local_kv_heads
         self.head_dim = dim // n_heads
-        self.wq = ColumnParallelLinear(
-            dim, n_heads * self.head_dim, bias=False, gather_output=False,
-            init_method=nn.init.xavier_uniform_,
         )
-        self.wk = ColumnParallelLinear(
-            dim, self.n_kv_heads * self.head_dim, bias=False,
-            gather_output=False, init_method=nn.init.xavier_uniform_,
         )
-        self.wv = ColumnParallelLinear(
-            dim, self.n_kv_heads * self.head_dim, bias=False,
-            gather_output=False, init_method=nn.init.xavier_uniform_,
         )
         if y_dim > 0:
-            self.wk_y = ColumnParallelLinear(
-                y_dim, self.n_kv_heads * self.head_dim, bias=False,
-                gather_output=False, init_method=nn.init.xavier_uniform_,
             )
-            self.wv_y = ColumnParallelLinear(
-                y_dim, self.n_kv_heads * self.head_dim, bias=False,
-                gather_output=False, init_method=nn.init.xavier_uniform_,
             )
             self.gate = nn.Parameter(torch.zeros([self.n_local_heads]))
-        self.wo = RowParallelLinear(
-            n_heads * self.head_dim, dim, bias=False,
-            input_is_parallel=True, init_method=nn.init.xavier_uniform_,
         )
         if qk_norm:
@@ -194,7 +191,7 @@ class Attention(nn.Module):
         else:
             self.q_norm = self.k_norm = nn.Identity()
             self.ky_norm = nn.Identity()
         # for proportional attention computation
         self.base_seqlen = None
         self.proportional_attn = False
@@ -224,8 +221,7 @@ class Attention(nn.Module):
         ndim = x.ndim
         assert 0 <= 1 < ndim
         assert freqs_cis.shape == (x.shape[1], x.shape[-1])
-        shape = [d if i == 1 or i == ndim - 1 else 1
-                 for i, d in enumerate(x.shape)]
         return freqs_cis.view(*shape)
     @staticmethod
@@ -259,13 +255,17 @@ class Attention(nn.Module):
             return x_out.type_as(x_in)
     # copied from huggingface modeling_llama.py
-    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
         def _get_unpad_data(attention_mask):
             seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
             indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
             max_seqlen_in_batch = seqlens_in_batch.max().item()
-            cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
             return (
                 indices,
                 cu_seqlens,
@@ -276,14 +276,19 @@ class Attention(nn.Module):
         batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
         key_layer = index_first_axis(
-            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
         )
         value_layer = index_first_axis(
-            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
         )
         if query_length == kv_seq_len:
             query_layer = index_first_axis(
-                query_layer.reshape(batch_size * kv_seq_len, self.n_local_heads, head_dim), indices_k
             )
             cu_seqlens_q = cu_seqlens_k
             max_seqlen_in_batch_q = max_seqlen_in_batch_k
@@ -298,7 +303,9 @@ class Attention(nn.Module):
         else:
             # The -q_len: slice assumes left padding.
             attention_mask = attention_mask[:, -query_length:]
-            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
         return (
             query_layer,
@@ -347,15 +354,22 @@ class Attention(nn.Module):
         if dtype in [torch.float16, torch.bfloat16]:
             # begin var_len flash attn
-            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
-                xq, xk, xv, x_mask, seqlen
-            )
             cu_seqlens_q, cu_seqlens_k = cu_seq_lens
             max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
             if self.proportional_attn:
-                softmax_scale = math.sqrt(math.log(seqlen, self.base_seqlen) / self.head_dim)
             else:
                 softmax_scale = math.sqrt(1 / self.head_dim)
@@ -367,24 +381,32 @@ class Attention(nn.Module):
                 cu_seqlens_k=cu_seqlens_k,
                 max_seqlen_q=max_seqlen_in_batch_q,
                 max_seqlen_k=max_seqlen_in_batch_k,
-                dropout_p=0.,
                 causal=False,
-                softmax_scale=softmax_scale
             )
             output = pad_input(attn_output_unpad, indices_q, bsz, seqlen)
             # end var_len_flash_attn
         else:
-            output = F.scaled_dot_product_attention(
-                xq.permute(0, 2, 1, 3),
-                xk.permute(0, 2, 1, 3),
-                xv.permute(0, 2, 1, 3),
-                attn_mask=x_mask.bool().view(bsz, 1, 1, seqlen).expand(-1, self.n_local_heads, seqlen, -1),
-            ).permute(0, 2, 1, 3).to(dtype)
         if hasattr(self, "wk_y"):
             # todo better flash_attn support
-            yk = self.ky_norm(self.wk_y(y)).view(bsz, -1, self.n_local_kv_heads, self.head_dim)
             yv = self.wv_y(y).view(bsz, -1, self.n_local_kv_heads, self.head_dim)
             n_rep = self.n_local_heads // self.n_local_kv_heads
             if n_rep >= 1:
@@ -394,7 +416,7 @@ class Attention(nn.Module):
                 xq.permute(0, 2, 1, 3),
                 yk.permute(0, 2, 1, 3),
                 yv.permute(0, 2, 1, 3),
-                y_mask.view(bsz, 1, 1, -1).expand(bsz, self.n_local_heads, seqlen, -1)
             ).permute(0, 2, 1, 3)
             output_y = output_y * self.gate.tanh().view(1, 1, -1, 1)
             output = output + output_y
@@ -424,10 +446,10 @@ class FeedForward(nn.Module):
                 dimension. Defaults to None.
         Attributes:
-            w1 (ColumnParallelLinear): Linear transformation for the first
                 layer.
-            w2 (RowParallelLinear): Linear transformation for the second layer.
-            w3 (ColumnParallelLinear): Linear transformation for the third
                 layer.
         """
@@ -436,21 +458,22 @@ class FeedForward(nn.Module):
         # custom dim factor multiplier
         if ffn_dim_multiplier is not None:
             hidden_dim = int(ffn_dim_multiplier * hidden_dim)
-        hidden_dim = multiple_of * (
-            (hidden_dim + multiple_of - 1) // multiple_of
-        )
-        self.w1 = ColumnParallelLinear(
-            dim, hidden_dim, bias=False, gather_output=False,
-            init_method=nn.init.xavier_uniform_,
         )
-        self.w2 = RowParallelLinear(
-            hidden_dim, dim, bias=False, input_is_parallel=True,
-            init_method=nn.init.xavier_uniform_,
         )
-        self.w3 = ColumnParallelLinear(
-            dim, hidden_dim, bias=False, gather_output=False,
-            init_method=nn.init.xavier_uniform_,
         )
     # @torch.compile
@@ -462,9 +485,18 @@ class FeedForward(nn.Module):
 class TransformerBlock(nn.Module):
-    def __init__(self, layer_id: int, dim: int, n_heads: int, n_kv_heads: int,
-                 multiple_of: int, ffn_dim_multiplier: float, norm_eps: float,
-                 qk_norm: bool, y_dim: int) -> None:
         """
         Initialize a TransformerBlock.
@@ -495,7 +527,9 @@ class TransformerBlock(nn.Module):
         self.head_dim = dim // n_heads
         self.attention = Attention(dim, n_heads, n_kv_heads, qk_norm, y_dim)
         self.feed_forward = FeedForward(
-            dim=dim, hidden_dim=4 * dim, multiple_of=multiple_of,
             ffn_dim_multiplier=ffn_dim_multiplier,
         )
         self.layer_id = layer_id
@@ -506,9 +540,10 @@ class TransformerBlock(nn.Module):
         self.adaLN_modulation = nn.Sequential(
             nn.SiLU(),
-            ColumnParallelLinear(
-                min(dim, 1024), 6 * dim, bias=True, gather_output=True,
-                init_method=nn.init.zeros_,
             ),
         )
@@ -536,28 +571,41 @@ class TransformerBlock(nn.Module):
         """
         if adaln_input is not None:
-            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = \
                 self.adaLN_modulation(adaln_input).chunk(6, dim=1)
-            x = x + self.attention_norm1(gate_msa.unsqueeze(1) * self.attention(
-                modulate(self.attention_norm(x), shift_msa, scale_msa),
-                x_mask,
-                freqs_cis,
-                self.attention_y_norm(y),
-                y_mask,
-            ))
             d = x.shape[-1]
-            x = x + self.ffn_norm1(gate_mlp.unsqueeze(1) * self.feed_forward(
-                modulate(self.ffn_norm(x), shift_mlp, scale_mlp).view(-1, d),
-            ).view(*x.shape))
         else:
-            x = x + self.attention_norm1(self.attention(
-                self.attention_norm(x), x_mask, freqs_cis, self.attention_y_norm(y), y_mask
-            ))
             # for compatibility with torch.compile because the sequence length changes
             B, L, D = x.shape
-            x = x.view(B*L, D)
             x = x + self.ffn_norm1(self.feed_forward(self.ffn_norm(x)))
             x = x.view(B, L, D)
@@ -566,22 +614,27 @@ class TransformerBlock(nn.Module):
 class ParallelFinalLayer(nn.Module):
     """
-    The final layer of DiT.
     """
     def __init__(self, hidden_size, patch_size, out_channels):
         super().__init__()
         self.norm_final = nn.LayerNorm(
-            hidden_size, elementwise_affine=False, eps=1e-6,
         )
-        self.linear = ColumnParallelLinear(
-            hidden_size, patch_size * patch_size * out_channels, bias=True,
-            init_method=nn.init.zeros_, gather_output=True,
         )
         self.adaLN_modulation = nn.Sequential(
             nn.SiLU(),
-            ColumnParallelLinear(
-                min(hidden_size, 1024), 2 * hidden_size, bias=True,
-                init_method=nn.init.zeros_, gather_output=True,
             ),
         )
@@ -596,6 +649,7 @@ class NextDiT(nn.Module):
     """
     Diffusion model with a Transformer backbone.
     """
     def __init__(
         self,
         patch_size: int = 2,
@@ -610,8 +664,8 @@ class NextDiT(nn.Module):
         learn_sigma: bool = True,
         qk_norm: bool = False,
         cap_feat_dim: int = 5120,
-        rope_scaling_factor: float = 1.,
-        ntk_factor: float=1.
     ) -> None:
         super().__init__()
         self.learn_sigma = learn_sigma
@@ -619,34 +673,49 @@ class NextDiT(nn.Module):
         self.out_channels = in_channels * 2 if learn_sigma else in_channels
         self.patch_size = patch_size
-        self.x_embedder = ColumnParallelLinear(
             in_features=patch_size * patch_size * in_channels,
             out_features=dim,
             bias=True,
-            gather_output=True,
-            init_method=nn.init.xavier_uniform_,
         )
-        nn.init.constant_(self.x_embedder.bias, 0.)
         self.t_embedder = ParallelTimestepEmbedder(min(dim, 1024))
         self.cap_embedder = nn.Sequential(
             nn.LayerNorm(cap_feat_dim),
-            ColumnParallelLinear(cap_feat_dim, min(dim, 1024), bias=True, gather_output=True,
-                                 init_method=nn.init.zeros_),
         )
-        self.layers = nn.ModuleList([
-            TransformerBlock(layer_id, dim, n_heads, n_kv_heads, multiple_of,
-                             ffn_dim_multiplier, norm_eps, qk_norm, cap_feat_dim)
-            for layer_id in range(n_layers)
-        ])
         self.final_layer = ParallelFinalLayer(dim, patch_size, self.out_channels)
         assert (dim // n_heads) % 4 == 0, "2d rope needs head dim to be divisible by 4"
         self.dim = dim
         self.n_heads = n_heads
         self.freqs_cis = NextDiT.precompute_freqs_cis(
-            dim // n_heads, 384, rope_scaling_factor=rope_scaling_factor, ntk_factor=ntk_factor
         )
         self.rope_scaling_factor = rope_scaling_factor
         self.ntk_factor = ntk_factor
@@ -655,7 +724,9 @@ class NextDiT(nn.Module):
         # nn.init.normal_(self.eol_token, std=0.02)
         nn.init.normal_(self.pad_token, std=0.02)
-    def unpatchify(self, x: torch.Tensor, img_size: List[Tuple[int, int]], return_tensor=False) -> List[torch.Tensor]:
         """
         x: (N, T, patch_size**2 * C)
         imgs: (N, H, W, C)
@@ -673,26 +744,40 @@ class NextDiT(nn.Module):
             for i in range(x.size(0)):
                 H, W = img_size[i]
                 L = (H // pH) * (W // pW)
-                imgs.append(x[i][:L].view(
-                    H // pH, W // pW, pH, pW, self.out_channels
-                ).permute(4, 0, 2, 1, 3).flatten(3, 4).flatten(1, 2))
         return imgs
     def patchify_and_embed(
-        self,
-        x: List[torch.Tensor] | torch.Tensor
     ) -> Tuple[torch.Tensor, torch.Tensor, List[Tuple[int, int]], torch.Tensor]:
         self.freqs_cis = self.freqs_cis.to(x[0].device)
         if isinstance(x, torch.Tensor):
             pH = pW = self.patch_size
             B, C, H, W = x.size()
-            x = x.view(B, C, H // pH, pH, W // pW, pW).permute(0, 2, 4, 1, 3, 5).flatten(3)
             x = self.x_embedder(x)
             x = x.flatten(1, 2)
-            mask = torch.ones(x.shape[0], x.shape[1], dtype=torch.int32, device=x.device)
             # leave the first line for text
-            return x, mask, [(H, W)] * B, self.freqs_cis[:H//pH, :W//pW].flatten(0,1).unsqueeze(0)
         else:
             pH = pW = self.patch_size
             x_embed = []
@@ -702,30 +787,44 @@ class NextDiT(nn.Module):
             for img in x:
                 C, H, W = img.size()
-                item_freqs_cis = self.freqs_cis[:H//pH, :W//pW]
-                freqs_cis.append(item_freqs_cis.flatten(0,1))
                 img_size.append((H, W))
-                img = img.view(C, H // pH, pH, W // pW, pW).permute(1, 3, 0, 2, 4).flatten(2)
                 img = self.x_embedder(img)
                 img = img.flatten(0, 1)
                 l_effective_seq_len.append(len(img))
                 x_embed.append(img)
             max_seq_len = max(l_effective_seq_len)
-            mask = torch.zeros(len(x), max_seq_len, dtype=torch.int32, device=x[0].device)
             padded_x_embed = []
             padded_freqs_cis = []
-            for i, (item_embed, item_freqs_cis, item_seq_len) in enumerate(zip(
-                x_embed, freqs_cis, l_effective_seq_len
-            )):
-                item_embed = torch.cat([
-                    item_embed,
-                    self.pad_token.view(1, -1).expand(max_seq_len - item_seq_len, -1),
-                ], dim=0)
-                item_freqs_cis = torch.cat([
-                    item_freqs_cis,
-                    item_freqs_cis[-1:].expand(max_seq_len - item_seq_len, -1)
-                ], dim=0)
                 padded_x_embed.append(item_embed)
                 padded_freqs_cis.append(item_freqs_cis)
                 mask[i][:item_seq_len] = 1
@@ -736,7 +835,7 @@ class NextDiT(nn.Module):
     def forward(self, x, t, cap_feats, cap_mask):
         """
-        Forward pass of DiT.
         t: (N,) tensor of diffusion timesteps
         y: (N,) tensor of class labels
         """
@@ -746,19 +845,18 @@ class NextDiT(nn.Module):
         # cap_freqs_cis = self.freqs_cis[:1, :cap_feats.shape[1]].to(x.device)
-        t = self.t_embedder(t)                   # (N, D)
         cap_mask_float = cap_mask.float().unsqueeze(-1)
-        cap_feats_pool = (cap_feats * cap_mask_float).sum(dim=1) / cap_mask_float.sum(dim=1)
         cap_feats_pool = cap_feats_pool.to(cap_feats)
         cap_emb = self.cap_embedder(cap_feats_pool)
         adaln_input = t + cap_emb
         cap_mask = cap_mask.bool()
         for layer in self.layers:
-            x = layer(
-                x, mask, freqs_cis, cap_feats, cap_mask,
-                adaln_input=adaln_input
-            )
         x = self.final_layer(x, adaln_input)
         x = self.unpatchify(x, img_size, return_tensor=x_is_tensor)
@@ -769,25 +867,48 @@ class NextDiT(nn.Module):
                 x = [_.chunk(2, dim=0)[0] for _ in x]
         return x
-    def forward_with_cfg(self, x, t, cap_feats, cap_mask, cfg_scale, rope_scaling_factor=None, ntk_factor=None, base_seqlen: Optional[int] = None, proportional_attn: bool = False):
         # """
-        # Forward pass of DiT, but also batches the unconditional forward pass
         # for classifier-free guidance.
         # """
         # # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
         # print(ntk_factor, rope_scaling_factor, self.ntk_factor, self.rope_scaling_factor)
         if rope_scaling_factor is not None or ntk_factor is not None:
-            rope_scaling_factor = rope_scaling_factor if rope_scaling_factor is not None else self.rope_scaling_factor
             ntk_factor = ntk_factor if ntk_factor is not None else self.ntk_factor
-            if rope_scaling_factor != self.rope_scaling_factor or ntk_factor != self.ntk_factor:
-                print(f"override freqs_cis, rope_scaling {rope_scaling_factor}, ntk {ntk_factor}", flush=True)
                 self.freqs_cis = NextDiT.precompute_freqs_cis(
-                    self.dim // self.n_heads, 384,
-                    rope_scaling_factor=rope_scaling_factor, ntk_factor=ntk_factor
                 )
                 self.rope_scaling_factor = rope_scaling_factor
                 self.ntk_factor = ntk_factor
         if proportional_attn:
             assert base_seqlen is not None
             for layer in self.layers:
@@ -817,7 +938,7 @@ class NextDiT(nn.Module):
         end: int,
         theta: float = 10000.0,
         rope_scaling_factor: float = 1.0,
-        ntk_factor: float = 1.0
     ):
         """
         Precompute the frequency tensor for complex exponentials (cis) with
@@ -841,23 +962,27 @@ class NextDiT(nn.Module):
         theta = theta * ntk_factor
-        logger.info(f"theta {theta} rope scaling {rope_scaling_factor} ntk {ntk_factor}")
-        freqs = 1.0 / (theta ** (
-            torch.arange(0, dim, 4)[: (dim // 4)].float().cuda() / dim
-        ))
         t = torch.arange(end, device=freqs.device, dtype=torch.float)  # type: ignore
         t = t / rope_scaling_factor
         freqs = torch.outer(t, freqs).float()  # type: ignore
         freqs_cis = torch.polar(torch.ones_like(freqs), freqs)  # complex64
-        freqs_cis_h = freqs_cis.view(end, 1, dim//4, 1).repeat(1, end, 1, 1)
-        freqs_cis_w = freqs_cis.view(1, end, dim//4, 1).repeat(end, 1, 1, 1)
         freqs_cis = torch.cat([freqs_cis_h, freqs_cis_w], dim=-1).flatten(2)
         return freqs_cis
     def parameter_count(self) -> int:
         tensor_parallel_module_list = (
-            ColumnParallelLinear, RowParallelLinear, ParallelEmbedding,
         )
         total_params = 0
@@ -865,10 +990,7 @@ class NextDiT(nn.Module):
             nonlocal total_params
             is_tp_module = isinstance(module, tensor_parallel_module_list)
             for param in module.parameters(recurse=False):
-                total_params += param.numel() * (
-                    fs_init.get_model_parallel_world_size()
-                    if is_tp_module else 1
-                )
             for submodule in module.children():
                 _recursive_count_params(submodule)
@@ -880,9 +1002,7 @@ class NextDiT(nn.Module):
 #############################################################################
-#                                 DiT Configs                               #
 #############################################################################
 def NextDiT_2B_patch2(**kwargs):
-    return NextDiT(
-        patch_size=2, dim=2304, n_layers=24, n_heads=32, **kwargs
-    )

 import functools
 import logging
 import math
 from typing import Optional, Tuple, List
 from .components import RMSNorm
 from flash_attn import flash_attn_varlen_func
 from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
 import torch
 #             Embedding Layers for Timesteps and Class Labels               #
 #############################################################################
 class ParallelTimestepEmbedder(nn.Module):
     """
     Embeds scalar timesteps into vector representations.
     """
     def __init__(self, hidden_size, frequency_embedding_size=256):
         super().__init__()
         self.mlp = nn.Sequential(
+            nn.Linear(
+                frequency_embedding_size,
+                hidden_size,
+                bias=True,
             ),
             nn.SiLU(),
+            nn.Linear(
+                hidden_size,
+                hidden_size,
+                bias=True,
             ),
         )
         self.frequency_embedding_size = frequency_embedding_size
         # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
         half = dim // 2
         freqs = torch.exp(
+            -math.log(max_period)
+            * torch.arange(start=0, end=half, dtype=torch.float32)
+            / half
         ).to(device=t.device)
         args = t[:, None].float() * freqs[None]
         embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
         if dim % 2:
+            embedding = torch.cat(
+                [embedding, torch.zeros_like(embedding[:, :1])], dim=-1
+            )
         return embedding
     def forward(self, t):
     r"""Embeds class labels into vector representations. Also handles label
     dropout for classifier-free guidance.
     """
     def __init__(self, num_classes, hidden_size, dropout_prob):
         super().__init__()
         use_cfg_embedding = int(dropout_prob > 0)
+        self.embedding_table = nn.Embedding(
+            num_classes + use_cfg_embedding,
+            hidden_size,
         )
         self.num_classes = num_classes
         self.dropout_prob = dropout_prob
         Drops labels to enable classifier-free guidance.
         """
         if force_drop_ids is None:
+            drop_ids = (
+                torch.rand(labels.shape[0], device=labels.device) < self.dropout_prob
             )
+            drop_ids = drop_ids.cuda()
             drop_ids = drop_ids.to(labels.device)
         else:
             drop_ids = force_drop_ids == 1
 #############################################################################
+#                               Core NextDiT Model                              #
 #############################################################################
 class Attention(nn.Module):
     """Multi-head attention module."""
+    def __init__(
+        self,
+        dim: int,
+        n_heads: int,
+        n_kv_heads: Optional[int],
+        qk_norm: bool,
+        y_dim: int,
+    ):
         """
         Initialize the Attention module.
         """
         super().__init__()
         self.n_kv_heads = n_heads if n_kv_heads is None else n_kv_heads
+        model_parallel_size = 1
         self.n_local_heads = n_heads // model_parallel_size
         self.n_local_kv_heads = self.n_kv_heads // model_parallel_size
         self.n_rep = self.n_local_heads // self.n_local_kv_heads
         self.head_dim = dim // n_heads
+        self.wq = nn.Linear(
+            dim,
+            n_heads * self.head_dim,
+            bias=False,
         )
+        self.wk = nn.Linear(
+            dim,
+            self.n_kv_heads * self.head_dim,
+            bias=False,
         )
+        self.wv = nn.Linear(
+            dim,
+            self.n_kv_heads * self.head_dim,
+            bias=False,
         )
         if y_dim > 0:
+            self.wk_y = nn.Linear(
+                y_dim,
+                self.n_kv_heads * self.head_dim,
+                bias=False,
             )
+            self.wv_y = nn.Linear(
+                y_dim,
+                self.n_kv_heads * self.head_dim,
+                bias=False,
             )
             self.gate = nn.Parameter(torch.zeros([self.n_local_heads]))
+        self.wo = nn.Linear(
+            n_heads * self.head_dim,
+            dim,
+            bias=False,
         )
         if qk_norm:
         else:
             self.q_norm = self.k_norm = nn.Identity()
             self.ky_norm = nn.Identity()
         # for proportional attention computation
         self.base_seqlen = None
         self.proportional_attn = False
         ndim = x.ndim
         assert 0 <= 1 < ndim
         assert freqs_cis.shape == (x.shape[1], x.shape[-1])
+        shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
         return freqs_cis.view(*shape)
     @staticmethod
             return x_out.type_as(x_in)
     # copied from huggingface modeling_llama.py
+    def _upad_input(
+        self, query_layer, key_layer, value_layer, attention_mask, query_length
+    ):
         def _get_unpad_data(attention_mask):
             seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
             indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
             max_seqlen_in_batch = seqlens_in_batch.max().item()
+            cu_seqlens = F.pad(
+                torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)
+            )
             return (
                 indices,
                 cu_seqlens,
         batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
         key_layer = index_first_axis(
+            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
+            indices_k,
         )
         value_layer = index_first_axis(
+            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
+            indices_k,
         )
         if query_length == kv_seq_len:
             query_layer = index_first_axis(
+                query_layer.reshape(
+                    batch_size * kv_seq_len, self.n_local_heads, head_dim
+                ),
+                indices_k,
             )
             cu_seqlens_q = cu_seqlens_k
             max_seqlen_in_batch_q = max_seqlen_in_batch_k
         else:
             # The -q_len: slice assumes left padding.
             attention_mask = attention_mask[:, -query_length:]
+            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(
+                query_layer, attention_mask
+            )
         return (
             query_layer,
         if dtype in [torch.float16, torch.bfloat16]:
             # begin var_len flash attn
+            (
+                query_states,
+                key_states,
+                value_states,
+                indices_q,
+                cu_seq_lens,
+                max_seq_lens,
+            ) = self._upad_input(xq, xk, xv, x_mask, seqlen)
             cu_seqlens_q, cu_seqlens_k = cu_seq_lens
             max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
             if self.proportional_attn:
+                softmax_scale = math.sqrt(
+                    math.log(seqlen, self.base_seqlen) / self.head_dim
+                )
             else:
                 softmax_scale = math.sqrt(1 / self.head_dim)
                 cu_seqlens_k=cu_seqlens_k,
                 max_seqlen_q=max_seqlen_in_batch_q,
                 max_seqlen_k=max_seqlen_in_batch_k,
+                dropout_p=0.0,
                 causal=False,
+                softmax_scale=softmax_scale,
             )
             output = pad_input(attn_output_unpad, indices_q, bsz, seqlen)
             # end var_len_flash_attn
         else:
+            output = (
+                F.scaled_dot_product_attention(
+                    xq.permute(0, 2, 1, 3),
+                    xk.permute(0, 2, 1, 3),
+                    xv.permute(0, 2, 1, 3),
+                    attn_mask=x_mask.bool()
+                    .view(bsz, 1, 1, seqlen)
+                    .expand(-1, self.n_local_heads, seqlen, -1),
+                )
+                .permute(0, 2, 1, 3)
+                .to(dtype)
+            )
         if hasattr(self, "wk_y"):
             # todo better flash_attn support
+            yk = self.ky_norm(self.wk_y(y)).view(
+                bsz, -1, self.n_local_kv_heads, self.head_dim
+            )
             yv = self.wv_y(y).view(bsz, -1, self.n_local_kv_heads, self.head_dim)
             n_rep = self.n_local_heads // self.n_local_kv_heads
             if n_rep >= 1:
                 xq.permute(0, 2, 1, 3),
                 yk.permute(0, 2, 1, 3),
                 yv.permute(0, 2, 1, 3),
+                y_mask.view(bsz, 1, 1, -1).expand(bsz, self.n_local_heads, seqlen, -1),
             ).permute(0, 2, 1, 3)
             output_y = output_y * self.gate.tanh().view(1, 1, -1, 1)
             output = output + output_y
                 dimension. Defaults to None.
         Attributes:
+            w1 (nn.Linear): Linear transformation for the first
                 layer.
+            w2 (nn.Linear): Linear transformation for the second layer.
+            w3 (nn.Linear): Linear transformation for the third
                 layer.
         """
         # custom dim factor multiplier
         if ffn_dim_multiplier is not None:
             hidden_dim = int(ffn_dim_multiplier * hidden_dim)
+        hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
+        self.w1 = nn.Linear(
+            dim,
+            hidden_dim,
+            bias=False,
         )
+        self.w2 = nn.Linear(
+            hidden_dim,
+            dim,
+            bias=False,
         )
+        self.w3 = nn.Linear(
+            dim,
+            hidden_dim,
+            bias=False,
         )
     # @torch.compile
 class TransformerBlock(nn.Module):
+    def __init__(
+        self,
+        layer_id: int,
+        dim: int,
+        n_heads: int,
+        n_kv_heads: int,
+        multiple_of: int,
+        ffn_dim_multiplier: float,
+        norm_eps: float,
+        qk_norm: bool,
+        y_dim: int,
+    ) -> None:
         """
         Initialize a TransformerBlock.
         self.head_dim = dim // n_heads
         self.attention = Attention(dim, n_heads, n_kv_heads, qk_norm, y_dim)
         self.feed_forward = FeedForward(
+            dim=dim,
+            hidden_dim=4 * dim,
+            multiple_of=multiple_of,
             ffn_dim_multiplier=ffn_dim_multiplier,
         )
         self.layer_id = layer_id
         self.adaLN_modulation = nn.Sequential(
             nn.SiLU(),
+            nn.Linear(
+                min(dim, 1024),
+                6 * dim,
+                bias=True,
             ),
         )
         """
         if adaln_input is not None:
+            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
                 self.adaLN_modulation(adaln_input).chunk(6, dim=1)
+            )
+            x = x + self.attention_norm1(
+                gate_msa.unsqueeze(1)
+                * self.attention(
+                    modulate(self.attention_norm(x), shift_msa, scale_msa),
+                    x_mask,
+                    freqs_cis,
+                    self.attention_y_norm(y),
+                    y_mask,
+                )
+            )
             d = x.shape[-1]
+            x = x + self.ffn_norm1(
+                gate_mlp.unsqueeze(1)
+                * self.feed_forward(
+                    modulate(self.ffn_norm(x), shift_mlp, scale_mlp).view(-1, d),
+                ).view(*x.shape)
+            )
         else:
+            x = x + self.attention_norm1(
+                self.attention(
+                    self.attention_norm(x),
+                    x_mask,
+                    freqs_cis,
+                    self.attention_y_norm(y),
+                    y_mask,
+                )
+            )
             # for compatibility with torch.compile because the sequence length changes
             B, L, D = x.shape
+            x = x.view(B * L, D)
             x = x + self.ffn_norm1(self.feed_forward(self.ffn_norm(x)))
             x = x.view(B, L, D)
 class ParallelFinalLayer(nn.Module):
     """
+    The final layer of NextDiT.
     """
     def __init__(self, hidden_size, patch_size, out_channels):
         super().__init__()
         self.norm_final = nn.LayerNorm(
+            hidden_size,
+            elementwise_affine=False,
+            eps=1e-6,
         )
+        self.linear = nn.Linear(
+            hidden_size,
+            patch_size * patch_size * out_channels,
+            bias=True,
         )
         self.adaLN_modulation = nn.Sequential(
             nn.SiLU(),
+            nn.Linear(
+                min(hidden_size, 1024),
+                2 * hidden_size,
+                bias=True,
             ),
         )
     """
     Diffusion model with a Transformer backbone.
     """
     def __init__(
         self,
         patch_size: int = 2,
         learn_sigma: bool = True,
         qk_norm: bool = False,
         cap_feat_dim: int = 5120,
+        rope_scaling_factor: float = 1.0,
+        ntk_factor: float = 1.0,
     ) -> None:
         super().__init__()
         self.learn_sigma = learn_sigma
         self.out_channels = in_channels * 2 if learn_sigma else in_channels
         self.patch_size = patch_size
+        self.x_embedder = nn.Linear(
             in_features=patch_size * patch_size * in_channels,
             out_features=dim,
             bias=True,
         )
+        nn.init.constant_(self.x_embedder.bias, 0.0)
         self.t_embedder = ParallelTimestepEmbedder(min(dim, 1024))
         self.cap_embedder = nn.Sequential(
             nn.LayerNorm(cap_feat_dim),
+            nn.Linear(
+                cap_feat_dim,
+                min(dim, 1024),
+                bias=True,
+            ),
         )
+        self.layers = nn.ModuleList(
+            [
+                TransformerBlock(
+                    layer_id,
+                    dim,
+                    n_heads,
+                    n_kv_heads,
+                    multiple_of,
+                    ffn_dim_multiplier,
+                    norm_eps,
+                    qk_norm,
+                    cap_feat_dim,
+                )
+                for layer_id in range(n_layers)
+            ]
+        )
         self.final_layer = ParallelFinalLayer(dim, patch_size, self.out_channels)
         assert (dim // n_heads) % 4 == 0, "2d rope needs head dim to be divisible by 4"
         self.dim = dim
         self.n_heads = n_heads
         self.freqs_cis = NextDiT.precompute_freqs_cis(
+            dim // n_heads,
+            384,
+            rope_scaling_factor=rope_scaling_factor,
+            ntk_factor=ntk_factor,
         )
         self.rope_scaling_factor = rope_scaling_factor
         self.ntk_factor = ntk_factor
         # nn.init.normal_(self.eol_token, std=0.02)
         nn.init.normal_(self.pad_token, std=0.02)
+    def unpatchify(
+        self, x: torch.Tensor, img_size: List[Tuple[int, int]], return_tensor=False
+    ) -> List[torch.Tensor]:
         """
         x: (N, T, patch_size**2 * C)
         imgs: (N, H, W, C)
             for i in range(x.size(0)):
                 H, W = img_size[i]
                 L = (H // pH) * (W // pW)
+                imgs.append(
+                    x[i][:L]
+                    .view(H // pH, W // pW, pH, pW, self.out_channels)
+                    .permute(4, 0, 2, 1, 3)
+                    .flatten(3, 4)
+                    .flatten(1, 2)
+                )
         return imgs
     def patchify_and_embed(
+        self, x: List[torch.Tensor] | torch.Tensor
     ) -> Tuple[torch.Tensor, torch.Tensor, List[Tuple[int, int]], torch.Tensor]:
         self.freqs_cis = self.freqs_cis.to(x[0].device)
         if isinstance(x, torch.Tensor):
             pH = pW = self.patch_size
             B, C, H, W = x.size()
+            x = (
+                x.view(B, C, H // pH, pH, W // pW, pW)
+                .permute(0, 2, 4, 1, 3, 5)
+                .flatten(3)
+            )
             x = self.x_embedder(x)
             x = x.flatten(1, 2)
+            mask = torch.ones(
+                x.shape[0], x.shape[1], dtype=torch.int32, device=x.device
+            )
             # leave the first line for text
+            return (
+                x,
+                mask,
+                [(H, W)] * B,
+                self.freqs_cis[: H // pH, : W // pW].flatten(0, 1).unsqueeze(0),
+            )
         else:
             pH = pW = self.patch_size
             x_embed = []
             for img in x:
                 C, H, W = img.size()
+                item_freqs_cis = self.freqs_cis[: H // pH, : W // pW]
+                freqs_cis.append(item_freqs_cis.flatten(0, 1))
                 img_size.append((H, W))
+                img = (
+                    img.view(C, H // pH, pH, W // pW, pW)
+                    .permute(1, 3, 0, 2, 4)
+                    .flatten(2)
+                )
                 img = self.x_embedder(img)
                 img = img.flatten(0, 1)
                 l_effective_seq_len.append(len(img))
                 x_embed.append(img)
             max_seq_len = max(l_effective_seq_len)
+            mask = torch.zeros(
+                len(x), max_seq_len, dtype=torch.int32, device=x[0].device
+            )
             padded_x_embed = []
             padded_freqs_cis = []
+            for i, (item_embed, item_freqs_cis, item_seq_len) in enumerate(
+                zip(x_embed, freqs_cis, l_effective_seq_len)
+            ):
+                item_embed = torch.cat(
+                    [
+                        item_embed,
+                        self.pad_token.view(1, -1).expand(
+                            max_seq_len - item_seq_len, -1
+                        ),
+                    ],
+                    dim=0,
+                )
+                item_freqs_cis = torch.cat(
+                    [
+                        item_freqs_cis,
+                        item_freqs_cis[-1:].expand(max_seq_len - item_seq_len, -1),
+                    ],
+                    dim=0,
+                )
                 padded_x_embed.append(item_embed)
                 padded_freqs_cis.append(item_freqs_cis)
                 mask[i][:item_seq_len] = 1
     def forward(self, x, t, cap_feats, cap_mask):
         """
+        Forward pass of NextDiT.
         t: (N,) tensor of diffusion timesteps
         y: (N,) tensor of class labels
         """
         # cap_freqs_cis = self.freqs_cis[:1, :cap_feats.shape[1]].to(x.device)
+        t = self.t_embedder(t)  # (N, D)
         cap_mask_float = cap_mask.float().unsqueeze(-1)
+        cap_feats_pool = (cap_feats * cap_mask_float).sum(dim=1) / cap_mask_float.sum(
+            dim=1
+        )
         cap_feats_pool = cap_feats_pool.to(cap_feats)
         cap_emb = self.cap_embedder(cap_feats_pool)
         adaln_input = t + cap_emb
         cap_mask = cap_mask.bool()
         for layer in self.layers:
+            x = layer(x, mask, freqs_cis, cap_feats, cap_mask, adaln_input=adaln_input)
         x = self.final_layer(x, adaln_input)
         x = self.unpatchify(x, img_size, return_tensor=x_is_tensor)
                 x = [_.chunk(2, dim=0)[0] for _ in x]
         return x
+    def forward_with_cfg(
+        self,
+        x,
+        t,
+        cap_feats,
+        cap_mask,
+        cfg_scale,
+        rope_scaling_factor=None,
+        ntk_factor=None,
+        base_seqlen: Optional[int] = None,
+        proportional_attn: bool = False,
+    ):
         # """
+        # Forward pass of NextDiT, but also batches the unconNextditional forward pass
         # for classifier-free guidance.
         # """
         # # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
         # print(ntk_factor, rope_scaling_factor, self.ntk_factor, self.rope_scaling_factor)
         if rope_scaling_factor is not None or ntk_factor is not None:
+            rope_scaling_factor = (
+                rope_scaling_factor
+                if rope_scaling_factor is not None
+                else self.rope_scaling_factor
+            )
             ntk_factor = ntk_factor if ntk_factor is not None else self.ntk_factor
+            if (
+                rope_scaling_factor != self.rope_scaling_factor
+                or ntk_factor != self.ntk_factor
+            ):
+                print(
+                    f"override freqs_cis, rope_scaling {rope_scaling_factor}, ntk {ntk_factor}",
+                    flush=True,
+                )
                 self.freqs_cis = NextDiT.precompute_freqs_cis(
+                    self.dim // self.n_heads,
+                    384,
+                    rope_scaling_factor=rope_scaling_factor,
+                    ntk_factor=ntk_factor,
                 )
                 self.rope_scaling_factor = rope_scaling_factor
                 self.ntk_factor = ntk_factor
         if proportional_attn:
             assert base_seqlen is not None
             for layer in self.layers:
         end: int,
         theta: float = 10000.0,
         rope_scaling_factor: float = 1.0,
+        ntk_factor: float = 1.0,
     ):
         """
         Precompute the frequency tensor for complex exponentials (cis) with
         theta = theta * ntk_factor
+        logger.info(
+            f"theta {theta} rope scaling {rope_scaling_factor} ntk {ntk_factor}"
+        )
+        freqs = 1.0 / (
+            theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float().cuda() / dim)
+        )
         t = torch.arange(end, device=freqs.device, dtype=torch.float)  # type: ignore
         t = t / rope_scaling_factor
         freqs = torch.outer(t, freqs).float()  # type: ignore
         freqs_cis = torch.polar(torch.ones_like(freqs), freqs)  # complex64
+        freqs_cis_h = freqs_cis.view(end, 1, dim // 4, 1).repeat(1, end, 1, 1)
+        freqs_cis_w = freqs_cis.view(1, end, dim // 4, 1).repeat(end, 1, 1, 1)
         freqs_cis = torch.cat([freqs_cis_h, freqs_cis_w], dim=-1).flatten(2)
         return freqs_cis
     def parameter_count(self) -> int:
         tensor_parallel_module_list = (
+            nn.Linear,
+            nn.Linear,
+            nn.Embedding,
         )
         total_params = 0
             nonlocal total_params
             is_tp_module = isinstance(module, tensor_parallel_module_list)
             for param in module.parameters(recurse=False):
+                total_params += param.numel()
             for submodule in module.children():
                 _recursive_count_params(submodule)
 #############################################################################
+#                                 NextDiT Configs                               #
 #############################################################################
 def NextDiT_2B_patch2(**kwargs):
+    return NextDiT(patch_size=2, dim=2304, n_layers=24, n_heads=32, **kwargs)