README.md CHANGED
@@ -8,6 +8,7 @@ tags:
  - text-to-video
  - video-to-video
  - realtime
+ library_name: diffusers
  ---
  Krea Realtime 14B is distilled from the [Wan 2.1 14B text-to-video model](https://huggingface.co/Wan-AI/Wan2.1-T2V-14B) using Self-Forcing, a technique for converting regular video diffusion models into autoregressive models. It achieves a text-to-video inference speed of **11fps** using 4 inference steps on a single NVIDIA B200 GPU. For more details on our training methodology and sampling innovations, refer to our [technical blog post](https://www.krea.ai/blog/krea-realtime-14b).

@@ -97,5 +98,80 @@ Krea realtime allows users to generate videos in a streaming fashion with ~1s ti
  </table>
  </div>

+ # Use it with our inference code
+ 
+ Set up:
+ ```bash
+ sudo apt install ffmpeg  # install it if you haven't already
+ git clone https://github.com/krea-ai/realtime-video
+ cd realtime-video
+ uv sync
+ uv pip install flash_attn --no-build-isolation
+ huggingface-cli download Wan-AI/Wan2.1-T2V-1.3B --local-dir-use-symlinks False --local-dir wan_models/Wan2.1-T2V-1.3B
+ huggingface-cli download krea/krea-realtime-video krea-realtime-video-14b.safetensors --local-dir-use-symlinks False --local-dir checkpoints/krea-realtime-video-14b.safetensors
+ ```
+ 
+ Run:
+ ```bash
+ export MODEL_FOLDER=Wan-AI
+ export CUDA_VISIBLE_DEVICES=0  # pick the GPU you want to serve on
+ export DO_COMPILE=true
+ 
+ uvicorn release_server:app --host 0.0.0.0 --port 8000
+ ```
+ 
+ Then open the web app at http://localhost:8000/ in your browser.
+ (For more advanced use cases and custom pipelines, check out our GitHub repository: https://github.com/krea-ai/realtime-video.)
+ 
+ # Use it with 🧨 diffusers
+ 
+ Krea Realtime 14B can be used with the `diffusers` library through the new Modular Diffusers structure (text-to-video is supported for now; video-to-video support is coming soon).
+ 
+ ```bash
+ # Install diffusers from main
+ pip install git+https://github.com/huggingface/diffusers.git
+ ```
+ 
+ ```py
+ import torch
+ from collections import deque
+ from diffusers.utils import export_to_video
+ from diffusers import ModularPipelineBlocks
+ from diffusers.modular_pipelines import PipelineState, WanModularPipeline
+ 
+ repo_id = "krea/krea-realtime-video"
+ blocks = ModularPipelineBlocks.from_pretrained(repo_id, trust_remote_code=True)
+ pipe = WanModularPipeline(blocks, repo_id)
+ 
+ pipe.load_components(
+     trust_remote_code=True,
+     device_map="cuda",
+     torch_dtype={"default": torch.bfloat16, "vae": torch.float16},
+ )
+ 
+ num_frames_per_block = 3
+ num_blocks = 9
+ 
+ frames = []
+ state = PipelineState()
+ state.set("frame_cache_context", deque(maxlen=pipe.config.frame_cache_len))
+ 
+ prompt = ["a cat sitting on a boat"]
+ 
+ for block in pipe.transformer.blocks:
+     block.self_attn.fuse_projections()
+ 
+ for block_idx in range(num_blocks):
+     state = pipe(
+         state,
+         prompt=prompt,
+         num_inference_steps=6,
+         num_blocks=num_blocks,
+         num_frames_per_block=num_frames_per_block,
+         block_idx=block_idx,
+         generator=torch.Generator("cuda").manual_seed(42),
+     )
+     frames.extend(state.values["videos"][0])
+ 
+ export_to_video(frames, "output.mp4", fps=16)
+ ```
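+ 
+ The example above collects all frames and writes a single video at the end. If you want to preview output while later blocks are still being generated, a minimal variation (assuming the same pipeline objects as above) is to replace the generation loop with one that exports each block as soon as it is produced:
+ 
+ ```py
+ # Sketch of streaming-style output: write each block to its own clip as it arrives.
+ for block_idx in range(num_blocks):
+     state = pipe(
+         state,
+         prompt=prompt,
+         num_inference_steps=6,
+         num_blocks=num_blocks,
+         num_frames_per_block=num_frames_per_block,
+         block_idx=block_idx,
+         generator=torch.Generator("cuda").manual_seed(42),
+     )
+     block_frames = state.values["videos"][0]
+     # The file naming scheme here is illustrative only.
+     export_to_video(block_frames, f"output_block_{block_idx:03d}.mp4", fps=16)
+ ```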
__init__.py ADDED
File without changes
before_denoise.py ADDED
@@ -0,0 +1,956 @@
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from typing import List, Optional, Union, Dict
17
+
18
+ import torch
19
+
20
+ from diffusers import AutoencoderKLWan
21
+ from diffusers.schedulers import UniPCMultistepScheduler
22
+ from diffusers.utils import logging
23
+ from diffusers.utils.torch_utils import randn_tensor
24
+ from diffusers.modular_pipelines import (
25
+ ModularPipeline,
26
+ ModularPipelineBlocks,
27
+ SequentialPipelineBlocks,
28
+ PipelineState,
29
+ )
30
+ from diffusers.modular_pipelines.modular_pipeline_utils import (
31
+ ComponentSpec,
32
+ ConfigSpec,
33
+ InputParam,
34
+ OutputParam,
35
+ )
36
+
37
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
38
+
39
+
40
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
41
+ def retrieve_timesteps(
42
+ scheduler,
43
+ num_inference_steps: Optional[int] = None,
44
+ device: Optional[Union[str, torch.device]] = None,
45
+ timesteps: Optional[List[int]] = None,
46
+ sigmas: Optional[List[float]] = None,
47
+ **kwargs,
48
+ ):
49
+ r"""
50
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
51
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
52
+
53
+ Args:
54
+ scheduler (`SchedulerMixin`):
55
+ The scheduler to get timesteps from.
56
+ num_inference_steps (`int`):
57
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
58
+ must be `None`.
59
+ device (`str` or `torch.device`, *optional*):
60
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
61
+ timesteps (`List[int]`, *optional*):
62
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
63
+ `num_inference_steps` and `sigmas` must be `None`.
64
+ sigmas (`List[float]`, *optional*):
65
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
66
+ `num_inference_steps` and `timesteps` must be `None`.
67
+
68
+ Returns:
69
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
70
+ second element is the number of inference steps.
71
+ """
72
+ if timesteps is not None and sigmas is not None:
73
+ raise ValueError(
74
+ "Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values"
75
+ )
76
+ if timesteps is not None:
77
+ accepts_timesteps = "timesteps" in set(
78
+ inspect.signature(scheduler.set_timesteps).parameters.keys()
79
+ )
80
+ if not accepts_timesteps:
81
+ raise ValueError(
82
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
83
+ f" timestep schedules. Please check whether you are using the correct scheduler."
84
+ )
85
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
86
+ timesteps = scheduler.timesteps
87
+ num_inference_steps = len(timesteps)
88
+ elif sigmas is not None:
89
+ accept_sigmas = "sigmas" in set(
90
+ inspect.signature(scheduler.set_timesteps).parameters.keys()
91
+ )
92
+ if not accept_sigmas:
93
+ raise ValueError(
94
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
95
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
96
+ )
97
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
98
+ timesteps = scheduler.timesteps
99
+ num_inference_steps = len(timesteps)
100
+ else:
101
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
102
+ timesteps = scheduler.timesteps
103
+ return timesteps, num_inference_steps
104
+
105
+
106
+ def retrieve_latents(
107
+ encoder_output: torch.Tensor,
108
+ generator: Optional[torch.Generator] = None,
109
+ sample_mode: str = "sample",
110
+ ):
111
+ if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
112
+ return encoder_output.latent_dist.sample(generator)
113
+ elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
114
+ return encoder_output.latent_dist.mode()
115
+ elif hasattr(encoder_output, "latents"):
116
+ return encoder_output.latents
117
+ else:
118
+ raise AttributeError("Could not access latents of provided encoder_output")
119
+
120
+
121
+ def _initialize_kv_cache(
122
+ components: ModularPipeline,
123
+ kv_cache_existing: Optional[List[Dict]],
124
+ batch_size: int,
125
+ dtype: torch.dtype,
126
+ device: torch.device,
127
+ local_attn_size: int,
128
+ frame_seq_length: int,
129
+ ):
130
+ """
131
+ Initialize a Per-GPU KV cache for the Wan model.
132
+ Mirrors causal_inference.py:279-313
133
+ """
134
+ kv_cache = []
135
+
136
+ # Calculate KV cache size
137
+ if local_attn_size != -1:
138
+ # Use the local attention size to compute the KV cache size
139
+ kv_cache_size = local_attn_size * frame_seq_length
140
+ else:
141
+ # Use the default KV cache size
142
+ kv_cache_size = 32760
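+ # (Note, as an aid to the reader: 32760 tokens = 21 latent frames x the default frame_seq_length of 1560.)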
143
+
144
+ # Get transformer config
145
+ num_transformer_blocks = len(components.transformer.blocks)
146
+ num_heads = components.transformer.config.num_heads
147
+ dim = components.transformer.config.dim
148
+ k_shape = [batch_size, kv_cache_size, num_heads, dim // num_heads]
149
+ v_shape = [batch_size, kv_cache_size, num_heads, dim // num_heads]
150
+
151
+ # Check if we can reuse existing cache
152
+ if (
153
+ kv_cache_existing
154
+ and len(kv_cache_existing) > 0
155
+ and list(kv_cache_existing[0]["k"].shape) == k_shape
156
+ and list(kv_cache_existing[0]["v"].shape) == v_shape
157
+ ):
158
+ for i in range(num_transformer_blocks):
159
+ kv_cache_existing[i]["k"].zero_()
160
+ kv_cache_existing[i]["v"].zero_()
161
+ kv_cache_existing[i]["global_end_index"] = 0
162
+ kv_cache_existing[i]["local_end_index"] = 0
163
+ return kv_cache_existing
164
+ else:
165
+ # Create new cache
166
+ for _ in range(num_transformer_blocks):
167
+ kv_cache.append(
168
+ {
169
+ "k": torch.zeros(k_shape, dtype=dtype, device=device).contiguous(),
170
+ "v": torch.zeros(v_shape, dtype=dtype, device=device).contiguous(),
171
+ "global_end_index": 0,
172
+ "local_end_index": 0,
173
+ }
174
+ )
175
+ return kv_cache
176
+
177
+
178
+ def _initialize_crossattn_cache(
179
+ components: ModularPipeline,
180
+ crossattn_cache_existing: Optional[List[Dict]],
181
+ batch_size: int,
182
+ dtype: torch.dtype,
183
+ device: torch.device,
184
+ ):
185
+ """
186
+ Initialize a Per-GPU cross-attention cache for the Wan model.
187
+ Mirrors causal_inference.py:315-338
188
+ """
189
+ crossattn_cache = []
190
+
191
+ # Get transformer config
192
+ num_transformer_blocks = len(components.transformer.blocks)
193
+ num_heads = components.transformer.config.num_heads
194
+ dim = components.transformer.config.dim
195
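+ # The sequence length of 512 matches the text encoder's max_sequence_length, so the prompt's
+ # cross-attention keys/values can be computed once and reused for every block.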
+ k_shape = [batch_size, 512, num_heads, dim // num_heads]
196
+ v_shape = [batch_size, 512, num_heads, dim // num_heads]
197
+
198
+ # Check if we can reuse existing cache
199
+ if (
200
+ crossattn_cache_existing
201
+ and len(crossattn_cache_existing) > 0
202
+ and list(crossattn_cache_existing[0]["k"].shape) == k_shape
203
+ and list(crossattn_cache_existing[0]["v"].shape) == v_shape
204
+ ):
205
+ for i in range(num_transformer_blocks):
206
+ crossattn_cache_existing[i]["k"].zero_()
207
+ crossattn_cache_existing[i]["v"].zero_()
208
+ crossattn_cache_existing[i]["is_init"] = False
209
+ return crossattn_cache_existing
210
+ else:
211
+ # Create new cache
212
+ for _ in range(num_transformer_blocks):
213
+ crossattn_cache.append(
214
+ {
215
+ "k": torch.zeros(k_shape, dtype=dtype, device=device).contiguous(),
216
+ "v": torch.zeros(v_shape, dtype=dtype, device=device).contiguous(),
217
+ "is_init": False,
218
+ }
219
+ )
220
+ return crossattn_cache
221
+
222
+
223
+ class WanInputStep(ModularPipelineBlocks):
224
+ model_name = "WanRT"
225
+
226
+ @property
227
+ def description(self) -> str:
228
+ return (
229
+ "Input processing step that:\n"
230
+ " 1. Determines `batch_size` and `dtype` based on `prompt_embeds`\n"
231
+ " 2. Adjusts input tensor shapes based on `batch_size` (number of prompts) and `num_videos_per_prompt`\n\n"
232
+ "All input tensors are expected to have either batch_size=1 or match the batch_size\n"
233
+ "of prompt_embeds. The tensors will be duplicated across the batch dimension to\n"
234
+ "have a final batch_size of batch_size * num_videos_per_prompt."
235
+ )
236
+
237
+ @property
238
+ def inputs(self) -> List[InputParam]:
239
+ return [
240
+ InputParam("num_videos_per_prompt", default=1),
241
+ InputParam(
242
+ "prompt_embeds",
243
+ required=True,
244
+ type_hint=torch.Tensor,
245
+ description="Pre-generated text embeddings. Can be generated from text_encoder step.",
246
+ ),
247
+ InputParam(
248
+ "negative_prompt_embeds",
249
+ type_hint=torch.Tensor,
250
+ description="Pre-generated negative text embeddings. Can be generated from text_encoder step.",
251
+ ),
252
+ ]
253
+
254
+ @property
255
+ def intermediate_outputs(self) -> List[str]:
256
+ return [
257
+ OutputParam(
258
+ "batch_size",
259
+ type_hint=int,
260
+ description="Number of prompts, the final batch size of model inputs should be batch_size * num_videos_per_prompt",
261
+ ),
262
+ OutputParam(
263
+ "dtype",
264
+ type_hint=torch.dtype,
265
+ description="Data type of model tensor inputs (determined by `prompt_embeds`)",
266
+ ),
267
+ OutputParam(
268
+ "prompt_embeds",
269
+ type_hint=torch.Tensor,
270
+ kwargs_type="denoiser_input_fields",  # already in intermediates state, but declared here again for denoiser_input_fields
271
+ description="text embeddings used to guide the image generation",
272
+ ),
273
+ OutputParam(
274
+ "negative_prompt_embeds",
275
+ type_hint=torch.Tensor,
276
+ kwargs_type="denoiser_input_fields",  # already in intermediates state, but declared here again for denoiser_input_fields
277
+ description="negative text embeddings used to guide the image generation",
278
+ ),
279
+ ]
280
+
281
+ def check_inputs(self, components, block_state):
282
+ if (
283
+ block_state.prompt_embeds is not None
284
+ and block_state.negative_prompt_embeds is not None
285
+ ):
286
+ if (
287
+ block_state.prompt_embeds.shape
288
+ != block_state.negative_prompt_embeds.shape
289
+ ):
290
+ raise ValueError(
291
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
292
+ f" got: `prompt_embeds` {block_state.prompt_embeds.shape} != `negative_prompt_embeds`"
293
+ f" {block_state.negative_prompt_embeds.shape}."
294
+ )
295
+
296
+ @torch.no_grad()
297
+ def __call__(
298
+ self, components: ModularPipeline, state: PipelineState
299
+ ) -> PipelineState:
300
+ block_state = self.get_block_state(state)
301
+ self.check_inputs(components, block_state)
302
+
303
+ block_state.batch_size = block_state.prompt_embeds.shape[0]
304
+ block_state.dtype = block_state.prompt_embeds.dtype
305
+
306
+ _, seq_len, _ = block_state.prompt_embeds.shape
307
+ block_state.prompt_embeds = block_state.prompt_embeds.repeat(
308
+ 1, block_state.num_videos_per_prompt, 1
309
+ )
310
+ block_state.prompt_embeds = block_state.prompt_embeds.view(
311
+ block_state.batch_size * block_state.num_videos_per_prompt, seq_len, -1
312
+ )
313
+
314
+ if block_state.negative_prompt_embeds is not None:
315
+ _, seq_len, _ = block_state.negative_prompt_embeds.shape
316
+ block_state.negative_prompt_embeds = (
317
+ block_state.negative_prompt_embeds.repeat(
318
+ 1, block_state.num_videos_per_prompt, 1
319
+ )
320
+ )
321
+ block_state.negative_prompt_embeds = (
322
+ block_state.negative_prompt_embeds.view(
323
+ block_state.batch_size * block_state.num_videos_per_prompt,
324
+ seq_len,
325
+ -1,
326
+ )
327
+ )
328
+
329
+ self.set_block_state(state, block_state)
330
+
331
+ return components, state
332
+
333
+
334
+ class WanRTStreamingSetTimestepsStep(ModularPipelineBlocks):
335
+ model_name = "WanRT"
336
+
337
+ @property
338
+ def expected_components(self) -> List[ComponentSpec]:
339
+ return [
340
+ ComponentSpec("scheduler", UniPCMultistepScheduler),
341
+ ]
342
+
343
+ @property
344
+ def description(self) -> str:
345
+ return "Step that sets the scheduler's timesteps for inference"
346
+
347
+ @property
348
+ def inputs(self) -> List[InputParam]:
349
+ return [
350
+ InputParam("num_inference_steps", default=4),
351
+ InputParam("timesteps"),
352
+ InputParam("sigmas"),
353
+ ]
354
+
355
+ @property
356
+ def intermediate_outputs(self) -> List[OutputParam]:
357
+ return [
358
+ OutputParam(
359
+ "timesteps",
360
+ type_hint=torch.Tensor,
361
+ description="The timesteps to use for inference",
362
+ ),
363
+ OutputParam(
364
+ "all_timesteps",
365
+ type_hint=torch.Tensor,
366
+ description="The timesteps to use for inference",
367
+ ),
368
+ OutputParam(
369
+ "num_inference_steps",
370
+ type_hint=int,
371
+ description="The number of denoising steps to perform at inference time",
372
+ ),
373
+ ]
374
+
375
+ @torch.no_grad()
376
+ def __call__(
377
+ self, components: ModularPipeline, state: PipelineState
378
+ ) -> PipelineState:
379
+ block_state = self.get_block_state(state)
380
+ block_state.device = components._execution_device
381
+
382
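+ # Build a shifted flow-matching sigma schedule: sigma' = shift * sigma / (1 + (shift - 1) * sigma).
+ # With shift = 5.0 this biases the 1000-entry grid toward higher noise levels.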
+ shift = 5.0
383
+ sigmas = torch.linspace(1.0, 0.0, 1001)[:-1]
384
+ sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)
385
+
386
+ timesteps = sigmas.to(components.transformer.device) * 1000.0
387
+ zero_padded_timesteps = torch.cat(
388
+ [
389
+ timesteps,
390
+ torch.tensor([0], device=components.transformer.device),
391
+ ]
392
+ )
393
+ denoising_steps = torch.linspace(
394
+ 1000, 0, block_state.num_inference_steps, dtype=torch.float32
395
+ ).to(torch.long)
396
+
397
+ block_state.timesteps = zero_padded_timesteps[1000 - denoising_steps]
398
+ block_state.all_timesteps = timesteps
399
+ block_state.sigmas = sigmas
400
+
401
+ self.set_block_state(state, block_state)
402
+
403
+ return components, state
404
+
405
+
406
+ class WanRTStreamingPrepareLatentsStep(ModularPipelineBlocks):
407
+ model_name = "WanRT"
408
+
409
+ @property
410
+ def expected_components(self) -> List[ComponentSpec]:
411
+ return [
412
+ ComponentSpec("vae", AutoencoderKLWan),
413
+ ]
414
+
415
+ @property
416
+ def expected_configs(self) -> List[ConfigSpec]:
417
+ return [ConfigSpec("num_frames_per_block", 3)]
418
+
419
+ @property
420
+ def description(self) -> str:
421
+ return "Prepare latents step that prepares the latents for the text-to-video generation process"
422
+
423
+ @property
424
+ def inputs(self) -> List[InputParam]:
425
+ return [
426
+ InputParam("height", type_hint=int),
427
+ InputParam("width", type_hint=int),
428
+ InputParam("num_blocks", type_hint=int),
429
+ InputParam("num_frames_per_block", type_hint=int),
430
+ InputParam("latents", type_hint=Optional[torch.Tensor]),
431
+ InputParam("init_latents", type_hint=Optional[torch.Tensor]),
432
+ InputParam("final_latents", type_hint=Optional[torch.Tensor]),
433
+ InputParam("num_videos_per_prompt", type_hint=int, default=1),
434
+ InputParam("generator"),
435
+ InputParam(
436
+ "dtype",
437
+ type_hint=torch.dtype,
438
+ description="The dtype of the model inputs",
439
+ ),
440
+ ]
441
+
442
+ @property
443
+ def intermediate_outputs(self) -> List[OutputParam]:
444
+ return [
445
+ OutputParam(
446
+ "latents",
447
+ type_hint=torch.Tensor,
448
+ description="The initial latents to use for the denoising process",
449
+ ),
450
+ OutputParam(
451
+ "init_latents",
452
+ type_hint=torch.Tensor,
453
+ description="The initial latents to use for the denoising process",
454
+ ),
455
+ OutputParam(
456
+ "final_latents",
457
+ type_hint=torch.Tensor,
458
+ ),
459
+ ]
460
+
461
+ @staticmethod
462
+ def check_inputs(components, block_state):
463
+ if (
464
+ block_state.height is not None
465
+ and block_state.height % components.vae_scale_factor_spatial != 0
466
+ ) or (
467
+ block_state.width is not None
468
+ and block_state.width % components.vae_scale_factor_spatial != 0
469
+ ):
470
+ raise ValueError(
471
+ f"`height` and `width` have to be divisible by {components.vae_scale_factor_spatial} but are {block_state.height} and {block_state.width}."
472
+ )
473
+
474
+ @staticmethod
475
+ def prepare_latents(
476
+ components,
477
+ batch_size: int,
478
+ num_channels_latents: int = 16,
479
+ height: int = 352,
480
+ width: int = 640,
481
+ num_blocks: int = 9,
482
+ num_frames_per_block: int = 3,
483
+ dtype: Optional[torch.dtype] = None,
484
+ device: Optional[torch.device] = None,
485
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
486
+ latents: Optional[torch.Tensor] = None,
487
+ ) -> torch.Tensor:
488
+ if latents is not None:
489
+ return latents.to(device=device, dtype=dtype)
490
+
491
+ num_latent_frames = num_blocks * num_frames_per_block
492
+ shape = (
493
+ batch_size,
494
+ num_channels_latents,
495
+ num_latent_frames,
496
+ int(height) // components.vae_scale_factor_spatial,
497
+ int(width) // components.vae_scale_factor_spatial,
498
+ )
499
+ if isinstance(generator, list) and len(generator) != batch_size:
500
+ raise ValueError(
501
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
502
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
503
+ )
504
+
505
+ latents = randn_tensor(
506
+ shape,
507
+ generator=generator,
508
+ device=components.transformer.device,
509
+ dtype=dtype,
510
+ )
511
+ return latents
512
+
513
+ @torch.no_grad()
514
+ def __call__(
515
+ self, components: ModularPipeline, state: PipelineState
516
+ ) -> PipelineState:
517
+ block_state = self.get_block_state(state)
518
+
519
+ block_state.height = block_state.height or components.default_height
520
+ block_state.width = block_state.width or components.default_width
521
+ block_state.device = components._execution_device
522
+ block_state.num_channels_latents = components.num_channels_latents
523
+
524
+ self.check_inputs(components, block_state)
525
+
526
+ block_state.init_latents = self.prepare_latents(
527
+ components,
528
+ 1,
529
+ block_state.num_channels_latents,
530
+ block_state.height,
531
+ block_state.width,
532
+ block_state.num_blocks,
533
+ components.config.num_frames_per_block,
534
+ components.transformer.dtype,
535
+ block_state.device,
536
+ block_state.generator,
537
+ block_state.init_latents,
538
+ )
539
+ if block_state.final_latents is None:
540
+ block_state.final_latents = torch.zeros_like(
541
+ block_state.init_latents, device=components.transformer.device
542
+ )
543
+ self.set_block_state(state, block_state)
544
+
545
+ return components, state
546
+
547
+
548
+ class WanRTStreamingExtractBlockLatentsStep(ModularPipelineBlocks):
549
+ """
550
+ Extracts a single block of latents from the full video buffer for streaming generation.
551
+
552
+ This block simply slices the init_latents buffer to get the current block's latents.
+ The init_latents buffer should be created beforehand using WanRTStreamingPrepareLatentsStep.
554
+ """
555
+
556
+ model_name = "WanRT"
557
+
558
+ @property
559
+ def expected_components(self) -> List[ComponentSpec]:
560
+ return []
561
+
562
+ @property
563
+ def description(self) -> str:
564
+ return (
565
+ "Extracts a single block from the full latent buffer for streaming generation. "
566
+ "Slices final_latents based on block_idx to get current block's latents."
567
+ )
568
+
569
+ @property
570
+ def inputs(self) -> List[InputParam]:
571
+ return [
572
+ InputParam(
573
+ "final_latents",
574
+ required=True,
575
+ type_hint=torch.Tensor,
576
+ description="Full latent buffer [B, C, total_frames, H, W]",
577
+ ),
578
+ InputParam(
579
+ "init_latents",
580
+ required=True,
581
+ type_hint=torch.Tensor,
582
+ description="Full latent buffer [B, C, total_frames, H, W]",
583
+ ),
584
+ InputParam(
585
+ "latents",
586
+ type_hint=torch.Tensor,
587
+ description="Full latent buffer [B, C, total_frames, H, W]",
588
+ ),
589
+ InputParam(
590
+ "block_idx",
591
+ required=True,
592
+ type_hint=int,
593
+ default=0,
594
+ description="Current block index to process",
595
+ ),
596
+ InputParam(
597
+ "num_frames_per_block",
598
+ required=True,
599
+ type_hint=int,
600
+ default=3,
601
+ description="Number of frames per block",
602
+ ),
603
+ ]
604
+
605
+ @property
606
+ def intermediate_outputs(self) -> List[OutputParam]:
607
+ return [
608
+ OutputParam(
609
+ "latents",
610
+ type_hint=torch.Tensor,
611
+ description="Latents for current block [B, C, num_frames_per_block, H, W]",
612
+ ),
613
+ OutputParam(
614
+ "current_start_frame",
615
+ type_hint=int,
616
+ description="Starting frame index for current block",
617
+ ),
618
+ ]
619
+
620
+ @torch.no_grad()
621
+ def __call__(
622
+ self, components: ModularPipeline, state: PipelineState
623
+ ) -> PipelineState:
624
+ block_state = self.get_block_state(state)
625
+
626
+ num_frames_per_block = block_state.num_frames_per_block
627
+ block_idx = block_state.block_idx
628
+
629
+ # Calculate frame range for current block
630
+ start_frame = block_idx * num_frames_per_block
631
+ end_frame = start_frame + num_frames_per_block
632
+
633
+ # Extract single block from full latent buffer
634
+ # init_latents shape: [B, C, total_frames, H, W]
635
+ # Extract frames along the time dimension (dim=2)
636
+ block_state.latents = block_state.init_latents[
637
+ :, :, start_frame:end_frame, :, :
638
+ ]
639
+ block_state.current_start_frame = start_frame
640
+
641
+ self.set_block_state(state, block_state)
642
+ return components, state
643
+
644
+
645
+ class WanRTStreamingSetupKVCache(ModularPipelineBlocks):
646
+ """
647
+ Initializes KV cache and cross-attention cache for streaming generation.
648
+
649
+ This block sets up the persistent caches used across all blocks in streaming
650
+ generation. Mirrors the cache initialization logic from causal_inference.py.
651
+ Should be called once at the start of streaming generation.
652
+ """
653
+
654
+ model_name = "WanRT"
655
+
656
+ @property
657
+ def expected_components(self) -> List[ComponentSpec]:
658
+ return [
659
+ ComponentSpec("transformer", torch.nn.Module),
660
+ ]
661
+
662
+ @property
663
+ def expected_configs(self) -> List[ConfigSpec]:
664
+ return [
665
+ ConfigSpec("kv_cache_num_frames", 3),
666
+ ConfigSpec("num_frames_per_block", 3),
667
+ ConfigSpec("frame_seq_length", 1560),
668
+ ConfigSpec("frame_cache_len", 9),
669
+ ]
670
+
671
+ @property
672
+ def description(self) -> str:
673
+ return (
674
+ "Initializes KV cache and cross-attention cache for streaming generation. "
675
+ "Creates persistent caches that will be reused across all blocks."
676
+ )
677
+
678
+ @property
679
+ def inputs(self) -> List[InputParam]:
680
+ return [
681
+ InputParam(
682
+ "kv_cache",
683
+ required=False,
684
+ type_hint=Optional[List[Dict]],
685
+ description="Existing KV cache. If provided and shape matches, will be zeroed instead of recreated.",
686
+ ),
687
+ InputParam(
688
+ "crossattn_cache",
689
+ required=False,
690
+ type_hint=Optional[List[Dict]],
691
+ description="Existing cross-attention cache. If provided and shape matches, will be zeroed.",
692
+ ),
693
+ InputParam(
694
+ "local_attn_size",
695
+ required=False,
696
+ type_hint=int,
697
+ default=-1,
698
+ description="Local attention size for computing KV cache size. -1 uses default (32760).",
699
+ ),
700
+ InputParam(
701
+ "dtype",
702
+ required=False,
703
+ type_hint=torch.dtype,
704
+ description="Data type for caches (defaults to bfloat16)",
705
+ ),
706
+ InputParam(
707
+ "update_prompt_embeds",
708
+ required=False,
709
+ description="Flag to reinitialize prompt embeds if they are updated.",
710
+ default=False,
711
+ ),
712
+ ]
713
+
714
+ @property
715
+ def outputs(self) -> List[OutputParam]:
716
+ return [
717
+ OutputParam(
718
+ "kv_cache",
719
+ type_hint=List[Dict],
720
+ description="Initialized KV cache (list of dicts per transformer block)",
721
+ ),
722
+ OutputParam(
723
+ "crossattn_cache",
724
+ type_hint=List[Dict],
725
+ description="Initialized cross-attention cache",
726
+ ),
727
+ OutputParam(
728
+ "local_attn_size",
729
+ ),
730
+ ]
731
+
732
+ @torch.no_grad()
733
+ def __call__(
734
+ self, components: ModularPipeline, state: PipelineState
735
+ ) -> PipelineState:
736
+ block_state = self.get_block_state(state)
737
+ batch_size = 1 # Streaming always uses batch_size=1
738
+
739
+ # Get existing caches if they exist
740
+ kv_cache = block_state.kv_cache
741
+ crossattn_cache = block_state.crossattn_cache
742
+
743
+ if block_state.crossattn_cache is None or block_state.update_prompt_embeds:
744
+ block_state.crossattn_cache = _initialize_crossattn_cache(
745
+ components,
746
+ crossattn_cache,
747
+ batch_size,
748
+ components.transformer.dtype,
749
+ components.transformer.device,
750
+ )
751
+
752
+ block_state.local_attn_size = (
753
+ components.config.kv_cache_num_frames
754
+ + components.config.num_frames_per_block
755
+ )
756
+ for block in components.transformer.blocks:
757
+ block.self_attn.local_attn_size = -1
758
+ for block in components.transformer.blocks:
759
+ block.self_attn.num_frame_per_block = components.config.num_frames_per_block
760
+
761
+ block_state.kv_cache = _initialize_kv_cache(
762
+ components,
763
+ kv_cache,
764
+ batch_size,
765
+ components.transformer.dtype,
766
+ components.transformer.device,
767
+ block_state.local_attn_size,
768
+ components.config.frame_seq_length,
769
+ )
770
+
771
+ self.set_block_state(state, block_state)
772
+ return components, state
773
+
774
+
775
+ class WanRTStreamingRecomputeKVCache(ModularPipelineBlocks):
776
+ @property
777
+ def inputs(self) -> List[InputParam]:
778
+ return [
779
+ InputParam(
780
+ "latents",
781
+ type_hint=torch.Tensor,
782
+ description="Current block latents [B, C, num_frames_per_block, H, W]",
783
+ ),
784
+ InputParam(
785
+ "num_frames_per_block",
786
+ type_hint=int,
787
+ description="Number of frames per block",
788
+ ),
789
+ InputParam(
790
+ "block_idx",
791
+ type_hint=int,
792
+ description="Current block index to process",
793
+ ),
794
+ InputParam(
795
+ "block_mask",
796
+ description="Block-wise causal attention mask",
797
+ ),
798
+ InputParam(
799
+ "current_start_frame",
800
+ type_hint=int,
801
+ description="Starting frame index for current block",
802
+ ),
803
+ InputParam(
804
+ "videos",
805
+ type_hint=torch.Tensor,
806
+ description="Video frames for context encoding",
807
+ ),
808
+ InputParam(
809
+ "final_latents",
810
+ type_hint=torch.Tensor,
811
+ description="Full latent buffer [B, C, total_frames, H, W]",
812
+ ),
813
+ InputParam(
814
+ "prompt_embeds",
815
+ type_hint=torch.Tensor,
816
+ description="Text embeddings to guide generation",
817
+ ),
818
+ InputParam(
819
+ "kv_cache",
820
+ type_hint=torch.Tensor,
821
+ description="Key-value cache for attention",
822
+ ),
823
+ InputParam(
824
+ "crossattn_cache",
825
+ type_hint=torch.Tensor,
826
+ description="Cross-attention cache",
827
+ ),
828
+ InputParam(
829
+ "encoder_cache",
830
+ description="Encoder feature cache",
831
+ ),
832
+ InputParam(
833
+ "frame_cache_context",
834
+ description="Cached context frames for reencoding",
835
+ ),
836
+ InputParam(
837
+ "local_attn_size",
838
+ ),
839
+ ]
840
+
841
+ @property
842
+ def expected_configs(self) -> List[ConfigSpec]:
843
+ return [ConfigSpec("seq_length", 32760)]
844
+
845
+ def prepare_latents(self, components, block_state):
846
+ frames = block_state.frame_cache_context[0].half()
847
+
848
+ components.vae._enc_feat_map = [None] * 55
849
+ latents = retrieve_latents(components.vae.encode(frames), sample_mode="argmax")
850
+ latents_mean = (
851
+ torch.tensor(components.vae.config.latents_mean)
852
+ .view(1, components.vae.config.z_dim, 1, 1, 1)
853
+ .to(latents.device, latents.dtype)
854
+ )
855
+ latents_std = 1.0 / torch.tensor(components.vae.config.latents_std).view(
856
+ 1, components.vae.config.z_dim, 1, 1, 1
857
+ ).to(latents.device, latents.dtype)
858
+ latents = (latents - latents_mean) * latents_std
859
+
860
+ return latents.to(components.transformer.dtype)
861
+
862
+ def get_context_frames(self, components, block_state):
863
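+ # Build the context used to refill the KV cache: keep the first (anchor) latent frame plus the
+ # most recent kv_cache_num_frames - 1 frames; once the window has moved past the start, the
+ # anchor is re-encoded from the cached decoded frames instead.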
+ current_kv_cache_num_frames = components.config.kv_cache_num_frames
864
+ context_frames = block_state.final_latents[
865
+ :, :, : block_state.current_start_frame
866
+ ]
867
+
868
+ if (
869
+ block_state.block_idx - 1
870
+ ) * block_state.num_frames_per_block < current_kv_cache_num_frames:
871
+ if current_kv_cache_num_frames == 1:
872
+ context_frames = context_frames[:, :, :1]
873
+ else:
874
+ context_frames = torch.cat(
875
+ (
876
+ context_frames[:, :, :1],
877
+ context_frames[:, :, 1:][
878
+ :, :, -current_kv_cache_num_frames + 1 :
879
+ ],
880
+ ),
881
+ dim=2,
882
+ )
883
+ else:
884
+ context_frames = context_frames[:, :, 1:][
885
+ :, :, -current_kv_cache_num_frames + 1 :
886
+ ]
887
+ first_frame_latent = self.prepare_latents(components, block_state)
888
+ first_frame_latent = first_frame_latent.to(block_state.latents)
889
+ context_frames = torch.cat((first_frame_latent, context_frames), dim=2)
890
+
891
+ return context_frames
892
+
893
+ def __call__(self, components, state):
894
+ block_state = self.get_block_state(state)
895
+ if block_state.block_idx == 0:
896
+ return components, state
897
+
898
+ start_frame = min(
899
+ block_state.current_start_frame, components.config.kv_cache_num_frames
900
+ )
901
+ context_frames = self.get_context_frames(components, block_state)
902
+ block_state.block_mask = (
903
+ components.transformer._prepare_blockwise_causal_attn_mask(
904
+ components.transformer.device,
905
+ num_frames=context_frames.shape[2],
906
+ frame_seqlen=components.config.frame_seq_length,
907
+ num_frame_per_block=block_state.num_frames_per_block,
908
+ local_attn_size=-1,
909
+ )
910
+ )
911
+ components.transformer.block_mask = block_state.block_mask
912
+ context_timestep = torch.zeros(
913
+ (context_frames.shape[0], context_frames.shape[2]),
914
+ device=components.transformer.device,
915
+ dtype=torch.int64,
916
+ )
917
+ components.transformer(
918
+ x=context_frames.to(components.transformer.dtype),
919
+ t=context_timestep,
920
+ context=block_state.prompt_embeds.to(components.transformer.dtype),
921
+ kv_cache=block_state.kv_cache,
922
+ seq_len=components.config.seq_length,
923
+ crossattn_cache=block_state.crossattn_cache,
924
+ current_start=start_frame * components.config.frame_seq_length,
925
+ cache_start=None,
926
+ )
927
+ components.transformer.block_mask = None
928
+
929
+ return components, state
930
+
931
+
932
+ class WanRTStreamingBeforeDenoiseStep(SequentialPipelineBlocks):
933
+ block_classes = [
934
+ WanRTStreamingSetTimestepsStep,
935
+ WanRTStreamingPrepareLatentsStep,
936
+ WanRTStreamingExtractBlockLatentsStep,
937
+ WanRTStreamingSetupKVCache,
938
+ WanRTStreamingRecomputeKVCache,
939
+ ]
940
+ block_names = [
941
+ "set_timesteps",
942
+ "prepare_latents",
943
+ "extract_block_init_latents",
944
+ "setup_kv_cache",
945
+ "recompute_kv_cache",
946
+ ]
947
+
948
+ @property
949
+ def description(self):
950
+ return (
951
+ "Before denoise step that prepare the inputs for the denoise step.\n"
952
+ + "This is a sequential pipeline blocks:\n"
953
+ + " - `WanRTInputStep` is used to adjust the batch size of the model inputs\n"
954
+ + " - `WanRTSetTimestepsStep` is used to set the timesteps\n"
955
+ + " - `WanRTPrepareLatentsStep` is used to prepare the latents\n"
956
+ )
decoders.py ADDED
@@ -0,0 +1,145 @@
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import Any, List, Tuple, Union
16
+
17
+ import numpy as np
18
+ import PIL
19
+ import torch
20
+
21
+ from diffusers.configuration_utils import FrozenDict
22
+ from diffusers.models import AutoencoderKLWan
23
+ from diffusers.utils import logging
24
+ from diffusers.video_processor import VideoProcessor
25
+ from diffusers.modular_pipelines import ModularPipelineBlocks, PipelineState
26
+ from diffusers.modular_pipelines.modular_pipeline_utils import (
27
+ ComponentSpec,
28
+ InputParam,
29
+ OutputParam,
30
+ )
31
+ import types
32
+
33
+
34
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
35
+
36
+
37
+ class WanRTDecodeStep(ModularPipelineBlocks):
38
+ model_name = "WanRT"
39
+ decoder_cache = []
40
+
41
+ @property
42
+ def expected_components(self) -> List[ComponentSpec]:
43
+ return [
44
+ ComponentSpec(
45
+ "vae",
46
+ AutoencoderKLWan,
47
+ repo="Wan-AI/Wan2.1-T2V-14B-Diffusers",
48
+ subfolder="vae",
49
+ ),
50
+ ComponentSpec(
51
+ "video_processor",
52
+ VideoProcessor,
53
+ config=FrozenDict({"vae_scale_factor": 8}),
54
+ default_creation_method="from_config",
55
+ ),
56
+ ]
57
+
58
+ @property
59
+ def description(self) -> str:
60
+ return "Step that decodes the denoised latents into images"
61
+
62
+ @property
63
+ def inputs(self) -> List[Tuple[str, Any]]:
64
+ return [
65
+ InputParam("output_type", default="pil"),
66
+ InputParam(
67
+ "latents",
68
+ required=True,
69
+ type_hint=torch.Tensor,
70
+ description="The denoised latents from the denoising step",
71
+ ),
72
+ InputParam(
73
+ "frame_cache_context",
74
+ description="The denoised latents from the denoising step",
75
+ ),
76
+ InputParam(
77
+ "block_idx",
78
+ description="The denoised latents from the denoising step",
79
+ ),
80
+ InputParam(
81
+ "decoder_cache",
82
+ description="The denoised latents from the denoising step",
83
+ ),
84
+ ]
85
+
86
+ @property
87
+ def intermediate_outputs(self) -> List[str]:
88
+ return [
89
+ OutputParam(
90
+ "videos",
91
+ type_hint=Union[
92
+ List[List[PIL.Image.Image]], List[torch.Tensor], List[np.ndarray]
93
+ ],
94
+ description="The generated videos, can be a PIL.Image.Image, torch.Tensor or a numpy array",
95
+ )
96
+ ]
97
+
98
+ @torch.no_grad()
99
+ def __call__(self, components, state: PipelineState) -> PipelineState:
100
+ block_state = self.get_block_state(state)
101
+ vae_dtype = components.vae.dtype
102
+
103
+ # Disable clearing cache
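+ # The VAE's internal feature cache is kept alive across blocks (clear_cache is replaced with a
+ # no-op), so each new block can be decoded causally without re-decoding earlier frames.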
104
+ if block_state.block_idx == 0:
105
+ components.vae.clear_cache()
106
+ components.vae.clear_cache = lambda: None
107
+ components.vae._feat_map = [None] * 55
108
+
109
+ if block_state.block_idx != 0:
110
+ components.vae._feat_map = block_state.decoder_cache
111
+
112
+ if not block_state.output_type == "latent":
113
+ latents = block_state.latents.to(components.vae.device)
114
+
115
+ # Create tensors directly on target device and dtype to avoid redundant conversions
116
+ latents_mean = torch.tensor(
117
+ components.vae.config.latents_mean,
118
+ device=latents.device,
119
+ dtype=latents.dtype,
120
+ ).view(1, components.vae.config.z_dim, 1, 1, 1)
121
+ latents_std = 1.0 / torch.tensor(
122
+ components.vae.config.latents_std,
123
+ device=latents.device,
124
+ dtype=latents.dtype,
125
+ ).view(1, components.vae.config.z_dim, 1, 1, 1)
126
+
127
+ latents = latents / latents_std + latents_mean
128
+ latents = latents.to(vae_dtype)
129
+
130
+ videos = components.vae.decode(latents, return_dict=False)[0]
131
+
132
+ else:
133
+ block_state.videos = block_state.latents
134
+
135
+ block_state.decoder_cache = components.vae._feat_map
136
+ block_state.frame_cache_context.extend(videos.split(1, dim=2))
137
+
138
+ videos = components.video_processor.postprocess_video(
139
+ videos, output_type=block_state.output_type
140
+ )
141
+ block_state.videos = videos
142
+
143
+ self.set_block_state(state, block_state)
144
+
145
+ return components, state
denoise.py ADDED
@@ -0,0 +1,330 @@
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import Any, List, Tuple
16
+
17
+ import torch
18
+
19
+ from diffusers.configuration_utils import FrozenDict
20
+ from diffusers.guiders import ClassifierFreeGuidance
21
+ from diffusers.models import AutoModel
22
+ from diffusers.schedulers import UniPCMultistepScheduler
23
+ from diffusers.utils import logging
24
+ from diffusers.utils.torch_utils import randn_tensor
25
+ from diffusers.modular_pipelines import (
26
+ BlockState,
27
+ LoopSequentialPipelineBlocks,
28
+ ModularPipelineBlocks,
29
+ PipelineState,
30
+ ModularPipeline,
31
+ )
32
+ from diffusers.modular_pipelines.modular_pipeline_utils import (
33
+ ComponentSpec,
34
+ InputParam,
35
+ OutputParam,
36
+ )
37
+
38
+
39
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
40
+
41
+
42
+ class WanRTStreamingLoopDenoiser(ModularPipelineBlocks):
43
+ model_name = "WanRTStreaming"
44
+
45
+ @property
46
+ def expected_components(self) -> List[ComponentSpec]:
47
+ return [ComponentSpec("transformer", AutoModel)]
48
+
49
+ @property
50
+ def description(self) -> str:
51
+ return (
52
+ "Step within the denoising loop that denoise the latents with guidance. "
53
+ "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
54
+ "object (e.g. `WanRTStreamingDenoiseLoopWrapper`)"
55
+ )
56
+
57
+ @property
58
+ def inputs(self) -> List[Tuple[str, Any]]:
59
+ return [
60
+ InputParam("attention_kwargs"),
61
+ InputParam("block_idx"),
62
+ InputParam(
63
+ "latents",
64
+ required=True,
65
+ type_hint=torch.Tensor,
66
+ description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.",
67
+ ),
68
+ InputParam(
69
+ "prompt_embeds",
70
+ required=True,
71
+ type_hint=torch.Tensor,
72
+ ),
73
+ InputParam(
74
+ "kv_cache",
75
+ required=True,
76
+ type_hint=torch.Tensor,
77
+ ),
78
+ InputParam(
79
+ "crossattn_cache",
80
+ required=True,
81
+ type_hint=torch.Tensor,
82
+ ),
83
+ InputParam(
84
+ "current_start_frame",
85
+ required=True,
86
+ type_hint=torch.Tensor,
87
+ ),
88
+ InputParam(
89
+ "num_inference_steps",
90
+ required=True,
91
+ type_hint=int,
92
+ default=4,
93
+ description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
94
+ ),
95
+ InputParam(
96
+ kwargs_type="guider_input_fields",
97
+ description=(
98
+ "All conditional model inputs that need to be prepared with guider. "
99
+ "It should contain prompt_embeds/negative_prompt_embeds. "
100
+ "Please add `kwargs_type=guider_input_fields` to their parameter spec (`OutputParam`) when they are created and added to the pipeline state"
101
+ ),
102
+ ),
103
+ ]
104
+
105
+ @torch.no_grad()
106
+ def __call__(
107
+ self,
108
+ components: ModularPipeline,
109
+ block_state: BlockState,
110
+ i: int,
111
+ t: torch.Tensor,
112
+ ) -> PipelineState:
113
+ start_frame = min(
114
+ block_state.current_start_frame, components.config.kv_cache_num_frames
115
+ )
116
+
117
+ block_state.noise_pred = components.transformer(
118
+ x=block_state.latents,
119
+ t=t.expand(block_state.latents.shape[0], block_state.num_frames_per_block),
120
+ context=block_state.prompt_embeds,
121
+ kv_cache=block_state.kv_cache,
122
+ seq_len=components.config.seq_length,
123
+ crossattn_cache=block_state.crossattn_cache,
124
+ current_start=start_frame * components.config.frame_seq_length,
125
+ cache_start=start_frame * components.config.frame_seq_length,
126
+ )
127
+
128
+ return components, block_state
129
+
130
+
131
+ class WanRTStreamingLoopAfterDenoiser(ModularPipelineBlocks):
132
+ model_name = "WanRTStreaming"
133
+
134
+ @property
135
+ def expected_components(self) -> List[ComponentSpec]:
136
+ return [
137
+ ComponentSpec("scheduler", UniPCMultistepScheduler),
138
+ ]
139
+
140
+ @property
141
+ def description(self) -> str:
142
+ return (
143
+ "step within the denoising loop that update the latents. "
144
+ "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
145
+ "object (e.g. `WanRTStreamingDenoiseLoopWrapper`)"
146
+ )
147
+
148
+ @property
149
+ def inputs(self) -> List[Tuple[str, Any]]:
150
+ return []
151
+
152
+ @property
153
+ def intermediate_inputs(self) -> List[str]:
154
+ return [
155
+ InputParam("generator"),
156
+ InputParam("block_id"),
157
+ ]
158
+
159
+ @property
160
+ def intermediate_outputs(self) -> List[OutputParam]:
161
+ return [
162
+ OutputParam(
163
+ "latents", type_hint=torch.Tensor, description="The denoised latents"
164
+ )
165
+ ]
166
+
167
+ @torch.no_grad()
168
+ def __call__(
169
+ self,
170
+ components: ModularPipeline,
171
+ block_state: BlockState,
172
+ i: int,
173
+ t: torch.Tensor,
174
+ ):
175
+ # Perform scheduler step using the predicted output
176
+ latents_dtype = block_state.latents.dtype
177
+ timesteps = block_state.all_timesteps
178
+ sigmas = block_state.sigmas
179
+
180
+ timestep_id = torch.argmin((timesteps - t).abs())
181
+ sigma_t = sigmas[timestep_id]
182
+
183
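+ # Flow-matching update: recover the clean sample from the predicted velocity, x_0 = x_t - sigma_t * v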
+ # Perform computation in double precision, then convert back once
184
+ latents = (
185
+ block_state.latents.double()
186
+ - sigma_t.double() * block_state.noise_pred.double()
187
+ ).to(latents_dtype)
188
+
189
+ block_state.latents = latents
190
+
191
+ return components, block_state
192
+
193
+
194
+ class WanRTStreamingDenoiseLoopWrapper(LoopSequentialPipelineBlocks):
195
+ model_name = "WanRTStreaming"
196
+
197
+ @property
198
+ def description(self) -> str:
199
+ return (
200
+ "Streaming denoising loop that processes a single block with persistent KV cache. "
201
+ "Recomputes cache from context frames, denoises current block, and updates cache."
202
+ )
203
+
204
+ def add_noise(self, components, block_state, sample, noise, timestep, index):
205
+ timesteps = block_state.all_timesteps
206
+ sigmas = block_state.sigmas.to(timesteps.device)
207
+
208
+ if timestep.ndim == 2:
209
+ timestep = timestep.flatten(0, 1)
210
+
211
+ timestep_id = torch.argmin(
212
+ (timesteps.unsqueeze(0) - timestep.unsqueeze(1)).abs(), dim=1
213
+ )
214
+ sigma = sigmas[timestep_id].reshape(-1, 1, 1, 1)
215
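+ # Re-noise the prediction to the next timestep: x_t = (1 - sigma_t) * x_0 + sigma_t * noise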
+ sample = (
216
+ 1 - sigma.double()
217
+ ) * sample.double() + sigma.double() * noise.double()
218
+ sample = sample.type_as(noise)
219
+
220
+ return sample
221
+
222
+ @property
223
+ def loop_inputs(self) -> List[InputParam]:
224
+ return [
225
+ InputParam(
226
+ "timesteps",
227
+ required=True,
228
+ type_hint=torch.Tensor,
229
+ description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",
230
+ ),
231
+ InputParam(
232
+ "all_timesteps",
233
+ required=True,
234
+ type_hint=torch.Tensor,
235
+ description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",
236
+ ),
237
+ InputParam(
238
+ "sigmas",
239
+ required=True,
240
+ type_hint=torch.Tensor,
241
+ description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",
242
+ ),
243
+ InputParam("final_latents", type_hint=torch.Tensor),
244
+ InputParam(
245
+ "num_inference_steps",
246
+ required=True,
247
+ type_hint=int,
248
+ description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
249
+ ),
250
+ InputParam(
251
+ "num_frames_per_block",
252
+ required=True,
253
+ type_hint=int,
254
+ default=3,
255
+ ),
256
+ InputParam(
257
+ "current_start_frame",
258
+ required=True,
259
+ type_hint=int,
260
+ ),
261
+ InputParam(
262
+ "block_idx",
263
+ ),
264
+ InputParam(
265
+ "generator",
266
+ ),
267
+ ]
268
+
269
+ @torch.no_grad()
270
+ def __call__(
271
+ self, components: ModularPipeline, state: PipelineState
272
+ ) -> PipelineState:
273
+ block_state = self.get_block_state(state)
274
+
275
+ for i, t in enumerate(block_state.timesteps):
276
+ components, block_state = self.loop_step(components, block_state, i=i, t=t)
277
+ if i < (block_state.num_inference_steps - 1):
278
+ t1 = block_state.timesteps[i + 1]
279
+
280
+ block_state.latents = (
281
+ self.add_noise(
282
+ components,
283
+ block_state,
284
+ block_state.latents.transpose(1, 2).squeeze(0),
285
+ randn_tensor(
286
+ block_state.latents.transpose(1, 2).squeeze(0).shape,
287
+ device=block_state.latents.device,
288
+ dtype=block_state.latents.dtype,
289
+ generator=block_state.generator,
290
+ ),
291
+ t1.expand(
292
+ block_state.latents.shape[0],
293
+ block_state.num_frames_per_block,
294
+ ),
295
+ i,
296
+ )
297
+ .unsqueeze(0)
298
+ .transpose(1, 2)
299
+ )
300
+
301
+ # Update the state
302
+ block_state.final_latents[
303
+ :,
304
+ :,
305
+ block_state.current_start_frame : block_state.current_start_frame
306
+ + block_state.num_frames_per_block,
307
+ ] = block_state.latents
308
+
309
+ self.set_block_state(state, block_state)
310
+
311
+ return components, state
312
+
313
+
314
+ class WanRTStreamingDenoiseStep(WanRTStreamingDenoiseLoopWrapper):
315
+ block_classes = [
316
+ WanRTStreamingLoopDenoiser,
317
+ WanRTStreamingLoopAfterDenoiser,
318
+ ]
319
+ block_names = ["denoiser", "after_denoiser"]
320
+
321
+ @property
322
+ def description(self) -> str:
323
+ return (
324
+ "Denoise step that iteratively denoise the latents. \n"
325
+ "Its loop logic is defined in `WanRTStreamingDenoiseLoopWrapper.__call__` method \n"
326
+ "At each iteration, it runs blocks defined in `sub_blocks` sequencially:\n"
327
+ " - `WanRTStreamingLoopDenoiser`\n"
328
+ " - `WanRTStreamingLoopAfterDenoiser`\n"
329
+ "This block supports both text2vid tasks."
330
+ )
encoders.py ADDED
@@ -0,0 +1,282 @@
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import html
16
+ from typing import List, Optional, Union
17
+
18
+ import regex as re
19
+ import torch
20
+ from transformers import AutoTokenizer, UMT5EncoderModel
21
+
22
+ from diffusers.configuration_utils import FrozenDict
23
+ from diffusers.guiders import ClassifierFreeGuidance
24
+ from diffusers.utils import is_ftfy_available, logging
25
+ from diffusers.modular_pipelines import ModularPipelineBlocks, PipelineState
26
+ from diffusers.modular_pipelines.modular_pipeline_utils import (
27
+ ComponentSpec,
28
+ ConfigSpec,
29
+ InputParam,
30
+ OutputParam,
31
+ )
32
+ from diffusers.modular_pipelines import WanModularPipeline
33
+
34
+
35
+ if is_ftfy_available():
36
+ import ftfy
37
+
38
+
39
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
40
+
41
+
42
+ def basic_clean(text):
43
+ text = ftfy.fix_text(text)
44
+ text = html.unescape(html.unescape(text))
45
+ return text.strip()
46
+
47
+
48
+ def whitespace_clean(text):
49
+ text = re.sub(r"\s+", " ", text)
50
+ text = text.strip()
51
+ return text
52
+
53
+
54
+ def prompt_clean(text):
55
+ text = whitespace_clean(basic_clean(text))
56
+ return text
57
+
58
+
59
+ class WanRTStreamingTextEncoderStep(ModularPipelineBlocks):
60
+ model_name = "WanRTStreaming"
61
+
62
+ @property
63
+ def description(self) -> str:
64
+ return "Text Encoder step that generate text_embeddings to guide the video generation"
65
+
66
+ @property
67
+ def expected_components(self) -> List[ComponentSpec]:
68
+ return [
69
+ ComponentSpec("text_encoder", UMT5EncoderModel),
70
+ ComponentSpec("tokenizer", AutoTokenizer),
71
+ ComponentSpec(
72
+ "guider",
73
+ ClassifierFreeGuidance,
74
+ config=FrozenDict({"guidance_scale": 5.0}),
75
+ default_creation_method="from_config",
76
+ ),
77
+ ]
78
+
79
+ @property
80
+ def expected_configs(self) -> List[ConfigSpec]:
81
+ return []
82
+
83
+ @property
84
+ def inputs(self) -> List[InputParam]:
85
+ return [
86
+ InputParam("prompt"),
87
+ InputParam("negative_prompt"),
88
+ InputParam(
89
+ "prompt_embeds",
90
+ type_hint=torch.Tensor,
91
+ description="text embeddings used to guide the image generation",
92
+ ),
93
+ InputParam(
94
+ "negative_prompt_embeds",
95
+ type_hint=torch.Tensor,
96
+ description="negative text embeddings used to guide the image generation",
97
+ ),
98
+ InputParam("attention_kwargs"),
99
+ ]
100
+
101
+ @property
102
+ def intermediate_outputs(self) -> List[OutputParam]:
103
+ return [
104
+ OutputParam(
105
+ "prompt_embeds",
106
+ type_hint=torch.Tensor,
107
+ kwargs_type="denoiser_input_fields",
108
+ description="text embeddings used to guide the image generation",
109
+ ),
110
+ OutputParam(
111
+ "negative_prompt_embeds",
112
+ type_hint=torch.Tensor,
113
+ kwargs_type="denoiser_input_fields",
114
+ description="negative text embeddings used to guide the image generation",
115
+ ),
116
+ ]
117
+
118
+ @staticmethod
119
+ def check_inputs(block_state):
120
+ if block_state.prompt is not None and (
121
+ not isinstance(block_state.prompt, str)
122
+ and not isinstance(block_state.prompt, list)
123
+ ):
124
+ raise ValueError(
125
+ f"`prompt` has to be of type `str` or `list` but is {type(block_state.prompt)}"
126
+ )
127
+
128
+ @staticmethod
129
+ def _get_t5_prompt_embeds(
130
+ components,
131
+ prompt: Union[str, List[str]],
132
+ max_sequence_length: int,
133
+ device: torch.device,
134
+ ):
135
+ dtype = components.text_encoder.dtype
136
+ prompt = [prompt] if isinstance(prompt, str) else prompt
137
+ prompt = [prompt_clean(u) for u in prompt]
138
+
139
+ text_inputs = components.tokenizer(
140
+ prompt,
141
+ padding="max_length",
142
+ max_length=max_sequence_length,
143
+ truncation=True,
144
+ add_special_tokens=True,
145
+ return_attention_mask=True,
146
+ return_tensors="pt",
147
+ )
148
+ text_input_ids, mask = text_inputs.input_ids, text_inputs.attention_mask
149
+ seq_lens = mask.gt(0).sum(dim=1).long()
150
+ prompt_embeds = components.text_encoder(
151
+ text_input_ids.to(device), mask.to(device)
152
+ ).last_hidden_state
153
+ prompt_embeds = prompt_embeds.to(dtype=dtype)
154
+ prompt_embeds = [u[:v] for u, v in zip(prompt_embeds, seq_lens)]
155
+ prompt_embeds = torch.stack(
156
+ [
157
+ torch.cat([u, u.new_zeros(max_sequence_length - u.size(0), u.size(1))])
158
+ for u in prompt_embeds
159
+ ],
160
+ dim=0,
161
+ )
162
+
163
+ return prompt_embeds
164
+
165
+ @staticmethod
166
+ def encode_prompt(
167
+ components,
168
+ prompt: str,
169
+ device: Optional[torch.device] = None,
170
+ num_videos_per_prompt: int = 1,
171
+ prepare_unconditional_embeds: bool = True,
172
+ negative_prompt: Optional[str] = None,
173
+ prompt_embeds: Optional[torch.Tensor] = None,
174
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
175
+ max_sequence_length: int = 512,
176
+ ):
177
+ r"""
178
+ Encodes the prompt into text encoder hidden states.
179
+
180
+ Args:
181
+ prompt (`str` or `List[str]`, *optional*):
182
+ prompt to be encoded
183
+ device: (`torch.device`):
184
+ torch device
185
+ num_videos_per_prompt (`int`):
186
+ number of videos that should be generated per prompt
187
+ prepare_unconditional_embeds (`bool`):
188
+ whether to use prepare unconditional embeddings or not
189
+ negative_prompt (`str` or `List[str]`, *optional*):
190
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
191
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
192
+ less than `1`).
193
+ prompt_embeds (`torch.Tensor`, *optional*):
194
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
195
+ provided, text embeddings will be generated from `prompt` input argument.
196
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
197
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
198
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
199
+ argument.
200
+ max_sequence_length (`int`, defaults to `512`):
201
+ The maximum number of text tokens to be used for the generation process.
202
+ """
203
+ device = device or components._execution_device
204
+ prompt = [prompt] if isinstance(prompt, str) else prompt
205
+ batch_size = len(prompt) if prompt is not None else prompt_embeds.shape[0]
206
+
207
+ if prompt_embeds is None:
208
+ prompt_embeds = WanRTStreamingTextEncoderStep._get_t5_prompt_embeds(
209
+ components, prompt, max_sequence_length, device
210
+ )
211
+
212
+ if prepare_unconditional_embeds and negative_prompt_embeds is None:
213
+ negative_prompt = negative_prompt or ""
214
+ negative_prompt = (
215
+ batch_size * [negative_prompt]
216
+ if isinstance(negative_prompt, str)
217
+ else negative_prompt
218
+ )
219
+
220
+ if prompt is not None and type(prompt) is not type(negative_prompt):
221
+ raise TypeError(
222
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
223
+ f" {type(prompt)}."
224
+ )
225
+ elif batch_size != len(negative_prompt):
226
+ raise ValueError(
227
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
228
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
229
+ " the batch size of `prompt`."
230
+ )
231
+
232
+ negative_prompt_embeds = (
233
+ WanRTStreamingTextEncoderStep._get_t5_prompt_embeds(
234
+ components, negative_prompt, max_sequence_length, device
235
+ )
236
+ )
237
+
238
+ bs_embed, seq_len, _ = prompt_embeds.shape
239
+ prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
240
+ prompt_embeds = prompt_embeds.view(
241
+ bs_embed * num_videos_per_prompt, seq_len, -1
242
+ )
243
+
244
+ if prepare_unconditional_embeds:
245
+ negative_prompt_embeds = negative_prompt_embeds.repeat(
246
+ 1, num_videos_per_prompt, 1
247
+ )
248
+ negative_prompt_embeds = negative_prompt_embeds.view(
249
+ batch_size * num_videos_per_prompt, seq_len, -1
250
+ )
251
+
252
+ return prompt_embeds, negative_prompt_embeds
253
+
254
+ @torch.no_grad()
255
+ def __call__(
256
+ self, components: WanModularPipeline, state: PipelineState
257
+ ) -> PipelineState:
258
+ # Get inputs and intermediates
259
+ block_state = self.get_block_state(state)
260
+ self.check_inputs(block_state)
261
+
262
+ block_state.prepare_unconditional_embeds = False
263
+ block_state.device = components._execution_device
264
+
265
+ # Encode input prompt
266
+ (
267
+ block_state.prompt_embeds,
268
+ block_state.negative_prompt_embeds,
269
+ ) = WanRTStreamingTextEncoderStep.encode_prompt(
270
+ components,
271
+ block_state.prompt,
272
+ block_state.device,
273
+ 1,
274
+ block_state.prepare_unconditional_embeds,
275
+ block_state.negative_prompt,
276
+ prompt_embeds=block_state.prompt_embeds,
277
+ negative_prompt_embeds=block_state.negative_prompt_embeds,
278
+ )
279
+
280
+ # Add outputs
281
+ self.set_block_state(state, block_state)
282
+ return components, state
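
`_get_t5_prompt_embeds` above trims each encoded prompt to its true token length and then zero-pads it back to `max_sequence_length`, so padding positions carry exact zeros rather than whatever UMT5 emitted for pad tokens. A standalone sketch of just that trim-and-repad step, with random tensors standing in for the encoder output:

```py
import torch

max_sequence_length = 512
# stand-ins for the UMT5 last_hidden_state and the tokenizer attention mask
prompt_embeds = torch.randn(2, max_sequence_length, 4096)
mask = torch.zeros(2, max_sequence_length, dtype=torch.long)
mask[0, :7] = 1
mask[1, :23] = 1

seq_lens = mask.gt(0).sum(dim=1).long()
trimmed = [u[:v] for u, v in zip(prompt_embeds, seq_lens)]
padded = torch.stack(
    [torch.cat([u, u.new_zeros(max_sequence_length - u.size(0), u.size(1))]) for u in trimmed]
)
assert padded.shape == prompt_embeds.shape
assert torch.all(padded[0, 7:] == 0)  # pad positions are exactly zero
```
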
modular_blocks.py ADDED
@@ -0,0 +1,42 @@
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from diffusers.utils import logging
+ from diffusers.modular_pipelines import SequentialPipelineBlocks
+ from diffusers.modular_pipelines.modular_pipeline_utils import InsertableDict
+
+ from .before_denoise import WanRTStreamingBeforeDenoiseStep
+ from .decoders import WanRTDecodeStep
+ from .encoders import WanRTStreamingTextEncoderStep
+ from .denoise import WanRTStreamingDenoiseStep
+
+ logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+ TEXT2VIDEO_BLOCKS = InsertableDict(
+     [
+         ("text_encoder", WanRTStreamingTextEncoderStep),
+         ("before_denoise", WanRTStreamingBeforeDenoiseStep),
+         ("denoise", WanRTStreamingDenoiseStep),
+         ("decode", WanRTDecodeStep),
+     ]
+ )
+
+ ALL_BLOCKS = {
+     "text2video": TEXT2VIDEO_BLOCKS,
+ }
+
+
+ class WanStreamingRTBlocks(SequentialPipelineBlocks):
+     block_classes = list(TEXT2VIDEO_BLOCKS.copy().values())
+     block_names = list(TEXT2VIDEO_BLOCKS.copy().keys())
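
Since `TEXT2VIDEO_BLOCKS` is an `InsertableDict` keyed by step name, a custom sequence can be assembled the same way `WanStreamingRTBlocks` does it. The sketch below assumes `InsertableDict` keeps the standard dict interface (the module itself relies on `.copy()`, `.keys()` and `.values()`), drops the decode step for latent-level debugging, and uses a hypothetical local import path that depends on where these files live on disk:

```py
from diffusers.modular_pipelines import SequentialPipelineBlocks

from modular_blocks import TEXT2VIDEO_BLOCKS  # hypothetical: import path when using these files locally

blocks_without_decode = TEXT2VIDEO_BLOCKS.copy()
blocks_without_decode.pop("decode")  # keep text_encoder -> before_denoise -> denoise


class WanStreamingRTLatentBlocks(SequentialPipelineBlocks):
    block_classes = list(blocks_without_decode.values())
    block_names = list(blocks_without_decode.keys())
```
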
modular_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_class_name": "WanRTBlocks",
+   "_diffusers_version": "0.36.0.dev0",
+   "auto_map": {
+     "ModularPipelineBlocks": "modular_blocks.WanStreamingRTBlocks"
+   }
+ }
modular_model_index.json ADDED
@@ -0,0 +1,79 @@
+ {
+   "_blocks_class_name": "WanStreamingRTBlocks",
+   "_class_name": "WanRTStreamingPipeline",
+   "_diffusers_version": "0.36.0.dev0",
+   "frame_seq_length": 1560,
+   "kv_cache_num_frames": 3,
+   "num_frames_per_block": 3,
+   "scheduler": [
+     null,
+     null,
+     {
+       "repo": "Wan-AI/Wan2.1-T2V-14B-Diffusers",
+       "revision": null,
+       "subfolder": "scheduler",
+       "type_hint": [
+         "diffusers",
+         "UniPCMultistepScheduler"
+       ],
+       "variant": null
+     }
+   ],
+   "seq_length": 32760,
+   "text_encoder": [
+     null,
+     null,
+     {
+       "repo": "Wan-AI/Wan2.1-T2V-14B-Diffusers",
+       "revision": null,
+       "subfolder": "text_encoder",
+       "type_hint": [
+         "transformers",
+         "UMT5EncoderModel"
+       ],
+       "variant": null
+     }
+   ],
+   "tokenizer": [
+     null,
+     null,
+     {
+       "repo": "Wan-AI/Wan2.1-T2V-14B-Diffusers",
+       "revision": null,
+       "subfolder": "tokenizer",
+       "type_hint": [
+         "transformers",
+         "T5TokenizerFast"
+       ],
+       "variant": null
+     }
+   ],
+   "transformer": [
+     null,
+     null,
+     {
+       "repo": "diffusers-internal-dev/krt",
+       "revision": null,
+       "subfolder": "transformer",
+       "type_hint": [
+         "diffusers",
+         "AutoModel"
+       ],
+       "variant": null
+     }
+   ],
+   "vae": [
+     null,
+     null,
+     {
+       "repo": "Wan-AI/Wan2.1-T2V-14B-Diffusers",
+       "revision": null,
+       "subfolder": "vae",
+       "type_hint": [
+         "diffusers",
+         "AutoencoderKLWan"
+       ],
+       "variant": null
+     }
+   ]
+ }
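
Each component entry in `modular_model_index.json` carries a loading spec (`repo`, `subfolder`, `type_hint`) telling the modular pipeline where that component comes from. As an illustration only (not how the modular loader works internally), the scheduler spec above points at the standard Wan 2.1 scheduler and could equally be loaded directly:

```py
from diffusers import UniPCMultistepScheduler

scheduler = UniPCMultistepScheduler.from_pretrained(
    "Wan-AI/Wan2.1-T2V-14B-Diffusers", subfolder="scheduler"
)
```
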
transformer/__init__.py ADDED
(empty file)
transformer/attention.py ADDED
@@ -0,0 +1,326 @@
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+ import torch
+ from typing import Optional
+ import os
+ import warnings
+
+ # Global state for lazy initialization
+ _SAGEATTN_AVAILABLE = None
+ _FLASH_ATTN_3_AVAILABLE = None
+ _FLASH_ATTN_2_AVAILABLE = None
+ _sageattn_func = None
+ _flash_attn_func = None
+ _flash_attn_interface = None
+ _flash_attn = None
+
+
+ def _init_sageattention():
+     """Lazy initialization for SageAttention."""
+     global _SAGEATTN_AVAILABLE, _sageattn_func
+
+     if _SAGEATTN_AVAILABLE is not None:
+         return _SAGEATTN_AVAILABLE
+
+     _SAGEATTN_AVAILABLE = False
+     try:
+         if os.getenv("DISABLE_SAGEATTENTION", "0") != "0":
+             raise Exception("DISABLE_SAGEATTENTION is set")
+
+         from sageattention import sageattn
+
+         @torch.library.custom_op(
+             "mylib::sageattn", mutates_args={"q", "k", "v"}, device_types="cuda"
+         )
+         def sageattn_func(
+             q: torch.Tensor,
+             k: torch.Tensor,
+             v: torch.Tensor,
+             attn_mask: Optional[torch.Tensor] = None,
+             dropout_p: float = 0,
+             is_causal: bool = False,
+         ) -> torch.Tensor:
+             return sageattn(
+                 q, k, v, attn_mask=attn_mask, dropout_p=dropout_p, is_causal=is_causal
+             )
+
+         @sageattn_func.register_fake
+         def _sageattn_fake(q, k, v, attn_mask=None, dropout_p=0, is_causal=False):
+             return torch.empty(*q.shape, device=q.device, dtype=q.dtype)
+
+         print("SageAttention loaded successfully")
+         _sageattn_func = sageattn_func
+         _SAGEATTN_AVAILABLE = True
+
+     except Exception as e:
+         print(f"Warning: Could not load sageattention: {str(e)}")
+         if isinstance(e, ModuleNotFoundError):
+             print("sageattention package is not installed")
+         elif isinstance(e, ImportError) and "DLL" in str(e):
+             print("sageattention DLL loading error")
+         _sageattn_func = None
+
+     return _SAGEATTN_AVAILABLE
+
+
+ def _is_hopper_gpu():
+     """Check if the current GPU is a Hopper architecture."""
+     if not torch.cuda.is_available():
+         return False
+     device_name = torch.cuda.get_device_name(0).lower()
+     return "h100" in device_name or "hopper" in device_name
+
+
+ def _init_flash_attention_3():
+     """Lazy initialization for Flash Attention 3."""
+     global _FLASH_ATTN_3_AVAILABLE, _flash_attn_func, _flash_attn_interface
+
+     if _FLASH_ATTN_3_AVAILABLE is not None:
+         return _FLASH_ATTN_3_AVAILABLE
+
+     _FLASH_ATTN_3_AVAILABLE = False
+     try:
+         from flash_attn import flash_attn_func
+         import flash_attn_interface
+
+         # Always set the function reference if flash_attn is available
+         _flash_attn_func = flash_attn_func
+         _flash_attn_interface = flash_attn_interface
+         # FA3 optimizations only available on Hopper GPUs
+         _FLASH_ATTN_3_AVAILABLE = _is_hopper_gpu()
+     except ModuleNotFoundError:
+         _FLASH_ATTN_3_AVAILABLE = False
+         _flash_attn_func = None
+         _flash_attn_interface = None
+
+     return _FLASH_ATTN_3_AVAILABLE
+
+
+ def _init_flash_attention_2():
+     """Lazy initialization for Flash Attention 2."""
+     global _FLASH_ATTN_2_AVAILABLE, _flash_attn
+
+     if _FLASH_ATTN_2_AVAILABLE is not None:
+         return _FLASH_ATTN_2_AVAILABLE
+
+     _FLASH_ATTN_2_AVAILABLE = False
+     try:
+         import flash_attn
+
+         _flash_attn = flash_attn
+         _FLASH_ATTN_2_AVAILABLE = True
+     except ModuleNotFoundError:
+         _FLASH_ATTN_2_AVAILABLE = False
+
+     return _FLASH_ATTN_2_AVAILABLE
+
+ __all__ = ["flash_attention", "attention"]
+
+
+ # Compatibility getters for external code
+ def sageattn_func():
+     """Getter for sageattn_func - initializes if needed."""
+     _init_sageattention()
+     return _sageattn_func
+
+
+ def SAGEATTN_AVAILABLE():
+     """Getter for SAGEATTN_AVAILABLE - initializes if needed."""
+     return _init_sageattention()
+
+
+ def flash_attention(
+     q,
+     k,
+     v,
+     q_lens=None,
+     k_lens=None,
+     dropout_p=0.0,
+     softmax_scale=None,
+     q_scale=None,
+     causal=False,
+     window_size=(-1, -1),
+     deterministic=False,
+     dtype=torch.bfloat16,
+     version=None,
+ ):
+     """
+     q: [B, Lq, Nq, C1].
+     k: [B, Lk, Nk, C1].
+     v: [B, Lk, Nk, C2]. Nq must be divisible by Nk.
+     q_lens: [B].
+     k_lens: [B].
+     dropout_p: float. Dropout probability.
+     softmax_scale: float. The scaling of QK^T before applying softmax.
+     causal: bool. Whether to apply causal attention mask.
+     window_size: (left right). If not (-1, -1), apply sliding window local attention.
+     deterministic: bool. If True, slightly slower and uses more memory.
+     dtype: torch.dtype. Apply when dtype of q/k/v is not float16/bfloat16.
+     """
+     # Initialize flash attention modules
+     flash_attn_3_available = _init_flash_attention_3()
+     flash_attn_2_available = _init_flash_attention_2()
+
+     # Early fallback for simple cases when advanced features aren't needed
+     # Only use this path if flash_attn is available but we're not using FA3 features
+     if not flash_attn_3_available and _flash_attn_func is not None and q_lens is None and k_lens is None:
+         return _flash_attn_func(
+             q,
+             k,
+             v,
+         )
+
+     half_dtypes = (torch.float16, torch.bfloat16)
+     assert dtype in half_dtypes
+     assert q.device.type == "cuda" and q.size(-1) <= 256
+
+     # params
+     b, lq, lk, out_dtype = q.size(0), q.size(1), k.size(1), q.dtype
+
+     def half(x):
+         return x if x.dtype in half_dtypes else x.to(dtype)
+
+     # preprocess query
+     if q_lens is None:
+         q = half(q.flatten(0, 1))
+         q_lens = torch.tensor([lq] * b, dtype=torch.int32).to(
+             device=q.device, non_blocking=True
+         )
+     else:
+         q = half(torch.cat([u[:v] for u, v in zip(q, q_lens)]))
+
+     # preprocess key, value
+     if k_lens is None:
+         k = half(k.flatten(0, 1))
+         v = half(v.flatten(0, 1))
+         k_lens = torch.tensor([lk] * b, dtype=torch.int32).to(
+             device=k.device, non_blocking=True
+         )
+     else:
+         k = half(torch.cat([u[:v] for u, v in zip(k, k_lens)]))
+         v = half(torch.cat([u[:v] for u, v in zip(v, k_lens)]))
+
+     q = q.to(v.dtype)
+     k = k.to(v.dtype)
+
+     if q_scale is not None:
+         q = q * q_scale
+
+     if version is not None and version == 3 and not flash_attn_3_available:
+         warnings.warn(
+             "Flash attention 3 is not available, use flash attention 2 instead."
+         )
+
+     # apply attention
+     if (version is None or version == 3) and flash_attn_3_available:
+         # Note: dropout_p, window_size are not supported in FA3 now.
+         x = _flash_attn_interface.flash_attn_varlen_func(
+             q=q,
+             k=k,
+             v=v,
+             cu_seqlens_q=torch.cat([q_lens.new_zeros([1]), q_lens])
+             .cumsum(0, dtype=torch.int32)
+             .to(q.device, non_blocking=True),
+             cu_seqlens_k=torch.cat([k_lens.new_zeros([1]), k_lens])
+             .cumsum(0, dtype=torch.int32)
+             .to(q.device, non_blocking=True),
+             max_seqlen_q=lq,
+             max_seqlen_k=lk,
+             softmax_scale=softmax_scale,
+             causal=causal,
+             deterministic=deterministic,
+         ).unflatten(0, (b, lq))
+     else:
+         assert flash_attn_2_available
+         x = _flash_attn.flash_attn_varlen_func(
+             q=q,
+             k=k,
+             v=v,
+             cu_seqlens_q=torch.cat([q_lens.new_zeros([1]), q_lens])
+             .cumsum(0, dtype=torch.int32)
+             .to(q.device, non_blocking=True),
+             cu_seqlens_k=torch.cat([k_lens.new_zeros([1]), k_lens])
+             .cumsum(0, dtype=torch.int32)
+             .to(q.device, non_blocking=True),
+             max_seqlen_q=lq,
+             max_seqlen_k=lk,
+             dropout_p=dropout_p,
+             softmax_scale=softmax_scale,
+             causal=causal,
+             window_size=window_size,
+             deterministic=deterministic,
+         ).unflatten(0, (b, lq))
+
+     # output
+     return x.type(out_dtype)
+
+
+ def attention(
+     q: torch.Tensor,
+     k: torch.Tensor,
+     v: torch.Tensor,
+     q_lens=None,
+     k_lens=None,
+     dropout_p=0.0,
+     softmax_scale=None,
+     q_scale=None,
+     causal=False,
+     window_size=(-1, -1),
+     deterministic=False,
+     dtype=torch.bfloat16,
+     fa_version=None,
+     # og_dtype=torch.bfloat16,
+ ):
+     # Initialize attention modules
+     sageattn_available = _init_sageattention()
+     flash_attn_2_available = _init_flash_attention_2()
+     flash_attn_3_available = _init_flash_attention_3()
+
+     if sageattn_available:
+         # print("Using sageattention")
+         attn_mask = None
+
+         og_dtype = q.dtype
+         q = q.transpose(1, 2).to(dtype)
+         k = k.transpose(1, 2).to(dtype)
+         v = v.transpose(1, 2).to(dtype)
+
+         out = _sageattn_func(
+             q, k, v, attn_mask=attn_mask, is_causal=causal, dropout_p=dropout_p
+         )
+
+         out = out.transpose(1, 2).contiguous().to(og_dtype)
+         return out
+
+     elif flash_attn_2_available or flash_attn_3_available:
+         return flash_attention(
+             q=q,
+             k=k,
+             v=v,
+             q_lens=q_lens,
+             k_lens=k_lens,
+             dropout_p=dropout_p,
+             softmax_scale=softmax_scale,
+             q_scale=q_scale,
+             causal=causal,
+             window_size=window_size,
+             deterministic=deterministic,
+             dtype=dtype,
+             version=fa_version,
+         )
+     else:
+         if q_lens is not None or k_lens is not None:
+             warnings.warn(
+                 "Padding mask is disabled when using scaled_dot_product_attention. It can have a significant impact on performance."
+             )
+         attn_mask = None
+
+         q = q.transpose(1, 2).to(dtype)
+         k = k.transpose(1, 2).to(dtype)
+         v = v.transpose(1, 2).to(dtype)
+
+         out = torch.nn.functional.scaled_dot_product_attention(
+             q, k, v, attn_mask=attn_mask, is_causal=causal, dropout_p=dropout_p
+         )
+
+         out = out.transpose(1, 2).contiguous()
+         return out
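
`attention` is the single entry point the transformer blocks call; it dispatches to SageAttention, FlashAttention 2/3, or PyTorch SDPA depending on what is installed. A minimal usage sketch (head count and head dimension are illustrative, and the import path assumes these files are used as the `transformer` package added in this repo):

```py
import torch
from transformer.attention import attention  # package path as added in this repo

# [batch, seq_len, num_heads, head_dim], matching the flash_attention docstring above
q = torch.randn(1, 1560, 12, 128)
k = torch.randn(1, 1560, 12, 128)
v = torch.randn(1, 1560, 12, 128)

# Falls back to torch.nn.functional.scaled_dot_product_attention when neither
# sageattention nor flash-attn is installed; that fallback path also runs on CPU.
out = attention(q, k, v, causal=False)
print(out.shape)  # torch.Size([1, 1560, 12, 128])
```
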
transformer/causal_model.py ADDED
@@ -0,0 +1,1402 @@
1
+ import functools
2
+ import math
3
+
4
+ from .attention import attention
5
+ from .model import (
6
+ WanRMSNorm,
7
+ rope_apply,
8
+ WanLayerNorm,
9
+ WAN_CROSSATTENTION_CLASSES,
10
+ rope_params,
11
+ MLPProj,
12
+ sinusoidal_embedding_1d,
13
+ )
14
+ import torch
15
+ import torch.nn as nn
16
+ from torch.nn.attention.flex_attention import create_block_mask, flex_attention
17
+ from torch.nn.attention.flex_attention import BlockMask
18
+
19
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
20
+ from diffusers.models.modeling_utils import ModelMixin
21
+
22
+ flex_attention = torch.compile(
23
+ flex_attention, dynamic=False, mode="max-autotune-no-cudagraphs"
24
+ )
25
+
26
+
27
+ def rope_params_riflex(max_seq_len, dim, theta=10000, k=0, L_test=None):
28
+ assert dim % 2 == 0
29
+ omega = 1.0 / torch.pow(theta, torch.arange(0, dim, 2).to(torch.float64).div(dim))
30
+ if k is not None:
31
+ print("Doing riflex w/ ltest", L_test)
32
+ omega[k - 1] = 0.9 * 2 * torch.pi / L_test
33
+ freqs = torch.outer(torch.arange(max_seq_len), omega)
34
+
35
+ freqs = torch.polar(torch.ones_like(freqs), freqs)
36
+ return freqs
37
+
38
+
39
+ @functools.lru_cache(maxsize=32)
40
+ def get_sdpa_mask(
41
+ device: str,
42
+ num_frames: int = 21,
43
+ frame_seqlen: int = 1560,
44
+ num_frame_per_block: int = 1,
45
+ local_attn_size: int = -1,
46
+ dtype: torch.dtype = torch.bool,
47
+ ):
48
+ """
49
+ Create an attention mask tensor for torch.nn.functional.scaled_dot_product_attention
50
+
51
+ Args:
52
+ device: Device to create the mask on
53
+ num_frames: Number of frames
54
+ frame_seqlen: Sequence length per frame
55
+ num_frame_per_block: Number of frames per block
56
+ local_attn_size: Local attention window size (-1 for global)
57
+ dtype: Data type for the mask (torch.bool for masking, torch.float for additive)
58
+
59
+ Returns:
60
+ torch.Tensor: Attention mask of shape (seq_len, seq_len)
61
+ - True/1.0 for allowed attention
62
+ - False/-inf for masked attention
63
+ """
64
+ print("Generating SDPA attention mask")
65
+ total_length = num_frames * frame_seqlen
66
+
67
+ # Right padding to get to a multiple of 128
68
+ padded_length = math.ceil(total_length / 128) * 128 - total_length
69
+ full_length = total_length + padded_length
70
+
71
+ # Create the ends array (same logic as original)
72
+ ends = torch.zeros(full_length, device=device, dtype=torch.long)
73
+
74
+ frame_indices = torch.arange(
75
+ start=0,
76
+ end=total_length,
77
+ step=frame_seqlen * num_frame_per_block,
78
+ device=device,
79
+ )
80
+
81
+ for tmp in frame_indices:
82
+ end_idx = min(tmp + frame_seqlen * num_frame_per_block, full_length)
83
+ ends[tmp:end_idx] = end_idx
84
+
85
+ # Create q_idx and kv_idx coordinate matrices
86
+ q_indices = torch.arange(full_length, device=device).unsqueeze(
87
+ 1
88
+ ) # Shape: (seq_len, 1)
89
+ kv_indices = torch.arange(full_length, device=device).unsqueeze(
90
+ 0
91
+ ) # Shape: (1, seq_len)
92
+
93
+ # Apply the attention logic
94
+ if local_attn_size == -1:
95
+ # Global attention within blocks + diagonal
96
+ mask = (kv_indices < ends[q_indices]) | (q_indices == kv_indices)
97
+ else:
98
+ # Local attention within blocks + diagonal
99
+ local_window_start = ends[q_indices] - local_attn_size * frame_seqlen
100
+ mask = ((kv_indices < ends[q_indices]) & (kv_indices >= local_window_start)) | (
101
+ q_indices == kv_indices
102
+ )
103
+
104
+ if dtype == torch.bool:
105
+ return mask
106
+ elif dtype == torch.float32 or dtype == torch.float16:
107
+ # Convert to additive mask (0.0 for attend, -inf for mask)
108
+ return mask.float() * 0.0 + (~mask).float() * float("-inf")
109
+ else:
110
+ raise ValueError(f"Unsupported dtype: {dtype}")
111
+
112
+
113
+ @functools.lru_cache(maxsize=32)
114
+ def get_block_mask(
115
+ device: str,
116
+ num_frames: int = 21,
117
+ frame_seqlen: int = 1560,
118
+ num_frame_per_block=3,
119
+ local_attn_size=-1,
120
+ ):
121
+ print("Generating block mask")
122
+ total_length = num_frames * frame_seqlen
123
+
124
+ # we do right padding to get to a multiple of 128
125
+ padded_length = math.ceil(total_length / 128) * 128 - total_length
126
+
127
+ ends = torch.zeros(total_length + padded_length, device=device, dtype=torch.long)
128
+
129
+ # Block-wise causal mask will attend to all elements that are before the end of the current chunk
130
+ frame_indices = torch.arange(
131
+ start=0,
132
+ end=total_length,
133
+ step=frame_seqlen * num_frame_per_block,
134
+ device=device,
135
+ )
136
+
137
+ for tmp in frame_indices:
138
+ ends[tmp : tmp + frame_seqlen * num_frame_per_block] = (
139
+ tmp + frame_seqlen * num_frame_per_block
140
+ )
141
+
142
+ def attention_mask(b, h, q_idx, kv_idx):
143
+ if local_attn_size == -1:
144
+ return (kv_idx < ends[q_idx]) | (q_idx == kv_idx)
145
+ else:
146
+ return (
147
+ (kv_idx < ends[q_idx])
148
+ & (kv_idx >= (ends[q_idx] - local_attn_size * frame_seqlen))
149
+ ) | (q_idx == kv_idx)
150
+
151
+ block_mask = create_block_mask(
152
+ attention_mask,
153
+ B=None,
154
+ H=None,
155
+ Q_LEN=total_length + padded_length,
156
+ KV_LEN=total_length + padded_length,
157
+ _compile=False,
158
+ device=device,
159
+ )
160
+ return block_mask
161
+
162
+
163
+ def causal_rope_apply(x, grid_sizes, freqs, start_frame=0):
164
+ n, c = x.size(2), x.size(3) // 2
165
+
166
+ # split freqs
167
+ freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)
168
+
169
+ # loop over samples
170
+ output = []
171
+
172
+ for i, (f, h, w) in enumerate(grid_sizes.tolist()):
173
+ seq_len = f * h * w
174
+
175
+ # precompute multipliers
176
+ x_i = torch.view_as_complex(
177
+ x[i, :seq_len].to(torch.float64).reshape(seq_len, n, -1, 2)
178
+ )
179
+ freqs_i = torch.cat(
180
+ [
181
+ freqs[0][start_frame : start_frame + f]
182
+ .view(f, 1, 1, -1)
183
+ .expand(f, h, w, -1),
184
+ freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
185
+ freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1),
186
+ ],
187
+ dim=-1,
188
+ ).reshape(seq_len, 1, -1)
189
+
190
+ # apply rotary embedding
191
+ x_i = torch.view_as_real(x_i * freqs_i).flatten(2)
192
+ x_i = torch.cat([x_i, x[i, seq_len:]])
193
+
194
+ # append to collection
195
+ output.append(x_i)
196
+ return torch.stack(output).type_as(x)
197
+
198
+
199
+ class CausalWanSelfAttention(nn.Module):
200
+ def __init__(
201
+ self, dim, num_heads, local_attn_size=-1, sink_size=0, qk_norm=True, eps=1e-6
202
+ ):
203
+ assert dim % num_heads == 0
204
+ super().__init__()
205
+ self.dim = dim
206
+ self.num_heads = num_heads
207
+ self.head_dim = dim // num_heads
208
+ self.local_attn_size = local_attn_size
209
+ self.sink_size = sink_size
210
+ self.qk_norm = qk_norm
211
+ self.eps = eps
212
+ self.max_attention_size = (
213
+ 32760 if local_attn_size == -1 else local_attn_size * 1560
214
+ )
215
+ self.fused_projections = False
216
+
217
+ # layers
218
+ self.q = nn.Linear(dim, dim)
219
+ self.k = nn.Linear(dim, dim)
220
+ self.v = nn.Linear(dim, dim)
221
+ self.o = nn.Linear(dim, dim)
222
+ self.norm_q = WanRMSNorm(dim, eps=eps) if qk_norm else nn.Identity()
223
+ self.norm_k = WanRMSNorm(dim, eps=eps) if qk_norm else nn.Identity()
224
+
225
+ @torch.no_grad()
226
+ def fuse_projections(self):
227
+ # if not self.is_cross_attention:
228
+ if self.fused_projections:
229
+ return
230
+ concatenated_weights = torch.cat(
231
+ [self.q.weight.data, self.k.weight.data, self.v.weight.data]
232
+ )
233
+ concatenated_bias = torch.cat(
234
+ [self.q.bias.data, self.k.bias.data, self.v.bias.data]
235
+ )
236
+ out_features, in_features = concatenated_weights.shape
237
+ with torch.device("meta"):
238
+ self.to_qkv = torch.nn.Linear(in_features, out_features, bias=True)
239
+ self.to_qkv.load_state_dict(
240
+ {"weight": concatenated_weights, "bias": concatenated_bias},
241
+ strict=True,
242
+ assign=True,
243
+ )
244
+ self.fused_projections = True
245
+
246
+ def forward(
247
+ self,
248
+ x,
249
+ seq_lens,
250
+ grid_sizes,
251
+ freqs,
252
+ block_mask,
253
+ kv_cache=None,
254
+ current_start=0,
255
+ cache_start=None,
256
+ ):
257
+ r"""
258
+ Args:
259
+ x(Tensor): Shape [B, L, num_heads, C / num_heads]
260
+ seq_lens(Tensor): Shape [B]
261
+ grid_sizes(Tensor): Shape [B, 3], the second dimension contains (F, H, W)
262
+ freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2]
263
+ block_mask (BlockMask)
264
+ """
265
+ b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim
266
+ if cache_start is None:
267
+ cache_start = current_start
268
+
269
+ # query, key, value function
270
+ # @torch.compile(dynamic=True, mode="max-autotune-no-cudagraphs")
271
+ def qkv_fn(x):
272
+ if self.fused_projections:
273
+ # print("Using fused projections")
274
+ q, k, v = self.to_qkv(x).chunk(3, dim=-1)
275
+ q = self.norm_q(q).view(b, s, n, d)
276
+ k = self.norm_k(k).view(b, s, n, d)
277
+ v = v.view(b, s, n, d)
278
+ else:
279
+ q = self.norm_q(self.q(x)).view(b, s, n, d)
280
+ k = self.norm_k(self.k(x)).view(b, s, n, d)
281
+ v = self.v(x).view(b, s, n, d)
282
+ return q, k, v
283
+
284
+ q, k, v = qkv_fn(x)
285
+
286
+ if kv_cache is None or block_mask is not None:
287
+ # if it is teacher forcing training?
288
+ # is_tf = (s == seq_lens[0].item() * 2)
289
+ is_tf = False
290
+ if is_tf:
291
+ print("Teacher forcing training")
292
+ q_chunk = torch.chunk(q, 2, dim=1)
293
+ k_chunk = torch.chunk(k, 2, dim=1)
294
+ roped_query = []
295
+ roped_key = []
296
+ # rope should be same for clean and noisy parts
297
+ for ii in range(2):
298
+ rq = rope_apply(q_chunk[ii], grid_sizes, freqs).type_as(v)
299
+ rk = rope_apply(k_chunk[ii], grid_sizes, freqs).type_as(v)
300
+ roped_query.append(rq)
301
+ roped_key.append(rk)
302
+
303
+ roped_query = torch.cat(roped_query, dim=1)
304
+ roped_key = torch.cat(roped_key, dim=1)
305
+
306
+ padded_length = math.ceil(q.shape[1] / 128) * 128 - q.shape[1]
307
+ padded_roped_query = torch.cat(
308
+ [
309
+ roped_query,
310
+ torch.zeros(
311
+ [q.shape[0], padded_length, q.shape[2], q.shape[3]],
312
+ device=q.device,
313
+ dtype=v.dtype,
314
+ ),
315
+ ],
316
+ dim=1,
317
+ )
318
+
319
+ padded_roped_key = torch.cat(
320
+ [
321
+ roped_key,
322
+ torch.zeros(
323
+ [k.shape[0], padded_length, k.shape[2], k.shape[3]],
324
+ device=k.device,
325
+ dtype=v.dtype,
326
+ ),
327
+ ],
328
+ dim=1,
329
+ )
330
+
331
+ padded_v = torch.cat(
332
+ [
333
+ v,
334
+ torch.zeros(
335
+ [v.shape[0], padded_length, v.shape[2], v.shape[3]],
336
+ device=v.device,
337
+ dtype=v.dtype,
338
+ ),
339
+ ],
340
+ dim=1,
341
+ )
342
+
343
+ x = flex_attention(
344
+ query=padded_roped_query.transpose(2, 1),
345
+ key=padded_roped_key.transpose(2, 1),
346
+ value=padded_v.transpose(2, 1),
347
+ block_mask=block_mask,
348
+ )[:, :, :-padded_length].transpose(2, 1)
349
+
350
+ else:
351
+ roped_query = rope_apply(q, grid_sizes, freqs).type_as(v)
352
+ roped_key = rope_apply(k, grid_sizes, freqs).type_as(v)
353
+ local_end_index = roped_key.shape[1]
354
+ kv_cache["k"][:, :local_end_index] = roped_key
355
+ kv_cache["v"][:, :local_end_index] = v
356
+
357
+ kv_cache["global_end_index"] = local_end_index
358
+ kv_cache["local_end_index"] = local_end_index
359
+
360
+ padded_length = math.ceil(q.shape[1] / 128) * 128 - q.shape[1]
361
+ padded_roped_query = torch.cat(
362
+ [
363
+ roped_query,
364
+ torch.zeros(
365
+ [q.shape[0], padded_length, q.shape[2], q.shape[3]],
366
+ device=q.device,
367
+ dtype=v.dtype,
368
+ ),
369
+ ],
370
+ dim=1,
371
+ )
372
+
373
+ padded_roped_key = torch.cat(
374
+ [
375
+ roped_key,
376
+ torch.zeros(
377
+ [k.shape[0], padded_length, k.shape[2], k.shape[3]],
378
+ device=k.device,
379
+ dtype=v.dtype,
380
+ ),
381
+ ],
382
+ dim=1,
383
+ )
384
+ # print("shape of padded_roped_query", padded_roped_query.shape)
385
+ # print("shape of padded_roped_key", padded_roped_key.shape)
386
+
387
+ padded_v = torch.cat(
388
+ [
389
+ v,
390
+ torch.zeros(
391
+ [v.shape[0], padded_length, v.shape[2], v.shape[3]],
392
+ device=v.device,
393
+ dtype=v.dtype,
394
+ ),
395
+ ],
396
+ dim=1,
397
+ )
398
+
399
+ x = flex_attention(
400
+ query=padded_roped_query.transpose(2, 1).contiguous(),
401
+ key=padded_roped_key.transpose(2, 1).contiguous(),
402
+ value=padded_v.transpose(2, 1).contiguous(),
403
+ block_mask=block_mask,
404
+ kernel_options={
405
+ "BLOCKS_ARE_CONTIGUOUS": True,
406
+ },
407
+ )[:, :, :-padded_length].transpose(2, 1)
408
+ else:
409
+ # frame_seqlen = math.prod(grid_sizes[0][1:]).item() # torch compile doesn't like this
410
+ frame_seqlen = 1560
411
+ current_start_frame = current_start // frame_seqlen
412
+ roped_query = causal_rope_apply(
413
+ q, grid_sizes, freqs, start_frame=current_start_frame
414
+ ).type_as(v)
415
+ roped_key = causal_rope_apply(
416
+ k, grid_sizes, freqs, start_frame=current_start_frame
417
+ ).type_as(v)
418
+
419
+ current_end = current_start + roped_query.shape[1]
420
+ sink_tokens = self.sink_size * frame_seqlen
421
+ # If we are using local attention and the current KV cache size is larger than the local attention size, we need to truncate the KV cache
422
+ kv_cache_size = kv_cache["k"].shape[1]
423
+ num_new_tokens = roped_query.shape[1]
424
+ if (
425
+ self.local_attn_size != -1
426
+ and (current_end > kv_cache["global_end_index"])
427
+ and (num_new_tokens + kv_cache["local_end_index"] > kv_cache_size)
428
+ ):
429
+ # Calculate the number of new tokens added in this step
430
+ # Shift existing cache content left to discard oldest tokens
431
+ # Clone the source slice to avoid overlapping memory error
432
+ num_evicted_tokens = (
433
+ num_new_tokens + kv_cache["local_end_index"] - kv_cache_size
434
+ )
435
+ num_rolled_tokens = (
436
+ kv_cache["local_end_index"] - num_evicted_tokens - sink_tokens
437
+ )
438
+ kv_cache["k"][:, sink_tokens : sink_tokens + num_rolled_tokens] = (
439
+ kv_cache["k"][
440
+ :,
441
+ sink_tokens + num_evicted_tokens : sink_tokens
442
+ + num_evicted_tokens
443
+ + num_rolled_tokens,
444
+ ].clone()
445
+ )
446
+ kv_cache["v"][:, sink_tokens : sink_tokens + num_rolled_tokens] = (
447
+ kv_cache["v"][
448
+ :,
449
+ sink_tokens + num_evicted_tokens : sink_tokens
450
+ + num_evicted_tokens
451
+ + num_rolled_tokens,
452
+ ].clone()
453
+ )
454
+ # Insert the new keys/values at the end
455
+ local_end_index = (
456
+ kv_cache["local_end_index"]
457
+ + current_end
458
+ - kv_cache["global_end_index"]
459
+ - num_evicted_tokens
460
+ )
461
+ local_start_index = local_end_index - num_new_tokens
462
+ kv_cache["k"][:, local_start_index:local_end_index] = roped_key
463
+ kv_cache["v"][:, local_start_index:local_end_index] = v
464
+ else:
465
+ # Assign new keys/values directly up to current_end
466
+ local_end_index = (
467
+ kv_cache["local_end_index"]
468
+ + current_end
469
+ - kv_cache["global_end_index"]
470
+ )
471
+ local_start_index = local_end_index - num_new_tokens
472
+ kv_cache["k"][:, local_start_index:local_end_index] = roped_key
473
+ kv_cache["v"][:, local_start_index:local_end_index] = v
474
+
475
+ x = attention(
476
+ roped_query,
477
+ kv_cache["k"][
478
+ :,
479
+ max(0, local_end_index - self.max_attention_size) : local_end_index,
480
+ ],
481
+ kv_cache["v"][
482
+ :,
483
+ max(0, local_end_index - self.max_attention_size) : local_end_index,
484
+ ],
485
+ )
486
+ kv_cache["global_end_index"] = current_end
487
+ kv_cache["local_end_index"] = local_end_index
488
+
489
+ # output
490
+ x = x.flatten(2)
491
+ x = self.o(x)
492
+ return x
493
+
494
+
495
+ class CausalWanAttentionBlock(nn.Module):
496
+ def __init__(
497
+ self,
498
+ cross_attn_type,
499
+ dim,
500
+ ffn_dim,
501
+ num_heads,
502
+ local_attn_size=-1,
503
+ sink_size=0,
504
+ qk_norm=True,
505
+ cross_attn_norm=False,
506
+ eps=1e-6,
507
+ ):
508
+ super().__init__()
509
+ self.dim = dim
510
+ self.ffn_dim = ffn_dim
511
+ self.num_heads = num_heads
512
+ self.local_attn_size = local_attn_size
513
+ self.qk_norm = qk_norm
514
+ self.cross_attn_norm = cross_attn_norm
515
+ self.eps = eps
516
+
517
+ # layers
518
+ self.norm1 = WanLayerNorm(dim, eps)
519
+ self.self_attn = CausalWanSelfAttention(
520
+ dim, num_heads, local_attn_size, sink_size, qk_norm, eps
521
+ )
522
+ self.norm3 = (
523
+ WanLayerNorm(dim, eps, elementwise_affine=True)
524
+ if cross_attn_norm
525
+ else nn.Identity()
526
+ )
527
+ self.cross_attn = WAN_CROSSATTENTION_CLASSES[cross_attn_type](
528
+ dim, num_heads, (-1, -1), qk_norm, eps
529
+ )
530
+ self.norm2 = WanLayerNorm(dim, eps)
531
+ self.ffn = nn.Sequential(
532
+ nn.Linear(dim, ffn_dim),
533
+ nn.GELU(approximate="tanh"),
534
+ nn.Linear(ffn_dim, dim),
535
+ )
536
+
537
+ # modulation
538
+ self.modulation = nn.Parameter(torch.randn(1, 6, dim) / dim**0.5)
539
+
540
+ def forward(
541
+ self,
542
+ x,
543
+ e,
544
+ seq_lens,
545
+ grid_sizes,
546
+ freqs,
547
+ context,
548
+ context_lens,
549
+ block_mask,
550
+ kv_cache=None,
551
+ crossattn_cache=None,
552
+ current_start=0,
553
+ cache_start=None,
554
+ ):
555
+ r"""
556
+ Args:
557
+ x(Tensor): Shape [B, L, C]
558
+ e(Tensor): Shape [B, F, 6, C]
559
+ seq_lens(Tensor): Shape [B], length of each sequence in batch
560
+ grid_sizes(Tensor): Shape [B, 3], the second dimension contains (F, H, W)
561
+ freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2]
562
+ """
563
+ num_frames, frame_seqlen = e.shape[1], x.shape[1] // e.shape[1]
564
+ # assert e.dtype == torch.float32
565
+ # with amp.autocast(dtype=torch.float32):
566
+ e = (self.modulation.unsqueeze(1) + e).chunk(6, dim=2)
567
+ # assert e[0].dtype == torch.float32
568
+
569
+ # self-attention
570
+ y = self.self_attn(
571
+ (
572
+ self.norm1(x).unflatten(dim=1, sizes=(num_frames, frame_seqlen))
573
+ * (1 + e[1])
574
+ + e[0]
575
+ ).flatten(1, 2),
576
+ seq_lens,
577
+ grid_sizes,
578
+ freqs,
579
+ block_mask,
580
+ kv_cache,
581
+ current_start,
582
+ cache_start,
583
+ )
584
+
585
+ # with amp.autocast(dtype=torch.float32):
586
+ x = x + (y.unflatten(dim=1, sizes=(num_frames, frame_seqlen)) * e[2]).flatten(
587
+ 1, 2
588
+ )
589
+
590
+ # cross-attention & ffn function
591
+ def cross_attn_ffn(x, context, context_lens, e, crossattn_cache=None):
592
+ x = x + self.cross_attn(
593
+ self.norm3(x), context, context_lens, crossattn_cache=crossattn_cache
594
+ )
595
+ y = self.ffn(
596
+ (
597
+ self.norm2(x).unflatten(dim=1, sizes=(num_frames, frame_seqlen))
598
+ * (1 + e[4])
599
+ + e[3]
600
+ ).flatten(1, 2)
601
+ )
602
+ # with amp.autocast(dtype=torch.float32):
603
+ x = x + (
604
+ y.unflatten(dim=1, sizes=(num_frames, frame_seqlen)) * e[5]
605
+ ).flatten(1, 2)
606
+ return x
607
+
608
+ x = cross_attn_ffn(x, context, context_lens, e, crossattn_cache)
609
+ return x
610
+
611
+
612
+ class CausalHead(nn.Module):
613
+ def __init__(self, dim, out_dim, patch_size, eps=1e-6):
614
+ super().__init__()
615
+ self.dim = dim
616
+ self.out_dim = out_dim
617
+ self.patch_size = patch_size
618
+ self.eps = eps
619
+
620
+ # layers
621
+ out_dim = math.prod(patch_size) * out_dim
622
+ self.norm = WanLayerNorm(dim, eps)
623
+ self.head = nn.Linear(dim, out_dim)
624
+
625
+ # modulation
626
+ self.modulation = nn.Parameter(torch.randn(1, 2, dim) / dim**0.5)
627
+
628
+ def forward(self, x, e):
629
+ r"""
630
+ Args:
631
+ x(Tensor): Shape [B, L1, C]
632
+ e(Tensor): Shape [B, F, 1, C]
633
+ """
634
+ # assert e.dtype == torch.float32
635
+ # with amp.autocast(dtype=torch.float32):
636
+ num_frames, frame_seqlen = e.shape[1], x.shape[1] // e.shape[1]
637
+ e = (self.modulation.unsqueeze(1) + e).chunk(2, dim=2)
638
+ x = self.head(
639
+ self.norm(x).unflatten(dim=1, sizes=(num_frames, frame_seqlen)) * (1 + e[1])
640
+ + e[0]
641
+ )
642
+ return x
643
+
644
+
645
+ class CausalWanModel(ModelMixin, ConfigMixin):
646
+ r"""
647
+ Wan diffusion backbone supporting both text-to-video and image-to-video.
648
+ """
649
+
650
+ ignore_for_config = ["patch_size", "cross_attn_norm", "qk_norm", "text_dim"]
651
+ _no_split_modules = ["WanAttentionBlock"]
652
+ _supports_gradient_checkpointing = True
653
+
654
+ @register_to_config
655
+ def __init__(
656
+ self,
657
+ model_type="t2v",
658
+ patch_size=(1, 2, 2),
659
+ text_len=512,
660
+ in_dim=16,
661
+ dim=2048,
662
+ ffn_dim=8192,
663
+ freq_dim=256,
664
+ text_dim=4096,
665
+ out_dim=16,
666
+ num_heads=16,
667
+ num_layers=32,
668
+ local_attn_size=-1,
669
+ sink_size=0,
670
+ qk_norm=True,
671
+ cross_attn_norm=True,
672
+ eps=1e-6,
673
+ ):
674
+ r"""
675
+ Initialize the diffusion model backbone.
676
+
677
+ Args:
678
+ model_type (`str`, *optional*, defaults to 't2v'):
679
+ Model variant - 't2v' (text-to-video) or 'i2v' (image-to-video)
680
+ patch_size (`tuple`, *optional*, defaults to (1, 2, 2)):
681
+ 3D patch dimensions for video embedding (t_patch, h_patch, w_patch)
682
+ text_len (`int`, *optional*, defaults to 512):
683
+ Fixed length for text embeddings
684
+ in_dim (`int`, *optional*, defaults to 16):
685
+ Input video channels (C_in)
686
+ dim (`int`, *optional*, defaults to 2048):
687
+ Hidden dimension of the transformer
688
+ ffn_dim (`int`, *optional*, defaults to 8192):
689
+ Intermediate dimension in feed-forward network
690
+ freq_dim (`int`, *optional*, defaults to 256):
691
+ Dimension for sinusoidal time embeddings
692
+ text_dim (`int`, *optional*, defaults to 4096):
693
+ Input dimension for text embeddings
694
+ out_dim (`int`, *optional*, defaults to 16):
695
+ Output video channels (C_out)
696
+ num_heads (`int`, *optional*, defaults to 16):
697
+ Number of attention heads
698
+ num_layers (`int`, *optional*, defaults to 32):
699
+ Number of transformer blocks
700
+ local_attn_size (`int`, *optional*, defaults to -1):
701
+ Window size for temporal local attention (-1 indicates global attention)
702
+ sink_size (`int`, *optional*, defaults to 0):
703
+ Size of the attention sink, we keep the first `sink_size` frames unchanged when rolling the KV cache
704
+ qk_norm (`bool`, *optional*, defaults to True):
705
+ Enable query/key normalization
706
+ cross_attn_norm (`bool`, *optional*, defaults to False):
707
+ Enable cross-attention normalization
708
+ eps (`float`, *optional*, defaults to 1e-6):
709
+ Epsilon value for normalization layers
710
+ """
711
+
712
+ super().__init__()
713
+
714
+ assert model_type in ["t2v", "i2v"]
715
+ self.model_type = model_type
716
+
717
+ self.patch_size = patch_size
718
+ self.text_len = text_len
719
+ self.in_dim = in_dim
720
+ self.dim = dim
721
+ self.ffn_dim = ffn_dim
722
+ self.freq_dim = freq_dim
723
+ self.text_dim = text_dim
724
+ self.out_dim = out_dim
725
+ self.num_heads = num_heads
726
+ self.num_layers = num_layers
727
+ self.local_attn_size = local_attn_size
728
+ self.qk_norm = qk_norm
729
+ self.cross_attn_norm = cross_attn_norm
730
+ self.eps = eps
731
+
732
+ # embeddings
733
+ self.patch_embedding = nn.Conv3d(
734
+ in_dim, dim, kernel_size=patch_size, stride=patch_size
735
+ )
736
+ self.text_embedding = nn.Sequential(
737
+ nn.Linear(text_dim, dim), nn.GELU(approximate="tanh"), nn.Linear(dim, dim)
738
+ )
739
+
740
+ self.time_embedding = nn.Sequential(
741
+ nn.Linear(freq_dim, dim), nn.SiLU(), nn.Linear(dim, dim)
742
+ )
743
+ self.time_projection = nn.Sequential(nn.SiLU(), nn.Linear(dim, dim * 6))
744
+
745
+ # blocks
746
+ cross_attn_type = "t2v_cross_attn" if model_type == "t2v" else "i2v_cross_attn"
747
+ self.blocks = nn.ModuleList(
748
+ [
749
+ CausalWanAttentionBlock(
750
+ cross_attn_type,
751
+ dim,
752
+ ffn_dim,
753
+ num_heads,
754
+ local_attn_size,
755
+ sink_size,
756
+ qk_norm,
757
+ cross_attn_norm,
758
+ eps,
759
+ )
760
+ for _ in range(num_layers)
761
+ ]
762
+ )
763
+
764
+ # head
765
+ self.head = CausalHead(dim, out_dim, patch_size, eps)
766
+
767
+ # buffers (don't use register_buffer otherwise dtype will be changed in to())
768
+ assert (dim % num_heads) == 0 and (dim // num_heads) % 2 == 0
769
+ d = dim // num_heads
770
+ self.freqs = torch.cat(
771
+ [
772
+ rope_params(1024, d - 4 * (d // 6)),
773
+ # rope_params_riflex(1024, d - 4 * (d // 6), ),
774
+ rope_params(1024, 2 * (d // 6)),
775
+ rope_params(1024, 2 * (d // 6)),
776
+ ],
777
+ dim=1,
778
+ )
779
+
780
+ if model_type == "i2v":
781
+ self.img_emb = MLPProj(1280, dim)
782
+
783
+ # initialize weights
784
+ self.init_weights()
785
+
786
+ self.gradient_checkpointing = False
787
+
788
+ self.block_mask = None
789
+
790
+ self.num_frame_per_block = 1
791
+ self.independent_first_frame = False
792
+
793
+ def _set_gradient_checkpointing(self, module, value=False):
794
+ self.gradient_checkpointing = value
795
+
796
+ @staticmethod
797
+ def _prepare_blockwise_causal_attn_mask(
798
+ device,
799
+ num_frames: int = 21,
800
+ frame_seqlen: int = 1560,
801
+ num_frame_per_block=1,
802
+ local_attn_size=-1,
803
+ ) -> BlockMask:
804
+ """
805
+ we will divide the token sequence into the following format
806
+ [1 latent frame] [1 latent frame] ... [1 latent frame]
807
+ We use flexattention to construct the attention mask
808
+ """
809
+ block_mask = get_block_mask(
810
+ str(device), num_frames, frame_seqlen, num_frame_per_block, local_attn_size
811
+ )
812
+ return block_mask
813
+
814
+ @staticmethod
815
+ def _prepare_teacher_forcing_mask(
816
+ device: torch.device | str,
817
+ num_frames: int = 21,
818
+ frame_seqlen: int = 1560,
819
+ num_frame_per_block=1,
820
+ ) -> BlockMask:
821
+ """
822
+ we will divide the token sequence into the following format
823
+ [1 latent frame] [1 latent frame] ... [1 latent frame]
824
+ We use flexattention to construct the attention mask
825
+ """
826
+ # debug
827
+ DEBUG = False
828
+ if DEBUG:
829
+ num_frames = 9
830
+ frame_seqlen = 256
831
+
832
+ total_length = num_frames * frame_seqlen * 2
833
+
834
+ # we do right padding to get to a multiple of 128
835
+ padded_length = math.ceil(total_length / 128) * 128 - total_length
836
+
837
+ clean_ends = num_frames * frame_seqlen
838
+ # for clean context frames, we can construct their flex attention mask based on a [start, end] interval
839
+ context_ends = torch.zeros(
840
+ total_length + padded_length, device=device, dtype=torch.long
841
+ )
842
+ # for noisy frames, we need two intervals to construct the flex attention mask [context_start, context_end] [noisy_start, noisy_end]
843
+ noise_context_starts = torch.zeros(
844
+ total_length + padded_length, device=device, dtype=torch.long
845
+ )
846
+ noise_context_ends = torch.zeros(
847
+ total_length + padded_length, device=device, dtype=torch.long
848
+ )
849
+ noise_noise_starts = torch.zeros(
850
+ total_length + padded_length, device=device, dtype=torch.long
851
+ )
852
+ noise_noise_ends = torch.zeros(
853
+ total_length + padded_length, device=device, dtype=torch.long
854
+ )
855
+
856
+ # Block-wise causal mask will attend to all elements that are before the end of the current chunk
857
+ attention_block_size = frame_seqlen * num_frame_per_block
858
+ frame_indices = torch.arange(
859
+ start=0,
860
+ end=num_frames * frame_seqlen,
861
+ step=attention_block_size,
862
+ device=device,
863
+ dtype=torch.long,
864
+ )
865
+
866
+ # attention for clean context frames
867
+ for start in frame_indices:
868
+ context_ends[start : start + attention_block_size] = (
869
+ start + attention_block_size
870
+ )
871
+
872
+ noisy_image_start_list = torch.arange(
873
+ num_frames * frame_seqlen,
874
+ total_length,
875
+ step=attention_block_size,
876
+ device=device,
877
+ dtype=torch.long,
878
+ )
879
+ noisy_image_end_list = noisy_image_start_list + attention_block_size
880
+
881
+ # attention for noisy frames
882
+ for block_index, (start, end) in enumerate(
883
+ zip(noisy_image_start_list, noisy_image_end_list)
884
+ ):
885
+ # attend to noisy tokens within the same block
886
+ noise_noise_starts[start:end] = start
887
+ noise_noise_ends[start:end] = end
888
+ # attend to context tokens in previous blocks
889
+ # noise_context_starts[start:end] = 0
890
+ noise_context_ends[start:end] = block_index * attention_block_size
891
+
892
+ def attention_mask(b, h, q_idx, kv_idx):
893
+ # first design the mask for clean frames
894
+ clean_mask = (q_idx < clean_ends) & (kv_idx < context_ends[q_idx])
895
+ # then design the mask for noisy frames
896
+ # noisy frames will attend to all clean preceeding clean frames + itself
897
+ C1 = (kv_idx < noise_noise_ends[q_idx]) & (
898
+ kv_idx >= noise_noise_starts[q_idx]
899
+ )
900
+ C2 = (kv_idx < noise_context_ends[q_idx]) & (
901
+ kv_idx >= noise_context_starts[q_idx]
902
+ )
903
+ noise_mask = (q_idx >= clean_ends) & (C1 | C2)
904
+
905
+ eye_mask = q_idx == kv_idx
906
+ return eye_mask | clean_mask | noise_mask
907
+
908
+ block_mask = create_block_mask(
909
+ attention_mask,
910
+ B=None,
911
+ H=None,
912
+ Q_LEN=total_length + padded_length,
913
+ KV_LEN=total_length + padded_length,
914
+ _compile=False,
915
+ device=device,
916
+ )
917
+
918
+ if DEBUG:
919
+ print(block_mask)
920
+ import imageio
921
+ import numpy as np
922
+ from torch.nn.attention.flex_attention import create_mask
923
+
924
+ mask = create_mask(
925
+ attention_mask,
926
+ B=None,
927
+ H=None,
928
+ Q_LEN=total_length + padded_length,
929
+ KV_LEN=total_length + padded_length,
930
+ device=device,
931
+ )
932
+ import cv2
933
+
934
+ mask = cv2.resize(mask[0, 0].cpu().float().numpy(), (1024, 1024))
935
+ imageio.imwrite("mask_%d.jpg" % (0), np.uint8(255.0 * mask))
936
+
937
+ return block_mask
938
+
939
+ @staticmethod
940
+ def _prepare_blockwise_causal_attn_mask_i2v(
941
+ device: torch.device | str,
942
+ num_frames: int = 21,
943
+ frame_seqlen: int = 1560,
944
+ num_frame_per_block=4,
945
+ local_attn_size=-1,
946
+ ) -> BlockMask:
947
+ """
948
+ we will divide the token sequence into the following format
949
+ [1 latent frame] [N latent frame] ... [N latent frame]
950
+ The first frame is separated out to support I2V generation
951
+ We use flexattention to construct the attention mask
952
+ """
953
+ total_length = num_frames * frame_seqlen
954
+
955
+ # we do right padding to get to a multiple of 128
956
+ padded_length = math.ceil(total_length / 128) * 128 - total_length
957
+
958
+ ends = torch.zeros(
959
+ total_length + padded_length, device=device, dtype=torch.long
960
+ )
961
+
962
+ # special handling for the first frame
963
+ ends[:frame_seqlen] = frame_seqlen
964
+
965
+ # Block-wise causal mask will attend to all elements that are before the end of the current chunk
966
+ frame_indices = torch.arange(
967
+ start=frame_seqlen,
968
+ end=total_length,
969
+ step=frame_seqlen * num_frame_per_block,
970
+ device=device,
971
+ )
972
+
973
+ for idx, tmp in enumerate(frame_indices):
974
+ ends[tmp : tmp + frame_seqlen * num_frame_per_block] = (
975
+ tmp + frame_seqlen * num_frame_per_block
976
+ )
977
+
978
+ def attention_mask(b, h, q_idx, kv_idx):
979
+ if local_attn_size == -1:
980
+ return (kv_idx < ends[q_idx]) | (q_idx == kv_idx)
981
+ else:
982
+ return (
983
+ (kv_idx < ends[q_idx])
984
+ & (kv_idx >= (ends[q_idx] - local_attn_size * frame_seqlen))
985
+ ) | (q_idx == kv_idx)
986
+
987
+ block_mask = create_block_mask(
988
+ attention_mask,
989
+ B=None,
990
+ H=None,
991
+ Q_LEN=total_length + padded_length,
992
+ KV_LEN=total_length + padded_length,
993
+ _compile=False,
994
+ device=device,
995
+ )
996
+
997
+ # if not dist.is_initialized() or dist.get_rank() == 0:
998
+ # print(
999
+ # f" cache a block wise causal mask with block size of {num_frame_per_block} frames")
1000
+ # print(block_mask)
1001
+
1002
+ # import imageio
1003
+ # import numpy as np
1004
+ # from torch.nn.attention.flex_attention import create_mask
1005
+
1006
+ # mask = create_mask(attention_mask, B=None, H=None, Q_LEN=total_length +
1007
+ # padded_length, KV_LEN=total_length + padded_length, device=device)
1008
+ # import cv2
1009
+ # mask = cv2.resize(mask[0, 0].cpu().float().numpy(), (1024, 1024))
1010
+ # imageio.imwrite("mask_%d.jpg" % (0), np.uint8(255. * mask))
1011
+
1012
+ return block_mask
1013
+
1014
+ def _forward_inference(
1015
+ self,
1016
+ x,
1017
+ t,
1018
+ context,
1019
+ seq_len,
1020
+ clip_fea=None,
1021
+ y=None,
1022
+ kv_cache: dict = None,
1023
+ crossattn_cache: dict = None,
1024
+ current_start: int = 0,
1025
+ cache_start: int = 0,
1026
+ ):
1027
+ r"""
1028
+ Run the diffusion model with kv caching.
1029
+ See Algorithm 2 of CausVid paper https://arxiv.org/abs/2412.07772 for details.
1030
+ This function is called num_frame times, processing the latent
+ frames one by one (1560 tokens each).
1032
+
1033
+ Args:
1034
+ x (List[Tensor]):
1035
+ List of input video tensors, each with shape [C_in, F, H, W]
1036
+ t (Tensor):
1037
+ Diffusion timesteps tensor of shape [B]
1038
+ context (List[Tensor]):
1039
+ List of text embeddings each with shape [L, C]
1040
+ seq_len (`int`):
1041
+ Maximum sequence length for positional encoding
1042
+ clip_fea (Tensor, *optional*):
1043
+ CLIP image features for image-to-video mode
1044
+ y (List[Tensor], *optional*):
1045
+ Conditional video inputs for image-to-video mode, same shape as x
1046
+
1047
+ Returns:
1048
+ List[Tensor]:
1049
+ List of denoised video tensors with original input shapes [C_out, F, H / 8, W / 8]
1050
+ """
1051
+ if self.model_type == "i2v":
1052
+ assert clip_fea is not None and y is not None
1053
+ # params
1054
+ device = self.patch_embedding.weight.device
1055
+ if self.freqs.device != device:
1056
+ self.freqs = self.freqs.to(device)
1057
+
1058
+ if y is not None:
1059
+ x = [torch.cat([u, v], dim=0) for u, v in zip(x, y)]
1060
+
1061
+ # embeddings
1062
+ x = [self.patch_embedding(u.unsqueeze(0)) for u in x]
1063
+ grid_sizes = torch.stack(
1064
+ [torch.tensor(u.shape[2:], dtype=torch.long) for u in x]
1065
+ )
1066
+ x = [u.flatten(2).transpose(1, 2) for u in x]
1067
+ seq_lens = torch.tensor([u.size(1) for u in x], dtype=torch.long)
1068
+ assert seq_lens.max() <= seq_len
1069
+ x = torch.cat(x)
1070
+ """
1071
+ torch.cat([
1072
+ torch.cat([u, u.new_zeros(1, seq_len - u.size(1), u.size(2))],
1073
+ dim=1) for u in x
1074
+ ])
1075
+ """
1076
+
1077
+ # time embeddings
1078
+ # with amp.autocast(dtype=torch.float32):
1079
+ e = self.time_embedding(
1080
+ sinusoidal_embedding_1d(self.freq_dim, t.flatten()).type_as(x)
1081
+ )
1082
+ e0 = (
1083
+ self.time_projection(e)
1084
+ .unflatten(1, (6, self.dim))
1085
+ .unflatten(dim=0, sizes=t.shape)
1086
+ )
1087
+ # assert e.dtype == torch.float32 and e0.dtype == torch.float32
1088
+
1089
+ # context
1090
+ context_lens = None
1091
+ context = self.text_embedding(
1092
+ torch.stack(
1093
+ [
1094
+ torch.cat([u, u.new_zeros(self.text_len - u.size(0), u.size(1))])
1095
+ for u in context
1096
+ ]
1097
+ )
1098
+ )
1099
+
1100
+ if clip_fea is not None:
1101
+ context_clip = self.img_emb(clip_fea) # bs x 257 x dim
1102
+ context = torch.concat([context_clip, context], dim=1)
1103
+
1104
+ # arguments
1105
+ kwargs = dict(
1106
+ e=e0,
1107
+ seq_lens=seq_lens,
1108
+ grid_sizes=grid_sizes,
1109
+ freqs=self.freqs,
1110
+ context=context,
1111
+ context_lens=context_lens,
1112
+ block_mask=self.block_mask,
1113
+ )
1114
+ # print("Block mask in forward : ", self.block_mask)
1115
+
1116
+ def create_custom_forward(module):
1117
+ def custom_forward(*inputs, **kwargs):
1118
+ return module(*inputs, **kwargs)
1119
+
1120
+ return custom_forward
1121
+
1122
+ for block_index, block in enumerate(self.blocks):
1123
+ if torch.is_grad_enabled() and self.gradient_checkpointing:
1124
+ kwargs.update(
1125
+ {
1126
+ "kv_cache": kv_cache[block_index],
1127
+ "current_start": current_start,
1128
+ "cache_start": cache_start,
1129
+ }
1130
+ )
1131
+ x = torch.utils.checkpoint.checkpoint(
1132
+ create_custom_forward(block),
1133
+ x,
1134
+ **kwargs,
1135
+ use_reentrant=False,
1136
+ )
1137
+ else:
1138
+ kwargs.update(
1139
+ {
1140
+ "kv_cache": kv_cache[block_index],
1141
+ "crossattn_cache": crossattn_cache[block_index],
1142
+ "current_start": current_start,
1143
+ "cache_start": cache_start,
1144
+ }
1145
+ )
1146
+ x = block(x, **kwargs)
1147
+
1148
+ # head
1149
+ x = self.head(x, e.unflatten(dim=0, sizes=t.shape).unsqueeze(2))
1150
+ # unpatchify
1151
+ x = self.unpatchify(x, grid_sizes)
1152
+ return torch.stack(x)
1153
+
1154
+ def _forward_train(
1155
+ self,
1156
+ x,
1157
+ t,
1158
+ context,
1159
+ seq_len,
1160
+ clean_x=None,
1161
+ aug_t=None,
1162
+ clip_fea=None,
1163
+ y=None,
1164
+ ):
1165
+ r"""
1166
+ Forward pass through the diffusion model
1167
+
1168
+ Args:
1169
+ x (List[Tensor]):
1170
+ List of input video tensors, each with shape [C_in, F, H, W]
1171
+ t (Tensor):
1172
+ Diffusion timesteps tensor of shape [B]
1173
+ context (List[Tensor]):
1174
+ List of text embeddings each with shape [L, C]
1175
+ seq_len (`int`):
1176
+ Maximum sequence length for positional encoding
1177
+ clip_fea (Tensor, *optional*):
1178
+ CLIP image features for image-to-video mode
1179
+ y (List[Tensor], *optional*):
1180
+ Conditional video inputs for image-to-video mode, same shape as x
1181
+
1182
+ Returns:
1183
+ List[Tensor]:
1184
+ List of denoised video tensors with original input shapes [C_out, F, H / 8, W / 8]
1185
+ """
1186
+ if self.model_type == "i2v":
1187
+ assert clip_fea is not None and y is not None
1188
+ # params
1189
+ device = self.patch_embedding.weight.device
1190
+ if self.freqs.device != device:
1191
+ self.freqs = self.freqs.to(device)
1192
+
1193
+ # Construct blockwise causal attn mask
1194
+ if self.block_mask is None:
1195
+ if clean_x is not None:
1196
+ if self.independent_first_frame:
1197
+ raise NotImplementedError()
1198
+ else:
1199
+ self.block_mask = self._prepare_teacher_forcing_mask(
1200
+ device,
1201
+ num_frames=x.shape[2],
1202
+ frame_seqlen=x.shape[-2]
1203
+ * x.shape[-1]
1204
+ // (self.patch_size[1] * self.patch_size[2]),
1205
+ num_frame_per_block=self.num_frame_per_block,
1206
+ )
1207
+ else:
1208
+ if self.independent_first_frame:
1209
+ self.block_mask = self._prepare_blockwise_causal_attn_mask_i2v(
1210
+ device,
1211
+ num_frames=x.shape[2],
1212
+ frame_seqlen=x.shape[-2]
1213
+ * x.shape[-1]
1214
+ // (self.patch_size[1] * self.patch_size[2]),
1215
+ num_frame_per_block=self.num_frame_per_block,
1216
+ local_attn_size=self.local_attn_size,
1217
+ )
1218
+ else:
1219
+ self.block_mask = self._prepare_blockwise_causal_attn_mask(
1220
+ device,
1221
+ num_frames=x.shape[2],
1222
+ frame_seqlen=x.shape[-2]
1223
+ * x.shape[-1]
1224
+ // (self.patch_size[1] * self.patch_size[2]),
1225
+ num_frame_per_block=self.num_frame_per_block,
1226
+ local_attn_size=self.local_attn_size,
1227
+ )
1228
+
1229
+ if y is not None:
1230
+ x = [torch.cat([u, v], dim=0) for u, v in zip(x, y)]
1231
+
1232
+ # embeddings
1233
+ x = [self.patch_embedding(u.unsqueeze(0)) for u in x]
1234
+
1235
+ grid_sizes = torch.stack(
1236
+ [torch.tensor(u.shape[2:], dtype=torch.long) for u in x]
1237
+ )
1238
+ x = [u.flatten(2).transpose(1, 2) for u in x]
1239
+
1240
+ seq_lens = torch.tensor([u.size(1) for u in x], dtype=torch.long)
1241
+ assert seq_lens.max() <= seq_len
1242
+ x = torch.cat(
1243
+ [
1244
+ torch.cat(
1245
+ [u, u.new_zeros(1, seq_lens[0] - u.size(1), u.size(2))], dim=1
1246
+ )
1247
+ for u in x
1248
+ ]
1249
+ )
1250
+
1251
+ # time embeddings
1252
+ # with amp.autocast(dtype=torch.float32):
1253
+ e = self.time_embedding(
1254
+ sinusoidal_embedding_1d(self.freq_dim, t.flatten()).type_as(x)
1255
+ )
1256
+ e0 = (
1257
+ self.time_projection(e)
1258
+ .unflatten(1, (6, self.dim))
1259
+ .unflatten(dim=0, sizes=t.shape)
1260
+ )
1261
+ # assert e.dtype == torch.float32 and e0.dtype == torch.float32
1262
+
1263
+ # context
1264
+ context_lens = None
1265
+ context = self.text_embedding(
1266
+ torch.stack(
1267
+ [
1268
+ torch.cat([u, u.new_zeros(self.text_len - u.size(0), u.size(1))])
1269
+ for u in context
1270
+ ]
1271
+ )
1272
+ )
1273
+
1274
+ if clip_fea is not None:
1275
+ context_clip = self.img_emb(clip_fea) # bs x 257 x dim
1276
+ context = torch.concat([context_clip, context], dim=1)
1277
+
1278
+ if clean_x is not None:
1279
+ clean_x = [self.patch_embedding(u.unsqueeze(0)) for u in clean_x]
1280
+ clean_x = [u.flatten(2).transpose(1, 2) for u in clean_x]
1281
+
1282
+ seq_lens_clean = torch.tensor(
1283
+ [u.size(1) for u in clean_x], dtype=torch.long
1284
+ )
1285
+ assert seq_lens_clean.max() <= seq_len
1286
+ clean_x = torch.cat(
1287
+ [
1288
+ torch.cat(
1289
+ [u, u.new_zeros(1, seq_lens_clean[0] - u.size(1), u.size(2))],
1290
+ dim=1,
1291
+ )
1292
+ for u in clean_x
1293
+ ]
1294
+ )
1295
+
1296
+ x = torch.cat([clean_x, x], dim=1)
1297
+ if aug_t is None:
1298
+ aug_t = torch.zeros_like(t)
1299
+ e_clean = self.time_embedding(
1300
+ sinusoidal_embedding_1d(self.freq_dim, aug_t.flatten()).type_as(x)
1301
+ )
1302
+ e0_clean = (
1303
+ self.time_projection(e_clean)
1304
+ .unflatten(1, (6, self.dim))
1305
+ .unflatten(dim=0, sizes=t.shape)
1306
+ )
1307
+ e0 = torch.cat([e0_clean, e0], dim=1)
1308
+
1309
+ # arguments
1310
+ kwargs = dict(
1311
+ e=e0,
1312
+ seq_lens=seq_lens,
1313
+ grid_sizes=grid_sizes,
1314
+ freqs=self.freqs,
1315
+ context=context,
1316
+ context_lens=context_lens,
1317
+ block_mask=self.block_mask,
1318
+ )
1319
+
1320
+ def create_custom_forward(module):
1321
+ def custom_forward(*inputs, **kwargs):
1322
+ return module(*inputs, **kwargs)
1323
+
1324
+ return custom_forward
1325
+
1326
+ for block in self.blocks:
1327
+ if torch.is_grad_enabled() and self.gradient_checkpointing:
1328
+ x = torch.utils.checkpoint.checkpoint(
1329
+ create_custom_forward(block),
1330
+ x,
1331
+ **kwargs,
1332
+ use_reentrant=False,
1333
+ )
1334
+ else:
1335
+ x = block(x, **kwargs)
1336
+
1337
+ if clean_x is not None:
1338
+ x = x[:, x.shape[1] // 2 :]
1339
+
1340
+ # head
1341
+ x = self.head(x, e.unflatten(dim=0, sizes=t.shape).unsqueeze(2))
1342
+
1343
+ # unpatchify
1344
+ x = self.unpatchify(x, grid_sizes)
1345
+ return torch.stack(x)
1346
+
1347
+ def forward(self, *args, **kwargs):
1348
+ result = self._forward_inference(*args, **kwargs)
1349
+ # if kwargs.get('kv_cache', None) is not None:
+ #     result = self._forward_inference(*args, **kwargs)
+ # else:
+ #     result = self._forward_train(*args, **kwargs)
1352
+
1353
+ return result
1354
+
1355
+ def unpatchify(self, x, grid_sizes):
1356
+ r"""
1357
+ Reconstruct video tensors from patch embeddings.
1358
+
1359
+ Args:
1360
+ x (List[Tensor]):
1361
+ List of patchified features, each with shape [L, C_out * prod(patch_size)]
1362
+ grid_sizes (Tensor):
1363
+ Original spatial-temporal grid dimensions before patching,
1364
+ shape [B, 3] (3 dimensions correspond to F_patches, H_patches, W_patches)
1365
+
1366
+ Returns:
1367
+ List[Tensor]:
1368
+ Reconstructed video tensors with shape [C_out, F, H / 8, W / 8]
1369
+ """
1370
+
1371
+ c = self.out_dim
1372
+ out = []
1373
+ for u, v in zip(x, grid_sizes.tolist()):
1374
+ u = u[: math.prod(v)].view(*v, *self.patch_size, c)
1375
+ u = torch.einsum("fhwpqrc->cfphqwr", u)
1376
+ u = u.reshape(c, *[i * j for i, j in zip(v, self.patch_size)])
1377
+ out.append(u)
1378
+ return out
1379
+
1380
+ def init_weights(self):
1381
+ r"""
1382
+ Initialize model parameters using Xavier initialization.
1383
+ """
1384
+
1385
+ # basic init
1386
+ for m in self.modules():
1387
+ if isinstance(m, nn.Linear):
1388
+ nn.init.xavier_uniform_(m.weight)
1389
+ if m.bias is not None:
1390
+ nn.init.zeros_(m.bias)
1391
+
1392
+ # init embeddings
1393
+ nn.init.xavier_uniform_(self.patch_embedding.weight.flatten(1))
1394
+ for m in self.text_embedding.modules():
1395
+ if isinstance(m, nn.Linear):
1396
+ nn.init.normal_(m.weight, std=0.02)
1397
+ for m in self.time_embedding.modules():
1398
+ if isinstance(m, nn.Linear):
1399
+ nn.init.normal_(m.weight, std=0.02)
1400
+
1401
+ # init output layer
1402
+ nn.init.zeros_(self.head.head.weight)
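
The block-wise causal masks in `causal_model.py` above are built with FlexAttention's `create_block_mask`: each query token may attend to every key up to the end of its own frame block (`kv_idx < ends[q_idx]`), plus itself. The sketch below, which is not part of the repository code, materialises that same rule as a dense boolean mask at toy sizes so the pattern is easy to inspect; the frame counts and block size are illustrative only.

```py
import torch

# Toy version of the `local_attn_size == -1` branch above: 4 latent frames of
# 3 tokens each, generated in blocks of 2 frames. ends[i] is the (exclusive)
# last token index that query token i may attend to.
num_frames, frame_seqlen, num_frame_per_block = 4, 3, 2
total = num_frames * frame_seqlen

ends = torch.zeros(total, dtype=torch.long)
for start in range(0, total, frame_seqlen * num_frame_per_block):
    stop = min(start + frame_seqlen * num_frame_per_block, total)
    ends[start:stop] = stop

q_idx = torch.arange(total).unsqueeze(1)   # [total, 1]
kv_idx = torch.arange(total).unsqueeze(0)  # [1, total]
mask = (kv_idx < ends[q_idx]) | (q_idx == kv_idx)

print(mask.int())  # tokens 0-5 attend within frames 0-1; tokens 6-11 attend to frames 0-3
```

At real scale (1560 tokens per latent frame) this mask is never materialised densely; the `create_block_mask` calls above return a sparse `BlockMask` that FlexAttention consumes directly.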
transformer/config.json ADDED
@@ -0,0 +1,19 @@
+ {
+   "_class_name": "CausalWanModel",
+   "_diffusers_version": "0.36.0.dev0",
+   "auto_map": {
+     "AutoModel": "causal_model.CausalWanModel"
+   },
+   "dim": 5120,
+   "eps": 1e-06,
+   "ffn_dim": 13824,
+   "freq_dim": 256,
+   "in_dim": 16,
+   "local_attn_size": -1,
+   "model_type": "t2v",
+   "num_heads": 40,
+   "num_layers": 40,
+   "out_dim": 16,
+   "sink_size": 0,
+   "text_len": 512
+ }
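
The `auto_map` entry in this config ties the checkpoint to the custom `CausalWanModel` implementation in `causal_model.py`. Below is a minimal, hypothetical sketch of loading just the transformer through `diffusers.AutoModel`; it assumes a diffusers build recent enough to resolve remote code via `auto_map` (the config above was written against 0.36.0.dev0), and `trust_remote_code=True` is required because the class is defined in this repository rather than in diffusers itself.

```py
import torch
from diffusers import AutoModel

# AutoModel follows the "auto_map" entry above and imports
# causal_model.CausalWanModel from the repository (remote code).
transformer = AutoModel.from_pretrained(
    "krea/krea-realtime-video",
    subfolder="transformer",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)
print(f"{sum(p.numel() for p in transformer.parameters()) / 1e9:.1f}B parameters")
```

For end-to-end generation the modular pipeline remains the intended entry point; loading the bare transformer is mainly useful for inspecting the architecture or wiring it into custom inference code.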
transformer/diffusion_pytorch_model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5c179cb7e91005fe6e009bfb42df4ed70316f03bbc35d33e303021b33b564791
+ size 9968228976
transformer/diffusion_pytorch_model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a3e0f19e177dd8d83e244281ffdada88c150e080a9503dfd0f78f5acfe63563a
+ size 9891538864
transformer/diffusion_pytorch_model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:15c15fb4b8b8a181ff23b142566389344fb34973dddc8c49b4ff0dee29db2735
+ size 8717326272
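
The three `.safetensors` entries above are Git LFS pointer files: each stores only the SHA-256 digest (`oid`) and byte size of the real shard, which is fetched separately (for example with `huggingface-cli download`). The snippet below is an illustrative integrity check of a downloaded shard against its pointer, using the values of the first shard; the local path is an example.

```py
import hashlib
from pathlib import Path

# Values copied from the LFS pointer of shard 1 of 3 above.
shard = Path("transformer/diffusion_pytorch_model-00001-of-00003.safetensors")
expected_sha256 = "5c179cb7e91005fe6e009bfb42df4ed70316f03bbc35d33e303021b33b564791"
expected_size = 9968228976

digest = hashlib.sha256()
with shard.open("rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        digest.update(chunk)

assert shard.stat().st_size == expected_size, "size mismatch"
assert digest.hexdigest() == expected_sha256, "checksum mismatch"
print("shard matches its LFS pointer")
```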
transformer/diffusion_pytorch_model.safetensors.index.json ADDED
@@ -0,0 +1,1102 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 28576983168
4
+ },
5
+ "weight_map": {
6
+ "blocks.0.cross_attn.k.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
7
+ "blocks.0.cross_attn.k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
8
+ "blocks.0.cross_attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
9
+ "blocks.0.cross_attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
10
+ "blocks.0.cross_attn.o.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
11
+ "blocks.0.cross_attn.o.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
12
+ "blocks.0.cross_attn.q.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
13
+ "blocks.0.cross_attn.q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
14
+ "blocks.0.cross_attn.v.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
15
+ "blocks.0.cross_attn.v.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
16
+ "blocks.0.ffn.0.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
17
+ "blocks.0.ffn.0.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
18
+ "blocks.0.ffn.2.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
19
+ "blocks.0.ffn.2.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
20
+ "blocks.0.modulation": "diffusion_pytorch_model-00001-of-00003.safetensors",
21
+ "blocks.0.norm3.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
22
+ "blocks.0.norm3.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
23
+ "blocks.0.self_attn.k.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
24
+ "blocks.0.self_attn.k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
25
+ "blocks.0.self_attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
26
+ "blocks.0.self_attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
27
+ "blocks.0.self_attn.o.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
28
+ "blocks.0.self_attn.o.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
29
+ "blocks.0.self_attn.q.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
30
+ "blocks.0.self_attn.q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
31
+ "blocks.0.self_attn.v.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
32
+ "blocks.0.self_attn.v.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
33
+ "blocks.1.cross_attn.k.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
34
+ "blocks.1.cross_attn.k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
35
+ "blocks.1.cross_attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
36
+ "blocks.1.cross_attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
37
+ "blocks.1.cross_attn.o.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
38
+ "blocks.1.cross_attn.o.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
39
+ "blocks.1.cross_attn.q.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
40
+ "blocks.1.cross_attn.q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
41
+ "blocks.1.cross_attn.v.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
42
+ "blocks.1.cross_attn.v.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
43
+ "blocks.1.ffn.0.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
44
+ "blocks.1.ffn.0.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
45
+ "blocks.1.ffn.2.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
46
+ "blocks.1.ffn.2.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
47
+ "blocks.1.modulation": "diffusion_pytorch_model-00001-of-00003.safetensors",
48
+ "blocks.1.norm3.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
49
+ "blocks.1.norm3.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
50
+ "blocks.1.self_attn.k.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
51
+ "blocks.1.self_attn.k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
52
+ "blocks.1.self_attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
53
+ "blocks.1.self_attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
54
+ "blocks.1.self_attn.o.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
55
+ "blocks.1.self_attn.o.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
56
+ "blocks.1.self_attn.q.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
57
+ "blocks.1.self_attn.q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
58
+ "blocks.1.self_attn.v.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
59
+ "blocks.1.self_attn.v.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
60
+ "blocks.10.cross_attn.k.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
61
+ "blocks.10.cross_attn.k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
62
+ "blocks.10.cross_attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
63
+ "blocks.10.cross_attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
64
+ "blocks.10.cross_attn.o.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
65
+ "blocks.10.cross_attn.o.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
66
+ "blocks.10.cross_attn.q.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
67
+ "blocks.10.cross_attn.q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
68
+ "blocks.10.cross_attn.v.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
69
+ "blocks.10.cross_attn.v.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
70
+ "blocks.10.ffn.0.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
71
+ "blocks.10.ffn.0.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
72
+ "blocks.10.ffn.2.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
73
+ "blocks.10.ffn.2.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
74
+ "blocks.10.modulation": "diffusion_pytorch_model-00001-of-00003.safetensors",
75
+ "blocks.10.norm3.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
76
+ "blocks.10.norm3.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
77
+ "blocks.10.self_attn.k.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
78
+ "blocks.10.self_attn.k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
79
+ "blocks.10.self_attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
80
+ "blocks.10.self_attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
81
+ "blocks.10.self_attn.o.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
82
+ "blocks.10.self_attn.o.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
83
+ "blocks.10.self_attn.q.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
84
+ "blocks.10.self_attn.q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
85
+ "blocks.10.self_attn.v.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
86
+ "blocks.10.self_attn.v.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
87
+ "blocks.11.cross_attn.k.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
88
+ "blocks.11.cross_attn.k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
89
+ "blocks.11.cross_attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
90
+ "blocks.11.cross_attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
91
+ "blocks.11.cross_attn.o.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
92
+ "blocks.11.cross_attn.o.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
93
+ "blocks.11.cross_attn.q.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
94
+ "blocks.11.cross_attn.q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
95
+ "blocks.11.cross_attn.v.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
96
+ "blocks.11.cross_attn.v.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
97
+ "blocks.11.ffn.0.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
98
+ "blocks.11.ffn.0.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
99
+ "blocks.11.ffn.2.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
100
+ "blocks.11.ffn.2.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
101
+ "blocks.11.modulation": "diffusion_pytorch_model-00001-of-00003.safetensors",
102
+ "blocks.11.norm3.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
103
+ "blocks.11.norm3.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
104
+ "blocks.11.self_attn.k.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
105
+ "blocks.11.self_attn.k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
106
+ "blocks.11.self_attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
107
+ "blocks.11.self_attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
108
+ "blocks.11.self_attn.o.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
109
+ "blocks.11.self_attn.o.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
110
+ "blocks.11.self_attn.q.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
111
+ "blocks.11.self_attn.q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
112
+ "blocks.11.self_attn.v.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
113
+ "blocks.11.self_attn.v.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
114
+ "blocks.12.cross_attn.k.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
115
+ "blocks.12.cross_attn.k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
116
+ "blocks.12.cross_attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
117
+ "blocks.12.cross_attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
118
+ "blocks.12.cross_attn.o.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
119
+ "blocks.12.cross_attn.o.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
120
+ "blocks.12.cross_attn.q.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
121
+ "blocks.12.cross_attn.q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
122
+ "blocks.12.cross_attn.v.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
123
+ "blocks.12.cross_attn.v.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
124
+ "blocks.12.ffn.0.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
125
+ "blocks.12.ffn.0.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
126
+ "blocks.12.ffn.2.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
127
+ "blocks.12.ffn.2.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
128
+ "blocks.12.modulation": "diffusion_pytorch_model-00001-of-00003.safetensors",
129
+ "blocks.12.norm3.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
130
+ "blocks.12.norm3.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
131
+ "blocks.12.self_attn.k.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
132
+ "blocks.12.self_attn.k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
133
+ "blocks.12.self_attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
134
+ "blocks.12.self_attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
135
+ "blocks.12.self_attn.o.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
136
+ "blocks.12.self_attn.o.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
137
+ "blocks.12.self_attn.q.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
138
+ "blocks.12.self_attn.q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
139
+ "blocks.12.self_attn.v.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
140
+ "blocks.12.self_attn.v.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
141
+ "blocks.13.cross_attn.k.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
142
+ "blocks.13.cross_attn.k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
143
+ "blocks.13.cross_attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
144
+ "blocks.13.cross_attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
145
+ "blocks.13.cross_attn.o.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
146
+ "blocks.13.cross_attn.o.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
147
+ "blocks.13.cross_attn.q.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
148
+ "blocks.13.cross_attn.q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
149
+ "blocks.13.cross_attn.v.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
150
+ "blocks.13.cross_attn.v.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
151
+ "blocks.13.ffn.0.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
152
+ "blocks.13.ffn.0.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
153
+ "blocks.13.ffn.2.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
154
+ "blocks.13.ffn.2.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
155
+ "blocks.13.modulation": "diffusion_pytorch_model-00001-of-00003.safetensors",
156
+ "blocks.13.norm3.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
157
+ "blocks.13.norm3.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
158
+ "blocks.13.self_attn.k.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
159
+ "blocks.13.self_attn.k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
160
+ "blocks.13.self_attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
161
+ "blocks.13.self_attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
162
+ "blocks.13.self_attn.o.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
163
+ "blocks.13.self_attn.o.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
164
+ "blocks.13.self_attn.q.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
165
+ "blocks.13.self_attn.q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
166
+ "blocks.13.self_attn.v.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
167
+ "blocks.13.self_attn.v.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
168
+ "blocks.14.cross_attn.k.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
169
+ "blocks.14.cross_attn.k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
170
+ "blocks.14.cross_attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
171
+ "blocks.14.cross_attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
172
+ "blocks.14.cross_attn.o.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
173
+ "blocks.14.cross_attn.o.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
174
+ "blocks.14.cross_attn.q.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
175
+ "blocks.14.cross_attn.q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
176
+ "blocks.14.cross_attn.v.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
177
+ "blocks.14.cross_attn.v.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
178
+ "blocks.14.ffn.0.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
179
+ "blocks.14.ffn.0.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
180
+ "blocks.14.ffn.2.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
181
+ "blocks.14.ffn.2.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
182
+ "blocks.14.modulation": "diffusion_pytorch_model-00002-of-00003.safetensors",
183
+ "blocks.14.norm3.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
184
+ "blocks.14.norm3.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
185
+ "blocks.14.self_attn.k.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
186
+ "blocks.14.self_attn.k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
187
+ "blocks.14.self_attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
188
+ "blocks.14.self_attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
189
+ "blocks.14.self_attn.o.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
190
+ "blocks.14.self_attn.o.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
191
+ "blocks.14.self_attn.q.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
192
+ "blocks.14.self_attn.q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
193
+ "blocks.14.self_attn.v.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
194
+ "blocks.14.self_attn.v.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
195
+ "blocks.15.cross_attn.k.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
196
+ "blocks.15.cross_attn.k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
197
+ "blocks.15.cross_attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
198
+ "blocks.15.cross_attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
199
+ "blocks.15.cross_attn.o.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
200
+ "blocks.15.cross_attn.o.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
201
+ "blocks.15.cross_attn.q.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
202
+ "blocks.15.cross_attn.q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
203
+ "blocks.15.cross_attn.v.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
204
+ "blocks.15.cross_attn.v.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
205
+ "blocks.15.ffn.0.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
206
+ "blocks.15.ffn.0.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
207
+ "blocks.15.ffn.2.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
208
+ "blocks.15.ffn.2.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
209
+ "blocks.15.modulation": "diffusion_pytorch_model-00002-of-00003.safetensors",
210
+ "blocks.15.norm3.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
211
+ "blocks.15.norm3.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
212
+ "blocks.15.self_attn.k.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
213
+ "blocks.15.self_attn.k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
214
+ "blocks.15.self_attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
215
+ "blocks.15.self_attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
216
+ "blocks.15.self_attn.o.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
217
+ "blocks.15.self_attn.o.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
218
+ "blocks.15.self_attn.q.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
219
+ "blocks.15.self_attn.q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
220
+ "blocks.15.self_attn.v.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
221
+ "blocks.15.self_attn.v.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
222
+ "blocks.16.cross_attn.k.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
223
+ "blocks.16.cross_attn.k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
224
+ "blocks.16.cross_attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
225
+ "blocks.16.cross_attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
226
+ "blocks.16.cross_attn.o.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
227
+ "blocks.16.cross_attn.o.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
228
+ "blocks.16.cross_attn.q.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
229
+ "blocks.16.cross_attn.q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
230
+ "blocks.16.cross_attn.v.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
231
+ "blocks.16.cross_attn.v.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
232
+ "blocks.16.ffn.0.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
233
+ "blocks.16.ffn.0.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
234
+ "blocks.16.ffn.2.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
235
+ "blocks.16.ffn.2.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
236
+ "blocks.16.modulation": "diffusion_pytorch_model-00002-of-00003.safetensors",
237
+ "blocks.16.norm3.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
238
+ "blocks.16.norm3.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
239
+ "blocks.16.self_attn.k.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
240
+ "blocks.16.self_attn.k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
241
+ "blocks.16.self_attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
242
+ "blocks.16.self_attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
243
+ "blocks.16.self_attn.o.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
244
+ "blocks.16.self_attn.o.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
245
+ "blocks.16.self_attn.q.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
246
+ "blocks.16.self_attn.q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
247
+ "blocks.16.self_attn.v.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
248
+ "blocks.16.self_attn.v.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
249
+ "blocks.17.cross_attn.k.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
250
+ "blocks.17.cross_attn.k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
251
+ "blocks.17.cross_attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
252
+ "blocks.17.cross_attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
253
+ "blocks.17.cross_attn.o.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
254
+ "blocks.17.cross_attn.o.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
255
+ "blocks.17.cross_attn.q.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
256
+ "blocks.17.cross_attn.q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
257
+ "blocks.17.cross_attn.v.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
258
+ "blocks.17.cross_attn.v.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
259
+ "blocks.17.ffn.0.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
260
+ "blocks.17.ffn.0.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
261
+ "blocks.17.ffn.2.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
262
+ "blocks.17.ffn.2.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
263
+ "blocks.17.modulation": "diffusion_pytorch_model-00002-of-00003.safetensors",
264
+ "blocks.17.norm3.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
265
+ "blocks.17.norm3.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
266
+ "blocks.17.self_attn.k.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
267
+ "blocks.17.self_attn.k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
268
+ "blocks.17.self_attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
269
+ "blocks.17.self_attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
270
+ "blocks.17.self_attn.o.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
271
+ "blocks.17.self_attn.o.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
272
+ "blocks.17.self_attn.q.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
273
+ "blocks.17.self_attn.q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
274
+ "blocks.17.self_attn.v.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
275
+ "blocks.17.self_attn.v.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
276
+ "blocks.18.cross_attn.k.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
277
+ "blocks.18.cross_attn.k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
278
+ "blocks.18.cross_attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
279
+ "blocks.18.cross_attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
280
+ "blocks.18.cross_attn.o.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
281
+ "blocks.18.cross_attn.o.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
282
+ "blocks.18.cross_attn.q.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
283
+ "blocks.18.cross_attn.q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
284
+ "blocks.18.cross_attn.v.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
285
+ "blocks.18.cross_attn.v.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
286
+ "blocks.18.ffn.0.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
287
+ "blocks.18.ffn.0.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
288
+ "blocks.18.ffn.2.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
289
+ "blocks.18.ffn.2.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
290
+ "blocks.18.modulation": "diffusion_pytorch_model-00002-of-00003.safetensors",
291
+ "blocks.18.norm3.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
292
+ "blocks.18.norm3.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
293
+ "blocks.18.self_attn.k.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
294
+ "blocks.18.self_attn.k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
295
+ "blocks.18.self_attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
296
+ "blocks.18.self_attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
297
+ "blocks.18.self_attn.o.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
298
+ "blocks.18.self_attn.o.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
299
+ "blocks.18.self_attn.q.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
300
+ "blocks.18.self_attn.q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
301
+ "blocks.18.self_attn.v.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
302
+ "blocks.18.self_attn.v.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
303
+ "blocks.19.cross_attn.k.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
304
+ "blocks.19.cross_attn.k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
305
+ "blocks.19.cross_attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
306
+ "blocks.19.cross_attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
307
+ "blocks.19.cross_attn.o.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
308
+ "blocks.19.cross_attn.o.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
309
+ "blocks.19.cross_attn.q.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
310
+ "blocks.19.cross_attn.q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
311
+ "blocks.19.cross_attn.v.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
312
+ "blocks.19.cross_attn.v.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
313
+ "blocks.19.ffn.0.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
314
+ "blocks.19.ffn.0.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
315
+ "blocks.19.ffn.2.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
316
+ "blocks.19.ffn.2.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
317
+ "blocks.19.modulation": "diffusion_pytorch_model-00002-of-00003.safetensors",
318
+ "blocks.19.norm3.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
319
+ "blocks.19.norm3.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
320
+ "blocks.19.self_attn.k.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
321
+ "blocks.19.self_attn.k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
322
+ "blocks.19.self_attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
323
+ "blocks.19.self_attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
324
+ "blocks.19.self_attn.o.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
325
+ "blocks.19.self_attn.o.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
326
+ "blocks.19.self_attn.q.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
327
+ "blocks.19.self_attn.q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
328
+ "blocks.19.self_attn.v.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
329
+ "blocks.19.self_attn.v.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
330
+ "blocks.2.cross_attn.k.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
331
+ "blocks.2.cross_attn.k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
332
+ "blocks.2.cross_attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
333
+ "blocks.2.cross_attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
334
+ "blocks.2.cross_attn.o.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
335
+ "blocks.2.cross_attn.o.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
336
+ "blocks.2.cross_attn.q.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
337
+ "blocks.2.cross_attn.q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
338
+ "blocks.2.cross_attn.v.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
339
+ "blocks.2.cross_attn.v.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
340
+ "blocks.2.ffn.0.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
341
+ "blocks.2.ffn.0.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
342
+ "blocks.2.ffn.2.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
343
+ "blocks.2.ffn.2.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
344
+ "blocks.2.modulation": "diffusion_pytorch_model-00001-of-00003.safetensors",
345
+ "blocks.2.norm3.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
346
+ "blocks.2.norm3.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
347
+ "blocks.2.self_attn.k.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
348
+ "blocks.2.self_attn.k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
349
+ "blocks.2.self_attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
350
+ "blocks.2.self_attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
351
+ "blocks.2.self_attn.o.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
352
+ "blocks.2.self_attn.o.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
353
+ "blocks.2.self_attn.q.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
354
+ "blocks.2.self_attn.q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
355
+ "blocks.2.self_attn.v.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
356
+ "blocks.2.self_attn.v.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
357
+ "blocks.20.cross_attn.k.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
358
+ "blocks.20.cross_attn.k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
359
+ "blocks.20.cross_attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
360
+ "blocks.20.cross_attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
361
+ "blocks.20.cross_attn.o.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
362
+ "blocks.20.cross_attn.o.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
363
+ "blocks.20.cross_attn.q.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
364
+ "blocks.20.cross_attn.q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
365
+ "blocks.20.cross_attn.v.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
366
+ "blocks.20.cross_attn.v.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
367
+ "blocks.20.ffn.0.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
368
+ "blocks.20.ffn.0.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
369
+ "blocks.20.ffn.2.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
370
+ "blocks.20.ffn.2.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
371
+ "blocks.20.modulation": "diffusion_pytorch_model-00002-of-00003.safetensors",
372
+ "blocks.20.norm3.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
373
+ "blocks.20.norm3.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
374
+ "blocks.20.self_attn.k.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
375
+ "blocks.20.self_attn.k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
376
+ "blocks.20.self_attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
377
+ "blocks.20.self_attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
378
+ "blocks.20.self_attn.o.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
379
+ "blocks.20.self_attn.o.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
380
+ "blocks.20.self_attn.q.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
381
+ "blocks.20.self_attn.q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
382
+ "blocks.20.self_attn.v.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
383
+ "blocks.20.self_attn.v.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
384
+ "blocks.21.cross_attn.k.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
385
+ "blocks.21.cross_attn.k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
386
+ "blocks.21.cross_attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
387
+ "blocks.21.cross_attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
388
+ "blocks.21.cross_attn.o.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
389
+ "blocks.21.cross_attn.o.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
390
+ "blocks.21.cross_attn.q.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
391
+ "blocks.21.cross_attn.q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
392
+ "blocks.21.cross_attn.v.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
393
+ "blocks.21.cross_attn.v.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
394
+ "blocks.21.ffn.0.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
395
+ "blocks.21.ffn.0.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
396
+ "blocks.21.ffn.2.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
397
+ "blocks.21.ffn.2.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
398
+ "blocks.21.modulation": "diffusion_pytorch_model-00002-of-00003.safetensors",
399
+ "blocks.21.norm3.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
400
+ "blocks.21.norm3.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
401
+ "blocks.21.self_attn.k.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
402
+ "blocks.21.self_attn.k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
403
+ "blocks.21.self_attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
404
+ "blocks.21.self_attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
405
+ "blocks.21.self_attn.o.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
406
+ "blocks.21.self_attn.o.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
407
+ "blocks.21.self_attn.q.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
408
+ "blocks.21.self_attn.q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
409
+ "blocks.21.self_attn.v.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
410
+ "blocks.21.self_attn.v.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
411
+ "blocks.22.cross_attn.k.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
412
+ "blocks.22.cross_attn.k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
413
+ "blocks.22.cross_attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
414
+ "blocks.22.cross_attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
415
+ "blocks.22.cross_attn.o.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
416
+ "blocks.22.cross_attn.o.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
417
+ "blocks.22.cross_attn.q.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
418
+ "blocks.22.cross_attn.q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
419
+ "blocks.22.cross_attn.v.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
420
+ "blocks.22.cross_attn.v.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
421
+ "blocks.22.ffn.0.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
422
+ "blocks.22.ffn.0.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
423
+ "blocks.22.ffn.2.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
424
+ "blocks.22.ffn.2.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
425
+ "blocks.22.modulation": "diffusion_pytorch_model-00002-of-00003.safetensors",
426
+ "blocks.22.norm3.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
427
+ "blocks.22.norm3.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
428
+ "blocks.22.self_attn.k.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
429
+ "blocks.22.self_attn.k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
430
+ "blocks.22.self_attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
431
+ "blocks.22.self_attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
432
+ "blocks.22.self_attn.o.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
433
+ "blocks.22.self_attn.o.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
434
+ "blocks.22.self_attn.q.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
435
+ "blocks.22.self_attn.q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
436
+ "blocks.22.self_attn.v.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
437
+ "blocks.22.self_attn.v.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
438
+ "blocks.23.cross_attn.k.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
439
+ "blocks.23.cross_attn.k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
440
+ "blocks.23.cross_attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
441
+ "blocks.23.cross_attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
442
+ "blocks.23.cross_attn.o.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
443
+ "blocks.23.cross_attn.o.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
444
+ "blocks.23.cross_attn.q.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
445
+ "blocks.23.cross_attn.q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
446
+ "blocks.23.cross_attn.v.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
447
+ "blocks.23.cross_attn.v.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
448
+ "blocks.23.ffn.0.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
449
+ "blocks.23.ffn.0.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
450
+ "blocks.23.ffn.2.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
451
+ "blocks.23.ffn.2.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
452
+ "blocks.23.modulation": "diffusion_pytorch_model-00002-of-00003.safetensors",
453
+ "blocks.23.norm3.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
454
+ "blocks.23.norm3.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
455
+ "blocks.23.self_attn.k.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
456
+ "blocks.23.self_attn.k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
457
+ "blocks.23.self_attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
458
+ "blocks.23.self_attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
459
+ "blocks.23.self_attn.o.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
460
+ "blocks.23.self_attn.o.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
461
+ "blocks.23.self_attn.q.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
462
+ "blocks.23.self_attn.q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
463
+ "blocks.23.self_attn.v.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
464
+ "blocks.23.self_attn.v.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
465
+ "blocks.24.cross_attn.k.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
466
+ "blocks.24.cross_attn.k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
467
+ "blocks.24.cross_attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
468
+ "blocks.24.cross_attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
469
+ "blocks.24.cross_attn.o.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
470
+ "blocks.24.cross_attn.o.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
471
+ "blocks.24.cross_attn.q.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
472
+ "blocks.24.cross_attn.q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
473
+ "blocks.24.cross_attn.v.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
474
+ "blocks.24.cross_attn.v.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
475
+ "blocks.24.ffn.0.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
476
+ "blocks.24.ffn.0.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
477
+ "blocks.24.ffn.2.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
478
+ "blocks.24.ffn.2.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
479
+ "blocks.24.modulation": "diffusion_pytorch_model-00002-of-00003.safetensors",
480
+ "blocks.24.norm3.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
481
+ "blocks.24.norm3.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
482
+ "blocks.24.self_attn.k.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
483
+ "blocks.24.self_attn.k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
484
+ "blocks.24.self_attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
485
+ "blocks.24.self_attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
486
+ "blocks.24.self_attn.o.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
487
+ "blocks.24.self_attn.o.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
488
+ "blocks.24.self_attn.q.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
489
+ "blocks.24.self_attn.q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
490
+ "blocks.24.self_attn.v.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
491
+ "blocks.24.self_attn.v.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
492
+ "blocks.25.cross_attn.k.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
493
+ "blocks.25.cross_attn.k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
494
+ "blocks.25.cross_attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
495
+ "blocks.25.cross_attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
496
+ "blocks.25.cross_attn.o.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
497
+ "blocks.25.cross_attn.o.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
498
+ "blocks.25.cross_attn.q.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
499
+ "blocks.25.cross_attn.q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
500
+ "blocks.25.cross_attn.v.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
501
+ "blocks.25.cross_attn.v.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
502
+ "blocks.25.ffn.0.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
503
+ "blocks.25.ffn.0.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
504
+ "blocks.25.ffn.2.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
505
+ "blocks.25.ffn.2.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
506
+ "blocks.25.modulation": "diffusion_pytorch_model-00002-of-00003.safetensors",
507
+ "blocks.25.norm3.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
508
+ "blocks.25.norm3.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
509
+ "blocks.25.self_attn.k.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
510
+ "blocks.25.self_attn.k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
511
+ "blocks.25.self_attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
512
+ "blocks.25.self_attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
513
+ "blocks.25.self_attn.o.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
514
+ "blocks.25.self_attn.o.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
515
+ "blocks.25.self_attn.q.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
516
+ "blocks.25.self_attn.q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
517
+ "blocks.25.self_attn.v.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
518
+ "blocks.25.self_attn.v.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
519
+ "blocks.26.cross_attn.k.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
520
+ "blocks.26.cross_attn.k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
521
+ "blocks.26.cross_attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
522
+ "blocks.26.cross_attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
523
+ "blocks.26.cross_attn.o.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
524
+ "blocks.26.cross_attn.o.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
525
+ "blocks.26.cross_attn.q.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
526
+ "blocks.26.cross_attn.q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
527
+ "blocks.26.cross_attn.v.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
528
+ "blocks.26.cross_attn.v.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
529
+ "blocks.26.ffn.0.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
530
+ "blocks.26.ffn.0.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
531
+ "blocks.26.ffn.2.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
532
+ "blocks.26.ffn.2.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
533
+ "blocks.26.modulation": "diffusion_pytorch_model-00002-of-00003.safetensors",
534
+ "blocks.26.norm3.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
535
+ "blocks.26.norm3.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
536
+ "blocks.26.self_attn.k.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
537
+ "blocks.26.self_attn.k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
538
+ "blocks.26.self_attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
539
+ "blocks.26.self_attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
540
+ "blocks.26.self_attn.o.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
541
+ "blocks.26.self_attn.o.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
542
+ "blocks.26.self_attn.q.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
543
+ "blocks.26.self_attn.q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
544
+ "blocks.26.self_attn.v.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
545
+ "blocks.26.self_attn.v.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
546
+ "blocks.27.cross_attn.k.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
547
+ "blocks.27.cross_attn.k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
548
+ "blocks.27.cross_attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
549
+ "blocks.27.cross_attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
550
+ "blocks.27.cross_attn.o.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
551
+ "blocks.27.cross_attn.o.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
552
+ "blocks.27.cross_attn.q.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
553
+ "blocks.27.cross_attn.q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
554
+ "blocks.27.cross_attn.v.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
555
+ "blocks.27.cross_attn.v.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
556
+ "blocks.27.ffn.0.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
557
+ "blocks.27.ffn.0.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
558
+ "blocks.27.ffn.2.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
559
+ "blocks.27.ffn.2.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
560
+ "blocks.27.modulation": "diffusion_pytorch_model-00002-of-00003.safetensors",
561
+ "blocks.27.norm3.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
562
+ "blocks.27.norm3.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
563
+ "blocks.27.self_attn.k.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
564
+ "blocks.27.self_attn.k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
565
+ "blocks.27.self_attn.norm_k.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
566
+ "blocks.27.self_attn.norm_q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
567
+ "blocks.27.self_attn.o.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
568
+ "blocks.27.self_attn.o.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
569
+ "blocks.27.self_attn.q.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
570
+ "blocks.27.self_attn.q.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
571
+ "blocks.27.self_attn.v.bias": "diffusion_pytorch_model-00002-of-00003.safetensors",
572
+ "blocks.27.self_attn.v.weight": "diffusion_pytorch_model-00002-of-00003.safetensors",
573
+ "blocks.28.cross_attn.k.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
574
+ "blocks.28.cross_attn.k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
575
+ "blocks.28.cross_attn.norm_k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
576
+ "blocks.28.cross_attn.norm_q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
577
+ "blocks.28.cross_attn.o.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
578
+ "blocks.28.cross_attn.o.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
579
+ "blocks.28.cross_attn.q.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
580
+ "blocks.28.cross_attn.q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
581
+ "blocks.28.cross_attn.v.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
582
+ "blocks.28.cross_attn.v.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
583
+ "blocks.28.ffn.0.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
584
+ "blocks.28.ffn.0.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
585
+ "blocks.28.ffn.2.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
586
+ "blocks.28.ffn.2.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
587
+ "blocks.28.modulation": "diffusion_pytorch_model-00003-of-00003.safetensors",
588
+ "blocks.28.norm3.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
589
+ "blocks.28.norm3.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
590
+ "blocks.28.self_attn.k.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
591
+ "blocks.28.self_attn.k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
592
+ "blocks.28.self_attn.norm_k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
593
+ "blocks.28.self_attn.norm_q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
594
+ "blocks.28.self_attn.o.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
595
+ "blocks.28.self_attn.o.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
596
+ "blocks.28.self_attn.q.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
597
+ "blocks.28.self_attn.q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
598
+ "blocks.28.self_attn.v.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
599
+ "blocks.28.self_attn.v.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
600
+ "blocks.29.cross_attn.k.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
601
+ "blocks.29.cross_attn.k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
602
+ "blocks.29.cross_attn.norm_k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
603
+ "blocks.29.cross_attn.norm_q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
604
+ "blocks.29.cross_attn.o.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
605
+ "blocks.29.cross_attn.o.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
606
+ "blocks.29.cross_attn.q.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
607
+ "blocks.29.cross_attn.q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
608
+ "blocks.29.cross_attn.v.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
609
+ "blocks.29.cross_attn.v.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
610
+ "blocks.29.ffn.0.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
611
+ "blocks.29.ffn.0.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
612
+ "blocks.29.ffn.2.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
613
+ "blocks.29.ffn.2.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
614
+ "blocks.29.modulation": "diffusion_pytorch_model-00003-of-00003.safetensors",
615
+ "blocks.29.norm3.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
616
+ "blocks.29.norm3.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
617
+ "blocks.29.self_attn.k.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
618
+ "blocks.29.self_attn.k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
619
+ "blocks.29.self_attn.norm_k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
620
+ "blocks.29.self_attn.norm_q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
621
+ "blocks.29.self_attn.o.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
622
+ "blocks.29.self_attn.o.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
623
+ "blocks.29.self_attn.q.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
624
+ "blocks.29.self_attn.q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
625
+ "blocks.29.self_attn.v.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
626
+ "blocks.29.self_attn.v.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
627
+ "blocks.3.cross_attn.k.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
628
+ "blocks.3.cross_attn.k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
629
+ "blocks.3.cross_attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
630
+ "blocks.3.cross_attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
631
+ "blocks.3.cross_attn.o.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
632
+ "blocks.3.cross_attn.o.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
633
+ "blocks.3.cross_attn.q.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
634
+ "blocks.3.cross_attn.q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
635
+ "blocks.3.cross_attn.v.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
636
+ "blocks.3.cross_attn.v.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
637
+ "blocks.3.ffn.0.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
638
+ "blocks.3.ffn.0.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
639
+ "blocks.3.ffn.2.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
640
+ "blocks.3.ffn.2.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
641
+ "blocks.3.modulation": "diffusion_pytorch_model-00001-of-00003.safetensors",
642
+ "blocks.3.norm3.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
643
+ "blocks.3.norm3.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
644
+ "blocks.3.self_attn.k.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
645
+ "blocks.3.self_attn.k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
646
+ "blocks.3.self_attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
647
+ "blocks.3.self_attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
648
+ "blocks.3.self_attn.o.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
649
+ "blocks.3.self_attn.o.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
650
+ "blocks.3.self_attn.q.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
651
+ "blocks.3.self_attn.q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
652
+ "blocks.3.self_attn.v.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
653
+ "blocks.3.self_attn.v.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
654
+ "blocks.30.cross_attn.k.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
655
+ "blocks.30.cross_attn.k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
656
+ "blocks.30.cross_attn.norm_k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
657
+ "blocks.30.cross_attn.norm_q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
658
+ "blocks.30.cross_attn.o.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
659
+ "blocks.30.cross_attn.o.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
660
+ "blocks.30.cross_attn.q.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
661
+ "blocks.30.cross_attn.q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
662
+ "blocks.30.cross_attn.v.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
663
+ "blocks.30.cross_attn.v.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
664
+ "blocks.30.ffn.0.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
665
+ "blocks.30.ffn.0.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
666
+ "blocks.30.ffn.2.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
667
+ "blocks.30.ffn.2.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
668
+ "blocks.30.modulation": "diffusion_pytorch_model-00003-of-00003.safetensors",
669
+ "blocks.30.norm3.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
670
+ "blocks.30.norm3.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
671
+ "blocks.30.self_attn.k.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
672
+ "blocks.30.self_attn.k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
673
+ "blocks.30.self_attn.norm_k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
674
+ "blocks.30.self_attn.norm_q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
675
+ "blocks.30.self_attn.o.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
676
+ "blocks.30.self_attn.o.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
677
+ "blocks.30.self_attn.q.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
678
+ "blocks.30.self_attn.q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
679
+ "blocks.30.self_attn.v.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
680
+ "blocks.30.self_attn.v.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
681
+ "blocks.31.cross_attn.k.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
682
+ "blocks.31.cross_attn.k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
683
+ "blocks.31.cross_attn.norm_k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
684
+ "blocks.31.cross_attn.norm_q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
685
+ "blocks.31.cross_attn.o.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
686
+ "blocks.31.cross_attn.o.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
687
+ "blocks.31.cross_attn.q.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
688
+ "blocks.31.cross_attn.q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
689
+ "blocks.31.cross_attn.v.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
690
+ "blocks.31.cross_attn.v.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
691
+ "blocks.31.ffn.0.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
692
+ "blocks.31.ffn.0.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
693
+ "blocks.31.ffn.2.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
694
+ "blocks.31.ffn.2.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
695
+ "blocks.31.modulation": "diffusion_pytorch_model-00003-of-00003.safetensors",
696
+ "blocks.31.norm3.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
697
+ "blocks.31.norm3.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
698
+ "blocks.31.self_attn.k.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
699
+ "blocks.31.self_attn.k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
700
+ "blocks.31.self_attn.norm_k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
701
+ "blocks.31.self_attn.norm_q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
702
+ "blocks.31.self_attn.o.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
703
+ "blocks.31.self_attn.o.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
704
+ "blocks.31.self_attn.q.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
705
+ "blocks.31.self_attn.q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
706
+ "blocks.31.self_attn.v.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
707
+ "blocks.31.self_attn.v.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
708
+ "blocks.32.cross_attn.k.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
709
+ "blocks.32.cross_attn.k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
710
+ "blocks.32.cross_attn.norm_k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
711
+ "blocks.32.cross_attn.norm_q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
712
+ "blocks.32.cross_attn.o.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
713
+ "blocks.32.cross_attn.o.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
714
+ "blocks.32.cross_attn.q.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
715
+ "blocks.32.cross_attn.q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
716
+ "blocks.32.cross_attn.v.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
717
+ "blocks.32.cross_attn.v.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
718
+ "blocks.32.ffn.0.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
719
+ "blocks.32.ffn.0.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
720
+ "blocks.32.ffn.2.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
721
+ "blocks.32.ffn.2.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
722
+ "blocks.32.modulation": "diffusion_pytorch_model-00003-of-00003.safetensors",
723
+ "blocks.32.norm3.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
724
+ "blocks.32.norm3.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
725
+ "blocks.32.self_attn.k.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
726
+ "blocks.32.self_attn.k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
727
+ "blocks.32.self_attn.norm_k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
728
+ "blocks.32.self_attn.norm_q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
729
+ "blocks.32.self_attn.o.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
730
+ "blocks.32.self_attn.o.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
731
+ "blocks.32.self_attn.q.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
732
+ "blocks.32.self_attn.q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
733
+ "blocks.32.self_attn.v.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
734
+ "blocks.32.self_attn.v.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
735
+ "blocks.33.cross_attn.k.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
736
+ "blocks.33.cross_attn.k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
737
+ "blocks.33.cross_attn.norm_k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
738
+ "blocks.33.cross_attn.norm_q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
739
+ "blocks.33.cross_attn.o.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
740
+ "blocks.33.cross_attn.o.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
741
+ "blocks.33.cross_attn.q.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
742
+ "blocks.33.cross_attn.q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
743
+ "blocks.33.cross_attn.v.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
744
+ "blocks.33.cross_attn.v.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
745
+ "blocks.33.ffn.0.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
746
+ "blocks.33.ffn.0.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
747
+ "blocks.33.ffn.2.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
748
+ "blocks.33.ffn.2.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
749
+ "blocks.33.modulation": "diffusion_pytorch_model-00003-of-00003.safetensors",
750
+ "blocks.33.norm3.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
751
+ "blocks.33.norm3.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
752
+ "blocks.33.self_attn.k.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
753
+ "blocks.33.self_attn.k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
754
+ "blocks.33.self_attn.norm_k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
755
+ "blocks.33.self_attn.norm_q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
756
+ "blocks.33.self_attn.o.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
757
+ "blocks.33.self_attn.o.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
758
+ "blocks.33.self_attn.q.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
759
+ "blocks.33.self_attn.q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
760
+ "blocks.33.self_attn.v.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
761
+ "blocks.33.self_attn.v.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
762
+ "blocks.34.cross_attn.k.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
763
+ "blocks.34.cross_attn.k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
764
+ "blocks.34.cross_attn.norm_k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
765
+ "blocks.34.cross_attn.norm_q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
766
+ "blocks.34.cross_attn.o.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
767
+ "blocks.34.cross_attn.o.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
768
+ "blocks.34.cross_attn.q.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
769
+ "blocks.34.cross_attn.q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
770
+ "blocks.34.cross_attn.v.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
771
+ "blocks.34.cross_attn.v.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
772
+ "blocks.34.ffn.0.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
773
+ "blocks.34.ffn.0.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
774
+ "blocks.34.ffn.2.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
775
+ "blocks.34.ffn.2.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
776
+ "blocks.34.modulation": "diffusion_pytorch_model-00003-of-00003.safetensors",
777
+ "blocks.34.norm3.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
778
+ "blocks.34.norm3.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
779
+ "blocks.34.self_attn.k.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
780
+ "blocks.34.self_attn.k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
781
+ "blocks.34.self_attn.norm_k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
782
+ "blocks.34.self_attn.norm_q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
783
+ "blocks.34.self_attn.o.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
784
+ "blocks.34.self_attn.o.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
785
+ "blocks.34.self_attn.q.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
786
+ "blocks.34.self_attn.q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
787
+ "blocks.34.self_attn.v.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
788
+ "blocks.34.self_attn.v.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
789
+ "blocks.35.cross_attn.k.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
790
+ "blocks.35.cross_attn.k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
791
+ "blocks.35.cross_attn.norm_k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
792
+ "blocks.35.cross_attn.norm_q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
793
+ "blocks.35.cross_attn.o.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
794
+ "blocks.35.cross_attn.o.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
795
+ "blocks.35.cross_attn.q.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
796
+ "blocks.35.cross_attn.q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
797
+ "blocks.35.cross_attn.v.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
798
+ "blocks.35.cross_attn.v.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
799
+ "blocks.35.ffn.0.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
800
+ "blocks.35.ffn.0.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
801
+ "blocks.35.ffn.2.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
802
+ "blocks.35.ffn.2.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
803
+ "blocks.35.modulation": "diffusion_pytorch_model-00003-of-00003.safetensors",
804
+ "blocks.35.norm3.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
805
+ "blocks.35.norm3.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
806
+ "blocks.35.self_attn.k.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
807
+ "blocks.35.self_attn.k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
808
+ "blocks.35.self_attn.norm_k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
809
+ "blocks.35.self_attn.norm_q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
810
+ "blocks.35.self_attn.o.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
811
+ "blocks.35.self_attn.o.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
812
+ "blocks.35.self_attn.q.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
813
+ "blocks.35.self_attn.q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
814
+ "blocks.35.self_attn.v.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
815
+ "blocks.35.self_attn.v.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
816
+ "blocks.36.cross_attn.k.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
817
+ "blocks.36.cross_attn.k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
818
+ "blocks.36.cross_attn.norm_k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
819
+ "blocks.36.cross_attn.norm_q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
820
+ "blocks.36.cross_attn.o.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
821
+ "blocks.36.cross_attn.o.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
822
+ "blocks.36.cross_attn.q.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
823
+ "blocks.36.cross_attn.q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
824
+ "blocks.36.cross_attn.v.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
825
+ "blocks.36.cross_attn.v.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
826
+ "blocks.36.ffn.0.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
827
+ "blocks.36.ffn.0.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
828
+ "blocks.36.ffn.2.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
829
+ "blocks.36.ffn.2.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
830
+ "blocks.36.modulation": "diffusion_pytorch_model-00003-of-00003.safetensors",
831
+ "blocks.36.norm3.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
832
+ "blocks.36.norm3.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
833
+ "blocks.36.self_attn.k.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
834
+ "blocks.36.self_attn.k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
835
+ "blocks.36.self_attn.norm_k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
836
+ "blocks.36.self_attn.norm_q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
837
+ "blocks.36.self_attn.o.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
838
+ "blocks.36.self_attn.o.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
839
+ "blocks.36.self_attn.q.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
840
+ "blocks.36.self_attn.q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
841
+ "blocks.36.self_attn.v.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
842
+ "blocks.36.self_attn.v.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
843
+ "blocks.37.cross_attn.k.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
844
+ "blocks.37.cross_attn.k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
845
+ "blocks.37.cross_attn.norm_k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
846
+ "blocks.37.cross_attn.norm_q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
847
+ "blocks.37.cross_attn.o.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
848
+ "blocks.37.cross_attn.o.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
849
+ "blocks.37.cross_attn.q.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
850
+ "blocks.37.cross_attn.q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
851
+ "blocks.37.cross_attn.v.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
852
+ "blocks.37.cross_attn.v.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
853
+ "blocks.37.ffn.0.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
854
+ "blocks.37.ffn.0.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
855
+ "blocks.37.ffn.2.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
856
+ "blocks.37.ffn.2.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
857
+ "blocks.37.modulation": "diffusion_pytorch_model-00003-of-00003.safetensors",
858
+ "blocks.37.norm3.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
859
+ "blocks.37.norm3.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
860
+ "blocks.37.self_attn.k.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
861
+ "blocks.37.self_attn.k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
862
+ "blocks.37.self_attn.norm_k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
863
+ "blocks.37.self_attn.norm_q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
864
+ "blocks.37.self_attn.o.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
865
+ "blocks.37.self_attn.o.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
866
+ "blocks.37.self_attn.q.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
867
+ "blocks.37.self_attn.q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
868
+ "blocks.37.self_attn.v.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
869
+ "blocks.37.self_attn.v.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
870
+ "blocks.38.cross_attn.k.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
871
+ "blocks.38.cross_attn.k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
872
+ "blocks.38.cross_attn.norm_k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
873
+ "blocks.38.cross_attn.norm_q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
874
+ "blocks.38.cross_attn.o.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
875
+ "blocks.38.cross_attn.o.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
876
+ "blocks.38.cross_attn.q.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
877
+ "blocks.38.cross_attn.q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
878
+ "blocks.38.cross_attn.v.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
879
+ "blocks.38.cross_attn.v.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
880
+ "blocks.38.ffn.0.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
881
+ "blocks.38.ffn.0.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
882
+ "blocks.38.ffn.2.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
883
+ "blocks.38.ffn.2.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
884
+ "blocks.38.modulation": "diffusion_pytorch_model-00003-of-00003.safetensors",
885
+ "blocks.38.norm3.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
886
+ "blocks.38.norm3.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
887
+ "blocks.38.self_attn.k.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
888
+ "blocks.38.self_attn.k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
889
+ "blocks.38.self_attn.norm_k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
890
+ "blocks.38.self_attn.norm_q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
891
+ "blocks.38.self_attn.o.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
892
+ "blocks.38.self_attn.o.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
893
+ "blocks.38.self_attn.q.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
894
+ "blocks.38.self_attn.q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
895
+ "blocks.38.self_attn.v.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
896
+ "blocks.38.self_attn.v.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
897
+ "blocks.39.cross_attn.k.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
898
+ "blocks.39.cross_attn.k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
899
+ "blocks.39.cross_attn.norm_k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
900
+ "blocks.39.cross_attn.norm_q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
901
+ "blocks.39.cross_attn.o.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
902
+ "blocks.39.cross_attn.o.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
903
+ "blocks.39.cross_attn.q.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
904
+ "blocks.39.cross_attn.q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
905
+ "blocks.39.cross_attn.v.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
906
+ "blocks.39.cross_attn.v.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
907
+ "blocks.39.ffn.0.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
908
+ "blocks.39.ffn.0.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
909
+ "blocks.39.ffn.2.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
910
+ "blocks.39.ffn.2.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
911
+ "blocks.39.modulation": "diffusion_pytorch_model-00003-of-00003.safetensors",
912
+ "blocks.39.norm3.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
913
+ "blocks.39.norm3.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
914
+ "blocks.39.self_attn.k.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
915
+ "blocks.39.self_attn.k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
916
+ "blocks.39.self_attn.norm_k.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
917
+ "blocks.39.self_attn.norm_q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
918
+ "blocks.39.self_attn.o.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
919
+ "blocks.39.self_attn.o.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
920
+ "blocks.39.self_attn.q.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
921
+ "blocks.39.self_attn.q.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
922
+ "blocks.39.self_attn.v.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
923
+ "blocks.39.self_attn.v.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
924
+ "blocks.4.cross_attn.k.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
925
+ "blocks.4.cross_attn.k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
926
+ "blocks.4.cross_attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
927
+ "blocks.4.cross_attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
928
+ "blocks.4.cross_attn.o.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
929
+ "blocks.4.cross_attn.o.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
930
+ "blocks.4.cross_attn.q.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
931
+ "blocks.4.cross_attn.q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
932
+ "blocks.4.cross_attn.v.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
933
+ "blocks.4.cross_attn.v.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
934
+ "blocks.4.ffn.0.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
935
+ "blocks.4.ffn.0.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
936
+ "blocks.4.ffn.2.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
937
+ "blocks.4.ffn.2.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
938
+ "blocks.4.modulation": "diffusion_pytorch_model-00001-of-00003.safetensors",
939
+ "blocks.4.norm3.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
940
+ "blocks.4.norm3.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
941
+ "blocks.4.self_attn.k.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
942
+ "blocks.4.self_attn.k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
943
+ "blocks.4.self_attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
944
+ "blocks.4.self_attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
945
+ "blocks.4.self_attn.o.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
946
+ "blocks.4.self_attn.o.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
947
+ "blocks.4.self_attn.q.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
948
+ "blocks.4.self_attn.q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
949
+ "blocks.4.self_attn.v.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
950
+ "blocks.4.self_attn.v.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
951
+ "blocks.5.cross_attn.k.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
952
+ "blocks.5.cross_attn.k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
953
+ "blocks.5.cross_attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
954
+ "blocks.5.cross_attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
955
+ "blocks.5.cross_attn.o.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
956
+ "blocks.5.cross_attn.o.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
957
+ "blocks.5.cross_attn.q.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
958
+ "blocks.5.cross_attn.q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
959
+ "blocks.5.cross_attn.v.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
960
+ "blocks.5.cross_attn.v.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
961
+ "blocks.5.ffn.0.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
962
+ "blocks.5.ffn.0.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
963
+ "blocks.5.ffn.2.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
964
+ "blocks.5.ffn.2.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
965
+ "blocks.5.modulation": "diffusion_pytorch_model-00001-of-00003.safetensors",
966
+ "blocks.5.norm3.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
967
+ "blocks.5.norm3.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
968
+ "blocks.5.self_attn.k.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
969
+ "blocks.5.self_attn.k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
970
+ "blocks.5.self_attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
971
+ "blocks.5.self_attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
972
+ "blocks.5.self_attn.o.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
973
+ "blocks.5.self_attn.o.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
974
+ "blocks.5.self_attn.q.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
975
+ "blocks.5.self_attn.q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
976
+ "blocks.5.self_attn.v.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
977
+ "blocks.5.self_attn.v.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
978
+ "blocks.6.cross_attn.k.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
979
+ "blocks.6.cross_attn.k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
980
+ "blocks.6.cross_attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
981
+ "blocks.6.cross_attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
982
+ "blocks.6.cross_attn.o.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
983
+ "blocks.6.cross_attn.o.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
984
+ "blocks.6.cross_attn.q.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
985
+ "blocks.6.cross_attn.q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
986
+ "blocks.6.cross_attn.v.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
987
+ "blocks.6.cross_attn.v.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
988
+ "blocks.6.ffn.0.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
989
+ "blocks.6.ffn.0.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
990
+ "blocks.6.ffn.2.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
991
+ "blocks.6.ffn.2.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
992
+ "blocks.6.modulation": "diffusion_pytorch_model-00001-of-00003.safetensors",
993
+ "blocks.6.norm3.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
994
+ "blocks.6.norm3.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
995
+ "blocks.6.self_attn.k.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
996
+ "blocks.6.self_attn.k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
997
+ "blocks.6.self_attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
998
+ "blocks.6.self_attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
999
+ "blocks.6.self_attn.o.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1000
+ "blocks.6.self_attn.o.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1001
+ "blocks.6.self_attn.q.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1002
+ "blocks.6.self_attn.q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1003
+ "blocks.6.self_attn.v.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1004
+ "blocks.6.self_attn.v.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1005
+ "blocks.7.cross_attn.k.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1006
+ "blocks.7.cross_attn.k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1007
+ "blocks.7.cross_attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1008
+ "blocks.7.cross_attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1009
+ "blocks.7.cross_attn.o.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1010
+ "blocks.7.cross_attn.o.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1011
+ "blocks.7.cross_attn.q.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1012
+ "blocks.7.cross_attn.q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1013
+ "blocks.7.cross_attn.v.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1014
+ "blocks.7.cross_attn.v.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1015
+ "blocks.7.ffn.0.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1016
+ "blocks.7.ffn.0.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1017
+ "blocks.7.ffn.2.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1018
+ "blocks.7.ffn.2.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1019
+ "blocks.7.modulation": "diffusion_pytorch_model-00001-of-00003.safetensors",
1020
+ "blocks.7.norm3.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1021
+ "blocks.7.norm3.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1022
+ "blocks.7.self_attn.k.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1023
+ "blocks.7.self_attn.k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1024
+ "blocks.7.self_attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1025
+ "blocks.7.self_attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1026
+ "blocks.7.self_attn.o.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1027
+ "blocks.7.self_attn.o.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1028
+ "blocks.7.self_attn.q.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1029
+ "blocks.7.self_attn.q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1030
+ "blocks.7.self_attn.v.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1031
+ "blocks.7.self_attn.v.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1032
+ "blocks.8.cross_attn.k.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1033
+ "blocks.8.cross_attn.k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1034
+ "blocks.8.cross_attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1035
+ "blocks.8.cross_attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1036
+ "blocks.8.cross_attn.o.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1037
+ "blocks.8.cross_attn.o.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1038
+ "blocks.8.cross_attn.q.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1039
+ "blocks.8.cross_attn.q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1040
+ "blocks.8.cross_attn.v.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1041
+ "blocks.8.cross_attn.v.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1042
+ "blocks.8.ffn.0.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1043
+ "blocks.8.ffn.0.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1044
+ "blocks.8.ffn.2.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1045
+ "blocks.8.ffn.2.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1046
+ "blocks.8.modulation": "diffusion_pytorch_model-00001-of-00003.safetensors",
1047
+ "blocks.8.norm3.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1048
+ "blocks.8.norm3.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1049
+ "blocks.8.self_attn.k.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1050
+ "blocks.8.self_attn.k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1051
+ "blocks.8.self_attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1052
+ "blocks.8.self_attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1053
+ "blocks.8.self_attn.o.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1054
+ "blocks.8.self_attn.o.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1055
+ "blocks.8.self_attn.q.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1056
+ "blocks.8.self_attn.q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1057
+ "blocks.8.self_attn.v.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1058
+ "blocks.8.self_attn.v.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1059
+ "blocks.9.cross_attn.k.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1060
+ "blocks.9.cross_attn.k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1061
+ "blocks.9.cross_attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1062
+ "blocks.9.cross_attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1063
+ "blocks.9.cross_attn.o.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1064
+ "blocks.9.cross_attn.o.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1065
+ "blocks.9.cross_attn.q.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1066
+ "blocks.9.cross_attn.q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1067
+ "blocks.9.cross_attn.v.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1068
+ "blocks.9.cross_attn.v.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1069
+ "blocks.9.ffn.0.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1070
+ "blocks.9.ffn.0.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1071
+ "blocks.9.ffn.2.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1072
+ "blocks.9.ffn.2.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1073
+ "blocks.9.modulation": "diffusion_pytorch_model-00001-of-00003.safetensors",
1074
+ "blocks.9.norm3.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1075
+ "blocks.9.norm3.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1076
+ "blocks.9.self_attn.k.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1077
+ "blocks.9.self_attn.k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1078
+ "blocks.9.self_attn.norm_k.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1079
+ "blocks.9.self_attn.norm_q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1080
+ "blocks.9.self_attn.o.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1081
+ "blocks.9.self_attn.o.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1082
+ "blocks.9.self_attn.q.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1083
+ "blocks.9.self_attn.q.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1084
+ "blocks.9.self_attn.v.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1085
+ "blocks.9.self_attn.v.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1086
+ "head.head.bias": "diffusion_pytorch_model-00003-of-00003.safetensors",
1087
+ "head.head.weight": "diffusion_pytorch_model-00003-of-00003.safetensors",
1088
+ "head.modulation": "diffusion_pytorch_model-00003-of-00003.safetensors",
1089
+ "patch_embedding.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1090
+ "patch_embedding.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1091
+ "text_embedding.0.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1092
+ "text_embedding.0.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1093
+ "text_embedding.2.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1094
+ "text_embedding.2.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1095
+ "time_embedding.0.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1096
+ "time_embedding.0.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1097
+ "time_embedding.2.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1098
+ "time_embedding.2.weight": "diffusion_pytorch_model-00001-of-00003.safetensors",
1099
+ "time_projection.1.bias": "diffusion_pytorch_model-00001-of-00003.safetensors",
1100
+ "time_projection.1.weight": "diffusion_pytorch_model-00001-of-00003.safetensors"
1101
+ }
1102
+ }
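
The `weight_map` above assigns every transformer parameter to one of three sharded `.safetensors` files. As a minimal sketch of how such a sharded checkpoint can be reassembled by hand — assuming the shards and their index were downloaded into a local `transformer/` folder and that the index uses the conventional `diffusion_pytorch_model.safetensors.index.json` filename, neither of which is shown in this excerpt:

```py
import json
from pathlib import Path

from safetensors.torch import load_file

# Assumed local layout; adjust to wherever the shards were downloaded.
shard_dir = Path("transformer")
index_path = shard_dir / "diffusion_pytorch_model.safetensors.index.json"

with open(index_path) as f:
    index = json.load(f)

# Load each shard once and merge everything into a single state dict keyed by parameter name.
state_dict = {}
for shard_name in sorted(set(index["weight_map"].values())):
    state_dict.update(load_file(shard_dir / shard_name, device="cpu"))

print(f"{len(state_dict)} tensors loaded")  # e.g. blocks.9.self_attn.q.weight, head.head.bias, ...
```

In normal use `diffusers` resolves this index automatically when loading the model, so merging shards by hand is only needed for custom tooling.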
transformer/model.py ADDED
@@ -0,0 +1,1002 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import math
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
7
+ from diffusers.models.modeling_utils import ModelMixin
8
+ from einops import repeat
9
+
10
+ from .attention import (
11
+ flash_attention,
12
+ sageattn_func,
13
+ _SAGEATTN_AVAILABLE,
14
+ _FLASH_ATTN_2_AVAILABLE,
15
+ _FLASH_ATTN_3_AVAILABLE,
16
+ )
17
+
18
+ print("SAGEATTN_AVAILABLE:", _SAGEATTN_AVAILABLE)
19
+
20
+ __all__ = ["WanModel"]
21
+
22
+
23
+ def sinusoidal_embedding_1d(dim, position):
24
+ # preprocess
25
+ assert dim % 2 == 0
26
+ half = dim // 2
27
+ position = position.type(torch.float64)
28
+
29
+ # calculation
30
+ sinusoid = torch.outer(
31
+ position,
32
+ torch.pow(
33
+ 10000,
34
+ -torch.arange(
35
+ half, device=torch.cuda.current_device(), dtype=torch.float64
36
+ ).div(half),
37
+ ),
38
+ )
39
+ x = torch.cat([torch.cos(sinusoid), torch.sin(sinusoid)], dim=1)
40
+ return x
41
+
42
+
43
+ # @amp.autocast(enabled=False)
44
+ def rope_params(max_seq_len, dim, theta=10000):
45
+ assert dim % 2 == 0
46
+ freqs = torch.outer(
47
+ torch.arange(max_seq_len),
48
+ 1.0 / torch.pow(theta, torch.arange(0, dim, 2).to(torch.float64).div(dim)),
49
+ )
50
+ freqs = torch.polar(torch.ones_like(freqs), freqs)
51
+ return freqs
52
+
53
+
54
+ # @amp.autocast(enabled=False)
55
+ def rope_apply(x, grid_sizes, freqs):
56
+ n, c = x.size(2), x.size(3) // 2
57
+
58
+ # split freqs
59
+ freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)
60
+
61
+ # loop over samples
62
+ output = []
63
+ for i, (f, h, w) in enumerate(grid_sizes.tolist()):
64
+ seq_len = f * h * w
65
+
66
+ # precompute multipliers
67
+ x_i = torch.view_as_complex(
68
+ x[i, :seq_len].to(torch.float64).reshape(seq_len, n, -1, 2)
69
+ )
70
+ freqs_i = torch.cat(
71
+ [
72
+ freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
73
+ freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
74
+ freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1),
75
+ ],
76
+ dim=-1,
77
+ ).reshape(seq_len, 1, -1)
78
+
79
+ # apply rotary embedding
80
+ x_i = torch.view_as_real(x_i * freqs_i).flatten(2)
81
+ x_i = torch.cat([x_i, x[i, seq_len:]])
82
+
83
+ # append to collection
84
+ output.append(x_i)
85
+ return torch.stack(output).type_as(x)
86
+
87
+
88
+ class WanRMSNorm(nn.Module):
89
+ def __init__(self, dim, eps=1e-5):
90
+ super().__init__()
91
+ self.dim = dim
92
+ self.eps = eps
93
+ self.weight = nn.Parameter(torch.ones(dim))
94
+
95
+ def forward(self, x):
96
+ r"""
97
+ Args:
98
+ x(Tensor): Shape [B, L, C]
99
+ """
100
+ return self._norm(x.float()).type_as(x) * self.weight
101
+
102
+ def _norm(self, x):
103
+ return x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
104
+
105
+
106
+ class WanLayerNorm(nn.LayerNorm):
107
+ def __init__(self, dim, eps=1e-6, elementwise_affine=False):
108
+ super().__init__(dim, elementwise_affine=elementwise_affine, eps=eps)
109
+
110
+ def forward(self, x):
111
+ r"""
112
+ Args:
113
+ x(Tensor): Shape [B, L, C]
114
+ """
115
+ return super().forward(x).type_as(x)
116
+
117
+
118
+ class WanSelfAttention(nn.Module):
119
+ def __init__(self, dim, num_heads, window_size=(-1, -1), qk_norm=True, eps=1e-6):
120
+ assert dim % num_heads == 0
121
+ super().__init__()
122
+ self.dim = dim
123
+ self.num_heads = num_heads
124
+ self.head_dim = dim // num_heads
125
+ self.window_size = window_size
126
+ self.qk_norm = qk_norm
127
+ self.eps = eps
128
+
129
+ # layers
130
+ self.q = nn.Linear(dim, dim)
131
+ self.k = nn.Linear(dim, dim)
132
+ self.v = nn.Linear(dim, dim)
133
+ self.o = nn.Linear(dim, dim)
134
+ self.norm_q = WanRMSNorm(dim, eps=eps) if qk_norm else nn.Identity()
135
+ self.norm_k = WanRMSNorm(dim, eps=eps) if qk_norm else nn.Identity()
136
+
137
+ def forward(self, x, seq_lens, grid_sizes, freqs):
138
+ r"""
139
+ Args:
140
+ x(Tensor): Shape [B, L, num_heads, C / num_heads]
141
+ seq_lens(Tensor): Shape [B]
142
+ grid_sizes(Tensor): Shape [B, 3], the second dimension contains (F, H, W)
143
+ freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2]
144
+ """
145
+ b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim
146
+
147
+ # query, key, value function
148
+ def qkv_fn(x):
149
+ q = self.norm_q(self.q(x)).view(b, s, n, d)
150
+ k = self.norm_k(self.k(x)).view(b, s, n, d)
151
+ v = self.v(x).view(b, s, n, d)
152
+ return q, k, v
153
+
154
+ q, k, v = qkv_fn(x)
155
+
156
+ if _SAGEATTN_AVAILABLE:
+ # sageattention expects fp16/bf16 inputs: cast to bf16 here (mirroring the
+ # cross-attention path) and restore the original dtype after attention
+ dtype = torch.bfloat16
+ og_dtype = q.dtype
+ q = q.transpose(1, 2).to(dtype)
+ k = k.transpose(1, 2).to(dtype)
+ v = v.transpose(1, 2).to(dtype)
162
+ x = sageattn_func(
163
+ q=rope_apply(q, grid_sizes, freqs),
164
+ k=rope_apply(k, grid_sizes, freqs),
165
+ v=v,
166
+ )
167
+ x = x.transpose(1, 2).contiguous().to(og_dtype)
168
+ else:
169
+ x = flash_attention(
170
+ q=rope_apply(q, grid_sizes, freqs),
171
+ k=rope_apply(k, grid_sizes, freqs),
172
+ v=v,
173
+ k_lens=seq_lens,
174
+ window_size=self.window_size,
175
+ )
176
+
177
+ # output
178
+ x = x.flatten(2)
179
+ x = self.o(x)
180
+ return x
181
+
182
+
183
+ class WanT2VCrossAttention(WanSelfAttention):
184
+ def forward(self, x, context, context_lens, crossattn_cache=None):
185
+ r"""
186
+ Args:
187
+ x(Tensor): Shape [B, L1, C]
188
+ context(Tensor): Shape [B, L2, C]
189
+ context_lens(Tensor): Shape [B]
190
+ crossattn_cache (List[dict], *optional*): Contains the cached key and value tensors for context embedding.
191
+ """
192
+ b, n, d = x.size(0), self.num_heads, self.head_dim
193
+
194
+ # compute query, key, value
195
+ q = self.norm_q(self.q(x)).view(b, -1, n, d)
196
+
197
+ if crossattn_cache is not None:
198
+ if not crossattn_cache["is_init"]:
199
+ crossattn_cache["is_init"] = True
200
+ k = self.norm_k(self.k(context)).view(b, -1, n, d)
201
+ v = self.v(context).view(b, -1, n, d)
202
+ crossattn_cache["k"] = k
203
+ crossattn_cache["v"] = v
204
+ else:
205
+ k = crossattn_cache["k"]
206
+ v = crossattn_cache["v"]
207
+ else:
208
+ k = self.norm_k(self.k(context)).view(b, -1, n, d)
209
+ v = self.v(context).view(b, -1, n, d)
210
+
211
+ # compute attention
212
+ if _SAGEATTN_AVAILABLE:
213
+ # print("Using sageattention in crossattn")
214
+ dtype = torch.bfloat16
215
+ og_dtype = q.dtype
216
+ q = q.transpose(1, 2).to(dtype)
217
+ k = k.transpose(1, 2).to(dtype)
218
+ v = v.transpose(1, 2).to(dtype)
219
+ x = sageattn_func(
220
+ q=q,
221
+ k=k,
222
+ v=v,
223
+ )
224
+ x = x.transpose(1, 2).contiguous().to(og_dtype)
225
+ elif _FLASH_ATTN_2_AVAILABLE or _FLASH_ATTN_3_AVAILABLE:
226
+ x = flash_attention(q, k, v, k_lens=context_lens)
227
+ else:
228
+ dtype = torch.bfloat16
229
+ q = q.transpose(1, 2).to(dtype)
230
+ k = k.transpose(1, 2).to(dtype)
231
+ v = v.transpose(1, 2).to(dtype)
232
+
233
+ x = torch.nn.functional.scaled_dot_product_attention(q, k, v)
234
+ x = x.transpose(1, 2).contiguous()
235
+
236
+ # output
237
+ x = x.flatten(2)
238
+ x = self.o(x)
239
+ return x
240
+
241
+
242
+ class WanGanCrossAttention(WanSelfAttention):
243
+ def forward(self, x, context, crossattn_cache=None):
244
+ r"""
245
+ Args:
246
+ x(Tensor): Shape [B, L1, C]
247
+ context(Tensor): Shape [B, L2, C]
248
+ context_lens(Tensor): Shape [B]
249
+ crossattn_cache (List[dict], *optional*): Contains the cached key and value tensors for context embedding.
250
+ """
251
+ b, n, d = x.size(0), self.num_heads, self.head_dim
252
+
253
+ # compute query, key, value
254
+ qq = self.norm_q(self.q(context)).view(b, 1, -1, d)
255
+
256
+ kk = self.norm_k(self.k(x)).view(b, -1, n, d)
257
+ vv = self.v(x).view(b, -1, n, d)
258
+
259
+ # compute attention
260
+ x = flash_attention(qq, kk, vv)
261
+
262
+ # output
263
+ x = x.flatten(2)
264
+ x = self.o(x)
265
+ return x
266
+
267
+
268
+ class WanI2VCrossAttention(WanSelfAttention):
269
+ def __init__(self, dim, num_heads, window_size=(-1, -1), qk_norm=True, eps=1e-6):
270
+ super().__init__(dim, num_heads, window_size, qk_norm, eps)
271
+
272
+ self.k_img = nn.Linear(dim, dim)
273
+ self.v_img = nn.Linear(dim, dim)
274
+ # self.alpha = nn.Parameter(torch.zeros((1, )))
275
+ self.norm_k_img = WanRMSNorm(dim, eps=eps) if qk_norm else nn.Identity()
276
+
277
+ def forward(self, x, context, context_lens):
278
+ r"""
279
+ Args:
280
+ x(Tensor): Shape [B, L1, C]
281
+ context(Tensor): Shape [B, L2, C]
282
+ context_lens(Tensor): Shape [B]
283
+ """
284
+ context_img = context[:, :257]
285
+ context = context[:, 257:]
286
+ b, n, d = x.size(0), self.num_heads, self.head_dim
287
+
288
+ # compute query, key, value
289
+ q = self.norm_q(self.q(x)).view(b, -1, n, d)
290
+ k = self.norm_k(self.k(context)).view(b, -1, n, d)
291
+ v = self.v(context).view(b, -1, n, d)
292
+ k_img = self.norm_k_img(self.k_img(context_img)).view(b, -1, n, d)
293
+ v_img = self.v_img(context_img).view(b, -1, n, d)
294
+ img_x = flash_attention(q, k_img, v_img, k_lens=None)
295
+ # compute attention
296
+ x = flash_attention(q, k, v, k_lens=context_lens)
297
+
298
+ # output
299
+ x = x.flatten(2)
300
+ img_x = img_x.flatten(2)
301
+ x = x + img_x
302
+ x = self.o(x)
303
+ return x
304
+
305
+
306
+ WAN_CROSSATTENTION_CLASSES = {
307
+ "t2v_cross_attn": WanT2VCrossAttention,
308
+ "i2v_cross_attn": WanI2VCrossAttention,
309
+ }
310
+
311
+
312
+ class WanAttentionBlock(nn.Module):
313
+ def __init__(
314
+ self,
315
+ cross_attn_type,
316
+ dim,
317
+ ffn_dim,
318
+ num_heads,
319
+ window_size=(-1, -1),
320
+ qk_norm=True,
321
+ cross_attn_norm=False,
322
+ eps=1e-6,
323
+ ):
324
+ super().__init__()
325
+ self.dim = dim
326
+ self.ffn_dim = ffn_dim
327
+ self.num_heads = num_heads
328
+ self.window_size = window_size
329
+ self.qk_norm = qk_norm
330
+ self.cross_attn_norm = cross_attn_norm
331
+ self.eps = eps
332
+
333
+ # layers
334
+ self.norm1 = WanLayerNorm(dim, eps)
335
+ self.self_attn = WanSelfAttention(dim, num_heads, window_size, qk_norm, eps)
336
+ self.norm3 = (
337
+ WanLayerNorm(dim, eps, elementwise_affine=True)
338
+ if cross_attn_norm
339
+ else nn.Identity()
340
+ )
341
+ self.cross_attn = WAN_CROSSATTENTION_CLASSES[cross_attn_type](
342
+ dim, num_heads, (-1, -1), qk_norm, eps
343
+ )
344
+ self.norm2 = WanLayerNorm(dim, eps)
345
+ self.ffn = nn.Sequential(
346
+ nn.Linear(dim, ffn_dim),
347
+ nn.GELU(approximate="tanh"),
348
+ nn.Linear(ffn_dim, dim),
349
+ )
350
+
351
+ # modulation
352
+ self.modulation = nn.Parameter(torch.randn(1, 6, dim) / dim**0.5)
353
+
354
+ def forward(
355
+ self,
356
+ x,
357
+ e,
358
+ seq_lens,
359
+ grid_sizes,
360
+ freqs,
361
+ context,
362
+ context_lens,
363
+ ):
364
+ r"""
365
+ Args:
366
+ x(Tensor): Shape [B, L, C]
367
+ e(Tensor): Shape [B, 6, C]
368
+ seq_lens(Tensor): Shape [B], length of each sequence in batch
369
+ grid_sizes(Tensor): Shape [B, 3], the second dimension contains (F, H, W)
370
+ freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2]
371
+ """
372
+ # assert e.dtype == torch.float32
373
+ # with amp.autocast(dtype=torch.float32):
374
+ e = (self.modulation + e).chunk(6, dim=1)
375
+ # assert e[0].dtype == torch.float32
376
+
377
+ # self-attention
378
+ y = self.self_attn(
379
+ self.norm1(x) * (1 + e[1]) + e[0], seq_lens, grid_sizes, freqs
380
+ )
381
+ # with amp.autocast(dtype=torch.float32):
382
+ x = x + y * e[2]
383
+
384
+ # cross-attention & ffn function
385
+ def cross_attn_ffn(x, context, context_lens, e):
386
+ x = x + self.cross_attn(self.norm3(x), context, context_lens)
387
+ y = self.ffn(self.norm2(x) * (1 + e[4]) + e[3])
388
+ # with amp.autocast(dtype=torch.float32):
389
+ x = x + y * e[5]
390
+ return x
391
+
392
+ x = cross_attn_ffn(x, context, context_lens, e)
393
+ return x
394
+
395
+
396
+ class GanAttentionBlock(nn.Module):
397
+ def __init__(
398
+ self,
399
+ dim=1536,
400
+ ffn_dim=8192,
401
+ num_heads=12,
402
+ window_size=(-1, -1),
403
+ qk_norm=True,
404
+ cross_attn_norm=True,
405
+ eps=1e-6,
406
+ ):
407
+ super().__init__()
408
+ self.dim = dim
409
+ self.ffn_dim = ffn_dim
410
+ self.num_heads = num_heads
411
+ self.window_size = window_size
412
+ self.qk_norm = qk_norm
413
+ self.cross_attn_norm = cross_attn_norm
414
+ self.eps = eps
415
+
416
+ # layers
417
+ # self.norm1 = WanLayerNorm(dim, eps)
418
+ # self.self_attn = WanSelfAttention(dim, num_heads, window_size, qk_norm,
419
+ # eps)
420
+ self.norm3 = (
421
+ WanLayerNorm(dim, eps, elementwise_affine=True)
422
+ if cross_attn_norm
423
+ else nn.Identity()
424
+ )
425
+
426
+ self.norm2 = WanLayerNorm(dim, eps)
427
+ self.ffn = nn.Sequential(
428
+ nn.Linear(dim, ffn_dim),
429
+ nn.GELU(approximate="tanh"),
430
+ nn.Linear(ffn_dim, dim),
431
+ )
432
+
433
+ self.cross_attn = WanGanCrossAttention(dim, num_heads, (-1, -1), qk_norm, eps)
434
+
435
+ # modulation
436
+ # self.modulation = nn.Parameter(torch.randn(1, 6, dim) / dim**0.5)
437
+
438
+ def forward(
439
+ self,
440
+ x,
441
+ context,
442
+ # seq_lens,
443
+ # grid_sizes,
444
+ # freqs,
445
+ # context,
446
+ # context_lens,
447
+ ):
448
+ r"""
449
+ Args:
450
+ x(Tensor): Shape [B, L, C]
451
+ e(Tensor): Shape [B, 6, C]
452
+ seq_lens(Tensor): Shape [B], length of each sequence in batch
453
+ grid_sizes(Tensor): Shape [B, 3], the second dimension contains (F, H, W)
454
+ freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2]
455
+ """
456
+ # assert e.dtype == torch.float32
457
+ # with amp.autocast(dtype=torch.float32):
458
+ # e = (self.modulation + e).chunk(6, dim=1)
459
+ # assert e[0].dtype == torch.float32
460
+
461
+ # # self-attention
462
+ # y = self.self_attn(
463
+ # self.norm1(x) * (1 + e[1]) + e[0], seq_lens, grid_sizes,
464
+ # freqs)
465
+ # # with amp.autocast(dtype=torch.float32):
466
+ # x = x + y * e[2]
467
+
468
+ # cross-attention & ffn function
469
+ def cross_attn_ffn(x, context):
470
+ token = context + self.cross_attn(self.norm3(x), context)
471
+ y = self.ffn(self.norm2(token)) + token # * (1 + e[4]) + e[3])
472
+ # with amp.autocast(dtype=torch.float32):
473
+ # x = x + y * e[5]
474
+ return y
475
+
476
+ x = cross_attn_ffn(x, context)
477
+ return x
478
+
479
+
480
+ class Head(nn.Module):
481
+ def __init__(self, dim, out_dim, patch_size, eps=1e-6):
482
+ super().__init__()
483
+ self.dim = dim
484
+ self.out_dim = out_dim
485
+ self.patch_size = patch_size
486
+ self.eps = eps
487
+
488
+ # layers
489
+ out_dim = math.prod(patch_size) * out_dim
490
+ self.norm = WanLayerNorm(dim, eps)
491
+ self.head = nn.Linear(dim, out_dim)
492
+
493
+ # modulation
494
+ self.modulation = nn.Parameter(torch.randn(1, 2, dim) / dim**0.5)
495
+
496
+ def forward(self, x, e):
497
+ r"""
498
+ Args:
499
+ x(Tensor): Shape [B, L1, C]
500
+ e(Tensor): Shape [B, C]
501
+ """
502
+ # assert e.dtype == torch.float32
503
+ # with amp.autocast(dtype=torch.float32):
504
+ e = (self.modulation + e.unsqueeze(1)).chunk(2, dim=1)
505
+ x = self.head(self.norm(x) * (1 + e[1]) + e[0])
506
+ return x
507
+
508
+
509
+ class MLPProj(torch.nn.Module):
510
+ def __init__(self, in_dim, out_dim):
511
+ super().__init__()
512
+
513
+ self.proj = torch.nn.Sequential(
514
+ torch.nn.LayerNorm(in_dim),
515
+ torch.nn.Linear(in_dim, in_dim),
516
+ torch.nn.GELU(),
517
+ torch.nn.Linear(in_dim, out_dim),
518
+ torch.nn.LayerNorm(out_dim),
519
+ )
520
+
521
+ def forward(self, image_embeds):
522
+ clip_extra_context_tokens = self.proj(image_embeds)
523
+ return clip_extra_context_tokens
524
+
525
+
526
+ class RegisterTokens(nn.Module):
527
+ def __init__(self, num_registers: int, dim: int):
528
+ super().__init__()
529
+ self.register_tokens = nn.Parameter(torch.randn(num_registers, dim) * 0.02)
530
+ self.rms_norm = WanRMSNorm(dim, eps=1e-6)
531
+
532
+ def forward(self):
533
+ return self.rms_norm(self.register_tokens)
534
+
535
+ def reset_parameters(self):
536
+ nn.init.normal_(self.register_tokens, std=0.02)
537
+
538
+
539
+ class WanModel(ModelMixin, ConfigMixin):
540
+ r"""
541
+ Wan diffusion backbone supporting both text-to-video and image-to-video.
542
+ """
543
+
544
+ ignore_for_config = [
545
+ "patch_size",
546
+ "cross_attn_norm",
547
+ "qk_norm",
548
+ "text_dim",
549
+ "window_size",
550
+ ]
551
+ _no_split_modules = ["WanAttentionBlock"]
552
+ _supports_gradient_checkpointing = True
553
+
554
+ @register_to_config
555
+ def __init__(
556
+ self,
557
+ model_type="t2v",
558
+ patch_size=(1, 2, 2),
559
+ text_len=512,
560
+ in_dim=16,
561
+ dim=2048,
562
+ ffn_dim=8192,
563
+ freq_dim=256,
564
+ text_dim=4096,
565
+ out_dim=16,
566
+ num_heads=16,
567
+ num_layers=32,
568
+ window_size=(-1, -1),
569
+ qk_norm=True,
570
+ cross_attn_norm=True,
571
+ eps=1e-6,
572
+ ):
573
+ r"""
574
+ Initialize the diffusion model backbone.
575
+
576
+ Args:
577
+ model_type (`str`, *optional*, defaults to 't2v'):
578
+ Model variant - 't2v' (text-to-video) or 'i2v' (image-to-video)
579
+ patch_size (`tuple`, *optional*, defaults to (1, 2, 2)):
580
+ 3D patch dimensions for video embedding (t_patch, h_patch, w_patch)
581
+ text_len (`int`, *optional*, defaults to 512):
582
+ Fixed length for text embeddings
583
+ in_dim (`int`, *optional*, defaults to 16):
584
+ Input video channels (C_in)
585
+ dim (`int`, *optional*, defaults to 2048):
586
+ Hidden dimension of the transformer
587
+ ffn_dim (`int`, *optional*, defaults to 8192):
588
+ Intermediate dimension in feed-forward network
589
+ freq_dim (`int`, *optional*, defaults to 256):
590
+ Dimension for sinusoidal time embeddings
591
+ text_dim (`int`, *optional*, defaults to 4096):
592
+ Input dimension for text embeddings
593
+ out_dim (`int`, *optional*, defaults to 16):
594
+ Output video channels (C_out)
595
+ num_heads (`int`, *optional*, defaults to 16):
596
+ Number of attention heads
597
+ num_layers (`int`, *optional*, defaults to 32):
598
+ Number of transformer blocks
599
+ window_size (`tuple`, *optional*, defaults to (-1, -1)):
600
+ Window size for local attention (-1 indicates global attention)
601
+ qk_norm (`bool`, *optional*, defaults to True):
602
+ Enable query/key normalization
603
+ cross_attn_norm (`bool`, *optional*, defaults to True):
+ Enable cross-attention normalization
605
+ eps (`float`, *optional*, defaults to 1e-6):
606
+ Epsilon value for normalization layers
607
+ """
608
+
609
+ super().__init__()
610
+
611
+ assert model_type in ["t2v", "i2v"]
612
+ self.model_type = model_type
613
+
614
+ self.patch_size = patch_size
615
+ self.text_len = text_len
616
+ self.in_dim = in_dim
617
+ self.dim = dim
618
+ self.ffn_dim = ffn_dim
619
+ self.freq_dim = freq_dim
620
+ self.text_dim = text_dim
621
+ self.out_dim = out_dim
622
+ self.num_heads = num_heads
623
+ self.num_layers = num_layers
624
+ self.window_size = window_size
625
+ self.qk_norm = qk_norm
626
+ self.cross_attn_norm = cross_attn_norm
627
+ self.eps = eps
628
+ self.local_attn_size = 21
629
+
630
+ # embeddings
631
+ self.patch_embedding = nn.Conv3d(
632
+ in_dim, dim, kernel_size=patch_size, stride=patch_size
633
+ )
634
+ self.text_embedding = nn.Sequential(
635
+ nn.Linear(text_dim, dim), nn.GELU(approximate="tanh"), nn.Linear(dim, dim)
636
+ )
637
+
638
+ self.time_embedding = nn.Sequential(
639
+ nn.Linear(freq_dim, dim), nn.SiLU(), nn.Linear(dim, dim)
640
+ )
641
+ self.time_projection = nn.Sequential(nn.SiLU(), nn.Linear(dim, dim * 6))
642
+
643
+ # blocks
644
+ cross_attn_type = "t2v_cross_attn" if model_type == "t2v" else "i2v_cross_attn"
645
+ self.blocks = nn.ModuleList(
646
+ [
647
+ WanAttentionBlock(
648
+ cross_attn_type,
649
+ dim,
650
+ ffn_dim,
651
+ num_heads,
652
+ window_size,
653
+ qk_norm,
654
+ cross_attn_norm,
655
+ eps,
656
+ )
657
+ for _ in range(num_layers)
658
+ ]
659
+ )
660
+
661
+ # head
662
+ self.head = Head(dim, out_dim, patch_size, eps)
663
+
664
+ # buffers (don't use register_buffer otherwise dtype will be changed in to())
665
+ assert (dim % num_heads) == 0 and (dim // num_heads) % 2 == 0
666
+ d = dim // num_heads
667
+ self.freqs = torch.cat(
668
+ [
669
+ # rope_params(1024, d - 4 * (d // 6)),
670
+ rope_params_riflex(
671
+ 1024,
672
+ d - 4 * (d // 6),
673
+ ),
674
+ rope_params(1024, 2 * (d // 6)),
675
+ rope_params(1024, 2 * (d // 6)),
676
+ ],
677
+ dim=1,
678
+ )
679
+
680
+ if model_type == "i2v":
681
+ self.img_emb = MLPProj(1280, dim)
682
+
683
+ # initialize weights
684
+ self.init_weights()
685
+
686
+ self.gradient_checkpointing = False
687
+
688
+ def _set_gradient_checkpointing(self, module, value=False):
689
+ self.gradient_checkpointing = value
690
+
691
+ def forward(self, *args, **kwargs):
692
+ # if kwargs.get('classify_mode', False) is True:
693
+ # kwargs.pop('classify_mode')
694
+ # return self._forward_classify(*args, **kwargs)
695
+ # else:
696
+ return self._forward(*args, **kwargs)
697
+
698
+ def _forward(
699
+ self,
700
+ x,
701
+ t,
702
+ context,
703
+ seq_len,
704
+ classify_mode=False,
705
+ concat_time_embeddings=False,
706
+ register_tokens=None,
707
+ cls_pred_branch=None,
708
+ gan_ca_blocks=None,
709
+ clip_fea=None,
710
+ y=None,
711
+ ):
712
+ r"""
713
+ Forward pass through the diffusion model
714
+
715
+ Args:
716
+ x (List[Tensor]):
717
+ List of input video tensors, each with shape [C_in, F, H, W]
718
+ t (Tensor):
719
+ Diffusion timesteps tensor of shape [B]
720
+ context (List[Tensor]):
721
+ List of text embeddings each with shape [L, C]
722
+ seq_len (`int`):
723
+ Maximum sequence length for positional encoding
724
+ clip_fea (Tensor, *optional*):
725
+ CLIP image features for image-to-video mode
726
+ y (List[Tensor], *optional*):
727
+ Conditional video inputs for image-to-video mode, same shape as x
728
+
729
+ Returns:
730
+ List[Tensor]:
731
+ List of denoised video tensors with original input shapes [C_out, F, H / 8, W / 8]
732
+ """
733
+ if self.model_type == "i2v":
734
+ assert clip_fea is not None and y is not None
735
+ # params
736
+ device = self.patch_embedding.weight.device
737
+ if self.freqs.device != device:
738
+ self.freqs = self.freqs.to(device)
739
+
740
+ if y is not None:
741
+ x = [torch.cat([u, v], dim=0) for u, v in zip(x, y)]
742
+
743
+ # embeddings
744
+ x = [self.patch_embedding(u.unsqueeze(0)) for u in x]
745
+ grid_sizes = torch.stack(
746
+ [torch.tensor(u.shape[2:], dtype=torch.long) for u in x]
747
+ )
748
+ x = [u.flatten(2).transpose(1, 2) for u in x]
749
+ seq_lens = torch.tensor([u.size(1) for u in x], dtype=torch.long)
750
+ assert seq_lens.max() <= seq_len
751
+ x = torch.cat(
752
+ [
753
+ torch.cat([u, u.new_zeros(1, seq_len - u.size(1), u.size(2))], dim=1)
754
+ for u in x
755
+ ]
756
+ )
757
+
758
+ # time embeddings
759
+ # with amp.autocast(dtype=torch.float32):
760
+ e = self.time_embedding(sinusoidal_embedding_1d(self.freq_dim, t).type_as(x))
761
+ e0 = self.time_projection(e).unflatten(1, (6, self.dim))
762
+ # assert e.dtype == torch.float32 and e0.dtype == torch.float32
763
+
764
+ # context
765
+ context_lens = None
766
+ context = self.text_embedding(
767
+ torch.stack(
768
+ [
769
+ torch.cat([u, u.new_zeros(self.text_len - u.size(0), u.size(1))])
770
+ for u in context
771
+ ]
772
+ )
773
+ )
774
+
775
+ if clip_fea is not None:
776
+ context_clip = self.img_emb(clip_fea) # bs x 257 x dim
777
+ context = torch.concat([context_clip, context], dim=1)
778
+
779
+ # arguments
780
+ kwargs = dict(
781
+ e=e0,
782
+ seq_lens=seq_lens,
783
+ grid_sizes=grid_sizes,
784
+ freqs=self.freqs,
785
+ context=context,
786
+ context_lens=context_lens,
787
+ )
788
+
789
+ def create_custom_forward(module):
790
+ def custom_forward(*inputs, **kwargs):
791
+ return module(*inputs, **kwargs)
792
+
793
+ return custom_forward
794
+
795
+ # TODO: Tune the number of blocks for feature extraction
796
+ final_x = None
797
+ if classify_mode:
798
+ assert register_tokens is not None
799
+ assert gan_ca_blocks is not None
800
+ assert cls_pred_branch is not None
801
+
802
+ final_x = []
803
+ registers = repeat(register_tokens(), "n d -> b n d", b=x.shape[0])
804
+ # x = torch.cat([registers, x], dim=1)
805
+
806
+ gan_idx = 0
807
+ for ii, block in enumerate(self.blocks):
808
+ if torch.is_grad_enabled() and self.gradient_checkpointing:
809
+ x = torch.utils.checkpoint.checkpoint(
810
+ create_custom_forward(block),
811
+ x,
812
+ **kwargs,
813
+ use_reentrant=False,
814
+ )
815
+ else:
816
+ x = block(x, **kwargs)
817
+
818
+ if classify_mode and ii in [13, 21, 29]:
819
+ gan_token = registers[:, gan_idx : gan_idx + 1]
820
+ final_x.append(gan_ca_blocks[gan_idx](x, gan_token))
821
+ gan_idx += 1
822
+
823
+ if classify_mode:
824
+ final_x = torch.cat(final_x, dim=1)
825
+ if concat_time_embeddings:
826
+ final_x = cls_pred_branch(
827
+ torch.cat([final_x, 10 * e[:, None, :]], dim=1).view(
828
+ final_x.shape[0], -1
829
+ )
830
+ )
831
+ else:
832
+ final_x = cls_pred_branch(final_x.view(final_x.shape[0], -1))
833
+
834
+ # head
835
+ x = self.head(x, e)
836
+
837
+ # unpatchify
838
+ x = self.unpatchify(x, grid_sizes)
839
+
840
+ if classify_mode:
841
+ return torch.stack(x), final_x
842
+
843
+ return torch.stack(x)
844
+
845
+ def _forward_classify(
846
+ self,
847
+ x,
848
+ t,
849
+ context,
850
+ seq_len,
851
+ register_tokens,
852
+ cls_pred_branch,
853
+ clip_fea=None,
854
+ y=None,
855
+ ):
856
+ r"""
857
+ Feature extraction through the diffusion model
858
+
859
+ Args:
860
+ x (List[Tensor]):
861
+ List of input video tensors, each with shape [C_in, F, H, W]
862
+ t (Tensor):
863
+ Diffusion timesteps tensor of shape [B]
864
+ context (List[Tensor]):
865
+ List of text embeddings each with shape [L, C]
866
+ seq_len (`int`):
867
+ Maximum sequence length for positional encoding
868
+ clip_fea (Tensor, *optional*):
869
+ CLIP image features for image-to-video mode
870
+ y (List[Tensor], *optional*):
871
+ Conditional video inputs for image-to-video mode, same shape as x
872
+
873
+ Returns:
874
+ List[Tensor]:
875
+ List of video features with original input shapes [C_block, F, H / 8, W / 8]
876
+ """
877
+ if self.model_type == "i2v":
878
+ assert clip_fea is not None and y is not None
879
+ # params
880
+ device = self.patch_embedding.weight.device
881
+ if self.freqs.device != device:
882
+ self.freqs = self.freqs.to(device)
883
+
884
+ if y is not None:
885
+ x = [torch.cat([u, v], dim=0) for u, v in zip(x, y)]
886
+
887
+ # embeddings
888
+ x = [self.patch_embedding(u.unsqueeze(0)) for u in x]
889
+ grid_sizes = torch.stack(
890
+ [torch.tensor(u.shape[2:], dtype=torch.long) for u in x]
891
+ )
892
+ x = [u.flatten(2).transpose(1, 2) for u in x]
893
+ seq_lens = torch.tensor([u.size(1) for u in x], dtype=torch.long)
894
+ assert seq_lens.max() <= seq_len
895
+ x = torch.cat(
896
+ [
897
+ torch.cat([u, u.new_zeros(1, seq_len - u.size(1), u.size(2))], dim=1)
898
+ for u in x
899
+ ]
900
+ )
901
+
902
+ # time embeddings
903
+ # with amp.autocast(dtype=torch.float32):
904
+ e = self.time_embedding(sinusoidal_embedding_1d(self.freq_dim, t).type_as(x))
905
+ e0 = self.time_projection(e).unflatten(1, (6, self.dim))
906
+ # assert e.dtype == torch.float32 and e0.dtype == torch.float32
907
+
908
+ # context
909
+ context_lens = None
910
+ context = self.text_embedding(
911
+ torch.stack(
912
+ [
913
+ torch.cat([u, u.new_zeros(self.text_len - u.size(0), u.size(1))])
914
+ for u in context
915
+ ]
916
+ )
917
+ )
918
+
919
+ if clip_fea is not None:
920
+ context_clip = self.img_emb(clip_fea) # bs x 257 x dim
921
+ context = torch.concat([context_clip, context], dim=1)
922
+
923
+ # arguments
924
+ kwargs = dict(
925
+ e=e0,
926
+ seq_lens=seq_lens,
927
+ grid_sizes=grid_sizes,
928
+ freqs=self.freqs,
929
+ context=context,
930
+ context_lens=context_lens,
931
+ )
932
+
933
+ def create_custom_forward(module):
934
+ def custom_forward(*inputs, **kwargs):
935
+ return module(*inputs, **kwargs)
936
+
937
+ return custom_forward
938
+
939
+ # TODO: Tune the number of blocks for feature extraction
940
+ for block in self.blocks[:16]:
941
+ if torch.is_grad_enabled() and self.gradient_checkpointing:
942
+ x = torch.utils.checkpoint.checkpoint(
943
+ create_custom_forward(block),
944
+ x,
945
+ **kwargs,
946
+ use_reentrant=False,
947
+ )
948
+ else:
949
+ x = block(x, **kwargs)
950
+
951
+ # unpatchify
952
+ x = self.unpatchify(x, grid_sizes, c=self.dim // 4)
953
+ return torch.stack(x)
954
+
955
+ def unpatchify(self, x, grid_sizes, c=None):
956
+ r"""
957
+ Reconstruct video tensors from patch embeddings.
958
+
959
+ Args:
960
+ x (List[Tensor]):
961
+ List of patchified features, each with shape [L, C_out * prod(patch_size)]
962
+ grid_sizes (Tensor):
963
+ Original spatial-temporal grid dimensions before patching,
964
+ shape [B, 3] (3 dimensions correspond to F_patches, H_patches, W_patches)
965
+
966
+ Returns:
967
+ List[Tensor]:
968
+ Reconstructed video tensors with shape [C_out, F, H / 8, W / 8]
969
+ """
970
+
971
+ c = self.out_dim if c is None else c
972
+ out = []
973
+ for u, v in zip(x, grid_sizes.tolist()):
974
+ u = u[: math.prod(v)].view(*v, *self.patch_size, c)
975
+ u = torch.einsum("fhwpqrc->cfphqwr", u)
976
+ u = u.reshape(c, *[i * j for i, j in zip(v, self.patch_size)])
977
+ out.append(u)
978
+ return out
979
+
980
+ def init_weights(self):
981
+ r"""
982
+ Initialize model parameters using Xavier initialization.
983
+ """
984
+
985
+ # basic init
986
+ for m in self.modules():
987
+ if isinstance(m, nn.Linear):
988
+ nn.init.xavier_uniform_(m.weight)
989
+ if m.bias is not None:
990
+ nn.init.zeros_(m.bias)
991
+
992
+ # init embeddings
993
+ nn.init.xavier_uniform_(self.patch_embedding.weight.flatten(1))
994
+ for m in self.text_embedding.modules():
995
+ if isinstance(m, nn.Linear):
996
+ nn.init.normal_(m.weight, std=0.02)
997
+ for m in self.time_embedding.modules():
998
+ if isinstance(m, nn.Linear):
999
+ nn.init.normal_(m.weight, std=0.02)
1000
+
1001
+ # init output layer
1002
+ nn.init.zeros_(self.head.head.weight)
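
Since `WanModel` subclasses `ModelMixin` and `ConfigMixin`, the sharded weights indexed above can in principle be loaded straight into this class. Below is a minimal sketch, assuming the repo keeps a `config.json` next to the shards under `transformer/` (not shown in this excerpt), that `transformer/model.py` is importable locally as `model`, and that `rope_params_riflex`, which `WanModel.__init__` references, is supplied by the full codebase:

```py
import torch

# Assumed import path: presumes this diff has been checked out locally so that
# transformer/model.py is importable as `model`, and that the full repository
# provides rope_params_riflex, which WanModel.__init__ calls.
from model import WanModel

# ModelMixin.from_pretrained reads config.json for the hyperparameters and resolves
# the sharded .safetensors files through the index shown earlier in this diff.
transformer = WanModel.from_pretrained(
    "krea/krea-realtime-video",
    subfolder="transformer",
    torch_dtype=torch.bfloat16,
)
transformer.to("cuda").eval()

print(f"{sum(p.numel() for p in transformer.parameters()) / 1e9:.1f}B parameters")
```

This is only an illustration of how the files added in this diff fit together; it is not a supported inference entry point.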