{ "_class_name": "MMDiTTransformer2DModel", "_diffusers_version": "0.33.1", "attention_head_dim": 64, "caption_channels": 2048, "caption_projection_dim": 1152, "dual_attention_layers": [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 ], "in_channels": 32, "interpolation_scale": 2, "joint_attention_dim": 4096, "num_attention_heads": 18, "num_layers": 24, "out_channels": 32, "patch_size": 1, "pooled_projection_dim": 2048, "pos_embed_max_size": 96, "projector_dim": 2048, "qk_norm": "rms_norm", "repa_depth": -1, "sample_size": 32, "z_dims": [ 768 ] }