import spaces
import torch
from diffusers.pipelines.wan.pipeline_wan_i2v import WanImageToVideoPipeline
from diffusers.models.transformers.transformer_wan import WanTransformer3DModel
from diffusers.utils.export_utils import export_to_video
import gradio as gr
import tempfile
import numpy as np
from PIL import Image
import random
import gc

from torchao.quantization import quantize_
from torchao.quantization import Float8DynamicActivationFloat8WeightConfig
from torchao.quantization import Int8WeightOnlyConfig

import aoti
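
# Output sizing constraints (all dimensions are snapped to multiples of 16)
# and the fixed frame rate / frame-count range used by this demo.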
MODEL_ID = "Wan-AI/Wan2.2-I2V-A14B-Diffusers"

MAX_DIM = 832
MIN_DIM = 480
SQUARE_DIM = 640
MULTIPLE_OF = 16

MAX_SEED = np.iinfo(np.int32).max

FIXED_FPS = 16
MIN_FRAMES_MODEL = 8
MAX_FRAMES_MODEL = 80

MIN_DURATION = round(MIN_FRAMES_MODEL / FIXED_FPS, 1)
MAX_DURATION = round(MAX_FRAMES_MODEL / FIXED_FPS, 1)
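
# Build the Wan 2.2 image-to-video pipeline. transformer / transformer_2 are
# the two expert denoisers of the A14B MoE (high-noise and low-noise stages),
# loaded here from a bf16 repack of the base checkpoint.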
pipe = WanImageToVideoPipeline.from_pretrained(
    MODEL_ID,
    transformer=WanTransformer3DModel.from_pretrained(
        'cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers',
        subfolder='transformer',
        torch_dtype=torch.bfloat16,
        device_map='cuda',
    ),
    transformer_2=WanTransformer3DModel.from_pretrained(
        'cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers',
        subfolder='transformer_2',
        torch_dtype=torch.bfloat16,
        device_map='cuda',
    ),
    torch_dtype=torch.bfloat16,
).to('cuda')
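
# Fuse the Lightx2v CFG-step-distillation LoRA into both experts (scale 3 on
# transformer, scale 1 on transformer_2) so that 4-8 inference steps suffice,
# then discard the adapter bookkeeping.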
pipe.load_lora_weights(
    "Kijai/WanVideo_comfy",
    weight_name="Lightx2v/lightx2v_I2V_14B_480p_cfg_step_distill_rank128_bf16.safetensors",
    adapter_name="lightx2v",
)
pipe.load_lora_weights(
    "Kijai/WanVideo_comfy",
    weight_name="Lightx2v/lightx2v_I2V_14B_480p_cfg_step_distill_rank128_bf16.safetensors",
    adapter_name="lightx2v_2",
    load_into_transformer_2=True,
)
pipe.set_adapters(["lightx2v", "lightx2v_2"], adapter_weights=[1., 1.])
pipe.fuse_lora(adapter_names=["lightx2v"], lora_scale=3., components=["transformer"])
pipe.fuse_lora(adapter_names=["lightx2v_2"], lora_scale=1., components=["transformer_2"])
pipe.unload_lora_weights()
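
# In-place torchao quantization: int8 weight-only for the text encoder,
# fp8 dynamic activations with fp8 weights for both transformers.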
quantize_(pipe.text_encoder, Int8WeightOnlyConfig())
quantize_(pipe.transformer, Float8DynamicActivationFloat8WeightConfig())
quantize_(pipe.transformer_2, Float8DynamicActivationFloat8WeightConfig())
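
# Swap in ahead-of-time compiled (AOTInductor) transformer blocks built for
# the fp8 dynamic-activation variant applied above.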
aoti.aoti_blocks_load(pipe.transformer, 'zerogpu-aoti/Wan2', variant='fp8da')
aoti.aoti_blocks_load(pipe.transformer_2, 'zerogpu-aoti/Wan2', variant='fp8da')
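
# Default UI prompts.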
default_prompt_i2v = "Bring this image to life: apply cinematic motion and smooth animation"

default_negative_prompt = "vivid color tone, overexposed, static, blurred details, subtitles, style, artwork, painting, frame, still, overall gray, worst quality, low quality, JPEG compression artifacts, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn face, deformed, disfigured, malformed limbs, fused fingers, static frame, cluttered background, three legs, many people in the background, walking backwards"
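
# Fit an arbitrary image to model-friendly dimensions: square images map to
# SQUARE_DIM x SQUARE_DIM; extreme aspect ratios are center-cropped into the
# allowed range; otherwise the long side is scaled to MAX_DIM.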
def resize_image(image: Image.Image) -> Image.Image:
    width, height = image.size

    if width == height:
        return image.resize((SQUARE_DIM, SQUARE_DIM), Image.LANCZOS)

    aspect_ratio = width / height

    MAX_ASPECT_RATIO = MAX_DIM / MIN_DIM
    MIN_ASPECT_RATIO = MIN_DIM / MAX_DIM

    image_to_resize = image

    if aspect_ratio > MAX_ASPECT_RATIO:
        # Wider than allowed: center-crop the width down to the maximum aspect ratio.
        target_w, target_h = MAX_DIM, MIN_DIM
        crop_width = int(round(height * MAX_ASPECT_RATIO))
        left = (width - crop_width) // 2
        image_to_resize = image.crop((left, 0, left + crop_width, height))
    elif aspect_ratio < MIN_ASPECT_RATIO:
        # Taller than allowed: center-crop the height down to the minimum aspect ratio.
        target_w, target_h = MIN_DIM, MAX_DIM
        crop_height = int(round(width / MIN_ASPECT_RATIO))
        top = (height - crop_height) // 2
        image_to_resize = image.crop((0, top, width, top + crop_height))
    else:
        # In range: scale the long side to MAX_DIM, preserving the aspect ratio.
        if width > height:
            target_w = MAX_DIM
            target_h = int(round(target_w / aspect_ratio))
        else:
            target_h = MAX_DIM
            target_w = int(round(target_h * aspect_ratio))

    # Snap both sides to multiples of 16, then clamp into the allowed range.
    final_w = round(target_w / MULTIPLE_OF) * MULTIPLE_OF
    final_h = round(target_h / MULTIPLE_OF) * MULTIPLE_OF

    final_w = max(MIN_DIM, min(MAX_DIM, final_w))
    final_h = max(MIN_DIM, min(MAX_DIM, final_h))

    return image_to_resize.resize((final_w, final_h), Image.LANCZOS)
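
# Convert a duration in seconds to a frame count: round at the fixed FPS,
# clamp to the model's supported range, and add one frame.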
def get_num_frames(duration_seconds: float):
    return 1 + int(np.clip(
        int(round(duration_seconds * FIXED_FPS)),
        MIN_FRAMES_MODEL,
        MAX_FRAMES_MODEL,
    ))
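
# Estimate the GPU seconds to reserve for a request: per-step cost scales as
# (frames * pixels / baseline) ** 1.5 around a measured ~15 s per step at
# 81 frames of 832x624, plus 10 s of fixed overhead.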
def get_duration(
    input_image,
    prompt,
    steps,
    negative_prompt,
    duration_seconds,
    guidance_scale,
    guidance_scale_2,
    seed,
    randomize_seed,
    progress,
):
    BASE_FRAMES_HEIGHT_WIDTH = 81 * 832 * 624
    BASE_STEP_DURATION = 15
    width, height = resize_image(input_image).size
    frames = get_num_frames(duration_seconds)
    factor = frames * width * height / BASE_FRAMES_HEIGHT_WIDTH
    step_duration = BASE_STEP_DURATION * factor ** 1.5
    return 10 + int(steps) * step_duration
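
# Main generation entry point. @spaces.GPU reserves a ZeroGPU slot sized by
# get_duration; guidance_scale and guidance_scale_2 steer the two experts.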
@spaces.GPU(duration=get_duration)
def generate_video(
    input_image,
    prompt,
    steps=4,
    negative_prompt=default_negative_prompt,
    duration_seconds=MAX_DURATION,
    guidance_scale=1,
    guidance_scale_2=1,
    seed=42,
    randomize_seed=False,
    progress=gr.Progress(track_tqdm=True),
):
    if input_image is None:
        raise gr.Error("Please upload an image.")

    num_frames = get_num_frames(duration_seconds)
    current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
    resized_image = resize_image(input_image)

    output_frames_list = pipe(
        image=resized_image,
        prompt=prompt,
        negative_prompt=negative_prompt,
        height=resized_image.height,
        width=resized_image.width,
        num_frames=num_frames,
        guidance_scale=float(guidance_scale),
        guidance_scale_2=float(guidance_scale_2),
        num_inference_steps=int(steps),
        generator=torch.Generator(device="cuda").manual_seed(current_seed),
    ).frames[0]

    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
        video_path = tmpfile.name

    export_to_video(output_frames_list, video_path, fps=FIXED_FPS)

    return video_path, current_seed
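
# Gradio UI: image and prompt inputs on the left, the generated video on the
# right, advanced options collapsed in an accordion.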
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎬 Ultra-Fast Free Open-Source Image-to-Video Generation with WAN")
    gr.Markdown("**WAN 2.2 14B + FAST + Korean localization + tuning** - fast video generation in 4-8 steps")
    gr.Markdown("**When traffic is limited, we recommend spreading usage across the following four mirror servers.**")

    gr.HTML("""
    <div style="display: flex; gap: 10px; flex-wrap: wrap; justify-content: center; margin: 20px 0;">
        <a href="https://huggingface.co/spaces/Heartsync/wan2_2-I2V-14B-FAST" target="_blank">
            <img src="https://img.shields.io/static/v1?label=WAN%202.2%2014B%20FAST%2B&message=Image%20to%20Video&color=%230000ff&labelColor=%23800080&logo=huggingface&logoColor=white&style=for-the-badge" alt="badge">
        </a>
        <a href="https://huggingface.co/spaces/ginipick/wan2_2-I2V-14B-FAST" target="_blank">
            <img src="https://img.shields.io/static/v1?label=WAN%202.2%2014B%20FAST%2B&message=Image%20to%20Video&color=%230000ff&labelColor=%23800080&logo=huggingface&logoColor=white&style=for-the-badge" alt="badge">
        </a>
        <a href="https://huggingface.co/spaces/ginigen/wan2_2-I2V-14B-FAST" target="_blank">
            <img src="https://img.shields.io/static/v1?label=WAN%202.2%2014B%20FAST%2B&message=Image%20to%20Video&color=%230000ff&labelColor=%23800080&logo=huggingface&logoColor=white&style=for-the-badge" alt="badge">
        </a>
        <a href="https://huggingface.co/spaces/VIDraft/wan2_2-I2V-14B-FAST" target="_blank">
            <img src="https://img.shields.io/static/v1?label=WAN%202.2%2014B%20FAST%2B&message=Image%20to%20Video&color=%230000ff&labelColor=%23800080&logo=huggingface&logoColor=white&style=for-the-badge" alt="badge">
        </a>
        <a href="https://discord.gg/openfreeai" target="_blank">
            <img src="https://img.shields.io/static/v1?label=Discord&message=Openfree%20AI&color=%230000ff&labelColor=%23800080&logo=discord&logoColor=white&style=for-the-badge" alt="badge">
        </a>
    </div>
    """)

    with gr.Row():
        with gr.Column(scale=1):
            input_image_component = gr.Image(type="pil", label="Input Image")
            prompt_input = gr.Textbox(label="Prompt", value=default_prompt_i2v, lines=2)
            duration_seconds_input = gr.Slider(
                minimum=MIN_DURATION,
                maximum=MAX_DURATION,
                step=0.1,
                value=3.5,
                label="Video Length (seconds)",
            )

            with gr.Accordion("Advanced Settings", open=False):
                negative_prompt_input = gr.Textbox(label="Negative Prompt", value=default_negative_prompt, lines=2)
                steps_slider = gr.Slider(minimum=1, maximum=30, step=1, value=6, label="Generation Steps")
                guidance_scale_input = gr.Slider(minimum=0.0, maximum=10.0, step=0.5, value=1, label="Guidance Scale 1")
                guidance_scale_2_input = gr.Slider(minimum=0.0, maximum=10.0, step=0.5, value=1, label="Guidance Scale 2")
                seed_input = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=42)
                randomize_seed_checkbox = gr.Checkbox(label="Randomize Seed", value=True)

            generate_button = gr.Button("🎥 Generate Video", variant="primary", size="lg")

        with gr.Column(scale=1):
            video_output = gr.Video(label="Generated Video", autoplay=True, interactive=False)

    ui_inputs = [
        input_image_component, prompt_input, steps_slider,
        negative_prompt_input, duration_seconds_input,
        guidance_scale_input, guidance_scale_2_input, seed_input, randomize_seed_checkbox,
    ]
    generate_button.click(fn=generate_video, inputs=ui_inputs, outputs=[video_output, seed_input])
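
    # Bundled examples (input image, prompt, steps). cache_examples="lazy"
    # renders each example's video the first time it is requested rather
    # than at startup.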
    gr.Examples(
        examples=[
            [
                "wan_i2v_input.JPG",
                "POV selfie video: a white cat wearing sunglasses stands on a surfboard with a relaxed smile. Behind it, a sunny beach (clear water, green hills, a blue sky with clouds). The surfboard tips and the cat falls into the sea; the camera plunges underwater amid bubbles and sunlight. The cat's face is briefly visible below the surface, then it climbs back above the water and keeps filming the selfie, in a cheerful summer-vacation mood.",
                4,
            ],
            [
                "wan22_input_2.jpg",
                "A sleek lunar rover glides from left to right, kicking up moon dust. Astronauts in white spacesuits board it with the characteristic bouncing motion of lunar gravity. In the far background, a VTOL craft descends vertically and lands quietly on the surface. Throughout the scene, a surreal aurora dances across the star-filled sky, bathing the lunar landscape in a mysterious, magical glow of green, blue, and purple curtains of light.",
                4,
            ],
            [
                "kill_bill.jpeg",
                "Uma Thurman's character Beatrix Kiddo holds her razor-sharp katana in a ready stance under cinematic lighting. Suddenly the gleaming blade begins to soften and distort, losing structural integrity like heated metal. The sword's perfect edge slowly warps and droops, molten steel flowing downward in shining rivulets. The transformation starts subtly, then accelerates as the metal grows ever more fluid. The camera stays fixed on her face as the sharp glint gradually fades; instead of deadly focus, she watches in confusion and alarm as her weapon melts before her eyes. Her breathing quickens slightly as she witnesses the impossible change. The melting intensifies, the katana's perfect form growing ever more abstract and dripping away like liquid; molten droplets hit the floor with soft metallic impacts. Her expression turns from calm readiness to bewilderment and concern as her legendary instrument of vengeance is literally liquefied in her hands, leaving her defenseless.",
                6,
            ],
        ],
        inputs=[input_image_component, prompt_input, steps_slider],
        outputs=[video_output, seed_input],
        fn=generate_video,
        cache_examples="lazy",
    )
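
# Queue incoming requests; mcp_server=True additionally exposes the endpoint
# over the Model Context Protocol (supported in recent Gradio releases).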
if __name__ == "__main__":
    demo.queue().launch(mcp_server=True)