import spaces  # ZeroGPU: keep this import before torch
import torch
from diffusers.pipelines.wan.pipeline_wan_i2v import WanImageToVideoPipeline
from diffusers.models.transformers.transformer_wan import WanTransformer3DModel
from diffusers.utils.export_utils import export_to_video
import gradio as gr
import tempfile
import numpy as np
from PIL import Image
import random
from torchao.quantization import (
    quantize_,
    Float8DynamicActivationFloat8WeightConfig,
    Int8WeightOnlyConfig,
)
import aoti  # local helper module bundled with this Space (presumably aoti.py) for loading AoT-compiled blocks
MODEL_ID = "Wan-AI/Wan2.2-I2V-A14B-Diffusers"

# Output resolution constraints: generated frames are sized so both sides are
# multiples of 16 and fall within [MIN_DIM, MAX_DIM].
MAX_DIM = 832
MIN_DIM = 480
SQUARE_DIM = 640
MULTIPLE_OF = 16

MAX_SEED = np.iinfo(np.int32).max

FIXED_FPS = 16  # frame rate of the exported video
MIN_FRAMES_MODEL = 8
MAX_FRAMES_MODEL = 80
MIN_DURATION = round(MIN_FRAMES_MODEL / FIXED_FPS, 1)
MAX_DURATION = round(MAX_FRAMES_MODEL / FIXED_FPS, 1)
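# With these constants the duration slider spans MIN_DURATION = 8/16 = 0.5 s
# to MAX_DURATION = 80/16 = 5.0 s of video.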
# Load the Wan 2.2 image-to-video pipeline. Both DiT stages (the high-noise
# and low-noise experts) come from a community bf16 repackaging of the checkpoint.
pipe = WanImageToVideoPipeline.from_pretrained(
    MODEL_ID,
    transformer=WanTransformer3DModel.from_pretrained(
        'cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers',
        subfolder='transformer',
        torch_dtype=torch.bfloat16,
        device_map='cuda',
    ),
    transformer_2=WanTransformer3DModel.from_pretrained(
        'cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers',
        subfolder='transformer_2',
        torch_dtype=torch.bfloat16,
        device_map='cuda',
    ),
    torch_dtype=torch.bfloat16,
).to('cuda')
# Apply the Lightx2v step-distillation LoRA so the model produces usable video
# in only a few inference steps. The same weights are loaded twice, once into
# each transformer stage.
pipe.load_lora_weights(
    "Kijai/WanVideo_comfy",
    weight_name="Lightx2v/lightx2v_I2V_14B_480p_cfg_step_distill_rank128_bf16.safetensors",
    adapter_name="lightx2v",
)
pipe.load_lora_weights(
    "Kijai/WanVideo_comfy",
    weight_name="Lightx2v/lightx2v_I2V_14B_480p_cfg_step_distill_rank128_bf16.safetensors",
    adapter_name="lightx2v_2",
    load_into_transformer_2=True,
)
pipe.set_adapters(["lightx2v", "lightx2v_2"], adapter_weights=[1., 1.])
# Fuse the adapters into the base weights (with a different scale per stage),
# then drop the standalone LoRA state before quantization.
pipe.fuse_lora(adapter_names=["lightx2v"], lora_scale=3., components=["transformer"])
pipe.fuse_lora(adapter_names=["lightx2v_2"], lora_scale=1., components=["transformer_2"])
pipe.unload_lora_weights()
# Quantize: int8 weight-only for the text encoder, dynamic fp8 for both
# transformer stages, then load ahead-of-time compiled transformer blocks.
quantize_(pipe.text_encoder, Int8WeightOnlyConfig())
quantize_(pipe.transformer, Float8DynamicActivationFloat8WeightConfig())
quantize_(pipe.transformer_2, Float8DynamicActivationFloat8WeightConfig())
aoti.aoti_blocks_load(pipe.transformer, 'zerogpu-aoti/Wan2', variant='fp8da')
aoti.aoti_blocks_load(pipe.transformer_2, 'zerogpu-aoti/Wan2', variant='fp8da')
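# Taken together, the step-distilled LoRA (4-8 denoising steps), fp8
# quantization, and precompiled blocks are what give this Space its "FAST"
# generation times.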
# Default prompt (Korean): "Bring this image to life; apply cinematic motion
# and smooth animation."
default_prompt_i2v = "์ด ์ด๋ฏธ์ง€์— ์ƒ๋™๊ฐ์„ ๋ถ€์—ฌํ•˜๊ณ , ์˜ํ™” ๊ฐ™์€ ์›€์ง์ž„๊ณผ ๋ถ€๋“œ๋Ÿฌ์šด ์• ๋‹ˆ๋ฉ”์ด์…˜์„ ์ ์šฉ"
# Default negative prompt (Korean): the usual Wan quality-guard list
# (oversaturation, overexposure, static image, blurry detail, subtitles, extra
# fingers, deformed hands/faces, cluttered background, walking backwards, etc.).
default_negative_prompt = "์ƒ‰์กฐ ์„ ๋ช…, ๊ณผ๋‹ค ๋…ธ์ถœ, ์ •์ , ์„ธ๋ถ€ ํ๋ฆผ, ์ž๋ง‰, ์Šคํƒ€์ผ, ์ž‘ํ’ˆ, ๊ทธ๋ฆผ, ํ™”๋ฉด, ์ •์ง€, ํšŒ์ƒ‰์กฐ, ์ตœ์•… ํ’ˆ์งˆ, ์ €ํ’ˆ์งˆ, JPEG ์••์ถ•, ์ถ”ํ•จ, ๋ถˆ์™„์ „, ์ถ”๊ฐ€ ์†๊ฐ€๋ฝ, ์ž˜๋ชป ๊ทธ๋ ค์ง„ ์†, ์ž˜๋ชป ๊ทธ๋ ค์ง„ ์–ผ๊ตด, ๊ธฐํ˜•, ๋ณ€ํ˜•, ํ˜•ํƒœ ๋ถˆ๋Ÿ‰ ์‚ฌ์ง€, ์†๊ฐ€๋ฝ ์œตํ•ฉ, ์ •์ง€ ํ™”๋ฉด, ์ง€์ €๋ถ„ํ•œ ๋ฐฐ๊ฒฝ, ์„ธ ๊ฐœ์˜ ๋‹ค๋ฆฌ, ๋ฐฐ๊ฒฝ ์‚ฌ๋žŒ ๋งŽ์Œ, ๋’ค๋กœ ๊ฑท๊ธฐ"
def resize_image(image: Image.Image) -> Image.Image:
    """Resize (and center-crop if needed) so both sides are multiples of 16
    within [MIN_DIM, MAX_DIM], preserving aspect ratio where possible."""
    width, height = image.size
    if width == height:
        return image.resize((SQUARE_DIM, SQUARE_DIM), Image.LANCZOS)
    aspect_ratio = width / height
    MAX_ASPECT_RATIO = MAX_DIM / MIN_DIM
    MIN_ASPECT_RATIO = MIN_DIM / MAX_DIM
    image_to_resize = image
    if aspect_ratio > MAX_ASPECT_RATIO:
        # Too wide: center-crop horizontally to the widest allowed ratio.
        target_w, target_h = MAX_DIM, MIN_DIM
        crop_width = int(round(height * MAX_ASPECT_RATIO))
        left = (width - crop_width) // 2
        image_to_resize = image.crop((left, 0, left + crop_width, height))
    elif aspect_ratio < MIN_ASPECT_RATIO:
        # Too tall: center-crop vertically to the tallest allowed ratio.
        target_w, target_h = MIN_DIM, MAX_DIM
        crop_height = int(round(width / MIN_ASPECT_RATIO))
        top = (height - crop_height) // 2
        image_to_resize = image.crop((0, top, width, top + crop_height))
    else:
        if width > height:
            target_w = MAX_DIM
            target_h = int(round(target_w / aspect_ratio))
        else:
            target_h = MAX_DIM
            target_w = int(round(target_h * aspect_ratio))
    # Snap both sides to multiples of 16 and clamp into the allowed range.
    final_w = round(target_w / MULTIPLE_OF) * MULTIPLE_OF
    final_h = round(target_h / MULTIPLE_OF) * MULTIPLE_OF
    final_w = max(MIN_DIM, min(MAX_DIM, final_w))
    final_h = max(MIN_DIM, min(MAX_DIM, final_h))
    return image_to_resize.resize((final_w, final_h), Image.LANCZOS)
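# Worked example (illustrative values, not from the original source): a
# 1920x1080 input has aspect ratio ~1.78, above MAX_DIM / MIN_DIM ~1.73, so it
# is center-cropped to 1872x1080 and then resized to 832x480.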
def get_num_frames(duration_seconds: float):
    return 1 + int(np.clip(
        int(round(duration_seconds * FIXED_FPS)),
        MIN_FRAMES_MODEL,
        MAX_FRAMES_MODEL,
    ))
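# For example, the default 3.5 s request becomes round(3.5 * 16) = 56 clipped
# frames, so the pipeline is asked for 57 frames in total.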
def get_duration(
    input_image,
    prompt,
    steps,
    negative_prompt,
    duration_seconds,
    guidance_scale,
    guidance_scale_2,
    seed,
    randomize_seed,
    progress,
):
    # Heuristic GPU-time estimate (seconds) for the @spaces.GPU scheduler:
    # scale a baseline per-step cost by the workload's frames * height * width
    # relative to a reference run, plus a fixed 10 s overhead.
    BASE_FRAMES_HEIGHT_WIDTH = 81 * 832 * 624
    BASE_STEP_DURATION = 15
    width, height = resize_image(input_image).size
    frames = get_num_frames(duration_seconds)
    factor = frames * width * height / BASE_FRAMES_HEIGHT_WIDTH
    step_duration = BASE_STEP_DURATION * factor ** 1.5
    return 10 + int(steps) * step_duration
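# Rough example with assumed values: an 832x480, 57-frame, 6-step run gives
# factor ~0.54, so the estimate is 10 + 6 * 15 * 0.54**1.5, about 46 seconds.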
@spaces.GPU(duration=get_duration)
def generate_video(
    input_image,
    prompt,
    steps=4,
    negative_prompt=default_negative_prompt,
    duration_seconds=MAX_DURATION,
    guidance_scale=1,
    guidance_scale_2=1,
    seed=42,
    randomize_seed=False,
    progress=gr.Progress(track_tqdm=True),
):
    """Run the pipeline on the (resized) input image and export an MP4.

    Returns the video file path and the seed that was actually used.
    """
    if input_image is None:
        raise gr.Error("์ด๋ฏธ์ง€๋ฅผ ์—…๋กœ๋“œํ•ด์ฃผ์„ธ์š”.")  # "Please upload an image."
    num_frames = get_num_frames(duration_seconds)
    current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
    resized_image = resize_image(input_image)
    output_frames_list = pipe(
        image=resized_image,
        prompt=prompt,
        negative_prompt=negative_prompt,
        height=resized_image.height,
        width=resized_image.width,
        num_frames=num_frames,
        guidance_scale=float(guidance_scale),
        guidance_scale_2=float(guidance_scale_2),
        num_inference_steps=int(steps),
        generator=torch.Generator(device="cuda").manual_seed(current_seed),
    ).frames[0]
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
        video_path = tmpfile.name
    export_to_video(output_frames_list, video_path, fps=FIXED_FPS)
    return video_path, current_seed
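# Programmatic usage sketch (hypothetical file name, bypassing the Gradio UI):
#   video_path, used_seed = generate_video(
#       Image.open("input.jpg"), default_prompt_i2v, steps=4)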
# ์„ธ๋ จ๋œ ํ•œ๊ธ€ UI
with gr.Blocks(theme=gr.themes.Soft()) as demo:
gr.Markdown("# ๐ŸŽฌ WAN ๊ธฐ๋ฐ˜ ์ดˆ๊ณ ์† ์ด๋ฏธ์ง€ to ๋น„๋””์˜ค ๋ฌด๋ฃŒ ์ƒ์„ฑ ์˜คํ”ˆ์†Œ์Šค")
gr.Markdown("** WAN 2.2 14B + FAST + ํ•œ๊ธ€ํ™” + ํŠœ๋‹ ** - 4~8๋‹จ๊ณ„๋กœ ๋น ๋ฅธ ์˜์ƒ ์ƒ์„ฑ")
gr.Markdown("** ํŠธ๋ž˜ํ”ฝ ์ œํ•œ์‹œ ๋‹ค์Œ 4๊ฐœ์˜ ๋ฏธ๋Ÿฌ๋ง ์„œ๋ฒ„๋“ค์„ ์ด์šฉํ•˜์—ฌ ๋ถ„์‚ฐ ์‚ฌ์šฉ ๊ถŒ๊ณ ")
    gr.HTML("""
    <div style="display: flex; gap: 10px; flex-wrap: wrap; justify-content: center; margin: 20px 0;">
        <a href="https://huggingface.co/spaces/Heartsync/wan2_2-I2V-14B-FAST" target="_blank">
            <img src="https://img.shields.io/static/v1?label=WAN%202.2%2014B%20FAST%2B&message=Image%20to%20Video&color=%230000ff&labelColor=%23800080&logo=huggingface&logoColor=white&style=for-the-badge" alt="badge">
        </a>
        <a href="https://huggingface.co/spaces/ginipick/wan2_2-I2V-14B-FAST" target="_blank">
            <img src="https://img.shields.io/static/v1?label=WAN%202.2%2014B%20FAST%2B&message=Image%20to%20Video&color=%230000ff&labelColor=%23800080&logo=huggingface&logoColor=white&style=for-the-badge" alt="badge">
        </a>
        <a href="https://huggingface.co/spaces/ginigen/wan2_2-I2V-14B-FAST" target="_blank">
            <img src="https://img.shields.io/static/v1?label=WAN%202.2%2014B%20FAST%2B&message=Image%20to%20Video&color=%230000ff&labelColor=%23800080&logo=huggingface&logoColor=white&style=for-the-badge" alt="badge">
        </a>
        <a href="https://huggingface.co/spaces/VIDraft/wan2_2-I2V-14B-FAST" target="_blank">
            <img src="https://img.shields.io/static/v1?label=WAN%202.2%2014B%20FAST%2B&message=Image%20to%20Video&color=%230000ff&labelColor=%23800080&logo=huggingface&logoColor=white&style=for-the-badge" alt="badge">
        </a>
        <a href="https://discord.gg/openfreeai" target="_blank">
            <img src="https://img.shields.io/static/v1?label=Discord&message=Openfree%20AI&color=%230000ff&labelColor=%23800080&logo=discord&logoColor=white&style=for-the-badge" alt="badge">
        </a>
    </div>
    """)
    with gr.Row():
        with gr.Column(scale=1):
            input_image_component = gr.Image(type="pil", label="์ž…๋ ฅ ์ด๋ฏธ์ง€")
            prompt_input = gr.Textbox(label="ํ”„๋กฌํ”„ํŠธ", value=default_prompt_i2v, lines=2)
            duration_seconds_input = gr.Slider(
                minimum=MIN_DURATION,
                maximum=MAX_DURATION,
                step=0.1,
                value=3.5,
                label="์˜์ƒ ๊ธธ์ด (์ดˆ)",
            )
            with gr.Accordion("๊ณ ๊ธ‰ ์„ค์ •", open=False):
                negative_prompt_input = gr.Textbox(label="๋„ค๊ฑฐํ‹ฐ๋ธŒ ํ”„๋กฌํ”„ํŠธ", value=default_negative_prompt, lines=2)
                steps_slider = gr.Slider(minimum=1, maximum=30, step=1, value=6, label="์ƒ์„ฑ ๋‹จ๊ณ„")
                guidance_scale_input = gr.Slider(minimum=0.0, maximum=10.0, step=0.5, value=1, label="๊ฐ€์ด๋˜์Šค ์Šค์ผ€์ผ 1")
                guidance_scale_2_input = gr.Slider(minimum=0.0, maximum=10.0, step=0.5, value=1, label="๊ฐ€์ด๋˜์Šค ์Šค์ผ€์ผ 2")
                seed_input = gr.Slider(label="์‹œ๋“œ", minimum=0, maximum=MAX_SEED, step=1, value=42)
                randomize_seed_checkbox = gr.Checkbox(label="๋žœ๋ค ์‹œ๋“œ ์‚ฌ์šฉ", value=True)
            generate_button = gr.Button("๐ŸŽฅ ์˜์ƒ ์ƒ์„ฑ", variant="primary", size="lg")
        with gr.Column(scale=1):
            video_output = gr.Video(label="์ƒ์„ฑ๋œ ์˜์ƒ", autoplay=True, interactive=False)
    # Order must match the positional parameters of generate_video.
    ui_inputs = [
        input_image_component, prompt_input, steps_slider,
        negative_prompt_input, duration_seconds_input,
        guidance_scale_input, guidance_scale_2_input, seed_input, randomize_seed_checkbox,
    ]
    generate_button.click(fn=generate_video, inputs=ui_inputs, outputs=[video_output, seed_input])
    gr.Examples(
        examples=[
            [
                "wan_i2v_input.JPG",
                "POV ์…€์นด ์˜์ƒ, ์„ ๊ธ€๋ผ์Šค ๋‚€ ํฐ ๊ณ ์–‘์ด๊ฐ€ ์„œํ•‘๋ณด๋“œ์— ์„œ์„œ ํŽธ์•ˆํ•œ ๋ฏธ์†Œ. ๋ฐฐ๊ฒฝ์— ์—ด๋Œ€ ํ•ด๋ณ€(๋ง‘์€ ๋ฌผ, ๋…น์ƒ‰ ์–ธ๋•, ๊ตฌ๋ฆ„ ๋‚€ ํ‘ธ๋ฅธ ํ•˜๋Š˜). ์„œํ•‘๋ณด๋“œ๊ฐ€ ๊ธฐ์šธ์–ด์ง€๊ณ  ๊ณ ์–‘์ด๊ฐ€ ๋ฐ”๋‹ค๋กœ ๋–จ์–ด์ง€๋ฉฐ ์นด๋ฉ”๋ผ๊ฐ€ ๊ฑฐํ’ˆ๊ณผ ํ–‡๋น›๊ณผ ํ•จ๊ป˜ ๋ฌผ์†์œผ๋กœ ๋น ์ง. ์ž ๊น ๋ฌผ์†์—์„œ ๊ณ ์–‘์ด ์–ผ๊ตด ๋ณด์ด๋‹ค๊ฐ€ ๋‹ค์‹œ ์ˆ˜๋ฉด ์œ„๋กœ ์˜ฌ๋ผ์™€ ์…€์นด ์ดฌ์˜ ๊ณ„์†, ์ฆ๊ฑฐ์šด ์—ฌ๋ฆ„ ํœด๊ฐ€ ๋ถ„์œ„๊ธฐ.",
                4,
            ],
            [
                "wan22_input_2.jpg",
                "์„ธ๋ จ๋œ ๋‹ฌ ํƒ์‚ฌ ์ฐจ๋Ÿ‰์ด ์™ผ์ชฝ์—์„œ ์˜ค๋ฅธ์ชฝ์œผ๋กœ ๋ฏธ๋„๋Ÿฌ์ง€๋“ฏ ์ด๋™ํ•˜๋ฉฐ ๋‹ฌ ๋จผ์ง€๋ฅผ ์ผ์œผํ‚ด. ํฐ ์šฐ์ฃผ๋ณต์„ ์ž…์€ ์šฐ์ฃผ์ธ๋“ค์ด ๋‹ฌ ํŠน์œ ์˜ ๋›ฐ๋Š” ๋™์ž‘์œผ๋กœ ํƒ‘์Šน. ๋จผ ๋ฐฐ๊ฒฝ์—์„œ VTOL ๋น„ํ–‰์ฒด๊ฐ€ ์ˆ˜์ง์œผ๋กœ ํ•˜๊ฐ•ํ•˜์—ฌ ํ‘œ๋ฉด์— ์กฐ์šฉํžˆ ์ฐฉ๋ฅ™. ์žฅ๋ฉด ์ „์ฒด์— ๊ฑธ์ณ ์ดˆํ˜„์‹ค์ ์ธ ์˜ค๋กœ๋ผ๊ฐ€ ๋ณ„์ด ๊ฐ€๋“ํ•œ ํ•˜๋Š˜์„ ๊ฐ€๋กœ์ง€๋ฅด๋ฉฐ ์ถค์ถ”๊ณ , ๋…น์ƒ‰, ํŒŒ๋ž€์ƒ‰, ๋ณด๋ผ์ƒ‰ ๋น›์˜ ์ปคํŠผ์ด ๋‹ฌ ํ’๊ฒฝ์„ ์‹ ๋น„๋กญ๊ณ  ๋งˆ๋ฒ• ๊ฐ™์€ ๋น›์œผ๋กœ ๊ฐ์Œˆ.",
                4,
            ],
            [
                "kill_bill.jpeg",
                "์šฐ๋งˆ ์„œ๋จผ์˜ ์บ๋ฆญํ„ฐ ๋ฒ ์•„ํŠธ๋ฆญ์Šค ํ‚ค๋„๊ฐ€ ์˜ํ™” ๊ฐ™์€ ์กฐ๋ช… ์†์—์„œ ๋‚ ์นด๋กœ์šด ์นดํƒ€๋‚˜ ๊ฒ€์„ ์•ˆ์ •์ ์œผ๋กœ ๋“ค๊ณ  ์žˆ์Œ. ๊ฐ‘์ž๊ธฐ ๊ด‘ํƒ ๋‚˜๋Š” ๊ฐ•์ฒ ์ด ๋ถ€๋“œ๋Ÿฌ์›Œ์ง€๊ณ  ์™œ๊ณก๋˜๊ธฐ ์‹œ์ž‘ํ•˜๋ฉฐ ๊ฐ€์—ด๋œ ๊ธˆ์†์ฒ˜๋Ÿผ ๊ตฌ์กฐ์  ์™„์ „์„ฑ์„ ์žƒ๊ธฐ ์‹œ์ž‘. ๊ฒ€๋‚ ์˜ ์™„๋ฒฝํ•œ ๋์ด ์ฒœ์ฒœํžˆ ํœ˜์–ด์ง€๊ณ  ๋Š˜์–ด์ง€๋ฉฐ, ๋…น์€ ๊ฐ•์ฒ ์ด ์€๋น› ๋ฌผ์ค„๊ธฐ๋กœ ์•„๋ž˜๋กœ ํ˜๋Ÿฌ๋‚ด๋ฆผ. ๋ณ€ํ˜•์€ ์ฒ˜์Œ์—๋Š” ๋ฏธ๋ฌ˜ํ•˜๊ฒŒ ์‹œ์ž‘๋˜๋‹ค๊ฐ€ ๊ธˆ์†์ด ์ ์  ๋” ์œ ๋™์ ์ด ๋˜๋ฉด์„œ ๊ฐ€์†ํ™”. ์นด๋ฉ”๋ผ๋Š” ๊ทธ๋…€์˜ ์–ผ๊ตด์„ ๊ณ ์ •ํ•˜๊ณ  ๋‚ ์นด๋กœ์šด ๋ˆˆ๋น›์ด ์ ์ฐจ ์ข์•„์ง€๋Š”๋ฐ, ์น˜๋ช…์ ์ธ ์ง‘์ค‘์ด ์•„๋‹ˆ๋ผ ๋ฌด๊ธฐ๊ฐ€ ๋ˆˆ์•ž์—์„œ ๋…น๋Š” ๊ฒƒ์„ ๋ณด๋ฉฐ ํ˜ผ๋ž€๊ณผ ๊ฒฝ์•…. ํ˜ธํก์ด ์•ฝ๊ฐ„ ๋นจ๋ผ์ง€๋ฉฐ ์ด ๋ถˆ๊ฐ€๋Šฅํ•œ ๋ณ€ํ˜•์„ ๋ชฉ๊ฒฉ. ๋…น๋Š” ํ˜„์ƒ์ด ๊ฐ•ํ™”๋˜๊ณ  ์นดํƒ€๋‚˜์˜ ์™„๋ฒฝํ•œ ํ˜•ํƒœ๊ฐ€ ์ ์  ์ถ”์ƒ์ ์ด ๋˜๋ฉฐ ์†์—์„œ ์ˆ˜์€์ฒ˜๋Ÿผ ๋–จ์–ด์ง. ๋…น์€ ๋ฐฉ์šธ์ด ๋ถ€๋“œ๋Ÿฌ์šด ๊ธˆ์† ์ถฉ๊ฒฉ์Œ๊ณผ ํ•จ๊ป˜ ๋ฐ”๋‹ฅ์— ๋–จ์–ด์ง. ํ‘œ์ •์ด ์ฐจ๋ถ„ํ•œ ์ค€๋น„์—์„œ ๋‹นํ˜น๊ฐ๊ณผ ์šฐ๋ ค๋กœ ๋ฐ”๋€Œ๋ฉฐ ์ „์„ค์ ์ธ ๋ณต์ˆ˜์˜ ๋„๊ตฌ๊ฐ€ ์†์—์„œ ๋ฌธ์ž ๊ทธ๋Œ€๋กœ ์•กํ™”๋˜์–ด ๋ฌด๋ฐฉ๋น„ ์ƒํƒœ๊ฐ€ ๋จ.",
                6,
            ],
        ],
        inputs=[input_image_component, prompt_input, steps_slider],
        outputs=[video_output, seed_input],
        fn=generate_video,
        cache_examples="lazy",
    )
if __name__ == "__main__":
    demo.queue().launch(mcp_server=True)