import torch
import gradio as gr
import numpy as np
import imageio.v3 as iio
from PIL import Image
from diffusers import DiffusionPipeline

# Note: this base checkpoint was trained for 14-frame clips; the
# "img2vid-xt" variant targets 25 frames.
MODEL_ID = "stabilityai/stable-video-diffusion-img2vid"

# CPU/float32 keeps the demo portable; on a GPU, device="cuda" with
# dtype=torch.float16 is dramatically faster.
device = "cpu"
dtype = torch.float32

pipe = DiffusionPipeline.from_pretrained(
    MODEL_ID,
    torch_dtype=dtype,
    # The fp16 weight variant only exists for half precision.
    variant="fp16" if dtype == torch.float16 else None,
)
pipe.to(device)


def resize_to_multiple_of_8(img, max_side=1024):
    """Downscale so the longest side is at most max_side, then snap both
    dimensions down to multiples of 8, which the pipeline requires."""
    w, h = img.size
    scale = min(max_side / max(w, h), 1.0)
    new_w = int(np.floor(w * scale / 8) * 8)
    new_h = int(np.floor(h * scale / 8) * 8)
    return img.convert("RGB").resize((new_w, new_h), Image.LANCZOS)


def generate_video(image, motion=50, noise=0.1, num_frames=25, fps=8, seed=0):
    if image is None:
        raise gr.Error("Please upload an image.")
    image = resize_to_multiple_of_8(image)

    generator = torch.Generator(device=device)
    if seed:  # seed == 0 leaves the generator unseeded (random)
        # gr.Number yields a float; manual_seed expects an int.
        generator.manual_seed(int(seed))

    # autocast with float32 on CPU is a no-op (and warns), so just run
    # the pipeline under inference_mode instead.
    with torch.inference_mode():
        result = pipe(
            image,
            # Pass the resized dimensions explicitly; otherwise the
            # pipeline resizes to its 1024x576 default and the helper
            # above has no effect.
            height=image.height,
            width=image.width,
            num_frames=num_frames,
            fps=fps,  # micro-conditioning signal, not the output frame rate
            motion_bucket_id=motion,
            noise_aug_strength=noise,
            generator=generator,
        )

    # result.frames is batched: frames[0] holds the list of PIL frames
    # for our single input image.
    frames = [np.array(f.convert("RGB")) for f in result.frames[0]]
    iio.imwrite("out.mp4", np.stack(frames), fps=fps, codec="libx264")
    return "out.mp4"


demo = gr.Interface(
    fn=generate_video,
    inputs=[
        gr.Image(label="Input Image", type="pil"),
        gr.Slider(1, 255, value=50, step=1, label="Motion strength"),
        gr.Slider(0.0, 0.3, value=0.1, label="Noise strength"),
        gr.Slider(8, 25, value=25, step=1, label="Frames"),
        gr.Slider(5, 30, value=8, step=1, label="FPS"),
        gr.Number(value=0, label="Seed (0 = random)"),
    ],
    outputs=gr.Video(label="Generated Video"),
    title="Stable Video Diffusion (Image → Video)",
    description="Generate short (~3-second) video clips from a single image using Stability AI's open model.",
)

if __name__ == "__main__":
    demo.launch()
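
# A minimal programmatic usage sketch, bypassing the Gradio UI. Assumptions
# (not part of the app above): an "input.jpg" exists next to the script and
# the model weights are already downloaded. Left commented out because
# demo.launch() blocks when the script is run directly:
#
#   from PIL import Image
#   img = Image.open("input.jpg")
#   path = generate_video(img, motion=120, noise=0.05, num_frames=14, fps=7, seed=42)
#   print("video written to", path)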