import spaces
import gradio as gr
import torch
import argparse
from ovi.ovi_fusion_engine import OviFusionEngine, DEFAULT_CONFIG
from diffusers import FluxPipeline
import tempfile
from ovi.utils.io_utils import save_video
from ovi.utils.processing_utils import clean_text, scale_hw_to_area_divisible
from huggingface_hub import snapshot_download
import os

# ----------------------------
# Parse CLI Args
# ----------------------------
parser = argparse.ArgumentParser(description="Ovi Joint Video + Audio Gradio Demo")
parser.add_argument(
    "--use_image_gen",
    action="store_true",
    help="Enable image generation UI with FluxPipeline"
)
parser.add_argument(
    "--cpu_offload",
    action="store_true",
    help="Enable CPU offload for both OviFusionEngine and FluxPipeline"
)
args = parser.parse_args()

ckpt_dir = "./ckpts"

# Wan2.2
wan_dir = os.path.join(ckpt_dir, "Wan2.2-TI2V-5B")
snapshot_download(
    repo_id="Wan-AI/Wan2.2-TI2V-5B",
    local_dir=wan_dir,
    allow_patterns=[
        "google/*",
        "models_t5_umt5-xxl-enc-bf16.pth",
        "Wan2.2_VAE.pth"
    ]
)

# MMAudio
mm_audio_dir = os.path.join(ckpt_dir, "MMAudio")
snapshot_download(
    repo_id="hkchengrex/MMAudio",
    local_dir=mm_audio_dir,
    allow_patterns=[
        "ext_weights/best_netG.pt",
        "ext_weights/v1-16.pth"
    ]
)

# Ovi
ovi_dir = os.path.join(ckpt_dir, "Ovi")
snapshot_download(
    repo_id="chetwinlow1/Ovi",
    local_dir=ovi_dir,
    allow_patterns=[
        "model.safetensors"
    ]
)
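
# Expected layout under ./ckpts after the downloads above (paths follow the
# allow_patterns; verify against the hosted repos):
#   ckpts/
#     Wan2.2-TI2V-5B/  google/*, models_t5_umt5-xxl-enc-bf16.pth, Wan2.2_VAE.pth
#     MMAudio/         ext_weights/best_netG.pt, ext_weights/v1-16.pth
#     Ovi/             model.safetensors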

# Initialize OviFusionEngine
enable_cpu_offload = args.cpu_offload or args.use_image_gen
use_image_gen = args.use_image_gen
print(f"loading model... {enable_cpu_offload=}, {use_image_gen=} for gradio demo")
DEFAULT_CONFIG['cpu_offload'] = enable_cpu_offload  # always use cpu offload if image generation is enabled
DEFAULT_CONFIG['mode'] = "t2v"  # hardcoded since it is always cpu offloaded
ovi_engine = OviFusionEngine()
flux_model = None
if use_image_gen:
    flux_model = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-Krea-dev", torch_dtype=torch.bfloat16)
    flux_model.enable_model_cpu_offload()  # save some VRAM by offloading the model to CPU. Remove this if you have enough GPU VRAM.
print("loaded model")


@spaces.GPU()
def generate_video(
    text_prompt,
    image,
    video_frame_height,
    video_frame_width,
    video_seed,
    solver_name,
    sample_steps,
    shift,
    video_guidance_scale,
    audio_guidance_scale,
    slg_layer,
    video_negative_prompt,
    audio_negative_prompt,
):
    try:
        image_path = None
        if image is not None:
            image_path = image

        generated_video, generated_audio, _ = ovi_engine.generate(
            text_prompt=text_prompt,
            image_path=image_path,
            video_frame_height_width=[video_frame_height, video_frame_width],
            seed=video_seed,
            solver_name=solver_name,
            sample_steps=sample_steps,
            shift=shift,
            video_guidance_scale=video_guidance_scale,
            audio_guidance_scale=audio_guidance_scale,
            slg_layer=slg_layer,
            video_negative_prompt=video_negative_prompt,
            audio_negative_prompt=audio_negative_prompt,
        )

        tmpfile = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
        output_path = tmpfile.name
        save_video(output_path, generated_video, generated_audio, fps=24, sample_rate=16000)

        return output_path
    except Exception as e:
        print(f"Error during video generation: {e}")
        return None


def generate_image(text_prompt, image_seed, image_height, image_width):
    if flux_model is None:
        return None

    text_prompt = clean_text(text_prompt)
    print(f"Generating image with prompt='{text_prompt}', seed={image_seed}, size=({image_height},{image_width})")
    image_h, image_w = scale_hw_to_area_divisible(image_height, image_width, area=1024 * 1024)
    image = flux_model(
        text_prompt,
        height=image_h,
        width=image_w,
        guidance_scale=4.5,
        generator=torch.Generator().manual_seed(int(image_seed))
    ).images[0]

    tmpfile = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
    image.save(tmpfile.name)
    return tmpfile.name


# Build UI
with gr.Blocks() as demo:
    gr.Markdown("# 🎥 Ovi Joint Video + Audio Generation Demo")
    gr.Markdown(
        """
## 📘 Instructions

Follow the steps in order:

1️⃣ **Enter a Text Prompt** — describe your video. (This text prompt will be shared for image generation if enabled.)

2️⃣ **Upload or Generate an Image** — upload an image, or generate one if image generation is enabled. (If you do not see the image generation options, make sure to run the script with `--use_image_gen`.)

3️⃣ **Configure Video Options** — set resolution, seed, solver, and other parameters. (The uploaded or generated image shown on screen at the time of video generation is automatically used as the first frame.)

4️⃣ **Generate Video** — click the button to produce your final video with audio.

5️⃣ **View the Result** — your generated video will appear below.

---

### 💡 Tips

1. For best results, use detailed and specific text prompts.
2. Ensure the text prompt format is correct, i.e. speech to be spoken should be wrapped in `...`. An optional audio description can be appended at the end, wrapped in `...`; refer to the examples.
3. Do not be discouraged by bad or weird results; check the prompt format and try different seeds, CFG values, and SLG layers.
""" ) with gr.Row(): with gr.Column(): # Image section image = gr.Image(type="filepath", label="First Frame Image (upload or generate)") if args.use_image_gen: with gr.Accordion("🖼️ Image Generation Options", visible=True): image_text_prompt = gr.Textbox(label="Image Prompt", placeholder="Describe the image you want to generate...") image_seed = gr.Number(minimum=0, maximum=100000, value=42, label="Image Seed") image_height = gr.Number(minimum=128, maximum=1280, value=720, step=32, label="Image Height") image_width = gr.Number(minimum=128, maximum=1280, value=1280, step=32, label="Image Width") gen_img_btn = gr.Button("Generate Image 🎨") else: gen_img_btn = None with gr.Accordion("🎬 Video Generation Options", open=True): video_text_prompt = gr.Textbox(label="Video Prompt", placeholder="Describe your video...") video_height = gr.Number(minimum=128, maximum=1280, value=512, step=32, label="Video Height") video_width = gr.Number(minimum=128, maximum=1280, value=992, step=32, label="Video Width") video_seed = gr.Number(minimum=0, maximum=100000, value=100, label="Video Seed") solver_name = gr.Dropdown( choices=["unipc", "euler", "dpm++"], value="unipc", label="Solver Name" ) sample_steps = gr.Number( value=50, label="Sample Steps", precision=0, minimum=20, maximum=100 ) shift = gr.Slider(minimum=0.0, maximum=20.0, value=5.0, step=1.0, label="Shift") video_guidance_scale = gr.Slider(minimum=0.0, maximum=10.0, value=4.0, step=0.5, label="Video Guidance Scale") audio_guidance_scale = gr.Slider(minimum=0.0, maximum=10.0, value=3.0, step=0.5, label="Audio Guidance Scale") slg_layer = gr.Number(minimum=-1, maximum=30, value=11, step=1, label="SLG Layer") video_negative_prompt = gr.Textbox(label="Video Negative Prompt", placeholder="Things to avoid in video") audio_negative_prompt = gr.Textbox(label="Audio Negative Prompt", placeholder="Things to avoid in audio") run_btn = gr.Button("Generate Video 🚀") with gr.Column(): output_path = gr.Video(label="Generated Video") if args.use_image_gen and gen_img_btn is not None: gen_img_btn.click( fn=generate_image, inputs=[image_text_prompt, image_seed, image_height, image_width], outputs=[image], ) # Hook up video generation run_btn.click( fn=generate_video, inputs=[ video_text_prompt, image, video_height, video_width, video_seed, solver_name, sample_steps, shift, video_guidance_scale, audio_guidance_scale, slg_layer, video_negative_prompt, audio_negative_prompt, ], outputs=[output_path], ) if __name__ == "__main__": demo.launch(share=True)