#!/usr/bin/env python

from __future__ import annotations

import os
import warnings
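# The GroundedSAM experts ship as local source trees, so they are installed in
# editable mode at startup -- a common pattern on Hugging Face Spaces, where
# local packages cannot always be pre-installed at build time.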
os.system("python -m pip install -e Make-A-Protagonist/experts/GroundedSAM/segment_anything")
os.system("python -m pip install -e Make-A-Protagonist/experts/GroundedSAM/GroundingDINO")

warnings.filterwarnings("ignore")

import gradio as gr

from inference import InferencePipeline

class InferenceUtil:

    def __init__(self, hf_token: str | None):
        self.hf_token = hf_token

    def load_model_info(self, model_id: str) -> tuple[str, str]:
        """Read the protagonist and the training prompt from the repo's model card."""
        # TODO: the model card lives in the README of the Hugging Face repo;
        # document how it should be written.
        try:
            card = InferencePipeline.get_model_card(model_id, self.hf_token)
        except Exception:
            return '', ''
        protagonist = getattr(card.data, 'protagonist', '')
        training_prompt = getattr(card.data, 'training_prompt', '')
        return protagonist, training_prompt
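
# A minimal sketch of the model-card metadata that load_model_info expects:
# custom keys in the YAML front matter of the repo's README.md, which
# huggingface_hub parses into `card.data`. The values below are a hypothetical
# example, not copied from the actual repos:
#
#   ---
#   protagonist: man
#   training_prompt: A man is playing basketball
#   ---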

HF_TOKEN = os.getenv('HF_TOKEN')

pipe = InferencePipeline(HF_TOKEN)
app = InferenceUtil(HF_TOKEN)

with gr.Blocks(css='style.css') as demo:
    gr.HTML(
        """
        <div style="text-align: center; max-width: 1200px; margin: 20px auto;">
            <h1 style="font-weight: 900; font-size: 2rem; margin: 0rem">
                Make-A-Protagonist:
                <br>
                Generic Video Editing with An Ensemble of Experts
            </h1>
            <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
                <a href="https://yuyangzhao.com">Yuyang Zhao</a><sup>1</sup>
                <a href="https://xieenze.github.io/">Enze Xie</a><sup>2</sup>
                <a href="https://scholar.google.com.sg/citations?user=2p7x6OUAAAAJ&hl=en">Lanqing Hong</a><sup>2</sup>
                <a href="https://scholar.google.com.sg/citations?user=XboZC1AAAAAJ&hl=en">Zhenguo Li</a><sup>2</sup>
                <a href="https://www.comp.nus.edu.sg/~leegh/">Gim Hee Lee</a><sup>1</sup>
            </h2>
            <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
                <sup>1</sup>National University of Singapore
                <sup>2</sup>Huawei Noah's Ark Lab
            </h2>
            <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
                <!-- arXiv link -->
                <span class="link-block">
                    [<a href="https://arxiv.org/abs/2305.08850" target="_blank"
                        class="external-link">
                        <span class="icon"><i class="ai ai-arxiv"></i></span>
                        <span>arXiv</span>
                    </a>]
                </span>
                <!-- GitHub link -->
                <span class="link-block">
                    [<a href="https://github.com/Make-A-Protagonist/Make-A-Protagonist" target="_blank"
                        class="external-link">
                        <span class="icon"><i class="fab fa-github"></i></span>
                        <span>Code</span>
                    </a>]
                </span>
                <!-- Homepage link -->
                <span class="link-block">
                    [<a href="https://make-a-protagonist.github.io/" target="_blank"
                        class="external-link">
                        <span class="icon"><i class="fab fa-github"></i></span>
                        <span>Homepage</span>
                    </a>]
                </span>
            </h2>
            <h2 style="font-weight: 450; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
                TL;DR: The first framework for generic video editing with both visual and textual clues.
            </h2>
        </div>
        """)
    with gr.Row():
        with gr.Column():
            with gr.Box():
                model_id = gr.Dropdown(
                    label='Model ID',
                    choices=[
                        'Make-A-Protagonist/ikun',
                        'Make-A-Protagonist/huaqiang',
                        'Make-A-Protagonist/yanzi',
                        'Make-A-Protagonist/car-turn',
                    ],
                    value='Make-A-Protagonist/ikun')
                with gr.Row():
                    protagonist_used_for_training = gr.Textbox(
                        label='Protagonist', interactive=False, value='man')
                    prompt_used_for_training = gr.Textbox(
                        label='Training prompt', interactive=False,
                        value='A man is playing basketball')
            with gr.Box():
                ref_image = gr.Image(label='Reference Image', type='pil',
                                     visible=True).style(height="auto")
                ref_pro_prompt = gr.Textbox(label='Reference Image Protagonist Prompt',
                                            max_lines=1,
                                            placeholder='Example: "man"')
            prompt = gr.Textbox(label='Prompt',
                                max_lines=1,
                                placeholder='Example: "A panda is surfing"')
            video_length = gr.Slider(label='Video length',
                                     minimum=4,
                                     maximum=8,
                                     step=1,
                                     value=8)
            fps = gr.Slider(label='FPS',
                            minimum=1,
                            maximum=8,
                            step=1,
                            value=4)
            seed = gr.Slider(label='Seed',
                             minimum=0,
                             maximum=100000,
                             step=1,
                             value=0)
            with gr.Accordion('ControlNet Parameters', open=True):
                control_pose = gr.Slider(label='Pose',
                                         minimum=0,
                                         maximum=1,
                                         step=0.1,
                                         value=0.5)
                control_depth = gr.Slider(label='Depth',
                                          minimum=0,
                                          maximum=1,
                                          step=0.1,
                                          value=0.5)
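            # The two sliders below act as binary switches (min 0, max 1,
            # step 1): presumably, Source Protagonist=1 keeps the original
            # protagonist while the background is edited, and Source
            # Background=1 keeps the original background while the
            # protagonist is swapped.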
            with gr.Accordion('Editing Function', open=True):
                with gr.Row():
                    source_pro = gr.Slider(label='Source Protagonist',
                                           minimum=0,
                                           maximum=1,
                                           step=1,
                                           value=0)
                    source_bg = gr.Slider(label='Source Background',
                                          minimum=0,
                                          maximum=1,
                                          step=1,
                                          value=0)
            with gr.Accordion('Other Parameters', open=False):
                num_steps = gr.Slider(label='Number of Steps',
                                      minimum=0,
                                      maximum=100,
                                      step=1,
                                      value=50)
                guidance_scale = gr.Slider(label='CFG Scale',
                                           minimum=0,
                                           maximum=50,
                                           step=0.1,
                                           value=12.5)
                noise_level = gr.Slider(label='Noise Level',
                                        minimum=0,
                                        maximum=999,
                                        step=1,
                                        value=0)

            run_button = gr.Button('Generate')

            gr.Markdown('''
            - The first run takes a few minutes to download the model.
            - Loading the model and running the DDIM inversion takes about one minute.
            ''')
        with gr.Column():
            result = gr.Video(label='Result')

    with gr.Row():
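        # Each example row follows the order of the `inputs` list passed to
        # gr.Examples below: [model_id, prompt, video_length, fps, seed,
        # num_steps, guidance_scale, ref_image, ref_pro_prompt, noise_level,
        # control_pose, control_depth, source_pro, source_bg].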
        examples = [
            [
                'Make-A-Protagonist/ikun',
                'A man is playing basketball on the beach, anime style.',
                8, 4, 33, 50, 12.5,
                'data/ikun/reference_images/zhongli.jpg',
                'man',
                0, 0.5, 0.5, 0, 0,
            ],
            [
                'Make-A-Protagonist/huaqiang',
                'Elon Musk walking down the street.',
                8, 4, 33, 50, 12.5,
                'data/huaqiang/reference_images/musk.jpg',
                'man',
                0, 0.5, 0.5, 0, 1,
            ],
            [
                'Make-A-Protagonist/yanzi',
                'A panda walking down the snowy street.',
                8, 4, 33, 50, 12.5,
                'data/yanzi/reference_images/panda.jpeg',
                'panda',
                0, 0.5, 0.5, 0, 0,
            ],
            [
                'Make-A-Protagonist/car-turn',
                'A car moving in the desert.',
                8, 4, 33, 50, 12.5,
                'data/car-turn/reference_images/audi.jpeg',
                'car',
                0, 0.0, 1.0, 0, 0,
            ],
            [
                'Make-A-Protagonist/car-turn',
                'A Suzuki Jimny driving down a mountain road in the rain.',
                8, 4, 33, 50, 12.5,
                'data/car-turn/images/0000.jpg',
                'car',
                0, 0.0, 1.0, 1, 0,
            ],
        ]
        gr.Examples(examples=examples,
                    inputs=[
                        model_id,
                        prompt,
                        video_length,
                        fps,
                        seed,
                        num_steps,
                        guidance_scale,
                        ref_image,
                        ref_pro_prompt,
                        noise_level,
                        control_pose,
                        control_depth,
                        source_pro,
                        source_bg,
                    ],
                    outputs=result,
                    fn=pipe.run,
                    cache_examples=os.getenv('SYSTEM') == 'spaces')

    model_id.change(fn=app.load_model_info,
                    inputs=model_id,
                    outputs=[
                        protagonist_used_for_training,
                        prompt_used_for_training,
                    ])
    inputs = [
        model_id,
        prompt,
        video_length,
        fps,
        seed,
        num_steps,
        guidance_scale,
        ref_image,
        ref_pro_prompt,
        noise_level,
        control_pose,
        control_depth,
        source_pro,
        source_bg,
    ]
    prompt.submit(fn=pipe.run, inputs=inputs, outputs=result)
    run_button.click(fn=pipe.run, inputs=inputs, outputs=result)
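
# queue() serializes requests so only one generation runs at a time, which is
# what a single-GPU deployment needs. share=True requests a public gradio.live
# link when the script is run locally; on Hugging Face Spaces it has no effect
# and the app is served at the Space URL.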
demo.queue().launch(share=True)