Spaces:
Runtime error
Runtime error
| import argparse | |
| import os | |
| import tempfile | |
| import gradio as gr | |
| import numpy as np | |
| import torch | |
| from glob import glob | |
| from torchvision.transforms import CenterCrop, Compose, Resize | |
| from gradio_utils.camera_utils import CAMERA_MOTION_MODE, process_camera, create_relative | |
| from gradio_utils.utils import vis_camera | |
| from gradio_utils.motionctrl_cmcm_gradio import build_model, motionctrl_sample | |
# Tell the Intel OpenMP runtime to tolerate duplicate library loads instead of aborting.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
# Empty string when the variable is absent (presumably set by the hosting Space runtime).
SPACE_ID = os.environ.get('SPACE_ID', '')

#### Description ####
# HTML / Markdown snippets rendered at the top and bottom of the demo page.
title = r"""<h1 align="center">MotionCtrl: A Unified and Flexible Motion Controller for Video Generation</h1>"""
subtitle = r"""<h2 align="center">Deployed on SVD Generation</h2>"""
important_link = r"""
<div align='center'>
<a href='https://wzhouxiff.github.io/projects/MotionCtrl/assets/paper/MotionCtrl.pdf'>[Paper]</a>
  <a href='https://wzhouxiff.github.io/projects/MotionCtrl/'>[Project Page]</a>
  <a href='https://github.com/TencentARC/MotionCtrl'>[Code]</a>
  <a href='https://github.com/TencentARC/MotionCtrl/blob/svd/doc/showcase_svd.md'>[Showcases]</a>
  <a href='https://github.com/TencentARC/MotionCtrl/blob/svd/doc/tutorial.md'>[Tutorial]</a>
</div>
"""
description = r"""
<b>Official Gradio demo</b> for <a href='https://github.com/TencentARC/MotionCtrl' target='_blank'><b>MotionCtrl: A Unified and Flexible Motion Controller for Video Generation</b></a>.<br>
🔥 MotionCtrl is capable of independently and flexibly controlling the camera motion and object motion of a generated video, with only a unified model.<br>
🤗 Try to control the motion of the generated videos yourself!<br>
❗❗❗ Please note **ONLY** Camera Motion Control in the current version of **MotionCtrl** deployed on **SVD** is available.<br>
❗❗❗ <a href='https://github.com/TencentARC/MotionCtrl/blob/svd/doc/showcase_svd.md' target='_blank'>Showcases</a> and
<a href='https://github.com/TencentARC/MotionCtrl/blob/svd/doc/tutorial.md' target='_blank'>Tutorial</a> can be found
<a href='https://github.com/TencentARC/MotionCtrl/blob/svd/doc/tutorial.md' target='_blank'>here</a>.<br>
"""
# <div>
# <img src="https://raw.githubusercontent.com/TencentARC/MotionCtrl/main/assets/svd/00_ibzz5-dxv2h.gif", width="300">
# <img src="https://raw.githubusercontent.com/TencentARC/MotionCtrl/main/assets/svd/01_5guvn-0x6v2.gif", width="300">
# <img src="https://raw.githubusercontent.com/TencentARC/MotionCtrl/main/assets/svd/12_sn7bz-0hcaf.gif", width="300">
# <img src="https://raw.githubusercontent.com/TencentARC/MotionCtrl/main/assets/svd/13_3lyco-4ru8j.gif", width="300">
# </div>
article = r"""
If MotionCtrl is helpful, please help to ⭐ the <a href='https://github.com/TencentARC/MotionCtrl' target='_blank'>Github Repo</a>. Thanks!
[](https://github.com/TencentARC/MotionCtrl)
---
📝 **Citation**
<br>
If our work is useful for your research, please consider citing:
```bibtex
@inproceedings{wang2023motionctrl,
title={MotionCtrl: A Unified and Flexible Motion Controller for Video Generation},
author={Wang, Zhouxia and Yuan, Ziyang and Wang, Xintao and Chen, Tianshui and Xia, Menghan and Luo, Ping and Shan, Ying},
booktitle={arXiv preprint arXiv:2312.03641},
year={2023}
}
```
📧 **Contact**
<br>
If you have any questions, please feel free to reach out to me at <b>[email protected]</b>.
"""
# Stylesheet for the Gradio page. NOTE(review): `css` is not passed to
# gr.Blocks() anywhere in the visible part of this file — confirm it is used.
css = """
.gradio-container {width: 85% !important}
.gr-monochrome-group {border-radius: 5px !important; border: revert-layer !important; border-width: 2px !important; color: black !important;}
span.svelte-s1r2yt {font-size: 17px !important; font-weight: bold !important; color: #d30f2f !important;}
button {border-radius: 8px !important;}
.add_button {background-color: #4CAF50 !important;}
.remove_button {background-color: #f44336 !important;}
.clear_button {background-color: gray !important;}
.mask_button_group {gap: 10px !important;}
.video {height: 300px !important;}
.image {height: 300px !important;}
.video .wrap.svelte-lcpz3o {display: flex !important; align-items: center !important; justify-content: center !important;}
.video .wrap.svelte-lcpz3o > :first-child {height: 100% !important;}
.margin_center {width: 50% !important; margin: auto !important;}
.jc_center {justify-content: center !important;}
"""
# Unit translation per basic camera move, expressed in the W2C convention.
T_base = [
    [1., 0., 0.],    # +x in W2C: camera moves left
    [-1., 0., 0.],   # -x in W2C: camera moves right
    [0., 1., 0.],    # +y in W2C: camera moves up
    [0., -1., 0.],   # -y in W2C: camera moves down
    [0., 0., 1.],    # +z in W2C: zoom out
    [0., 0., -1.],   # -z in W2C: zoom in
]
radius = 1
n = 16  # number of poses in the placeholder trajectory below
look_at = np.array([0, 0, 0.8]).reshape(3, 1)
T_list = []
base_R = np.eye(3)  # identity rotation shared by every placeholder pose
res = []
res_forsave = []
T_range = 1.8

# Build a straight-line trajectory along +z: identity rotation, with the
# translation growing linearly from 0 towards 2 over the n poses.
for i in range(n):
    R = base_R[:, :3]
    T = np.array([0., 0., 1.]).reshape(3, 1) * (i / n) * 2
    RT = np.hstack((R, T))  # 3x4 [R | T]
    res.append(RT)

# Figure used as the initial value of the camera-pose plot in the UI.
fig = vis_camera(res)
# Labels for the generation modes, resize buttons and customized-pose buttons.
MODE = ["control camera poses", "control object trajectory", "control both camera and object motion"]
RESIZE_MODE = ['Center Crop To 576x1024', 'Keep original spatial ratio']
DIY_MODE = ['Customized Mode 1: First A then B',
            'Customized Mode 2: Both A and B',
            'Customized Mode 3: RAW Camera Poses']

## load default model
num_frames = 14
num_steps = 25
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device {device}")
config = "configs/inference/config_motionctrl_cmcm.yaml"
ckpt = 'checkpoints/motionctrl_svd.ckpt'
if not os.path.exists(ckpt):
    # Fetch the checkpoint in-process rather than shelling out to wget/mkdir/mv:
    # a failed download now raises here instead of surfacing later as a
    # confusing load error, and no external binaries are needed.
    os.makedirs(os.path.dirname(ckpt), exist_ok=True)
    torch.hub.download_url_to_file(
        'https://huggingface.co/TencentARC/MotionCtrl/resolve/main/motionctrl_svd.ckpt?download=true',
        ckpt,
    )
model = build_model(config, ckpt, device, num_frames, num_steps)

# Working resolution; process_input_image() overwrites these globals.
width, height = 1024, 576
traj_list = []
# Camera selection shared (via `global`) by every UI callback below.
camera_dict = {
    "motion": [],
    "mode": "Customized Mode 1: First A then B",  # "First A then B", "Both A and B", "Custom"
    "speed": 1.0,
    "complex": None
}
def fn_vis_camera(camera_args):
    """Plot the configured camera poses and reveal the video-generation widgets.

    Args:
        camera_args: Current content of the camera textbox, forwarded to
            ``process_camera`` together with the global ``camera_dict``.

    Returns:
        A 9-tuple: the pose figure followed by eight ``gr.update`` objects that
        make the Step 3 widgets visible (the video one also clears its value).
    """
    global camera_dict, num_frames, width, height
    poses = process_camera(
        camera_dict, camera_args, num_frames=num_frames, width=width, height=height
    )  # [t, 3, 4]

    # Enlarge the plot scale when any entry of the last column exceeds 1.9.
    largest_offset = np.max(np.abs(poses[:, :, -1]))
    rescale_T = max(1.0, largest_offset / 1.9)
    camera_fig = vis_camera(create_relative(poses), rescale_T=rescale_T)

    return (
        camera_fig,
        *(gr.update(visible=True) for _ in range(6)),
        gr.update(visible=True, value=None),  # generated video: show and clear
        gr.update(visible=True),
    )
def display_camera_info(camera_dict, camera_mode=None):
    """Render the camera selection as the one-line summary shown in the textbox.

    Args:
        camera_dict: Mapping with the keys ``complex``, ``motion``, ``speed``
            and ``mode``.
        camera_mode: Selected camera-motion mode; the ``mode`` entry is only
            appended when it equals ``CAMERA_MOTION_MODE[2]``.

    Returns:
        The summary string.
    """
    # A chosen complex pose takes precedence over the basic motions.
    if camera_dict['complex'] is not None:
        return "".join((
            f"complex : {camera_dict['complex']}. ",
            f"speed : {camera_dict['speed']}. ",
        ))

    pieces = [
        f"motion : {list(camera_dict['motion'])}. ",
        f"speed : {camera_dict['speed']}. ",
    ]
    if camera_mode == CAMERA_MOTION_MODE[2]:
        pieces.append(f"mode : {camera_dict['mode']}. ")
    return "".join(pieces)
def add_camera_motion(camera_motion, camera_mode):
    """Record a basic camera motion in the global ``camera_dict``.

    When ``camera_mode`` is ``CAMERA_MOTION_MODE[2]`` up to two motions are
    accumulated; otherwise (or once two are stored) the list is replaced by the
    new motion alone. Any complex pose selection is cleared.

    Returns:
        The refreshed summary string from ``display_camera_info``.
    """
    global camera_dict
    camera_dict['complex'] = None

    queued = camera_dict['motion']
    may_stack = camera_mode == CAMERA_MOTION_MODE[2] and len(queued) < 2
    if may_stack:
        queued.append(camera_motion)
    else:
        camera_dict['motion'] = [camera_motion]
    return display_camera_info(camera_dict, camera_mode)
def add_complex_camera_motion(camera_motion):
    """Store ``camera_motion`` as the complex pose and return the new summary."""
    global camera_dict
    camera_dict.update(complex=camera_motion)
    summary = display_camera_info(camera_dict)
    return summary
def input_raw_camera_pose(combine_type, camera_mode):
    """Switch to raw-pose entry: prefill the textbox and hide the motion buttons.

    Args:
        combine_type: Value stored into ``camera_dict['mode']``.
        camera_mode: Unused; present because the click handler passes it.

    Returns:
        An 11-tuple of ``gr.update`` objects: the editable textbox prefilled
        with 14 example rows, eight hidden direction buttons, then the speed
        slider and the raw-pose description (both visible).
    """
    global camera_dict
    camera_dict['mode'] = combine_type

    # 14 example rows of 12 numbers each; only the last number changes per row.
    example_poses = (
        '1 0 0 0 0 1 0 0 0 0 1 0\n'
        '1 0 0 0 0 1 0 0 0 0 1 -0.225\n'
        '1 0 0 0 0 1 0 0 0 0 1 -0.45\n'
        '1 0 0 0 0 1 0 0 0 0 1 -0.675\n'
        '1 0 0 0 0 1 0 0 0 0 1 -0.9\n'
        '1 0 0 0 0 1 0 0 0 0 1 -1.125\n'
        '1 0 0 0 0 1 0 0 0 0 1 -1.35\n'
        '1 0 0 0 0 1 0 0 0 0 1 -1.575\n'
        '1 0 0 0 0 1 0 0 0 0 1 -1.8\n'
        '1 0 0 0 0 1 0 0 0 0 1 -2.025\n'
        '1 0 0 0 0 1 0 0 0 0 1 -2.25\n'
        '1 0 0 0 0 1 0 0 0 0 1 -2.475\n'
        '1 0 0 0 0 1 0 0 0 0 1 -2.7\n'
        '1 0 0 0 0 1 0 0 0 0 1 -2.925\n'
    )

    return (
        gr.update(value=example_poses, max_lines=16, interactive=True),
        *(gr.update(visible=False) for _ in range(8)),  # U, D, L, R, I, O, ACW, CW
        gr.update(visible=True),  # speed
        gr.update(visible=True),  # combine3_des
    )
def change_camera_mode(combine_type, camera_mode):
    """Select a customized combine mode and show the basic-motion buttons.

    Args:
        combine_type: Value stored into ``camera_dict['mode']``.
        camera_mode: Forwarded to ``display_camera_info``.

    Returns:
        An 11-tuple: the summary string, eight updates showing the direction
        buttons, one showing the speed slider, and one hiding the raw-pose
        description.
    """
    global camera_dict
    camera_dict['mode'] = combine_type

    summary = display_camera_info(camera_dict, camera_mode)
    return (
        summary,
        *(gr.update(visible=True) for _ in range(8)),  # U, D, L, R, I, O, ACW, CW
        gr.update(visible=True),   # speed
        gr.update(visible=False),  # combine3_des
    )
def change_camera_speed(camera_speed):
    """Store the slider value as the camera speed and return the new summary."""
    global camera_dict
    camera_dict.update(speed=camera_speed)
    summary = display_camera_info(camera_dict)
    return summary
def reset_camera():
    """Rebind the global ``camera_dict`` to its defaults and return the summary."""
    global camera_dict
    camera_dict = dict(
        motion=[],
        mode="Customized Mode 1: First A then B",
        speed=1.0,
        complex=None,
    )
    return display_camera_info(camera_dict)
def visualized_camera_poses(step2_camera_motion):
    """Show the widgets for the chosen camera-motion mode and hide Step 3.

    The camera selection is reset first. ``CAMERA_MOTION_MODE[0]`` shows the
    basic section and the eight direction buttons, ``CAMERA_MOTION_MODE[1]``
    shows the complex section and the eight pose buttons, and any other value
    shows the customized section with the three combine buttons.

    Returns:
        A 39-tuple of ``gr.update`` objects, ordered like the ``outputs`` list
        of the ``camera_info.click`` handler.
    """
    reset_camera()

    basic_selected = step2_camera_motion == CAMERA_MOTION_MODE[0]
    complex_selected = (not basic_selected) and step2_camera_motion == CAMERA_MOTION_MODE[1]
    custom_selected = not (basic_selected or complex_selected)

    def toggle(flag):
        return gr.update(visible=flag)

    return (
        # section headers and their descriptions
        toggle(basic_selected), toggle(basic_selected),
        toggle(custom_selected), toggle(custom_selected),
        toggle(complex_selected), toggle(complex_selected),
        # U, D, L, R, I, O, ACW, CW
        *(toggle(basic_selected) for _ in range(8)),
        # combine1, combine2, combine3
        *(toggle(custom_selected) for _ in range(3)),
        # combine3_des stays hidden until raw-pose mode is picked
        toggle(False),
        # speed slider: hidden only in the customized mode
        toggle(not custom_selected),
        # Pose_1 .. Pose_8
        *(toggle(complex_selected) for _ in range(8)),
        # camera textbox (cleared), reset / visualize buttons, plot (cleared)
        gr.update(visible=True, value=None),
        toggle(True), toggle(True),
        gr.update(visible=True, value=None),
        # Step 3 widgets are hidden again until the camera is visualized
        *(toggle(False) for _ in range(8)),
    )
def process_input_image(input_image, resize_mode):
    """Resize the uploaded image and reset the UI to the start of Step 2.

    Args:
        input_image: Object exposing ``.size`` as ``(width, height)`` and
            accepted by the torchvision transforms (the image widget is
            declared with ``type="pil"``).
        resize_mode: ``RESIZE_MODE[0]`` resizes so the image covers 576x1024
            and center-crops to exactly that size; any other value keeps the
            aspect ratio, fixing the height to 576 for portrait images or the
            width to 1024 otherwise.

    Returns:
        A 44-tuple of ``gr.update`` objects, ordered like the ``outputs`` list
        of the resize buttons: the processed image, the four Step 2 selector
        widgets (shown), then every later widget (hidden).

    Raises:
        gr.Error: If no image has been uploaded yet.

    Side effects:
        Overwrites the module-level ``width`` and ``height``.
    """
    global width, height

    # Without this guard a click before uploading fails on `None.size`.
    if input_image is None:
        raise gr.Error("Please upload an image before selecting a resize mode.")

    if resize_mode == RESIZE_MODE[0]:
        height = 576
        width = 1024
        w, h = input_image.size
        h_ratio = h / height
        w_ratio = w / width
        # Scale along the tighter axis so the image covers the target, then crop.
        if h_ratio > w_ratio:
            h = max(int(h / w_ratio), height)  # guard against int() truncation
            input_image = Resize((h, width))(input_image)
        else:
            w = max(int(w / h_ratio), width)
            input_image = Resize((height, w))(input_image)
        input_image = CenterCrop((height, width))(input_image)
    else:
        w, h = input_image.size
        if h > w:
            height = 576
            width = int(w * height / h)
        else:
            width = 1024
            height = int(h * width / w)
        input_image = Resize((height, width))(input_image)

    return (
        gr.update(visible=True, value=input_image, height=height, width=width),
        # step2 title, step2 description, camera_mode radio, proceed button
        *(gr.update(visible=True) for _ in range(4)),
        # 6 section headers/descriptions, 8 direction buttons, combine1-3,
        # combine3_des, speed, Pose_1..Pose_8
        *(gr.update(visible=False) for _ in range(27)),
        gr.update(visible=False, value=None),                # camera_args
        gr.update(visible=False), gr.update(visible=False),  # camera_reset, camera_vis
        gr.update(visible=False, value=None),                # camera plot
        # Step 3 widgets
        *(gr.update(visible=False) for _ in range(8)),
    )
def model_run(input_image, fps_id, seed, n_samples, camera_args):
    """Generate a video for the processed image and the configured camera poses.

    Args:
        input_image: The processed image from Step 1.
        fps_id: Value of the FPS slider.
        seed: Random seed.
        n_samples: Number of samples to generate.
        camera_args: Content of the camera textbox, forwarded to
            ``process_camera``.

    Returns:
        Whatever ``motionctrl_sample`` returns (used as the video widget value).
    """
    global model, device, camera_dict, num_frames, num_steps, width, height

    poses = process_camera(
        camera_dict, camera_args, num_frames=num_frames, width=width, height=height
    )
    flat_poses = poses.reshape(-1, 12)  # one row of 12 values per pose

    sampler_kwargs = dict(
        model=model,
        image=input_image,
        RT=flat_poses,
        num_frames=num_frames,
        fps_id=fps_id,
        decoding_t=1,
        seed=seed,
        sample_num=n_samples,
        device=device,
    )
    return motionctrl_sample(**sampler_kwargs)
def main(args):
    """Build the Gradio Blocks UI, wire its callbacks and launch the server.

    Args:
        args: Dict of keyword arguments forwarded to ``launch()``.

    NOTE(review): SOURCE lost its indentation, so the nesting of the
    ``with`` blocks below is reconstructed — confirm against the original file.
    Continuation lines of the multi-line strings are kept at column 0 so the
    string contents match SOURCE exactly.
    """
    demo = gr.Blocks()
    with demo:
        # Page header.
        gr.Markdown(title)
        gr.Markdown(subtitle)
        gr.Markdown(important_link)
        gr.Markdown(description)
        with gr.Column():
            # step 0: Some useful tricks
            gr.Markdown("## Step 0/3: Some Useful Tricks", show_label=False)
            gr.HighlightedText(value=[("",""), (f"1. If the motion control is not obvious, try to increase the `Motion Speed`. \
\n 2. If the generated videos are distored severely, try to descrease the `Motion Speed` \
or increase `FPS`.", "Normal")],
                color_map={"Normal": "green", "Error": "red", "Clear clicks": "gray", "Add mask": "green", "Remove mask": "red"}, visible=True)
            # step 1 (variables are named step2_*): input an image
            step2_title = gr.Markdown("---\n## Step 1/3: Input an Image", show_label=False, visible=True)
            step2_dec = gr.Markdown(f"\n 1. Upload an Image by `Drag` or Click `Upload Image`; \
\n 2. Click `{RESIZE_MODE[0]}` or `{RESIZE_MODE[1]}` to select the image resize mode. \
You will get a processed image and go into the next step. \
\n - `{RESIZE_MODE[0]}`: Our MotionCtrl is train on image with spatial size 576x1024. Choose `{RESIZE_MODE[0]}` can get better generated video. \
\n - `{RESIZE_MODE[1]}`: Choose `{RESIZE_MODE[1]}` if you want to generate video with the same spatial ratio as the input image.",
                show_label=False, visible=True)
            with gr.Row(equal_height=True):
                with gr.Column(scale=2):
                    input_image = gr.Image(type="pil", interactive=True, elem_id="input_image", elem_classes='image', visible=True)
                    # process_input_image_button = gr.Button(value="Process Input Image", visible=False)
                    with gr.Row():
                        # The button labels double as the resize_mode argument of process_input_image.
                        center_crop_botton = gr.Button(value=RESIZE_MODE[0], visible=True)
                        keep_spatial_raition_botton = gr.Button(value=RESIZE_MODE[1], visible=True)
                with gr.Column(scale=2):
                    process_image = gr.Image(type="pil", interactive=False, elem_id="process_image", elem_classes='image', visible=False)
                    # step2_proceed_button = gr.Button(value="Proceed", visible=False)
            # step 2 - camera motion control (everything below starts hidden)
            step2_camera_motion = gr.Markdown("---\n## Step 2/3: Select the camera poses", show_label=False, visible=False)
            step2_camera_motion_des = gr.Markdown(f"\n - {CAMERA_MOTION_MODE[0]}: Including 8 basic camera poses, such as pan up, pan down, zoom in, and zoom out. \
\n - {CAMERA_MOTION_MODE[1]}: Complex camera poses extracted from the real videos. \
\n - {CAMERA_MOTION_MODE[2]}: You can customize complex camera poses yourself by combining or fusing two of the eight basic camera poses or input RAW RT matrix. \
\n - Click `Proceed` to go into next step",
                show_label=False, visible=False)
            camera_mode = gr.Radio(choices=CAMERA_MOTION_MODE, value=CAMERA_MOTION_MODE[0], label="Camera Motion Control Mode", interactive=True, visible=False)
            camera_info = gr.Button(value="Proceed", visible=False)
            with gr.Row():
                with gr.Column():
                    # step 2.1 - camera motion control - basic
                    basic_camera_motion = gr.Markdown("---\n### Basic Camera Poses", show_label=False, visible=False)
                    basic_camera_motion_des = gr.Markdown(f"\n 1. Click one of the basic camera poses, such as `Pan Up`; \
\n 2. Slide the `Motion speed` to get a speed value. The large the value, the fast the camera motion; \
\n 3. Click `Visualize Camera and Proceed` to visualize the camera poses and go proceed; \
\n 4. Click `Reset Camera` to reset the camera poses (If needed). ",
                        show_label=False, visible=False)
                    # step 2.2 - camera motion control - provided complex
                    complex_camera_motion = gr.Markdown("---\n### Provided Complex Camera Poses", show_label=False, visible=False)
                    complex_camera_motion_des = gr.Markdown(f"\n 1. Click one of the complex camera poses, such as `Pose_1`; \
\n 2. Click `Visualize Camera and Proceed` to visualize the camera poses and go proceed; \
\n 3. Click `Reset Camera` to reset the camera poses (If needed). ",
                        show_label=False, visible=False)
                    # step 2.3 - camera motion control - custom
                    custom_camera_motion = gr.Markdown(f"---\n### {CAMERA_MOTION_MODE[2]}", show_label=False, visible=False)
                    custom_run_status = gr.Markdown(f"\n 1. Click `{DIY_MODE[0]}`, `{DIY_MODE[1]}`, or `{DIY_MODE[2]}` \
\n - `Customized Mode 1: First A then B`: For example, click `Pan Up` and `Pan Left`, the camera will first `Pan Up` and then `Pan Left`; \
\n - `Customized Mode 2: Both A and B`: For example, click `Pan Up` and `Pan Left`, the camera will move towards the upper left corner; \
\n - `{DIY_MODE[2]}`: Input the RAW RT matrix yourselves. \
\n 2. Slide the `Motion speed` to get a speed value. The large the value, the fast the camera motion; \
\n 3. Click `Visualize Camera and Proceed` to visualize the camera poses and go proceed; \
\n 4. Click `Reset Camera` to reset the camera poses (If needed). ",
                        show_label=False, visible=False)
                    # This hint is never bound to a name, so no callback can make it visible.
                    gr.HighlightedText(value=[("",""), ("1. Select two of the basic camera poses; 2. Select Customized Mode 1 OR Customized Mode 2. 3. Visualized Camera to show the customized camera poses", "Normal")],
                        color_map={"Normal": "green", "Error": "red", "Clear clicks": "gray", "Add mask": "green", "Remove mask": "red"}, visible=False)
                    with gr.Row():
                        combine1 = gr.Button(value=DIY_MODE[0], visible=False)
                        combine2 = gr.Button(value=DIY_MODE[1], visible=False)
                        combine3 = gr.Button(value=DIY_MODE[2], visible=False)
                    with gr.Row():
                        combine3_des = gr.Markdown(f"---\n#### Input your camera pose in the following textbox. \
A total of 14 lines and each line contains 12 float number, indicated \
the RT matrix in the shape of 1x12. \
The example is RT matrix of ZOOM IN.", show_label=False, visible=False)
                    # Eight basic camera motions; each label is passed to add_camera_motion.
                    with gr.Row():
                        U = gr.Button(value="Pan Up", visible=False)
                        D = gr.Button(value="Pan Down", visible=False)
                        L = gr.Button(value="Pan Left", visible=False)
                        R = gr.Button(value="Pan Right", visible=False)
                    with gr.Row():
                        I = gr.Button(value="Zoom In", visible=False)
                        O = gr.Button(value="Zoom Out", visible=False)
                        ACW = gr.Button(value="ACW", visible=False)
                        CW = gr.Button(value="CW", visible=False)
                    with gr.Row():
                        speed = gr.Slider(minimum=0, maximum=8, step=0.2, label="Motion Speed", value=1.0, visible=False)
                    # Eight provided complex poses; each label is passed to add_complex_camera_motion.
                    with gr.Row():
                        Pose_1 = gr.Button(value="Pose_1", visible=False)
                        Pose_2 = gr.Button(value="Pose_2", visible=False)
                        Pose_3 = gr.Button(value="Pose_3", visible=False)
                        Pose_4 = gr.Button(value="Pose_4", visible=False)
                    with gr.Row():
                        Pose_5 = gr.Button(value="Pose_5", visible=False)
                        Pose_6 = gr.Button(value="Pose_6", visible=False)
                        Pose_7 = gr.Button(value="Pose_7", visible=False)
                        Pose_8 = gr.Button(value="Pose_8", visible=False)
                    with gr.Row():
                        # Shows the camera summary, or the raw pose rows in Customized Mode 3.
                        camera_args = gr.Textbox(value="Camera Type", label="Camera Type", visible=False)
                    with gr.Row():
                        camera_vis= gr.Button(value="Visualize Camera and Proceed", visible=False)
                        camera_reset = gr.Button(value="Reset Camera", visible=False)
                with gr.Column():
                    # Local name shadows the imported vis_camera() inside main() only.
                    vis_camera = gr.Plot(fig, label='Camera Poses', visible=False)
            # step 3 - Generate videos
            with gr.Row():
                with gr.Column():
                    step3_prompt_generate = gr.Markdown("---\n## Step 3/3: Generate videos", show_label=False, visible=False)
                    generation_dec = gr.Markdown(f"\n 1. Set `FPS`.; \
\n 2. Set `n_samples`; \
\n 3. Set `seed`; \
\n 4. Click `Start generation !` to generate videos; ", visible=False)
                    # prompt = gr.Textbox(value="a dog sitting on grass", label="Prompt", interactive=True, visible=False)
                    # Despite its name, `prompt` is the FPS slider; it is passed to model_run as fps_id.
                    prompt = gr.Slider(minimum=5, maximum=30, step=1, label="FPS", value=10, visible=False)
                    n_samples = gr.Number(value=1, precision=0, interactive=True, label="n_samples", visible=False)
                    seed = gr.Number(value=1234, precision=0, interactive=True, label="Seed", visible=False)
                    start = gr.Button(value="Start generation !", visible=False)
                with gr.Column():
                    gen_video = gr.Video(value=None, label="Generate Video", visible=False)
                    repeat_highlight=gr.HighlightedText(value=[("",""), (f"1. If the motion control is not obvious, try to increase the `Motion Speed`. \
\n 2. If the generated videos are distored severely, try to descrease the `Motion Speed` \
or increase `FPS`.", "Normal")],
                        color_map={"Normal": "green", "Error": "red", "Clear clicks": "gray", "Add mask": "green", "Remove mask": "red"}, visible=False)
        # Both resize buttons share process_input_image; the order of `outputs`
        # must match the order of the tuple that function returns.
        center_crop_botton.click(
            fn=process_input_image,
            inputs=[input_image, center_crop_botton],
            outputs=[
                process_image,
                step2_camera_motion,
                step2_camera_motion_des,
                camera_mode,
                camera_info,
                basic_camera_motion,
                basic_camera_motion_des,
                custom_camera_motion,
                custom_run_status,
                complex_camera_motion,
                complex_camera_motion_des,
                U, D, L, R,
                I, O, ACW, CW,
                combine1, combine2, combine3, combine3_des,
                speed,
                Pose_1, Pose_2, Pose_3, Pose_4,
                Pose_5, Pose_6, Pose_7, Pose_8,
                camera_args,
                camera_reset, camera_vis,
                vis_camera,
                step3_prompt_generate,
                generation_dec,
                prompt,
                n_samples,
                seed, start, gen_video, repeat_highlight])
        keep_spatial_raition_botton.click(
            fn=process_input_image,
            inputs=[input_image, keep_spatial_raition_botton],
            outputs=[
                process_image,
                step2_camera_motion,
                step2_camera_motion_des,
                camera_mode,
                camera_info,
                basic_camera_motion,
                basic_camera_motion_des,
                custom_camera_motion,
                custom_run_status,
                complex_camera_motion,
                complex_camera_motion_des,
                U, D, L, R,
                I, O, ACW, CW,
                combine1, combine2, combine3, combine3_des,
                speed,
                Pose_1, Pose_2, Pose_3, Pose_4,
                Pose_5, Pose_6, Pose_7, Pose_8,
                camera_args,
                camera_reset, camera_vis,
                vis_camera,
                step3_prompt_generate,
                generation_dec,
                prompt,
                n_samples,
                seed, start, gen_video, repeat_highlight])
        # "Proceed": show the widgets belonging to the selected camera mode.
        camera_info.click(
            fn=visualized_camera_poses,
            inputs=[camera_mode],
            outputs=[basic_camera_motion,
                basic_camera_motion_des,
                custom_camera_motion,
                custom_run_status,
                complex_camera_motion,
                complex_camera_motion_des,
                U, D, L, R,
                I, O, ACW, CW,
                combine1, combine2, combine3, combine3_des,
                speed,
                Pose_1, Pose_2, Pose_3, Pose_4,
                Pose_5, Pose_6, Pose_7, Pose_8,
                camera_args,
                camera_reset, camera_vis,
                vis_camera,
                step3_prompt_generate, generation_dec, prompt, n_samples, seed, start, gen_video, repeat_highlight],
        )
        # Basic motions: the clicked button is its own input, so its label is the motion name.
        U.click(fn=add_camera_motion, inputs=[U, camera_mode], outputs=camera_args)
        D.click(fn=add_camera_motion, inputs=[D, camera_mode], outputs=camera_args)
        L.click(fn=add_camera_motion, inputs=[L, camera_mode], outputs=camera_args)
        R.click(fn=add_camera_motion, inputs=[R, camera_mode], outputs=camera_args)
        I.click(fn=add_camera_motion, inputs=[I, camera_mode], outputs=camera_args)
        O.click(fn=add_camera_motion, inputs=[O, camera_mode], outputs=camera_args)
        ACW.click(fn=add_camera_motion, inputs=[ACW, camera_mode], outputs=camera_args)
        CW.click(fn=add_camera_motion, inputs=[CW, camera_mode], outputs=camera_args)
        speed.change(fn=change_camera_speed, inputs=speed, outputs=camera_args)
        camera_reset.click(fn=reset_camera, inputs=None, outputs=[camera_args])
        # Customized modes 1 and 2 re-show the motion buttons; mode 3 swaps in raw-pose entry.
        combine1.click(fn=change_camera_mode,
            inputs=[combine1, camera_mode],
            outputs=[camera_args,
                U, D, L, R,
                I, O, ACW, CW, speed,
                combine3_des])
        combine2.click(fn=change_camera_mode,
            inputs=[combine2, camera_mode],
            outputs=[camera_args,
                U, D, L, R,
                I, O, ACW, CW,
                speed,
                combine3_des])
        combine3.click(fn=input_raw_camera_pose,
            inputs=[combine3, camera_mode],
            outputs=[camera_args,
                U, D, L, R,
                I, O, ACW, CW,
                speed,
                combine3_des])
        # Plot the poses and reveal the Step 3 widgets.
        camera_vis.click(fn=fn_vis_camera, inputs=[camera_args],
            outputs=[vis_camera,
                step3_prompt_generate,
                generation_dec,
                prompt,
                n_samples,
                seed,
                start,
                gen_video,
                repeat_highlight])
        Pose_1.click(fn=add_complex_camera_motion, inputs=Pose_1, outputs=camera_args)
        Pose_2.click(fn=add_complex_camera_motion, inputs=Pose_2, outputs=camera_args)
        Pose_3.click(fn=add_complex_camera_motion, inputs=Pose_3, outputs=camera_args)
        Pose_4.click(fn=add_complex_camera_motion, inputs=Pose_4, outputs=camera_args)
        Pose_5.click(fn=add_complex_camera_motion, inputs=Pose_5, outputs=camera_args)
        Pose_6.click(fn=add_complex_camera_motion, inputs=Pose_6, outputs=camera_args)
        Pose_7.click(fn=add_complex_camera_motion, inputs=Pose_7, outputs=camera_args)
        Pose_8.click(fn=add_complex_camera_motion, inputs=Pose_8, outputs=camera_args)
        start.click(fn=model_run,
            inputs=[process_image, prompt, seed, n_samples, camera_args],
            outputs=gen_video)
        # set example
        gr.Markdown("## Examples")
        # Example images are the PNG files under assets/demo/images next to this file.
        examples = glob(os.path.join(os.path.dirname(__file__), "./assets/demo/images", "*.png"))
        gr.Examples(
            examples=examples,
            inputs=[input_image],
            examples_per_page=15
        )
        gr.Markdown(article)
    # demo.launch(server_name='0.0.0.0', share=False, server_port=args['server_port'])
    # demo.queue(concurrency_count=1, max_size=10)
    # demo.launch()
    # Queue with max_size=10, then start the server with the caller's launch options.
    demo.queue(max_size=10).launch(**args)
if __name__ == "__main__":
    # Command-line options are translated into keyword arguments for launch().
    cli = argparse.ArgumentParser()
    # cli.add_argument("--port", type=int, default=12345)
    cli.add_argument(
        '--listen',
        type=str,
        # Listen on all interfaces only when SPACE_ID is present in the environment.
        default='0.0.0.0' if 'SPACE_ID' in os.environ else '127.0.0.1',
        help='IP to listen on for connections to Gradio',
    )
    cli.add_argument('--username', type=str, default='', help='Username for authentication')
    cli.add_argument('--password', type=str, default='', help='Password for authentication')
    cli.add_argument(
        '--server_port',
        type=int,
        default=0,
        help='Port to run the server listener on',
    )
    cli.add_argument('--inbrowser', action='store_true', help='Open in browser')
    cli.add_argument('--share', action='store_true', help='Share the gradio UI')
    args = cli.parse_args()

    launch_kwargs = {'server_name': args.listen}
    # Authentication is enabled only when both credentials are supplied.
    if args.username and args.password:
        launch_kwargs['auth'] = (args.username, args.password)
    # Remaining options are forwarded only when set to a truthy value.
    for option in ('server_port', 'inbrowser', 'share'):
        chosen = getattr(args, option)
        if chosen:
            launch_kwargs[option] = chosen

    main(launch_kwargs)