Commit afd7574 (1 parent: 0cbd26d)

rearrange the spatial layout; add crop to input video

Files changed:
- FateZero/video_diffusion/data/dataset.py  +21 -2
- app_fatezero.py  +94 -46
- inference_fatezero.py  +30 -1
FateZero/video_diffusion/data/dataset.py
CHANGED

@@ -4,6 +4,8 @@ import numpy as np
 from PIL import Image
 from einops import rearrange
 from pathlib import Path
+import imageio
+import cv2
 
 import torch
 from torch.utils.data import Dataset

@@ -149,10 +151,27 @@ class ImageSequenceDataset(Dataset):
         frame_start = index
         return (frame_start + i for i in range(self.n_sample_frame))
 
-    @staticmethod
-    def get_image_list(path):
+    # @staticmethod
+    def get_image_list(self, path):
         images = []
+        if path[-4:] == '.mp4':
+            path = self.mp4_to_png(path)
+            self.path = path
+
         for file in sorted(os.listdir(path)):
             if file.endswith(IMAGE_EXTENSION):
                 images.append(file)
         return images
+
+    # @staticmethod
+    def mp4_to_png(self, video_source=None):
+        reader = imageio.get_reader(video_source)
+        os.makedirs(video_source[:-4], exist_ok=True)
+
+        for i, im in enumerate(reader):
+            # use :05d to add zero, no space before the 05d
+            # if (i+1)%10 == 0:
+            path = os.path.join(video_source[:-4], f"{i:05d}.png")
+            # print(path)
+            cv2.imwrite(path, im[:, :, ::-1])
+        return video_source[:-4]
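Illustrative sketch (not part of the diff): the new mp4_to_png helper saves each frame with cv2.imwrite(path, im[:, :, ::-1]). A minimal, self-contained example of why the channel axis is reversed, assuming imageio's default RGB frame layout:

import numpy as np

# imageio.get_reader yields frames as RGB uint8 arrays, while cv2.imwrite
# interprets the last axis as BGR, so the helper flips the channel axis
# before writing each frame to disk.
rgb_frame = np.zeros((4, 4, 3), dtype=np.uint8)
rgb_frame[..., 0] = 255                           # a pure-red RGB frame
bgr_frame = rgb_frame[:, :, ::-1]                 # what gets passed to cv2.imwrite
assert bgr_frame[0, 0].tolist() == [0, 0, 255]    # red now sits in the BGR "R" slot

The extracted frames land in a directory named after the video (video_source with the ".mp4" suffix stripped), which get_image_list then scans exactly like a pre-extracted frame folder.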
app_fatezero.py
CHANGED

@@ -36,8 +36,59 @@ with gr.Blocks(css='style.css') as demo:
 
     with gr.Row():
         with gr.Column():
-            with gr.
-
+            with gr.Accordion('Input Video', open=True):
+                user_input_video = gr.File(label='Input Source Video')
+                with gr.Accordion('Temporal Crop offset and Sampling Stride', open=False):
+                    n_sample_frame = gr.Slider(label='Number of Frames in Video',
+                                               # info='We test 8 frames in our paper',
+                                               minimum=0,
+                                               maximum=32,
+                                               step=1,
+                                               value=8)
+                    stride = gr.Slider(label='Temporal sampling stride in Video',
+                                       minimum=0,
+                                       maximum=20,
+                                       step=1,
+                                       value=1)
+                    start_sample_frame = gr.Number(label='Start frame in the video',
+                                                   value=0,
+                                                   precision=0)
+
+                with gr.Accordion('Spatial Crop offset', open=False):
+                    left_crop = gr.Number(label='Left crop',
+                                          value=0,
+                                          precision=0)
+                    right_crop = gr.Number(label='Right crop',
+                                           value=0,
+                                           precision=0)
+                    top_crop = gr.Number(label='Top crop',
+                                         value=0,
+                                         precision=0)
+                    bottom_crop = gr.Number(label='Bottom crop',
+                                            value=0,
+                                            precision=0)
+                offset_list = [
+                    left_crop,
+                    right_crop,
+                    top_crop,
+                    bottom_crop,
+                ]
+
+                ImageSequenceDataset_list = [
+                    start_sample_frame,
+                    n_sample_frame,
+                    stride
+                ] + offset_list
+
+
+                data_path = gr.Dropdown(
+                    label='provided data path',
+                    choices=[
+                        'FateZero/data/teaser_car-turn',
+                        'FateZero/data/style/sunflower',
+                        # add shape editing ckpt here
+                    ],
+                    value='FateZero/data/teaser_car-turn')
             model_id = gr.Dropdown(
                 label='Model ID',
                 choices=[

@@ -55,54 +106,21 @@ with gr.Blocks(css='style.css') as demo:
             # prompt_used_for_training = gr.Text(
             #     label='Training prompt', interactive=False)
 
-            data_path = gr.Dropdown(
-                label='data path',
-                choices=[
-                    'FateZero/data/teaser_car-turn',
-                    'FateZero/data/style/sunflower',
-                    # add shape editing ckpt here
-                ],
-                value='FateZero/data/teaser_car-turn')
 
 
+            with gr.Accordion('Text Prompt', open=True):
 
-
-
-
-
-
-
-
-
-
-
-            cross_replace_steps = gr.Slider(label='cross-attention replace steps',
-                                            info='More steps, replace more cross attention to preserve semantic layout.',
-                                            minimum=0.0,
-                                            maximum=1.0,
-                                            step=0.1,
-                                            value=0.7)
-
-            self_replace_steps = gr.Slider(label='self-attention replace steps',
-                                           info='More steps, replace more spatial-temporal self-attention to preserve geometry and motion.',
-                                           minimum=0.0,
-                                           maximum=1.0,
-                                           step=0.1,
-                                           value=0.7)
-
-            enhance_words = gr.Textbox(label='words to be enhanced',
-                                       info='Amplify the target-words cross attention',
-                                       max_lines=1,
-                                       placeholder='Example: "watercolor "',
-                                       value='watercolor')
+                source_prompt = gr.Textbox(label='Source Prompt',
+                                           info='A good prompt describes each frame and most objects in video. Especially, it has the object or attribute that we want to edit or preserve.',
+                                           max_lines=1,
+                                           placeholder='Example: "a silver jeep driving down a curvy road in the countryside"',
+                                           value='a silver jeep driving down a curvy road in the countryside')
+                target_prompt = gr.Textbox(label='Target Prompt',
+                                           info='A reasonable composition of video may achieve better results (e.g., "sunflower" video with "Van Gogh" prompt is better than "sunflower" with "Monet")',
+                                           max_lines=1,
+                                           placeholder='Example: "watercolor painting of a silver jeep driving down a curvy road in the countryside"',
+                                           value='watercolor painting of a silver jeep driving down a curvy road in the countryside')
 
-            enhance_words_value = gr.Slider(label='Amplify the target cross-attention',
-                                            info='larger value, more elements of target words',
-                                            minimum=0.0,
-                                            maximum=20.0,
-                                            step=1,
-                                            value=10)
 
 
         with gr.Accordion('DDIM Parameters', open=True):

@@ -129,6 +147,34 @@ with gr.Blocks(css='style.css') as demo:
             ''')
         with gr.Column():
            result = gr.Video(label='Result')
+            result.style(height=512, width=512)
+            with gr.Accordion('FateZero Parameters for attention fusing', open=True):
+                cross_replace_steps = gr.Slider(label='cross-attention replace steps',
+                                                info='More steps, replace more cross attention to preserve semantic layout.',
+                                                minimum=0.0,
+                                                maximum=1.0,
+                                                step=0.1,
+                                                value=0.7)
+
+                self_replace_steps = gr.Slider(label='self-attention replace steps',
+                                               info='More steps, replace more spatial-temporal self-attention to preserve geometry and motion.',
+                                               minimum=0.0,
+                                               maximum=1.0,
+                                               step=0.1,
+                                               value=0.7)
+
+                enhance_words = gr.Textbox(label='words to be enhanced',
+                                           info='Amplify the target-words cross attention',
+                                           max_lines=1,
+                                           placeholder='Example: "watercolor "',
+                                           value='watercolor')
+
+                enhance_words_value = gr.Slider(label='Amplify the target cross-attention',
+                                                info='larger value, more elements of target words',
+                                                minimum=0.0,
+                                                maximum=20.0,
+                                                step=1,
+                                                value=10)
     with gr.Row():
         examples = [
             [

@@ -190,6 +236,8 @@ with gr.Blocks(css='style.css') as demo:
         enhance_words_value,
         num_steps,
         guidance_scale,
+        user_input_video,
+        *ImageSequenceDataset_list
    ]
    # prompt.submit(fn=pipe.run, inputs=inputs, outputs=result)
    target_prompt.submit(fn=merge_config_then_run, inputs=inputs, outputs=result)
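Illustrative sketch (not part of the diff): the new components are appended to the Gradio inputs list as user_input_video followed by *ImageSequenceDataset_list, and Gradio hands component values to the callback positionally, so the callback's parameter order has to mirror that list. A minimal, self-contained sketch of this wiring under that assumption; run_stub is a hypothetical stand-in for merge_config_then_run, and the component definitions simply echo the diff:

import gradio as gr

# Hypothetical stand-in for merge_config_then_run: its parameter order must
# mirror the `inputs` list built below (guidance_scale, then user_input_video,
# then the unpacked ImageSequenceDataset_list).
def run_stub(guidance_scale, user_input_video,
             start_sample_frame, n_sample_frame, stride,
             left_crop, right_crop, top_crop, bottom_crop):
    return (f"{n_sample_frame} frames from frame {start_sample_frame}, "
            f"stride {stride}, crop LRTB=({left_crop},{right_crop},{top_crop},{bottom_crop})")

with gr.Blocks() as demo:
    guidance_scale = gr.Slider(0, 20, value=7.5, label='guidance scale')
    user_input_video = gr.File(label='Input Source Video')
    start_sample_frame = gr.Number(value=0, precision=0, label='Start frame in the video')
    n_sample_frame = gr.Slider(0, 32, value=8, step=1, label='Number of Frames in Video')
    stride = gr.Slider(0, 20, value=1, step=1, label='Temporal sampling stride in Video')
    left_crop = gr.Number(value=0, precision=0, label='Left crop')
    right_crop = gr.Number(value=0, precision=0, label='Right crop')
    top_crop = gr.Number(value=0, precision=0, label='Top crop')
    bottom_crop = gr.Number(value=0, precision=0, label='Bottom crop')
    result = gr.Textbox(label='Result')
    run_button = gr.Button('Run')

    # Same ordering as the commit: temporal window first, then the crop offsets.
    offset_list = [left_crop, right_crop, top_crop, bottom_crop]
    ImageSequenceDataset_list = [start_sample_frame, n_sample_frame, stride] + offset_list

    run_button.click(fn=run_stub,
                     inputs=[guidance_scale, user_input_video, *ImageSequenceDataset_list],
                     outputs=result)

# demo.launch()  # uncomment to serve the sketch locally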
inference_fatezero.py
CHANGED

@@ -2,6 +2,7 @@
 from FateZero.test_fatezero import *
 
 import copy
+import gradio as gr
 
 
 def merge_config_then_run(

@@ -14,7 +15,17 @@ def merge_config_then_run(
     enhance_words,
     enhance_words_value,
     num_steps,
-    guidance_scale
+    guidance_scale,
+    user_input_video,
+
+    # Temporal and spatial crop of the video
+    start_sample_frame,
+    n_sample_frame,
+    stride,
+    left_crop,
+    right_crop,
+    top_crop,
+    bottom_crop,
 ):
     # , ] = inputs
     default_edit_config='FateZero/config/low_resource_teaser/jeep_watercolor_ddim_10_steps.yaml'

@@ -26,6 +37,24 @@ def merge_config_then_run(
     # config_now['pretrained_model_path'] = model_id
     config_now['train_dataset']['prompt'] = source_prompt
     config_now['train_dataset']['path'] = data_path
+    # ImageSequenceDataset_dict = { }
+    offset_dict = {
+        "left": left_crop,
+        "right": right_crop,
+        "top": top_crop,
+        "bottom": bottom_crop,
+    }
+    ImageSequenceDataset_dict = {
+        "start_sample_frame": start_sample_frame,
+        "n_sample_frame": n_sample_frame,
+        "stride": stride,
+        "offset": offset_dict,
+    }
+    config_now['train_dataset'].update(ImageSequenceDataset_dict)
+    if user_input_video and data_path is None:
+        raise gr.Error('You need to upload a video or choose a provided video')
+    if user_input_video is not None and user_input_video.name is not None:
+        config_now['train_dataset']['path'] = user_input_video.name
     config_now['validation_sample_logger_config']['prompts'] = [target_prompt]
 
 
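Illustrative sketch (not part of the diff): what the new block contributes to config_now['train_dataset']. The starting dict is a hypothetical stand-in for the yaml loaded from default_edit_config; only the update logic mirrors the diff:

# Hypothetical stand-in for the config loaded from default_edit_config.
config_now = {
    'train_dataset': {
        'path': 'FateZero/data/teaser_car-turn',
        'prompt': 'a silver jeep driving down a curvy road in the countryside',
    }
}

# Values as they would arrive from the Gradio components (defaults shown).
start_sample_frame, n_sample_frame, stride = 0, 8, 1
left_crop, right_crop, top_crop, bottom_crop = 0, 0, 0, 0

offset_dict = {"left": left_crop, "right": right_crop,
               "top": top_crop, "bottom": bottom_crop}
ImageSequenceDataset_dict = {
    "start_sample_frame": start_sample_frame,
    "n_sample_frame": n_sample_frame,
    "stride": stride,
    "offset": offset_dict,
}
config_now['train_dataset'].update(ImageSequenceDataset_dict)

# train_dataset now carries the temporal window and spatial crop offsets that
# ImageSequenceDataset reads, alongside the original path and prompt; an
# uploaded video would additionally overwrite 'path' with the temp-file name.
print(config_now['train_dataset'])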