Commit afd7574 (1 parent: 0cbd26d)

rearrange the spatial layout; add crop to input video

Files changed:
- FateZero/video_diffusion/data/dataset.py  +21 -2
- app_fatezero.py  +94 -46
- inference_fatezero.py  +30 -1
FateZero/video_diffusion/data/dataset.py
CHANGED

@@ -4,6 +4,8 @@ import numpy as np
 from PIL import Image
 from einops import rearrange
 from pathlib import Path
+import imageio
+import cv2
 
 import torch
 from torch.utils.data import Dataset

@@ -149,10 +151,27 @@ class ImageSequenceDataset(Dataset):
         frame_start = index
         return (frame_start + i for i in range(self.n_sample_frame))
 
-    @staticmethod
-    def get_image_list(path):
+    # @staticmethod
+    def get_image_list(self, path):
         images = []
+        if path[-4:] == '.mp4':
+            path = self.mp4_to_png(path)
+            self.path = path
+
         for file in sorted(os.listdir(path)):
             if file.endswith(IMAGE_EXTENSION):
                 images.append(file)
         return images
+
+    # @staticmethod
+    def mp4_to_png(self, video_source=None):
+        reader = imageio.get_reader(video_source)
+        os.makedirs(video_source[:-4], exist_ok=True)
+
+        for i, im in enumerate(reader):
+            # use :05d to add zero, no space before the 05d
+            # if (i+1)%10 == 0:
+            path = os.path.join(video_source[:-4], f"{i:05d}.png")
+            # print(path)
+            cv2.imwrite(path, im[:, :, ::-1])
+        return video_source[:-4]
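Illustrative sketch (not part of the diff): the new mp4_to_png helper saves each frame with cv2.imwrite(path, im[:, :, ::-1]). A minimal, self-contained example of why the channel axis is reversed, assuming imageio's default RGB frame layout:

import numpy as np

# imageio.get_reader yields frames as RGB uint8 arrays, while cv2.imwrite
# interprets the last axis as BGR, so the helper flips the channel axis
# before writing each frame to disk.
rgb_frame = np.zeros((4, 4, 3), dtype=np.uint8)
rgb_frame[..., 0] = 255                           # a pure-red RGB frame
bgr_frame = rgb_frame[:, :, ::-1]                 # what gets passed to cv2.imwrite
assert bgr_frame[0, 0].tolist() == [0, 0, 255]    # red now sits in the BGR "R" slot

The extracted frames land in a directory named after the video (video_source with the ".mp4" suffix stripped), which get_image_list then scans exactly like a pre-extracted frame folder.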
app_fatezero.py
CHANGED

@@ -36,8 +36,59 @@ with gr.Blocks(css='style.css') as demo:
 
     with gr.Row():
         with gr.Column():
-            with gr.
-
+            with gr.Accordion('Input Video', open=True):
+                user_input_video = gr.File(label='Input Source Video')
+                with gr.Accordion('Temporal Crop offset and Sampling Stride', open=False):
+                    n_sample_frame = gr.Slider(label='Number of Frames in Video',
+                                               # info='We test 8 frames in our paper',
+                                               minimum=0,
+                                               maximum=32,
+                                               step=1,
+                                               value=8)
+                    stride = gr.Slider(label='Temporal sampling stride in Video',
+                                       minimum=0,
+                                       maximum=20,
+                                       step=1,
+                                       value=1)
+                    start_sample_frame = gr.Number(label='Start frame in the video',
+                                                   value=0,
+                                                   precision=0)
+
+                with gr.Accordion('Spatial Crop offset', open=False):
+                    left_crop = gr.Number(label='Left crop',
+                                          value=0,
+                                          precision=0)
+                    right_crop = gr.Number(label='Right crop',
+                                           value=0,
+                                           precision=0)
+                    top_crop = gr.Number(label='Top crop',
+                                         value=0,
+                                         precision=0)
+                    bottom_crop = gr.Number(label='Bottom crop',
+                                            value=0,
+                                            precision=0)
+                offset_list = [
+                    left_crop,
+                    right_crop,
+                    top_crop,
+                    bottom_crop,
+                ]
+
+                ImageSequenceDataset_list = [
+                    start_sample_frame,
+                    n_sample_frame,
+                    stride
+                ] + offset_list
+
+
+                data_path = gr.Dropdown(
+                    label='provided data path',
+                    choices=[
+                        'FateZero/data/teaser_car-turn',
+                        'FateZero/data/style/sunflower',
+                        # add shape editing ckpt here
+                    ],
+                    value='FateZero/data/teaser_car-turn')
             model_id = gr.Dropdown(
                 label='Model ID',
                 choices=[

@@ -55,54 +106,21 @@ with gr.Blocks(css='style.css') as demo:
             # prompt_used_for_training = gr.Text(
             #     label='Training prompt', interactive=False)
 
-            data_path = gr.Dropdown(
-                label='data path',
-                choices=[
-                    'FateZero/data/teaser_car-turn',
-                    'FateZero/data/style/sunflower',
-                    # add shape editing ckpt here
-                ],
-                value='FateZero/data/teaser_car-turn')
 
 
+            with gr.Accordion('Text Prompt', open=True):
 
-
-
-
-
-
-
-
-
-
-
-            cross_replace_steps = gr.Slider(label='cross-attention replace steps',
-                                            info='More steps, replace more cross attention to preserve semantic layout.',
-                                            minimum=0.0,
-                                            maximum=1.0,
-                                            step=0.1,
-                                            value=0.7)
-
-            self_replace_steps = gr.Slider(label='self-attention replace steps',
-                                           info='More steps, replace more spatial-temporal self-attention to preserve geometry and motion.',
-                                           minimum=0.0,
-                                           maximum=1.0,
-                                           step=0.1,
-                                           value=0.7)
-
-            enhance_words = gr.Textbox(label='words to be enhanced',
-                                       info='Amplify the target-words cross attention',
-                                       max_lines=1,
-                                       placeholder='Example: "watercolor "',
-                                       value='watercolor')
+                source_prompt = gr.Textbox(label='Source Prompt',
+                                           info='A good prompt describes each frame and most objects in video. Especially, it has the object or attribute that we want to edit or preserve.',
+                                           max_lines=1,
+                                           placeholder='Example: "a silver jeep driving down a curvy road in the countryside"',
+                                           value='a silver jeep driving down a curvy road in the countryside')
+                target_prompt = gr.Textbox(label='Target Prompt',
+                                           info='A reasonable composition of video may achieve better results (e.g., "sunflower" video with "Van Gogh" prompt is better than "sunflower" with "Monet")',
+                                           max_lines=1,
+                                           placeholder='Example: "watercolor painting of a silver jeep driving down a curvy road in the countryside"',
+                                           value='watercolor painting of a silver jeep driving down a curvy road in the countryside')
 
-            enhance_words_value = gr.Slider(label='Amplify the target cross-attention',
-                                            info='larger value, more elements of target words',
-                                            minimum=0.0,
-                                            maximum=20.0,
-                                            step=1,
-                                            value=10)
 
 
         with gr.Accordion('DDIM Parameters', open=True):

@@ -129,6 +147,34 @@ with gr.Blocks(css='style.css') as demo:
             ''')
         with gr.Column():
            result = gr.Video(label='Result')
+            result.style(height=512, width=512)
+            with gr.Accordion('FateZero Parameters for attention fusing', open=True):
+                cross_replace_steps = gr.Slider(label='cross-attention replace steps',
+                                                info='More steps, replace more cross attention to preserve semantic layout.',
+                                                minimum=0.0,
+                                                maximum=1.0,
+                                                step=0.1,
+                                                value=0.7)
+
+                self_replace_steps = gr.Slider(label='self-attention replace steps',
+                                               info='More steps, replace more spatial-temporal self-attention to preserve geometry and motion.',
+                                               minimum=0.0,
+                                               maximum=1.0,
+                                               step=0.1,
+                                               value=0.7)
+
+                enhance_words = gr.Textbox(label='words to be enhanced',
+                                           info='Amplify the target-words cross attention',
+                                           max_lines=1,
+                                           placeholder='Example: "watercolor "',
+                                           value='watercolor')
+
+                enhance_words_value = gr.Slider(label='Amplify the target cross-attention',
+                                                info='larger value, more elements of target words',
+                                                minimum=0.0,
+                                                maximum=20.0,
+                                                step=1,
+                                                value=10)
     with gr.Row():
         examples = [
             [

@@ -190,6 +236,8 @@ with gr.Blocks(css='style.css') as demo:
         enhance_words_value,
         num_steps,
         guidance_scale,
+        user_input_video,
+        *ImageSequenceDataset_list
    ]
    # prompt.submit(fn=pipe.run, inputs=inputs, outputs=result)
    target_prompt.submit(fn=merge_config_then_run, inputs=inputs, outputs=result)
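Illustrative sketch (not part of the diff): the new components are appended to the Gradio inputs list as user_input_video followed by *ImageSequenceDataset_list, and Gradio hands component values to the callback positionally, so the callback's parameter order has to mirror that list. A minimal, self-contained sketch of this wiring under that assumption; run_stub is a hypothetical stand-in for merge_config_then_run, and the component definitions simply echo the diff:

import gradio as gr

# Hypothetical stand-in for merge_config_then_run: its parameter order must
# mirror the `inputs` list built below (guidance_scale, then user_input_video,
# then the unpacked ImageSequenceDataset_list).
def run_stub(guidance_scale, user_input_video,
             start_sample_frame, n_sample_frame, stride,
             left_crop, right_crop, top_crop, bottom_crop):
    return (f"{n_sample_frame} frames from frame {start_sample_frame}, "
            f"stride {stride}, crop LRTB=({left_crop},{right_crop},{top_crop},{bottom_crop})")

with gr.Blocks() as demo:
    guidance_scale = gr.Slider(0, 20, value=7.5, label='guidance scale')
    user_input_video = gr.File(label='Input Source Video')
    start_sample_frame = gr.Number(value=0, precision=0, label='Start frame in the video')
    n_sample_frame = gr.Slider(0, 32, value=8, step=1, label='Number of Frames in Video')
    stride = gr.Slider(0, 20, value=1, step=1, label='Temporal sampling stride in Video')
    left_crop = gr.Number(value=0, precision=0, label='Left crop')
    right_crop = gr.Number(value=0, precision=0, label='Right crop')
    top_crop = gr.Number(value=0, precision=0, label='Top crop')
    bottom_crop = gr.Number(value=0, precision=0, label='Bottom crop')
    result = gr.Textbox(label='Result')
    run_button = gr.Button('Run')

    # Same ordering as the commit: temporal window first, then the crop offsets.
    offset_list = [left_crop, right_crop, top_crop, bottom_crop]
    ImageSequenceDataset_list = [start_sample_frame, n_sample_frame, stride] + offset_list

    run_button.click(fn=run_stub,
                     inputs=[guidance_scale, user_input_video, *ImageSequenceDataset_list],
                     outputs=result)

# demo.launch()  # uncomment to serve the sketch locally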
inference_fatezero.py
CHANGED

@@ -2,6 +2,7 @@
 from FateZero.test_fatezero import *
 
 import copy
+import gradio as gr
 
 
 def merge_config_then_run(

@@ -14,7 +15,17 @@ def merge_config_then_run(
     enhance_words,
     enhance_words_value,
     num_steps,
-    guidance_scale
+    guidance_scale,
+    user_input_video,
+
+    # Temporal and spatial crop of the video
+    start_sample_frame,
+    n_sample_frame,
+    stride,
+    left_crop,
+    right_crop,
+    top_crop,
+    bottom_crop,
 ):
     # , ] = inputs
     default_edit_config='FateZero/config/low_resource_teaser/jeep_watercolor_ddim_10_steps.yaml'

@@ -26,6 +37,24 @@ def merge_config_then_run(
     # config_now['pretrained_model_path'] = model_id
     config_now['train_dataset']['prompt'] = source_prompt
     config_now['train_dataset']['path'] = data_path
+    # ImageSequenceDataset_dict = { }
+    offset_dict = {
+        "left": left_crop,
+        "right": right_crop,
+        "top": top_crop,
+        "bottom": bottom_crop,
+    }
+    ImageSequenceDataset_dict = {
+        "start_sample_frame": start_sample_frame,
+        "n_sample_frame": n_sample_frame,
+        "stride": stride,
+        "offset": offset_dict,
+    }
+    config_now['train_dataset'].update(ImageSequenceDataset_dict)
+    if user_input_video and data_path is None:
+        raise gr.Error('You need to upload a video or choose a provided video')
+    if user_input_video is not None and user_input_video.name is not None:
+        config_now['train_dataset']['path'] = user_input_video.name
     config_now['validation_sample_logger_config']['prompts'] = [target_prompt]
 
 
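Illustrative sketch (not part of the diff): what the new block contributes to config_now['train_dataset']. The starting dict is a hypothetical stand-in for the yaml loaded from default_edit_config; only the update logic mirrors the diff:

# Hypothetical stand-in for the config loaded from default_edit_config.
config_now = {
    'train_dataset': {
        'path': 'FateZero/data/teaser_car-turn',
        'prompt': 'a silver jeep driving down a curvy road in the countryside',
    }
}

# Values as they would arrive from the Gradio components (defaults shown).
start_sample_frame, n_sample_frame, stride = 0, 8, 1
left_crop, right_crop, top_crop, bottom_crop = 0, 0, 0, 0

offset_dict = {"left": left_crop, "right": right_crop,
               "top": top_crop, "bottom": bottom_crop}
ImageSequenceDataset_dict = {
    "start_sample_frame": start_sample_frame,
    "n_sample_frame": n_sample_frame,
    "stride": stride,
    "offset": offset_dict,
}
config_now['train_dataset'].update(ImageSequenceDataset_dict)

# train_dataset now carries the temporal window and spatial crop offsets that
# ImageSequenceDataset reads, alongside the original path and prompt; an
# uploaded video would additionally overwrite 'path' with the temp-file name.
print(config_now['train_dataset'])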