Spaces:
Running
on
Zero
Running
on
Zero
init project
Browse files- app.py +391 -114
- configs/__init__.py +0 -0
- configs/sam2_hiera_b+.yaml +113 -0
- configs/sam2_hiera_l.yaml +117 -0
- configs/sam2_hiera_s.yaml +116 -0
- configs/sam2_hiera_t.yaml +118 -0
- requirements.txt +13 -2
- utils/__init__.py +0 -0
- utils/florence.py +64 -0
- utils/sam.py +50 -0
app.py
CHANGED
|
@@ -1,142 +1,419 @@
|
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import numpy as np
|
| 3 |
import random
|
| 4 |
-
|
| 5 |
from diffusers import DiffusionPipeline
|
|
|
|
| 6 |
import torch
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
-
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 9 |
-
model_repo_id = "stabilityai/sdxl-turbo" #Replace to the model you would like to use
|
| 10 |
|
| 11 |
-
|
| 12 |
-
torch_dtype = torch.float16
|
| 13 |
-
else:
|
| 14 |
-
torch_dtype = torch.float32
|
| 15 |
|
| 16 |
-
|
| 17 |
-
pipe = pipe.to(device)
|
| 18 |
|
| 19 |
MAX_SEED = np.iinfo(np.int32).max
|
| 20 |
-
|
| 21 |
|
| 22 |
-
|
| 23 |
-
|
|
|
|
|
|
|
| 24 |
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
css="""
|
| 49 |
-
#col-container {
|
| 50 |
-
margin: 0 auto;
|
| 51 |
-
max-width: 640px;
|
| 52 |
-
}
|
| 53 |
-
"""
|
| 54 |
-
|
| 55 |
-
with gr.Blocks(css=css) as demo:
|
| 56 |
|
| 57 |
-
|
| 58 |
-
gr.
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
with gr.Row():
|
| 63 |
-
|
| 64 |
-
prompt = gr.Text(
|
| 65 |
-
label="Prompt",
|
| 66 |
-
show_label=False,
|
| 67 |
-
max_lines=1,
|
| 68 |
-
placeholder="Enter your prompt",
|
| 69 |
-
container=False,
|
| 70 |
-
)
|
| 71 |
-
|
| 72 |
-
run_button = gr.Button("Run", scale=0)
|
| 73 |
-
|
| 74 |
-
result = gr.Image(label="Result", show_label=False)
|
| 75 |
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
|
| 93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
step=32,
|
| 102 |
-
value=1024, #Replace with defaults that work for your model
|
| 103 |
)
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
label="
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
)
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
step=0.1,
|
| 120 |
-
value=0.
|
| 121 |
)
|
| 122 |
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
step=1,
|
| 128 |
-
value=
|
| 129 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
inputs = [prompt]
|
| 134 |
-
)
|
| 135 |
-
gr.on(
|
| 136 |
-
triggers=[run_button.click, prompt.submit],
|
| 137 |
-
fn = infer,
|
| 138 |
-
inputs = [prompt, negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps],
|
| 139 |
-
outputs = [result, seed]
|
| 140 |
-
)
|
| 141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
demo.queue().launch()
|
|
|
|
| 1 |
+
import copy
import json
import os
import random
import time
from datetime import datetime
from io import BytesIO
from typing import Tuple, Optional

# NOTE(review): `spaces` is kept ahead of torch/diffusers, as in the original
# import order; ZeroGPU presumably needs to be imported before CUDA is touched.
import spaces
import boto3
import gradio as gr
import numpy as np
from diffusers import DiffusionPipeline
from diffusers import FluxInpaintPipeline
from diffusers import AutoencoderKL, AutoencoderTiny
from diffusers.utils import load_image
import torch
from huggingface_hub import login
from huggingface_hub import hf_hub_download, HfFileSystem, ModelCard, snapshot_download
from PIL import Image, ImageFilter

from utils.florence import load_florence_model, run_florence_inference, \
    FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
from utils.sam import load_sam_image_model, run_sam_inference
|
| 25 |
+
|
| 26 |
|
|
|
|
|
|
|
| 27 |
|
| 28 |
+
# Hugging Face access token, read from the environment (None when unset).
HF_TOKEN = os.environ.get("HF_TOKEN")

# Authenticate only when a token is configured: with token=None, login() falls
# back to prompting the user, which cannot be answered in a headless deployment.
if HF_TOKEN:
    login(token=HF_TOKEN)
else:
    print("HF_TOKEN is not set; skipping Hugging Face login")

# Largest seed value offered by the UI / used for random seeds and file names.
MAX_SEED = np.iinfo(np.int32).max
# Default length (in pixels) of the longer image side fed to the pipeline.
IMAGE_SIZE = 1024

# init
dtype = torch.bfloat16
device = "cuda" if torch.cuda.is_available() else "cpu"
base_model = "black-forest-labs/FLUX.1-dev"

taef1 = AutoencoderTiny.from_pretrained("madebyollin/taef1", torch_dtype=dtype).to(device)
# NOTE(review): good_vae is not referenced anywhere else in the visible code;
# it is kept so the module's public names are unchanged, but it does occupy
# device memory -- confirm whether it is still needed.
good_vae = AutoencoderKL.from_pretrained(base_model, subfolder="vae", torch_dtype=dtype).to(device)
# The inpainting pipeline is built with the tiny autoencoder (taef1) as its VAE.
pipe = FluxInpaintPipeline.from_pretrained(base_model, torch_dtype=dtype, vae=taef1).to(device)
|
| 43 |
+
|
| 44 |
+
class calculateDuration:
    """Context manager that prints how long the wrapped block took.

    After the block exits, ``start_time``, ``end_time`` and ``elapsed_time``
    (all in seconds, taken from ``time.time()``) are available on the instance.
    """

    def __init__(self, activity_name=""):
        # Optional label included in the printed message.
        self.activity_name = activity_name

    def __enter__(self):
        self.start_time = time.time()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.end_time = time.time()
        self.elapsed_time = self.end_time - self.start_time

        # Same two message shapes as before: with or without the activity label.
        if self.activity_name:
            prefix = f"Elapsed time for {self.activity_name}"
        else:
            prefix = "Elapsed time"
        print(f"{prefix}: {self.elapsed_time:.6f} seconds")
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def calculate_image_dimensions_for_flux(
    original_resolution_wh: Tuple[int, int],
    maximum_dimension: int = IMAGE_SIZE
) -> Tuple[int, int]:
    """Scale a resolution so its longer side equals ``maximum_dimension``.

    Both sides are scaled by the same factor, truncated to integers and then
    rounded down to a multiple of 32.

    Args:
        original_resolution_wh (Tuple[int, int]): Original ``(width, height)``.
        maximum_dimension (int): Target length of the longer side.

    Returns:
        Tuple[int, int]: The new ``(width, height)``, each a multiple of 32
        and never smaller than 32.
    """
    width, height = original_resolution_wh

    # Scale relative to the longer side so the aspect ratio is preserved.
    scaling_factor = maximum_dimension / max(width, height)

    def _snap_to_32(side: int) -> int:
        scaled = int(side * scaling_factor)
        snapped = scaled - (scaled % 32)
        # A very elongated image can scale its short side below 32 px, which
        # would otherwise snap to 0 and make the later resize fail.
        return max(32, snapped)

    return _snap_to_32(width), _snap_to_32(height)
|
| 79 |
+
|
| 80 |
+
def is_mask_empty(image: Optional[Image.Image]) -> bool:
    """Return True when the mask contains no non-zero pixel.

    Args:
        image (Optional[Image.Image]): The mask to inspect. ``None`` is
            treated as an empty mask.

    Returns:
        bool: True if there is no mask or every pixel is 0 after conversion
        to 8-bit grayscale.
    """
    if image is None:
        return True
    # getbbox() returns None when the image has no non-zero region, so there is
    # no need to copy every pixel into a Python list.
    return image.convert("L").getbbox() is None
|
| 84 |
+
|
| 85 |
+
def process_mask(
    mask: Image.Image,
    mask_inflation: Optional[int] = None,
    mask_blur: Optional[int] = None
) -> Image.Image:
    """
    Inflates and blurs the white regions of a mask.

    Args:
        mask (Image.Image): The input mask image.
        mask_inflation (Optional[int]): The number of pixels to inflate the mask by.
            Skipped when None or not positive.
        mask_blur (Optional[int]): The radius of the Gaussian blur to apply.
            Skipped when None or not positive.

    Returns:
        Image.Image: The processed mask with inflated and/or blurred regions.
    """
    if mask_inflation and mask_inflation > 0:
        # The file never imports OpenCV at module level, so import it here;
        # without this the dilation below raises NameError.
        import cv2

        mask_array = np.array(mask)
        # Square all-ones kernel: one dilation pass grows the white regions.
        kernel = np.ones((mask_inflation, mask_inflation), np.uint8)
        mask_array = cv2.dilate(mask_array, kernel, iterations=1)
        mask = Image.fromarray(mask_array)

    if mask_blur and mask_blur > 0:
        mask = mask.filter(ImageFilter.GaussianBlur(radius=mask_blur))

    return mask
|
| 109 |
+
|
| 110 |
+
def upload_image_to_r2(image, account_id, access_key, secret_key, bucket_name):
    """Upload an image as PNG to a Cloudflare R2 bucket and return its object key.

    Args:
        image: Object with a ``save(fileobj, format)`` method (a PIL image in
            the visible caller).
        account_id: R2 account id, used to build the endpoint URL.
        access_key: Access key id for the S3-compatible API.
        secret_key: Secret access key for the S3-compatible API.
        bucket_name: Name of the target bucket.

    Returns:
        The object key of the uploaded file, of the form
        ``generated_images/<Y/m/d/HMS>_<random int>.png``.
    """
    # Never log the access key or secret key: stdout ends up in shared logs.
    print("upload_image_to_r2", account_id, bucket_name)
    connectionUrl = f"https://{account_id}.r2.cloudflarestorage.com"

    s3 = boto3.client(
        's3',
        endpoint_url=connectionUrl,
        region_name='auto',
        aws_access_key_id=access_key,
        aws_secret_access_key=secret_key
    )

    # Timestamped path plus a random suffix to keep object keys distinct.
    current_time = datetime.now().strftime("%Y/%m/%d/%H%M%S")
    image_file = f"generated_images/{current_time}_{random.randint(0, MAX_SEED)}.png"

    # Serialize in memory and rewind so the upload reads from the start.
    buffer = BytesIO()
    image.save(buffer, "PNG")
    buffer.seek(0)
    s3.upload_fileobj(buffer, bucket_name, image_file)
    print("upload finish", image_file)
    return image_file
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
@spaces.GPU(duration=50)
def run_flux(
    image: Image.Image,
    mask: Image.Image,
    prompt: str,
    lora_path: str,
    lora_weights: str,
    lora_scale: float,
    seed_slicer: int,
    randomize_seed_checkbox: bool,
    strength_slider: float,
    num_inference_steps_slider: int,
    resolution_wh: Tuple[int, int],
) -> Image.Image:
    """Run the module-level FLUX inpainting pipeline with the requested LoRA.

    Args:
        image (Image.Image): Image to inpaint.
        mask (Image.Image): Inpainting mask passed as ``mask_image``.
        prompt (str): Text prompt for the inpainted content.
        lora_path (str): LoRA repository/path; no LoRA is loaded when empty.
        lora_weights (str): Weight file name inside ``lora_path``.
        lora_scale (float): LoRA scale, forwarded via ``joint_attention_kwargs``.
        seed_slicer (int): Seed to use when the seed is not randomized.
        randomize_seed_checkbox (bool): If True, draw a random seed in
            ``[0, MAX_SEED]`` instead of using ``seed_slicer``.
        strength_slider (float): Inpainting strength.
        num_inference_steps_slider (int): Number of denoising steps.
        resolution_wh (Tuple[int, int]): Output ``(width, height)``.

    Returns:
        Image.Image: The first image produced by the pipeline.
    """
    print("Running FLUX...")

    with calculateDuration("load lora"):
        print("start to load lora", lora_path, lora_weights)
        # The pipeline is shared across requests: drop whatever LoRA a previous
        # call loaded so adapters do not pile up from one request to the next.
        pipe.unload_lora_weights()
        if lora_path:
            pipe.load_lora_weights(lora_path, weight_name=lora_weights)

    width, height = resolution_wh
    if randomize_seed_checkbox:
        seed_slicer = random.randint(0, MAX_SEED)
    generator = torch.Generator().manual_seed(seed_slicer)

    # The pipeline object is named `pipe`; the original call to `PIPE` raised
    # NameError on every request.
    return pipe(
        prompt=prompt,
        image=image,
        mask_image=mask,
        width=width,
        height=height,
        strength=strength_slider,
        generator=generator,
        num_inference_steps=num_inference_steps_slider,
        max_sequence_length=256,
        joint_attention_kwargs={"scale": lora_scale},
    ).images[0]
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
@spaces.GPU(duration=50)
def genearte_mask(image: Image.Image, masking_prompt_text: str) -> Image.Image:
    """Placeholder for text-prompted mask generation.

    Args:
        image (Image.Image): Image the mask should be generated for (unused).
        masking_prompt_text (str): Text describing the region to mask (unused).

    Returns:
        Currently always ``None``, despite the ``Image.Image`` annotation.
    """
    # generate mask by florence & sam
    # NOTE(review): not implemented yet -- neither argument is used and no
    # Florence/SAM helper is called, so callers receive None.
    print("Generating mask...")

    return
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def process(
    image_url: str,
    inpainting_prompt_text: str,
    masking_prompt_text: str,
    mask_inflation_slider: int,
    mask_blur_slider: int,
    seed_slicer: int,
    randomize_seed_checkbox: bool,
    strength_slider: float,
    num_inference_steps_slider: int,
    lora_path: str,
    lora_weights: str,
    lora_scale: float,
    upload_to_r2: bool,
    account_id: str,
    access_key: str,
    secret_key: str,
    bucket: str
):
    """Validate the request, build the mask, run inpainting and optionally upload.

    Args:
        image_url (str): URL of the image to inpaint.
        inpainting_prompt_text (str): Prompt describing the content to paint.
        masking_prompt_text (str): Prompt describing the region to mask.
        mask_inflation_slider (int): Mask dilation size passed to ``process_mask``.
        mask_blur_slider (int): Mask blur radius passed to ``process_mask``.
        seed_slicer (int): Seed used when the seed is not randomized.
        randomize_seed_checkbox (bool): Whether to draw a random seed.
        strength_slider (float): Inpainting strength.
        num_inference_steps_slider (int): Number of denoising steps.
        lora_path (str): LoRA repository/path.
        lora_weights (str): LoRA weight file name.
        lora_scale (float): LoRA scale.
        upload_to_r2 (bool): Upload the result to R2 when True.
        account_id (str): R2 account id.
        access_key (str): R2 access key.
        secret_key (str): R2 secret key.
        bucket (str): R2 bucket name.

    Returns:
        A ``(image, mask, json_text)`` tuple. On a validation failure the
        first two items are ``None`` and ``json_text`` carries
        ``{"status": "false", "message": ...}``; on success ``json_text``
        carries ``"status": "success"`` plus either the uploaded ``url`` or a
        message.
    """
    result = {"status": "false", "message": ""}

    def _fail(notice: str, message: str):
        # Show the notice in the UI and return the common failure triple.
        gr.Info(notice)
        result["message"] = message
        return None, None, json.dumps(result)

    if not image_url:
        return _fail("please enter image url for inpaiting", "invalid image url")

    if not inpainting_prompt_text:
        return _fail("Please enter inpainting text prompt.", "invalid inpainting prompt")

    if not masking_prompt_text:
        return _fail("Please enter masking_prompt_text.", "invalid masking prompt")

    # Load and validate the image *before* spending GPU time on the mask, and
    # turn a download/decoding error into the regular failure response.
    try:
        image = load_image(image_url)
    except Exception as exc:  # request boundary: report instead of crashing
        print("failed to load image", image_url, exc)
        image = None

    if image is None:
        return _fail("Please upload an image.", "can not load image")

    mask = genearte_mask(image, masking_prompt_text)

    # A missing mask (None) is handled like an all-black one.
    if mask is None or is_mask_empty(mask):
        return _fail("Please draw a mask or enter a masking prompt.", "can not generate mask")

    # generate
    width, height = calculate_image_dimensions_for_flux(original_resolution_wh=image.size)
    image = image.resize((width, height), Image.LANCZOS)
    mask = mask.resize((width, height), Image.LANCZOS)
    mask = process_mask(mask, mask_inflation=mask_inflation_slider, mask_blur=mask_blur_slider)
    image = run_flux(
        image=image,
        mask=mask,
        prompt=inpainting_prompt_text,
        lora_path=lora_path,
        lora_scale=lora_scale,
        lora_weights=lora_weights,
        seed_slicer=seed_slicer,
        randomize_seed_checkbox=randomize_seed_checkbox,
        strength_slider=strength_slider,
        num_inference_steps_slider=num_inference_steps_slider,
        resolution_wh=(width, height)
    )
    if upload_to_r2:
        url = upload_image_to_r2(image, account_id, access_key, secret_key, bucket)
        result = {"status": "success", "url": url}
    else:
        result = {"status": "success", "message": "Image generated but not uploaded"}

    return image, mask, json.dumps(result)
|
| 253 |
+
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
with gr.Blocks() as demo:
|
| 257 |
+
|
| 258 |
+
with gr.Row():
|
| 259 |
+
with gr.Column():
|
| 260 |
|
| 261 |
+
image_url = gr.Text(
|
| 262 |
+
label="Image url for inpainting",
|
| 263 |
+
show_label=False,
|
| 264 |
+
max_lines=1,
|
| 265 |
+
placeholder="Enter image url for inpainting",
|
| 266 |
+
container=False,
|
| 267 |
+
)
|
| 268 |
|
| 269 |
+
masking_prompt_text_component = gr.Text(
|
| 270 |
+
label="Masking prompt",
|
| 271 |
+
show_label=False,
|
| 272 |
+
max_lines=1,
|
| 273 |
+
placeholder="Enter text to generate masking",
|
| 274 |
+
container=False,
|
|
|
|
|
|
|
| 275 |
)
|
| 276 |
+
|
| 277 |
+
inpainting_prompt_text_component = gr.Text(
|
| 278 |
+
label="Inpainting prompt",
|
| 279 |
+
show_label=False,
|
| 280 |
+
max_lines=1,
|
| 281 |
+
placeholder="Enter text to generate inpainting",
|
| 282 |
+
container=False,
|
| 283 |
)
|
| 284 |
+
|
| 285 |
+
submit_button_component = gr.Button(value='Submit', variant='primary', scale=0)
|
| 286 |
+
|
| 287 |
+
with gr.Accordion("Lora Settings", open=True):
|
| 288 |
+
lora_path = gr.Textbox(
|
| 289 |
+
label="Lora model path",
|
| 290 |
+
show_label=True,
|
| 291 |
+
max_lines=1,
|
| 292 |
+
placeholder="Enter your model path",
|
| 293 |
+
info="Currently, only LoRA hosted on Hugging Face'model can be loaded properly.",
|
| 294 |
+
value="XLabs-AI/flux-RealismLora"
|
| 295 |
+
)
|
| 296 |
+
lora_weights = gr.Textbox(
|
| 297 |
+
label="Lora weights",
|
| 298 |
+
show_label=True,
|
| 299 |
+
max_lines=1,
|
| 300 |
+
placeholder="Enter your lora weights name",
|
| 301 |
+
value="lora.safetensors"
|
| 302 |
+
)
|
| 303 |
+
lora_scale = gr.Slider(
|
| 304 |
+
label="Lora scale",
|
| 305 |
+
show_label=True,
|
| 306 |
+
minimum=0,
|
| 307 |
+
maximum=1,
|
| 308 |
step=0.1,
|
| 309 |
+
value=0.9,
|
| 310 |
)
|
| 311 |
|
| 312 |
+
|
| 313 |
+
|
| 314 |
+
with gr.Accordion("Advanced Settings", open=False):
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
with gr.Row():
|
| 318 |
+
mask_inflation_slider_component = gr.Slider(
|
| 319 |
+
label="Mask inflation",
|
| 320 |
+
info="Adjusts the amount of mask edge expansion before "
|
| 321 |
+
"inpainting.",
|
| 322 |
+
minimum=0,
|
| 323 |
+
maximum=20,
|
| 324 |
+
step=1,
|
| 325 |
+
value=5,
|
| 326 |
+
)
|
| 327 |
+
|
| 328 |
+
mask_blur_slider_component = gr.Slider(
|
| 329 |
+
label="Mask blur",
|
| 330 |
+
info="Controls the intensity of the Gaussian blur applied to "
|
| 331 |
+
"the mask edges.",
|
| 332 |
+
minimum=0,
|
| 333 |
+
maximum=20,
|
| 334 |
+
step=1,
|
| 335 |
+
value=5,
|
| 336 |
+
)
|
| 337 |
+
|
| 338 |
+
seed_slicer_component = gr.Slider(
|
| 339 |
+
label="Seed",
|
| 340 |
+
minimum=0,
|
| 341 |
+
maximum=MAX_SEED,
|
| 342 |
step=1,
|
| 343 |
+
value=42,
|
| 344 |
)
|
| 345 |
+
|
| 346 |
+
randomize_seed_checkbox_component = gr.Checkbox(
|
| 347 |
+
label="Randomize seed", value=True)
|
| 348 |
+
|
| 349 |
+
with gr.Row():
|
| 350 |
+
|
| 351 |
+
strength_slider_component = gr.Slider(
|
| 352 |
+
label="Strength",
|
| 353 |
+
info="Indicates extent to transform the reference `image`. "
|
| 354 |
+
"Must be between 0 and 1. `image` is used as a starting "
|
| 355 |
+
"point and more noise is added the higher the `strength`.",
|
| 356 |
+
minimum=0,
|
| 357 |
+
maximum=1,
|
| 358 |
+
step=0.01,
|
| 359 |
+
value=0.85,
|
| 360 |
+
)
|
| 361 |
+
|
| 362 |
+
num_inference_steps_slider_component = gr.Slider(
|
| 363 |
+
label="Number of inference steps",
|
| 364 |
+
info="The number of denoising steps. More denoising steps "
|
| 365 |
+
"usually lead to a higher quality image at the",
|
| 366 |
+
minimum=1,
|
| 367 |
+
maximum=50,
|
| 368 |
+
step=1,
|
| 369 |
+
value=20,
|
| 370 |
+
)
|
| 371 |
+
|
| 372 |
+
upload_to_r2 = gr.Checkbox(label="Upload to R2", value=False)
|
| 373 |
+
account_id = gr.Textbox(label="Account Id", placeholder="Enter R2 account id")
|
| 374 |
+
access_key = gr.Textbox(label="Access Key", placeholder="Enter R2 access key here")
|
| 375 |
+
secret_key = gr.Textbox(label="Secret Key", placeholder="Enter R2 secret key here")
|
| 376 |
+
bucket = gr.Textbox(label="Bucket Name", placeholder="Enter R2 bucket name here")
|
| 377 |
|
| 378 |
+
|
| 379 |
+
with gr.Column():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 380 |
|
| 381 |
+
output_image_component = gr.Image(
|
| 382 |
+
type='pil', image_mode='RGB', label='Generated image', format="png")
|
| 383 |
+
|
| 384 |
+
|
| 385 |
+
|
| 386 |
+
with gr.Accordion("Debug", open=False):
|
| 387 |
+
output_mask_component = gr.Image(
|
| 388 |
+
type='pil', image_mode='RGB', label='Input mask', format="png")
|
| 389 |
+
|
| 390 |
+
output_json_component = gr.Textbox()
|
| 391 |
+
|
| 392 |
+
submit_button_component.click(
|
| 393 |
+
fn=process,
|
| 394 |
+
inputs=[
|
| 395 |
+
image_url,
|
| 396 |
+
inpainting_prompt_text_component,
|
| 397 |
+
masking_prompt_text_component,
|
| 398 |
+
mask_inflation_slider_component,
|
| 399 |
+
mask_blur_slider_component,
|
| 400 |
+
seed_slicer_component,
|
| 401 |
+
randomize_seed_checkbox_component,
|
| 402 |
+
strength_slider_component,
|
| 403 |
+
num_inference_steps_slider_component,
|
| 404 |
+
lora_path,
|
| 405 |
+
lora_weights,
|
| 406 |
+
lora_scale,
|
| 407 |
+
upload_to_r2,
|
| 408 |
+
account_id,
|
| 409 |
+
access_key,
|
| 410 |
+
secret_key,
|
| 411 |
+
bucket
|
| 412 |
+
],
|
| 413 |
+
outputs=[
|
| 414 |
+
output_image_component,
|
| 415 |
+
output_mask_component,
|
| 416 |
+
output_json_component
|
| 417 |
+
]
|
| 418 |
+
)
|
| 419 |
demo.queue().launch()
|
configs/__init__.py
ADDED
|
File without changes
|
configs/sam2_hiera_b+.yaml
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# @package _global_
|
| 2 |
+
|
| 3 |
+
# Model
|
| 4 |
+
model:
|
| 5 |
+
_target_: sam2.modeling.sam2_base.SAM2Base
|
| 6 |
+
image_encoder:
|
| 7 |
+
_target_: sam2.modeling.backbones.image_encoder.ImageEncoder
|
| 8 |
+
scalp: 1
|
| 9 |
+
trunk:
|
| 10 |
+
_target_: sam2.modeling.backbones.hieradet.Hiera
|
| 11 |
+
embed_dim: 112
|
| 12 |
+
num_heads: 2
|
| 13 |
+
neck:
|
| 14 |
+
_target_: sam2.modeling.backbones.image_encoder.FpnNeck
|
| 15 |
+
position_encoding:
|
| 16 |
+
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
|
| 17 |
+
num_pos_feats: 256
|
| 18 |
+
normalize: true
|
| 19 |
+
scale: null
|
| 20 |
+
temperature: 10000
|
| 21 |
+
d_model: 256
|
| 22 |
+
backbone_channel_list: [896, 448, 224, 112]
|
| 23 |
+
fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
|
| 24 |
+
fpn_interp_model: nearest
|
| 25 |
+
|
| 26 |
+
memory_attention:
|
| 27 |
+
_target_: sam2.modeling.memory_attention.MemoryAttention
|
| 28 |
+
d_model: 256
|
| 29 |
+
pos_enc_at_input: true
|
| 30 |
+
layer:
|
| 31 |
+
_target_: sam2.modeling.memory_attention.MemoryAttentionLayer
|
| 32 |
+
activation: relu
|
| 33 |
+
dim_feedforward: 2048
|
| 34 |
+
dropout: 0.1
|
| 35 |
+
pos_enc_at_attn: false
|
| 36 |
+
self_attention:
|
| 37 |
+
_target_: sam2.modeling.sam.transformer.RoPEAttention
|
| 38 |
+
rope_theta: 10000.0
|
| 39 |
+
feat_sizes: [32, 32]
|
| 40 |
+
embedding_dim: 256
|
| 41 |
+
num_heads: 1
|
| 42 |
+
downsample_rate: 1
|
| 43 |
+
dropout: 0.1
|
| 44 |
+
d_model: 256
|
| 45 |
+
pos_enc_at_cross_attn_keys: true
|
| 46 |
+
pos_enc_at_cross_attn_queries: false
|
| 47 |
+
cross_attention:
|
| 48 |
+
_target_: sam2.modeling.sam.transformer.RoPEAttention
|
| 49 |
+
rope_theta: 10000.0
|
| 50 |
+
feat_sizes: [32, 32]
|
| 51 |
+
rope_k_repeat: True
|
| 52 |
+
embedding_dim: 256
|
| 53 |
+
num_heads: 1
|
| 54 |
+
downsample_rate: 1
|
| 55 |
+
dropout: 0.1
|
| 56 |
+
kv_in_dim: 64
|
| 57 |
+
num_layers: 4
|
| 58 |
+
|
| 59 |
+
memory_encoder:
|
| 60 |
+
_target_: sam2.modeling.memory_encoder.MemoryEncoder
|
| 61 |
+
out_dim: 64
|
| 62 |
+
position_encoding:
|
| 63 |
+
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
|
| 64 |
+
num_pos_feats: 64
|
| 65 |
+
normalize: true
|
| 66 |
+
scale: null
|
| 67 |
+
temperature: 10000
|
| 68 |
+
mask_downsampler:
|
| 69 |
+
_target_: sam2.modeling.memory_encoder.MaskDownSampler
|
| 70 |
+
kernel_size: 3
|
| 71 |
+
stride: 2
|
| 72 |
+
padding: 1
|
| 73 |
+
fuser:
|
| 74 |
+
_target_: sam2.modeling.memory_encoder.Fuser
|
| 75 |
+
layer:
|
| 76 |
+
_target_: sam2.modeling.memory_encoder.CXBlock
|
| 77 |
+
dim: 256
|
| 78 |
+
kernel_size: 7
|
| 79 |
+
padding: 3
|
| 80 |
+
layer_scale_init_value: 1e-6
|
| 81 |
+
use_dwconv: True # depth-wise convs
|
| 82 |
+
num_layers: 2
|
| 83 |
+
|
| 84 |
+
num_maskmem: 7
|
| 85 |
+
image_size: 1024
|
| 86 |
+
# apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
|
| 87 |
+
sigmoid_scale_for_mem_enc: 20.0
|
| 88 |
+
sigmoid_bias_for_mem_enc: -10.0
|
| 89 |
+
use_mask_input_as_output_without_sam: true
|
| 90 |
+
# Memory
|
| 91 |
+
directly_add_no_mem_embed: true
|
| 92 |
+
# use high-resolution feature map in the SAM mask decoder
|
| 93 |
+
use_high_res_features_in_sam: true
|
| 94 |
+
# output 3 masks on the first click on initial conditioning frames
|
| 95 |
+
multimask_output_in_sam: true
|
| 96 |
+
# SAM heads
|
| 97 |
+
iou_prediction_use_sigmoid: True
|
| 98 |
+
# cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
|
| 99 |
+
use_obj_ptrs_in_encoder: true
|
| 100 |
+
add_tpos_enc_to_obj_ptrs: false
|
| 101 |
+
only_obj_ptrs_in_the_past_for_eval: true
|
| 102 |
+
# object occlusion prediction
|
| 103 |
+
pred_obj_scores: true
|
| 104 |
+
pred_obj_scores_mlp: true
|
| 105 |
+
fixed_no_obj_ptr: true
|
| 106 |
+
# multimask tracking settings
|
| 107 |
+
multimask_output_for_tracking: true
|
| 108 |
+
use_multimask_token_for_obj_ptr: true
|
| 109 |
+
multimask_min_pt_num: 0
|
| 110 |
+
multimask_max_pt_num: 1
|
| 111 |
+
use_mlp_for_obj_ptr_proj: true
|
| 112 |
+
# Compilation flag
|
| 113 |
+
compile_image_encoder: False
|
configs/sam2_hiera_l.yaml
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# @package _global_
|
| 2 |
+
|
| 3 |
+
# Model
|
| 4 |
+
model:
|
| 5 |
+
_target_: sam2.modeling.sam2_base.SAM2Base
|
| 6 |
+
image_encoder:
|
| 7 |
+
_target_: sam2.modeling.backbones.image_encoder.ImageEncoder
|
| 8 |
+
scalp: 1
|
| 9 |
+
trunk:
|
| 10 |
+
_target_: sam2.modeling.backbones.hieradet.Hiera
|
| 11 |
+
embed_dim: 144
|
| 12 |
+
num_heads: 2
|
| 13 |
+
stages: [2, 6, 36, 4]
|
| 14 |
+
global_att_blocks: [23, 33, 43]
|
| 15 |
+
window_pos_embed_bkg_spatial_size: [7, 7]
|
| 16 |
+
window_spec: [8, 4, 16, 8]
|
| 17 |
+
neck:
|
| 18 |
+
_target_: sam2.modeling.backbones.image_encoder.FpnNeck
|
| 19 |
+
position_encoding:
|
| 20 |
+
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
|
| 21 |
+
num_pos_feats: 256
|
| 22 |
+
normalize: true
|
| 23 |
+
scale: null
|
| 24 |
+
temperature: 10000
|
| 25 |
+
d_model: 256
|
| 26 |
+
backbone_channel_list: [1152, 576, 288, 144]
|
| 27 |
+
fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
|
| 28 |
+
fpn_interp_model: nearest
|
| 29 |
+
|
| 30 |
+
memory_attention:
|
| 31 |
+
_target_: sam2.modeling.memory_attention.MemoryAttention
|
| 32 |
+
d_model: 256
|
| 33 |
+
pos_enc_at_input: true
|
| 34 |
+
layer:
|
| 35 |
+
_target_: sam2.modeling.memory_attention.MemoryAttentionLayer
|
| 36 |
+
activation: relu
|
| 37 |
+
dim_feedforward: 2048
|
| 38 |
+
dropout: 0.1
|
| 39 |
+
pos_enc_at_attn: false
|
| 40 |
+
self_attention:
|
| 41 |
+
_target_: sam2.modeling.sam.transformer.RoPEAttention
|
| 42 |
+
rope_theta: 10000.0
|
| 43 |
+
feat_sizes: [32, 32]
|
| 44 |
+
embedding_dim: 256
|
| 45 |
+
num_heads: 1
|
| 46 |
+
downsample_rate: 1
|
| 47 |
+
dropout: 0.1
|
| 48 |
+
d_model: 256
|
| 49 |
+
pos_enc_at_cross_attn_keys: true
|
| 50 |
+
pos_enc_at_cross_attn_queries: false
|
| 51 |
+
cross_attention:
|
| 52 |
+
_target_: sam2.modeling.sam.transformer.RoPEAttention
|
| 53 |
+
rope_theta: 10000.0
|
| 54 |
+
feat_sizes: [32, 32]
|
| 55 |
+
rope_k_repeat: True
|
| 56 |
+
embedding_dim: 256
|
| 57 |
+
num_heads: 1
|
| 58 |
+
downsample_rate: 1
|
| 59 |
+
dropout: 0.1
|
| 60 |
+
kv_in_dim: 64
|
| 61 |
+
num_layers: 4
|
| 62 |
+
|
| 63 |
+
memory_encoder:
|
| 64 |
+
_target_: sam2.modeling.memory_encoder.MemoryEncoder
|
| 65 |
+
out_dim: 64
|
| 66 |
+
position_encoding:
|
| 67 |
+
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
|
| 68 |
+
num_pos_feats: 64
|
| 69 |
+
normalize: true
|
| 70 |
+
scale: null
|
| 71 |
+
temperature: 10000
|
| 72 |
+
mask_downsampler:
|
| 73 |
+
_target_: sam2.modeling.memory_encoder.MaskDownSampler
|
| 74 |
+
kernel_size: 3
|
| 75 |
+
stride: 2
|
| 76 |
+
padding: 1
|
| 77 |
+
fuser:
|
| 78 |
+
_target_: sam2.modeling.memory_encoder.Fuser
|
| 79 |
+
layer:
|
| 80 |
+
_target_: sam2.modeling.memory_encoder.CXBlock
|
| 81 |
+
dim: 256
|
| 82 |
+
kernel_size: 7
|
| 83 |
+
padding: 3
|
| 84 |
+
layer_scale_init_value: 1e-6
|
| 85 |
+
use_dwconv: True # depth-wise convs
|
| 86 |
+
num_layers: 2
|
| 87 |
+
|
| 88 |
+
num_maskmem: 7
|
| 89 |
+
image_size: 1024
|
| 90 |
+
# apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
|
| 91 |
+
sigmoid_scale_for_mem_enc: 20.0
|
| 92 |
+
sigmoid_bias_for_mem_enc: -10.0
|
| 93 |
+
use_mask_input_as_output_without_sam: true
|
| 94 |
+
# Memory
|
| 95 |
+
directly_add_no_mem_embed: true
|
| 96 |
+
# use high-resolution feature map in the SAM mask decoder
|
| 97 |
+
use_high_res_features_in_sam: true
|
| 98 |
+
# output 3 masks on the first click on initial conditioning frames
|
| 99 |
+
multimask_output_in_sam: true
|
| 100 |
+
# SAM heads
|
| 101 |
+
iou_prediction_use_sigmoid: True
|
| 102 |
+
# cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
|
| 103 |
+
use_obj_ptrs_in_encoder: true
|
| 104 |
+
add_tpos_enc_to_obj_ptrs: false
|
| 105 |
+
only_obj_ptrs_in_the_past_for_eval: true
|
| 106 |
+
# object occlusion prediction
|
| 107 |
+
pred_obj_scores: true
|
| 108 |
+
pred_obj_scores_mlp: true
|
| 109 |
+
fixed_no_obj_ptr: true
|
| 110 |
+
# multimask tracking settings
|
| 111 |
+
multimask_output_for_tracking: true
|
| 112 |
+
use_multimask_token_for_obj_ptr: true
|
| 113 |
+
multimask_min_pt_num: 0
|
| 114 |
+
multimask_max_pt_num: 1
|
| 115 |
+
use_mlp_for_obj_ptr_proj: true
|
| 116 |
+
# Compilation flag
|
| 117 |
+
compile_image_encoder: False
|
configs/sam2_hiera_s.yaml
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# @package _global_
|
| 2 |
+
|
| 3 |
+
# Model
|
| 4 |
+
model:
|
| 5 |
+
_target_: sam2.modeling.sam2_base.SAM2Base
|
| 6 |
+
image_encoder:
|
| 7 |
+
_target_: sam2.modeling.backbones.image_encoder.ImageEncoder
|
| 8 |
+
scalp: 1
|
| 9 |
+
trunk:
|
| 10 |
+
_target_: sam2.modeling.backbones.hieradet.Hiera
|
| 11 |
+
embed_dim: 96
|
| 12 |
+
num_heads: 1
|
| 13 |
+
stages: [1, 2, 11, 2]
|
| 14 |
+
global_att_blocks: [7, 10, 13]
|
| 15 |
+
window_pos_embed_bkg_spatial_size: [7, 7]
|
| 16 |
+
neck:
|
| 17 |
+
_target_: sam2.modeling.backbones.image_encoder.FpnNeck
|
| 18 |
+
position_encoding:
|
| 19 |
+
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
|
| 20 |
+
num_pos_feats: 256
|
| 21 |
+
normalize: true
|
| 22 |
+
scale: null
|
| 23 |
+
temperature: 10000
|
| 24 |
+
d_model: 256
|
| 25 |
+
backbone_channel_list: [768, 384, 192, 96]
|
| 26 |
+
fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
|
| 27 |
+
fpn_interp_model: nearest
|
| 28 |
+
|
| 29 |
+
memory_attention:
|
| 30 |
+
_target_: sam2.modeling.memory_attention.MemoryAttention
|
| 31 |
+
d_model: 256
|
| 32 |
+
pos_enc_at_input: true
|
| 33 |
+
layer:
|
| 34 |
+
_target_: sam2.modeling.memory_attention.MemoryAttentionLayer
|
| 35 |
+
activation: relu
|
| 36 |
+
dim_feedforward: 2048
|
| 37 |
+
dropout: 0.1
|
| 38 |
+
pos_enc_at_attn: false
|
| 39 |
+
self_attention:
|
| 40 |
+
_target_: sam2.modeling.sam.transformer.RoPEAttention
|
| 41 |
+
rope_theta: 10000.0
|
| 42 |
+
feat_sizes: [32, 32]
|
| 43 |
+
embedding_dim: 256
|
| 44 |
+
num_heads: 1
|
| 45 |
+
downsample_rate: 1
|
| 46 |
+
dropout: 0.1
|
| 47 |
+
d_model: 256
|
| 48 |
+
pos_enc_at_cross_attn_keys: true
|
| 49 |
+
pos_enc_at_cross_attn_queries: false
|
| 50 |
+
cross_attention:
|
| 51 |
+
_target_: sam2.modeling.sam.transformer.RoPEAttention
|
| 52 |
+
rope_theta: 10000.0
|
| 53 |
+
feat_sizes: [32, 32]
|
| 54 |
+
rope_k_repeat: True
|
| 55 |
+
embedding_dim: 256
|
| 56 |
+
num_heads: 1
|
| 57 |
+
downsample_rate: 1
|
| 58 |
+
dropout: 0.1
|
| 59 |
+
kv_in_dim: 64
|
| 60 |
+
num_layers: 4
|
| 61 |
+
|
| 62 |
+
memory_encoder:
|
| 63 |
+
_target_: sam2.modeling.memory_encoder.MemoryEncoder
|
| 64 |
+
out_dim: 64
|
| 65 |
+
position_encoding:
|
| 66 |
+
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
|
| 67 |
+
num_pos_feats: 64
|
| 68 |
+
normalize: true
|
| 69 |
+
scale: null
|
| 70 |
+
temperature: 10000
|
| 71 |
+
mask_downsampler:
|
| 72 |
+
_target_: sam2.modeling.memory_encoder.MaskDownSampler
|
| 73 |
+
kernel_size: 3
|
| 74 |
+
stride: 2
|
| 75 |
+
padding: 1
|
| 76 |
+
fuser:
|
| 77 |
+
_target_: sam2.modeling.memory_encoder.Fuser
|
| 78 |
+
layer:
|
| 79 |
+
_target_: sam2.modeling.memory_encoder.CXBlock
|
| 80 |
+
dim: 256
|
| 81 |
+
kernel_size: 7
|
| 82 |
+
padding: 3
|
| 83 |
+
layer_scale_init_value: 1e-6
|
| 84 |
+
use_dwconv: True # depth-wise convs
|
| 85 |
+
num_layers: 2
|
| 86 |
+
|
| 87 |
+
num_maskmem: 7
|
| 88 |
+
image_size: 1024
|
| 89 |
+
# apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
|
| 90 |
+
sigmoid_scale_for_mem_enc: 20.0
|
| 91 |
+
sigmoid_bias_for_mem_enc: -10.0
|
| 92 |
+
use_mask_input_as_output_without_sam: true
|
| 93 |
+
# Memory
|
| 94 |
+
directly_add_no_mem_embed: true
|
| 95 |
+
# use high-resolution feature map in the SAM mask decoder
|
| 96 |
+
use_high_res_features_in_sam: true
|
| 97 |
+
# output 3 masks on the first click on initial conditioning frames
|
| 98 |
+
multimask_output_in_sam: true
|
| 99 |
+
# SAM heads
|
| 100 |
+
iou_prediction_use_sigmoid: True
|
| 101 |
+
# cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
|
| 102 |
+
use_obj_ptrs_in_encoder: true
|
| 103 |
+
add_tpos_enc_to_obj_ptrs: false
|
| 104 |
+
only_obj_ptrs_in_the_past_for_eval: true
|
| 105 |
+
# object occlusion prediction
|
| 106 |
+
pred_obj_scores: true
|
| 107 |
+
pred_obj_scores_mlp: true
|
| 108 |
+
fixed_no_obj_ptr: true
|
| 109 |
+
# multimask tracking settings
|
| 110 |
+
multimask_output_for_tracking: true
|
| 111 |
+
use_multimask_token_for_obj_ptr: true
|
| 112 |
+
multimask_min_pt_num: 0
|
| 113 |
+
multimask_max_pt_num: 1
|
| 114 |
+
use_mlp_for_obj_ptr_proj: true
|
| 115 |
+
# Compilation flag
|
| 116 |
+
compile_image_encoder: False
|
configs/sam2_hiera_t.yaml
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# @package _global_
|
| 2 |
+
|
| 3 |
+
# Model
|
| 4 |
+
model:
|
| 5 |
+
_target_: sam2.modeling.sam2_base.SAM2Base
|
| 6 |
+
image_encoder:
|
| 7 |
+
_target_: sam2.modeling.backbones.image_encoder.ImageEncoder
|
| 8 |
+
scalp: 1
|
| 9 |
+
trunk:
|
| 10 |
+
_target_: sam2.modeling.backbones.hieradet.Hiera
|
| 11 |
+
embed_dim: 96
|
| 12 |
+
num_heads: 1
|
| 13 |
+
stages: [1, 2, 7, 2]
|
| 14 |
+
global_att_blocks: [5, 7, 9]
|
| 15 |
+
window_pos_embed_bkg_spatial_size: [7, 7]
|
| 16 |
+
neck:
|
| 17 |
+
_target_: sam2.modeling.backbones.image_encoder.FpnNeck
|
| 18 |
+
position_encoding:
|
| 19 |
+
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
|
| 20 |
+
num_pos_feats: 256
|
| 21 |
+
normalize: true
|
| 22 |
+
scale: null
|
| 23 |
+
temperature: 10000
|
| 24 |
+
d_model: 256
|
| 25 |
+
backbone_channel_list: [768, 384, 192, 96]
|
| 26 |
+
fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
|
| 27 |
+
fpn_interp_model: nearest
|
| 28 |
+
|
| 29 |
+
memory_attention:
|
| 30 |
+
_target_: sam2.modeling.memory_attention.MemoryAttention
|
| 31 |
+
d_model: 256
|
| 32 |
+
pos_enc_at_input: true
|
| 33 |
+
layer:
|
| 34 |
+
_target_: sam2.modeling.memory_attention.MemoryAttentionLayer
|
| 35 |
+
activation: relu
|
| 36 |
+
dim_feedforward: 2048
|
| 37 |
+
dropout: 0.1
|
| 38 |
+
pos_enc_at_attn: false
|
| 39 |
+
self_attention:
|
| 40 |
+
_target_: sam2.modeling.sam.transformer.RoPEAttention
|
| 41 |
+
rope_theta: 10000.0
|
| 42 |
+
feat_sizes: [32, 32]
|
| 43 |
+
embedding_dim: 256
|
| 44 |
+
num_heads: 1
|
| 45 |
+
downsample_rate: 1
|
| 46 |
+
dropout: 0.1
|
| 47 |
+
d_model: 256
|
| 48 |
+
pos_enc_at_cross_attn_keys: true
|
| 49 |
+
pos_enc_at_cross_attn_queries: false
|
| 50 |
+
cross_attention:
|
| 51 |
+
_target_: sam2.modeling.sam.transformer.RoPEAttention
|
| 52 |
+
rope_theta: 10000.0
|
| 53 |
+
feat_sizes: [32, 32]
|
| 54 |
+
rope_k_repeat: True
|
| 55 |
+
embedding_dim: 256
|
| 56 |
+
num_heads: 1
|
| 57 |
+
downsample_rate: 1
|
| 58 |
+
dropout: 0.1
|
| 59 |
+
kv_in_dim: 64
|
| 60 |
+
num_layers: 4
|
| 61 |
+
|
| 62 |
+
memory_encoder:
|
| 63 |
+
_target_: sam2.modeling.memory_encoder.MemoryEncoder
|
| 64 |
+
out_dim: 64
|
| 65 |
+
position_encoding:
|
| 66 |
+
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
|
| 67 |
+
num_pos_feats: 64
|
| 68 |
+
normalize: true
|
| 69 |
+
scale: null
|
| 70 |
+
temperature: 10000
|
| 71 |
+
mask_downsampler:
|
| 72 |
+
_target_: sam2.modeling.memory_encoder.MaskDownSampler
|
| 73 |
+
kernel_size: 3
|
| 74 |
+
stride: 2
|
| 75 |
+
padding: 1
|
| 76 |
+
fuser:
|
| 77 |
+
_target_: sam2.modeling.memory_encoder.Fuser
|
| 78 |
+
layer:
|
| 79 |
+
_target_: sam2.modeling.memory_encoder.CXBlock
|
| 80 |
+
dim: 256
|
| 81 |
+
kernel_size: 7
|
| 82 |
+
padding: 3
|
| 83 |
+
layer_scale_init_value: 1e-6
|
| 84 |
+
use_dwconv: True # depth-wise convs
|
| 85 |
+
num_layers: 2
|
| 86 |
+
|
| 87 |
+
num_maskmem: 7
|
| 88 |
+
image_size: 1024
|
| 89 |
+
# apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
|
| 90 |
+
# SAM decoder
|
| 91 |
+
sigmoid_scale_for_mem_enc: 20.0
|
| 92 |
+
sigmoid_bias_for_mem_enc: -10.0
|
| 93 |
+
use_mask_input_as_output_without_sam: true
|
| 94 |
+
# Memory
|
| 95 |
+
directly_add_no_mem_embed: true
|
| 96 |
+
# use high-resolution feature map in the SAM mask decoder
|
| 97 |
+
use_high_res_features_in_sam: true
|
| 98 |
+
# output 3 masks on the first click on initial conditioning frames
|
| 99 |
+
multimask_output_in_sam: true
|
| 100 |
+
# SAM heads
|
| 101 |
+
iou_prediction_use_sigmoid: True
|
| 102 |
+
# cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
|
| 103 |
+
use_obj_ptrs_in_encoder: true
|
| 104 |
+
add_tpos_enc_to_obj_ptrs: false
|
| 105 |
+
only_obj_ptrs_in_the_past_for_eval: true
|
| 106 |
+
# object occlusion prediction
|
| 107 |
+
pred_obj_scores: true
|
| 108 |
+
pred_obj_scores_mlp: true
|
| 109 |
+
fixed_no_obj_ptr: true
|
| 110 |
+
# multimask tracking settings
|
| 111 |
+
multimask_output_for_tracking: true
|
| 112 |
+
use_multimask_token_for_obj_ptr: true
|
| 113 |
+
multimask_min_pt_num: 0
|
| 114 |
+
multimask_max_pt_num: 1
|
| 115 |
+
use_mlp_for_obj_ptr_proj: true
|
| 116 |
+
# Compilation flag
|
| 117 |
+
# HieraT does not currently support compilation, should always be set to False
|
| 118 |
+
compile_image_encoder: False
|
requirements.txt
CHANGED
|
@@ -1,6 +1,17 @@
|
|
| 1 |
accelerate
|
| 2 |
-
diffusers
|
| 3 |
invisible_watermark
|
| 4 |
torch
|
| 5 |
transformers
|
| 6 |
-
xformers
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
accelerate
|
|
|
|
| 2 |
invisible_watermark
|
| 3 |
torch
|
| 4 |
transformers
|
| 5 |
+
xformers
|
| 6 |
+
tqdm
|
| 7 |
+
einops
|
| 8 |
+
spaces
|
| 9 |
+
timm
|
| 10 |
+
samv2
|
| 11 |
+
gradio
|
| 12 |
+
supervision
|
| 13 |
+
opencv-python
|
| 14 |
+
pytest
|
| 15 |
+
requests
|
| 16 |
+
git+https://github.com/Gothos/diffusers.git@flux-inpaint
|
| 17 |
+
boto3
|
utils/__init__.py
ADDED
|
File without changes
|
utils/florence.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from typing import Union, Any, Tuple, Dict
|
| 3 |
+
from unittest.mock import patch
|
| 4 |
+
|
| 5 |
+
import torch
|
| 6 |
+
from PIL import Image
|
| 7 |
+
from transformers import AutoModelForCausalLM, AutoProcessor
|
| 8 |
+
from transformers.dynamic_module_utils import get_imports
|
| 9 |
+
|
| 10 |
+
# Hugging Face checkpoint used as the default by load_florence_model.
# Alternative checkpoint kept for reference:
# FLORENCE_CHECKPOINT = "microsoft/Florence-2-base"
FLORENCE_CHECKPOINT = "microsoft/Florence-2-large-ft"

# Florence task prompt tokens (presumably passed as `task` to
# run_florence_inference -- verify against callers).
FLORENCE_OBJECT_DETECTION_TASK = "<OD>"
FLORENCE_DETAILED_CAPTION_TASK = "<MORE_DETAILED_CAPTION>"
FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK = "<CAPTION_TO_PHRASE_GROUNDING>"
FLORENCE_OPEN_VOCABULARY_DETECTION_TASK = "<OPEN_VOCABULARY_DETECTION>"
FLORENCE_DENSE_REGION_CAPTION_TASK = "<DENSE_REGION_CAPTION>"
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def fixed_get_imports(filename: Union[str, os.PathLike]) -> list[str]:
    """Return a module's imports, dropping ``flash_attn`` for Florence-2.

    Work around for https://huggingface.co/microsoft/phi-1_5/discussions/72.

    Args:
        filename: Path of the module file whose imports are inspected.

    Returns:
        The result of ``get_imports(filename)``. When ``filename`` ends with
        ``/modeling_florence2.py`` every ``flash_attn`` entry is left out.
    """
    imports = get_imports(filename)
    if not str(filename).endswith("/modeling_florence2.py"):
        return imports
    # Filter instead of ``list.remove``: ``remove`` raises ValueError when
    # "flash_attn" is not in the list (e.g. if get_imports did not report it),
    # which would abort model loading for no good reason.
    return [name for name in imports if name != "flash_attn"]
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def load_florence_model(
    device: torch.device, checkpoint: str = FLORENCE_CHECKPOINT
) -> Tuple[Any, Any]:
    """Load a Florence checkpoint together with its processor.

    Both ``from_pretrained`` calls run while
    ``transformers.dynamic_module_utils.get_imports`` is patched with
    ``fixed_get_imports``.

    Args:
        device: Device the model is moved to.
        checkpoint: Checkpoint identifier handed to ``from_pretrained``.

    Returns:
        ``(model, processor)``; the model has been moved to ``device`` and
        switched to eval mode.
    """
    hub_kwargs = {"trust_remote_code": True}
    with patch("transformers.dynamic_module_utils.get_imports", fixed_get_imports):
        florence = AutoModelForCausalLM.from_pretrained(checkpoint, **hub_kwargs)
        florence = florence.to(device).eval()
        florence_processor = AutoProcessor.from_pretrained(checkpoint, **hub_kwargs)
    return florence, florence_processor
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def run_florence_inference(
    model: Any,
    processor: Any,
    device: torch.device,
    image: Image.Image,
    task: str,
    text: Union[str, None] = None
) -> Tuple[str, Dict]:
    """Run one Florence generation pass and post-process the result.

    Args:
        model: Object exposing ``generate`` (as returned by
            ``load_florence_model``).
        processor: Matching processor; called to build the inputs and used
            for ``batch_decode`` / ``post_process_generation``.
        device: Device the processor outputs are moved to.
        image: Input image; its ``size`` is forwarded as ``image_size``.
        task: Task prompt token, e.g. ``'<OD>'``.
        text: Optional text appended directly after ``task`` in the prompt.
            ``None`` or an empty string means the prompt is ``task`` alone.

    Returns:
        ``(generated_text, response)``: the first decoded sequence (special
        tokens kept) and the output of ``post_process_generation`` for it.
    """
    # Local import: the module header does not import logging.
    import logging

    logger = logging.getLogger(__name__)

    prompt = task + text if text else task
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
    # Diagnostics go through logging at DEBUG level instead of print(), so the
    # full input tensors are not dumped to stdout on every request.
    logger.debug("Florence inputs: %s", inputs)

    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3
    )
    generated_text = processor.batch_decode(
        generated_ids, skip_special_tokens=False)[0]
    response = processor.post_process_generation(
        generated_text, task=task, image_size=image.size)
    logger.debug("Florence output: %s %s", generated_text, response)
    return generated_text, response
|
utils/sam.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
import supervision as sv
|
| 5 |
+
import torch
|
| 6 |
+
from PIL import Image
|
| 7 |
+
from sam2.build_sam import build_sam2, build_sam2_video_predictor
|
| 8 |
+
from sam2.sam2_image_predictor import SAM2ImagePredictor
|
| 9 |
+
|
| 10 |
+
# Default SAM2 checkpoint/config pair used by the loader functions below.
SAM_CHECKPOINT = "checkpoints/sam2_hiera_large.pt"
SAM_CONFIG = "sam2_hiera_l.yaml"
# Alternative pair kept for reference:
# SAM_CHECKPOINT = "checkpoints/sam2_hiera_small.pt"
# SAM_CONFIG = "sam2_hiera_s.yaml"
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def load_sam_image_model(
    device: torch.device,
    config: str = SAM_CONFIG,
    checkpoint: str = SAM_CHECKPOINT
) -> SAM2ImagePredictor:
    """Build a SAM2 model and wrap it in a ``SAM2ImagePredictor``.

    Args:
        device: Device forwarded to ``build_sam2``.
        config: Config name forwarded to ``build_sam2``.
        checkpoint: Checkpoint path forwarded to ``build_sam2``.

    Returns:
        A ``SAM2ImagePredictor`` constructed around the built model.
    """
    return SAM2ImagePredictor(
        sam_model=build_sam2(config, checkpoint, device=device)
    )
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def load_sam_video_model(
    device: torch.device,
    config: str = SAM_CONFIG,
    checkpoint: str = SAM_CHECKPOINT
) -> Any:
    """Build the SAM2 video predictor.

    Args:
        device: Device forwarded to ``build_sam2_video_predictor``.
        config: Config name forwarded to ``build_sam2_video_predictor``.
        checkpoint: Checkpoint path forwarded to ``build_sam2_video_predictor``.

    Returns:
        Whatever ``build_sam2_video_predictor`` returns.
    """
    video_predictor = build_sam2_video_predictor(
        config, checkpoint, device=device
    )
    return video_predictor
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def run_sam_inference(
    model: Any,
    image: Image.Image,
    detections: sv.Detections
) -> sv.Detections:
    """Predict one SAM mask per detection box and attach the masks.

    Detections are ordered left to right (ascending ``x_min``) before the
    boxes are handed to the predictor, so masks come out in that order. The
    detections are reordered as a whole, which keeps ``mask[i]`` aligned with
    ``xyxy[i]`` and the other per-detection fields in the returned object.

    Args:
        model: Predictor exposing ``set_image`` and ``predict`` (as returned
            by ``load_sam_image_model``).
        image: Input image; converted to RGB before being passed to the model.
        detections: Detections whose ``xyxy`` boxes are used as box prompts.

    Returns:
        Detections sorted left to right with ``mask`` set to the boolean
        masks. This is a new object; the ``detections`` argument itself is
        not modified.
    """
    rgb = np.array(image.convert("RGB"))
    model.set_image(rgb)

    # from left to right
    # Stable argsort reproduces the order sorted(..., key=x_min) would give;
    # indexing the detections (not just the boxes) keeps every field in step
    # with the masks predicted below.
    left_to_right = np.argsort(detections.xyxy[:, 0], kind="stable")
    detections = detections[left_to_right]

    mask, _, _ = model.predict(box=detections.xyxy, multimask_output=False)

    # dirty fix; remove this later
    # A 4-D result is presumably (N, 1, H, W) since multimask_output=False --
    # TODO confirm. Squeeze only that axis so a size-1 N, H or W is never
    # collapsed by accident.
    if mask.ndim == 4:
        mask = np.squeeze(mask, axis=1)

    detections.mask = mask.astype(bool)
    return detections
|