allow inference without clicks
add a nice example with click measurements
- app.py +72 -31
- marigold_dc.py +15 -11
app.py
CHANGED
@@ -16,11 +16,8 @@
 # Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
 # More information about the method can be found at https://marigoldmonodepth.github.io
 # --------------------------------------------------------------------------
-# TODO: min 2 measurements exception does not work now
 # TODO: 16bit depth map download
 # TODO: change to gradio-dualvision (update it with the Examples thumbs first)
-# TODO: good examples where measurements help
-# TODO: examples for measurements with points saved
 
 import os
 import PIL
@@ -42,7 +39,7 @@ DEFAULT_denoise_steps = 10
 DEFAULT_lr_latent = 0.05
 DEFAULT_lr_scale_shift = 0.005
 
-
+TILE_CHAR = "██"
 TAB10_COLORS = [
     (31, 119, 180),  # blue
     (255, 127, 14),  # orange
@@ -72,15 +69,15 @@ def get_wrapped_color(index):
     return adjust_brightness(base_color, factor)
 
 
-def on_click(img: Image.Image, state_orig_img: gr.State, evt: gr.SelectData, table):
+def process_click_data(img: Image.Image, state_orig_img: gr.State, table, x: int, y: int, value: str = ""):
     if isinstance(img, str):
         img = Image.open(img)
     if state_orig_img is None:
         state_orig_img = img.copy()
-
+    if isinstance(table, pandas.DataFrame):
+        table = table.values.tolist()
     color = get_wrapped_color(len(table))
     color_hex = '#%02x%02x%02x' % color
-    tile_char = "██"
 
     img = img.convert("RGB")
     draw = ImageDraw.Draw(img)
@@ -89,7 +86,15 @@ def on_click(img: Image.Image, state_orig_img: gr.State, evt: gr.SelectData, table):
     draw.ellipse((x - r, y - r, x + r, y + r), fill=color, outline=color)
     draw.ellipse((x - r, y - r, x + r, y + r), fill=None, outline=(255, 255, 255), width=max(1, r//4))
 
-
+    if not isinstance(table, list):
+        table = table.values.tolist()
+    table = table + [[TILE_CHAR, value, x, y, color_hex]]
+    return img, state_orig_img, table
+
+
+def on_click(img: Image.Image, state_orig_img: gr.State, evt: gr.SelectData, table):
+    x, y = evt.index
+    img, state_orig_img, table = process_click_data(img, state_orig_img, table, x, y)
     return img, state_orig_img, gr.Dataframe(table, visible=True)
 
 
@@ -203,9 +208,15 @@ def process(
             sparse_depth[~sparse_depth_valid_mask] = 0
             kernel_size = 10
         else:
-
+            sparse_depth = None
+            sparse_depth_min = 0
+            sparse_depth_max = 1
+            kernel_size = 5
     else:
-
+        sparse_depth = None
+        sparse_depth_min = 0
+        sparse_depth_max = 1
+        kernel_size = 5
 
     width, height = image.size
     max_dim = max(width, height)
@@ -230,17 +241,23 @@ def process(
             dry_run=DRY_RUN,
         )
     ):
-        min_both =
-        max_both =
+        min_both = pred.min().item()
+        max_both = pred.max().item()
+        if sparse_depth is not None:
+            min_both = min(sparse_depth_min, min_both)
+            max_both = min(sparse_depth_max, max_both)
         metrics.append(rmse)
        steps.append(step)
 
         vis_pred = pipe.image_processor.visualize_depth(pred, val_min=min_both, val_max=max_both)[0]
 
-
-
-
-
+        if sparse_depth is not None:
+            vis_sparse = pipe.image_processor.visualize_depth(sparse_depth, val_min=min_both, val_max=max_both)[0]
+            vis_sparse = np.array(vis_sparse)
+            vis_sparse[sparse_depth <= 0] = (0, 0, 0)
+            vis_sparse = dilate_rgb_image(vis_sparse, kernel_size=kernel_size)
+        else:
+            vis_sparse = np.full_like(vis_pred, 0)
         vis_sparse = Image.fromarray(vis_sparse)
 
         plot = generate_rmse_plot(steps, metrics, denoise_steps)
@@ -366,7 +383,7 @@ with gr.Blocks(
             visible=False,
         )
         input_image = gr.Image(
-            label="Input image",
+            label="Input image (click to enter depth)",
            type="filepath",
            interactive=True,
        )
@@ -501,30 +518,52 @@ with gr.Blocks(
         )
 
         def examples_depth_lidar_fn(path_thumb):
-            real_url = lambda fname: f"https://huggingface.co/spaces/
+            real_url = lambda fname: f"https://huggingface.co/spaces/obukhovai/marigold-dc-metric/resolve/main/files/{fname}"
             l_thumb = os.path.basename(path_thumb)
             d_thumb = os.path.dirname(path_thumb)
-            l_image, l_sparse = {
-                "
-
-
-
-                "
+            l_image, l_sparse, clicks = {
+                "thumb_matterhorn_clicks.jpg": ["matterhorn.png", None, [
+                    [TILE_CHAR, "2", 495, 1573, '#%02x%02x%02x' % get_wrapped_color(0)],
+                    [TILE_CHAR, "3", 1062, 1550, '#%02x%02x%02x' % get_wrapped_color(1)],
+                ]],
+                "thumb_kitti_1.jpg": ["kitti_1.png", "kitti_1.npy", []],
+                "thumb_kitti_2.jpg": ["kitti_2.png", "kitti_2.npy", []],
+                "thumb_teaser_10.jpg": ["teaser.png", "teaser_10.npy", []],
+                "thumb_teaser_100.jpg": ["teaser.png", "teaser_100.npy", []],
+                "thumb_teaser_1000.jpg": ["teaser.png", "teaser_1000.npy", []],
             }[l_thumb]
-
+
+            u_image = real_url(l_image)
             l_down_image = os.path.join(d_thumb, l_image)
-
-
-
+            response = requests.get(u_image)
+            response.raise_for_status()
+            with open(l_down_image, "wb") as f:
+                f.write(response.content)
+
+            table_visible = len(clicks) > 0
+            l_down_sparse = None
+            if l_sparse is not None:
+                u_sparse = real_url(l_sparse)
+                l_down_sparse = os.path.join(d_thumb, l_sparse)
+                response = requests.get(u_sparse)
                 response.raise_for_status()
-            with open(
+                with open(l_down_sparse, "wb") as f:
                    f.write(response.content)
-
-
+
+            state_orig_img = None
+            table = []
+            if len(clicks) > 0:
+                for click in clicks:
+                    _, value, x, y, _ = click
+                    l_down_image, state_orig_img, table = process_click_data(l_down_image, state_orig_img, table, x, y, value)
+
+            for outputs in process(l_down_image, state_orig_img, clicks, l_down_sparse):
+                yield l_down_image, l_down_sparse, state_orig_img, gr.Dataframe(table, visible=table_visible), *outputs
 
         examples = gr.Examples(
             fn=examples_depth_lidar_fn,
             examples=[
+                "files/thumb_matterhorn_clicks.jpg",
                 "files/thumb_kitti_1.jpg",
                 "files/thumb_kitti_2.jpg",
                 "files/thumb_teaser_10.jpg",
@@ -537,6 +576,8 @@ with gr.Blocks(
             outputs=[
                 input_image,
                 input_sparse,
+                state_orig_img,
+                table,
                 output_slider,
                 plot,
             ],
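
For orientation, the saved-clicks example stores each measurement as a row [TILE_CHAR, value, x, y, color_hex] and replays those rows through process_click_data before running inference. Below is a minimal standalone sketch of that replay idea using only Pillow; the draw_click helper, the marker radius, and the local image path are illustrative assumptions, not code from the Space.

```python
from PIL import Image, ImageDraw

TILE_CHAR = "██"

def draw_click(img, x, y, color, r=12):
    # Filled dot with a white ring, mirroring how the app marks a measurement.
    d = ImageDraw.Draw(img)
    d.ellipse((x - r, y - r, x + r, y + r), fill=color, outline=color)
    d.ellipse((x - r, y - r, x + r, y + r), fill=None, outline=(255, 255, 255), width=max(1, r // 4))
    return img

# Saved measurements in the example's row layout: [tile, value, x, y, hex color].
clicks = [
    [TILE_CHAR, "2", 495, 1573, "#1f77b4"],
    [TILE_CHAR, "3", 1062, 1550, "#ff7f0e"],
]

img = Image.open("matterhorn.png").convert("RGB")  # hypothetical local copy of the example image
table = []
for _, value, x, y, color_hex in clicks:
    rgb = tuple(int(color_hex[i:i + 2], 16) for i in (1, 3, 5))
    img = draw_click(img, x, y, rgb)
    table.append([TILE_CHAR, value, x, y, color_hex])
print(table)  # rows the UI would show in the measurements Dataframe
```

Each replayed row both marks the click on the preview image and fills the measurements table, which is why the example can ship with points already saved.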
marigold_dc.py
CHANGED
@@ -80,6 +80,8 @@ class MarigoldDepthCompletionPipeline(MarigoldDepthPipeline):
         sparse_depth = sparse_depth.to(device)
         sparse_mask = sparse_depth > 0
         sparse_depth = sparse_depth[sparse_mask]
+        sparse_depth_min = sparse_depth.min() if sparse_depth.numel() > 0 else 0
+        sparse_depth_max = sparse_depth.max() if sparse_depth.numel() > 0 else 1
 
         # Set up optimization targets
         pred_latent = torch.nn.Parameter(pred_latent, requires_grad=True)
@@ -89,14 +91,14 @@ class MarigoldDepthCompletionPipeline(MarigoldDepthPipeline):
             sparse_range = 1.0
         else:
             scale = torch.nn.Parameter(torch.ones(1, device=device), requires_grad=True)
-            sparse_range = (
+            sparse_range = (sparse_depth_max - sparse_depth_min).item()
 
         if override_shift:
             shift = np.sqrt(override_shift)
             sparse_lower = 1.0
         else:
             shift = torch.nn.Parameter(torch.ones(1, device=device), requires_grad=True)
-            sparse_lower = (
+            sparse_lower = (sparse_depth_min).item()
 
         def affine_to_metric(depth):
             return (scale**2) * sparse_range * depth + (shift**2) * sparse_lower
@@ -156,17 +158,19 @@ class MarigoldDepthCompletionPipeline(MarigoldDepthPipeline):
             # Preview the final output depth, compute loss with guidance, backprop
             pred_original_sample = step_output.pred_original_sample
             current_metric_estimate = latent_to_metric(pred_original_sample)
-            loss, rmse = loss_l1l2(current_metric_estimate[sparse_mask], sparse_depth)
-            loss.backward()
 
-            # Scale gradients up
-            with torch.no_grad():
-                pred_epsilon_norm = torch.linalg.norm(pred_epsilon).item()
-                depth_latent_grad_norm = torch.linalg.norm(pred_latent.grad).item()
-                scaling_factor = pred_epsilon_norm / max(depth_latent_grad_norm, 1e-8)
-                pred_latent.grad *= scaling_factor
+            if sparse_depth.numel() > 0:
+                loss, rmse = loss_l1l2(current_metric_estimate[sparse_mask], sparse_depth)
+                loss.backward()
+
+                # Scale gradients up
+                with torch.no_grad():
+                    pred_epsilon_norm = torch.linalg.norm(pred_epsilon).item()
+                    depth_latent_grad_norm = torch.linalg.norm(pred_latent.grad).item()
+                    scaling_factor = pred_epsilon_norm / max(depth_latent_grad_norm, 1e-8)
+                    pred_latent.grad *= scaling_factor
 
-            optimizer.step()
+                optimizer.step()
 
             with torch.no_grad():
                 pred_latent.data = self.scheduler.step(noise, t, pred_latent, generator=generator).prev_sample