Spaces:

NeerjaK
/

DinoV3-Patch-Similarity-Visualiser

Sleeping

App Files Files Community

NeerjaK committed on Oct 23

Commit

3789875

verified ·

1 Parent(s): 5492757

Update utils.py

Browse files

Files changed (1) hide show

utils.py +5 -10

utils.py CHANGED Viewed

@@ -45,16 +45,16 @@ def display_image(img, rows,cols):
 def get_patch_embeddings(img, ps=16, device="cuda"):
-    inputs = processor(images=img, return_tensors="pt").to(device, torch.float16)
     B, C, H, W = inputs["pixel_values"].shape
-    rows, cols = H // ps, W // ps
     with torch.no_grad():
         out = model(**inputs)
     hs = out.last_hidden_state.squeeze(0).detach().cpu().numpy()
-    # remove CLS + register tokens
     n_patches = rows * cols
     patch_embs = hs[-n_patches:, :].reshape(rows, cols, -1)
@@ -62,22 +62,18 @@ def get_patch_embeddings(img, ps=16, device="cuda"):
     X = patch_embs.reshape(-1, patch_embs.shape[-1])
     Xn = X / (np.linalg.norm(X, axis=1, keepdims=True) + 1e-8)
-    return patch_embs, Xn, rows, cols
 def compute_patch_similarity(patch_embs, patch_embs_norm, row, col):
     rows, cols, dim = patch_embs.shape
     patch_idx = row * cols + col  # flatten index
-    # cosine similarity via dot product
-    sim = patch_embs_norm @ patch_embs_norm[patch_idx]
     sim_map = sim.reshape(rows, cols)
     sim_map = (sim_map - sim_map.min()) / (sim_map.max() - sim_map.min() + 1e-8)
     return sim_map
 def overlay_similarity(img, sim_map, alpha=0.5, cmap="hot"):
-    """Draw heatmap overlay with grid and return as PIL image (for Gradio)."""
     W, H = img.size
     # Expand sim_map (14x14) to full resolution via Kronecker upsampling
@@ -87,7 +83,6 @@ def overlay_similarity(img, sim_map, alpha=0.5, cmap="hot"):
     ax.imshow(img)
     ax.imshow(sim_map_resized, cmap=cmap, alpha=alpha)
-    # Draw patch grid
     patch_w = W / sim_map.shape[1]
     patch_h = H / sim_map.shape[0]
     for i in range(1, sim_map.shape[1]):

 def get_patch_embeddings(img, ps=16, device="cuda"):
+    inputs = processor(images=img, return_tensors="pt").to(device, torch.float16) # image preprocessing (scaling, normalization, etc.)
     B, C, H, W = inputs["pixel_values"].shape
+    rows, cols = H // ps, W // ps # image of size 224x224, patch size = 16x16, hence image has 14x14 patches
     with torch.no_grad():
         out = model(**inputs)
     hs = out.last_hidden_state.squeeze(0).detach().cpu().numpy()
+    # remove CLS + any non-patch token
     n_patches = rows * cols
     patch_embs = hs[-n_patches:, :].reshape(rows, cols, -1)
     X = patch_embs.reshape(-1, patch_embs.shape[-1])
     Xn = X / (np.linalg.norm(X, axis=1, keepdims=True) + 1e-8)
+    return patch_embs, Xn, rows, cols # Xn holds the L2-normalized patch vectors, one per row
 def compute_patch_similarity(patch_embs, patch_embs_norm, row, col):
     """Return a min-max scaled similarity map for one reference patch.

     Args:
         patch_embs: Array whose first two axes are the patch grid
             (rows, cols); only its shape is read here.
         patch_embs_norm: Array with one vector per patch, flattened in
             row-major order. The vectors are expected to be unit-norm so
             that the dot product equals the cosine similarity; this
             function does not normalise them itself.
         row: Row index of the reference patch, 0 <= row < rows.
         col: Column index of the reference patch, 0 <= col < cols.

     Returns:
         A (rows, cols) array of similarities to the reference patch,
         rescaled so the minimum is 0 and the maximum is approximately 1.

     Raises:
         IndexError: If (row, col) lies outside the patch grid.
     """
     rows, cols = patch_embs.shape[:2]
     # Reject out-of-range coordinates explicitly: with flat indexing a
     # too-large col silently selects a patch in the next row, and a
     # negative col wraps around to an unrelated patch.
     if not (0 <= row < rows and 0 <= col < cols):
         raise IndexError(
             f"patch ({row}, {col}) is outside the {rows}x{cols} grid"
         )
     patch_idx = row * cols + col  # row-major flat index
     # Dot product of every patch vector with the reference vector.
     sim = patch_embs_norm @ patch_embs_norm[patch_idx]
     sim_map = sim.reshape(rows, cols)
     lo, hi = sim_map.min(), sim_map.max()
     # The epsilon keeps a constant map from dividing by zero.
     return (sim_map - lo) / (hi - lo + 1e-8)
 def overlay_similarity(img, sim_map, alpha=0.5, cmap="hot"):
     W, H = img.size
     # Expand sim_map (14x14) to full resolution via Kronecker upsampling
     ax.imshow(img)
     ax.imshow(sim_map_resized, cmap=cmap, alpha=alpha)
     patch_w = W / sim_map.shape[1]
     patch_h = H / sim_map.shape[0]
     for i in range(1, sim_map.shape[1]):