allenai
/

MolmoAct-7B-D-0812

image-text-to-text

Model card Files Files and versions

hqfang committed on Sep 2

Commit

78f8879

·

verified ·

1 Parent(s): b27b007

Update image_processing_molmoact.py

Files changed (1) hide show

image_processing_molmoact.py +4 -12

image_processing_molmoact.py CHANGED Viewed

@@ -160,12 +160,8 @@ def siglip_resize_and_pad(
     desired_output_size: Tuple[int, int],
 ) -> Tuple[np.ndarray, np.ndarray]:
     desired_output_size = _ensure_pyint_size2(desired_output_size)
-    if len(image.shape) == 3:
-        is_video = False
-        image = torch.permute(torch.from_numpy(image), [2, 0, 1])
-    else:
-        is_video = True
-        image = torch.permute(torch.from_numpy(image), [0, 3, 1, 2])
     dtype = image.dtype
     if torch.is_floating_point(image):
         in_min = 0.0
@@ -190,12 +186,8 @@ def siglip_resize_and_pad(
     resized = resized.to(torch.float32)
     resized = (resized - in_min) / (in_max - in_min)
-    if is_video:
-        resized = torch.permute(resized, [0, 2, 3, 1]).numpy()
-        image_mask = None
-    else:
-        resized = torch.permute(resized, [1, 2, 0]).numpy()
-        image_mask = np.ones_like(resized[:, :, 0], dtype=np.bool_)
     return resized, image_mask

     desired_output_size: Tuple[int, int],
 ) -> Tuple[np.ndarray, np.ndarray]:
     desired_output_size = _ensure_pyint_size2(desired_output_size)
+    # by default, image is a single image
+    image = torch.permute(torch.from_numpy(image), [2, 0, 1])
     dtype = image.dtype
     if torch.is_floating_point(image):
         in_min = 0.0
     resized = resized.to(torch.float32)
     resized = (resized - in_min) / (in_max - in_min)
+    resized = torch.permute(resized, [1, 2, 0]).numpy()
+    image_mask = np.ones_like(resized[:, :, 0], dtype=np.bool_)
     return resized, image_mask