realzliu committed · Commit 96f36aa · Parent: c9049cc · "init"
Files changed:
- eval/eval_refer_seg.py +333 -0
- eval/refer.py +323 -0
- eval/transforms.py +97 -0
- eval/val_utils.py +149 -0
eval/eval_refer_seg.py (ADDED)
@@ -0,0 +1,333 @@
import argparse
import datetime
import math
import os
import random

import numpy as np
import torch
import torch.nn.functional as F
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# --- Accelerate Import ---
from accelerate import Accelerator

from model.segment_anything import SamPredictor, sam_model_registry
from dataset.refer_seg_dataset import ValDataset
from dataset.grefer_seg_dataset import grefcocoValDataset
from data.question_answer_list import QUESTION_PARTIAL
from segment_predictor import GenerativeSegmenter
from eval.utils import AverageMeter, Summary, intersectionAndUnionGPU, \
    compute_logits_from_mask, masks_sample_points


def get_chunk(ds, n, k):
    """Keep only the k-th of n contiguous chunks of a refer-seg ValDataset."""
    chunk_size = math.ceil(len(ds) / n)
    start_index = chunk_size * k
    end_index = start_index + chunk_size
    ds.refer_seg_ds["images"] = ds.refer_seg_ds["images"][start_index:end_index]
    return ds


def gget_chunk(ds, n, k):
    """Chunking variant for grefcocoValDataset, which stores its images in `loaded_images`."""
    chunk_size = math.ceil(len(ds) / n)
    start_index = chunk_size * k
    end_index = start_index + chunk_size
    ds.loaded_images = ds.loaded_images[start_index:end_index]
    return ds


class CustomDataset(Dataset):
    def __init__(self, sub_dataset):
        self.dataset = sub_dataset

    def __getitem__(self, index):
        image, masks, questions, image_path = self.dataset[index]
        image_name = os.path.basename(image_path).split(".")[0]
        # Wrap each class name in a randomly chosen question template
        questions = [random.choice(QUESTION_PARTIAL).replace("[class_name]", q) for q in questions]
        return image, masks, image_name, questions, image_path

    def __len__(self):
        return len(self.dataset)


def collate_fn(batch):
    images, masks, image_names, questions, image_paths = zip(*batch)
    return images, masks, image_names, questions, image_paths


def create_data_loader(args, sub_dataset, batch_size=1):
    assert batch_size == 1, "Batch size must be 1 for this evaluation script."
    dataset = CustomDataset(sub_dataset)
    return DataLoader(dataset, batch_size=batch_size, num_workers=4, shuffle=False, collate_fn=collate_fn)


def eval_model(args):
    # --- Initialize Accelerator ---
    accelerator = Accelerator()

    # --- Model and Predictor Initialization ---
    segmenter = GenerativeSegmenter(args.model_path, device_map=accelerator.device,
                                    min_pixels=args.min_pixels, max_pixels=args.max_pixels)

    sam_help = args.sam_path is not None
    if sam_help:
        sam = sam_model_registry["vit_h"](checkpoint=args.sam_path)
        sam = sam.to(dtype=torch.float32, device=accelerator.device)
        predictor = SamPredictor(sam)
    else:
        predictor = None

    # --- Dataset and DataLoader Initialization ---
    if accelerator.is_main_process:
        print("Loading dataset...")

    # First, load the full dataset based on the parameters
    if "grefcoco" in args.dataset_split:
        val_dataset = grefcocoValDataset(args.image_folder, args.dataset_split)
    else:
        val_dataset = ValDataset(args.image_folder, args.dataset_split)

    if accelerator.is_main_process:
        total_data_size = len(val_dataset)
        print(f"Total evaluation data volume (full dataset): {total_data_size} samples.")

    # Then, take the chunk of the dataset assigned to this job
    if "grefcoco" in args.dataset_split:
        sub_dataset = gget_chunk(val_dataset, args.num_chunks, args.chunk_idx)
    else:
        sub_dataset = get_chunk(val_dataset, args.num_chunks, args.chunk_idx)

    data_loader = create_data_loader(args, sub_dataset)
    data_loader = accelerator.prepare(data_loader)

    # --- Metric Meters Initialization ---
    intersection_meter = AverageMeter("Intersec", ":6.3f", Summary.SUM)
    union_meter = AverageMeter("Union", ":6.3f", Summary.SUM)
    acc_iou_meter = AverageMeter("gIoU", ":6.3f", Summary.SUM)

    if sam_help:
        intersection_meter_sam = AverageMeter("Intersec", ":6.3f", Summary.SUM)
        union_meter_sam = AverageMeter("Union", ":6.3f", Summary.SUM)
        acc_iou_meter_sam = AverageMeter("gIoU", ":6.3f", Summary.SUM)

    progress_bar = tqdm(data_loader, disable=not accelerator.is_main_process, total=len(data_loader))

    for batch in progress_bar:
        images, masks, image_names, questions, image_paths = batch
        image, gt_masks, image_name, prompts = images[0], masks[0], image_names[0], questions[0]
        w_ori, h_ori = image.size

        total_intersection = torch.zeros(2, device=accelerator.device)
        total_union = torch.zeros(2, device=accelerator.device)
        total_acc_iou = torch.zeros(2, device=accelerator.device)

        if sam_help:
            total_intersection_sam = torch.zeros(2, device=accelerator.device)
            total_union_sam = torch.zeros(2, device=accelerator.device)
            total_acc_iou_sam = torch.zeros(2, device=accelerator.device)

        num_masks_in_image = len(prompts)

        with torch.inference_mode():
            if sam_help:
                predictor.set_image(np.array(image))

            for i, question in enumerate(prompts):
                gt_mask = gt_masks[i].to(accelerator.device).float().contiguous()
                segmentation_masks, _ = segmenter.generate_with_segmentation(image, question)

                if segmentation_masks is None or len(segmentation_masks) == 0:
                    pred_mask = torch.zeros((h_ori, w_ori), device=accelerator.device)
                else:
                    mask = segmentation_masks[0].to(accelerator.device)
                    pred_mask = F.interpolate(mask.unsqueeze(0).unsqueeze(0).double(), size=(h_ori, w_ori),
                                              mode='nearest').squeeze()

                # Debug prints (disabled):
                # if accelerator.is_main_process:
                #     print("\n" + "=" * 20 + " DEBUG INFO (first iteration) " + "=" * 20)
                #     print(f"Image Name: {image_name}")
                #     print(f"GT Mask Shape: {gt_mask.shape}")
                #     print(f"GT Mask DType: {gt_mask.dtype}")
                #     print(f"Unique values in GT Mask: {torch.unique(gt_mask)}")
                #     print(f"Pred Mask Shape: {pred_mask.shape}")
                #     print(f"Pred Mask DType: {pred_mask.dtype}")
                #     # Count non-zero pixels to check whether the model produced an all-black mask
                #     print(f"Number of non-zero pixels in Pred Mask: {torch.count_nonzero(pred_mask)}")
                #     print("=" * 68)

                # Optionally refine the raw mask with SAM, prompted by points sampled
                # from the predicted mask plus a low-resolution logit prior.
                sam_refined_mask = torch.zeros_like(pred_mask)
                if sam_help:
                    unique_classes = torch.unique(pred_mask)
                    for class_id in unique_classes:
                        if class_id == 0:
                            continue
                        binary_mask = (pred_mask == class_id).double().cpu()
                        try:
                            logits = compute_logits_from_mask(pred_mask.cpu())
                            point_coords, point_labels = masks_sample_points(binary_mask)
                            sam_mask, _, logit = predictor.predict(point_coords=point_coords,
                                                                   point_labels=point_labels,
                                                                   mask_input=logits, multimask_output=False)
                            # Two extra self-refinement rounds, feeding SAM its own logits back
                            for _ in range(2):
                                sam_mask, _, logit = predictor.predict(point_coords=point_coords,
                                                                       point_labels=point_labels,
                                                                       mask_input=logit, multimask_output=False)
                            sam_mask = sam_mask[0].astype(np.float32)
                        except Exception as e:
                            print(f"Error: {e}")
                            sam_mask = np.zeros((h_ori, w_ori))
                        sam_refined_mask = torch.from_numpy(sam_mask).to(accelerator.device)
                        # sam_refined_mask[torch.from_numpy(sam_mask[0] > 0).to(accelerator.device)] = class_id

                intersection_i, union_i, _ = intersectionAndUnionGPU(pred_mask, gt_mask, 2, ignore_index=255)

                total_intersection += intersection_i
                total_union += union_i

                iou_per_sample = intersection_i / (union_i + 1e-5)
                iou_per_sample[union_i == 0] = 1.0
                total_acc_iou += iou_per_sample

                if sam_help:
                    intersection_sam_i, union_sam_i, _ = intersectionAndUnionGPU(sam_refined_mask, gt_mask, 2,
                                                                                 ignore_index=255)
                    total_intersection_sam += intersection_sam_i
                    total_union_sam += union_sam_i

                    iou_per_sample_sam = intersection_sam_i / (union_sam_i + 1e-5)
                    iou_per_sample_sam[union_sam_i == 0] = 1.0
                    total_acc_iou_sam += iou_per_sample_sam

                if args.save_masks and accelerator.is_main_process:
                    ds_split_sanitized = args.dataset_split.replace("|", "_")
                    model_name = os.path.basename(args.model_path.strip('/'))
                    save_path = os.path.join(args.save_file, model_name, ds_split_sanitized, "masks", image_name)
                    os.makedirs(save_path, exist_ok=True)

                    Image.fromarray(pred_mask.cpu().numpy().astype("uint8") * 255).convert('L').save(
                        os.path.join(save_path, f"{i}_pred_mask.png"))
                    if sam_help:
                        Image.fromarray(sam_refined_mask.cpu().numpy().astype("uint8") * 255).convert('L').save(
                            os.path.join(save_path, f"{i}_sam_mask.png"))
                    Image.fromarray(gt_mask.cpu().numpy().astype("uint8") * 255).convert('L').save(
                        os.path.join(save_path, f"{i}_gt_mask.png"))
                    image.save(os.path.join(save_path, f"{i}_image.png"))

        intersection_meter.update(total_intersection.cpu().numpy())
        union_meter.update(total_union.cpu().numpy())
        if sam_help:
            intersection_meter_sam.update(total_intersection_sam.cpu().numpy())
            union_meter_sam.update(total_union_sam.cpu().numpy())
        if num_masks_in_image > 0:
            # Average the per-image gIoU over its prompts before updating the meter
            total_acc_iou = total_acc_iou / num_masks_in_image
            acc_iou_meter.update(total_acc_iou.cpu().numpy(), n=num_masks_in_image)
            if sam_help:
                total_acc_iou_sam = total_acc_iou_sam / num_masks_in_image
                acc_iou_meter_sam.update(total_acc_iou_sam.cpu().numpy(), n=num_masks_in_image)

    # --- Synchronize metrics across all processes ---
    all_intersections = accelerator.gather_for_metrics(torch.from_numpy(intersection_meter.sum).to(accelerator.device))
    all_unions = accelerator.gather_for_metrics(torch.from_numpy(union_meter.sum).to(accelerator.device))
    all_giou_sum = accelerator.gather_for_metrics(torch.from_numpy(acc_iou_meter.sum).to(accelerator.device))
    all_giou_count = accelerator.gather_for_metrics(torch.tensor(acc_iou_meter.count, device=accelerator.device))

    all_intersections = all_intersections.view(-1, 2)
    all_unions = all_unions.view(-1, 2)
    all_giou_sum = all_giou_sum.view(-1, 2)
    all_giou_count = all_giou_count.view(-1, 1)

    if sam_help:
        all_intersections_sam = accelerator.gather_for_metrics(
            torch.from_numpy(intersection_meter_sam.sum).to(accelerator.device))
        all_unions_sam = accelerator.gather_for_metrics(torch.from_numpy(union_meter_sam.sum).to(accelerator.device))
        all_giou_sum_sam = accelerator.gather_for_metrics(
            torch.from_numpy(acc_iou_meter_sam.sum).to(accelerator.device))
        all_giou_count_sam = accelerator.gather_for_metrics(
            torch.tensor(acc_iou_meter_sam.count, device=accelerator.device))

        all_intersections_sam = all_intersections_sam.view(-1, 2)
        all_unions_sam = all_unions_sam.view(-1, 2)
        all_giou_sum_sam = all_giou_sum_sam.view(-1, 2)
        all_giou_count_sam = all_giou_count_sam.view(-1, 1)

    # --- Only calculate and output final results on the main process ---
    if accelerator.is_main_process:
        iou_class = torch.sum(all_intersections, dim=0) / (torch.sum(all_unions, dim=0) + 1e-5)
        ciou = iou_class[1].item()
        giou = (torch.sum(all_giou_sum, dim=0)[1] / torch.sum(all_giou_count)).item()

        if sam_help:
            iou_class_sam = torch.sum(all_intersections_sam, dim=0) / (torch.sum(all_unions_sam, dim=0) + 1e-5)
            ciou_sam = iou_class_sam[1].item()
            giou_sam = (torch.sum(all_giou_sum_sam, dim=0)[1] / torch.sum(all_giou_count_sam)).item()
        else:
            giou_sam, ciou_sam = 0.0, 0.0

        total_evaluated_images = len(sub_dataset)                 # total images evaluated
        total_evaluated_masks = torch.sum(all_giou_count).item()  # total masks/prompts evaluated

        print("\n" + "=" * 50)
        print(f"Evaluation finished for: {args.model_path}")
        print(f"Dataset: {args.dataset_split}")
        print("-" * 50)
        print(f"Total images evaluated: {total_evaluated_images}")
        print(f"Total masks/prompts evaluated: {total_evaluated_masks}")
        print("-" * 50)
        print(f"Raw Model Mask -> gIoU: {giou:.4f}, cIoU: {ciou:.4f}")
        if sam_help:
            print(f"SAM-Refined Mask -> gIoU: {giou_sam:.4f}, cIoU: {ciou_sam:.4f}")
        print("=" * 50 + "\n")

        # --- Dynamically construct output file path and write results ---
        model_name = os.path.basename(args.model_path.strip('/'))
        ds_split_sanitized = args.dataset_split.replace("|", "_")
        output_dir = os.path.join(args.save_file, model_name, ds_split_sanitized)
        os.makedirs(output_dir, exist_ok=True)
        output_filepath = os.path.join(output_dir, "evaluation_results.txt")

        current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        chunk_info = f"(chunk {args.chunk_idx + 1}/{args.num_chunks})" if args.num_chunks > 1 else ""

        header_text = f"[{current_time}] Model: {args.model_path}, Dataset: {args.dataset_split} {chunk_info}\n"
        eval_stats_text = f" - Evaluated on {total_evaluated_images} images and {total_evaluated_masks} masks.\n"
        output_text = f" - Raw Model Mask -> gIoU: {giou:.4f}, cIoU: {ciou:.4f}\n"
        if sam_help:
            output_text += f" - SAM-Refined Mask -> gIoU: {giou_sam:.4f}, cIoU: {ciou_sam:.4f}\n"

        with open(output_filepath, "a") as file:
            file.write(header_text)
            file.write(eval_stats_text)
            file.write(output_text)
            file.write("-" * 60 + "\n")

        print(f"Results appended to: {output_filepath}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", type=str, default='/raid2/DATA/text4seg/model_trained_qwen_2b/',
                        help="Path to your GenerativeSegmenter model checkpoint.")
    parser.add_argument("--sam_path", type=str, default='/efficient_sag4text/sam_vit_h_4b8939.pth',
                        help="Path to the SAM checkpoint.")
    parser.add_argument("--image_folder", type=str, default='/efficient_sag4text/seg_data/refer_seg',
                        help="Root folder for the dataset images.")
    parser.add_argument("--dataset_split", type=str, default="refcoco|unc|val",
                        help="Dataset split to evaluate on.")
    parser.add_argument("--save_file", type=str, default="output_eval_accelerated/",
                        help="Root directory to save evaluation outputs (masks and metrics).")
    parser.add_argument("--save_masks", action='store_true', help="Set this flag to save output masks and images.")
    parser.add_argument("--num_chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)  # argparse exposes this as args.chunk_idx
    parser.add_argument("--min_pixels", type=int, default=1024 * 28 * 28, help="Minimum pixels for segmentation.")
    parser.add_argument("--max_pixels", type=int, default=1024 * 28 * 28, help="Maximum pixels for segmentation.")
    args = parser.parse_args()

    eval_model(args)
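As a quick sanity check of the ceil-based chunking used by get_chunk/gget_chunk above, the sketch below reproduces the same slicing arithmetic on a plain list; chunk_slice and the toy data are illustrative only, not part of the repo:

import math

def chunk_slice(items, n, k):
    # same arithmetic as get_chunk/gget_chunk: the k-th of n contiguous chunks
    size = math.ceil(len(items) / n)
    return items[size * k : size * k + size]

data = list(range(10))
parts = [chunk_slice(data, 3, k) for k in range(3)]
print(parts)                    # [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
assert sum(parts, []) == data   # the chunks tile the dataset without overlap or gaps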
eval/refer.py (ADDED)
@@ -0,0 +1,323 @@
import itertools
import json
import os.path as osp
import pickle
import sys
import time
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import skimage.io as io
from matplotlib.collections import PatchCollection
from matplotlib.patches import Polygon, Rectangle
from pycocotools import mask


class REFER:
    def __init__(self, data_root, dataset="refcoco", splitBy="unc"):
        print("loading dataset %s into memory..." % dataset)
        self.ROOT_DIR = osp.abspath(osp.dirname(__file__))
        self.DATA_DIR = osp.join(data_root, dataset)
        if dataset in ["refcoco", "refcoco+", "refcocog"]:
            self.IMAGE_DIR = osp.join(data_root, "images/coco_2014/train2014")
        elif dataset == "refclef":
            self.IMAGE_DIR = osp.join(data_root, "images/saiapr_tc-12")
        else:
            print("No refer dataset is called [%s]" % dataset)
            sys.exit()

        self.dataset = dataset

        # load refs from data/dataset/refs(splitBy).p
        tic = time.time()

        ref_file = osp.join(self.DATA_DIR, "refs(" + splitBy + ").p")
        print("ref_file: ", ref_file)
        self.data = {}
        self.data["dataset"] = dataset
        self.data["refs"] = pickle.load(open(ref_file, "rb"))

        # load annotations from data/dataset/instances.json
        instances_file = osp.join(self.DATA_DIR, "instances.json")
        instances = json.load(open(instances_file, "r"))
        self.data["images"] = instances["images"]
        self.data["annotations"] = instances["annotations"]
        self.data["categories"] = instances["categories"]

        # create index
        self.createIndex()
        print("DONE (t=%.2fs)" % (time.time() - tic))

    def createIndex(self):
        # create mappings:
        # 1)  Refs:         {ref_id: ref}
        # 2)  Anns:         {ann_id: ann}
        # 3)  Imgs:         {image_id: image}
        # 4)  Cats:         {category_id: category_name}
        # 5)  Sents:        {sent_id: sent}
        # 6)  imgToRefs:    {image_id: refs}
        # 7)  imgToAnns:    {image_id: anns}
        # 8)  refToAnn:     {ref_id: ann}
        # 9)  annToRef:     {ann_id: ref}
        # 10) catToRefs:    {category_id: refs}
        # 11) sentToRef:    {sent_id: ref}
        # 12) sentToTokens: {sent_id: tokens}
        print("creating index...")
        # fetch info from instances
        Anns, Imgs, Cats, imgToAnns = {}, {}, {}, {}
        for ann in self.data["annotations"]:
            Anns[ann["id"]] = ann
            imgToAnns[ann["image_id"]] = imgToAnns.get(ann["image_id"], []) + [ann]
        for img in self.data["images"]:
            Imgs[img["id"]] = img
        for cat in self.data["categories"]:
            Cats[cat["id"]] = cat["name"]

        # fetch info from refs
        Refs, imgToRefs, refToAnn, annToRef, catToRefs = {}, {}, {}, {}, {}
        Sents, sentToRef, sentToTokens = {}, {}, {}
        for ref in self.data["refs"]:
            # ids
            ref_id = ref["ref_id"]
            ann_id = ref["ann_id"]
            category_id = ref["category_id"]
            image_id = ref["image_id"]

            # add mapping related to ref
            Refs[ref_id] = ref
            imgToRefs[image_id] = imgToRefs.get(image_id, []) + [ref]
            catToRefs[category_id] = catToRefs.get(category_id, []) + [ref]
            refToAnn[ref_id] = Anns[ann_id]
            annToRef[ann_id] = ref

            # add mapping of sent
            for sent in ref["sentences"]:
                Sents[sent["sent_id"]] = sent
                sentToRef[sent["sent_id"]] = ref
                sentToTokens[sent["sent_id"]] = sent["tokens"]

        # create class members
        self.Refs = Refs
        self.Anns = Anns
        self.Imgs = Imgs
        self.Cats = Cats
        self.Sents = Sents
        self.imgToRefs = imgToRefs
        self.imgToAnns = imgToAnns
        self.refToAnn = refToAnn
        self.annToRef = annToRef
        self.catToRefs = catToRefs
        self.sentToRef = sentToRef
        self.sentToTokens = sentToTokens
        print("index created.")

    def getRefIds(self, image_ids=[], cat_ids=[], ref_ids=[], split=""):
        image_ids = image_ids if type(image_ids) == list else [image_ids]
        cat_ids = cat_ids if type(cat_ids) == list else [cat_ids]
        ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]

        if len(image_ids) == len(cat_ids) == len(ref_ids) == len(split) == 0:
            refs = self.data["refs"]
        else:
            if not len(image_ids) == 0:
                # flatten the per-image ref lists into one list of refs
                lists = [self.imgToRefs[image_id] for image_id in image_ids]
                refs = list(itertools.chain.from_iterable(lists))
            else:
                refs = self.data["refs"]
            if not len(cat_ids) == 0:
                refs = [ref for ref in refs if ref["category_id"] in cat_ids]
            if not len(ref_ids) == 0:
                refs = [ref for ref in refs if ref["ref_id"] in ref_ids]
            if not len(split) == 0:
                if split in ["testA", "testB", "testC"]:
                    # we also consider testAB, testBC, ...
                    refs = [ref for ref in refs if split[-1] in ref["split"]]
                elif split in ["testAB", "testBC", "testAC"]:
                    refs = [ref for ref in refs if ref["split"] == split]  # rarely used
                elif split == "test":
                    refs = [ref for ref in refs if "test" in ref["split"]]
                elif split == "train" or split == "val":
                    refs = [ref for ref in refs if ref["split"] == split]
                else:
                    print("No such split [%s]" % split)
                    sys.exit()
        ref_ids = [ref["ref_id"] for ref in refs]
        return ref_ids

    def getAnnIds(self, image_ids=[], cat_ids=[], ref_ids=[]):
        image_ids = image_ids if type(image_ids) == list else [image_ids]
        cat_ids = cat_ids if type(cat_ids) == list else [cat_ids]
        ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]

        if len(image_ids) == len(cat_ids) == len(ref_ids) == 0:
            ann_ids = [ann["id"] for ann in self.data["annotations"]]
        else:
            if not len(image_ids) == 0:
                lists = [
                    self.imgToAnns[image_id]
                    for image_id in image_ids
                    if image_id in self.imgToAnns
                ]  # list of [anns]
                anns = list(itertools.chain.from_iterable(lists))
            else:
                anns = self.data["annotations"]
            if not len(cat_ids) == 0:
                anns = [ann for ann in anns if ann["category_id"] in cat_ids]
            ann_ids = [ann["id"] for ann in anns]
            if not len(ref_ids) == 0:
                # keep only the anns that the given refs point to
                ids = set(ann_ids).intersection(
                    set([self.Refs[ref_id]["ann_id"] for ref_id in ref_ids])
                )
                ann_ids = list(ids)
        return ann_ids

    def getImgIds(self, ref_ids=[]):
        ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]

        if not len(ref_ids) == 0:
            image_ids = list(set([self.Refs[ref_id]["image_id"] for ref_id in ref_ids]))
        else:
            image_ids = self.Imgs.keys()
        return image_ids

    def getCatIds(self):
        return self.Cats.keys()

    def loadRefs(self, ref_ids=[]):
        if type(ref_ids) == list:
            return [self.Refs[ref_id] for ref_id in ref_ids]
        elif type(ref_ids) == int:
            return [self.Refs[ref_ids]]

    def loadAnns(self, ann_ids=[]):
        if type(ann_ids) == list:
            return [self.Anns[ann_id] for ann_id in ann_ids]
        elif type(ann_ids) == int or type(ann_ids) == str:
            return [self.Anns[ann_ids]]

    def loadImgs(self, image_ids=[]):
        if type(image_ids) == list:
            return [self.Imgs[image_id] for image_id in image_ids]
        elif type(image_ids) == int:
            return [self.Imgs[image_ids]]

    def loadCats(self, cat_ids=[]):
        if type(cat_ids) == list:
            return [self.Cats[cat_id] for cat_id in cat_ids]
        elif type(cat_ids) == int:
            return [self.Cats[cat_ids]]

    def getRefBox(self, ref_id):
        ann = self.refToAnn[ref_id]
        return ann["bbox"]  # [x, y, w, h]

    def showRef(self, ref, seg_box="seg"):
        ax = plt.gca()
        # show image
        image = self.Imgs[ref["image_id"]]
        I = io.imread(osp.join(self.IMAGE_DIR, image["file_name"]))
        ax.imshow(I)
        # show referring expressions
        for sid, sent in enumerate(ref["sentences"]):
            print("%s. %s" % (sid + 1, sent["sent"]))
        # show segmentations
        if seg_box == "seg":
            ann_id = ref["ann_id"]
            ann = self.Anns[ann_id]
            polygons = []
            color = []
            c = "none"
            if type(ann["segmentation"][0]) == list:
                # polygon used for refcoco*
                for seg in ann["segmentation"]:
                    poly = np.array(seg).reshape((len(seg) // 2, 2))
                    polygons.append(Polygon(poly, closed=True, alpha=0.4))
                    color.append(c)
                p = PatchCollection(
                    polygons,
                    facecolors=color,
                    edgecolors=(1, 1, 0, 0),
                    linewidths=3,
                    alpha=1,
                )
                ax.add_collection(p)  # thick yellow polygon
                p = PatchCollection(
                    polygons,
                    facecolors=color,
                    edgecolors=(1, 0, 0, 0),
                    linewidths=1,
                    alpha=1,
                )
                ax.add_collection(p)  # thin red polygon
            else:
                # mask used for refclef
                rle = ann["segmentation"]
                m = mask.decode(rle)
                img = np.ones((m.shape[0], m.shape[1], 3))
                color_mask = np.array([2.0, 166.0, 101.0]) / 255
                for i in range(3):
                    img[:, :, i] = color_mask[i]
                ax.imshow(np.dstack((img, m * 0.5)))
        # show bounding box
        elif seg_box == "box":
            bbox = self.getRefBox(ref["ref_id"])
            box_plot = Rectangle(
                (bbox[0], bbox[1]),
                bbox[2],
                bbox[3],
                fill=False,
                edgecolor="green",
                linewidth=3,
            )
            ax.add_patch(box_plot)

    def getMask(self, ref):
        # return mask and area
        ann = self.refToAnn[ref["ref_id"]]
        image = self.Imgs[ref["image_id"]]
        if type(ann["segmentation"][0]) == list:  # polygon
            rle = mask.frPyObjects(ann["segmentation"], image["height"], image["width"])
        else:
            rle = ann["segmentation"]
        m = mask.decode(rle)
        # sometimes there are multiple binary maps (one per polygon)
        m = np.sum(m, axis=2)
        m = m.astype(np.uint8)  # convert to np.uint8
        # compute area
        area = sum(mask.area(rle))  # should be close to ann['area']
        return {"mask": m, "area": area}

    def showMask(self, ref):
        M = self.getMask(ref)
        msk = M["mask"]
        ax = plt.gca()
        ax.imshow(msk)


if __name__ == "__main__":
    # NOTE: data_root is required by __init__; "data" below is a placeholder path.
    refer = REFER(data_root="data", dataset="refcocog", splitBy="google")
    ref_ids = refer.getRefIds()
    print(len(ref_ids))

    print(len(refer.Imgs))
    print(len(refer.imgToRefs))

    ref_ids = refer.getRefIds(split="train")
    print("There are %s training referred objects." % len(ref_ids))

    for ref_id in ref_ids:
        ref = refer.loadRefs(ref_id)[0]
        if len(ref["sentences"]) < 2:
            continue

        pprint(ref)
        print("The label is %s." % refer.Cats[ref["category_id"]])
        plt.figure()
        refer.showRef(ref, seg_box="box")
        plt.show()
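For reference, a minimal usage sketch of the REFER API above; it assumes the standard refer_seg layout (refs(unc).p and instances.json under <data_root>/refcoco), and the data_root value is a placeholder:

from eval.refer import REFER

refer = REFER(data_root="seg_data/refer_seg", dataset="refcoco", splitBy="unc")  # path is illustrative
val_ids = refer.getRefIds(split="val")
ref = refer.loadRefs(val_ids[0])[0]
print([s["sent"] for s in ref["sentences"]])  # the referring expressions for this object
m = refer.getMask(ref)                        # {"mask": HxW uint8 array, "area": float}
print(m["mask"].shape, m["area"])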
eval/transforms.py (ADDED)
@@ -0,0 +1,97 @@
import numpy as np
import torch
from torch.nn import functional as F
from torchvision.transforms.functional import resize, to_pil_image  # type: ignore

from copy import deepcopy
from typing import Tuple


class ResizeLongestSide:
    """
    Resizes images so the longest side equals 'target_length', and provides
    methods for resizing coordinates and boxes accordingly. Handles both
    numpy arrays and batched torch tensors.
    """

    def __init__(self, target_length: int) -> None:
        self.target_length = target_length

    def apply_image(self, image: np.ndarray) -> np.ndarray:
        """
        Expects a numpy array with shape HxWxC in uint8 format.
        """
        target_size = self.get_preprocess_shape(image.shape[0], image.shape[1], self.target_length)
        return np.array(resize(to_pil_image(image), target_size))

    def apply_coords(self, coords: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray:
        """
        Expects a numpy array of length 2 in the final dimension. Requires the
        original image size in (H, W) format.
        """
        old_h, old_w = original_size
        new_h, new_w = self.get_preprocess_shape(
            original_size[0], original_size[1], self.target_length
        )
        coords = deepcopy(coords).astype(float)
        coords[..., 0] = coords[..., 0] * (new_w / old_w)
        coords[..., 1] = coords[..., 1] * (new_h / old_h)
        return coords

    def apply_boxes(self, boxes: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray:
        """
        Expects a numpy array of shape Bx4. Requires the original image size
        in (H, W) format.
        """
        boxes = self.apply_coords(boxes.reshape(-1, 2, 2), original_size)
        return boxes.reshape(-1, 4)

    def apply_image_torch(self, image: torch.Tensor) -> torch.Tensor:
        """
        Expects batched images with shape BxCxHxW and float format. This
        transformation may not exactly match apply_image. apply_image is
        the transformation expected by the model.
        """
        # Expects an image in BCHW format. May not exactly match apply_image.
        target_size = self.get_preprocess_shape(image.shape[2], image.shape[3], self.target_length)
        return F.interpolate(
            image, target_size, mode="bilinear", align_corners=False, antialias=True
        )

    def apply_coords_torch(
        self, coords: torch.Tensor, original_size: Tuple[int, ...]
    ) -> torch.Tensor:
        """
        Expects a torch tensor with length 2 in the last dimension. Requires the
        original image size in (H, W) format.
        """
        old_h, old_w = original_size
        new_h, new_w = self.get_preprocess_shape(
            original_size[0], original_size[1], self.target_length
        )
        coords = deepcopy(coords).to(torch.float)
        coords[..., 0] = coords[..., 0] * (new_w / old_w)
        coords[..., 1] = coords[..., 1] * (new_h / old_h)
        return coords

    def apply_boxes_torch(
        self, boxes: torch.Tensor, original_size: Tuple[int, ...]
    ) -> torch.Tensor:
        """
        Expects a torch tensor with shape Bx4. Requires the original image
        size in (H, W) format.
        """
        boxes = self.apply_coords_torch(boxes.reshape(-1, 2, 2), original_size)
        return boxes.reshape(-1, 4)

    @staticmethod
    def get_preprocess_shape(oldh: int, oldw: int, long_side_length: int) -> Tuple[int, int]:
        """
        Compute the output size given input size and target long side length.
        """
        scale = long_side_length * 1.0 / max(oldh, oldw)
        newh, neww = oldh * scale, oldw * scale
        neww = int(neww + 0.5)
        newh = int(newh + 0.5)
        return (newh, neww)
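A short, self-contained example of ResizeLongestSide in action (the 480x640 input is synthetic); with target_length=1024 the longest side is scaled to 1024 and coordinates are scaled by the same factor:

import numpy as np
from eval.transforms import ResizeLongestSide

transform = ResizeLongestSide(1024)
image = np.zeros((480, 640, 3), dtype=np.uint8)         # H x W x C, uint8
resized = transform.apply_image(image)
print(resized.shape)                                    # (768, 1024, 3): both sides scaled by 1024/640
coords = np.array([[320.0, 240.0]])                     # (x, y) in the original frame
print(transform.apply_coords(coords, image.shape[:2]))  # [[512. 384.]]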
eval/val_utils.py (ADDED)
@@ -0,0 +1,149 @@
import torch
import torch.nn.functional as F
from tqdm import tqdm
import numpy as np
from PIL import Image

from .utils import AverageMeter, Summary, intersectionAndUnionGPU, compute_logits_from_mask, masks_sample_points


def run_in_process_evaluation(model, accelerator, eval_dataloader, sam_predictor=None):
    # --- 1. Initialize all metric recorders ---
    intersection_meter = AverageMeter("Intersec", ":6.3f", Summary.SUM)
    union_meter = AverageMeter("Union", ":6.3f", Summary.SUM)
    acc_iou_meter = AverageMeter("gIoU", ":6.3f", Summary.SUM)

    use_sam = sam_predictor is not None
    if use_sam:
        intersection_meter_sam = AverageMeter("Intersec", ":6.3f", Summary.SUM)
        union_meter_sam = AverageMeter("Union", ":6.3f", Summary.SUM)
        acc_iou_meter_sam = AverageMeter("gIoU", ":6.3f", Summary.SUM)

    progress_bar = tqdm(eval_dataloader, disable=not accelerator.is_main_process, desc="Running Evaluation")

    # --- 2. Iterate over the evaluation dataset ---
    for batch in progress_bar:
        images, masks, image_names, questions, image_paths = batch
        image, gt_masks, image_name, prompts = images[0], masks[0], image_names[0], questions[0]
        w_ori, h_ori = image.size

        total_intersection = torch.zeros(2, device=accelerator.device)
        total_union = torch.zeros(2, device=accelerator.device)
        total_acc_iou = torch.zeros(2, device=accelerator.device)

        if use_sam:
            total_intersection_sam = torch.zeros(2, device=accelerator.device)
            total_union_sam = torch.zeros(2, device=accelerator.device)
            total_acc_iou_sam = torch.zeros(2, device=accelerator.device)

        num_masks_in_image = len(prompts)
        if num_masks_in_image == 0:
            continue

        with torch.inference_mode():
            if use_sam:
                predictor = sam_predictor
                predictor.set_image(np.array(image))

            for i, question in enumerate(prompts):
                gt_mask = gt_masks[i].to(accelerator.device)

                # --- Key difference from the standalone script: inference uses the
                # passed-in (in-training) model object directly ---
                segmentation_masks, _ = model.generate_with_segmentation(image, question)

                if segmentation_masks is None or len(segmentation_masks) == 0:
                    pred_mask = torch.zeros((h_ori, w_ori), device=accelerator.device)
                else:
                    mask = segmentation_masks[0].to(accelerator.device)
                    pred_mask = F.interpolate(mask.unsqueeze(0).unsqueeze(0).double(), size=(h_ori, w_ori),
                                              mode='nearest').squeeze()

                # --- SAM post-processing (if enabled) ---
                if use_sam:
                    sam_refined_mask = torch.zeros((h_ori, w_ori), device=accelerator.device)
                    unique_classes = torch.unique(pred_mask)
                    for class_id in unique_classes:
                        if class_id == 0:
                            continue
                        binary_mask = (pred_mask == class_id).double()
                        try:
                            logits = compute_logits_from_mask(binary_mask)
                            point_coords, point_labels = masks_sample_points(binary_mask)
                            sam_mask, _, logit = predictor.predict(point_coords=point_coords, point_labels=point_labels,
                                                                   mask_input=logits, multimask_output=False)
                            # two extra self-refinement rounds, feeding SAM its own logits back
                            for _ in range(2):
                                sam_mask, _, logit = predictor.predict(point_coords=point_coords, point_labels=point_labels,
                                                                       mask_input=logit, multimask_output=False)
                        except Exception:
                            sam_mask = np.zeros((1, h_ori, w_ori))
                        sam_refined_mask[torch.from_numpy(sam_mask[0] > 0).to(accelerator.device)] = class_id

                # --- Metric calculation for the raw model mask ---
                # ignore_index=255 excludes void pixels from the IoU computation
                intersection_i, union_i, _ = intersectionAndUnionGPU(pred_mask, gt_mask, 2, ignore_index=255)
                total_intersection += intersection_i
                total_union += union_i
                # the 1e-5 epsilon guards against division by zero on empty unions
                iou_per_sample = intersection_i / (union_i + 1e-5)
                iou_per_sample[union_i == 0] = 1.0
                total_acc_iou += iou_per_sample

                # --- Metric calculation for the SAM-refined mask ---
                if use_sam:
                    intersection_sam_i, union_sam_i, _ = intersectionAndUnionGPU(sam_refined_mask, gt_mask, 2,
                                                                                 ignore_index=255)
                    total_intersection_sam += intersection_sam_i
                    total_union_sam += union_sam_i
                    iou_per_sample_sam = intersection_sam_i / (union_sam_i + 1e-5)
                    iou_per_sample_sam[union_sam_i == 0] = 1.0
                    total_acc_iou_sam += iou_per_sample_sam

        # Update global recorders (average the per-image gIoU over its prompts,
        # as eval_refer_seg.py does)
        intersection_meter.update(total_intersection.cpu().numpy())
        union_meter.update(total_union.cpu().numpy())
        acc_iou_meter.update((total_acc_iou / num_masks_in_image).cpu().numpy(), n=num_masks_in_image)
        if use_sam:
            intersection_meter_sam.update(total_intersection_sam.cpu().numpy())
            union_meter_sam.update(total_union_sam.cpu().numpy())
            acc_iou_meter_sam.update((total_acc_iou_sam / num_masks_in_image).cpu().numpy(), n=num_masks_in_image)

    # --- 3. Aggregate metrics from all GPUs ---
    all_intersections = accelerator.gather_for_metrics(torch.from_numpy(intersection_meter.sum).to(accelerator.device))
    all_unions = accelerator.gather_for_metrics(torch.from_numpy(union_meter.sum).to(accelerator.device))
    all_giou_sum = accelerator.gather_for_metrics(torch.from_numpy(acc_iou_meter.sum).to(accelerator.device))
    all_giou_count = accelerator.gather_for_metrics(torch.tensor(acc_iou_meter.count, device=accelerator.device))

    # reshape gathered tensors to (num_processes, 2) before summing, as in eval_refer_seg.py
    all_intersections = all_intersections.view(-1, 2)
    all_unions = all_unions.view(-1, 2)
    all_giou_sum = all_giou_sum.view(-1, 2)

    if use_sam:
        all_intersections_sam = accelerator.gather_for_metrics(
            torch.from_numpy(intersection_meter_sam.sum).to(accelerator.device))
        all_unions_sam = accelerator.gather_for_metrics(torch.from_numpy(union_meter_sam.sum).to(accelerator.device))
        all_giou_sum_sam = accelerator.gather_for_metrics(
            torch.from_numpy(acc_iou_meter_sam.sum).to(accelerator.device))
        all_giou_count_sam = accelerator.gather_for_metrics(
            torch.tensor(acc_iou_meter_sam.count, device=accelerator.device))

        all_intersections_sam = all_intersections_sam.view(-1, 2)
        all_unions_sam = all_unions_sam.view(-1, 2)
        all_giou_sum_sam = all_giou_sum_sam.view(-1, 2)

    # --- 4. Calculate final results and return them from the main process ---
    final_metrics = {}
    if accelerator.is_main_process:
        # raw model metrics
        iou_class = torch.sum(all_intersections, dim=0) / (torch.sum(all_unions, dim=0) + 1e-5)
        ciou = iou_class[1].item()
        giou_sum = torch.sum(all_giou_sum, dim=0)[1]
        giou_count = torch.sum(all_giou_count)
        giou = (giou_sum / giou_count).item() if giou_count > 0 else 0.0
        final_metrics['giou'] = giou
        final_metrics['ciou'] = ciou

        # SAM-refined metrics
        if use_sam:
            iou_class_sam = torch.sum(all_intersections_sam, dim=0) / (torch.sum(all_unions_sam, dim=0) + 1e-5)
            ciou_sam = iou_class_sam[1].item()
            giou_sum_sam = torch.sum(all_giou_sum_sam, dim=0)[1]
            giou_count_sam = torch.sum(all_giou_count_sam)
            giou_sam = (giou_sum_sam / giou_count_sam).item() if giou_count_sam > 0 else 0.0
            final_metrics['sam_giou'] = giou_sam
            final_metrics['sam_ciou'] = ciou_sam

    return final_metrics
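A hedged sketch of how run_in_process_evaluation might be wired into a training loop; model, build_eval_dataloader, and the surrounding setup are placeholders rather than repo code, and the loader must yield the same (images, masks, image_names, questions, image_paths) batches of size 1 that the function unpacks above:

from accelerate import Accelerator
from eval.val_utils import run_in_process_evaluation

accelerator = Accelerator()
model = ...                                                 # must expose generate_with_segmentation(image, question)
eval_loader = accelerator.prepare(build_eval_dataloader())  # placeholder dataloader factory

metrics = run_in_process_evaluation(model, accelerator, eval_loader, sam_predictor=None)
if accelerator.is_main_process:
    print(f"gIoU={metrics['giou']:.4f}, cIoU={metrics['ciou']:.4f}")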