Matis Despujols committed
Commit 066effd · verified · 1 Parent(s): aa2b37d

Upload 97 files

This view is limited to 50 files because it contains too many changes.

Files changed (50)
  1. rfdetr/__init__.py +12 -0
  2. rfdetr/__pycache__/__init__.cpython-313.pyc +0 -0
  3. rfdetr/__pycache__/config.cpython-313.pyc +0 -0
  4. rfdetr/__pycache__/detr.cpython-313.pyc +0 -0
  5. rfdetr/__pycache__/engine.cpython-313.pyc +0 -0
  6. rfdetr/__pycache__/main.cpython-313.pyc +0 -0
  7. rfdetr/cli/__pycache__/main.cpython-313.pyc +0 -0
  8. rfdetr/cli/main.py +87 -0
  9. rfdetr/config.py +142 -0
  10. rfdetr/datasets/__init__.py +36 -0
  11. rfdetr/datasets/__pycache__/__init__.cpython-313.pyc +0 -0
  12. rfdetr/datasets/__pycache__/coco.cpython-313.pyc +0 -0
  13. rfdetr/datasets/__pycache__/coco_eval.cpython-313.pyc +0 -0
  14. rfdetr/datasets/__pycache__/o365.cpython-313.pyc +0 -0
  15. rfdetr/datasets/__pycache__/transforms.cpython-313.pyc +0 -0
  16. rfdetr/datasets/coco.py +280 -0
  17. rfdetr/datasets/coco_eval.py +271 -0
  18. rfdetr/datasets/o365.py +53 -0
  19. rfdetr/datasets/transforms.py +475 -0
  20. rfdetr/deploy/__init__.py +0 -0
  21. rfdetr/deploy/__pycache__/__init__.cpython-313.pyc +0 -0
  22. rfdetr/deploy/__pycache__/benchmark.cpython-313.pyc +0 -0
  23. rfdetr/deploy/__pycache__/export.cpython-313.pyc +0 -0
  24. rfdetr/deploy/_onnx/__init__.py +13 -0
  25. rfdetr/deploy/_onnx/__pycache__/__init__.cpython-313.pyc +0 -0
  26. rfdetr/deploy/_onnx/__pycache__/optimizer.cpython-313.pyc +0 -0
  27. rfdetr/deploy/_onnx/__pycache__/symbolic.cpython-313.pyc +0 -0
  28. rfdetr/deploy/_onnx/optimizer.py +579 -0
  29. rfdetr/deploy/_onnx/symbolic.py +37 -0
  30. rfdetr/deploy/benchmark.py +590 -0
  31. rfdetr/deploy/export.py +276 -0
  32. rfdetr/detr.py +451 -0
  33. rfdetr/engine.py +340 -0
  34. rfdetr/main.py +1062 -0
  35. rfdetr/models/__init__.py +16 -0
  36. rfdetr/models/__pycache__/__init__.cpython-313.pyc +0 -0
  37. rfdetr/models/__pycache__/lwdetr.cpython-313.pyc +0 -0
  38. rfdetr/models/__pycache__/matcher.cpython-313.pyc +0 -0
  39. rfdetr/models/__pycache__/position_encoding.cpython-313.pyc +0 -0
  40. rfdetr/models/__pycache__/transformer.cpython-313.pyc +0 -0
  41. rfdetr/models/backbone/__init__.py +110 -0
  42. rfdetr/models/backbone/__pycache__/__init__.cpython-313.pyc +0 -0
  43. rfdetr/models/backbone/__pycache__/backbone.cpython-313.pyc +0 -0
  44. rfdetr/models/backbone/__pycache__/base.cpython-313.pyc +0 -0
  45. rfdetr/models/backbone/__pycache__/dinov2.cpython-313.pyc +0 -0
  46. rfdetr/models/backbone/__pycache__/dinov2_with_windowed_attn.cpython-313.pyc +0 -0
  47. rfdetr/models/backbone/__pycache__/projector.cpython-313.pyc +0 -0
  48. rfdetr/models/backbone/backbone.py +205 -0
  49. rfdetr/models/backbone/base.py +20 -0
  50. rfdetr/models/backbone/dinov2.py +197 -0
rfdetr/__init__.py ADDED
@@ -0,0 +1,12 @@
# ------------------------------------------------------------------------
# RF-DETR
# Copyright (c) 2025 Roboflow. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------


import os
if os.environ.get("PYTORCH_ENABLE_MPS_FALLBACK") is None:
    os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

from rfdetr.detr import RFDETRBase, RFDETRLarge, RFDETRNano, RFDETRSmall, RFDETRMedium
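As a quick illustration of what this __init__ does at import time (not part of the commit): importing the package sets the MPS fallback flag unless the caller already set it, and re-exports the five model classes from rfdetr.detr. A minimal sketch, assuming the rfdetr package is installed:

# Sketch only: shows the import-time side effect and the re-exported classes.
import os

os.environ.pop("PYTORCH_ENABLE_MPS_FALLBACK", None)  # start from a clean environment

import rfdetr  # sets PYTORCH_ENABLE_MPS_FALLBACK=1 because it was unset
print(os.environ["PYTORCH_ENABLE_MPS_FALLBACK"])     # -> "1"

from rfdetr import RFDETRBase, RFDETRNano  # classes re-exported from rfdetr.detr
print(RFDETRBase, RFDETRNano)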
rfdetr/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (530 Bytes).

rfdetr/__pycache__/config.cpython-313.pyc ADDED
Binary file (7.12 kB).

rfdetr/__pycache__/detr.cpython-313.pyc ADDED
Binary file (22.4 kB).

rfdetr/__pycache__/engine.cpython-313.pyc ADDED
Binary file (17.6 kB).

rfdetr/__pycache__/main.cpython-313.pyc ADDED
Binary file (47.4 kB).

rfdetr/cli/__pycache__/main.cpython-313.pyc ADDED
Binary file (4.16 kB).

rfdetr/cli/main.py ADDED
@@ -0,0 +1,87 @@
# ------------------------------------------------------------------------
# RF-DETR
# Copyright (c) 2025 Roboflow. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# Modified from LW-DETR (https://github.com/Atten4Vis/LW-DETR)
# Copyright (c) 2024 Baidu. All Rights Reserved.
# ------------------------------------------------------------------------

import argparse
from rf100vl import get_rf100vl_projects
import roboflow
from rfdetr import RFDETRBase
import torch
import os

def download_dataset(rf_project: roboflow.Project, dataset_version: int):
    versions = rf_project.versions()
    if dataset_version is not None:
        versions = [v for v in versions if v.version == str(dataset_version)]
        if len(versions) == 0:
            raise ValueError(f"Dataset version {dataset_version} not found")
        version = versions[0]
    else:
        version = max(versions, key=lambda v: v.id)
    location = os.path.join("datasets/", rf_project.name + "_v" + version.version)
    if not os.path.exists(location):
        location = version.download(
            model_format="coco", location=location, overwrite=False
        ).location

    return location


def train_from_rf_project(rf_project: roboflow.Project, dataset_version: int):
    location = download_dataset(rf_project, dataset_version)
    print(location)
    rf_detr = RFDETRBase()
    device_supports_cuda = torch.cuda.is_available()
    rf_detr.train(
        dataset_dir=location,
        epochs=1,
        device="cuda" if device_supports_cuda else "cpu",
    )


def train_from_coco_dir(coco_dir: str):
    rf_detr = RFDETRBase()
    device_supports_cuda = torch.cuda.is_available()
    rf_detr.train(
        dataset_dir=coco_dir,
        epochs=1,
        device="cuda" if device_supports_cuda else "cpu",
    )


def trainer():
    parser = argparse.ArgumentParser()
    parser.add_argument("--coco_dir", type=str, required=False)
    parser.add_argument("--api_key", type=str, required=False)
    parser.add_argument("--workspace", type=str, required=False, default=None)
    parser.add_argument("--project_name", type=str, required=False, default=None)
    parser.add_argument("--dataset_version", type=int, required=False, default=None)
    args = parser.parse_args()

    if args.coco_dir is not None:
        train_from_coco_dir(args.coco_dir)
        return

    if (args.workspace is None and args.project_name is not None) or (
        args.workspace is not None and args.project_name is None
    ):
        raise ValueError(
            "Either both workspace and project_name must be provided or none of them"
        )

    if args.workspace is not None:
        rf = roboflow.Roboflow(api_key=args.api_key)
        project = rf.workspace(args.workspace).project(args.project_name)
    else:
        projects = get_rf100vl_projects(api_key=args.api_key)
        project = projects[0].rf_project

    train_from_rf_project(project, args.dataset_version)


if __name__ == "__main__":
    trainer()
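For reference, the same training call can be made directly from Python without the CLI wrapper; a minimal sketch mirroring train_from_coco_dir, where the dataset path is hypothetical:

# Sketch mirroring train_from_coco_dir above; "datasets/my_dataset" is a hypothetical
# COCO-format directory (train/valid/test splits with _annotations.coco.json files).
import torch
from rfdetr import RFDETRBase

model = RFDETRBase()
model.train(
    dataset_dir="datasets/my_dataset",
    epochs=1,
    device="cuda" if torch.cuda.is_available() else "cpu",
)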
rfdetr/config.py ADDED
@@ -0,0 +1,142 @@
# ------------------------------------------------------------------------
# RF-DETR
# Copyright (c) 2025 Roboflow. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------


from pydantic import BaseModel
from typing import List, Optional, Literal, Type
import torch
DEVICE = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

class ModelConfig(BaseModel):
    encoder: Literal["dinov2_windowed_small", "dinov2_windowed_base"]
    out_feature_indexes: List[int]
    dec_layers: int
    two_stage: bool = True
    projector_scale: List[Literal["P3", "P4", "P5"]]
    hidden_dim: int
    patch_size: int
    num_windows: int
    sa_nheads: int
    ca_nheads: int
    dec_n_points: int
    bbox_reparam: bool = True
    lite_refpoint_refine: bool = True
    layer_norm: bool = True
    amp: bool = True
    num_classes: int = 90
    pretrain_weights: Optional[str] = None
    device: Literal["cpu", "cuda", "mps"] = DEVICE
    resolution: int
    group_detr: int = 13
    gradient_checkpointing: bool = False
    positional_encoding_size: int

class RFDETRBaseConfig(ModelConfig):
    """
    The configuration for an RF-DETR Base model.
    """
    encoder: Literal["dinov2_windowed_small", "dinov2_windowed_base"] = "dinov2_windowed_small"
    hidden_dim: int = 256
    patch_size: int = 14
    num_windows: int = 4
    dec_layers: int = 3
    sa_nheads: int = 8
    ca_nheads: int = 16
    dec_n_points: int = 2
    num_queries: int = 300
    num_select: int = 300
    projector_scale: List[Literal["P3", "P4", "P5"]] = ["P4"]
    out_feature_indexes: List[int] = [2, 5, 8, 11]
    pretrain_weights: Optional[str] = "rf-detr-base.pth"
    resolution: int = 560
    positional_encoding_size: int = 37

class RFDETRLargeConfig(RFDETRBaseConfig):
    """
    The configuration for an RF-DETR Large model.
    """
    encoder: Literal["dinov2_windowed_small", "dinov2_windowed_base"] = "dinov2_windowed_base"
    hidden_dim: int = 384
    sa_nheads: int = 12
    ca_nheads: int = 24
    dec_n_points: int = 4
    projector_scale: List[Literal["P3", "P4", "P5"]] = ["P3", "P5"]
    pretrain_weights: Optional[str] = "rf-detr-large.pth"

class RFDETRNanoConfig(RFDETRBaseConfig):
    """
    The configuration for an RF-DETR Nano model.
    """
    out_feature_indexes: List[int] = [3, 6, 9, 12]
    num_windows: int = 2
    dec_layers: int = 2
    patch_size: int = 16
    resolution: int = 384
    positional_encoding_size: int = 24
    pretrain_weights: Optional[str] = "rf-detr-nano.pth"

class RFDETRSmallConfig(RFDETRBaseConfig):
    """
    The configuration for an RF-DETR Small model.
    """
    out_feature_indexes: List[int] = [3, 6, 9, 12]
    num_windows: int = 2
    dec_layers: int = 3
    patch_size: int = 16
    resolution: int = 512
    positional_encoding_size: int = 32
    pretrain_weights: Optional[str] = "rf-detr-small.pth"

class RFDETRMediumConfig(RFDETRBaseConfig):
    """
    The configuration for an RF-DETR Medium model.
    """
    out_feature_indexes: List[int] = [3, 6, 9, 12]
    num_windows: int = 2
    dec_layers: int = 4
    patch_size: int = 16
    resolution: int = 576
    positional_encoding_size: int = 36
    pretrain_weights: Optional[str] = "rf-detr-medium.pth"

class TrainConfig(BaseModel):
    lr: float = 1e-4
    lr_encoder: float = 1.5e-4
    batch_size: int = 4
    grad_accum_steps: int = 4
    epochs: int = 100
    ema_decay: float = 0.993
    ema_tau: int = 100
    lr_drop: int = 100
    checkpoint_interval: int = 10
    warmup_epochs: int = 0
    lr_vit_layer_decay: float = 0.8
    lr_component_decay: float = 0.7
    drop_path: float = 0.0
    group_detr: int = 13
    ia_bce_loss: bool = True
    cls_loss_coef: float = 1.0
    num_select: int = 300
    dataset_file: Literal["coco", "o365", "roboflow"] = "roboflow"
    square_resize_div_64: bool = True
    dataset_dir: str
    output_dir: str = "output"
    multi_scale: bool = True
    expanded_scales: bool = True
    do_random_resize_via_padding: bool = False
    use_ema: bool = True
    num_workers: int = 2
    weight_decay: float = 1e-4
    early_stopping: bool = False
    early_stopping_patience: int = 10
    early_stopping_min_delta: float = 0.001
    early_stopping_use_ema: bool = False
    tensorboard: bool = True
    wandb: bool = False
    project: Optional[str] = None
    run: Optional[str] = None
    class_names: List[str] = None
    run_test: bool = True
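The configs above are plain pydantic models, so the per-variant defaults can be inspected or overridden at construction time; a small sketch (the dataset_dir value is a hypothetical path):

# Sketch: inspect and override config defaults.
from rfdetr.config import RFDETRBaseConfig, RFDETRNanoConfig, TrainConfig

base = RFDETRBaseConfig()
nano = RFDETRNanoConfig(resolution=448)   # override a default
print(base.resolution, base.patch_size)   # 560 14
print(nano.resolution, nano.patch_size)   # 448 16

train_cfg = TrainConfig(dataset_dir="datasets/my_dataset")  # dataset_dir has no default
print(train_cfg.batch_size, train_cfg.grad_accum_steps)     # 4 4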
rfdetr/datasets/__init__.py ADDED
@@ -0,0 +1,36 @@
# ------------------------------------------------------------------------
# LW-DETR
# Copyright (c) 2024 Baidu. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# Modified from Conditional DETR (https://github.com/Atten4Vis/ConditionalDETR)
# Copyright (c) 2021 Microsoft. All Rights Reserved.
# ------------------------------------------------------------------------
# Copied from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
# ------------------------------------------------------------------------

import torch.utils.data
import torchvision

from .coco import build as build_coco
from .o365 import build_o365
from .coco import build_roboflow


def get_coco_api_from_dataset(dataset):
    for _ in range(10):
        if isinstance(dataset, torch.utils.data.Subset):
            dataset = dataset.dataset
    if isinstance(dataset, torchvision.datasets.CocoDetection):
        return dataset.coco


def build_dataset(image_set, args, resolution):
    if args.dataset_file == 'coco':
        return build_coco(image_set, args, resolution)
    if args.dataset_file == 'o365':
        return build_o365(image_set, args, resolution)
    if args.dataset_file == 'roboflow':
        return build_roboflow(image_set, args, resolution)
    raise ValueError(f'dataset {args.dataset_file} not supported')
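build_dataset only needs an args object exposing the fields the builders read; a sketch using a plain namespace against a hypothetical Roboflow-format dataset directory:

# Sketch: build_dataset reads attributes off `args`; a SimpleNamespace is enough.
# "datasets/my_dataset" is a hypothetical directory in Roboflow COCO export layout.
from types import SimpleNamespace
from rfdetr.datasets import build_dataset

args = SimpleNamespace(
    dataset_file="roboflow",
    dataset_dir="datasets/my_dataset",
    square_resize_div_64=True,
    multi_scale=True,
    expanded_scales=True,
    do_random_resize_via_padding=False,
    patch_size=14,
    num_windows=4,
)
train_ds = build_dataset(image_set="train", args=args, resolution=560)
print(len(train_ds))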
rfdetr/datasets/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (1.53 kB).

rfdetr/datasets/__pycache__/coco.cpython-313.pyc ADDED
Binary file (11 kB).

rfdetr/datasets/__pycache__/coco_eval.cpython-313.pyc ADDED
Binary file (11.8 kB).

rfdetr/datasets/__pycache__/o365.cpython-313.pyc ADDED
Binary file (1.93 kB).

rfdetr/datasets/__pycache__/transforms.cpython-313.pyc ADDED
Binary file (23.9 kB).

rfdetr/datasets/coco.py ADDED
@@ -0,0 +1,280 @@
# ------------------------------------------------------------------------
# RF-DETR
# Copyright (c) 2025 Roboflow. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# Modified from LW-DETR (https://github.com/Atten4Vis/LW-DETR)
# Copyright (c) 2024 Baidu. All Rights Reserved.
# ------------------------------------------------------------------------
# Modified from Conditional DETR (https://github.com/Atten4Vis/ConditionalDETR)
# Copyright (c) 2021 Microsoft. All Rights Reserved.
# ------------------------------------------------------------------------
# Copied from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
# ------------------------------------------------------------------------

"""
COCO dataset which returns image_id for evaluation.

Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py
"""
from pathlib import Path

import torch
import torch.utils.data
import torchvision

import rfdetr.datasets.transforms as T


def compute_multi_scale_scales(resolution, expanded_scales=False, patch_size=16, num_windows=4):
    # round to the nearest multiple of 4*patch_size to enable both patching and windowing
    base_num_patches_per_window = resolution // (patch_size * num_windows)
    offsets = [-3, -2, -1, 0, 1, 2, 3, 4] if not expanded_scales else [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5]
    scales = [base_num_patches_per_window + offset for offset in offsets]
    proposed_scales = [scale * patch_size * num_windows for scale in scales]
    proposed_scales = [scale for scale in proposed_scales if scale >= patch_size * num_windows * 2]  # ensure minimum image size
    return proposed_scales


class CocoDetection(torchvision.datasets.CocoDetection):
    def __init__(self, img_folder, ann_file, transforms):
        super(CocoDetection, self).__init__(img_folder, ann_file)
        self._transforms = transforms
        self.prepare = ConvertCoco()

    def __getitem__(self, idx):
        img, target = super(CocoDetection, self).__getitem__(idx)
        image_id = self.ids[idx]
        target = {'image_id': image_id, 'annotations': target}
        img, target = self.prepare(img, target)
        if self._transforms is not None:
            img, target = self._transforms(img, target)
        return img, target


class ConvertCoco(object):

    def __call__(self, image, target):
        w, h = image.size

        image_id = target["image_id"]
        image_id = torch.tensor([image_id])

        anno = target["annotations"]

        anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0]

        boxes = [obj["bbox"] for obj in anno]
        # guard against no boxes via resizing
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        boxes[:, 2:] += boxes[:, :2]
        boxes[:, 0::2].clamp_(min=0, max=w)
        boxes[:, 1::2].clamp_(min=0, max=h)

        classes = [obj["category_id"] for obj in anno]
        classes = torch.tensor(classes, dtype=torch.int64)

        keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
        boxes = boxes[keep]
        classes = classes[keep]

        target = {}
        target["boxes"] = boxes
        target["labels"] = classes
        target["image_id"] = image_id

        # for conversion to coco api
        area = torch.tensor([obj["area"] for obj in anno])
        iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno])
        target["area"] = area[keep]
        target["iscrowd"] = iscrowd[keep]

        target["orig_size"] = torch.as_tensor([int(h), int(w)])
        target["size"] = torch.as_tensor([int(h), int(w)])

        return image, target


def make_coco_transforms(image_set, resolution, multi_scale=False, expanded_scales=False, skip_random_resize=False, patch_size=16, num_windows=4):

    normalize = T.Compose([
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    scales = [resolution]
    if multi_scale:
        # scales = [448, 512, 576, 640, 704, 768, 832, 896]
        scales = compute_multi_scale_scales(resolution, expanded_scales, patch_size, num_windows)
        if skip_random_resize:
            scales = [scales[-1]]
        print(scales)

    if image_set == 'train':
        return T.Compose([
            T.RandomHorizontalFlip(),
            T.RandomSelect(
                T.RandomResize(scales, max_size=1333),
                T.Compose([
                    T.RandomResize([400, 500, 600]),
                    T.RandomSizeCrop(384, 600),
                    T.RandomResize(scales, max_size=1333),
                ])
            ),
            normalize,
        ])

    if image_set == 'val':
        return T.Compose([
            T.RandomResize([resolution], max_size=1333),
            normalize,
        ])
    if image_set == 'val_speed':
        return T.Compose([
            T.SquareResize([resolution]),
            normalize,
        ])

    raise ValueError(f'unknown {image_set}')


def make_coco_transforms_square_div_64(image_set, resolution, multi_scale=False, expanded_scales=False, skip_random_resize=False, patch_size=16, num_windows=4):
    """
    """

    normalize = T.Compose([
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])


    scales = [resolution]
    if multi_scale:
        # scales = [448, 512, 576, 640, 704, 768, 832, 896]
        scales = compute_multi_scale_scales(resolution, expanded_scales, patch_size, num_windows)
        if skip_random_resize:
            scales = [scales[-1]]
        print(scales)

    if image_set == 'train':
        return T.Compose([
            T.RandomHorizontalFlip(),
            T.RandomSelect(
                T.SquareResize(scales),
                T.Compose([
                    T.RandomResize([400, 500, 600]),
                    T.RandomSizeCrop(384, 600),
                    T.SquareResize(scales),
                ]),
            ),
            normalize,
        ])

    if image_set == 'val':
        return T.Compose([
            T.SquareResize([resolution]),
            normalize,
        ])
    if image_set == 'test':
        return T.Compose([
            T.SquareResize([resolution]),
            normalize,
        ])
    if image_set == 'val_speed':
        return T.Compose([
            T.SquareResize([resolution]),
            normalize,
        ])

    raise ValueError(f'unknown {image_set}')

def build(image_set, args, resolution):
    root = Path(args.coco_path)
    assert root.exists(), f'provided COCO path {root} does not exist'
    mode = 'instances'
    PATHS = {
        "train": (root / "train2017", root / "annotations" / f'{mode}_train2017.json'),
        "val": (root / "val2017", root / "annotations" / f'{mode}_val2017.json'),
        "test": (root / "test2017", root / "annotations" / f'image_info_test-dev2017.json'),
    }

    img_folder, ann_file = PATHS[image_set.split("_")[0]]

    try:
        square_resize = args.square_resize
    except:
        square_resize = False

    try:
        square_resize_div_64 = args.square_resize_div_64
    except:
        square_resize_div_64 = False


    if square_resize_div_64:
        dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms_square_div_64(
            image_set,
            resolution,
            multi_scale=args.multi_scale,
            expanded_scales=args.expanded_scales,
            skip_random_resize=not args.do_random_resize_via_padding,
            patch_size=args.patch_size,
            num_windows=args.num_windows
        ))
    else:
        dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms(
            image_set,
            resolution,
            multi_scale=args.multi_scale,
            expanded_scales=args.expanded_scales,
            skip_random_resize=not args.do_random_resize_via_padding,
            patch_size=args.patch_size,
            num_windows=args.num_windows
        ))
    return dataset

def build_roboflow(image_set, args, resolution):
    root = Path(args.dataset_dir)
    assert root.exists(), f'provided Roboflow path {root} does not exist'
    mode = 'instances'
    PATHS = {
        "train": (root / "train", root / "train" / "_annotations.coco.json"),
        "val": (root / "valid", root / "valid" / "_annotations.coco.json"),
        "test": (root / "test", root / "test" / "_annotations.coco.json"),
    }

    img_folder, ann_file = PATHS[image_set.split("_")[0]]

    try:
        square_resize = args.square_resize
    except:
        square_resize = False

    try:
        square_resize_div_64 = args.square_resize_div_64
    except:
        square_resize_div_64 = False


    if square_resize_div_64:
        dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms_square_div_64(
            image_set,
            resolution,
            multi_scale=args.multi_scale,
            expanded_scales=args.expanded_scales,
            skip_random_resize=not args.do_random_resize_via_padding,
            patch_size=args.patch_size,
            num_windows=args.num_windows
        ))
    else:
        dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms(
            image_set,
            resolution,
            multi_scale=args.multi_scale,
            expanded_scales=args.expanded_scales,
            skip_random_resize=not args.do_random_resize_via_padding,
            patch_size=args.patch_size,
            num_windows=args.num_windows
        ))
    return dataset
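compute_multi_scale_scales is a pure function, so its behaviour is easy to check; for the Base defaults (resolution 560, patch size 14, 4 windows) it yields multiples of 56 around the training resolution:

# Sketch: multi-scale sizes produced for the RF-DETR Base defaults.
from rfdetr.datasets.coco import compute_multi_scale_scales

# base window size = 560 // (14 * 4) = 10 patches; offsets -3..4 give 7..14 patches
# per window; each candidate scale is then patches * 14 * 4, i.e. a multiple of 56.
print(compute_multi_scale_scales(560, expanded_scales=False, patch_size=14, num_windows=4))
# -> [392, 448, 504, 560, 616, 672, 728, 784]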
rfdetr/datasets/coco_eval.py ADDED
@@ -0,0 +1,271 @@
# ------------------------------------------------------------------------
# RF-DETR
# Copyright (c) 2025 Roboflow. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# Modified from LW-DETR (https://github.com/Atten4Vis/LW-DETR)
# Copyright (c) 2024 Baidu. All Rights Reserved.
# ------------------------------------------------------------------------
# Copied from Conditional DETR (https://github.com/Atten4Vis/ConditionalDETR)
# Copyright (c) 2021 Microsoft. All Rights Reserved.
# ------------------------------------------------------------------------
# Copied from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
# ------------------------------------------------------------------------

"""
COCO evaluator that works in distributed mode.

Mostly copy-paste from https://github.com/pytorch/vision/blob/edfd5a7/references/detection/coco_eval.py
The difference is that there is less copy-pasting from pycocotools
in the end of the file, as python3 can suppress prints with contextlib
"""
import os
import contextlib
import copy
import numpy as np
import torch

from pycocotools.cocoeval import COCOeval
from pycocotools.coco import COCO
import pycocotools.mask as mask_util

from rfdetr.util.misc import all_gather


class CocoEvaluator(object):
    def __init__(self, coco_gt, iou_types):
        assert isinstance(iou_types, (list, tuple))
        coco_gt = copy.deepcopy(coco_gt)
        self.coco_gt = coco_gt

        self.iou_types = iou_types
        self.coco_eval = {}
        for iou_type in iou_types:
            self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type)

        self.img_ids = []
        self.eval_imgs = {k: [] for k in iou_types}

    def update(self, predictions):
        img_ids = list(np.unique(list(predictions.keys())))
        self.img_ids.extend(img_ids)

        for iou_type in self.iou_types:
            results = self.prepare(predictions, iou_type)

            # suppress pycocotools prints
            with open(os.devnull, 'w') as devnull:
                with contextlib.redirect_stdout(devnull):
                    coco_dt = COCO.loadRes(self.coco_gt, results) if results else COCO()
            coco_eval = self.coco_eval[iou_type]

            coco_eval.cocoDt = coco_dt
            coco_eval.params.imgIds = list(img_ids)
            img_ids, eval_imgs = evaluate(coco_eval)

            self.eval_imgs[iou_type].append(eval_imgs)

    def synchronize_between_processes(self):
        for iou_type in self.iou_types:
            self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2)
            create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type])

    def accumulate(self):
        for coco_eval in self.coco_eval.values():
            coco_eval.accumulate()

    def summarize(self):
        for iou_type, coco_eval in self.coco_eval.items():
            print("IoU metric: {}".format(iou_type))
            coco_eval.summarize()

    def prepare(self, predictions, iou_type):
        if iou_type == "bbox":
            return self.prepare_for_coco_detection(predictions)
        elif iou_type == "segm":
            return self.prepare_for_coco_segmentation(predictions)
        elif iou_type == "keypoints":
            return self.prepare_for_coco_keypoint(predictions)
        else:
            raise ValueError("Unknown iou type {}".format(iou_type))

    def prepare_for_coco_detection(self, predictions):
        coco_results = []
        for original_id, prediction in predictions.items():
            if len(prediction) == 0:
                continue

            boxes = prediction["boxes"]
            boxes = convert_to_xywh(boxes).tolist()
            scores = prediction["scores"].tolist()
            labels = prediction["labels"].tolist()

            coco_results.extend(
                [
                    {
                        "image_id": original_id,
                        "category_id": labels[k],
                        "bbox": box,
                        "score": scores[k],
                    }
                    for k, box in enumerate(boxes)
                ]
            )
        return coco_results

    def prepare_for_coco_segmentation(self, predictions):
        coco_results = []
        for original_id, prediction in predictions.items():
            if len(prediction) == 0:
                continue

            scores = prediction["scores"]
            labels = prediction["labels"]
            masks = prediction["masks"]

            masks = masks > 0.5

            scores = prediction["scores"].tolist()
            labels = prediction["labels"].tolist()

            rles = [
                mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0]
                for mask in masks
            ]
            for rle in rles:
                rle["counts"] = rle["counts"].decode("utf-8")

            coco_results.extend(
                [
                    {
                        "image_id": original_id,
                        "category_id": labels[k],
                        "segmentation": rle,
                        "score": scores[k],
                    }
                    for k, rle in enumerate(rles)
                ]
            )
        return coco_results

    def prepare_for_coco_keypoint(self, predictions):
        coco_results = []
        for original_id, prediction in predictions.items():
            if len(prediction) == 0:
                continue

            boxes = prediction["boxes"]
            boxes = convert_to_xywh(boxes).tolist()
            scores = prediction["scores"].tolist()
            labels = prediction["labels"].tolist()
            keypoints = prediction["keypoints"]
            keypoints = keypoints.flatten(start_dim=1).tolist()

            coco_results.extend(
                [
                    {
                        "image_id": original_id,
                        "category_id": labels[k],
                        'keypoints': keypoint,
                        "score": scores[k],
                    }
                    for k, keypoint in enumerate(keypoints)
                ]
            )
        return coco_results


def convert_to_xywh(boxes):
    xmin, ymin, xmax, ymax = boxes.unbind(1)
    return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1)


def merge(img_ids, eval_imgs):
    all_img_ids = all_gather(img_ids)
    all_eval_imgs = all_gather(eval_imgs)

    merged_img_ids = []
    for p in all_img_ids:
        merged_img_ids.extend(p)

    merged_eval_imgs = []
    for p in all_eval_imgs:
        merged_eval_imgs.append(p)

    merged_img_ids = np.array(merged_img_ids)
    merged_eval_imgs = np.concatenate(merged_eval_imgs, 2)

    # keep only unique (and in sorted order) images
    merged_img_ids, idx = np.unique(merged_img_ids, return_index=True)
    merged_eval_imgs = merged_eval_imgs[..., idx]

    return merged_img_ids, merged_eval_imgs


def create_common_coco_eval(coco_eval, img_ids, eval_imgs):
    img_ids, eval_imgs = merge(img_ids, eval_imgs)
    img_ids = list(img_ids)
    eval_imgs = list(eval_imgs.flatten())

    coco_eval.evalImgs = eval_imgs
    coco_eval.params.imgIds = img_ids
    coco_eval._paramsEval = copy.deepcopy(coco_eval.params)


#################################################################
# From pycocotools, just removed the prints and fixed
# a Python3 bug about unicode not defined
#################################################################


def evaluate(self):
    '''
    Run per image evaluation on given images and store results (a list of dict) in self.evalImgs
    :return: None
    '''
    # tic = time.time()
    # print('Running per image evaluation...')
    p = self.params
    # add backward compatibility if useSegm is specified in params
    if p.useSegm is not None:
        p.iouType = 'segm' if p.useSegm == 1 else 'bbox'
        print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType))
    # print('Evaluate annotation type *{}*'.format(p.iouType))
    p.imgIds = list(np.unique(p.imgIds))
    if p.useCats:
        p.catIds = list(np.unique(p.catIds))
    p.maxDets = sorted(p.maxDets)
    self.params = p

    self._prepare()
    # loop through images, area range, max detection number
    catIds = p.catIds if p.useCats else [-1]

    if p.iouType == 'segm' or p.iouType == 'bbox':
        computeIoU = self.computeIoU
    elif p.iouType == 'keypoints':
        computeIoU = self.computeOks
    self.ious = {
        (imgId, catId): computeIoU(imgId, catId)
        for imgId in p.imgIds
        for catId in catIds}

    evaluateImg = self.evaluateImg
    maxDet = p.maxDets[-1]
    evalImgs = [
        evaluateImg(imgId, catId, areaRng, maxDet)
        for catId in catIds
        for areaRng in p.areaRng
        for imgId in p.imgIds
    ]
    # this is NOT in the pycocotools code, but could be done outside
    evalImgs = np.asarray(evalImgs).reshape(len(catIds), len(p.areaRng), len(p.imgIds))
    self._paramsEval = copy.deepcopy(self.params)
    # toc = time.time()
    # print('DONE (t={:0.2f}s).'.format(toc-tic))
    return p.imgIds, evalImgs

#################################################################
# end of straight copy from pycocotools, just removing the prints
#################################################################
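A sketch of how this evaluator is driven in a single process; the annotation file and the prediction tensors are placeholders for real ground truth and model outputs (boxes in absolute xyxy coordinates, image_id and category_id must exist in the ground truth):

# Sketch: single-process evaluation loop skeleton with placeholder inputs.
import torch
from pycocotools.coco import COCO
from rfdetr.datasets.coco_eval import CocoEvaluator

coco_gt = COCO("annotations.json")          # placeholder ground-truth file
evaluator = CocoEvaluator(coco_gt, iou_types=("bbox",))

predictions = {
    42: {  # image_id -> tensors in absolute xyxy coordinates
        "boxes": torch.tensor([[10.0, 20.0, 110.0, 220.0]]),
        "scores": torch.tensor([0.9]),
        "labels": torch.tensor([1]),
    }
}
evaluator.update(predictions)
evaluator.synchronize_between_processes()   # no-op gather when not distributed
evaluator.accumulate()
evaluator.summarize()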
rfdetr/datasets/o365.py ADDED
@@ -0,0 +1,53 @@
# ------------------------------------------------------------------------
# RF-DETR
# Copyright (c) 2025 Roboflow. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# Modified from LW-DETR (https://github.com/Atten4Vis/LW-DETR)
# Copyright (c) 2024 Baidu. All Rights Reserved.
# ------------------------------------------------------------------------

"""Dataset file for Object365."""
from pathlib import Path

from .coco import (
    CocoDetection, make_coco_transforms, make_coco_transforms_square_div_64
)

from PIL import Image
Image.MAX_IMAGE_PIXELS = None


def build_o365_raw(image_set, args, resolution):
    root = Path(args.coco_path)
    PATHS = {
        "train": (root, root / 'zhiyuan_objv2_train_val_wo_5k.json'),
        "val": (root, root / 'zhiyuan_objv2_minival5k.json'),
    }
    img_folder, ann_file = PATHS[image_set]

    try:
        square_resize = args.square_resize
    except:
        square_resize = False

    try:
        square_resize_div_64 = args.square_resize_div_64
    except:
        square_resize_div_64 = False

    if square_resize_div_64:
        dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms_square_div_64(image_set, resolution, multi_scale=args.multi_scale, expanded_scales=args.expanded_scales))
    else:
        dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms(image_set, resolution, multi_scale=args.multi_scale, expanded_scales=args.expanded_scales))
    return dataset


def build_o365(image_set, args, resolution):
    if image_set == 'train':
        train_ds = build_o365_raw('train', args, resolution=resolution)
        return train_ds
    if image_set == 'val':
        val_ds = build_o365_raw('val', args, resolution=resolution)
        return val_ds
    raise ValueError('Unknown image_set: {}'.format(image_set))
rfdetr/datasets/transforms.py ADDED
@@ -0,0 +1,475 @@
# ------------------------------------------------------------------------
# RF-DETR
# Copyright (c) 2025 Roboflow. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# Modified from LW-DETR (https://github.com/Atten4Vis/LW-DETR)
# Copyright (c) 2024 Baidu. All Rights Reserved.
# ------------------------------------------------------------------------
# Modified from Conditional DETR (https://github.com/Atten4Vis/ConditionalDETR)
# Copyright (c) 2021 Microsoft. All Rights Reserved.
# ------------------------------------------------------------------------
# Copied from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
# ------------------------------------------------------------------------

"""
Transforms and data augmentation for both image + bbox.
"""
import random

import PIL
import numpy as np
try:
    from collections.abc import Sequence
except Exception:
    from collections import Sequence
from numbers import Number
import torch
import torchvision.transforms as T
# from detectron2.data import transforms as DT
import torchvision.transforms.functional as F

from rfdetr.util.box_ops import box_xyxy_to_cxcywh
from rfdetr.util.misc import interpolate


def crop(image, target, region):
    cropped_image = F.crop(image, *region)

    target = target.copy()
    i, j, h, w = region

    # should we do something wrt the original size?
    target["size"] = torch.tensor([h, w])

    fields = ["labels", "area", "iscrowd"]

    if "boxes" in target:
        boxes = target["boxes"]
        max_size = torch.as_tensor([w, h], dtype=torch.float32)
        cropped_boxes = boxes - torch.as_tensor([j, i, j, i])
        cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size)
        cropped_boxes = cropped_boxes.clamp(min=0)
        area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1)
        target["boxes"] = cropped_boxes.reshape(-1, 4)
        target["area"] = area
        fields.append("boxes")

    if "masks" in target:
        # FIXME should we update the area here if there are no boxes?
        target['masks'] = target['masks'][:, i:i + h, j:j + w]
        fields.append("masks")

    # remove elements for which the boxes or masks that have zero area
    if "boxes" in target or "masks" in target:
        # favor boxes selection when defining which elements to keep
        # this is compatible with previous implementation
        if "boxes" in target:
            cropped_boxes = target['boxes'].reshape(-1, 2, 2)
            keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1)
        else:
            keep = target['masks'].flatten(1).any(1)

        for field in fields:
            target[field] = target[field][keep]

    return cropped_image, target


def hflip(image, target):
    flipped_image = F.hflip(image)

    w, h = image.size

    target = target.copy()
    if "boxes" in target:
        boxes = target["boxes"]
        boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0])
        target["boxes"] = boxes

    if "masks" in target:
        target['masks'] = target['masks'].flip(-1)

    return flipped_image, target


def resize(image, target, size, max_size=None):
    # size can be min_size (scalar) or (w, h) tuple

    def get_size_with_aspect_ratio(image_size, size, max_size=None):
        w, h = image_size
        if max_size is not None:
            min_original_size = float(min((w, h)))
            max_original_size = float(max((w, h)))
            if max_original_size / min_original_size * size > max_size:
                size = int(round(max_size * min_original_size / max_original_size))

        if (w <= h and w == size) or (h <= w and h == size):
            return (h, w)

        if w < h:
            ow = size
            oh = int(size * h / w)
        else:
            oh = size
            ow = int(size * w / h)

        return (oh, ow)

    def get_size(image_size, size, max_size=None):
        if isinstance(size, (list, tuple)):
            return size[::-1]
        else:
            return get_size_with_aspect_ratio(image_size, size, max_size)

    size = get_size(image.size, size, max_size)
    rescaled_image = F.resize(image, size)

    if target is None:
        return rescaled_image, None

    ratios = tuple(
        float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size))
    ratio_width, ratio_height = ratios

    target = target.copy()
    if "boxes" in target:
        boxes = target["boxes"]
        scaled_boxes = boxes * torch.as_tensor(
            [ratio_width, ratio_height, ratio_width, ratio_height])
        target["boxes"] = scaled_boxes

    if "area" in target:
        area = target["area"]
        scaled_area = area * (ratio_width * ratio_height)
        target["area"] = scaled_area

    h, w = size
    target["size"] = torch.tensor([h, w])

    if "masks" in target:
        target['masks'] = interpolate(
            target['masks'][:, None].float(), size, mode="nearest")[:, 0] > 0.5


    return rescaled_image, target


def pad(image, target, padding):
    # assumes that we only pad on the bottom right corners
    padded_image = F.pad(image, (0, 0, padding[0], padding[1]))
    if target is None:
        return padded_image, None
    target = target.copy()
    # should we do something wrt the original size?
    target["size"] = torch.tensor(padded_image.size[::-1])
    if "masks" in target:
        target['masks'] = torch.nn.functional.pad(
            target['masks'], (0, padding[0], 0, padding[1]))
    return padded_image, target


class RandomCrop(object):
    def __init__(self, size):
        self.size = size

    def __call__(self, img, target):
        region = T.RandomCrop.get_params(img, self.size)
        return crop(img, target, region)


class RandomSizeCrop(object):
    def __init__(self, min_size: int, max_size: int):
        self.min_size = min_size
        self.max_size = max_size

    def __call__(self, img: PIL.Image.Image, target: dict):
        w = random.randint(self.min_size, min(img.width, self.max_size))
        h = random.randint(self.min_size, min(img.height, self.max_size))
        region = T.RandomCrop.get_params(img, [h, w])
        return crop(img, target, region)


class CenterCrop(object):
    def __init__(self, size):
        self.size = size

    def __call__(self, img, target):
        image_width, image_height = img.size
        crop_height, crop_width = self.size
        crop_top = int(round((image_height - crop_height) / 2.))
        crop_left = int(round((image_width - crop_width) / 2.))
        return crop(img, target, (crop_top, crop_left, crop_height, crop_width))


class RandomHorizontalFlip(object):
    def __init__(self, p=0.5):
        self.p = p

    def __call__(self, img, target):
        if random.random() < self.p:
            return hflip(img, target)
        return img, target


class RandomResize(object):
    def __init__(self, sizes, max_size=None):
        assert isinstance(sizes, (list, tuple))
        self.sizes = sizes
        self.max_size = max_size

    def __call__(self, img, target=None):
        size = random.choice(self.sizes)
        return resize(img, target, size, self.max_size)


class SquareResize(object):
    def __init__(self, sizes):
        assert isinstance(sizes, (list, tuple))
        self.sizes = sizes

    def __call__(self, img, target=None):
        size = random.choice(self.sizes)
        rescaled_img = F.resize(img, (size, size))
        w, h = rescaled_img.size
        if target is None:
            return rescaled_img, None
        ratios = tuple(
            float(s) / float(s_orig) for s, s_orig in zip(rescaled_img.size, img.size))
        ratio_width, ratio_height = ratios

        target = target.copy()
        if "boxes" in target:
            boxes = target["boxes"]
            scaled_boxes = boxes * torch.as_tensor(
                [ratio_width, ratio_height, ratio_width, ratio_height])
            target["boxes"] = scaled_boxes

        if "area" in target:
            area = target["area"]
            scaled_area = area * (ratio_width * ratio_height)
            target["area"] = scaled_area

        target["size"] = torch.tensor([h, w])

        return rescaled_img, target


class RandomPad(object):
    def __init__(self, max_pad):
        self.max_pad = max_pad

    def __call__(self, img, target):
        pad_x = random.randint(0, self.max_pad)
        pad_y = random.randint(0, self.max_pad)
        return pad(img, target, (pad_x, pad_y))


class PILtoNdArray(object):

    def __call__(self, img, target):
        return np.asarray(img), target


class NdArraytoPIL(object):

    def __call__(self, img, target):
        return F.to_pil_image(img.astype('uint8')), target


class Pad(object):
    def __init__(self,
                 size=None,
                 size_divisor=32,
                 pad_mode=0,
                 offsets=None,
                 fill_value=(127.5, 127.5, 127.5)):
        """
        Pad image to a specified size or multiple of size_divisor.
        Args:
            size (int, Sequence): image target size, if None, pad to multiple of size_divisor, default None
            size_divisor (int): size divisor, default 32
            pad_mode (int): pad mode, currently only supports four modes [-1, 0, 1, 2]. if -1, use specified offsets
                if 0, only pad to right and bottom. if 1, pad according to center. if 2, only pad left and top
            offsets (list): [offset_x, offset_y], specify offset while padding, only supported pad_mode=-1
            fill_value (bool): rgb value of pad area, default (127.5, 127.5, 127.5)
        """

        if not isinstance(size, (int, Sequence)):
            raise TypeError(
                "Type of target_size is invalid when random_size is True. \
                    Must be List, now is {}".format(type(size)))

        if isinstance(size, int):
            size = [size, size]

        assert pad_mode in [
            -1, 0, 1, 2
        ], 'currently only supports four modes [-1, 0, 1, 2]'
        if pad_mode == -1:
            assert offsets, 'if pad_mode is -1, offsets should not be None'

        self.size = size
        self.size_divisor = size_divisor
        self.pad_mode = pad_mode
        self.fill_value = fill_value
        self.offsets = offsets

    def apply_bbox(self, bbox, offsets):
        return bbox + np.array(offsets * 2, dtype=np.float32)

    def apply_image(self, image, offsets, im_size, size):
        x, y = offsets
        im_h, im_w = im_size
        h, w = size
        canvas = np.ones((h, w, 3), dtype=np.float32)
        canvas *= np.array(self.fill_value, dtype=np.float32)
        canvas[y:y + im_h, x:x + im_w, :] = image.astype(np.float32)
        return canvas

    def __call__(self, im, target):
        im_h, im_w = im.shape[:2]
        if self.size:
            h, w = self.size
            assert (
                im_h <= h and im_w <= w
            ), '(h, w) of target size should be greater than (im_h, im_w)'
        else:
            h = int(np.ceil(im_h / self.size_divisor) * self.size_divisor)
            w = int(np.ceil(im_w / self.size_divisor) * self.size_divisor)

        if h == im_h and w == im_w:
            return im.astype(np.float32), target

        if self.pad_mode == -1:
            offset_x, offset_y = self.offsets
        elif self.pad_mode == 0:
            offset_y, offset_x = 0, 0
        elif self.pad_mode == 1:
            offset_y, offset_x = (h - im_h) // 2, (w - im_w) // 2
        else:
            offset_y, offset_x = h - im_h, w - im_w

        offsets, im_size, size = [offset_x, offset_y], [im_h, im_w], [h, w]

        im = self.apply_image(im, offsets, im_size, size)

        if self.pad_mode == 0:
            target["size"] = torch.tensor([h, w])
            return im, target
        if 'boxes' in target and len(target['boxes']) > 0:
            boxes = np.asarray(target["boxes"])
            target["boxes"] = torch.from_numpy(self.apply_bbox(boxes, offsets))
            target["size"] = torch.tensor([h, w])

        return im, target


class RandomExpand(object):
    """Random expand the canvas.
    Args:
        ratio (float): maximum expansion ratio.
        prob (float): probability to expand.
        fill_value (list): color value used to fill the canvas. in RGB order.
    """

    def __init__(self, ratio=4., prob=0.5, fill_value=(127.5, 127.5, 127.5)):
        assert ratio > 1.01, "expand ratio must be larger than 1.01"
        self.ratio = ratio
        self.prob = prob
        assert isinstance(fill_value, (Number, Sequence)), \
            "fill value must be either float or sequence"
        if isinstance(fill_value, Number):
            fill_value = (fill_value, ) * 3
        if not isinstance(fill_value, tuple):
            fill_value = tuple(fill_value)
        self.fill_value = fill_value

    def __call__(self, img, target):
        if np.random.uniform(0., 1.) < self.prob:
            return img, target

        height, width = img.shape[:2]
        ratio = np.random.uniform(1., self.ratio)
        h = int(height * ratio)
        w = int(width * ratio)
        if not h > height or not w > width:
            return img, target
        y = np.random.randint(0, h - height)
        x = np.random.randint(0, w - width)
        offsets, size = [x, y], [h, w]

        pad = Pad(size,
                  pad_mode=-1,
                  offsets=offsets,
                  fill_value=self.fill_value)

        return pad(img, target)


class RandomSelect(object):
    """
    Randomly selects between transforms1 and transforms2,
    with probability p for transforms1 and (1 - p) for transforms2
    """
    def __init__(self, transforms1, transforms2, p=0.5):
        self.transforms1 = transforms1
        self.transforms2 = transforms2
        self.p = p

    def __call__(self, img, target):
        if random.random() < self.p:
            return self.transforms1(img, target)
        return self.transforms2(img, target)


class ToTensor(object):
    def __call__(self, img, target):
        return F.to_tensor(img), target


class RandomErasing(object):

    def __init__(self, *args, **kwargs):
        self.eraser = T.RandomErasing(*args, **kwargs)

    def __call__(self, img, target):
        return self.eraser(img), target


class Normalize(object):
    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, image, target=None):
        image = F.normalize(image, mean=self.mean, std=self.std)
        if target is None:
            return image, None
        target = target.copy()
        h, w = image.shape[-2:]
        if "boxes" in target:
            boxes = target["boxes"]
            boxes = box_xyxy_to_cxcywh(boxes)
            boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32)
            target["boxes"] = boxes
        return image, target


class Compose(object):
    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, image, target):
        for t in self.transforms:
            image, target = t(image, target)
        return image, target

    def __repr__(self):
        format_string = self.__class__.__name__ + "("
        for t in self.transforms:
            format_string += "\n"
            format_string += "    {0}".format(t)
        format_string += "\n)"
        return format_string
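These transforms operate on (PIL image, target dict) pairs rather than images alone; a short sketch of the square-resize pipeline used at training time, run on a synthetic image with a single placeholder box:

# Sketch: applying the square-resize pipeline to a synthetic image + target.
import torch
from PIL import Image
import rfdetr.datasets.transforms as T

transform = T.Compose([
    T.RandomHorizontalFlip(),
    T.SquareResize([560]),
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

img = Image.new("RGB", (640, 480))
target = {
    "boxes": torch.tensor([[100.0, 50.0, 300.0, 200.0]]),  # xyxy, absolute pixels
    "labels": torch.tensor([1]),
    "area": torch.tensor([200.0 * 150.0]),
    "size": torch.tensor([480, 640]),
}
img, target = transform(img, target)
print(img.shape)        # torch.Size([3, 560, 560])
print(target["boxes"])  # normalized cxcywh after Normalize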
rfdetr/deploy/__init__.py ADDED
File without changes
rfdetr/deploy/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (213 Bytes).

rfdetr/deploy/__pycache__/benchmark.cpython-313.pyc ADDED
Binary file (36 kB).

rfdetr/deploy/__pycache__/export.cpython-313.pyc ADDED
Binary file (14.5 kB).

rfdetr/deploy/_onnx/__init__.py ADDED
@@ -0,0 +1,13 @@
# ------------------------------------------------------------------------
# LW-DETR
# Copyright (c) 2024 Baidu. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
"""
onnx optimizer and symbolic registry
"""
from . import optimizer
from . import symbolic

from .optimizer import OnnxOptimizer
from .symbolic import CustomOpSymbolicRegistry
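A sketch of driving the optimizer exposed here, based only on the methods visible in optimizer.py below; it assumes onnx, onnx-graphsurgeon and polygraphy are installed and that the ONNX paths are placeholders:

# Sketch: load an ONNX file, run the registered graph optimizations, and save it.
# "model.onnx" / "model.opt.onnx" are placeholder paths.
from rfdetr.deploy._onnx import OnnxOptimizer

opt = OnnxOptimizer("model.onnx")   # accepts a path or an already-loaded onnx graph
opt.info("original")
opt.common_opt()                    # applies every optimizer registered on CustomOpSymbolicRegistry
opt.info("optimized")
opt.save_onnx("model.opt.onnx")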
rfdetr/deploy/_onnx/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (433 Bytes).

rfdetr/deploy/_onnx/__pycache__/optimizer.cpython-313.pyc ADDED
Binary file (46.7 kB).

rfdetr/deploy/_onnx/__pycache__/symbolic.cpython-313.pyc ADDED
Binary file (1.55 kB).

rfdetr/deploy/_onnx/optimizer.py ADDED
@@ -0,0 +1,579 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------
2
+ # RF-DETR
3
+ # Copyright (c) 2025 Roboflow. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------
6
+ # Modified from LW-DETR (https://github.com/Atten4Vis/LW-DETR)
7
+ # Copyright (c) 2024 Baidu. All Rights Reserved.
8
+ # ------------------------------------------------------------------------
9
+
10
+ """
11
+ OnnxOptimizer
12
+ """
13
+ import os
14
+ from collections import OrderedDict
15
+ from copy import deepcopy
16
+
17
+ import numpy as np
18
+ import onnx
19
+ import torch
20
+ from onnx import shape_inference
21
+ import onnx_graphsurgeon as gs
22
+ from polygraphy.backend.onnx.loader import fold_constants
23
+ from onnx_graphsurgeon.logger.logger import G_LOGGER
24
+
25
+ from .symbolic import CustomOpSymbolicRegistry
26
+
27
+
28
+ class OnnxOptimizer():
29
+ def __init__(
30
+ self,
31
+ input,
32
+ severity=G_LOGGER.INFO
33
+ ):
34
+ if isinstance(input, str):
35
+ onnx_graph = self.load_onnx(input)
36
+ else:
37
+ onnx_graph = input
38
+ self.graph = gs.import_onnx(onnx_graph)
39
+ self.severity = severity
40
+ self.set_severity(severity)
41
+
42
+ def set_severity(self, severity):
43
+ G_LOGGER.severity = severity
44
+
45
+ def load_onnx(self, onnx_path:str):
46
+ """Load onnx from file
47
+ """
48
+ assert os.path.isfile(onnx_path), f"not found onnx file: {onnx_path}"
49
+ onnx_graph = onnx.load(onnx_path)
50
+ G_LOGGER.info(f"load onnx file: {onnx_path}")
51
+ return onnx_graph
52
+
53
+ def save_onnx(self, onnx_path:str):
54
+ onnx_graph = gs.export_onnx(self.graph)
55
+ G_LOGGER.info(f"save onnx file: {onnx_path}")
56
+ onnx.save(onnx_graph, onnx_path)
57
+
58
+ def info(self, prefix=''):
59
+ G_LOGGER.verbose(f"{prefix} .. {len(self.graph.nodes)} nodes, {len(self.graph.tensors().keys())} tensors, {len(self.graph.inputs)} inputs, {len(self.graph.outputs)} outputs")
60
+
61
+ def cleanup(self, return_onnx=False):
62
+ self.graph.cleanup().toposort()
63
+ if return_onnx:
64
+ return gs.export_onnx(self.graph)
65
+
66
+ def select_outputs(self, keep, names=None):
67
+ self.graph.outputs = [self.graph.outputs[o] for o in keep]
68
+ if names:
69
+ for i, name in enumerate(names):
70
+ self.graph.outputs[i].name = name
71
+
72
+ def find_node_input(self, node, name:str=None, value=None) -> int:
73
+ for i, inp in enumerate(node.inputs):
74
+ if isinstance(name, str) and inp.name == name:
75
+ index = i
76
+ elif inp == value:
77
+ index = i
78
+ assert index >= 0, f"not found {name}({value}) in node.inputs"
79
+ return index
80
+
81
+ def find_node_output(self, node, name:str=None, value=None) -> int:
82
+ for i, inp in enumerate(node.outputs):
83
+ if isinstance(name, str) and inp.name == name:
84
+ index = i
85
+ elif inp == value:
86
+ index = i
87
+ assert index >= 0, f"not found {name}({value}) in node.outputs"
88
+ return index
89
+
90
+ def common_opt(self, return_onnx=False):
91
+ for fn in CustomOpSymbolicRegistry._OPTIMIZER:
92
+ fn(self)
93
+ self.cleanup()
94
+ onnx_graph = fold_constants(gs.export_onnx(self.graph), allow_onnxruntime_shape_inference=False)
95
+ if onnx_graph.ByteSize() > 2147483648:
96
+ raise TypeError("ERROR: model size exceeds supported 2GB limit")
97
+ else:
98
+ onnx_graph = shape_inference.infer_shapes(onnx_graph)
99
+ self.graph = gs.import_onnx(onnx_graph)
100
+ self.cleanup()
101
+ if return_onnx:
102
+ return onnx_graph
103
+
104
+ def resize_fix(self):
105
+ '''
106
+ This function loops through the graph looking for Resize nodes that use scales for resizing (i.e. they have 3 inputs).
108
+ It replaces each such Resize with a Resize that takes the size of the output tensor instead of scales.
108
+ It adds Shape->Slice->Concat
109
+ Shape->Slice----^ subgraph to the graph to extract the shape of the output tensor.
110
+ This fix is required for the dynamic shape support.
111
+ '''
112
+ mResizeNodes = 0
113
+ for node in self.graph.nodes:
114
+ if node.op == "Resize" and len(node.inputs) == 3:
115
+ name = node.name + "/"
116
+
117
+ add_node = node.o().o().i(1)
118
+ div_node = node.i()
119
+
120
+ shape_hw_out = gs.Variable(name=name + "shape_hw_out", dtype=np.int64, shape=[4])
121
+ shape_hw = gs.Node(op="Shape", name=name+"shape_hw", inputs=[add_node.outputs[0]], outputs=[shape_hw_out])
122
+
123
+ const_zero = gs.Constant(name=name + "const_zero", values=np.array([0], dtype=np.int64))
124
+ const_two = gs.Constant(name=name + "const_two", values=np.array([2], dtype=np.int64))
125
+ const_four = gs.Constant(name=name + "const_four", values=np.array([4], dtype=np.int64))
126
+
127
+ slice_hw_out = gs.Variable(name=name + "slice_hw_out", dtype=np.int64, shape=[2])
128
+ slice_hw = gs.Node(op="Slice", name=name+"slice_hw", inputs=[shape_hw_out, const_two, const_four, const_zero], outputs=[slice_hw_out])
129
+
130
+ shape_bc_out = gs.Variable(name=name + "shape_bc_out", dtype=np.int64, shape=[2])
131
+ shape_bc = gs.Node(op="Shape", name=name+"shape_bc", inputs=[div_node.outputs[0]], outputs=[shape_bc_out])
132
+
133
+ slice_bc_out = gs.Variable(name=name + "slice_bc_out", dtype=np.int64, shape=[2])
134
+ slice_bc = gs.Node(op="Slice", name=name+"slice_bc", inputs=[shape_bc_out, const_zero, const_two, const_zero], outputs=[slice_bc_out])
135
+
136
+ concat_bchw_out = gs.Variable(name=name + "concat_bchw_out", dtype=np.int64, shape=[4])
137
+ concat_bchw = gs.Node(op="Concat", name=name+"concat_bchw", attrs={"axis": 0}, inputs=[slice_bc_out, slice_hw_out], outputs=[concat_bchw_out])
138
+
139
+ none_var = gs.Variable.empty()
140
+
141
+ resize_bchw = gs.Node(op="Resize", name=name+"resize_bchw", attrs=node.attrs, inputs=[node.inputs[0], none_var, none_var, concat_bchw_out], outputs=[node.outputs[0]])
142
+
143
+ self.graph.nodes.extend([shape_hw, slice_hw, shape_bc, slice_bc, concat_bchw, resize_bchw])
144
+
145
+ node.inputs = []
146
+ node.outputs = []
147
+
148
+ mResizeNodes += 1
149
+
150
+ self.cleanup()
151
+ return mResizeNodes
152
+
153
+ def adjustAddNode(self):
154
+ nAdjustAddNode = 0
155
+ for node in self.graph.nodes:
156
+ # Change the bias const to the second input to allow Gemm+BiasAdd fusion in TRT.
157
+ if node.op in ["Add"] and isinstance(node.inputs[0], gs.ir.tensor.Constant):
158
+ tensor = node.inputs[1]
159
+ bias = node.inputs[0]
160
+ node.inputs = [tensor, bias]
161
+ nAdjustAddNode += 1
162
+
163
+ self.cleanup()
164
+ return nAdjustAddNode
165
+
166
+ def decompose_instancenorms(self):
167
+ nRemoveInstanceNorm = 0
168
+ for node in self.graph.nodes:
169
+ if node.op == "InstanceNormalization":
170
+ name = node.name + "/"
171
+ input_tensor = node.inputs[0]
172
+ output_tensor = node.outputs[0]
173
+ mean_out = gs.Variable(name=name + "mean_out")
174
+ mean_node = gs.Node(op="ReduceMean", name=name + "mean_node", attrs={"axes": [-1]}, inputs=[input_tensor], outputs=[mean_out])
175
+ sub_out = gs.Variable(name=name + "sub_out")
176
+ sub_node = gs.Node(op="Sub", name=name + "sub_node", attrs={}, inputs=[input_tensor, mean_out], outputs=[sub_out])
177
+ pow_out = gs.Variable(name=name + "pow_out")
178
+ pow_const = gs.Constant(name=name + "pow_const", values=np.array([2.0], dtype=np.float32))
179
+ pow_node = gs.Node(op="Pow", name=name + "pow_node", attrs={}, inputs=[sub_out, pow_const], outputs=[pow_out])
180
+ mean2_out = gs.Variable(name=name + "mean2_out")
181
+ mean2_node = gs.Node(op="ReduceMean", name=name + "mean2_node", attrs={"axes": [-1]}, inputs=[pow_out], outputs=[mean2_out])
182
+ epsilon_out = gs.Variable(name=name + "epsilon_out")
183
+ epsilon_const = gs.Constant(name=name + "epsilon_const", values=np.array([node.attrs["epsilon"]], dtype=np.float32))
184
+ epsilon_node = gs.Node(op="Add", name=name + "epsilon_node", attrs={}, inputs=[mean2_out, epsilon_const], outputs=[epsilon_out])
185
+ sqrt_out = gs.Variable(name=name + "sqrt_out")
186
+ sqrt_node = gs.Node(op="Sqrt", name=name + "sqrt_node", attrs={}, inputs=[epsilon_out], outputs=[sqrt_out])
187
+ div_out = gs.Variable(name=name + "div_out")
188
+ div_node = gs.Node(op="Div", name=name + "div_node", attrs={}, inputs=[sub_out, sqrt_out], outputs=[div_out])
189
+ constantScale = gs.Constant("InstanceNormScaleV-" + str(nRemoveInstanceNorm), np.ascontiguousarray(node.inputs[1].inputs[0].attrs["value"].values.reshape(1, 32, 1)))
190
+ constantBias = gs.Constant("InstanceBiasV-" + str(nRemoveInstanceNorm), np.ascontiguousarray(node.inputs[2].inputs[0].attrs["value"].values.reshape(1, 32, 1)))
191
+ mul_out = gs.Variable(name=name + "mul_out")
192
+ mul_node = gs.Node(op="Mul", name=name + "mul_node", attrs={}, inputs=[div_out, constantScale], outputs=[mul_out])
193
+ add_node = gs.Node(op="Add", name=name + "add_node", attrs={}, inputs=[mul_out, constantBias], outputs=[output_tensor])
194
+ self.graph.nodes.extend([mean_node, sub_node, pow_node, mean2_node, epsilon_node, sqrt_node, div_node, mul_node, add_node])
195
+ node.inputs = []
196
+ node.outputs = []
197
+ nRemoveInstanceNorm += 1
198
+
199
+ self.cleanup()
200
+ return nRemoveInstanceNorm
201
+
202
+ def insert_groupnorm_plugin(self):
203
+ nGroupNormPlugin = 0
204
+ for node in self.graph.nodes:
205
+ if node.op == "Reshape" and node.outputs != [] and \
206
+ node.o().op == "ReduceMean" and node.o(1).op == "Sub" and node.o().o() == node.o(1) and \
207
+ node.o().o().o().o().o().o().o().o().o().o().o().op == "Mul" and \
208
+ node.o().o().o().o().o().o().o().o().o().o().o().o().op == "Add" and \
209
+ len(node.o().o().o().o().o().o().o().o().inputs[1].values.shape) == 3:
210
+ # "node.outputs != []" is added for VAE
211
+
212
+ inputTensor = node.inputs[0]
213
+
214
+ gammaNode = node.o().o().o().o().o().o().o().o().o().o().o()
215
+ index = [type(i) == gs.ir.tensor.Constant for i in gammaNode.inputs].index(True)
216
+ gamma = np.array(deepcopy(gammaNode.inputs[index].values.tolist()), dtype=np.float32)
217
+ constantGamma = gs.Constant("groupNormGamma-" + str(nGroupNormPlugin), np.ascontiguousarray(gamma.reshape(-1))) # MUST use np.ascontiguousarray, or TRT will regard the shape of this Constant as (0) !!!
218
+
219
+ betaNode = gammaNode.o()
220
+ index = [type(i) == gs.ir.tensor.Constant for i in betaNode.inputs].index(True)
221
+ beta = np.array(deepcopy(betaNode.inputs[index].values.tolist()), dtype=np.float32)
222
+ constantBeta = gs.Constant("groupNormBeta-" + str(nGroupNormPlugin), np.ascontiguousarray(beta.reshape(-1)))
223
+
224
+ epsilon = node.o().o().o().o().o().inputs[1].values.tolist()[0]
225
+
226
+ if betaNode.o().op == "Sigmoid": # need Swish
227
+ bSwish = True
228
+ lastNode = betaNode.o().o() # Mul node of Swish
229
+ else:
230
+ bSwish = False
231
+ lastNode = betaNode # Cast node after Group Norm
232
+
233
+ if lastNode.o().op == "Cast":
234
+ lastNode = lastNode.o()
235
+ inputList = [inputTensor, constantGamma, constantBeta]
236
+ groupNormV = gs.Variable("GroupNormV-" + str(nGroupNormPlugin), np.dtype(np.float16), inputTensor.shape)
237
+ groupNormN = gs.Node("GroupNorm", "GroupNormN-" + str(nGroupNormPlugin), inputs=inputList, outputs=[groupNormV], attrs=OrderedDict([('epsilon', epsilon), ('bSwish', int(bSwish))]))
238
+ self.graph.nodes.append(groupNormN)
239
+
240
+ for subNode in self.graph.nodes:
241
+ if lastNode.outputs[0] in subNode.inputs:
242
+ index = subNode.inputs.index(lastNode.outputs[0])
243
+ subNode.inputs[index] = groupNormV
244
+ node.inputs = []
245
+ lastNode.outputs = []
246
+ nGroupNormPlugin += 1
247
+
248
+ self.cleanup()
249
+ return nGroupNormPlugin
250
+
251
+ def insert_layernorm_plugin(self):
252
+ nLayerNormPlugin = 0
253
+ for node in self.graph.nodes:
254
+ if node.op == 'ReduceMean' and \
255
+ node.o().op == 'Sub' and node.o().inputs[0] == node.inputs[0] and \
256
+ node.o().o(0).op =='Pow' and node.o().o(1).op =='Div' and \
257
+ node.o().o(0).o().op == 'ReduceMean' and \
258
+ node.o().o(0).o().o().op == 'Add' and \
259
+ node.o().o(0).o().o().o().op == 'Sqrt' and \
260
+ node.o().o(0).o().o().o().o().op == 'Div' and node.o().o(0).o().o().o().o() == node.o().o(1) and \
261
+ node.o().o(0).o().o().o().o().o().op == 'Mul' and \
262
+ node.o().o(0).o().o().o().o().o().o().op == 'Add' and \
263
+ len(node.o().o(0).o().o().o().o().o().inputs[1].values.shape) == 1:
264
+
265
+ if node.i().op == "Add":
266
+ inputTensor = node.inputs[0] # CLIP
267
+ else:
268
+ inputTensor = node.i().inputs[0] # UNet and VAE
269
+
270
+ gammaNode = node.o().o().o().o().o().o().o()
271
+ index = [type(i) == gs.ir.tensor.Constant for i in gammaNode.inputs].index(True)
272
+ gamma = np.array(deepcopy(gammaNode.inputs[index].values.tolist()), dtype=np.float32)
273
+ constantGamma = gs.Constant("LayerNormGamma-" + str(nLayerNormPlugin), np.ascontiguousarray(gamma.reshape(-1))) # MUST use np.ascontiguousarray, or TRT will regard the shape of this Constant as (0) !!!
274
+
275
+ betaNode = gammaNode.o()
276
+ index = [type(i) == gs.ir.tensor.Constant for i in betaNode.inputs].index(True)
277
+ beta = np.array(deepcopy(betaNode.inputs[index].values.tolist()), dtype=np.float32)
278
+ constantBeta = gs.Constant("LayerNormBeta-" + str(nLayerNormPlugin), np.ascontiguousarray(beta.reshape(-1)))
279
+
280
+ inputList = [inputTensor, constantGamma, constantBeta]
281
+ layerNormV = gs.Variable("LayerNormV-" + str(nLayerNormPlugin), np.dtype(np.float32), inputTensor.shape)
282
+ layerNormN = gs.Node("LayerNorm", "LayerNormN-" + str(nLayerNormPlugin), inputs=inputList, attrs=OrderedDict([('epsilon', 1.e-5)]), outputs=[layerNormV])
283
+ self.graph.nodes.append(layerNormN)
284
+ nLayerNormPlugin += 1
285
+
286
+ if betaNode.outputs[0] in self.graph.outputs:
287
+ index = self.graph.outputs.index(betaNode.outputs[0])
288
+ self.graph.outputs[index] = layerNormV
289
+ else:
290
+ if betaNode.o().op == "Cast":
291
+ lastNode = betaNode.o()
292
+ else:
293
+ lastNode = betaNode
294
+ for subNode in self.graph.nodes:
295
+ if lastNode.outputs[0] in subNode.inputs:
296
+ index = subNode.inputs.index(lastNode.outputs[0])
297
+ subNode.inputs[index] = layerNormV
298
+ lastNode.outputs = []
299
+
300
+ self.cleanup()
301
+ return nLayerNormPlugin
302
+
303
+ def fuse_kv(self, node_k, node_v, fused_kv_idx, heads, num_dynamic=0):
304
+ # Get weights of K
305
+ weights_k = node_k.inputs[1].values
306
+ # Get weights of V
307
+ weights_v = node_v.inputs[1].values
308
+ # Input number of channels to K and V
309
+ C = weights_k.shape[0]
310
+ # Number of heads
311
+ H = heads
312
+ # Dimension per head
313
+ D = weights_k.shape[1] // H
314
+
315
+ # Concat and interleave weights such that the output of fused KV GEMM has [b, s_kv, h, 2, d] shape
316
+ weights_kv = np.dstack([weights_k.reshape(C, H, D), weights_v.reshape(C, H, D)]).reshape(C, 2 * H * D)
317
+
318
+ # K and V have the same input
319
+ input_tensor = node_k.inputs[0]
320
+ # K and V must have the same output which we feed into fmha plugin
321
+ output_tensor_k = node_k.outputs[0]
322
+ # Create tensor
323
+ constant_weights_kv = gs.Constant("Weights_KV_{}".format(fused_kv_idx), np.ascontiguousarray(weights_kv))
324
+
325
+ # Create fused KV node
326
+ fused_kv_node = gs.Node(op="MatMul", name="MatMul_KV_{}".format(fused_kv_idx), inputs=[input_tensor, constant_weights_kv], outputs=[output_tensor_k])
327
+ self.graph.nodes.append(fused_kv_node)
328
+
329
+ # Connect the output of fused node to the inputs of the nodes after K and V
330
+ node_v.o(num_dynamic).inputs[0] = output_tensor_k
331
+ node_k.o(num_dynamic).inputs[0] = output_tensor_k
332
+ for i in range(0,num_dynamic):
333
+ node_v.o().inputs.clear()
334
+ node_k.o().inputs.clear()
335
+
336
+ # Clear inputs and outputs of K and V to get these nodes cleared
337
+ node_k.outputs.clear()
338
+ node_v.outputs.clear()
339
+ node_k.inputs.clear()
340
+ node_v.inputs.clear()
341
+
342
+ self.cleanup()
343
+ return fused_kv_node
344
+
345
+ def insert_fmhca(self, node_q, node_kv, final_tranpose, mhca_idx, heads, num_dynamic=0):
346
+ # Get inputs and outputs for the fMHCA plugin
347
+ # We take an output of reshape that follows the Q GEMM
348
+ output_q = node_q.o(num_dynamic).o().inputs[0]
349
+ output_kv = node_kv.o().inputs[0]
350
+ output_final_tranpose = final_tranpose.outputs[0]
351
+
352
+ # Clear the inputs of the nodes that follow the Q and KV GEMM
353
+ # to delete these subgraphs (it will be substituted by fMHCA plugin)
354
+ node_kv.outputs[0].outputs[0].inputs.clear()
355
+ node_kv.outputs[0].outputs[0].inputs.clear()
356
+ node_q.o(num_dynamic).o().inputs.clear()
357
+ for i in range(0,num_dynamic):
358
+ node_q.o(i).o().o(1).inputs.clear()
359
+
360
+ weights_kv = node_kv.inputs[1].values
361
+ dims_per_head = weights_kv.shape[1] // (heads * 2)
362
+
363
+ # Reshape dims
364
+ shape = gs.Constant("Shape_KV_{}".format(mhca_idx), np.ascontiguousarray(np.array([0, 0, heads, 2, dims_per_head], dtype=np.int64)))
365
+
366
+ # Reshape output tensor
367
+ output_reshape = gs.Variable("ReshapeKV_{}".format(mhca_idx), np.dtype(np.float16), None)
368
+ # Create the Reshape node that feeds the fMHCA plugin
369
+ reshape = gs.Node(op="Reshape", name="Reshape_{}".format(mhca_idx), inputs=[output_kv, shape], outputs=[output_reshape])
370
+ # Insert node
371
+ self.graph.nodes.append(reshape)
372
+
373
+ # Create fMHCA plugin
374
+ fmhca = gs.Node(op="fMHCA", name="fMHCA_{}".format(mhca_idx), inputs=[output_q, output_reshape], outputs=[output_final_tranpose])
375
+ # Insert node
376
+ self.graph.nodes.append(fmhca)
377
+
378
+ # Connect input of fMHCA to output of Q GEMM
379
+ node_q.o(num_dynamic).outputs[0] = output_q
380
+
381
+ if num_dynamic > 0:
382
+ reshape2_input1_out = gs.Variable("Reshape2_fmhca{}_out".format(mhca_idx), np.dtype(np.int64), None)
383
+ reshape2_input1_shape = gs.Node("Shape", "Reshape2_fmhca{}_shape".format(mhca_idx), inputs=[node_q.inputs[0]], outputs=[reshape2_input1_out])
384
+ self.graph.nodes.append(reshape2_input1_shape)
385
+ final_tranpose.o().inputs[1] = reshape2_input1_out
386
+
387
+ # Clear outputs of transpose to get this subgraph cleared
388
+ final_tranpose.outputs.clear()
389
+
390
+ self.cleanup()
391
+
392
+ def fuse_qkv(self, node_q, node_k, node_v, fused_qkv_idx, heads, num_dynamic=0):
393
+ # Get weights of Q
394
+ weights_q = node_q.inputs[1].values
395
+ # Get weights of K
396
+ weights_k = node_k.inputs[1].values
397
+ # Get weights of V
398
+ weights_v = node_v.inputs[1].values
399
+
400
+ # Input number of channels to Q, K and V
401
+ C = weights_k.shape[0]
402
+ # Number of heads
403
+ H = heads
404
+ # Hidden dimension per head
405
+ D = weights_k.shape[1] // H
406
+
407
+ # Concat and interleave weights such that the output of fused QKV GEMM has [b, s, h, 3, d] shape
408
+ weights_qkv = np.dstack([weights_q.reshape(C, H, D), weights_k.reshape(C, H, D), weights_v.reshape(C, H, D)]).reshape(C, 3 * H * D)
409
+
410
+ input_tensor = node_k.inputs[0] # K and V have the same input
411
+ # Q, K and V must have the same output which we feed into fmha plugin
412
+ output_tensor_k = node_k.outputs[0]
413
+ # Concat and interleave weights such that the output of fused QKV GEMM has [b, s, h, 3, d] shape
414
+ constant_weights_qkv = gs.Constant("Weights_QKV_{}".format(fused_qkv_idx), np.ascontiguousarray(weights_qkv))
415
+
416
+ # Create the fused QKV node
417
+ fused_qkv_node = gs.Node(op="MatMul", name="MatMul_QKV_{}".format(fused_qkv_idx), inputs=[input_tensor, constant_weights_qkv], outputs=[output_tensor_k])
418
+ self.graph.nodes.append(fused_qkv_node)
419
+
420
+ # Connect the output of the fused node to the inputs of the nodes after Q, K and V
421
+ node_q.o(num_dynamic).inputs[0] = output_tensor_k
422
+ node_k.o(num_dynamic).inputs[0] = output_tensor_k
423
+ node_v.o(num_dynamic).inputs[0] = output_tensor_k
424
+ for i in range(0,num_dynamic):
425
+ node_q.o().inputs.clear()
426
+ node_k.o().inputs.clear()
427
+ node_v.o().inputs.clear()
428
+
429
+ # Clear inputs and outputs of Q, K and V to get these nodes cleared
430
+ node_q.outputs.clear()
431
+ node_k.outputs.clear()
432
+ node_v.outputs.clear()
433
+
434
+ node_q.inputs.clear()
435
+ node_k.inputs.clear()
436
+ node_v.inputs.clear()
437
+
438
+ self.cleanup()
439
+ return fused_qkv_node
440
+
441
+ def insert_fmha(self, node_qkv, final_tranpose, mha_idx, heads, num_dynamic=0):
442
+ # Get inputs and outputs for the fMHA plugin
443
+ output_qkv = node_qkv.o().inputs[0]
444
+ output_final_tranpose = final_tranpose.outputs[0]
445
+
446
+ # Clear the inputs of the nodes that follow the QKV GEMM
447
+ # to delete these subgraphs (it will be substituted by fMHA plugin)
448
+ node_qkv.outputs[0].outputs[2].inputs.clear()
449
+ node_qkv.outputs[0].outputs[1].inputs.clear()
450
+ node_qkv.outputs[0].outputs[0].inputs.clear()
451
+
452
+ weights_qkv = node_qkv.inputs[1].values
453
+ dims_per_head = weights_qkv.shape[1] // (heads * 3)
454
+
455
+ # Reshape dims
456
+ shape = gs.Constant("Shape_QKV_{}".format(mha_idx), np.ascontiguousarray(np.array([0, 0, heads, 3, dims_per_head], dtype=np.int64)))
457
+
458
+ # Reshape output tensor
459
+ output_shape = gs.Variable("ReshapeQKV_{}".format(mha_idx), np.dtype(np.float16), None)
460
+ # Create the Reshape node that feeds the fMHA plugin
461
+ reshape = gs.Node(op="Reshape", name="Reshape_{}".format(mha_idx), inputs=[output_qkv, shape], outputs=[output_shape])
462
+ # Insert node
463
+ self.graph.nodes.append(reshape)
464
+
465
+ # Create fMHA plugin
466
+ fmha = gs.Node(op="fMHA_V2", name="fMHA_{}".format(mha_idx), inputs=[output_shape], outputs=[output_final_tranpose])
467
+ # Insert node
468
+ self.graph.nodes.append(fmha)
469
+
470
+ if num_dynamic > 0:
471
+ reshape2_input1_out = gs.Variable("Reshape2_{}_out".format(mha_idx), np.dtype(np.int64), None)
472
+ reshape2_input1_shape = gs.Node("Shape", "Reshape2_{}_shape".format(mha_idx), inputs=[node_qkv.inputs[0]], outputs=[reshape2_input1_out])
473
+ self.graph.nodes.append(reshape2_input1_shape)
474
+ final_tranpose.o().inputs[1] = reshape2_input1_out
475
+
476
+ # Clear outputs of transpose to get this subgraph cleared
477
+ final_tranpose.outputs.clear()
478
+
479
+ self.cleanup()
480
+
481
+ def mha_mhca_detected(self, node, mha):
482
+ # Go from V GEMM down to the S*V MatMul and all way up to K GEMM
483
+ # If we are looking for MHCA inputs of two matmuls (K and V) must be equal.
484
+ # If we are looking for MHA inputs (K and V) must be not equal.
485
+ if node.op == "MatMul" and len(node.outputs) == 1 and \
486
+ ((mha and len(node.inputs[0].inputs) > 0 and node.i().op == "Add") or \
487
+ (not mha and len(node.inputs[0].inputs) == 0)):
488
+
489
+ if node.o().op == 'Shape':
490
+ if node.o(1).op == 'Shape':
491
+ num_dynamic_kv = 3 if node.o(2).op == 'Shape' else 2
492
+ else:
493
+ num_dynamic_kv = 1
494
+ # For Cross-Attention, if batch axis is dynamic (in QKV), assume H*W (in Q) is dynamic as well
495
+ num_dynamic_q = num_dynamic_kv if mha else num_dynamic_kv + 1
496
+ else:
497
+ num_dynamic_kv = 0
498
+ num_dynamic_q = 0
499
+
500
+ o = node.o(num_dynamic_kv)
501
+ if o.op == "Reshape" and \
502
+ o.o().op == "Transpose" and \
503
+ o.o().o().op == "Reshape" and \
504
+ o.o().o().o().op == "MatMul" and \
505
+ o.o().o().o().i(0).op == "Softmax" and \
506
+ o.o().o().o().i(1).op == "Reshape" and \
507
+ o.o().o().o().i(0).i().op == "Mul" and \
508
+ o.o().o().o().i(0).i().i().op == "MatMul" and \
509
+ o.o().o().o().i(0).i().i().i(0).op == "Reshape" and \
510
+ o.o().o().o().i(0).i().i().i(1).op == "Transpose" and \
511
+ o.o().o().o().i(0).i().i().i(1).i().op == "Reshape" and \
512
+ o.o().o().o().i(0).i().i().i(1).i().i().op == "Transpose" and \
513
+ o.o().o().o().i(0).i().i().i(1).i().i().i().op == "Reshape" and \
514
+ o.o().o().o().i(0).i().i().i(1).i().i().i().i().op == "MatMul" and \
515
+ node.name != o.o().o().o().i(0).i().i().i(1).i().i().i().i().name:
516
+ # "len(node.outputs) == 1" to make sure we are not in the already fused node
517
+ node_q = o.o().o().o().i(0).i().i().i(0).i().i().i()
518
+ node_k = o.o().o().o().i(0).i().i().i(1).i().i().i().i()
519
+ node_v = node
520
+ final_tranpose = o.o().o().o().o(num_dynamic_q).o()
521
+ # Sanity check to make sure that the graph looks as expected
522
+ if node_q.op == "MatMul" and final_tranpose.op == "Transpose":
523
+ return True, num_dynamic_q, num_dynamic_kv, node_q, node_k, node_v, final_tranpose
524
+ return False, 0, 0, None, None, None, None
525
+
526
+ def fuse_kv_insert_fmhca(self, heads, mhca_index, sm):
527
+ nodes = self.graph.nodes
528
+ # Iterate over graph and search for MHCA pattern
529
+ for idx, _ in enumerate(nodes):
530
+ # fMHCA can't be at the 2 last layers of the network. It is a guard from OOB
531
+ if idx + 1 > len(nodes) or idx + 2 > len(nodes):
532
+ continue
533
+
534
+ # Get anchor nodes for fusion and fMHCA plugin insertion if the MHCA is detected
535
+ detected, num_dynamic_q, num_dynamic_kv, node_q, node_k, node_v, final_tranpose = \
536
+ self.mha_mhca_detected(nodes[idx], mha=False)
537
+ if detected:
538
+ assert num_dynamic_q == 0 or num_dynamic_q == num_dynamic_kv + 1
539
+ # Skip the FMHCA plugin for SM75 except for when the dim per head is 40.
540
+ if sm == 75 and node_q.inputs[1].shape[1] // heads == 160:
541
+ continue
542
+ # Fuse K and V GEMMS
543
+ node_kv = self.fuse_kv(node_k, node_v, mhca_index, heads, num_dynamic_kv)
544
+ # Insert fMHCA plugin
545
+ self.insert_fmhca(node_q, node_kv, final_tranpose, mhca_index, heads, num_dynamic_q)
546
+ return True
547
+ return False
548
+
549
+ def fuse_qkv_insert_fmha(self, heads, mha_index):
550
+ nodes = self.graph.nodes
551
+ # Iterate over graph and search for MHA pattern
552
+ for idx, _ in enumerate(nodes):
553
+ # fMHA can't be at the 2 last layers of the network. It is a guard from OOB
554
+ if idx + 1 > len(nodes) or idx + 2 > len(nodes):
555
+ continue
556
+
557
+ # Get anchor nodes for fusion and fMHA plugin insertion if the MHA is detected
558
+ detected, num_dynamic_q, num_dynamic_kv, node_q, node_k, node_v, final_tranpose = \
559
+ self.mha_mhca_detected(nodes[idx], mha=True)
560
+ if detected:
561
+ assert num_dynamic_q == num_dynamic_kv
562
+ # Fuse Q, K and V GEMMS
563
+ node_qkv = self.fuse_qkv(node_q, node_k, node_v, mha_index, heads, num_dynamic_kv)
564
+ # Insert fMHA plugin
565
+ self.insert_fmha(node_qkv, final_tranpose, mha_index, heads, num_dynamic_kv)
566
+ return True
567
+ return False
568
+
569
+ def insert_fmhca_plugin(self, num_heads, sm):
570
+ mhca_index = 0
571
+ while self.fuse_kv_insert_fmhca(num_heads, mhca_index, sm):
572
+ mhca_index += 1
573
+ return mhca_index
574
+
575
+ def insert_fmha_plugin(self, num_heads):
576
+ mha_index = 0
577
+ while self.fuse_qkv_insert_fmha(num_heads, mha_index):
578
+ mha_index += 1
579
+ return mha_index
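The class is typically driven in a load → optimize → save sequence, which is how the export script later in this commit uses it; a minimal sketch with placeholder paths:

    # Minimal sketch of driving OnnxOptimizer (paths are placeholders).
    from rfdetr.deploy._onnx import OnnxOptimizer

    opt = OnnxOptimizer("inference_model.onnx")  # also accepts an in-memory onnx ModelProto
    opt.info("Model: original")                  # logs node/tensor/input/output counts
    opt.common_opt()                             # registered passes + constant folding + shape inference
    opt.info("Model: optimized")
    opt.save_onnx("inference_model.sim.onnx")    # write the optimized graph back to disk

The plugin-insertion helpers (insert_fmha_plugin, insert_fmhca_plugin, insert_layernorm_plugin, insert_groupnorm_plugin) are separate, opt-in passes that rewrite matched subgraphs into TensorRT plugin ops; they are not invoked by common_opt unless registered.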
rfdetr/deploy/_onnx/symbolic.py ADDED
@@ -0,0 +1,37 @@
1
+ # ------------------------------------------------------------------------
2
+ # RF-DETR
3
+ # Copyright (c) 2025 Roboflow. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------
6
+ # Modified from LW-DETR (https://github.com/Atten4Vis/LW-DETR)
7
+ # Copyright (c) 2024 Baidu. All Rights Reserved.
8
+ # ------------------------------------------------------------------------
9
+ """
10
+ CustomOpSymbolicRegistry class
11
+ """
12
+ from copy import deepcopy
13
+
14
+ import onnx
15
+ import torch
16
+ import torch.nn as nn
17
+ import torch.nn.functional as F
18
+ from torch.onnx import register_custom_op_symbolic
19
+ from torch.onnx.symbolic_helper import parse_args
20
+ from torch.onnx.symbolic_helper import _get_tensor_dim_size, _get_tensor_sizes
21
+ from torch.autograd import Function
22
+
23
+
24
+ class CustomOpSymbolicRegistry:
25
+ # _SYMBOLICS = {}
26
+ _OPTIMIZER = []
27
+
28
+ @classmethod
29
+ def optimizer(cls, fn):
30
+ cls._OPTIMIZER.append(fn)
31
+
32
+
33
+ def register_optimizer():
34
+ def optimizer_wrapper(fn):
35
+ CustomOpSymbolicRegistry.optimizer(fn)
36
+ return fn
37
+ return optimizer_wrapper
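A pass is added to the registry with the register_optimizer decorator; each registered function receives the OnnxOptimizer instance, because common_opt() calls fn(self) for every entry in _OPTIMIZER. A sketch with an illustrative, hypothetical pass:

    # Hypothetical example pass; the name and body are illustrative only.
    from rfdetr.deploy._onnx.optimizer import OnnxOptimizer
    from rfdetr.deploy._onnx.symbolic import register_optimizer

    @register_optimizer()
    def log_graph_size(opt: OnnxOptimizer):
        # Runs automatically inside opt.common_opt(), before constant folding.
        opt.info("log_graph_size pass")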
rfdetr/deploy/benchmark.py ADDED
@@ -0,0 +1,590 @@
1
+ # ------------------------------------------------------------------------
2
+ # RF-DETR
3
+ # Copyright (c) 2025 Roboflow. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------
6
+ # Modified from LW-DETR (https://github.com/Atten4Vis/LW-DETR)
7
+ # Copyright (c) 2024 Baidu. All Rights Reserved.
8
+ # ------------------------------------------------------------------------
9
+
10
+ """
11
+ This tool provides performance benchmarks by using ONNX Runtime and TensorRT
12
+ to run inference on a given model with the COCO validation set. It offers
13
+ reliable measurements of inference latency using ONNX Runtime or TensorRT
14
+ on the device.
15
+ """
16
+ import argparse
17
+ import copy
18
+ import contextlib
19
+ import datetime
20
+ import json
21
+ import os
22
+ import os.path as osp
23
+ import random
24
+ import time
25
+ import ast
26
+ from pathlib import Path
27
+ from collections import namedtuple, OrderedDict
28
+
29
+ from pycocotools.cocoeval import COCOeval
30
+ from pycocotools.coco import COCO
31
+ import pycocotools.mask as mask_util
32
+
33
+ import numpy as np
34
+ from PIL import Image
35
+ import torch
36
+ from torch.utils.data import DataLoader, DistributedSampler
37
+ import torchvision.transforms as T
38
+ import torchvision.transforms.functional as F
39
+ import tqdm
40
+
41
+ import pycuda.driver as cuda
42
+ import pycuda.autoinit
43
+ import onnxruntime as nxrun
44
+ import tensorrt as trt
45
+
46
+
47
+ def parser_args():
48
+ parser = argparse.ArgumentParser('performance benchmark tool for onnx/trt model')
49
+ parser.add_argument('--path', type=str, help='engine file path')
50
+ parser.add_argument('--coco_path', type=str, default="data/coco", help='coco dataset path')
51
+ parser.add_argument('--device', default=0, type=int)
52
+ parser.add_argument('--run_benchmark', action='store_true', help='repeat the inference to benchmark the latency')
53
+ parser.add_argument('--disable_eval', action='store_true', help='disable evaluation')
54
+ return parser.parse_args()
55
+
56
+
57
+ class CocoEvaluator(object):
58
+ def __init__(self, coco_gt, iou_types):
59
+ assert isinstance(iou_types, (list, tuple))
60
+ coco_gt = COCO(coco_gt)
61
+ coco_gt = copy.deepcopy(coco_gt)
62
+ self.coco_gt = coco_gt
63
+
64
+ self.iou_types = iou_types
65
+ self.coco_eval = {}
66
+ for iou_type in iou_types:
67
+ self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type)
68
+
69
+ self.img_ids = []
70
+ self.eval_imgs = {k: [] for k in iou_types}
71
+
72
+ def update(self, predictions):
73
+ img_ids = list(np.unique(list(predictions.keys())))
74
+ self.img_ids.extend(img_ids)
75
+
76
+ for iou_type in self.iou_types:
77
+ results = self.prepare(predictions, iou_type)
78
+
79
+ # suppress pycocotools prints
80
+ with open(os.devnull, 'w') as devnull:
81
+ with contextlib.redirect_stdout(devnull):
82
+ coco_dt = COCO.loadRes(self.coco_gt, results) if results else COCO()
83
+ coco_eval = self.coco_eval[iou_type]
84
+
85
+ coco_eval.cocoDt = coco_dt
86
+ coco_eval.params.imgIds = list(img_ids)
87
+ img_ids, eval_imgs = evaluate(coco_eval)
88
+
89
+ self.eval_imgs[iou_type].append(eval_imgs)
90
+
91
+ def synchronize_between_processes(self):
92
+ for iou_type in self.iou_types:
93
+ self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2)
94
+ create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type])
95
+
96
+ def accumulate(self):
97
+ for coco_eval in self.coco_eval.values():
98
+ coco_eval.accumulate()
99
+
100
+ def summarize(self):
101
+ for iou_type, coco_eval in self.coco_eval.items():
102
+ print("IoU metric: {}".format(iou_type))
103
+ coco_eval.summarize()
104
+
105
+ def prepare(self, predictions, iou_type):
106
+ if iou_type == "bbox":
107
+ return self.prepare_for_coco_detection(predictions)
108
+ else:
109
+ raise ValueError("Unknown iou type {}".format(iou_type))
110
+
111
+ def prepare_for_coco_detection(self, predictions):
112
+ coco_results = []
113
+ for original_id, prediction in predictions.items():
114
+ if len(prediction) == 0:
115
+ continue
116
+
117
+ boxes = prediction["boxes"]
118
+ boxes = convert_to_xywh(boxes).tolist()
119
+ scores = prediction["scores"].tolist()
120
+ labels = prediction["labels"].tolist()
121
+
122
+ coco_results.extend(
123
+ [
124
+ {
125
+ "image_id": original_id,
126
+ "category_id": labels[k],
127
+ "bbox": box,
128
+ "score": scores[k],
129
+ }
130
+ for k, box in enumerate(boxes)
131
+ ]
132
+ )
133
+ return coco_results
134
+
135
+ def create_common_coco_eval(coco_eval, img_ids, eval_imgs):
136
+ img_ids = list(img_ids)
137
+ eval_imgs = list(eval_imgs.flatten())
138
+
139
+ coco_eval.evalImgs = eval_imgs
140
+ coco_eval.params.imgIds = img_ids
141
+ coco_eval._paramsEval = copy.deepcopy(coco_eval.params)
142
+
143
+ def evaluate(self):
144
+ '''
145
+ Run per image evaluation on given images and store results (a list of dict) in self.evalImgs
146
+ :return: None
147
+ '''
148
+ # Running per image evaluation...
149
+ p = self.params
150
+ # add backward compatibility if useSegm is specified in params
151
+ if p.useSegm is not None:
152
+ p.iouType = 'segm' if p.useSegm == 1 else 'bbox'
153
+ print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType))
154
+ # print('Evaluate annotation type *{}*'.format(p.iouType))
155
+ p.imgIds = list(np.unique(p.imgIds))
156
+ if p.useCats:
157
+ p.catIds = list(np.unique(p.catIds))
158
+ p.maxDets = sorted(p.maxDets)
159
+ self.params = p
160
+
161
+ self._prepare()
162
+ # loop through images, area range, max detection number
163
+ catIds = p.catIds if p.useCats else [-1]
164
+
165
+ if p.iouType == 'segm' or p.iouType == 'bbox':
166
+ computeIoU = self.computeIoU
167
+ elif p.iouType == 'keypoints':
168
+ computeIoU = self.computeOks
169
+ self.ious = {
170
+ (imgId, catId): computeIoU(imgId, catId)
171
+ for imgId in p.imgIds
172
+ for catId in catIds}
173
+
174
+ evaluateImg = self.evaluateImg
175
+ maxDet = p.maxDets[-1]
176
+ evalImgs = [
177
+ evaluateImg(imgId, catId, areaRng, maxDet)
178
+ for catId in catIds
179
+ for areaRng in p.areaRng
180
+ for imgId in p.imgIds
181
+ ]
182
+ # this is NOT in the pycocotools code, but could be done outside
183
+ evalImgs = np.asarray(evalImgs).reshape(len(catIds), len(p.areaRng), len(p.imgIds))
184
+ self._paramsEval = copy.deepcopy(self.params)
185
+ return p.imgIds, evalImgs
186
+
187
+ def convert_to_xywh(boxes):
188
+ boxes[:, 2:] -= boxes[:, :2]
189
+ return boxes
190
+
191
+
192
+ def get_image_list(ann_file):
193
+ with open(ann_file, 'r') as fin:
194
+ data = json.load(fin)
195
+ return data['images']
196
+
197
+
198
+ def load_image(file_path):
199
+ return Image.open(file_path).convert("RGB")
200
+
201
+
202
+ class Compose(object):
203
+ def __init__(self, transforms):
204
+ self.transforms = transforms
205
+
206
+ def __call__(self, image, target):
207
+ for t in self.transforms:
208
+ image, target = t(image, target)
209
+ return image, target
210
+
211
+ def __repr__(self):
212
+ format_string = self.__class__.__name__ + "("
213
+ for t in self.transforms:
214
+ format_string += "\n"
215
+ format_string += " {0}".format(t)
216
+ format_string += "\n)"
217
+ return format_string
218
+
219
+
220
+ class ToTensor(object):
221
+ def __call__(self, img, target):
222
+ return F.to_tensor(img), target
223
+
224
+
225
+ class Normalize(object):
226
+ def __init__(self, mean, std):
227
+ self.mean = mean
228
+ self.std = std
229
+
230
+ def __call__(self, image, target=None):
231
+ image = F.normalize(image, mean=self.mean, std=self.std)
232
+ if target is None:
233
+ return image, None
234
+ target = target.copy()
235
+ h, w = image.shape[-2:]
236
+ if "boxes" in target:
237
+ boxes = target["boxes"]
238
+ boxes = box_xyxy_to_cxcywh(boxes)
239
+ boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32)
240
+ target["boxes"] = boxes
241
+ return image, target
242
+
243
+
244
+ class SquareResize(object):
245
+ def __init__(self, sizes):
246
+ assert isinstance(sizes, (list, tuple))
247
+ self.sizes = sizes
248
+
249
+ def __call__(self, img, target=None):
250
+ size = random.choice(self.sizes)
251
+ rescaled_img=F.resize(img, (size, size))
252
+ w, h = rescaled_img.size
253
+ if target is None:
254
+ return rescaled_img, None
255
+ ratios = tuple(
256
+ float(s) / float(s_orig) for s, s_orig in zip(rescaled_img.size, img.size))
257
+ ratio_width, ratio_height = ratios
258
+
259
+ target = target.copy()
260
+ if "boxes" in target:
261
+ boxes = target["boxes"]
262
+ scaled_boxes = boxes * torch.as_tensor(
263
+ [ratio_width, ratio_height, ratio_width, ratio_height])
264
+ target["boxes"] = scaled_boxes
265
+
266
+ if "area" in target:
267
+ area = target["area"]
268
+ scaled_area = area * (ratio_width * ratio_height)
269
+ target["area"] = scaled_area
270
+
271
+ target["size"] = torch.tensor([h, w])
272
+
273
+ return rescaled_img, target
274
+
275
+
276
+ def infer_transforms():
277
+ normalize = Compose([
278
+ ToTensor(),
279
+ Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
280
+ ])
281
+ return Compose([
282
+ SquareResize([640]),
283
+ normalize,
284
+ ])
285
+
286
+
287
+ def box_cxcywh_to_xyxy(x):
288
+ x_c, y_c, w, h = x.unbind(-1)
289
+ b = [(x_c - 0.5 * w.clamp(min=0.0)), (y_c - 0.5 * h.clamp(min=0.0)),
290
+ (x_c + 0.5 * w.clamp(min=0.0)), (y_c + 0.5 * h.clamp(min=0.0))]
291
+ return torch.stack(b, dim=-1)
292
+
293
+
294
+ def post_process(outputs, target_sizes):
295
+ out_logits, out_bbox = outputs['labels'], outputs['dets']
296
+
297
+ assert len(out_logits) == len(target_sizes)
298
+ assert target_sizes.shape[1] == 2
299
+
300
+ prob = out_logits.sigmoid()
301
+ topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 300, dim=1)
302
+ scores = topk_values
303
+ topk_boxes = topk_indexes // out_logits.shape[2]
304
+ labels = topk_indexes % out_logits.shape[2]
305
+ boxes = box_cxcywh_to_xyxy(out_bbox)
306
+ boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1,1,4))
307
+
308
+ # and from relative [0, 1] to absolute [0, height] coordinates
309
+ img_h, img_w = target_sizes.unbind(1)
310
+ scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
311
+ boxes = boxes * scale_fct[:, None, :]
312
+
313
+ results = [{'scores': s, 'labels': l, 'boxes': b} for s, l, b in zip(scores, labels, boxes)]
314
+
315
+ return results
316
+
317
+
318
+ def infer_onnx(sess, coco_evaluator, time_profile, prefix, img_list, device, repeats=1):
319
+ time_list = []
320
+ for img_dict in tqdm.tqdm(img_list):
321
+ image = load_image(os.path.join(prefix, img_dict['file_name']))
322
+ width, height = image.size
323
+ orig_target_sizes = torch.Tensor([height, width])
324
+ image_tensor, _ = infer_transforms()(image, None) # target is None
325
+
326
+ samples = image_tensor[None].numpy()
327
+
328
+ time_profile.reset()
329
+ with time_profile:
330
+ for _ in range(repeats):
331
+ res = sess.run(None, {"input": samples})
332
+ time_list.append(time_profile.total / repeats)
333
+ outputs = {}
334
+ outputs['labels'] = torch.Tensor(res[1]).to(device)
335
+ outputs['dets'] = torch.Tensor(res[0]).to(device)
336
+
337
+ orig_target_sizes = torch.stack([orig_target_sizes], dim=0).to(device)
338
+ results = post_process(outputs, orig_target_sizes)
339
+ res = {img_dict['id']: results[0]}
340
+ if coco_evaluator is not None:
341
+ coco_evaluator.update(res)
342
+
343
+ print("Model latency with ONNX Runtime: {}ms".format(1000 * sum(time_list) / len(img_list)))
344
+
345
+ # accumulate predictions from all images
346
+ stats = {}
347
+ if coco_evaluator is not None:
348
+ coco_evaluator.synchronize_between_processes()
349
+ coco_evaluator.accumulate()
350
+ coco_evaluator.summarize()
351
+ stats['coco_eval_bbox'] = coco_evaluator.coco_eval['bbox'].stats.tolist()
352
+ print(stats)
353
+
354
+
355
+ def infer_engine(model, coco_evaluator, time_profile, prefix, img_list, device, repeats=1):
356
+ time_list = []
357
+ for img_dict in tqdm.tqdm(img_list):
358
+ image = load_image(os.path.join(prefix, img_dict['file_name']))
359
+ width, height = image.size
360
+ orig_target_sizes = torch.Tensor([height, width])
361
+ image_tensor, _ = infer_transforms()(image, None) # target is None
362
+
363
+ samples = image_tensor[None].to(device)
364
+ _, _, h, w = samples.shape
365
+ im_shape = torch.Tensor(np.array([h, w]).reshape((1, 2)).astype(np.float32)).to(device)
366
+ scale_factor = torch.Tensor(np.array([h / height, w / width]).reshape((1, 2)).astype(np.float32)).to(device)
367
+
368
+ time_profile.reset()
369
+ with time_profile:
370
+ for _ in range(repeats):
371
+ outputs = model({"input": samples})
372
+
373
+ time_list.append(time_profile.total / repeats)
374
+ orig_target_sizes = torch.stack([orig_target_sizes], dim=0).to(device)
375
+ if coco_evaluator is not None:
376
+ results = post_process(outputs, orig_target_sizes)
377
+ res = {img_dict['id']: results[0]}
378
+ coco_evaluator.update(res)
379
+
380
+ print("Model latency with TensorRT: {}ms".format(1000 * sum(time_list) / len(img_list)))
381
+
382
+ # accumulate predictions from all images
383
+ stats = {}
384
+ if coco_evaluator is not None:
385
+ coco_evaluator.synchronize_between_processes()
386
+ coco_evaluator.accumulate()
387
+ coco_evaluator.summarize()
388
+ stats['coco_eval_bbox'] = coco_evaluator.coco_eval['bbox'].stats.tolist()
389
+ print(stats)
390
+
391
+
392
+ class TRTInference(object):
393
+ """TensorRT inference engine
394
+ """
395
+ def __init__(self, engine_path='dino.engine', device='cuda:0', sync_mode:bool=False, max_batch_size=32, verbose=False):
396
+ self.engine_path = engine_path
397
+ self.device = device
398
+ self.sync_mode = sync_mode
399
+ self.max_batch_size = max_batch_size
400
+
401
+ self.logger = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger(trt.Logger.INFO)
402
+
403
+ self.engine = self.load_engine(engine_path)
404
+
405
+ self.context = self.engine.create_execution_context()
406
+
407
+ self.bindings = self.get_bindings(self.engine, self.context, self.max_batch_size, self.device)
408
+ self.bindings_addr = OrderedDict((n, v.ptr) for n, v in self.bindings.items())
409
+
410
+ self.input_names = self.get_input_names()
411
+ self.output_names = self.get_output_names()
412
+
413
+ if not self.sync_mode:
414
+ self.stream = cuda.Stream()
415
+
416
+ # self.time_profile = TimeProfiler()
417
+ self.time_profile = None
418
+
419
+ def get_dummy_input(self, batch_size:int):
420
+ blob = {}
421
+ for name, binding in self.bindings.items():
422
+ if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
423
+ print(f"make dummy input {name} with shape {binding.shape}")
424
+ blob[name] = torch.rand(batch_size, *binding.shape[1:]).float().to('cuda:0')
425
+ return blob
426
+
427
+ def load_engine(self, path):
428
+ '''load engine
429
+ '''
430
+ trt.init_libnvinfer_plugins(self.logger, '')
431
+ with open(path, 'rb') as f, trt.Runtime(self.logger) as runtime:
432
+ return runtime.deserialize_cuda_engine(f.read())
433
+
434
+ def get_input_names(self, ):
435
+ names = []
436
+ for _, name in enumerate(self.engine):
437
+ if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
438
+ names.append(name)
439
+ return names
440
+
441
+ def get_output_names(self, ):
442
+ names = []
443
+ for _, name in enumerate(self.engine):
444
+ if self.engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT:
445
+ names.append(name)
446
+ return names
447
+
448
+ def get_bindings(self, engine, context, max_batch_size=32, device=None):
449
+ '''build bindings
450
+ '''
451
+ Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr'))
452
+ bindings = OrderedDict()
453
+
454
+ for i, name in enumerate(engine):
455
+ shape = engine.get_tensor_shape(name)
456
+ dtype = trt.nptype(engine.get_tensor_dtype(name))
457
+
458
+ if shape[0] == -1:
459
+ raise NotImplementedError
460
+
461
+ if False:
462
+ if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
463
+ data = np.random.randn(*shape).astype(dtype)
464
+ ptr = cuda.mem_alloc(data.nbytes)
465
+ bindings[name] = Binding(name, dtype, shape, data, ptr)
466
+ else:
467
+ data = cuda.pagelocked_empty(trt.volume(shape), dtype)
468
+ ptr = cuda.mem_alloc(data.nbytes)
469
+ bindings[name] = Binding(name, dtype, shape, data, ptr)
470
+
471
+ else:
472
+ data = torch.from_numpy(np.empty(shape, dtype=dtype)).to(device)
473
+ bindings[name] = Binding(name, dtype, shape, data, data.data_ptr())
474
+
475
+ return bindings
476
+
477
+ def run_sync(self, blob):
478
+ self.bindings_addr.update({n: blob[n].data_ptr() for n in self.input_names})
479
+ self.context.execute_v2(list(self.bindings_addr.values()))
480
+ outputs = {n: self.bindings[n].data for n in self.output_names}
481
+ return outputs
482
+
483
+ def run_async(self, blob):
484
+ self.bindings_addr.update({n: blob[n].data_ptr() for n in self.input_names})
485
+ bindings_addr = [int(v) for _, v in self.bindings_addr.items()]
486
+ self.context.execute_async_v2(bindings=bindings_addr, stream_handle=self.stream.handle)
487
+ outputs = {n: self.bindings[n].data for n in self.output_names}
488
+ self.stream.synchronize()
489
+ return outputs
490
+
491
+ def __call__(self, blob):
492
+ if self.sync_mode:
493
+ return self.run_sync(blob)
494
+ else:
495
+ return self.run_async(blob)
496
+
497
+ def synchronize(self, ):
498
+ if not self.sync_mode and torch.cuda.is_available():
499
+ torch.cuda.synchronize()
500
+ elif self.sync_mode:
501
+ self.stream.synchronize()
502
+
503
+ def speed(self, blob, n):
504
+ self.time_profile.reset()
505
+ with self.time_profile:
506
+ for _ in range(n):
507
+ _ = self(blob)
508
+ return self.time_profile.total / n
509
+
510
+
511
+ def build_engine(self, onnx_file_path, engine_file_path, max_batch_size=32):
512
+ '''Takes an ONNX file and creates a TensorRT engine to run inference with
513
+ http://gitlab.baidu.com/paddle-inference/benchmark/blob/main/backend_trt.py#L57
514
+ '''
515
+ EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
516
+ with trt.Builder(self.logger) as builder, \
517
+ builder.create_network(EXPLICIT_BATCH) as network, \
518
+ trt.OnnxParser(network, self.logger) as parser, \
519
+ builder.create_builder_config() as config:
520
+
521
+ config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30) # 1024 MiB
522
+ config.set_flag(trt.BuilderFlag.FP16)
523
+
524
+ with open(onnx_file_path, 'rb') as model:
525
+ if not parser.parse(model.read()):
526
+ print('ERROR: Failed to parse the ONNX file.')
527
+ for error in range(parser.num_errors):
528
+ print(parser.get_error(error))
529
+ return None
530
+
531
+ serialized_engine = builder.build_serialized_network(network, config)
532
+ with open(engine_file_path, 'wb') as f:
533
+ f.write(serialized_engine)
534
+
535
+ return serialized_engine
536
+
537
+
538
+ class TimeProfiler(contextlib.ContextDecorator):
539
+ def __init__(self, ):
540
+ self.total = 0
541
+
542
+ def __enter__(self, ):
543
+ self.start = self.time()
544
+ return self
545
+
546
+ def __exit__(self, type, value, traceback):
547
+ self.total += self.time() - self.start
548
+
549
+ def reset(self, ):
550
+ self.total = 0
551
+
552
+ def time(self, ):
553
+ if torch.cuda.is_available():
554
+ torch.cuda.synchronize()
555
+ return time.perf_counter()
556
+
557
+
558
+ def main(args):
559
+ print(args)
560
+
561
+ coco_gt = osp.join(args.coco_path, 'annotations/instances_val2017.json')
562
+ img_list = get_image_list(coco_gt)
563
+ prefix = osp.join(args.coco_path, 'val2017')
564
+ if args.run_benchmark:
565
+ repeats = 10
566
+ print('Inference for each image will be repeated 10 times to obtain '
567
+ 'a reliable measurement of inference latency.')
568
+ else:
569
+ repeats = 1
570
+
571
+ if args.disable_eval:
572
+ coco_evaluator = None
573
+ else:
574
+ coco_evaluator = CocoEvaluator(coco_gt, ('bbox',))
575
+
576
+ time_profile = TimeProfiler()
577
+
578
+ if args.path.endswith(".onnx"):
579
+ sess = nxrun.InferenceSession(args.path, providers=['CUDAExecutionProvider'])
580
+ infer_onnx(sess, coco_evaluator, time_profile, prefix, img_list, device=f'cuda:{args.device}', repeats=repeats)
581
+ elif args.path.endswith(".engine"):
582
+ model = TRTInference(args.path, sync_mode=True, device=f'cuda:{args.device}')
583
+ infer_engine(model, coco_evaluator, time_profile, prefix, img_list, device=f'cuda:{args.device}', repeats=repeats)
584
+ else:
585
+ raise NotImplementedError('Only model file names ending with ".onnx" and ".engine" are supported.')
586
+
587
+
588
+ if __name__ == '__main__':
589
+ args = parser_args()
590
+ main(args)
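The TRTInference and TimeProfiler classes can also be used on their own to time a prebuilt engine on random inputs; a minimal sketch (the engine path is a placeholder):

    # Standalone latency sketch using the classes above (engine path is a placeholder).
    from rfdetr.deploy.benchmark import TRTInference, TimeProfiler

    engine = TRTInference("inference_model.engine", device="cuda:0", sync_mode=True)
    blob = engine.get_dummy_input(batch_size=1)   # random tensors shaped like the engine bindings
    timer = TimeProfiler()

    with timer:
        for _ in range(100):
            _ = engine(blob)                      # sync_mode=True dispatches to run_sync()
    print(f"mean latency: {1000 * timer.total / 100:.2f} ms")

The script's CLI entry point (parser_args/main) runs the same measurement end to end against the COCO val2017 set, optionally with accuracy evaluation.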
rfdetr/deploy/export.py ADDED
@@ -0,0 +1,276 @@
1
+ # ------------------------------------------------------------------------
2
+ # RF-DETR
3
+ # Copyright (c) 2025 Roboflow. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------
6
+ # Modified from LW-DETR (https://github.com/Atten4Vis/LW-DETR)
7
+ # Copyright (c) 2024 Baidu. All Rights Reserved.
8
+ # ------------------------------------------------------------------------
9
+
10
+ """
11
+ export ONNX model and TensorRT engine for deployment
12
+ """
13
+ import os
14
+ import ast
15
+ import random
16
+ import argparse
17
+ import subprocess
18
+ import torch.nn as nn
19
+ from pathlib import Path
20
+ import time
21
+ from collections import defaultdict
22
+
23
+ import onnx
24
+ import torch
25
+ import onnxsim
26
+ import numpy as np
27
+ from PIL import Image
28
+
29
+ import rfdetr.util.misc as utils
30
+ import rfdetr.datasets.transforms as T
31
+ from rfdetr.models import build_model
32
+ from rfdetr.deploy._onnx import OnnxOptimizer
33
+ import re
34
+ import sys
35
+
36
+
37
+ def run_command_shell(command, dry_run:bool = False) -> int:
38
+ if dry_run:
39
+ print("")
40
+ print(f"CUDA_VISIBLE_DEVICES={os.environ['CUDA_VISIBLE_DEVICES']} {command}")
41
+ print("")
42
+ try:
43
+ result = subprocess.run(command, shell=True, capture_output=True, text=True)
44
+ return result
45
+ except subprocess.CalledProcessError as e:
46
+ print(f"Command failed with exit code {e.returncode}")
47
+ print(f"Error output:\n{e.stderr.decode('utf-8')}")
48
+ raise
49
+
50
+
51
+ def make_infer_image(infer_dir, shape, batch_size, device="cuda"):
52
+ if infer_dir is None:
53
+ dummy = np.random.randint(0, 256, (shape[0], shape[1], 3), dtype=np.uint8)
54
+ image = Image.fromarray(dummy, mode="RGB")
55
+ else:
56
+ image = Image.open(infer_dir).convert("RGB")
57
+
58
+ transforms = T.Compose([
59
+ T.SquareResize([shape[0]]),
60
+ T.ToTensor(),
61
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
62
+ ])
63
+
64
+ inps, _ = transforms(image, None)
65
+ inps = inps.to(device)
66
+ # inps = utils.nested_tensor_from_tensor_list([inps for _ in range(args.batch_size)])
67
+ inps = torch.stack([inps for _ in range(batch_size)])
68
+ return inps
69
+
70
+ def export_onnx(output_dir, model, input_names, input_tensors, output_names, dynamic_axes, backbone_only=False, verbose=True, opset_version=17):
71
+ export_name = "backbone_model" if backbone_only else "inference_model"
72
+ output_file = os.path.join(output_dir, f"{export_name}.onnx")
73
+
74
+ # Prepare model for export
75
+ if hasattr(model, "export"):
76
+ model.export()
77
+
78
+ torch.onnx.export(
79
+ model,
80
+ input_tensors,
81
+ output_file,
82
+ input_names=input_names,
83
+ output_names=output_names,
84
+ export_params=True,
85
+ keep_initializers_as_inputs=False,
86
+ do_constant_folding=True,
87
+ verbose=verbose,
88
+ opset_version=opset_version,
89
+ dynamic_axes=dynamic_axes)
90
+
91
+ print(f'\nSuccessfully exported ONNX model: {output_file}')
92
+ return output_file
93
+
94
+
95
+ def onnx_simplify(onnx_dir:str, input_names, input_tensors, force=False):
96
+ sim_onnx_dir = onnx_dir.replace(".onnx", ".sim.onnx")
97
+ if os.path.isfile(sim_onnx_dir) and not force:
98
+ return sim_onnx_dir
99
+
100
+ if isinstance(input_tensors, torch.Tensor):
101
+ input_tensors = [input_tensors]
102
+
103
+ print(f'Start simplifying ONNX model: {onnx_dir}')
104
+ opt = OnnxOptimizer(onnx_dir)
105
+ opt.info('Model: original')
106
+ opt.common_opt()
107
+ opt.info('Model: optimized')
108
+ opt.save_onnx(sim_onnx_dir)
109
+ input_dict = {name: tensor.detach().cpu().numpy() for name, tensor in zip(input_names, input_tensors)}
110
+ model_opt, check_ok = onnxsim.simplify(
111
+ onnx_dir,
112
+ check_n = 3,
113
+ input_data=input_dict,
114
+ dynamic_input_shape=False)
115
+ if check_ok:
116
+ onnx.save(model_opt, sim_onnx_dir)
117
+ else:
118
+ raise RuntimeError("Failed to simplify ONNX model.")
119
+ print(f'Successfully simplified ONNX model: {sim_onnx_dir}')
120
+ return sim_onnx_dir
121
+
122
+
123
+ def trtexec(onnx_dir:str, args) -> None:
124
+ engine_dir = onnx_dir.replace(".onnx", f".engine")
125
+
126
+ # Base trtexec command
127
+ trt_command = " ".join([
128
+ "trtexec",
129
+ f"--onnx={onnx_dir}",
130
+ f"--saveEngine={engine_dir}",
131
+ f"--memPoolSize=workspace:4096 --fp16",
132
+ f"--useCudaGraph --useSpinWait --warmUp=500 --avgRuns=1000 --duration=10",
133
+ f"{'--verbose' if args.verbose else ''}"])
134
+
135
+ if args.profile:
136
+ profile_dir = onnx_dir.replace(".onnx", f".nsys-rep")
137
+ # Wrap with nsys profile command
138
+ command = " ".join([
139
+ "nsys profile",
140
+ f"--output={profile_dir}",
141
+ "--trace=cuda,nvtx",
142
+ "--force-overwrite true",
143
+ trt_command
144
+ ])
145
+ print(f'Profile data will be saved to: {profile_dir}')
146
+ else:
147
+ command = trt_command
148
+
149
+ output = run_command_shell(command, args.dry_run)
150
+ stats = parse_trtexec_output(output.stdout)
151
+
152
+ def parse_trtexec_output(output_text):
153
+ print(output_text)
154
+ # Common patterns in trtexec output
155
+ gpu_compute_pattern = r"GPU Compute Time: min = (\d+\.\d+) ms, max = (\d+\.\d+) ms, mean = (\d+\.\d+) ms, median = (\d+\.\d+) ms"
156
+ h2d_pattern = r"Host to Device Transfer Time: min = (\d+\.\d+) ms, max = (\d+\.\d+) ms, mean = (\d+\.\d+) ms"
157
+ d2h_pattern = r"Device to Host Transfer Time: min = (\d+\.\d+) ms, max = (\d+\.\d+) ms, mean = (\d+\.\d+) ms"
158
+ latency_pattern = r"Latency: min = (\d+\.\d+) ms, max = (\d+\.\d+) ms, mean = (\d+\.\d+) ms"
159
+ throughput_pattern = r"Throughput: (\d+\.\d+) qps"
160
+
161
+ stats = {}
162
+
163
+ # Extract compute times
164
+ if match := re.search(gpu_compute_pattern, output_text):
165
+ stats.update({
166
+ 'compute_min_ms': float(match.group(1)),
167
+ 'compute_max_ms': float(match.group(2)),
168
+ 'compute_mean_ms': float(match.group(3)),
169
+ 'compute_median_ms': float(match.group(4))
170
+ })
171
+
172
+ # Extract H2D times
173
+ if match := re.search(h2d_pattern, output_text):
174
+ stats.update({
175
+ 'h2d_min_ms': float(match.group(1)),
176
+ 'h2d_max_ms': float(match.group(2)),
177
+ 'h2d_mean_ms': float(match.group(3))
178
+ })
179
+
180
+ # Extract D2H times
181
+ if match := re.search(d2h_pattern, output_text):
182
+ stats.update({
183
+ 'd2h_min_ms': float(match.group(1)),
184
+ 'd2h_max_ms': float(match.group(2)),
185
+ 'd2h_mean_ms': float(match.group(3))
186
+ })
187
+
188
+ if match := re.search(latency_pattern, output_text):
189
+ stats.update({
190
+ 'latency_min_ms': float(match.group(1)),
191
+ 'latency_max_ms': float(match.group(2)),
192
+ 'latency_mean_ms': float(match.group(3))
193
+ })
194
+
195
+ # Extract throughput
196
+ if match := re.search(throughput_pattern, output_text):
197
+ stats['throughput_qps'] = float(match.group(1))
198
+
199
+ return stats
200
+
201
+ def no_batch_norm(model):
202
+ for module in model.modules():
203
+ if isinstance(module, nn.BatchNorm2d):
204
+ raise ValueError("BatchNorm2d found in the model. Please remove it.")
205
+
206
+ def main(args):
207
+ print("git:\n {}\n".format(utils.get_sha()))
208
+ print(args)
209
+ # convert device to device_id
210
+ if args.device == 'cuda':
211
+ device_id = "0"
212
+ elif args.device == 'cpu':
213
+ device_id = ""
214
+ else:
215
+ device_id = str(int(args.device))
216
+ args.device = f"cuda:{device_id}"
217
+
218
+ # device for export onnx
219
+ # TODO: export onnx with cuda failed with onnx error
220
+ device = torch.device("cpu")
221
+ os.environ["CUDA_VISIBLE_DEVICES"] = device_id
222
+
223
+ # fix the seed for reproducibility
224
+ seed = args.seed + utils.get_rank()
225
+ torch.manual_seed(seed)
226
+ np.random.seed(seed)
227
+ random.seed(seed)
228
+
229
+ model, criterion, postprocessors = build_model(args)
230
+ n_parameters = sum(p.numel() for p in model.parameters())
231
+ print(f"number of parameters: {n_parameters}")
232
+ n_backbone_parameters = sum(p.numel() for p in model.backbone.parameters())
233
+ print(f"number of backbone parameters: {n_backbone_parameters}")
234
+ n_projector_parameters = sum(p.numel() for p in model.backbone[0].projector.parameters())
235
+ print(f"number of projector parameters: {n_projector_parameters}")
236
+ n_backbone_encoder_parameters = sum(p.numel() for p in model.backbone[0].encoder.parameters())
237
+ print(f"number of backbone encoder parameters: {n_backbone_encoder_parameters}")
238
+ n_transformer_parameters = sum(p.numel() for p in model.transformer.parameters())
239
+ print(f"number of transformer parameters: {n_transformer_parameters}")
240
+ if args.resume:
241
+ checkpoint = torch.load(args.resume, map_location='cpu')
242
+ model.load_state_dict(checkpoint['model'], strict=True)
243
+ print(f"load checkpoints {args.resume}")
244
+
245
+ if args.layer_norm:
246
+ no_batch_norm(model)
247
+
248
+ model.to(device)
249
+
250
+ input_tensors = make_infer_image(args, device)
251
+ input_names = ['input']
252
+ output_names = ['features'] if args.backbone_only else ['dets', 'labels']
253
+ dynamic_axes = None
254
+ # Run model inference in pytorch mode
255
+ model.eval().to("cuda")
256
+ input_tensors = input_tensors.to("cuda")
257
+ with torch.no_grad():
258
+ if args.backbone_only:
259
+ features = model(input_tensors)
260
+ print(f"PyTorch inference output shape: {features.shape}")
261
+ else:
262
+ outputs = model(input_tensors)
263
+ dets = outputs['pred_boxes']
264
+ labels = outputs['pred_logits']
265
+ print(f"PyTorch inference output shapes - Boxes: {dets.shape}, Labels: {labels.shape}")
266
+ model.cpu()
267
+ input_tensors = input_tensors.cpu()
268
+
269
+
270
+ output_file = export_onnx(model, args, input_names, input_tensors, output_names, dynamic_axes)
271
+
272
+ if args.simplify:
273
+ output_file = onnx_simplify(output_file, input_names, input_tensors, args)
274
+
275
+ if args.tensorrt:
276
+ output_file = trtexec(output_file, args)
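For reference, the regex parser above can be exercised on its own. A minimal sketch, assuming the package is importable as rfdetr and feeding fabricated trtexec-style log lines (the numbers are illustrative, not real benchmark results):

# Sketch: parse a fabricated trtexec log with parse_trtexec_output.
from rfdetr.deploy.export import parse_trtexec_output

sample_log = "\n".join([
    "GPU Compute Time: min = 1.10 ms, max = 2.50 ms, mean = 1.40 ms, median = 1.35 ms",
    "Latency: min = 1.20 ms, max = 2.60 ms, mean = 1.55 ms",
    "Throughput: 640.25 qps",
])

stats = parse_trtexec_output(sample_log)   # prints the raw text, then returns a dict
print(stats["compute_mean_ms"], stats["latency_mean_ms"], stats["throughput_qps"])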
rfdetr/detr.py ADDED
@@ -0,0 +1,451 @@
1
+ # ------------------------------------------------------------------------
2
+ # RF-DETR
3
+ # Copyright (c) 2025 Roboflow. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------
6
+
7
+
8
+ import json
9
+ import os
10
+ from collections import defaultdict
11
+ from logging import getLogger
12
+ from typing import Union, List
13
+ from copy import deepcopy
14
+
15
+ import numpy as np
16
+ import supervision as sv
17
+ import torch
18
+ import torchvision.transforms.functional as F
19
+ from PIL import Image
20
+
21
+ try:
22
+ torch.set_float32_matmul_precision('high')
23
+ except:
24
+ pass
25
+
26
+ from rfdetr.config import (
27
+ RFDETRBaseConfig,
28
+ RFDETRLargeConfig,
29
+ RFDETRNanoConfig,
30
+ RFDETRSmallConfig,
31
+ RFDETRMediumConfig,
32
+ TrainConfig,
33
+ ModelConfig
34
+ )
35
+ from rfdetr.main import Model, download_pretrain_weights
36
+ from rfdetr.util.metrics import MetricsPlotSink, MetricsTensorBoardSink, MetricsWandBSink
37
+ from rfdetr.util.coco_classes import COCO_CLASSES
38
+
39
+ logger = getLogger(__name__)
40
+ class RFDETR:
41
+ """
42
+ The base RF-DETR class implements the core methods for training RF-DETR models,
43
+ running inference on the models, optimising models, and uploading trained
44
+ models for deployment.
45
+ """
46
+ means = [0.485, 0.456, 0.406]
47
+ stds = [0.229, 0.224, 0.225]
48
+ size = None
49
+
50
+ def __init__(self, **kwargs):
51
+ self.model_config = self.get_model_config(**kwargs)
52
+ self.maybe_download_pretrain_weights()
53
+ self.model = self.get_model(self.model_config)
54
+ self.callbacks = defaultdict(list)
55
+
56
+ self.model.inference_model = None
57
+ self._is_optimized_for_inference = False
58
+ self._has_warned_about_not_being_optimized_for_inference = False
59
+ self._optimized_has_been_compiled = False
60
+ self._optimized_batch_size = None
61
+ self._optimized_resolution = None
62
+ self._optimized_dtype = None
63
+
64
+ def maybe_download_pretrain_weights(self):
65
+ """
66
+ Download pre-trained weights if they are not already downloaded.
67
+ """
68
+ download_pretrain_weights(self.model_config.pretrain_weights)
69
+
70
+ def get_model_config(self, **kwargs):
71
+ """
72
+ Retrieve the configuration parameters used by the model.
73
+ """
74
+ return ModelConfig(**kwargs)
75
+
76
+ def train(self, **kwargs):
77
+ """
78
+ Train an RF-DETR model.
79
+ """
80
+ config = self.get_train_config(**kwargs)
81
+ self.train_from_config(config, **kwargs)
82
+
83
+ def optimize_for_inference(self, compile=True, batch_size=1, dtype=torch.float32):
84
+ self.remove_optimized_model()
85
+
86
+ self.model.inference_model = deepcopy(self.model.model)
87
+ self.model.inference_model.eval()
88
+ self.model.inference_model.export()
89
+
90
+ self._optimized_resolution = self.model.resolution
91
+ self._is_optimized_for_inference = True
92
+
93
+ self.model.inference_model = self.model.inference_model.to(dtype=dtype)
94
+ self._optimized_dtype = dtype
95
+
96
+ if compile:
97
+ self.model.inference_model = torch.jit.trace(
98
+ self.model.inference_model,
99
+ torch.randn(
100
+ batch_size, 3, self.model.resolution, self.model.resolution,
101
+ device=self.model.device,
102
+ dtype=dtype
103
+ )
104
+ )
105
+ self._optimized_has_been_compiled = True
106
+ self._optimized_batch_size = batch_size
107
+
108
+ def remove_optimized_model(self):
109
+ self.model.inference_model = None
110
+ self._is_optimized_for_inference = False
111
+ self._optimized_has_been_compiled = False
112
+ self._optimized_batch_size = None
113
+ self._optimized_resolution = None
114
+ self._optimized_dtype = None
115
+
116
+ def export(self, **kwargs):
117
+ """
118
+ Export your model to an ONNX file.
119
+
120
+ See [the ONNX export documentation](https://rfdetr.roboflow.com/learn/train/#onnx-export) for more information.
121
+ """
122
+ self.model.export(**kwargs)
123
+
124
+ def train_from_config(self, config: TrainConfig, **kwargs):
125
+ with open(
126
+ os.path.join(config.dataset_dir, "train", "_annotations.coco.json"), "r"
127
+ ) as f:
128
+ anns = json.load(f)
129
+ num_classes = len(anns["categories"])
130
+ class_names = [c["name"] for c in anns["categories"] if c["supercategory"] != "none"]
131
+ self.model.class_names = class_names
132
+
133
+ if self.model_config.num_classes != num_classes:
134
+ logger.warning(
135
+ f"num_classes mismatch: model has {self.model_config.num_classes} classes, but your dataset has {num_classes} classes\n"
136
+ f"reinitializing your detection head with {num_classes} classes."
137
+ )
138
+ self.model.reinitialize_detection_head(num_classes)
139
+
140
+
141
+ train_config = config.dict()
142
+ model_config = self.model_config.dict()
143
+ model_config.pop("num_classes")
144
+ if "class_names" in model_config:
145
+ model_config.pop("class_names")
146
+
147
+ if "class_names" in train_config and train_config["class_names"] is None:
148
+ train_config["class_names"] = class_names
149
+
150
+ for k, v in train_config.items():
151
+ if k in model_config:
152
+ model_config.pop(k)
153
+ if k in kwargs:
154
+ kwargs.pop(k)
155
+
156
+ all_kwargs = {**model_config, **train_config, **kwargs, "num_classes": num_classes}
157
+
158
+ metrics_plot_sink = MetricsPlotSink(output_dir=config.output_dir)
159
+ self.callbacks["on_fit_epoch_end"].append(metrics_plot_sink.update)
160
+ self.callbacks["on_train_end"].append(metrics_plot_sink.save)
161
+
162
+ if config.tensorboard:
163
+ metrics_tensor_board_sink = MetricsTensorBoardSink(output_dir=config.output_dir)
164
+ self.callbacks["on_fit_epoch_end"].append(metrics_tensor_board_sink.update)
165
+ self.callbacks["on_train_end"].append(metrics_tensor_board_sink.close)
166
+
167
+ if config.wandb:
168
+ metrics_wandb_sink = MetricsWandBSink(
169
+ output_dir=config.output_dir,
170
+ project=config.project,
171
+ run=config.run,
172
+ config=config.model_dump()
173
+ )
174
+ self.callbacks["on_fit_epoch_end"].append(metrics_wandb_sink.update)
175
+ self.callbacks["on_train_end"].append(metrics_wandb_sink.close)
176
+
177
+ if config.early_stopping:
178
+ from rfdetr.util.early_stopping import EarlyStoppingCallback
179
+ early_stopping_callback = EarlyStoppingCallback(
180
+ model=self.model,
181
+ patience=config.early_stopping_patience,
182
+ min_delta=config.early_stopping_min_delta,
183
+ use_ema=config.early_stopping_use_ema
184
+ )
185
+ self.callbacks["on_fit_epoch_end"].append(early_stopping_callback.update)
186
+
187
+ self.model.train(
188
+ **all_kwargs,
189
+ callbacks=self.callbacks,
190
+ )
191
+
192
+ def get_train_config(self, **kwargs):
193
+ """
194
+ Retrieve the configuration parameters that will be used for training.
195
+ """
196
+ return TrainConfig(**kwargs)
197
+
198
+ def get_model(self, config: ModelConfig):
199
+ """
200
+ Retrieve a model instance based on the provided configuration.
201
+ """
202
+ return Model(**config.dict())
203
+
204
+ # Get class_names from the model
205
+ @property
206
+ def class_names(self):
207
+ """
208
+ Retrieve the class names supported by the loaded model.
209
+
210
+ Returns:
211
+ dict: A dictionary mapping class IDs to class names. The keys are integers starting from 1.
212
+ """
213
+ if hasattr(self.model, 'class_names') and self.model.class_names:
214
+ return {i+1: name for i, name in enumerate(self.model.class_names)}
215
+
216
+ return COCO_CLASSES
217
+
218
+ def predict(
219
+ self,
220
+ images: Union[str, Image.Image, np.ndarray, torch.Tensor, List[Union[str, np.ndarray, Image.Image, torch.Tensor]]],
221
+ threshold: float = 0.5,
222
+ **kwargs,
223
+ ) -> Union[sv.Detections, List[sv.Detections]]:
224
+ """Performs object detection on the input images and returns bounding box
225
+ predictions.
226
+
227
+ This method accepts a single image or a list of images in various formats
228
+ (file path, PIL Image, NumPy array, or torch.Tensor). The images should be in
229
+ RGB channel order. If a torch.Tensor is provided, it must already be normalized
230
+ to values in the [0, 1] range and have the shape (C, H, W).
231
+
232
+ Args:
233
+ images (Union[str, Image.Image, np.ndarray, torch.Tensor, List[Union[str, np.ndarray, Image.Image, torch.Tensor]]]):
234
+ A single image or a list of images to process. Images can be provided
235
+ as file paths, PIL Images, NumPy arrays, or torch.Tensors.
236
+ threshold (float, optional):
237
+ The minimum confidence score needed to consider a detected bounding box valid.
238
+ **kwargs:
239
+ Additional keyword arguments.
240
+
241
+ Returns:
242
+ Union[sv.Detections, List[sv.Detections]]: A single or multiple Detections
243
+ objects, each containing bounding box coordinates, confidence scores,
244
+ and class IDs.
245
+ """
246
+ if not self._is_optimized_for_inference and not self._has_warned_about_not_being_optimized_for_inference:
247
+ logger.warning(
248
+ "Model is not optimized for inference. "
249
+ "Latency may be higher than expected. "
250
+ "You can optimize the model for inference by calling model.optimize_for_inference()."
251
+ )
252
+ self._has_warned_about_not_being_optimized_for_inference = True
253
+
254
+ self.model.model.eval()
255
+
256
+ if not isinstance(images, list):
257
+ images = [images]
258
+
259
+ orig_sizes = []
260
+ processed_images = []
261
+
262
+ for img in images:
263
+
264
+ if isinstance(img, str):
265
+ img = Image.open(img)
266
+
267
+ if not isinstance(img, torch.Tensor):
268
+ img = F.to_tensor(img)
269
+
270
+ if (img > 1).any():
271
+ raise ValueError(
272
+ "Image has pixel values above 1. Please ensure the image is "
273
+ "normalized (scaled to [0, 1])."
274
+ )
275
+ if img.shape[0] != 3:
276
+ raise ValueError(
277
+ f"Invalid image shape. Expected 3 channels (RGB), but got "
278
+ f"{img.shape[0]} channels."
279
+ )
280
+ img_tensor = img
281
+
282
+ h, w = img_tensor.shape[1:]
283
+ orig_sizes.append((h, w))
284
+
285
+ img_tensor = img_tensor.to(self.model.device)
286
+ img_tensor = F.normalize(img_tensor, self.means, self.stds)
287
+ img_tensor = F.resize(img_tensor, (self.model.resolution, self.model.resolution))
288
+
289
+ processed_images.append(img_tensor)
290
+
291
+ batch_tensor = torch.stack(processed_images)
292
+
293
+ if self._is_optimized_for_inference:
294
+ if self._optimized_resolution != batch_tensor.shape[2]:
295
+ # this could happen if someone manually changes self.model.resolution after optimizing the model
296
+ raise ValueError(f"Resolution mismatch. "
297
+ f"Model was optimized for resolution {self._optimized_resolution}, "
298
+ f"but got {batch_tensor.shape[2]}. "
299
+ "You can explicitly remove the optimized model by calling model.remove_optimized_model().")
300
+ if self._optimized_has_been_compiled:
301
+ if self._optimized_batch_size != batch_tensor.shape[0]:
302
+ raise ValueError(f"Batch size mismatch. "
303
+ f"Optimized model was compiled for batch size {self._optimized_batch_size}, "
304
+ f"but got {batch_tensor.shape[0]}. "
305
+ "You can explicitly remove the optimized model by calling model.remove_optimized_model(). "
306
+ "Alternatively, you can recompile the optimized model for a different batch size "
307
+ "by calling model.optimize_for_inference(batch_size=<new_batch_size>).")
308
+
309
+ with torch.inference_mode():
310
+ if self._is_optimized_for_inference:
311
+ predictions = self.model.inference_model(batch_tensor.to(dtype=self._optimized_dtype))
312
+ else:
313
+ predictions = self.model.model(batch_tensor)
314
+ if isinstance(predictions, tuple):
315
+ predictions = {
316
+ "pred_logits": predictions[1],
317
+ "pred_boxes": predictions[0]
318
+ }
319
+ target_sizes = torch.tensor(orig_sizes, device=self.model.device)
320
+ results = self.model.postprocessors["bbox"](predictions, target_sizes=target_sizes)
321
+
322
+ detections_list = []
323
+ for result in results:
324
+ scores = result["scores"]
325
+ labels = result["labels"]
326
+ boxes = result["boxes"]
327
+
328
+ keep = scores > threshold
329
+ scores = scores[keep]
330
+ labels = labels[keep]
331
+ boxes = boxes[keep]
332
+
333
+ detections = sv.Detections(
334
+ xyxy=boxes.float().cpu().numpy(),
335
+ confidence=scores.float().cpu().numpy(),
336
+ class_id=labels.cpu().numpy(),
337
+ )
338
+ detections_list.append(detections)
339
+
340
+ return detections_list if len(detections_list) > 1 else detections_list[0]
341
+
342
+ def deploy_to_roboflow(self, workspace: str, project_id: str, version: str, api_key: str = None, size: str = None):
343
+ """
344
+ Deploy the trained RF-DETR model to Roboflow.
345
+
346
+ Deploying with Roboflow will create a Serverless API to which you can make requests.
347
+
348
+ You can also download weights into a Roboflow Inference deployment for use in Roboflow Workflows and on-device deployment.
349
+
350
+ Args:
351
+ workspace (str): The name of the Roboflow workspace to deploy to.
352
+ project_id (str): The ID of the Roboflow project to which the model will be deployed.
353
+ version (str): The version of the Roboflow project to deploy the model to.
354
+ api_key (str, optional): Your Roboflow API key. If not provided,
355
+ it will be read from the environment variable `ROBOFLOW_API_KEY`.
356
+ size (str, optional): The size of the model to deploy. If not provided,
357
+ it will default to the size of the model being trained
358
+ (e.g., "rfdetr-base", "rfdetr-large", etc.).
359
+ Raises:
360
+ ValueError: If the `api_key` is not provided and not found in the environment
361
+ variable `ROBOFLOW_API_KEY`, or if the `size` is not set for custom architectures.
362
+ """
363
+ from roboflow import Roboflow
364
+ import shutil
365
+ if api_key is None:
366
+ api_key = os.getenv("ROBOFLOW_API_KEY")
367
+ if api_key is None:
368
+ raise ValueError("Set api_key=<KEY> in deploy_to_roboflow or export ROBOFLOW_API_KEY=<KEY>")
369
+
370
+
371
+ rf = Roboflow(api_key=api_key)
372
+ workspace = rf.workspace(workspace)
373
+
374
+ if self.size is None and size is None:
375
+ raise ValueError("Must set size for custom architectures")
376
+
377
+ size = self.size or size
378
+ tmp_out_dir = ".roboflow_temp_upload"
379
+ os.makedirs(tmp_out_dir, exist_ok=True)
380
+ outpath = os.path.join(tmp_out_dir, "weights.pt")
381
+ torch.save(
382
+ {
383
+ "model": self.model.model.state_dict(),
384
+ "args": self.model.args
385
+ }, outpath
386
+ )
387
+ project = workspace.project(project_id)
388
+ version = project.version(version)
389
+ version.deploy(
390
+ model_type=size,
391
+ model_path=tmp_out_dir,
392
+ filename="weights.pt"
393
+ )
394
+ shutil.rmtree(tmp_out_dir)
395
+
396
+
397
+
398
+ class RFDETRBase(RFDETR):
399
+ """
400
+ Train an RF-DETR Base model (29M parameters).
401
+ """
402
+ size = "rfdetr-base"
403
+ def get_model_config(self, **kwargs):
404
+ return RFDETRBaseConfig(**kwargs)
405
+
406
+ def get_train_config(self, **kwargs):
407
+ return TrainConfig(**kwargs)
408
+
409
+ class RFDETRLarge(RFDETR):
410
+ """
411
+ Train an RF-DETR Large model.
412
+ """
413
+ size = "rfdetr-large"
414
+ def get_model_config(self, **kwargs):
415
+ return RFDETRLargeConfig(**kwargs)
416
+
417
+ def get_train_config(self, **kwargs):
418
+ return TrainConfig(**kwargs)
419
+
420
+ class RFDETRNano(RFDETR):
421
+ """
422
+ Train an RF-DETR Nano model.
423
+ """
424
+ size = "rfdetr-nano"
425
+ def get_model_config(self, **kwargs):
426
+ return RFDETRNanoConfig(**kwargs)
427
+
428
+ def get_train_config(self, **kwargs):
429
+ return TrainConfig(**kwargs)
430
+
431
+ class RFDETRSmall(RFDETR):
432
+ """
433
+ Train an RF-DETR Small model.
434
+ """
435
+ size = "rfdetr-small"
436
+ def get_model_config(self, **kwargs):
437
+ return RFDETRSmallConfig(**kwargs)
438
+
439
+ def get_train_config(self, **kwargs):
440
+ return TrainConfig(**kwargs)
441
+
442
+ class RFDETRMedium(RFDETR):
443
+ """
444
+ Train an RF-DETR Medium model.
445
+ """
446
+ size = "rfdetr-medium"
447
+ def get_model_config(self, **kwargs):
448
+ return RFDETRMediumConfig(**kwargs)
449
+
450
+ def get_train_config(self, **kwargs):
451
+ return TrainConfig(**kwargs)
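A minimal usage sketch of the classes defined above; the image path, dataset directory, and epoch count are placeholders rather than values taken from this repository:

# Sketch: load a pretrained RF-DETR Base model, run inference, and (optionally) fine-tune.
from rfdetr import RFDETRBase

model = RFDETRBase()              # downloads the hosted base checkpoint if it is missing
model.optimize_for_inference()    # optional: traces the model for lower latency (batch size 1)

detections = model.predict("image.jpg", threshold=0.5)   # placeholder path
print(model.class_names)
print(detections.xyxy, detections.confidence, detections.class_id)

# Fine-tuning on a COCO-format dataset with train/valid/test splits
# (directory name and epoch count are illustrative):
# model.train(dataset_dir="my_dataset", output_dir="output", epochs=10)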
rfdetr/engine.py ADDED
@@ -0,0 +1,340 @@
1
+ # ------------------------------------------------------------------------
2
+ # RF-DETR
3
+ # Copyright (c) 2025 Roboflow. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------
6
+ # Modified from LW-DETR (https://github.com/Atten4Vis/LW-DETR)
7
+ # Copyright (c) 2024 Baidu. All Rights Reserved.
8
+ # ------------------------------------------------------------------------
9
+ # Conditional DETR
10
+ # Copyright (c) 2021 Microsoft. All Rights Reserved.
11
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
12
+ # ------------------------------------------------------------------------
13
+ # Copied from DETR (https://github.com/facebookresearch/detr)
14
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
15
+ # ------------------------------------------------------------------------
16
+
17
+ """
18
+ Train and eval functions used in main.py
19
+ """
20
+ import math
21
+ import sys
22
+ from typing import Iterable
23
+ import random
24
+
25
+ import torch
26
+ import torch.nn.functional as F
27
+
28
+ import rfdetr.util.misc as utils
29
+ from rfdetr.datasets.coco_eval import CocoEvaluator
30
+ from rfdetr.datasets.coco import compute_multi_scale_scales
31
+
32
+ try:
33
+ from torch.amp import autocast, GradScaler
34
+ DEPRECATED_AMP = False
35
+ except ImportError:
36
+ from torch.cuda.amp import autocast, GradScaler
37
+ DEPRECATED_AMP = True
38
+ from typing import DefaultDict, List, Callable
39
+ from rfdetr.util.misc import NestedTensor
40
+ import numpy as np
41
+
42
+ def get_autocast_args(args):
43
+ if DEPRECATED_AMP:
44
+ return {'enabled': args.amp, 'dtype': torch.bfloat16}
45
+ else:
46
+ return {'device_type': 'cuda', 'enabled': args.amp, 'dtype': torch.bfloat16}
47
+
48
+
49
+ def train_one_epoch(
50
+ model: torch.nn.Module,
51
+ criterion: torch.nn.Module,
52
+ lr_scheduler: torch.optim.lr_scheduler.LRScheduler,
53
+ data_loader: Iterable,
54
+ optimizer: torch.optim.Optimizer,
55
+ device: torch.device,
56
+ epoch: int,
57
+ batch_size: int,
58
+ max_norm: float = 0,
59
+ ema_m: torch.nn.Module = None,
60
+ schedules: dict = {},
61
+ num_training_steps_per_epoch=None,
62
+ vit_encoder_num_layers=None,
63
+ args=None,
64
+ callbacks: DefaultDict[str, List[Callable]] = None,
65
+ ):
66
+ metric_logger = utils.MetricLogger(delimiter=" ")
67
+ metric_logger.add_meter("lr", utils.SmoothedValue(window_size=1, fmt="{value:.6f}"))
68
+ metric_logger.add_meter(
69
+ "class_error", utils.SmoothedValue(window_size=1, fmt="{value:.2f}")
70
+ )
71
+ header = "Epoch: [{}]".format(epoch)
72
+ print_freq = 10
73
+ start_steps = epoch * num_training_steps_per_epoch
74
+
75
+ print("Grad accum steps: ", args.grad_accum_steps)
76
+ print("Total batch size: ", batch_size * utils.get_world_size())
77
+
78
+ # Add gradient scaler for AMP
79
+ if DEPRECATED_AMP:
80
+ scaler = GradScaler(enabled=args.amp)
81
+ else:
82
+ scaler = GradScaler('cuda', enabled=args.amp)
83
+
84
+ optimizer.zero_grad()
85
+ assert batch_size % args.grad_accum_steps == 0
86
+ sub_batch_size = batch_size // args.grad_accum_steps
87
+ print("LENGTH OF DATA LOADER:", len(data_loader))
88
+ for data_iter_step, (samples, targets) in enumerate(
89
+ metric_logger.log_every(data_loader, print_freq, header)
90
+ ):
91
+ it = start_steps + data_iter_step
92
+ callback_dict = {
93
+ "step": it,
94
+ "model": model,
95
+ "epoch": epoch,
96
+ }
97
+ for callback in callbacks["on_train_batch_start"]:
98
+ callback(callback_dict)
99
+ if "dp" in schedules:
100
+ if args.distributed:
101
+ model.module.update_drop_path(
102
+ schedules["dp"][it], vit_encoder_num_layers
103
+ )
104
+ else:
105
+ model.update_drop_path(schedules["dp"][it], vit_encoder_num_layers)
106
+ if "do" in schedules:
107
+ if args.distributed:
108
+ model.module.update_dropout(schedules["do"][it])
109
+ else:
110
+ model.update_dropout(schedules["do"][it])
111
+
112
+ if args.multi_scale and not args.do_random_resize_via_padding:
113
+ scales = compute_multi_scale_scales(args.resolution, args.expanded_scales, args.patch_size, args.num_windows)
114
+ random.seed(it)
115
+ scale = random.choice(scales)
116
+ with torch.inference_mode():
117
+ samples.tensors = F.interpolate(samples.tensors, size=scale, mode='bilinear', align_corners=False)
118
+ samples.mask = F.interpolate(samples.mask.unsqueeze(1).float(), size=scale, mode='nearest').squeeze(1).bool()
119
+
120
+ for i in range(args.grad_accum_steps):
121
+ start_idx = i * sub_batch_size
122
+ final_idx = start_idx + sub_batch_size
123
+ new_samples_tensors = samples.tensors[start_idx:final_idx]
124
+ new_samples = NestedTensor(new_samples_tensors, samples.mask[start_idx:final_idx])
125
+ new_samples = new_samples.to(device)
126
+ new_targets = [{k: v.to(device) for k, v in t.items()} for t in targets[start_idx:final_idx]]
127
+
128
+ with autocast(**get_autocast_args(args)):
129
+ outputs = model(new_samples, new_targets)
130
+ loss_dict = criterion(outputs, new_targets)
131
+ weight_dict = criterion.weight_dict
132
+ losses = sum(
133
+ (1 / args.grad_accum_steps) * loss_dict[k] * weight_dict[k]
134
+ for k in loss_dict.keys()
135
+ if k in weight_dict
136
+ )
137
+
138
+
139
+ scaler.scale(losses).backward()
140
+
141
+ # reduce losses over all GPUs for logging purposes
142
+ loss_dict_reduced = utils.reduce_dict(loss_dict)
143
+ loss_dict_reduced_unscaled = {
144
+ f"{k}_unscaled": v for k, v in loss_dict_reduced.items()
145
+ }
146
+ loss_dict_reduced_scaled = {
147
+ k: v * weight_dict[k]
148
+ for k, v in loss_dict_reduced.items()
149
+ if k in weight_dict
150
+ }
151
+ losses_reduced_scaled = sum(loss_dict_reduced_scaled.values())
152
+
153
+ loss_value = losses_reduced_scaled.item()
154
+
155
+ if not math.isfinite(loss_value):
156
+ print(loss_dict_reduced)
157
+ raise ValueError("Loss is {}, stopping training".format(loss_value))
158
+
159
+ if max_norm > 0:
160
+ scaler.unscale_(optimizer)
161
+ torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
162
+
163
+ scaler.step(optimizer)
164
+ scaler.update()
165
+ lr_scheduler.step()
166
+ optimizer.zero_grad()
167
+ if ema_m is not None:
168
+ if epoch >= 0:
169
+ ema_m.update(model)
170
+ metric_logger.update(
171
+ loss=loss_value, **loss_dict_reduced_scaled, **loss_dict_reduced_unscaled
172
+ )
173
+ metric_logger.update(class_error=loss_dict_reduced["class_error"])
174
+ metric_logger.update(lr=optimizer.param_groups[0]["lr"])
175
+ # gather the stats from all processes
176
+ metric_logger.synchronize_between_processes()
177
+ print("Averaged stats:", metric_logger)
178
+ return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
179
+
180
+
181
+ def coco_extended_metrics(coco_eval):
182
+ """
183
+ Safe version: ignores the -1 sentinel entries so precision/F1 never explode.
184
+ """
185
+
186
+ iou_thrs, rec_thrs = coco_eval.params.iouThrs, coco_eval.params.recThrs
187
+ iou50_idx, area_idx, maxdet_idx = (
188
+ int(np.argwhere(np.isclose(iou_thrs, 0.50))), 0, 2)
189
+
190
+ P = coco_eval.eval["precision"]
191
+ S = coco_eval.eval["scores"]
192
+
193
+ prec_raw = P[iou50_idx, :, :, area_idx, maxdet_idx]
194
+
195
+ prec = prec_raw.copy().astype(float)
196
+ prec[prec < 0] = np.nan
197
+
198
+ f1_cls = 2 * prec * rec_thrs[:, None] / (prec + rec_thrs[:, None])
199
+ f1_macro = np.nanmean(f1_cls, axis=1)
200
+
201
+ best_j = int(f1_macro.argmax())
202
+
203
+ macro_precision = float(np.nanmean(prec[best_j]))
204
+ macro_recall = float(rec_thrs[best_j])
205
+ macro_f1 = float(f1_macro[best_j])
206
+
207
+ score_vec = S[iou50_idx, best_j, :, area_idx, maxdet_idx].astype(float)
208
+ score_vec[prec_raw[best_j] < 0] = np.nan
209
+ score_thr = float(np.nanmean(score_vec))
210
+
211
+ map_50_95, map_50 = float(coco_eval.stats[0]), float(coco_eval.stats[1])
212
+
213
+ per_class = []
214
+ cat_ids = coco_eval.params.catIds
215
+ cat_id_to_name = {c["id"]: c["name"] for c in coco_eval.cocoGt.loadCats(cat_ids)}
216
+ for k, cid in enumerate(cat_ids):
217
+ p_slice = P[:, :, k, area_idx, maxdet_idx]
218
+ valid = p_slice > -1
219
+ ap_50_95 = float(p_slice[valid].mean()) if valid.any() else float("nan")
220
+ ap_50 = float(p_slice[iou50_idx][p_slice[iou50_idx] > -1].mean()) if (p_slice[iou50_idx] > -1).any() else float("nan")
221
+
222
+ pc = float(prec[best_j, k]) if prec_raw[best_j, k] > -1 else float("nan")
223
+ rc = macro_recall
224
+
225
+ # Skip classes with undefined (NaN) metrics, e.g. placeholder dataset categories
226
+ if np.isnan(ap_50_95) or np.isnan(ap_50) or np.isnan(pc) or np.isnan(rc):
227
+ continue
228
+
229
+ per_class.append({
230
+ "class" : cat_id_to_name[int(cid)],
231
+ "map@50:95" : ap_50_95,
232
+ "map@50" : ap_50,
233
+ "precision" : pc,
234
+ "recall" : rc,
235
+ })
236
+
237
+ per_class.append({
238
+ "class" : "all",
239
+ "map@50:95" : map_50_95,
240
+ "map@50" : map_50,
241
+ "precision" : macro_precision,
242
+ "recall" : macro_recall,
243
+ })
244
+
245
+ return {
246
+ "class_map": per_class,
247
+ "map" : map_50,
248
+ "precision": macro_precision,
249
+ "recall" : macro_recall
250
+ }
251
+
252
+ def evaluate(model, criterion, postprocessors, data_loader, base_ds, device, args=None):
253
+ model.eval()
254
+ if args.fp16_eval:
255
+ model.half()
256
+ criterion.eval()
257
+
258
+ metric_logger = utils.MetricLogger(delimiter=" ")
259
+ metric_logger.add_meter(
260
+ "class_error", utils.SmoothedValue(window_size=1, fmt="{value:.2f}")
261
+ )
262
+ header = "Test:"
263
+
264
+ iou_types = tuple(k for k in ("segm", "bbox") if k in postprocessors.keys())
265
+ coco_evaluator = CocoEvaluator(base_ds, iou_types)
266
+
267
+ for samples, targets in metric_logger.log_every(data_loader, 10, header):
268
+ samples = samples.to(device)
269
+ targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
270
+
271
+ if args.fp16_eval:
272
+ samples.tensors = samples.tensors.half()
273
+
274
+ # Add autocast for evaluation
275
+ with autocast(**get_autocast_args(args)):
276
+ outputs = model(samples)
277
+
278
+ if args.fp16_eval:
279
+ for key in outputs.keys():
280
+ if key == "enc_outputs":
281
+ for sub_key in outputs[key].keys():
282
+ outputs[key][sub_key] = outputs[key][sub_key].float()
283
+ elif key == "aux_outputs":
284
+ for idx in range(len(outputs[key])):
285
+ for sub_key in outputs[key][idx].keys():
286
+ outputs[key][idx][sub_key] = outputs[key][idx][
287
+ sub_key
288
+ ].float()
289
+ else:
290
+ outputs[key] = outputs[key].float()
291
+
292
+ loss_dict = criterion(outputs, targets)
293
+ weight_dict = criterion.weight_dict
294
+
295
+ # reduce losses over all GPUs for logging purposes
296
+ loss_dict_reduced = utils.reduce_dict(loss_dict)
297
+ loss_dict_reduced_scaled = {
298
+ k: v * weight_dict[k]
299
+ for k, v in loss_dict_reduced.items()
300
+ if k in weight_dict
301
+ }
302
+ loss_dict_reduced_unscaled = {
303
+ f"{k}_unscaled": v for k, v in loss_dict_reduced.items()
304
+ }
305
+ metric_logger.update(
306
+ loss=sum(loss_dict_reduced_scaled.values()),
307
+ **loss_dict_reduced_scaled,
308
+ **loss_dict_reduced_unscaled,
309
+ )
310
+ metric_logger.update(class_error=loss_dict_reduced["class_error"])
311
+
312
+ orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0)
313
+ results = postprocessors["bbox"](outputs, orig_target_sizes)
314
+ res = {
315
+ target["image_id"].item(): output
316
+ for target, output in zip(targets, results)
317
+ }
318
+ if coco_evaluator is not None:
319
+ coco_evaluator.update(res)
320
+
321
+ # gather the stats from all processes
322
+ metric_logger.synchronize_between_processes()
323
+ print("Averaged stats:", metric_logger)
324
+ if coco_evaluator is not None:
325
+ coco_evaluator.synchronize_between_processes()
326
+
327
+ # accumulate predictions from all images
328
+ if coco_evaluator is not None:
329
+ coco_evaluator.accumulate()
330
+ coco_evaluator.summarize()
331
+ stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()}
332
+ if coco_evaluator is not None:
333
+ results_json = coco_extended_metrics(coco_evaluator.coco_eval["bbox"])
334
+ stats["results_json"] = results_json
335
+ if "bbox" in postprocessors.keys():
336
+ stats["coco_eval_bbox"] = coco_evaluator.coco_eval["bbox"].stats.tolist()
337
+
338
+ if "segm" in postprocessors.keys():
339
+ stats["coco_eval_masks"] = coco_evaluator.coco_eval["segm"].stats.tolist()
340
+ return stats, coco_evaluator
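A toy illustration of the operating-point selection performed in coco_extended_metrics above, using made-up precision values in place of a real COCOeval run:

# Sketch: pick the recall threshold that maximises macro-F1, treating invalid
# entries as NaN (the stand-in for COCOeval's -1 sentinel). Values are synthetic.
import numpy as np

rec_thrs = np.linspace(0.0, 1.00, 101)            # COCO recall thresholds
rng = np.random.default_rng(0)
prec = rng.uniform(0.3, 0.9, size=(101, 3))       # precision[recall_thr, class], 3 toy classes
prec[rng.random((101, 3)) < 0.05] = np.nan        # sprinkle some invalid entries

f1_cls = 2 * prec * rec_thrs[:, None] / (prec + rec_thrs[:, None])
f1_macro = np.nanmean(f1_cls, axis=1)             # macro-F1 per recall threshold
best_j = int(np.nanargmax(f1_macro))

print("best recall threshold:", rec_thrs[best_j])
print("macro precision:", float(np.nanmean(prec[best_j])))
print("macro F1:", float(f1_macro[best_j]))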
rfdetr/main.py ADDED
@@ -0,0 +1,1062 @@
1
+ # ------------------------------------------------------------------------
2
+ # RF-DETR
3
+ # Copyright (c) 2025 Roboflow. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------
6
+ # Modified from LW-DETR (https://github.com/Atten4Vis/LW-DETR)
7
+ # Copyright (c) 2024 Baidu. All Rights Reserved.
8
+ # ------------------------------------------------------------------------
9
+ # Modified from Conditional DETR (https://github.com/Atten4Vis/ConditionalDETR)
10
+ # Copyright (c) 2021 Microsoft. All Rights Reserved.
11
+ # ------------------------------------------------------------------------
12
+ # Modified from DETR (https://github.com/facebookresearch/detr)
13
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
14
+ # ------------------------------------------------------------------------
15
+
16
+ """
17
+ cleaned main file
18
+ """
19
+ import argparse
20
+ import ast
21
+ import copy
22
+ import datetime
23
+ import json
24
+ import math
25
+ import os
26
+ import random
27
+ import shutil
28
+ import time
29
+ from copy import deepcopy
30
+ from logging import getLogger
31
+ from pathlib import Path
32
+ from typing import DefaultDict, List, Callable
33
+
34
+ import numpy as np
35
+ import torch
36
+ from peft import LoraConfig, get_peft_model
37
+ from torch.utils.data import DataLoader, DistributedSampler
38
+
39
+ import rfdetr.util.misc as utils
40
+ from rfdetr.datasets import build_dataset, get_coco_api_from_dataset
41
+ from rfdetr.engine import evaluate, train_one_epoch
42
+ from rfdetr.models import build_model, build_criterion_and_postprocessors
43
+ from rfdetr.util.benchmark import benchmark
44
+ from rfdetr.util.drop_scheduler import drop_scheduler
45
+ from rfdetr.util.files import download_file
46
+ from rfdetr.util.get_param_dicts import get_param_dict
47
+ from rfdetr.util.utils import ModelEma, BestMetricHolder, clean_state_dict
48
+
49
+ if str(os.environ.get("USE_FILE_SYSTEM_SHARING", "False")).lower() in ["true", "1"]:
50
+ import torch.multiprocessing
51
+ torch.multiprocessing.set_sharing_strategy('file_system')
52
+
53
+ logger = getLogger(__name__)
54
+
55
+ HOSTED_MODELS = {
56
+ "rf-detr-base.pth": "https://storage.googleapis.com/rfdetr/rf-detr-base-coco.pth",
57
+ # below is a less converged model that may be better for finetuning but worse for inference
58
+ "rf-detr-base-2.pth": "https://storage.googleapis.com/rfdetr/rf-detr-base-2.pth",
59
+ "rf-detr-large.pth": "https://storage.googleapis.com/rfdetr/rf-detr-large.pth",
60
+ "rf-detr-nano.pth": "https://storage.googleapis.com/rfdetr/nano_coco/checkpoint_best_regular.pth",
61
+ "rf-detr-small.pth": "https://storage.googleapis.com/rfdetr/small_coco/checkpoint_best_regular.pth",
62
+ "rf-detr-medium.pth": "https://storage.googleapis.com/rfdetr/medium_coco/checkpoint_best_regular.pth",
63
+ }
64
+
65
+ def download_pretrain_weights(pretrain_weights: str, redownload=False):
66
+ if pretrain_weights in HOSTED_MODELS:
67
+ if redownload or not os.path.exists(pretrain_weights):
68
+ logger.info(
69
+ f"Downloading pretrained weights for {pretrain_weights}"
70
+ )
71
+ download_file(
72
+ HOSTED_MODELS[pretrain_weights],
73
+ pretrain_weights,
74
+ )
75
+
76
+ class Model:
77
+ def __init__(self, **kwargs):
78
+ args = populate_args(**kwargs)
79
+ self.args = args
80
+ self.resolution = args.resolution
81
+ self.model = build_model(args)
82
+ self.device = torch.device(args.device)
83
+ if args.pretrain_weights is not None:
84
+ print("Loading pretrain weights")
85
+ try:
86
+ checkpoint = torch.load(args.pretrain_weights, map_location='cpu', weights_only=False)
87
+ except Exception as e:
88
+ print(f"Failed to load pretrain weights: {e}")
89
+ # re-download weights if they are corrupted
90
+ print("Failed to load pretrain weights, re-downloading")
91
+ download_pretrain_weights(args.pretrain_weights, redownload=True)
92
+ checkpoint = torch.load(args.pretrain_weights, map_location='cpu', weights_only=False)
93
+
94
+ # Extract class_names from checkpoint if available
95
+ if 'args' in checkpoint and hasattr(checkpoint['args'], 'class_names'):
96
+ self.args.class_names = checkpoint['args'].class_names
97
+ self.class_names = checkpoint['args'].class_names
98
+
99
+ checkpoint_num_classes = checkpoint['model']['class_embed.bias'].shape[0]
100
+ if checkpoint_num_classes != args.num_classes + 1:
101
+ logger.warning(
102
+ f"num_classes mismatch: pretrain weights has {checkpoint_num_classes - 1} classes, but your model has {args.num_classes} classes\n"
103
+ f"reinitializing detection head with {checkpoint_num_classes - 1} classes"
104
+ )
105
+ self.reinitialize_detection_head(checkpoint_num_classes)
106
+ # add support to exclude_keys
107
+ # e.g., when load object365 pretrain, do not load `class_embed.[weight, bias]`
108
+ if args.pretrain_exclude_keys is not None:
109
+ assert isinstance(args.pretrain_exclude_keys, list)
110
+ for exclude_key in args.pretrain_exclude_keys:
111
+ checkpoint['model'].pop(exclude_key)
112
+ if args.pretrain_keys_modify_to_load is not None:
113
+ from rfdetr.util.obj365_to_coco_model import get_coco_pretrain_from_obj365
114
+ assert isinstance(args.pretrain_keys_modify_to_load, list)
115
+ for modify_key_to_load in args.pretrain_keys_modify_to_load:
116
+ try:
117
+ checkpoint['model'][modify_key_to_load] = get_coco_pretrain_from_obj365(
118
+ self.model.state_dict()[modify_key_to_load],
119
+ checkpoint['model'][modify_key_to_load]
120
+ )
121
+ except:
122
+ print(f"Failed to load {modify_key_to_load}, deleting from checkpoint")
123
+ checkpoint['model'].pop(modify_key_to_load)
124
+
125
+ # we may want to resume training with a smaller number of groups for group detr
126
+ num_desired_queries = args.num_queries * args.group_detr
127
+ query_param_names = ["refpoint_embed.weight", "query_feat.weight"]
128
+ for name, state in checkpoint['model'].items():
129
+ if any(name.endswith(x) for x in query_param_names):
130
+ checkpoint['model'][name] = state[:num_desired_queries]
131
+
132
+ self.model.load_state_dict(checkpoint['model'], strict=False)
133
+
134
+ if args.backbone_lora:
135
+ print("Applying LORA to backbone")
136
+ lora_config = LoraConfig(
137
+ r=16,
138
+ lora_alpha=16,
139
+ use_dora=True,
140
+ target_modules=[
141
+ "q_proj", "v_proj", "k_proj", # covers OWL-ViT
142
+ "qkv", # covers open_clip ie Siglip2
143
+ "query", "key", "value", "cls_token", "register_tokens", # covers Dinov2 with windowed attn
144
+ ]
145
+ )
146
+ self.model.backbone[0].encoder = get_peft_model(self.model.backbone[0].encoder, lora_config)
147
+ self.model = self.model.to(self.device)
148
+ self.criterion, self.postprocessors = build_criterion_and_postprocessors(args)
149
+ self.stop_early = False
150
+
151
+ def reinitialize_detection_head(self, num_classes):
152
+ self.model.reinitialize_detection_head(num_classes)
153
+
154
+ def request_early_stop(self):
155
+ self.stop_early = True
156
+ print("Early stopping requested, will complete current epoch and stop")
157
+
158
+ def train(self, callbacks: DefaultDict[str, List[Callable]], **kwargs):
159
+ currently_supported_callbacks = ["on_fit_epoch_end", "on_train_batch_start", "on_train_end"]
160
+ for key in callbacks.keys():
161
+ if key not in currently_supported_callbacks:
162
+ raise ValueError(
163
+ f"Callback {key} is not currently supported, please file an issue if you need it!\n"
164
+ f"Currently supported callbacks: {currently_supported_callbacks}"
165
+ )
166
+ args = populate_args(**kwargs)
167
+ if getattr(args, 'class_names') is not None:
168
+ self.args.class_names = args.class_names
169
+ self.args.num_classes = args.num_classes
170
+
171
+ utils.init_distributed_mode(args)
172
+ print("git:\n {}\n".format(utils.get_sha()))
173
+ print(args)
174
+ device = torch.device(args.device)
175
+
176
+ # fix the seed for reproducibility
177
+ seed = args.seed + utils.get_rank()
178
+ torch.manual_seed(seed)
179
+ np.random.seed(seed)
180
+ random.seed(seed)
181
+
182
+ criterion, postprocessors = build_criterion_and_postprocessors(args)
183
+ model = self.model
184
+ model.to(device)
185
+
186
+ model_without_ddp = model
187
+ if args.distributed:
188
+ if args.sync_bn:
189
+ model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
190
+ model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], find_unused_parameters=True)
191
+ model_without_ddp = model.module
192
+
193
+ n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
194
+ print('number of params:', n_parameters)
195
+ param_dicts = get_param_dict(args, model_without_ddp)
196
+
197
+ param_dicts = [p for p in param_dicts if p['params'].requires_grad]
198
+
199
+ optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
200
+ weight_decay=args.weight_decay)
201
+ # Choose the learning rate scheduler based on the new argument
202
+
203
+ dataset_train = build_dataset(image_set='train', args=args, resolution=args.resolution)
204
+ dataset_val = build_dataset(image_set='val', args=args, resolution=args.resolution)
205
+ dataset_test = build_dataset(image_set='test', args=args, resolution=args.resolution)
206
+
207
+ # for cosine annealing, calculate total training steps and warmup steps
208
+ total_batch_size_for_lr = args.batch_size * utils.get_world_size() * args.grad_accum_steps
209
+ num_training_steps_per_epoch_lr = (len(dataset_train) + total_batch_size_for_lr - 1) // total_batch_size_for_lr
210
+ total_training_steps_lr = num_training_steps_per_epoch_lr * args.epochs
211
+ warmup_steps_lr = num_training_steps_per_epoch_lr * args.warmup_epochs
212
+ def lr_lambda(current_step: int):
213
+ if current_step < warmup_steps_lr:
214
+ # Linear warmup
215
+ return float(current_step) / float(max(1, warmup_steps_lr))
216
+ else:
217
+ # Cosine annealing from multiplier 1.0 down to lr_min_factor
218
+ if args.lr_scheduler == 'cosine':
219
+ progress = float(current_step - warmup_steps_lr) / float(max(1, total_training_steps_lr - warmup_steps_lr))
220
+ return args.lr_min_factor + (1 - args.lr_min_factor) * 0.5 * (1 + math.cos(math.pi * progress))
221
+ elif args.lr_scheduler == 'step':
222
+ if current_step < args.lr_drop * num_training_steps_per_epoch_lr:
223
+ return 1.0
224
+ else:
225
+ return 0.1
226
+ lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda)
227
+
228
+ if args.distributed:
229
+ sampler_train = DistributedSampler(dataset_train)
230
+ sampler_val = DistributedSampler(dataset_val, shuffle=False)
231
+ sampler_test = DistributedSampler(dataset_test, shuffle=False)
232
+ else:
233
+ sampler_train = torch.utils.data.RandomSampler(dataset_train)
234
+ sampler_val = torch.utils.data.SequentialSampler(dataset_val)
235
+ sampler_test = torch.utils.data.SequentialSampler(dataset_test)
236
+
237
+ effective_batch_size = args.batch_size * args.grad_accum_steps
238
+ min_batches = kwargs.get('min_batches', 5)
239
+ if len(dataset_train) < effective_batch_size * min_batches:
240
+ logger.info(
241
+ f"Training with uniform sampler because dataset is too small: {len(dataset_train)} < {effective_batch_size * min_batches}"
242
+ )
243
+ sampler = torch.utils.data.RandomSampler(
244
+ dataset_train,
245
+ replacement=True,
246
+ num_samples=effective_batch_size * min_batches,
247
+ )
248
+ data_loader_train = DataLoader(
249
+ dataset_train,
250
+ batch_size=effective_batch_size,
251
+ collate_fn=utils.collate_fn,
252
+ num_workers=args.num_workers,
253
+ sampler=sampler,
254
+ )
255
+ else:
256
+ batch_sampler_train = torch.utils.data.BatchSampler(
257
+ sampler_train, effective_batch_size, drop_last=True)
258
+ data_loader_train = DataLoader(
259
+ dataset_train,
260
+ batch_sampler=batch_sampler_train,
261
+ collate_fn=utils.collate_fn,
262
+ num_workers=args.num_workers
263
+ )
264
+
265
+ data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val,
266
+ drop_last=False, collate_fn=utils.collate_fn,
267
+ num_workers=args.num_workers)
268
+ data_loader_test = DataLoader(dataset_test, args.batch_size, sampler=sampler_test,
269
+ drop_last=False, collate_fn=utils.collate_fn,
270
+ num_workers=args.num_workers)
271
+
272
+ base_ds = get_coco_api_from_dataset(dataset_val)
273
+ base_ds_test = get_coco_api_from_dataset(dataset_test)
274
+ if args.use_ema:
275
+ self.ema_m = ModelEma(model_without_ddp, decay=args.ema_decay, tau=args.ema_tau)
276
+ else:
277
+ self.ema_m = None
278
+
279
+
280
+ output_dir = Path(args.output_dir)
281
+
282
+ if utils.is_main_process():
283
+ print("Get benchmark")
284
+ if args.do_benchmark:
285
+ benchmark_model = copy.deepcopy(model_without_ddp)
286
+ bm = benchmark(benchmark_model.float(), dataset_val, output_dir)
287
+ print(json.dumps(bm, indent=2))
288
+ del benchmark_model
289
+
290
+ if args.resume:
291
+ checkpoint = torch.load(args.resume, map_location='cpu', weights_only=False)
292
+ model_without_ddp.load_state_dict(checkpoint['model'], strict=True)
293
+ if args.use_ema:
294
+ if 'ema_model' in checkpoint:
295
+ self.ema_m.module.load_state_dict(clean_state_dict(checkpoint['ema_model']))
296
+ else:
297
+ del self.ema_m
298
+ self.ema_m = ModelEma(model, decay=args.ema_decay, tau=args.ema_tau)
299
+ if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
300
+ optimizer.load_state_dict(checkpoint['optimizer'])
301
+ lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
302
+ args.start_epoch = checkpoint['epoch'] + 1
303
+
304
+ if args.eval:
305
+ test_stats, coco_evaluator = evaluate(
306
+ model, criterion, postprocessors, data_loader_val, base_ds, device, args)
307
+ if args.output_dir:
308
+ utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval.pth")
309
+ return
310
+
311
+ # for drop
312
+ total_batch_size = effective_batch_size * utils.get_world_size()
313
+ num_training_steps_per_epoch = (len(dataset_train) + total_batch_size - 1) // total_batch_size
314
+ schedules = {}
315
+ if args.dropout > 0:
316
+ schedules['do'] = drop_scheduler(
317
+ args.dropout, args.epochs, num_training_steps_per_epoch,
318
+ args.cutoff_epoch, args.drop_mode, args.drop_schedule)
319
+ print("Min DO = %.7f, Max DO = %.7f" % (min(schedules['do']), max(schedules['do'])))
320
+
321
+ if args.drop_path > 0:
322
+ schedules['dp'] = drop_scheduler(
323
+ args.drop_path, args.epochs, num_training_steps_per_epoch,
324
+ args.cutoff_epoch, args.drop_mode, args.drop_schedule)
325
+ print("Min DP = %.7f, Max DP = %.7f" % (min(schedules['dp']), max(schedules['dp'])))
326
+
327
+ print("Start training")
328
+ start_time = time.time()
329
+ best_map_holder = BestMetricHolder(use_ema=args.use_ema)
330
+ best_map_5095 = 0
331
+ best_map_50 = 0
332
+ best_map_ema_5095 = 0
333
+ best_map_ema_50 = 0
334
+ for epoch in range(args.start_epoch, args.epochs):
335
+ epoch_start_time = time.time()
336
+ if args.distributed:
337
+ sampler_train.set_epoch(epoch)
338
+
339
+ model.train()
340
+ criterion.train()
341
+ train_stats = train_one_epoch(
342
+ model, criterion, lr_scheduler, data_loader_train, optimizer, device, epoch,
343
+ effective_batch_size, args.clip_max_norm, ema_m=self.ema_m, schedules=schedules,
344
+ num_training_steps_per_epoch=num_training_steps_per_epoch,
345
+ vit_encoder_num_layers=args.vit_encoder_num_layers, args=args, callbacks=callbacks)
346
+ train_epoch_time = time.time() - epoch_start_time
347
+ train_epoch_time_str = str(datetime.timedelta(seconds=int(train_epoch_time)))
348
+ if args.output_dir:
349
+ checkpoint_paths = [output_dir / 'checkpoint.pth']
350
+ # extra checkpoint before LR drop and every `checkpoint_interval` epochs
351
+ if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % args.checkpoint_interval == 0:
352
+ checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
353
+ for checkpoint_path in checkpoint_paths:
354
+ weights = {
355
+ 'model': model_without_ddp.state_dict(),
356
+ 'optimizer': optimizer.state_dict(),
357
+ 'lr_scheduler': lr_scheduler.state_dict(),
358
+ 'epoch': epoch,
359
+ 'args': args,
360
+ }
361
+ if args.use_ema:
362
+ weights.update({
363
+ 'ema_model': self.ema_m.module.state_dict(),
364
+ })
365
+ if not args.dont_save_weights:
366
+ # create checkpoint dir
367
+ checkpoint_path.parent.mkdir(parents=True, exist_ok=True)
368
+
369
+ utils.save_on_master(weights, checkpoint_path)
370
+
371
+ with torch.inference_mode():
372
+ test_stats, coco_evaluator = evaluate(
373
+ model, criterion, postprocessors, data_loader_val, base_ds, device, args=args
374
+ )
375
+ map_regular = test_stats["coco_eval_bbox"][0]
376
+ _isbest = best_map_holder.update(map_regular, epoch, is_ema=False)
377
+ if _isbest:
378
+ best_map_5095 = max(best_map_5095, map_regular)
379
+ best_map_50 = max(best_map_50, test_stats["coco_eval_bbox"][1])
380
+ checkpoint_path = output_dir / 'checkpoint_best_regular.pth'
381
+ if not args.dont_save_weights:
382
+ utils.save_on_master({
383
+ 'model': model_without_ddp.state_dict(),
384
+ 'optimizer': optimizer.state_dict(),
385
+ 'lr_scheduler': lr_scheduler.state_dict(),
386
+ 'epoch': epoch,
387
+ 'args': args,
388
+ }, checkpoint_path)
389
+ log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
390
+ **{f'test_{k}': v for k, v in test_stats.items()},
391
+ 'epoch': epoch,
392
+ 'n_parameters': n_parameters}
393
+ if args.use_ema:
394
+ ema_test_stats, _ = evaluate(
395
+ self.ema_m.module, criterion, postprocessors, data_loader_val, base_ds, device, args=args
396
+ )
397
+ log_stats.update({f'ema_test_{k}': v for k,v in ema_test_stats.items()})
398
+ map_ema = ema_test_stats["coco_eval_bbox"][0]
399
+ best_map_ema_5095 = max(best_map_ema_5095, map_ema)
400
+ _isbest = best_map_holder.update(map_ema, epoch, is_ema=True)
401
+ if _isbest:
402
+ best_map_ema_50 = max(best_map_ema_50, ema_test_stats["coco_eval_bbox"][1])
403
+ checkpoint_path = output_dir / 'checkpoint_best_ema.pth'
404
+ if not args.dont_save_weights:
405
+ utils.save_on_master({
406
+ 'model': self.ema_m.module.state_dict(),
407
+ 'optimizer': optimizer.state_dict(),
408
+ 'lr_scheduler': lr_scheduler.state_dict(),
409
+ 'epoch': epoch,
410
+ 'args': args,
411
+ }, checkpoint_path)
412
+ log_stats.update(best_map_holder.summary())
413
+
414
+ # epoch parameters
415
+ ep_paras = {
416
+ 'epoch': epoch,
417
+ 'n_parameters': n_parameters
418
+ }
419
+ log_stats.update(ep_paras)
420
+ try:
421
+ log_stats.update({'now_time': str(datetime.datetime.now())})
422
+ except:
423
+ pass
424
+ log_stats['train_epoch_time'] = train_epoch_time_str
425
+ epoch_time = time.time() - epoch_start_time
426
+ epoch_time_str = str(datetime.timedelta(seconds=int(epoch_time)))
427
+ log_stats['epoch_time'] = epoch_time_str
428
+ if args.output_dir and utils.is_main_process():
429
+ with (output_dir / "log.txt").open("a") as f:
430
+ f.write(json.dumps(log_stats) + "\n")
431
+
432
+ # for evaluation logs
433
+ if coco_evaluator is not None:
434
+ (output_dir / 'eval').mkdir(exist_ok=True)
435
+ if "bbox" in coco_evaluator.coco_eval:
436
+ filenames = ['latest.pth']
437
+ if epoch % 50 == 0:
438
+ filenames.append(f'{epoch:03}.pth')
439
+ for name in filenames:
440
+ torch.save(coco_evaluator.coco_eval["bbox"].eval,
441
+ output_dir / "eval" / name)
442
+
443
+ for callback in callbacks["on_fit_epoch_end"]:
444
+ callback(log_stats)
445
+
446
+ if self.stop_early:
447
+ print(f"Early stopping requested, stopping at epoch {epoch}")
448
+ break
449
+
450
+ best_is_ema = best_map_ema_5095 > best_map_5095
451
+
452
+ if utils.is_main_process():
453
+ if best_is_ema:
454
+ shutil.copy2(output_dir / 'checkpoint_best_ema.pth', output_dir / 'checkpoint_best_total.pth')
455
+ else:
456
+ shutil.copy2(output_dir / 'checkpoint_best_regular.pth', output_dir / 'checkpoint_best_total.pth')
457
+
458
+ utils.strip_checkpoint(output_dir / 'checkpoint_best_total.pth')
459
+
460
+ best_map_5095 = max(best_map_5095, best_map_ema_5095)
461
+ if best_is_ema:
462
+ results = ema_test_stats["results_json"]
463
+ else:
464
+ results = test_stats["results_json"]
465
+
466
+ class_map = results["class_map"]
467
+ results["class_map"] = {"valid": class_map}
468
+ with open(output_dir / "results.json", "w") as f:
469
+ json.dump(results, f)
470
+
471
+ total_time = time.time() - start_time
472
+ total_time_str = str(datetime.timedelta(seconds=int(total_time)))
473
+ print('Training time {}'.format(total_time_str))
474
+ print('Results saved to {}'.format(output_dir / "results.json"))
475
+
476
+
477
+ if best_is_ema:
478
+ self.model = self.ema_m.module
479
+ self.model.eval()
480
+
481
+
482
+ if args.run_test:
483
+ best_state_dict = torch.load(output_dir / 'checkpoint_best_total.pth', map_location='cpu', weights_only=False)['model']
484
+ model.load_state_dict(best_state_dict)
485
+ model.eval()
486
+
487
+ test_stats, _ = evaluate(
488
+ model, criterion, postprocessors, data_loader_test, base_ds_test, device, args=args
489
+ )
490
+ print(f"Test results: {test_stats}")
491
+ with open(output_dir / "results.json", "r") as f:
492
+ results = json.load(f)
493
+ test_metrics = test_stats["results_json"]["class_map"]
494
+ results["class_map"]["test"] = test_metrics
495
+ with open(output_dir / "results.json", "w") as f:
496
+ json.dump(results, f)
497
+
498
+ for callback in callbacks["on_train_end"]:
499
+ callback()
500
+
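The loop above ends by copying the best (EMA or regular) checkpoint to checkpoint_best_total.pth and writing the final metrics to results.json, with the validation class_map stored under "valid" and, when run_test is enabled, the held-out metrics under "test". A minimal sketch of inspecting that file after a run; the path assumes the default output_dir of "output":

# Hedged sketch: read the results.json written at the end of training (path assumes the default output_dir).
import json
from pathlib import Path

with (Path("output") / "results.json").open() as f:
    results = json.load(f)

print(results["class_map"].keys())    # dict_keys(['valid']) or dict_keys(['valid', 'test'])
print(results["class_map"]["valid"])  # metrics dict produced by the evaluator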
501
+ def export(self, output_dir="output", infer_dir=None, simplify=False, backbone_only=False, opset_version=17, verbose=True, force=False, shape=None, batch_size=1, **kwargs):
502
+ """Export the trained model to ONNX format"""
503
+ print(f"Exporting model to ONNX format")
504
+ try:
505
+ from rfdetr.deploy.export import export_onnx, onnx_simplify, make_infer_image
506
+ except ImportError:
507
+ print("It seems some dependencies for ONNX export are missing. Please run `pip install rfdetr[onnxexport]` and try again.")
508
+ raise
509
+
510
+
511
+ device = self.device
512
+ model = deepcopy(self.model.to("cpu"))
513
+ model.to(device)
514
+
515
+ os.makedirs(output_dir, exist_ok=True)
516
+ output_dir = Path(output_dir)
517
+ if shape is None:
518
+ shape = (self.resolution, self.resolution)
519
+ else:
520
+ if shape[0] % 14 != 0 or shape[1] % 14 != 0:
521
+ raise ValueError("Shape must be divisible by 14")
522
+
523
+ input_tensors = make_infer_image(infer_dir, shape, batch_size, device).to(device)
524
+ input_names = ['input']
525
+ output_names = ['features'] if backbone_only else ['dets', 'labels']
526
+ dynamic_axes = None
527
+ self.model.eval()
528
+ with torch.no_grad():
529
+ if backbone_only:
530
+ features = model(input_tensors)
531
+ print(f"PyTorch inference output shape: {features.shape}")
532
+ else:
533
+ outputs = model(input_tensors)
534
+ dets = outputs['pred_boxes']
535
+ labels = outputs['pred_logits']
536
+ print(f"PyTorch inference output shapes - Boxes: {dets.shape}, Labels: {labels.shape}")
537
+ model.cpu()
538
+ input_tensors = input_tensors.cpu()
539
+
540
+ # Export to ONNX
541
+ output_file = export_onnx(
542
+ output_dir=output_dir,
543
+ model=model,
544
+ input_names=input_names,
545
+ input_tensors=input_tensors,
546
+ output_names=output_names,
547
+ dynamic_axes=dynamic_axes,
548
+ backbone_only=backbone_only,
549
+ verbose=verbose,
550
+ opset_version=opset_version
551
+ )
552
+
553
+ print(f"Successfully exported ONNX model to: {output_file}")
554
+
555
+ if simplify:
556
+ sim_output_file = onnx_simplify(
557
+ onnx_dir=output_file,
558
+ input_names=input_names,
559
+ input_tensors=input_tensors,
560
+ force=force
561
+ )
562
+ print(f"Successfully simplified ONNX model to: {sim_output_file}")
563
+
564
+ print("ONNX export completed successfully")
565
+ self.model = self.model.to(device)
566
+
567
+
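A minimal usage sketch for the export method above. It assumes the RFDETRBase wrapper exported by the package accepts a pretrain_weights checkpoint path and forwards export() to this method; the checkpoint path is a hypothetical example, and simplify=True needs the optional dependencies from pip install rfdetr[onnxexport]:

# Hedged sketch: export a trained checkpoint to ONNX.
from rfdetr import RFDETRBase

model = RFDETRBase(pretrain_weights="output/checkpoint_best_total.pth")  # hypothetical checkpoint path
model.export(output_dir="output", simplify=True)  # writes the ONNX file (and a simplified copy) under output/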
568
+ if __name__ == '__main__':
569
+ parser = argparse.ArgumentParser('LWDETR training and evaluation script', parents=[get_args_parser()])
570
+ args = parser.parse_args()
571
+
572
+ if args.output_dir:
573
+ Path(args.output_dir).mkdir(parents=True, exist_ok=True)
574
+
575
+ config = vars(args) # Convert Namespace to dictionary
576
+
577
+ if args.subcommand == 'distill':
578
+ distill(**config)
579
+ elif args.subcommand is None:
580
+ main(**config)
581
+ elif args.subcommand == 'export_model':
582
+ filter_keys = [
583
+ "num_classes",
584
+ "grad_accum_steps",
585
+ "lr",
586
+ "lr_encoder",
587
+ "weight_decay",
588
+ "epochs",
589
+ "lr_drop",
590
+ "clip_max_norm",
591
+ "lr_vit_layer_decay",
592
+ "lr_component_decay",
593
+ "dropout",
594
+ "drop_path",
595
+ "drop_mode",
596
+ "drop_schedule",
597
+ "cutoff_epoch",
598
+ "pretrained_encoder",
599
+ "pretrain_weights",
600
+ "pretrain_exclude_keys",
601
+ "pretrain_keys_modify_to_load",
602
+ "freeze_florence",
603
+ "freeze_aimv2",
604
+ "decoder_norm",
605
+ "set_cost_class",
606
+ "set_cost_bbox",
607
+ "set_cost_giou",
608
+ "cls_loss_coef",
609
+ "bbox_loss_coef",
610
+ "giou_loss_coef",
611
+ "focal_alpha",
612
+ "aux_loss",
613
+ "sum_group_losses",
614
+ "use_varifocal_loss",
615
+ "use_position_supervised_loss",
616
+ "ia_bce_loss",
617
+ "dataset_file",
618
+ "coco_path",
619
+ "dataset_dir",
620
+ "square_resize_div_64",
621
+ "output_dir",
622
+ "checkpoint_interval",
623
+ "seed",
624
+ "resume",
625
+ "start_epoch",
626
+ "eval",
627
+ "use_ema",
628
+ "ema_decay",
629
+ "ema_tau",
630
+ "num_workers",
631
+ "device",
632
+ "world_size",
633
+ "dist_url",
634
+ "sync_bn",
635
+ "fp16_eval",
636
+ "infer_dir",
637
+ "verbose",
638
+ "opset_version",
639
+ "dry_run",
640
+ "shape",
641
+ ]
642
+ for key in filter_keys:
643
+ config.pop(key, None) # Use pop with None to avoid KeyError
644
+
645
+ from rfdetr.deploy.export import main as export_main
646
+ if args.batch_size != 1:
647
+ config['batch_size'] = 1
648
+ print(f"Only batch_size 1 is supported for onnx export, \
649
+ f"but got batch_size = {args.batch_size}. batch_size is forcibly set to 1.")
650
+ export_main(**config)
651
+
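The dispatch above covers three paths: no subcommand calls main(**config), 'distill' calls distill(**config), and 'export_model' strips training-only keys and hands the rest to the deploy exporter. A hedged sketch of exercising the parser defined just below without a shell; the dataset path is hypothetical:

# Hedged sketch: drive get_args_parser programmatically.
parser = get_args_parser()

train_args = parser.parse_args(['--dataset_dir', 'datasets/my_coco_dataset', '--output_dir', 'output'])
print(train_args.subcommand)   # None -> the main(**config) path

export_args = parser.parse_args(['export_model', '--simplify', '--shape', '640', '640'])
print(export_args.subcommand)  # 'export_model' -> the export path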
652
+ def get_args_parser():
653
+ parser = argparse.ArgumentParser('Set transformer detector', add_help=False)
654
+ parser.add_argument('--num_classes', default=2, type=int)
655
+ parser.add_argument('--grad_accum_steps', default=1, type=int)
656
+ parser.add_argument('--amp', default=False, type=bool)
657
+ parser.add_argument('--lr', default=1e-4, type=float)
658
+ parser.add_argument('--lr_encoder', default=1.5e-4, type=float)
659
+ parser.add_argument('--batch_size', default=2, type=int)
660
+ parser.add_argument('--weight_decay', default=1e-4, type=float)
661
+ parser.add_argument('--epochs', default=12, type=int)
662
+ parser.add_argument('--lr_drop', default=11, type=int)
663
+ parser.add_argument('--clip_max_norm', default=0.1, type=float,
664
+ help='gradient clipping max norm')
665
+ parser.add_argument('--lr_vit_layer_decay', default=0.8, type=float)
666
+ parser.add_argument('--lr_component_decay', default=1.0, type=float)
667
+ parser.add_argument('--do_benchmark', action='store_true', help='benchmark the model')
668
+
669
+ # drop args
670
+ # dropout and stochastic depth drop rate; set at most one to non-zero
671
+ parser.add_argument('--dropout', type=float, default=0,
672
+ help='Dropout rate (default: 0.0)')
673
+ parser.add_argument('--drop_path', type=float, default=0,
674
+ help='Drop path rate (default: 0.0)')
675
+
676
+ # early / late dropout and stochastic depth settings
677
+ parser.add_argument('--drop_mode', type=str, default='standard',
678
+ choices=['standard', 'early', 'late'], help='drop mode')
679
+ parser.add_argument('--drop_schedule', type=str, default='constant',
680
+ choices=['constant', 'linear'],
681
+ help='drop schedule for early dropout / s.d. only')
682
+ parser.add_argument('--cutoff_epoch', type=int, default=0,
683
+ help='if drop_mode is early / late, this is the epoch where dropout ends / starts')
684
+
685
+ # Model parameters
686
+ parser.add_argument('--pretrained_encoder', type=str, default=None,
687
+ help="Path to the pretrained encoder.")
688
+ parser.add_argument('--pretrain_weights', type=str, default=None,
689
+ help="Path to the pretrained model.")
690
+ parser.add_argument('--pretrain_exclude_keys', type=str, default=None, nargs='+',
691
+ help="Keys you do not want to load.")
692
+ parser.add_argument('--pretrain_keys_modify_to_load', type=str, default=None, nargs='+',
693
+ help="Keys you want to modify to load. Only used when loading objects365 pre-trained weights.")
694
+
695
+ # * Backbone
696
+ parser.add_argument('--encoder', default='vit_tiny', type=str,
697
+ help="Name of the transformer or convolutional encoder to use")
698
+ parser.add_argument('--vit_encoder_num_layers', default=12, type=int,
699
+ help="Number of layers used in ViT encoder")
700
+ parser.add_argument('--window_block_indexes', default=None, type=int, nargs='+')
701
+ parser.add_argument('--position_embedding', default='sine', type=str,
702
+ choices=('sine', 'learned'),
703
+ help="Type of positional embedding to use on top of the image features")
704
+ parser.add_argument('--out_feature_indexes', default=[-1], type=int, nargs='+', help='indexes of the encoder blocks whose features are used (ViT encoders only for now)')
705
+ parser.add_argument("--freeze_encoder", action="store_true", dest="freeze_encoder")
706
+ parser.add_argument("--layer_norm", action="store_true", dest="layer_norm")
707
+ parser.add_argument("--rms_norm", action="store_true", dest="rms_norm")
708
+ parser.add_argument("--backbone_lora", action="store_true", dest="backbone_lora")
709
+ parser.add_argument("--force_no_pretrain", action="store_true", dest="force_no_pretrain")
710
+
711
+ # * Transformer
712
+ parser.add_argument('--dec_layers', default=3, type=int,
713
+ help="Number of decoding layers in the transformer")
714
+ parser.add_argument('--dim_feedforward', default=2048, type=int,
715
+ help="Intermediate size of the feedforward layers in the transformer blocks")
716
+ parser.add_argument('--hidden_dim', default=256, type=int,
717
+ help="Size of the embeddings (dimension of the transformer)")
718
+ parser.add_argument('--sa_nheads', default=8, type=int,
719
+ help="Number of attention heads inside the transformer's self-attentions")
720
+ parser.add_argument('--ca_nheads', default=8, type=int,
721
+ help="Number of attention heads inside the transformer's cross-attentions")
722
+ parser.add_argument('--num_queries', default=300, type=int,
723
+ help="Number of query slots")
724
+ parser.add_argument('--group_detr', default=13, type=int,
725
+ help="Number of groups to speed up detr training")
726
+ parser.add_argument('--two_stage', action='store_true')
727
+ parser.add_argument('--projector_scale', default='P4', type=str, nargs='+', choices=('P3', 'P4', 'P5', 'P6'))
728
+ parser.add_argument('--lite_refpoint_refine', action='store_true', help='lite refpoint refine mode for speed-up')
729
+ parser.add_argument('--num_select', default=100, type=int,
730
+ help='the number of predictions selected for evaluation')
731
+ parser.add_argument('--dec_n_points', default=4, type=int,
732
+ help='the number of sampling points')
733
+ parser.add_argument('--decoder_norm', default='LN', type=str)
734
+ parser.add_argument('--bbox_reparam', action='store_true')
735
+ parser.add_argument('--freeze_batch_norm', action='store_true')
736
+ # * Matcher
737
+ parser.add_argument('--set_cost_class', default=2, type=float,
738
+ help="Class coefficient in the matching cost")
739
+ parser.add_argument('--set_cost_bbox', default=5, type=float,
740
+ help="L1 box coefficient in the matching cost")
741
+ parser.add_argument('--set_cost_giou', default=2, type=float,
742
+ help="giou box coefficient in the matching cost")
743
+
744
+ # * Loss coefficients
745
+ parser.add_argument('--cls_loss_coef', default=2, type=float)
746
+ parser.add_argument('--bbox_loss_coef', default=5, type=float)
747
+ parser.add_argument('--giou_loss_coef', default=2, type=float)
748
+ parser.add_argument('--focal_alpha', default=0.25, type=float)
749
+
750
+ # Loss
751
+ parser.add_argument('--no_aux_loss', dest='aux_loss', action='store_false',
752
+ help="Disables auxiliary decoding losses (loss at each layer)")
753
+ parser.add_argument('--sum_group_losses', action='store_true',
754
+ help="To sum losses across groups or mean losses.")
755
+ parser.add_argument('--use_varifocal_loss', action='store_true')
756
+ parser.add_argument('--use_position_supervised_loss', action='store_true')
757
+ parser.add_argument('--ia_bce_loss', action='store_true')
758
+
759
+ # dataset parameters
760
+ parser.add_argument('--dataset_file', default='coco')
761
+ parser.add_argument('--coco_path', type=str)
762
+ parser.add_argument('--dataset_dir', type=str)
763
+ parser.add_argument('--square_resize_div_64', action='store_true')
764
+
765
+ parser.add_argument('--output_dir', default='output',
766
+ help='path where to save, empty for no saving')
767
+ parser.add_argument('--dont_save_weights', action='store_true')
768
+ parser.add_argument('--checkpoint_interval', default=10, type=int,
769
+ help='epoch interval to save checkpoint')
770
+ parser.add_argument('--seed', default=42, type=int)
771
+ parser.add_argument('--resume', default='', help='resume from checkpoint')
772
+ parser.add_argument('--start_epoch', default=0, type=int, metavar='N',
773
+ help='start epoch')
774
+ parser.add_argument('--eval', action='store_true')
775
+ parser.add_argument('--use_ema', action='store_true')
776
+ parser.add_argument('--ema_decay', default=0.9997, type=float)
777
+ parser.add_argument('--ema_tau', default=0, type=float)
778
+
779
+ parser.add_argument('--num_workers', default=2, type=int)
780
+
781
+ # distributed training parameters
782
+ parser.add_argument('--device', default='cuda',
783
+ help='device to use for training / testing')
784
+ parser.add_argument('--world_size', default=1, type=int,
785
+ help='number of distributed processes')
786
+ parser.add_argument('--dist_url', default='env://',
787
+ help='url used to set up distributed training')
788
+ parser.add_argument('--sync_bn', default=True, type=bool,
789
+ help='setup synchronized BatchNorm for distributed training')
790
+
791
+ # fp16
792
+ parser.add_argument('--fp16_eval', default=False, action='store_true',
793
+ help='evaluate in fp16 precision.')
794
+
795
+ # custom args
796
+ parser.add_argument('--encoder_only', action='store_true', help='Export and benchmark encoder only')
797
+ parser.add_argument('--backbone_only', action='store_true', help='Export and benchmark backbone only')
798
+ parser.add_argument('--resolution', type=int, default=640, help="input resolution")
799
+ parser.add_argument('--use_cls_token', action='store_true', help='use cls token')
800
+ parser.add_argument('--multi_scale', action='store_true', help='use multi scale')
801
+ parser.add_argument('--expanded_scales', action='store_true', help='use expanded scales')
802
+ parser.add_argument('--do_random_resize_via_padding', action='store_true', help='use random resize via padding')
803
+ parser.add_argument('--warmup_epochs', default=1, type=float,
804
+ help='Number of warmup epochs for linear warmup before cosine annealing')
805
+ # Add scheduler type argument: 'step' or 'cosine'
806
+ parser.add_argument(
807
+ '--lr_scheduler',
808
+ default='step',
809
+ choices=['step', 'cosine'],
810
+ help="Type of learning rate scheduler to use: 'step' (default) or 'cosine'"
811
+ )
812
+ parser.add_argument('--lr_min_factor', default=0.0, type=float,
813
+ help='Minimum learning rate factor (as a fraction of initial lr) at the end of cosine annealing')
814
+ # Early stopping parameters
815
+ parser.add_argument('--early_stopping', action='store_true',
816
+ help='Enable early stopping based on mAP improvement')
817
+ parser.add_argument('--early_stopping_patience', default=10, type=int,
818
+ help='Number of epochs with no improvement after which training will be stopped')
819
+ parser.add_argument('--early_stopping_min_delta', default=0.001, type=float,
820
+ help='Minimum change in mAP to qualify as an improvement')
821
+ parser.add_argument('--early_stopping_use_ema', action='store_true',
822
+ help='Use EMA model metrics for early stopping')
823
+ # subparsers
824
+ subparsers = parser.add_subparsers(title='sub-commands', dest='subcommand',
825
+ description='valid subcommands', help='additional help')
826
+
827
+ # subparser for export model
828
+ parser_export = subparsers.add_parser('export_model', help='LWDETR model export')
829
+ parser_export.add_argument('--infer_dir', type=str, default=None)
830
+ parser_export.add_argument('--verbose', type=ast.literal_eval, default=False, nargs="?", const=True)
831
+ parser_export.add_argument('--opset_version', type=int, default=17)
832
+ parser_export.add_argument('--simplify', action='store_true', help="Simplify onnx model")
833
+ parser_export.add_argument('--tensorrt', '--trtexec', '--trt', action='store_true',
834
+ help="build tensorrt engine")
835
+ parser_export.add_argument('--dry-run', '--test', '-t', action='store_true', help="just print command")
836
+ parser_export.add_argument('--profile', action='store_true', help='Run nsys profiling during TensorRT export')
837
+ parser_export.add_argument('--shape', type=int, nargs=2, default=(640, 640), help="input shape (width, height)")
838
+ return parser
839
+
840
+ def populate_args(
841
+ # Basic training parameters
842
+ num_classes=2,
843
+ grad_accum_steps=1,
844
+ amp=False,
845
+ lr=1e-4,
846
+ lr_encoder=1.5e-4,
847
+ batch_size=2,
848
+ weight_decay=1e-4,
849
+ epochs=12,
850
+ lr_drop=11,
851
+ clip_max_norm=0.1,
852
+ lr_vit_layer_decay=0.8,
853
+ lr_component_decay=1.0,
854
+ do_benchmark=False,
855
+
856
+ # Drop parameters
857
+ dropout=0,
858
+ drop_path=0,
859
+ drop_mode='standard',
860
+ drop_schedule='constant',
861
+ cutoff_epoch=0,
862
+
863
+ # Model parameters
864
+ pretrained_encoder=None,
865
+ pretrain_weights=None,
866
+ pretrain_exclude_keys=None,
867
+ pretrain_keys_modify_to_load=None,
868
+ pretrained_distiller=None,
869
+
870
+ # Backbone parameters
871
+ encoder='vit_tiny',
872
+ vit_encoder_num_layers=12,
873
+ window_block_indexes=None,
874
+ position_embedding='sine',
875
+ out_feature_indexes=[-1],
876
+ freeze_encoder=False,
877
+ layer_norm=False,
878
+ rms_norm=False,
879
+ backbone_lora=False,
880
+ force_no_pretrain=False,
881
+
882
+ # Transformer parameters
883
+ dec_layers=3,
884
+ dim_feedforward=2048,
885
+ hidden_dim=256,
886
+ sa_nheads=8,
887
+ ca_nheads=8,
888
+ num_queries=300,
889
+ group_detr=13,
890
+ two_stage=False,
891
+ projector_scale='P4',
892
+ lite_refpoint_refine=False,
893
+ num_select=100,
894
+ dec_n_points=4,
895
+ decoder_norm='LN',
896
+ bbox_reparam=False,
897
+ freeze_batch_norm=False,
898
+
899
+ # Matcher parameters
900
+ set_cost_class=2,
901
+ set_cost_bbox=5,
902
+ set_cost_giou=2,
903
+
904
+ # Loss coefficients
905
+ cls_loss_coef=2,
906
+ bbox_loss_coef=5,
907
+ giou_loss_coef=2,
908
+ focal_alpha=0.25,
909
+ aux_loss=True,
910
+ sum_group_losses=False,
911
+ use_varifocal_loss=False,
912
+ use_position_supervised_loss=False,
913
+ ia_bce_loss=False,
914
+
915
+ # Dataset parameters
916
+ dataset_file='coco',
917
+ coco_path=None,
918
+ dataset_dir=None,
919
+ square_resize_div_64=False,
920
+
921
+ # Output parameters
922
+ output_dir='output',
923
+ dont_save_weights=False,
924
+ checkpoint_interval=10,
925
+ seed=42,
926
+ resume='',
927
+ start_epoch=0,
928
+ eval=False,
929
+ use_ema=False,
930
+ ema_decay=0.9997,
931
+ ema_tau=0,
932
+ num_workers=2,
933
+
934
+ # Distributed training parameters
935
+ device='cuda',
936
+ world_size=1,
937
+ dist_url='env://',
938
+ sync_bn=True,
939
+
940
+ # FP16
941
+ fp16_eval=False,
942
+
943
+ # Custom args
944
+ encoder_only=False,
945
+ backbone_only=False,
946
+ resolution=640,
947
+ use_cls_token=False,
948
+ multi_scale=False,
949
+ expanded_scales=False,
950
+ do_random_resize_via_padding=False,
951
+ warmup_epochs=1,
952
+ lr_scheduler='step',
953
+ lr_min_factor=0.0,
954
+ # Early stopping parameters
955
+ early_stopping=True,
956
+ early_stopping_patience=10,
957
+ early_stopping_min_delta=0.001,
958
+ early_stopping_use_ema=False,
959
+ gradient_checkpointing=False,
960
+ # Additional
961
+ subcommand=None,
962
+ **extra_kwargs # To handle any unexpected arguments
963
+ ):
964
+ args = argparse.Namespace(
965
+ num_classes=num_classes,
966
+ grad_accum_steps=grad_accum_steps,
967
+ amp=amp,
968
+ lr=lr,
969
+ lr_encoder=lr_encoder,
970
+ batch_size=batch_size,
971
+ weight_decay=weight_decay,
972
+ epochs=epochs,
973
+ lr_drop=lr_drop,
974
+ clip_max_norm=clip_max_norm,
975
+ lr_vit_layer_decay=lr_vit_layer_decay,
976
+ lr_component_decay=lr_component_decay,
977
+ do_benchmark=do_benchmark,
978
+ dropout=dropout,
979
+ drop_path=drop_path,
980
+ drop_mode=drop_mode,
981
+ drop_schedule=drop_schedule,
982
+ cutoff_epoch=cutoff_epoch,
983
+ pretrained_encoder=pretrained_encoder,
984
+ pretrain_weights=pretrain_weights,
985
+ pretrain_exclude_keys=pretrain_exclude_keys,
986
+ pretrain_keys_modify_to_load=pretrain_keys_modify_to_load,
987
+ pretrained_distiller=pretrained_distiller,
988
+ encoder=encoder,
989
+ vit_encoder_num_layers=vit_encoder_num_layers,
990
+ window_block_indexes=window_block_indexes,
991
+ position_embedding=position_embedding,
992
+ out_feature_indexes=out_feature_indexes,
993
+ freeze_encoder=freeze_encoder,
994
+ layer_norm=layer_norm,
995
+ rms_norm=rms_norm,
996
+ backbone_lora=backbone_lora,
997
+ force_no_pretrain=force_no_pretrain,
998
+ dec_layers=dec_layers,
999
+ dim_feedforward=dim_feedforward,
1000
+ hidden_dim=hidden_dim,
1001
+ sa_nheads=sa_nheads,
1002
+ ca_nheads=ca_nheads,
1003
+ num_queries=num_queries,
1004
+ group_detr=group_detr,
1005
+ two_stage=two_stage,
1006
+ projector_scale=projector_scale,
1007
+ lite_refpoint_refine=lite_refpoint_refine,
1008
+ num_select=num_select,
1009
+ dec_n_points=dec_n_points,
1010
+ decoder_norm=decoder_norm,
1011
+ bbox_reparam=bbox_reparam,
1012
+ freeze_batch_norm=freeze_batch_norm,
1013
+ set_cost_class=set_cost_class,
1014
+ set_cost_bbox=set_cost_bbox,
1015
+ set_cost_giou=set_cost_giou,
1016
+ cls_loss_coef=cls_loss_coef,
1017
+ bbox_loss_coef=bbox_loss_coef,
1018
+ giou_loss_coef=giou_loss_coef,
1019
+ focal_alpha=focal_alpha,
1020
+ aux_loss=aux_loss,
1021
+ sum_group_losses=sum_group_losses,
1022
+ use_varifocal_loss=use_varifocal_loss,
1023
+ use_position_supervised_loss=use_position_supervised_loss,
1024
+ ia_bce_loss=ia_bce_loss,
1025
+ dataset_file=dataset_file,
1026
+ coco_path=coco_path,
1027
+ dataset_dir=dataset_dir,
1028
+ square_resize_div_64=square_resize_div_64,
1029
+ output_dir=output_dir,
1030
+ dont_save_weights=dont_save_weights,
1031
+ checkpoint_interval=checkpoint_interval,
1032
+ seed=seed,
1033
+ resume=resume,
1034
+ start_epoch=start_epoch,
1035
+ eval=eval,
1036
+ use_ema=use_ema,
1037
+ ema_decay=ema_decay,
1038
+ ema_tau=ema_tau,
1039
+ num_workers=num_workers,
1040
+ device=device,
1041
+ world_size=world_size,
1042
+ dist_url=dist_url,
1043
+ sync_bn=sync_bn,
1044
+ fp16_eval=fp16_eval,
1045
+ encoder_only=encoder_only,
1046
+ backbone_only=backbone_only,
1047
+ resolution=resolution,
1048
+ use_cls_token=use_cls_token,
1049
+ multi_scale=multi_scale,
1050
+ expanded_scales=expanded_scales,
1051
+ do_random_resize_via_padding=do_random_resize_via_padding,
1052
+ warmup_epochs=warmup_epochs,
1053
+ lr_scheduler=lr_scheduler,
1054
+ lr_min_factor=lr_min_factor,
1055
+ early_stopping=early_stopping,
1056
+ early_stopping_patience=early_stopping_patience,
1057
+ early_stopping_min_delta=early_stopping_min_delta,
1058
+ early_stopping_use_ema=early_stopping_use_ema,
1059
+ gradient_checkpointing=gradient_checkpointing,
1060
+ **extra_kwargs
1061
+ )
1062
+ return args
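populate_args mirrors the CLI defaults as keyword arguments so callers can build the same configuration without argparse. A minimal sketch, assuming this module is importable as rfdetr.main; the dataset path is hypothetical:

# Hedged sketch: build the training Namespace programmatically.
from rfdetr.main import populate_args  # assumed import path for this module

args = populate_args(
    dataset_dir="datasets/my_coco_dataset",  # hypothetical dataset location
    output_dir="output",
    epochs=12,
    use_ema=True,
)
print(args.lr, args.lr_scheduler, args.early_stopping)  # 0.0001 step True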
rfdetr/models/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ # ------------------------------------------------------------------------
2
+ # RF-DETR
3
+ # Copyright (c) 2025 Roboflow. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------
6
+ # Copied from LW-DETR (https://github.com/Atten4Vis/LW-DETR)
7
+ # Copyright (c) 2024 Baidu. All Rights Reserved.
8
+ # ------------------------------------------------------------------------
9
+ # Copied from Conditional DETR (https://github.com/Atten4Vis/ConditionalDETR)
10
+ # Copyright (c) 2021 Microsoft. All Rights Reserved.
11
+ # ------------------------------------------------------------------------
12
+ # Copied from DETR (https://github.com/facebookresearch/detr)
13
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
14
+ # ------------------------------------------------------------------------
15
+
16
+ from .lwdetr import build_model, build_criterion_and_postprocessors
rfdetr/models/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (309 Bytes). View file
 
rfdetr/models/__pycache__/lwdetr.cpython-313.pyc ADDED
Binary file (39.9 kB). View file
 
rfdetr/models/__pycache__/matcher.cpython-313.pyc ADDED
Binary file (6.68 kB). View file
 
rfdetr/models/__pycache__/position_encoding.cpython-313.pyc ADDED
Binary file (8.77 kB). View file
 
rfdetr/models/__pycache__/transformer.cpython-313.pyc ADDED
Binary file (29.4 kB). View file
 
rfdetr/models/backbone/__init__.py ADDED
@@ -0,0 +1,110 @@
1
+ # ------------------------------------------------------------------------
2
+ # RF-DETR
3
+ # Copyright (c) 2025 Roboflow. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------
6
+ # Modified from LW-DETR (https://github.com/Atten4Vis/LW-DETR)
7
+ # Copyright (c) 2024 Baidu. All Rights Reserved.
8
+ # ------------------------------------------------------------------------
9
+
10
+ from typing import Dict, List
11
+
12
+ import torch
13
+ from torch import nn
14
+
15
+ from rfdetr.util.misc import NestedTensor
16
+ from rfdetr.models.position_encoding import build_position_encoding
17
+ from rfdetr.models.backbone.backbone import *
18
+ from typing import Callable
19
+
20
+ class Joiner(nn.Sequential):
21
+ def __init__(self, backbone, position_embedding):
22
+ super().__init__(backbone, position_embedding)
23
+ self._export = False
24
+
25
+ def forward(self, tensor_list: NestedTensor):
26
+ """ """
27
+ x = self[0](tensor_list)
28
+ pos = []
29
+ for x_ in x:
30
+ pos.append(self[1](x_, align_dim_orders=False).to(x_.tensors.dtype))
31
+ return x, pos
32
+
33
+ def export(self):
34
+ self._export = True
35
+ self._forward_origin = self.forward
36
+ self.forward = self.forward_export
37
+ for name, m in self.named_modules():
38
+ if (
39
+ hasattr(m, "export")
40
+ and isinstance(m.export, Callable)
41
+ and hasattr(m, "_export")
42
+ and not m._export
43
+ ):
44
+ m.export()
45
+
46
+ def forward_export(self, inputs: torch.Tensor):
47
+ feats, masks = self[0](inputs)
48
+ poss = []
49
+ for feat, mask in zip(feats, masks):
50
+ poss.append(self[1](mask, align_dim_orders=False).to(feat.dtype))
51
+ return feats, None, poss
52
+
53
+
54
+ def build_backbone(
55
+ encoder,
56
+ vit_encoder_num_layers,
57
+ pretrained_encoder,
58
+ window_block_indexes,
59
+ drop_path,
60
+ out_channels,
61
+ out_feature_indexes,
62
+ projector_scale,
63
+ use_cls_token,
64
+ hidden_dim,
65
+ position_embedding,
66
+ freeze_encoder,
67
+ layer_norm,
68
+ target_shape,
69
+ rms_norm,
70
+ backbone_lora,
71
+ force_no_pretrain,
72
+ gradient_checkpointing,
73
+ load_dinov2_weights,
74
+ patch_size,
75
+ num_windows,
76
+ positional_encoding_size,
77
+ ):
78
+ """
79
+ Useful args:
+ - encoder: encoder name, e.g. "dinov2_windowed_small" or "dinov2_registers_windowed_base"
+ - out_feature_indexes: indexes of the encoder blocks whose features are used
+ - projector_scale: output scales of the multi-scale projector (P3/P4/P5/P6, ascending)
+ - hidden_dim / position_embedding: configuration of the positional encoding
84
+
85
+ """
86
+ position_embedding = build_position_encoding(hidden_dim, position_embedding)
87
+
88
+ backbone = Backbone(
89
+ encoder,
90
+ pretrained_encoder,
91
+ window_block_indexes=window_block_indexes,
92
+ drop_path=drop_path,
93
+ out_channels=out_channels,
94
+ out_feature_indexes=out_feature_indexes,
95
+ projector_scale=projector_scale,
96
+ use_cls_token=use_cls_token,
97
+ layer_norm=layer_norm,
98
+ freeze_encoder=freeze_encoder,
99
+ target_shape=target_shape,
100
+ rms_norm=rms_norm,
101
+ backbone_lora=backbone_lora,
102
+ gradient_checkpointing=gradient_checkpointing,
103
+ load_dinov2_weights=load_dinov2_weights,
104
+ patch_size=patch_size,
105
+ num_windows=num_windows,
106
+ positional_encoding_size=positional_encoding_size,
107
+ )
108
+
109
+ model = Joiner(backbone, position_embedding)
110
+ return model
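For reference, a hedged sketch of building the Joiner via build_backbone and running it on a padded batch. Every value below is an illustrative assumption rather than an official default; load_dinov2_weights=False skips the pretrained-weight download, and NestedTensor is the helper imported at the top of this file:

# Hedged sketch: construct and run the backbone + positional-encoding Joiner.
import torch
from rfdetr.models.backbone import build_backbone  # assumed import path
from rfdetr.util.misc import NestedTensor

joiner = build_backbone(
    encoder="dinov2_windowed_small",    # name format parsed in Backbone: dinov2[_registers][_windowed]_<size>
    vit_encoder_num_layers=12,
    pretrained_encoder=None,
    window_block_indexes=None,
    drop_path=0.0,
    out_channels=256,
    out_feature_indexes=[2, 5, 8, 11],  # illustrative choice of encoder blocks
    projector_scale=["P4"],
    use_cls_token=False,
    hidden_dim=256,
    position_embedding="sine",
    freeze_encoder=False,
    layer_norm=False,
    target_shape=(560, 560),
    rms_norm=False,
    backbone_lora=False,
    force_no_pretrain=False,
    gradient_checkpointing=False,
    load_dinov2_weights=False,          # avoid downloading DINOv2 weights for this sketch
    patch_size=14,
    num_windows=4,
    positional_encoding_size=37,
)

# 560 is divisible by patch_size * num_windows (14 * 4 = 56), as the DINOv2 encoder requires.
images = torch.randn(2, 3, 560, 560)
masks = torch.zeros(2, 560, 560, dtype=torch.bool)  # True marks padded pixels
features, pos = joiner(NestedTensor(images, masks))
for feat, p in zip(features, pos):
    print(feat.tensors.shape, p.shape)  # one feature map and one positional encoding per projector scale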
rfdetr/models/backbone/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (3.85 kB). View file
 
rfdetr/models/backbone/__pycache__/backbone.cpython-313.pyc ADDED
Binary file (8.13 kB). View file
 
rfdetr/models/backbone/__pycache__/base.cpython-313.pyc ADDED
Binary file (1.05 kB). View file
 
rfdetr/models/backbone/__pycache__/dinov2.cpython-313.pyc ADDED
Binary file (8.66 kB). View file
 
rfdetr/models/backbone/__pycache__/dinov2_with_windowed_attn.cpython-313.pyc ADDED
Binary file (61.2 kB). View file
 
rfdetr/models/backbone/__pycache__/projector.cpython-313.pyc ADDED
Binary file (15 kB). View file
 
rfdetr/models/backbone/backbone.py ADDED
@@ -0,0 +1,205 @@
1
+ # ------------------------------------------------------------------------
2
+ # RF-DETR
3
+ # Copyright (c) 2025 Roboflow. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------
6
+ # Modified from LW-DETR (https://github.com/Atten4Vis/LW-DETR)
7
+ # Copyright (c) 2024 Baidu. All Rights Reserved.
8
+ # ------------------------------------------------------------------------
9
+ # Modified from Conditional DETR (https://github.com/Atten4Vis/ConditionalDETR)
10
+ # Copyright (c) 2021 Microsoft. All Rights Reserved.
11
+ # ------------------------------------------------------------------------
12
+ # Copied from DETR (https://github.com/facebookresearch/detr)
13
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
14
+ # ------------------------------------------------------------------------
15
+
16
+ """
17
+ Backbone modules.
18
+ """
19
+ from functools import partial
20
+ import torch
21
+ import torch.nn.functional as F
22
+ from torch import nn
23
+
24
+ from transformers import AutoModel, AutoProcessor, AutoModelForCausalLM, AutoConfig, AutoBackbone
25
+ from peft import LoraConfig, get_peft_model, PeftModel
26
+
27
+ from rfdetr.util.misc import NestedTensor, is_main_process
28
+
29
+ from rfdetr.models.backbone.base import BackboneBase
30
+ from rfdetr.models.backbone.projector import MultiScaleProjector
31
+ from rfdetr.models.backbone.dinov2 import DinoV2
32
+
33
+ __all__ = ["Backbone"]
34
+
35
+
36
+ class Backbone(BackboneBase):
37
+ """backbone."""
38
+ def __init__(self,
39
+ name: str,
40
+ pretrained_encoder: str=None,
41
+ window_block_indexes: list=None,
42
+ drop_path=0.0,
43
+ out_channels=256,
44
+ out_feature_indexes: list=None,
45
+ projector_scale: list=None,
46
+ use_cls_token: bool = False,
47
+ freeze_encoder: bool = False,
48
+ layer_norm: bool = False,
49
+ target_shape: tuple[int, int] = (640, 640),
50
+ rms_norm: bool = False,
51
+ backbone_lora: bool = False,
52
+ gradient_checkpointing: bool = False,
53
+ load_dinov2_weights: bool = True,
54
+ patch_size: int = 14,
55
+ num_windows: int = 4,
56
+ positional_encoding_size: int = 37,
57
+ ):
58
+ super().__init__()
59
+ # an example name here would be "dinov2_base" or "dinov2_registers_windowed_base"
60
+ # if "registers" is in the name, then use_registers is set to True, otherwise it is set to False
61
+ # similarly, if "windowed" is in the name, then use_windowed_attn is set to True, otherwise it is set to False
62
+ # the last part of the name should be the size
63
+ # and the start should be dinov2
64
+ name_parts = name.split("_")
65
+ assert name_parts[0] == "dinov2"
66
+ size = name_parts[-1]
67
+ use_registers = False
68
+ if "registers" in name_parts:
69
+ use_registers = True
70
+ name_parts.remove("registers")
71
+ use_windowed_attn = False
72
+ if "windowed" in name_parts:
73
+ use_windowed_attn = True
74
+ name_parts.remove("windowed")
75
+ assert len(name_parts) == 2, "name should be dinov2, then either registers, windowed, both, or none, then the size"
76
+ self.encoder = DinoV2(
77
+ size=name_parts[-1],
78
+ out_feature_indexes=out_feature_indexes,
79
+ shape=target_shape,
80
+ use_registers=use_registers,
81
+ use_windowed_attn=use_windowed_attn,
82
+ gradient_checkpointing=gradient_checkpointing,
83
+ load_dinov2_weights=load_dinov2_weights,
84
+ patch_size=patch_size,
85
+ num_windows=num_windows,
86
+ positional_encoding_size=positional_encoding_size,
87
+ )
88
+ # build encoder + projector as backbone module
89
+ if freeze_encoder:
90
+ for param in self.encoder.parameters():
91
+ param.requires_grad = False
92
+
93
+ self.projector_scale = projector_scale
94
+ assert len(self.projector_scale) > 0
95
+ # x[0]
96
+ assert (
97
+ sorted(self.projector_scale) == self.projector_scale
98
+ ), "only support projector scale P3/P4/P5/P6 in ascending order."
99
+ level2scalefactor = dict(P3=2.0, P4=1.0, P5=0.5, P6=0.25)
100
+ scale_factors = [level2scalefactor[lvl] for lvl in self.projector_scale]
101
+
102
+ self.projector = MultiScaleProjector(
103
+ in_channels=self.encoder._out_feature_channels,
104
+ out_channels=out_channels,
105
+ scale_factors=scale_factors,
106
+ layer_norm=layer_norm,
107
+ rms_norm=rms_norm,
108
+ )
109
+
110
+ self._export = False
111
+
112
+ def export(self):
113
+ self._export = True
114
+ self._forward_origin = self.forward
115
+ self.forward = self.forward_export
116
+
117
+ if isinstance(self.encoder, PeftModel):
118
+ print("Merging and unloading LoRA weights")
119
+ self.encoder.merge_and_unload()
120
+
121
+ def forward(self, tensor_list: NestedTensor):
122
+ """ """
123
+ # (H, W, B, C)
124
+ feats = self.encoder(tensor_list.tensors)
125
+ feats = self.projector(feats)
126
+ # x: [(B, C, H, W)]
127
+ out = []
128
+ for feat in feats:
129
+ m = tensor_list.mask
130
+ assert m is not None
131
+ mask = F.interpolate(m[None].float(), size=feat.shape[-2:]).to(torch.bool)[
132
+ 0
133
+ ]
134
+ out.append(NestedTensor(feat, mask))
135
+ return out
136
+
137
+ def forward_export(self, tensors: torch.Tensor):
138
+ feats = self.encoder(tensors)
139
+ feats = self.projector(feats)
140
+ out_feats = []
141
+ out_masks = []
142
+ for feat in feats:
143
+ # x: [(B, C, H, W)]
144
+ b, _, h, w = feat.shape
145
+ out_masks.append(
146
+ torch.zeros((b, h, w), dtype=torch.bool, device=feat.device)
147
+ )
148
+ out_feats.append(feat)
149
+ return out_feats, out_masks
150
+
151
+ def get_named_param_lr_pairs(self, args, prefix: str = "backbone.0"):
152
+ num_layers = args.out_feature_indexes[-1] + 1
153
+ backbone_key = "backbone.0.encoder"
154
+ named_param_lr_pairs = {}
155
+ for n, p in self.named_parameters():
156
+ n = prefix + "." + n
157
+ if backbone_key in n and p.requires_grad:
158
+ lr = (
159
+ args.lr_encoder
160
+ * get_dinov2_lr_decay_rate(
161
+ n,
162
+ lr_decay_rate=args.lr_vit_layer_decay,
163
+ num_layers=num_layers,
164
+ )
165
+ * args.lr_component_decay**2
166
+ )
167
+ wd = args.weight_decay * get_dinov2_weight_decay_rate(n)
168
+ named_param_lr_pairs[n] = {
169
+ "params": p,
170
+ "lr": lr,
171
+ "weight_decay": wd,
172
+ }
173
+ return named_param_lr_pairs
174
+
175
+
176
+ def get_dinov2_lr_decay_rate(name, lr_decay_rate=1.0, num_layers=12):
177
+ """
178
+ Calculate lr decay rate for different ViT blocks.
179
+
180
+ Args:
181
+ name (string): parameter name.
182
+ lr_decay_rate (float): base lr decay rate.
183
+ num_layers (int): number of ViT blocks.
184
+ Returns:
185
+ lr decay rate for the given parameter.
186
+ """
187
+ layer_id = num_layers + 1
188
+ if name.startswith("backbone"):
189
+ if "embeddings" in name:
190
+ layer_id = 0
191
+ elif ".layer." in name and ".residual." not in name:
192
+ layer_id = int(name[name.find(".layer.") :].split(".")[2]) + 1
193
+ return lr_decay_rate ** (num_layers + 1 - layer_id)
194
+
195
+ def get_dinov2_weight_decay_rate(name, weight_decay_rate=1.0):
196
+ if (
197
+ ("gamma" in name)
198
+ or ("pos_embed" in name)
199
+ or ("rel_pos" in name)
200
+ or ("bias" in name)
201
+ or ("norm" in name)
202
+ or ("embeddings" in name)
203
+ ):
204
+ weight_decay_rate = 0.0
205
+ return weight_decay_rate
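The two helpers above drive the layer-wise learning-rate decay and selective weight decay used in get_named_param_lr_pairs. A short worked example with lr_decay_rate=0.8 and num_layers=12 (the CLI defaults); the parameter names are illustrative, chosen only to exercise the name-matching rules:

# Worked example of the layer-wise decay factors.
from rfdetr.models.backbone.backbone import get_dinov2_lr_decay_rate  # assumed import path

print(get_dinov2_lr_decay_rate("backbone.0.encoder.embeddings.patch_embeddings.weight",
                               lr_decay_rate=0.8, num_layers=12))  # embeddings -> layer 0 -> 0.8**13 ~= 0.055
print(get_dinov2_lr_decay_rate("backbone.0.encoder.encoder.layer.0.attention.weight",
                               lr_decay_rate=0.8, num_layers=12))  # first block -> 0.8**12 ~= 0.069
print(get_dinov2_lr_decay_rate("backbone.0.encoder.encoder.layer.11.mlp.weight",
                               lr_decay_rate=0.8, num_layers=12))  # last block -> 0.8**1 = 0.8
print(get_dinov2_lr_decay_rate("transformer.decoder.layers.0.weight",
                               lr_decay_rate=0.8, num_layers=12))  # non-backbone param -> 1.0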
rfdetr/models/backbone/base.py ADDED
@@ -0,0 +1,20 @@
1
+ # ------------------------------------------------------------------------
2
+ # RF-DETR
3
+ # Copyright (c) 2025 Roboflow. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------
6
+ # Modified from LW-DETR (https://github.com/Atten4Vis/LW-DETR)
7
+ # Copyright (c) 2024 Baidu. All Rights Reserved.
8
+ # ------------------------------------------------------------------------
9
+
10
+ import torch
11
+ import torch.nn.functional as F
12
+ from torch import nn
13
+
14
+
15
+ class BackboneBase(nn.Module):
16
+ def __init__(self):
17
+ super().__init__()
18
+
19
+ def get_named_param_lr_pairs(self, args, prefix:str):
20
+ raise NotImplementedError
rfdetr/models/backbone/dinov2.py ADDED
@@ -0,0 +1,197 @@
1
+ # ------------------------------------------------------------------------
2
+ # RF-DETR
3
+ # Copyright (c) 2025 Roboflow. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ from transformers import AutoBackbone
10
+ import torch.nn.functional as F
11
+ import types
12
+ import math
13
+ import json
14
+ import os
15
+
16
+ from .dinov2_with_windowed_attn import WindowedDinov2WithRegistersConfig, WindowedDinov2WithRegistersBackbone
17
+
18
+
19
+ size_to_width = {
20
+ "tiny": 192,
21
+ "small": 384,
22
+ "base": 768,
23
+ "large": 1024,
24
+ }
25
+
26
+ size_to_config = {
27
+ "small": "dinov2_small.json",
28
+ "base": "dinov2_base.json",
29
+ "large": "dinov2_large.json",
30
+ }
31
+
32
+ size_to_config_with_registers = {
33
+ "small": "dinov2_with_registers_small.json",
34
+ "base": "dinov2_with_registers_base.json",
35
+ "large": "dinov2_with_registers_large.json",
36
+ }
37
+
38
+ def get_config(size, use_registers):
39
+ config_dict = size_to_config_with_registers if use_registers else size_to_config
40
+ current_dir = os.path.dirname(os.path.abspath(__file__))
41
+ configs_dir = os.path.join(current_dir, "dinov2_configs")
42
+ config_path = os.path.join(configs_dir, config_dict[size])
43
+ with open(config_path, "r") as f:
44
+ dino_config = json.load(f)
45
+ return dino_config
46
+
47
+
48
+ class DinoV2(nn.Module):
49
+ def __init__(self,
50
+ shape=(640, 640),
51
+ out_feature_indexes=[2, 4, 5, 9],
52
+ size="base",
53
+ use_registers=True,
54
+ use_windowed_attn=True,
55
+ gradient_checkpointing=False,
56
+ load_dinov2_weights=True,
57
+ patch_size=14,
58
+ num_windows=4,
59
+ positional_encoding_size=37,
60
+ ):
61
+ super().__init__()
62
+
63
+ name = f"facebook/dinov2-with-registers-{size}" if use_registers else f"facebook/dinov2-{size}"
64
+
65
+ self.shape = shape
66
+ self.patch_size = patch_size
67
+ self.num_windows = num_windows
68
+
69
+ # Create the encoder
70
+
71
+ if not use_windowed_attn:
72
+ assert not gradient_checkpointing, "Gradient checkpointing is not supported for non-windowed attention"
73
+ assert load_dinov2_weights, "Using non-windowed attention requires loading dinov2 weights from hub"
74
+ self.encoder = AutoBackbone.from_pretrained(
75
+ name,
76
+ out_features=[f"stage{i}" for i in out_feature_indexes],
77
+ return_dict=False,
78
+ )
79
+ else:
80
+ window_block_indexes = set(range(out_feature_indexes[-1] + 1))
81
+ window_block_indexes.difference_update(out_feature_indexes)
82
+ window_block_indexes = list(window_block_indexes)
83
+
84
+ dino_config = get_config(size, use_registers)
85
+
86
+ dino_config["return_dict"] = False
87
+ dino_config["out_features"] = [f"stage{i}" for i in out_feature_indexes]
88
+
89
+ implied_resolution = positional_encoding_size * patch_size
90
+
91
+ if implied_resolution != dino_config["image_size"]:
92
+ print(f"Using a different number of positional encodings than DINOv2, which means we're not loading DINOv2 backbone weights. This is not a problem if finetuning a pretrained RF-DETR model.")
93
+ dino_config["image_size"] = implied_resolution
94
+ load_dinov2_weights = False
95
+
96
+ if patch_size != 14:
97
+ print(f"Using patch size {patch_size} instead of 14, which means we're not loading DINOv2 backbone weights. This is not a problem if finetuning a pretrained RF-DETR model.")
98
+ dino_config["patch_size"] = patch_size
99
+ load_dinov2_weights = False
100
+
101
+ if use_registers:
102
+ windowed_dino_config = WindowedDinov2WithRegistersConfig(
103
+ **dino_config,
104
+ num_windows=num_windows,
105
+ window_block_indexes=window_block_indexes,
106
+ gradient_checkpointing=gradient_checkpointing,
107
+ )
108
+ else:
109
+ windowed_dino_config = WindowedDinov2WithRegistersConfig(
110
+ **dino_config,
111
+ num_windows=num_windows,
112
+ window_block_indexes=window_block_indexes,
113
+ num_register_tokens=0,
114
+ gradient_checkpointing=gradient_checkpointing,
115
+ )
116
+ self.encoder = WindowedDinov2WithRegistersBackbone.from_pretrained(
117
+ name,
118
+ config=windowed_dino_config,
119
+ ) if load_dinov2_weights else WindowedDinov2WithRegistersBackbone(windowed_dino_config)
120
+
121
+
122
+ self._out_feature_channels = [size_to_width[size]] * len(out_feature_indexes)
123
+ self._export = False
124
+
125
+ def export(self):
126
+ if self._export:
127
+ return
128
+ self._export = True
129
+ shape = self.shape
130
+ def make_new_interpolated_pos_encoding(
131
+ position_embeddings, patch_size, height, width
132
+ ):
133
+
134
+ num_positions = position_embeddings.shape[1] - 1
135
+ dim = position_embeddings.shape[-1]
136
+ height = height // patch_size
137
+ width = width // patch_size
138
+
139
+ class_pos_embed = position_embeddings[:, 0]
140
+ patch_pos_embed = position_embeddings[:, 1:]
141
+
142
+ # Reshape and permute
143
+ patch_pos_embed = patch_pos_embed.reshape(
144
+ 1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim
145
+ )
146
+ patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
147
+
148
+ # Interpolate patch position embeddings to the target grid (bicubic, antialias=True)
149
+ patch_pos_embed = F.interpolate(
150
+ patch_pos_embed,
151
+ size=(height, width),
152
+ mode="bicubic",
153
+ align_corners=False,
154
+ antialias=True,
155
+ )
156
+
157
+ # Reshape back
158
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).reshape(1, -1, dim)
159
+ return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)
160
+
161
+ # Precompute position embeddings interpolated to the export shape and
+ # install them as a new Parameter on the embeddings module below.
163
+ with torch.no_grad():
164
+ new_positions = make_new_interpolated_pos_encoding(
165
+ self.encoder.embeddings.position_embeddings,
166
+ self.encoder.config.patch_size,
167
+ shape[0],
168
+ shape[1],
169
+ )
170
+ # Create a new Parameter with the new size
171
+ old_interpolate_pos_encoding = self.encoder.embeddings.interpolate_pos_encoding
172
+ def new_interpolate_pos_encoding(self_mod, embeddings, height, width):
173
+ num_patches = embeddings.shape[1] - 1
174
+ num_positions = self_mod.position_embeddings.shape[1] - 1
175
+ if num_patches == num_positions and height == width:
176
+ return self_mod.position_embeddings
177
+ return old_interpolate_pos_encoding(embeddings, height, width)
178
+
179
+ self.encoder.embeddings.position_embeddings = nn.Parameter(new_positions)
180
+ self.encoder.embeddings.interpolate_pos_encoding = types.MethodType(
181
+ new_interpolate_pos_encoding,
182
+ self.encoder.embeddings
183
+ )
184
+
185
+ def forward(self, x):
186
+ block_size = self.patch_size * self.num_windows
187
+ assert x.shape[2] % block_size == 0 and x.shape[3] % block_size == 0, f"Backbone requires input shape to be divisible by {block_size}, but got {x.shape}"
188
+ x = self.encoder(x)
189
+ return list(x[0])
190
+
191
+ if __name__ == "__main__":
192
+ model = DinoV2()
193
+ model.export()
194
+ # input resolution must be divisible by patch_size * num_windows (14 * 4 = 56), so 640 would trip the assert above
+ x = torch.randn(1, 3, 560, 560)
195
+ print(model(x))
196
+ for j in model(x):
197
+ print(j.shape)
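Finally, the assertion in DinoV2.forward ties usable input resolutions to patch_size * num_windows. A tiny sketch of that constraint with the defaults above (patch_size=14, num_windows=4):

# Resolutions must satisfy res % (patch_size * num_windows) == 0.
patch_size, num_windows = 14, 4
block = patch_size * num_windows   # 56
for res in (560, 640, 672):
    print(res, res % block == 0)   # 560 True, 640 False, 672 True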