# NOTE(review): removed extraction/build residue ("Spaces:", "Build error") that
# preceded the Python source; it was not part of the original file content.
| # Copyright (c) Meta Platforms, Inc. and affiliates. | |
| # | |
| # This source code is licensed under the Apache License, Version 2.0 | |
| # found in the LICENSE file in the root directory of this source tree. | |
| from enum import Enum | |
| from typing import Union | |
| import torch | |
# Base URL under which the official DINOv2 pretrained checkpoints are hosted.
_DINOV2_BASE_URL = "https://dl.fbaipublicfiles.com/dinov2"
| def _make_dinov2_model_name(arch_name: str, patch_size: int, num_register_tokens: int = 0) -> str: | |
| compact_arch_name = arch_name.replace("_", "")[:4] | |
| registers_suffix = f"_reg{num_register_tokens}" if num_register_tokens else "" | |
| return f"dinov2_{compact_arch_name}{patch_size}{registers_suffix}" | |
class Weights(Enum):
    """Pretraining weight sets available for DINOv2 backbones."""

    # LVD-142M: the curated 142M-image dataset DINOv2 was pretrained on.
    LVD142M = "LVD142M"
def _make_dinov2_model(
    *,
    arch_name: str = "vit_large",
    img_size: int = 518,
    patch_size: int = 14,
    init_values: float = 1.0,
    ffn_layer: str = "mlp",
    block_chunks: int = 0,
    num_register_tokens: int = 0,
    interpolate_antialias: bool = False,
    interpolate_offset: float = 0.1,
    pretrained: bool = True,
    weights: Union[Weights, str] = Weights.LVD142M,
    **kwargs,
):
    """Instantiate a DINOv2 Vision Transformer backbone.

    Args:
        arch_name: Name of a constructor in the ``vision_transformer`` module.
        img_size: Input image size used at pretraining time.
        patch_size: ViT patch size.
        init_values: LayerScale initialization value.
        ffn_layer: Feed-forward layer variant (e.g. "mlp", "swiglufused").
        block_chunks: Number of block chunks for checkpointing; 0 disables.
        num_register_tokens: Number of register tokens to add.
        interpolate_antialias: Whether to antialias positional-embedding
            interpolation.
        interpolate_offset: Offset used when interpolating positional
            embeddings.
        pretrained: If True, download and load the matching checkpoint.
        weights: Weight set to load, as a ``Weights`` member or its name.
        **kwargs: Extra keyword arguments forwarded to (and overriding) the
            ViT constructor arguments above.

    Returns:
        The constructed (and optionally pretrained) model.

    Raises:
        AssertionError: If ``weights`` is a string that does not name a
            ``Weights`` member.
    """
    import vision_transformer as vits

    # Accept the weight set by name as well as by enum member.
    if isinstance(weights, str):
        try:
            weights = Weights[weights]
        except KeyError:
            raise AssertionError(f"Unsupported weights: {weights}")

    # The checkpoint directory is keyed by the register-free model name,
    # while the file itself carries the register suffix when present.
    model_base_name = _make_dinov2_model_name(arch_name, patch_size)

    # Caller-supplied kwargs take precedence over the defaults assembled here.
    vit_kwargs = {
        "img_size": img_size,
        "patch_size": patch_size,
        "init_values": init_values,
        "ffn_layer": ffn_layer,
        "block_chunks": block_chunks,
        "num_register_tokens": num_register_tokens,
        "interpolate_antialias": interpolate_antialias,
        "interpolate_offset": interpolate_offset,
        **kwargs,
    }
    model = vits.__dict__[arch_name](**vit_kwargs)

    if pretrained:
        model_full_name = _make_dinov2_model_name(arch_name, patch_size, num_register_tokens)
        url = f"{_DINOV2_BASE_URL}/{model_base_name}/{model_full_name}_pretrain.pth"
        state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu")
        model.load_state_dict(state_dict, strict=True)

    return model
def dinov2_vits14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-S/14 model (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(
        arch_name="vit_small",
        pretrained=pretrained,
        weights=weights,
        **kwargs,
    )
def dinov2_vitb14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-B/14 model (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(
        arch_name="vit_base",
        pretrained=pretrained,
        weights=weights,
        **kwargs,
    )
def dinov2_vitl14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-L/14 model (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(
        arch_name="vit_large",
        pretrained=pretrained,
        weights=weights,
        **kwargs,
    )
def dinov2_vitg14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-g/14 model (optionally) pretrained on the LVD-142M dataset.
    """
    # The giant model uses the fused SwiGLU feed-forward variant.
    return _make_dinov2_model(
        arch_name="vit_giant2",
        ffn_layer="swiglufused",
        pretrained=pretrained,
        weights=weights,
        **kwargs,
    )
def dinov2_vits14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-S/14 model with registers (optionally) pretrained on the LVD-142M dataset.
    """
    # Register-token variants use 4 registers and antialiased, offset-free
    # positional-embedding interpolation.
    register_config = dict(
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
    )
    return _make_dinov2_model(
        arch_name="vit_small",
        pretrained=pretrained,
        weights=weights,
        **register_config,
        **kwargs,
    )
def dinov2_vitb14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-B/14 model with registers (optionally) pretrained on the LVD-142M dataset.
    """
    # Register-token variants use 4 registers and antialiased, offset-free
    # positional-embedding interpolation.
    register_config = dict(
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
    )
    return _make_dinov2_model(
        arch_name="vit_base",
        pretrained=pretrained,
        weights=weights,
        **register_config,
        **kwargs,
    )
def dinov2_vitl14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-L/14 model with registers (optionally) pretrained on the LVD-142M dataset.
    """
    # Register-token variants use 4 registers and antialiased, offset-free
    # positional-embedding interpolation.
    register_config = dict(
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
    )
    return _make_dinov2_model(
        arch_name="vit_large",
        pretrained=pretrained,
        weights=weights,
        **register_config,
        **kwargs,
    )
def dinov2_vitg14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-g/14 model with registers (optionally) pretrained on the LVD-142M dataset.
    """
    # The giant model uses the fused SwiGLU feed-forward variant; register
    # variants add 4 registers and antialiased, offset-free interpolation.
    register_config = dict(
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
    )
    return _make_dinov2_model(
        arch_name="vit_giant2",
        ffn_layer="swiglufused",
        pretrained=pretrained,
        weights=weights,
        **register_config,
        **kwargs,
    )