Commit 078ef5b: add VitSAM
1 parent 5c87ac5 commit 078ef5b

6 files changed: +621 -27 lines changed
mindcv/models/__init__.py (+3)

@@ -52,6 +52,7 @@
     vgg,
     visformer,
     vit,
+    vit_sam,
     volo,
     xception,
     xcit,
@@ -109,6 +110,7 @@
 from .vgg import *
 from .visformer import *
 from .vit import *
+from .vit_sam import *
 from .volo import *
 from .xception import *
 from .xcit import *
@@ -168,6 +170,7 @@
 __all__.extend(vgg.__all__)
 __all__.extend(visformer.__all__)
 __all__.extend(vit.__all__)
+__all__.extend(vit_sam.__all__)
 __all__.extend(volo.__all__)
 __all__.extend(["Xception", "xception"])
 __all__.extend(xcit.__all__)
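Wiring vit_sam into the package exports is what makes the new ViT-SAM variants reachable through the model factory. A minimal sketch, assuming the usual create_model entry point; the model name below is hypothetical, the real names come from vit_sam.__all__:

import mindspore as ms
from mindcv.models import create_model

# "samvit_b_16" is a placeholder; check mindcv/models/vit_sam.py for the registered names.
model = create_model("samvit_b_16", pretrained=False, num_classes=1000)
print(type(model))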

mindcv/models/layers/format.py (-4)

@@ -1,5 +1,4 @@
 from enum import Enum
-from typing import Union
 
 import mindspore
 
@@ -11,9 +10,6 @@ class Format(str, Enum):
     NLC = 'NLC'
 
 
-FormatT = Union[str, Format]
-
-
 def nchw_to(x: mindspore.Tensor, fmt: Format):
     if fmt == Format.NHWC:
         x = x.permute(0, 2, 3, 1)
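For context, nchw_to is the helper these layers use to convert a channels-first feature map into the requested layout. A minimal usage sketch; shapes are illustrative, and the module is imported by its file path since the public re-export may differ:

import numpy as np
import mindspore as ms
from mindcv.models.layers.format import Format, nchw_to

x = ms.Tensor(np.zeros((1, 3, 224, 224), dtype=np.float32))  # NCHW feature map
y = nchw_to(x, Format.NHWC)
print(y.shape)  # (1, 224, 224, 3)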

mindcv/models/layers/patch_dropout.py (+13, -13)

@@ -8,41 +8,41 @@ class PatchDropout(nn.Cell):
     """
     https://arxiv.org/abs/2212.00794
     """
+
     def __init__(
-            self,
-            prob: float = 0.5,
-            num_prefix_tokens: int = 1,
-            ordered: bool = False,
-            return_indices: bool = False,
+        self,
+        prob: float = 0.5,
+        num_prefix_tokens: int = 1,
+        ordered: bool = False,
+        return_indices: bool = False,
     ):
         super().__init__()
-        assert 0 <= prob < 1.
+        assert 0 <= prob < 1.0
         self.prob = prob
         self.num_prefix_tokens = num_prefix_tokens  # exclude CLS token (or other prefix tokens)
         self.ordered = ordered
         self.return_indices = return_indices
-        self.sort = ops.Sort()
 
-    def forward(self, x):
-        if not self.training or self.prob == 0.:
+    def construct(self, x):
+        if not self.training or self.prob == 0.0:
             if self.return_indices:
                 return x, None
             return x
 
         if self.num_prefix_tokens:
-            prefix_tokens, x = x[:, :self.num_prefix_tokens], x[:, self.num_prefix_tokens:]
+            prefix_tokens, x = x[:, : self.num_prefix_tokens], x[:, self.num_prefix_tokens :]
         else:
             prefix_tokens = None
 
         B = x.shape[0]
         L = x.shape[1]
-        num_keep = max(1, int(L * (1. - self.prob)))
-        _, indices = self.sort(ms.Tensor(np.random.rand(B, L)).astype(ms.float32))
+        num_keep = max(1, int(L * (1.0 - self.prob)))
+        _, indices = ops.sort(ms.Tensor(np.random.rand(B, L)).astype(ms.float32))
         keep_indices = indices[:, :num_keep]
         if self.ordered:
             # NOTE does not need to maintain patch order in typical transformer use,
             # but possibly useful for debug / visualization
-            keep_indices, _ = self.sort(keep_indices)
+            keep_indices, _ = ops.sort(keep_indices)
         keep_indices = ops.broadcast_to(ops.expand_dims(keep_indices, axis=-1), (-1, -1, x.shape[2]))
         x = ops.gather_elements(x, dim=1, index=keep_indices)
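PatchDropout randomly keeps a subset of patch tokens during training (per https://arxiv.org/abs/2212.00794) while leaving prefix tokens such as [CLS] untouched; at inference it is a no-op. A minimal calling sketch, assuming the remainder of construct re-attaches the prefix tokens as in the timm implementation this layer mirrors:

import numpy as np
import mindspore as ms
from mindcv.models.layers.patch_dropout import PatchDropout

drop = PatchDropout(prob=0.5, num_prefix_tokens=1)
drop.set_train(True)  # tokens are only dropped in training mode

tokens = ms.Tensor(np.random.rand(2, 1 + 196, 768), ms.float32)  # [CLS] + 14x14 patches
out = drop(tokens)
print(out.shape)  # expected roughly (2, 1 + 98, 768): half of the patch tokens are kept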

mindcv/models/layers/patch_embed.py (+6, -5)

@@ -18,6 +18,7 @@ class PatchEmbed(nn.Cell):
         embed_dim (int): Number of linear projection output channels. Default: 96.
         norm_layer (nn.Cell, optional): Normalization layer. Default: None
     """
+
     output_fmt: Format
 
     def __init__(
@@ -37,11 +38,11 @@ def __init__(
         self.patch_size = to_2tuple(patch_size)
         if image_size is not None:
             self.image_size = to_2tuple(image_size)
-            self.patches_resolution = tuple([s // p for s, p in zip(self.image_size, self.patch_size)])
-            self.num_patches = self.patches_resolution[0] * self.patches_resolution[1]
+            self.grid_size = tuple([s // p for s, p in zip(self.image_size, self.patch_size)])
+            self.num_patches = self.grid_size[0] * self.grid_size[1]
         else:
             self.image_size = None
-            self.patches_resolution = None
+            self.grid_size = None
             self.num_patches = None
 
         if output_fmt is not None:
@@ -86,8 +87,8 @@ def construct(self, x: Tensor) -> Tensor:
         # FIXME look at relaxing size constraints
         x = self.proj(x)
         if self.flatten:
-            x = ops.Reshape()(x, (B, self.embed_dim, -1))  # B Ph*Pw C
-            x = ops.Transpose()(x, (0, 2, 1))
+            x = ops.reshape(x, (B, self.embed_dim, -1))  # B Ph*Pw C
+            x = ops.transpose(x, (0, 2, 1))
         elif self.output_fmt != "NCHW":
             x = nchw_to(x, self.output_fmt)
         if self.norm is not None:
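The substantive change here is the rename of patches_resolution to grid_size; the construct path only moves to the functional ops API. A minimal sketch of what the attribute exposes, using the standard 224/16 ViT configuration (only argument names visible in this diff are used; defaults differ, so values are passed explicitly, and the output shape assumes flatten defaults to True as in the original layer):

import numpy as np
import mindspore as ms
from mindcv.models.layers.patch_embed import PatchEmbed

embed = PatchEmbed(image_size=224, patch_size=16, embed_dim=768)
print(embed.grid_size, embed.num_patches)  # (14, 14) 196

x = ms.Tensor(np.random.rand(1, 3, 224, 224), ms.float32)
tokens = embed(x)  # (1, 196, 768) if the default flattened (NLC) output is kept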

mindcv/models/layers/pos_embed.py (+5, -5)

@@ -11,11 +11,11 @@
 
 
 def resample_abs_pos_embed(
-        posemb,
-        new_size: List[int],
-        old_size: Optional[List[int]] = None,
-        num_prefix_tokens: int = 1,
-        interpolation: str = 'nearest',
+    posemb,
+    new_size: List[int],
+    old_size: Optional[List[int]] = None,
+    num_prefix_tokens: int = 1,
+    interpolation: str = 'nearest',
 ):
     # sort out sizes, assume square if old size not provided
     num_pos_tokens = posemb.shape[1]
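resample_abs_pos_embed interpolates a learned absolute position embedding from its training grid to a new grid, which is what lets ViT-SAM style backbones accept other input resolutions. A minimal sketch, assuming the function returns the resampled table with the prefix tokens kept in front, as its timm counterpart does:

import numpy as np
import mindspore as ms
from mindcv.models.layers.pos_embed import resample_abs_pos_embed

# 1 CLS token + 14x14 patch grid, 768-dim embeddings
posemb = ms.Tensor(np.random.rand(1, 1 + 14 * 14, 768), ms.float32)

# resample for a 16x16 grid, e.g. a larger input image
posemb_new = resample_abs_pos_embed(posemb, new_size=[16, 16], num_prefix_tokens=1)
print(posemb_new.shape)  # expected (1, 1 + 16 * 16, 768)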
