Commit 2cc5a5e: Add tf code for efficientformer

D-Roberts committed May 13, 2023
1 parent cf11493 commit 2cc5a5e
Showing 12 changed files with 1,658 additions and 9 deletions.
2 changes: 1 addition & 1 deletion docs/source/en/index.mdx
@@ -311,7 +311,7 @@ Flax), PyTorch, and/or TensorFlow.
| DonutSwin | | | | | |
| DPR | | | | | |
| DPT | | | | | |
| EfficientFormer | ❌ | ❌ | ✅ | ❌ | ❌ |
| EfficientFormer | ❌ | ❌ | ✅ | ✅ | ❌ |
| EfficientNet | | | | | |
| ELECTRA | | | | | |
| Encoder decoder | | | | | |
17 changes: 16 additions & 1 deletion docs/source/en/model_doc/efficientformer.mdx
Original file line number Diff line number Diff line change
@@ -37,7 +37,7 @@ EfficientFormer-L7, obtains 83.3% accuracy with only 7.0 ms latency. Our work pr
reach extremely low latency on mobile devices while maintaining high performance.*

This model was contributed by [novice03](https://huggingface.co/novice03) and [Bearnardd](https://huggingface.co/Bearnardd).
The original code can be found [here](https://github.com/snap-research/EfficientFormer).
The original code can be found [here](https://github.com/snap-research/EfficientFormer). The TensorFlow version of this model was added by [D-Roberts](https://huggingface.co/D-Roberts).

## Documentation resources

@@ -66,3 +66,18 @@ The original code can be found [here](https://github.com/snap-research/Efficient

[[autodoc]] EfficientFormerForImageClassificationWithTeacher
- forward

## TFEfficientFormerModel

[[autodoc]] TFEfficientFormerModel
- call

## TFEfficientFormerForImageClassification

[[autodoc]] TFEfficientFormerForImageClassification
- call

## TFEfficientFormerForImageClassificationWithTeacher

[[autodoc]] TFEfficientFormerForImageClassificationWithTeacher
- call
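A minimal usage sketch for the new TF classes (a hedged example, assuming TensorFlow is installed; the checkpoint name and the existing `EfficientFormerImageProcessor` class are taken from the docstrings elsewhere in this commit):

```python
import requests
import tensorflow as tf
from PIL import Image

from transformers import EfficientFormerImageProcessor, TFEfficientFormerForImageClassification

# A test image (the COCO cats image commonly used in transformers docs).
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = EfficientFormerImageProcessor.from_pretrained("snap-research/efficientformer-l1-300")
model = TFEfficientFormerForImageClassification.from_pretrained("snap-research/efficientformer-l1-300")

# The processor returns TF tensors; the TF model is invoked via call(), not forward().
inputs = processor(images=image, return_tensors="tf")
logits = model(**inputs).logits
predicted_class_idx = int(tf.math.argmax(logits, axis=-1)[0])
print(model.config.id2label[predicted_class_idx])
```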
16 changes: 16 additions & 0 deletions src/transformers/__init__.py
@@ -3129,6 +3129,15 @@
"TFDPRReader",
]
)
_import_structure["models.efficientformer"].extend(
[
"TF_EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFEfficientFormerForImageClassification",
"TFEfficientFormerForImageClassificationWithTeacher",
"TFEfficientFormerModel",
"TFEfficientFormerPreTrainedModel",
]
)
_import_structure["models.electra"].extend(
[
"TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -6431,6 +6440,13 @@
            TFDPRQuestionEncoder,
            TFDPRReader,
        )
        from .models.efficientformer import (
            TF_EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFEfficientFormerForImageClassification,
            TFEfficientFormerForImageClassificationWithTeacher,
            TFEfficientFormerModel,
            TFEfficientFormerPreTrainedModel,
        )
        from .models.electra import (
            TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFElectraForMaskedLM,
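With these `_import_structure` entries and the mirrored `TYPE_CHECKING` imports, the new classes become reachable from the top-level package; a quick sketch of the effect (TensorFlow required):

```python
from transformers import TFEfficientFormerModel
from transformers.utils import is_tf_available

# _LazyModule defers loading modeling_tf_efficientformer until the name is
# first touched, so merely importing transformers stays cheap.
assert is_tf_available()
print(TFEfficientFormerModel.__module__)
# transformers.models.efficientformer.modeling_tf_efficientformer
```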
5 changes: 5 additions & 0 deletions src/transformers/models/auto/modeling_tf_auto.py
@@ -47,6 +47,7 @@
("deit", "TFDeiTModel"),
("distilbert", "TFDistilBertModel"),
("dpr", "TFDPRQuestionEncoder"),
("efficientformer", "TFEfficientFormerModel"),
("electra", "TFElectraModel"),
("esm", "TFEsmModel"),
("flaubert", "TFFlaubertModel"),
@@ -201,6 +202,10 @@
("cvt", "TFCvtForImageClassification"),
("data2vec-vision", "TFData2VecVisionForImageClassification"),
("deit", ("TFDeiTForImageClassification", "TFDeiTForImageClassificationWithTeacher")),
(
"efficientformer",
("TFEfficientFormerForImageClassification", "TFEfficientFormerForImageClassificationWithTeacher"),
),
("mobilevit", "TFMobileViTForImageClassification"),
("regnet", "TFRegNetForImageClassification"),
("resnet", "TFResNetForImageClassification"),
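These mapping entries are what let the TF auto classes dispatch to the new model; a hedged sketch (assumes the checkpoint hosts TF-compatible weights):

```python
from transformers import TFAutoModel, TFAutoModelForImageClassification

# "efficientformer" now resolves to TFEfficientFormerModel in the base mapping.
backbone = TFAutoModel.from_pretrained("snap-research/efficientformer-l1-300")

# The classification mapping stores a (plain, with-teacher) tuple; the auto
# class picks the variant matching the checkpoint's configured architecture.
classifier = TFAutoModelForImageClassification.from_pretrained("snap-research/efficientformer-l1-300")
print(type(backbone).__name__, type(classifier).__name__)
```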
35 changes: 34 additions & 1 deletion src/transformers/models/efficientformer/__init__.py
@@ -13,7 +13,13 @@
# limitations under the License.
from typing import TYPE_CHECKING

from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_tf_available,
    is_torch_available,
    is_vision_available,
)


_import_structure = {
@@ -45,6 +51,20 @@
"EfficientFormerPreTrainedModel",
]

try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_efficientformer"] = [
"TF_EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFEfficientFormerForImageClassification",
"TFEfficientFormerForImageClassificationWithTeacher",
"TFEfficientFormerModel",
"TFEfficientFormerPreTrainedModel",
]

if TYPE_CHECKING:
    from .configuration_efficientformer import EFFICIENTFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, EfficientFormerConfig

@@ -69,6 +89,19 @@
        EfficientFormerModel,
        EfficientFormerPreTrainedModel,
    )
    try:
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_tf_efficientformer import (
            TF_EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFEfficientFormerForImageClassification,
            TFEfficientFormerForImageClassificationWithTeacher,
            TFEfficientFormerModel,
            TFEfficientFormerPreTrainedModel,
        )

else:
    import sys
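The guard added above is the standard transformers pattern for optional backends; schematically (a simplified sketch, not the module's exact wiring):

```python
from transformers.utils import OptionalDependencyNotAvailable, is_tf_available

_import_structure = {"configuration_efficientformer": ["EfficientFormerConfig"]}

try:
    if not is_tf_available():  # no TensorFlow -> skip registering TF symbols
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass  # dummy objects elsewhere raise an informative error on first use
else:
    _import_structure["modeling_tf_efficientformer"] = ["TFEfficientFormerModel"]
```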
src/transformers/models/efficientformer/configuration_efficientformer.py
@@ -52,7 +52,7 @@ class EfficientFormerConfig(PretrainedConfig):
            The size of the key in meta3D block.
        attention_ratio (`int`, *optional*, defaults to 4):
            Ratio of the dimension of the query and value to the dimension of the key in MSHA block
        resolution (`int`, *optional*, defaults to 5)
        resolution (`int`, *optional*, defaults to 7)
            Size of each patch
        num_hidden_layers (`int`, *optional*, defaults to 5):
            Number of hidden layers in the Transformer encoder.
@@ -91,6 +91,8 @@ class EfficientFormerConfig(PretrainedConfig):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
        image_size (`int`, *optional*, defaults to `224`):
            The size (resolution) of each image.

    Example:
@@ -136,6 +138,7 @@ def __init__(
        hidden_act: str = "gelu",
        initializer_range: float = 0.02,
        layer_norm_eps: float = 1e-12,
        image_size: int = 224,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
@@ -165,3 +168,4 @@ def __init__(
        self.distillation = distillation
        self.use_layer_scale = use_layer_scale
        self.layer_scale_init_value = layer_scale_init_value
        self.image_size = image_size
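A quick sketch of the new `image_size` kwarg in use (224 is the documented default):

```python
from transformers import EfficientFormerConfig

config = EfficientFormerConfig()               # image_size defaults to 224
small = EfficientFormerConfig(image_size=192)  # override for smaller inputs
print(config.image_size, small.image_size)     # 224 192
```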
src/transformers/models/efficientformer/modeling_efficientformer.py
@@ -43,7 +43,7 @@

# Base docstring
_CHECKPOINT_FOR_DOC = "snap-research/efficientformer-l1-300"
_EXPECTED_OUTPUT_SHAPE = [1, 197, 768]
_EXPECTED_OUTPUT_SHAPE = [1, 49, 448]

# Image classification docstring
_IMAGE_CLASS_CHECKPOINT = "snap-research/efficientformer-l1-300"
@@ -224,7 +224,7 @@ def __init__(
        hidden_features = hidden_features or in_features

        self.convolution1 = nn.Conv2d(in_features, hidden_features, 1)
        self.actvation = ACT2FN[config.hidden_act]
        self.activation = ACT2FN[config.hidden_act]
        self.convolution2 = nn.Conv2d(hidden_features, out_features, 1)
        self.dropout = nn.Dropout(drop)

@@ -235,13 +235,13 @@ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        hidden_state = self.convolution1(hidden_state)
        hidden_state = self.batchnorm_before(hidden_state)

        hidden_state = self.actvation(hidden_state)
        hidden_state = self.activation(hidden_state)
        hidden_state = self.dropout(hidden_state)
        hidden_state = self.convolution2(hidden_state)

        hidden_state = self.batchnorm_after(hidden_state)

        hidden_state = self.dropout(hidden_state)

        return hidden_state


@@ -266,7 +266,7 @@ def drop_path(input, drop_prob: float = 0.0, training: bool = False):
    return output


# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->Bit
# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->EfficientFormer
class EfficientFormerDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

@@ -349,6 +349,7 @@ def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False)
        for layer_module in self.blocks:
            if isinstance(hidden_states, tuple):
                hidden_states = hidden_states[0]

            hidden_states = layer_module(hidden_states, output_attentions)
            if output_attentions:
                all_attention_outputs = all_attention_outputs + (hidden_states[1],)
@@ -379,6 +380,7 @@ def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor]:

        if self.use_layer_scale:
            layer_output = hidden_states + self.drop_path(self.layer_scale_1.unsqueeze(-1).unsqueeze(-1) * outputs)

            layer_output = layer_output + self.drop_path(
                self.layer_scale_2.unsqueeze(-1).unsqueeze(-1) * self.mlp(layer_output)
            )
@@ -398,6 +400,7 @@ def __init__(self, config: EfficientFormerConfig, stage_idx: int):
        drop_paths = [
            config.drop_path_rate * (block_idx + sum(config.depths[:stage_idx])) for block_idx in range(num_layers)
        ]

        self.blocks = nn.ModuleList(
            [
                EfficientFormerMeta4D(config, config.hidden_sizes[stage_idx], drop_path=drop_path)
@@ -446,6 +449,7 @@ def __init__(self, config: EfficientFormerConfig):
            for i in range(num_intermediate_stages)
        ]
        intermediate_stages = []

        for i in range(num_intermediate_stages):
            intermediate_stages.append(EfficientFormerIntermediateStage(config, i))
            if downsamples[i]:
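For context, the `drop_path` function touched above implements standard stochastic depth; a self-contained sketch of that formulation, mirroring the signature shown in the hunk header (not necessarily the file's exact body):

```python
import torch

def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """Zero out whole samples on the residual branch and rescale the survivors."""
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1.0 - drop_prob
    # One Bernoulli draw per sample, broadcast across the remaining dimensions.
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize: keep (1) or drop (0)
    output = input.div(keep_prob) * random_tensor
    return output
```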
(Diff for the remaining files, including the new src/transformers/models/efficientformer/modeling_tf_efficientformer.py, not shown.)