Add TensorFlow implementation of EfficientFormer (#22620)
* Add tf code for efficientformer

* Fix return dict bug - return last hidden state after last stage

* Fix corresponding return dict bug

* Override test tol

* Change default values of training to False

* Set training to default False X3

* Rm axis from ln

* Set init in dense projection

* Rm debug stuff

* Make style; all tests pass.

* Modify year to 2023

* Fix attention biases codes

* Update the shape list logic

* Add a batch norm eps config

* Remove extra comments in test files

* Add conditional attn and hidden states return for serving output

* Change channel dim checking logic

* Add exception for WithTeacher model in training mode

* Revert layer count for now

* Add layer count for conditional layer naming

* Transpose for conv happens only in main layer

* Make tests smaller

* Make style

* Update doc

* Rm from_pt

* Change to actual expected image class label

* Remove stray print in tests

* Update image processor test

* Remove the old serving output logic

* Make style

* Make style

* Complete test
D-Roberts authored May 31, 2023
1 parent 9fea71b commit 88f50a1
Showing 12 changed files with 1,537 additions and 23 deletions.
2 changes: 1 addition & 1 deletion docs/source/en/index.mdx
@@ -313,7 +313,7 @@ Flax), PyTorch, and/or TensorFlow.
| DonutSwin | ❌ | ❌ | ✅ | ❌ | ❌ |
| DPR | ✅ | ✅ | ✅ | ✅ | ❌ |
| DPT | ❌ | ❌ | ✅ | ❌ | ❌ |
| EfficientFormer | ❌ | ❌ | ✅ | ❌ | ❌ |
| EfficientFormer | ❌ | ❌ | ✅ | ✅ | ❌ |
| EfficientNet | ❌ | ❌ | ✅ | ❌ | ❌ |
| ELECTRA | ✅ | ✅ | ✅ | ✅ | ✅ |
| Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ |
17 changes: 16 additions & 1 deletion docs/source/en/model_doc/efficientformer.mdx
@@ -37,7 +37,7 @@ EfficientFormer-L7, obtains 83.3% accuracy with only 7.0 ms latency. Our work pr
reach extremely low latency on mobile devices while maintaining high performance.*

This model was contributed by [novice03](https://huggingface.co/novice03) and [Bearnardd](https://huggingface.co/Bearnardd).
The original code can be found [here](https://github.com/snap-research/EfficientFormer).
The original code can be found [here](https://github.com/snap-research/EfficientFormer). The TensorFlow version of this model was added by [D-Roberts](https://huggingface.co/D-Roberts).

## Documentation resources

@@ -66,3 +66,18 @@ The original code can be found [here](https://github.com/snap-research/Efficient

[[autodoc]] EfficientFormerForImageClassificationWithTeacher
- forward

## TFEfficientFormerModel

[[autodoc]] TFEfficientFormerModel
- call

## TFEfficientFormerForImageClassification

[[autodoc]] TFEfficientFormerForImageClassification
- call

## TFEfficientFormerForImageClassificationWithTeacher

[[autodoc]] TFEfficientFormerForImageClassificationWithTeacher
- call
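
The three TF classes documented above can be exercised end to end once this commit lands. A minimal, hedged usage sketch (not taken from the diff), assuming the `snap-research/efficientformer-l1-300` checkpoint referenced in the modeling docstrings later in this commit:

```python
# Illustrative usage sketch only (not part of this diff); assumes the
# snap-research/efficientformer-l1-300 checkpoint named in the modeling docstrings.
import requests
import tensorflow as tf
from PIL import Image

from transformers import EfficientFormerImageProcessor, TFEfficientFormerForImageClassification

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = EfficientFormerImageProcessor.from_pretrained("snap-research/efficientformer-l1-300")
model = TFEfficientFormerForImageClassification.from_pretrained("snap-research/efficientformer-l1-300")

inputs = processor(images=image, return_tensors="tf")
outputs = model(**inputs)

# Pick the highest-scoring ImageNet class.
predicted_class_idx = int(tf.math.argmax(outputs.logits, axis=-1)[0])
print(model.config.id2label[predicted_class_idx])
```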
16 changes: 16 additions & 0 deletions src/transformers/__init__.py
@@ -3142,6 +3142,15 @@
"TFDPRReader",
]
)
_import_structure["models.efficientformer"].extend(
[
"TF_EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFEfficientFormerForImageClassification",
"TFEfficientFormerForImageClassificationWithTeacher",
"TFEfficientFormerModel",
"TFEfficientFormerPreTrainedModel",
]
)
_import_structure["models.electra"].extend(
[
"TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -6471,6 +6480,13 @@
TFDPRQuestionEncoder,
TFDPRReader,
)
from .models.efficientformer import (
TF_EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
TFEfficientFormerForImageClassification,
TFEfficientFormerForImageClassificationWithTeacher,
TFEfficientFormerModel,
TFEfficientFormerPreTrainedModel,
)
from .models.electra import (
TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST,
TFElectraForMaskedLM,
5 changes: 5 additions & 0 deletions src/transformers/models/auto/modeling_tf_auto.py
@@ -47,6 +47,7 @@
("deit", "TFDeiTModel"),
("distilbert", "TFDistilBertModel"),
("dpr", "TFDPRQuestionEncoder"),
("efficientformer", "TFEfficientFormerModel"),
("electra", "TFElectraModel"),
("esm", "TFEsmModel"),
("flaubert", "TFFlaubertModel"),
@@ -202,6 +203,10 @@
("cvt", "TFCvtForImageClassification"),
("data2vec-vision", "TFData2VecVisionForImageClassification"),
("deit", ("TFDeiTForImageClassification", "TFDeiTForImageClassificationWithTeacher")),
(
"efficientformer",
("TFEfficientFormerForImageClassification", "TFEfficientFormerForImageClassificationWithTeacher"),
),
("mobilevit", "TFMobileViTForImageClassification"),
("regnet", "TFRegNetForImageClassification"),
("resnet", "TFResNetForImageClassification"),
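
The two mapping entries added above register EfficientFormer with the TF auto classes. A short sketch of what that enables (illustrative only, same assumed checkpoint as above):

```python
# Illustrative only: with the "efficientformer" keys added above, the TF auto classes
# resolve the new model classes directly from the checkpoint's config.
from transformers import AutoImageProcessor, TFAutoModelForImageClassification

processor = AutoImageProcessor.from_pretrained("snap-research/efficientformer-l1-300")
model = TFAutoModelForImageClassification.from_pretrained("snap-research/efficientformer-l1-300")
print(type(model).__name__)  # TFEfficientFormerForImageClassification
```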
35 changes: 34 additions & 1 deletion src/transformers/models/efficientformer/__init__.py
@@ -13,7 +13,13 @@
# limitations under the License.
from typing import TYPE_CHECKING

from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_tf_available,
is_torch_available,
is_vision_available,
)


_import_structure = {
@@ -45,6 +51,20 @@
"EfficientFormerPreTrainedModel",
]

try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tf_efficientformer"] = [
"TF_EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFEfficientFormerForImageClassification",
"TFEfficientFormerForImageClassificationWithTeacher",
"TFEfficientFormerModel",
"TFEfficientFormerPreTrainedModel",
]

if TYPE_CHECKING:
from .configuration_efficientformer import EFFICIENTFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, EfficientFormerConfig

@@ -69,6 +89,19 @@
EfficientFormerModel,
EfficientFormerPreTrainedModel,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_efficientformer import (
TF_EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
TFEfficientFormerForImageClassification,
TFEfficientFormerForImageClassificationWithTeacher,
TFEfficientFormerModel,
TFEfficientFormerPreTrainedModel,
)

else:
import sys
src/transformers/models/efficientformer/configuration_efficientformer.py
@@ -52,7 +52,7 @@ class EfficientFormerConfig(PretrainedConfig):
The size of the key in meta3D block.
attention_ratio (`int`, *optional*, defaults to 4):
Ratio of the dimension of the query and value to the dimension of the key in MSHA block
resolution (`int`, *optional*, defaults to 5)
resolution (`int`, *optional*, defaults to 7)
Size of each patch
num_hidden_layers (`int`, *optional*, defaults to 5):
Number of hidden layers in the Transformer encoder.
@@ -91,6 +91,8 @@ class EfficientFormerConfig(PretrainedConfig):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
image_size (`int`, *optional*, defaults to `224`):
The size (resolution) of each image.
Example:
@@ -136,6 +138,8 @@ def __init__(
hidden_act: str = "gelu",
initializer_range: float = 0.02,
layer_norm_eps: float = 1e-12,
image_size: int = 224,
batch_norm_eps: float = 1e-05,
**kwargs,
) -> None:
super().__init__(**kwargs)
@@ -165,3 +169,5 @@ def __init__(
self.distillation = distillation
self.use_layer_scale = use_layer_scale
self.layer_scale_init_value = layer_scale_init_value
self.image_size = image_size
self.batch_norm_eps = batch_norm_eps
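
A quick sketch of the two configuration fields introduced in this file, with the defaults shown above (illustrative, not part of the diff):

```python
# Illustrative only: the two new EfficientFormerConfig fields and their defaults.
from transformers import EfficientFormerConfig

config = EfficientFormerConfig()
print(config.image_size)      # 224
print(config.batch_norm_eps)  # 1e-05
```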
37 changes: 24 additions & 13 deletions src/transformers/models/efficientformer/modeling_efficientformer.py
@@ -43,7 +43,7 @@

# Base docstring
_CHECKPOINT_FOR_DOC = "snap-research/efficientformer-l1-300"
_EXPECTED_OUTPUT_SHAPE = [1, 197, 768]
_EXPECTED_OUTPUT_SHAPE = [1, 49, 448]

# Image classification docstring
_IMAGE_CLASS_CHECKPOINT = "snap-research/efficientformer-l1-300"
@@ -73,7 +73,7 @@ def __init__(self, config: EfficientFormerConfig, num_channels: int, embed_dim:
stride=config.downsample_stride,
padding=config.downsample_pad,
)
self.norm = nn.BatchNorm2d(embed_dim) if apply_norm else nn.Identity()
self.norm = nn.BatchNorm2d(embed_dim, eps=config.batch_norm_eps) if apply_norm else nn.Identity()

def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
batch_size, num_channels, height, width = pixel_values.shape
@@ -157,10 +157,10 @@ def __init__(self, config: EfficientFormerConfig, out_channels: int):
super().__init__()

self.convolution1 = nn.Conv2d(config.num_channels, out_channels // 2, kernel_size=3, stride=2, padding=1)
self.batchnorm_before = nn.BatchNorm2d(out_channels // 2)
self.batchnorm_before = nn.BatchNorm2d(out_channels // 2, eps=config.batch_norm_eps)

self.convolution2 = nn.Conv2d(out_channels // 2, out_channels, kernel_size=3, stride=2, padding=1)
self.batchnorm_after = nn.BatchNorm2d(out_channels)
self.batchnorm_after = nn.BatchNorm2d(out_channels, eps=config.batch_norm_eps)

self.activation = nn.ReLU()

@@ -224,24 +224,24 @@ def __init__(
hidden_features = hidden_features or in_features

self.convolution1 = nn.Conv2d(in_features, hidden_features, 1)
self.actvation = ACT2FN[config.hidden_act]
self.activation = ACT2FN[config.hidden_act]
self.convolution2 = nn.Conv2d(hidden_features, out_features, 1)
self.dropout = nn.Dropout(drop)

self.batchnorm_before = nn.BatchNorm2d(hidden_features)
self.batchnorm_after = nn.BatchNorm2d(out_features)
self.batchnorm_before = nn.BatchNorm2d(hidden_features, eps=config.batch_norm_eps)
self.batchnorm_after = nn.BatchNorm2d(out_features, eps=config.batch_norm_eps)

def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
hidden_state = self.convolution1(hidden_state)
hidden_state = self.batchnorm_before(hidden_state)

hidden_state = self.actvation(hidden_state)
hidden_state = self.activation(hidden_state)
hidden_state = self.dropout(hidden_state)
hidden_state = self.convolution2(hidden_state)

hidden_state = self.batchnorm_after(hidden_state)

hidden_state = self.dropout(hidden_state)

return hidden_state


@@ -266,7 +266,7 @@ def drop_path(input, drop_prob: float = 0.0, training: bool = False):
return output


# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->Bit
# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->EfficientFormer
class EfficientFormerDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

@@ -301,8 +301,10 @@ def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0
attention_ratio=config.attention_ratio,
resolution=config.resolution,
)
self.layernorm1 = nn.LayerNorm(dim)
self.layernorm2 = nn.LayerNorm(dim)

self.layernorm1 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
self.layernorm2 = nn.LayerNorm(dim, eps=config.layer_norm_eps)

mlp_hidden_dim = int(dim * config.mlp_expansion_ratio)
self.mlp = EfficientFormerDenseMlp(config, in_features=dim, hidden_features=mlp_hidden_dim)

@@ -346,15 +348,20 @@ def __init__(self, config: EfficientFormerConfig):

def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> Tuple[torch.Tensor]:
all_attention_outputs = () if output_attentions else None

for layer_module in self.blocks:
if isinstance(hidden_states, tuple):
hidden_states = hidden_states[0]

hidden_states = layer_module(hidden_states, output_attentions)

if output_attentions:
all_attention_outputs = all_attention_outputs + (hidden_states[1],)

if output_attentions:
outputs = (hidden_states[0],) + all_attention_outputs
return outputs

return hidden_states


@@ -379,6 +386,7 @@ def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor]:

if self.use_layer_scale:
layer_output = hidden_states + self.drop_path(self.layer_scale_1.unsqueeze(-1).unsqueeze(-1) * outputs)

layer_output = layer_output + self.drop_path(
self.layer_scale_2.unsqueeze(-1).unsqueeze(-1) * self.mlp(layer_output)
)
@@ -398,6 +406,7 @@ def __init__(self, config: EfficientFormerConfig, stage_idx: int):
drop_paths = [
config.drop_path_rate * (block_idx + sum(config.depths[:stage_idx])) for block_idx in range(num_layers)
]

self.blocks = nn.ModuleList(
[
EfficientFormerMeta4D(config, config.hidden_sizes[stage_idx], drop_path=drop_path)
@@ -446,6 +455,7 @@ def __init__(self, config: EfficientFormerConfig):
for i in range(num_intermediate_stages)
]
intermediate_stages = []

for i in range(num_intermediate_stages):
intermediate_stages.append(EfficientFormerIntermediateStage(config, i))
if downsamples[i]:
@@ -475,14 +485,15 @@ def forward(
all_hidden_states = all_hidden_states + (hidden_states,)

layer_output = self.last_stage(hidden_states, output_attentions=output_attentions)

if output_attentions:
all_self_attentions = all_self_attentions + layer_output[1:]

if output_hidden_states:
all_hidden_states = all_hidden_states + (layer_output[0],)

if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return tuple(v for v in [layer_output[0], all_hidden_states, all_self_attentions] if v is not None)

return BaseModelOutput(
last_hidden_state=layer_output[0],
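
The last hunk above is the return-dict fix noted in the commit message: the tuple return path now yields the hidden state produced after the last stage (`layer_output[0]`) rather than the pre-stage `hidden_states`, matching the `BaseModelOutput` path. A hedged sanity check of what that implies (illustrative, not part of the commit's test changes):

```python
# Illustrative only: after the fix, the tuple and dict return paths agree on the final
# hidden state, whose documented shape for the L1 checkpoint is [1, 49, 448].
import torch
from transformers import EfficientFormerModel

model = EfficientFormerModel.from_pretrained("snap-research/efficientformer-l1-300")
model.eval()

pixel_values = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    dict_out = model(pixel_values, return_dict=True)
    tuple_out = model(pixel_values, return_dict=False)

assert torch.allclose(dict_out.last_hidden_state, tuple_out[0])
print(tuple_out[0].shape)  # torch.Size([1, 49, 448])
```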