diff --git a/Makefile b/Makefile index cfa40b7bd6ee6e..d3998327cc71f1 100644 --- a/Makefile +++ b/Makefile @@ -53,7 +53,6 @@ quality: @python -c "from transformers import *" || (echo '🚨 import failed, this means you introduced unprotected imports! 🚨'; exit 1) ruff check $(check_dirs) setup.py conftest.py ruff format --check $(check_dirs) setup.py conftest.py - python utils/custom_init_isort.py --check_only python utils/sort_auto_mappings.py --check_only python utils/check_doc_toc.py python utils/check_docstrings.py --check_all @@ -62,7 +61,6 @@ quality: # Format source code automatically and check is there are any problems left that need manual fixing extra_style_checks: - python utils/custom_init_isort.py python utils/sort_auto_mappings.py python utils/check_doc_toc.py --fix_and_overwrite diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 4f4b17ac84f1fb..faed0cea81d824 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1499,7 +1499,6 @@ "BertForQuestionAnswering", "BertForSequenceClassification", "BertForTokenClassification", - "BertLayer", "BertLMHeadModel", "BertModel", "BertPreTrainedModel", @@ -1523,7 +1522,6 @@ "BigBirdForQuestionAnswering", "BigBirdForSequenceClassification", "BigBirdForTokenClassification", - "BigBirdLayer", "BigBirdModel", "BigBirdPreTrainedModel", "load_tf_weights_in_big_bird", @@ -1642,7 +1640,6 @@ "CanineForQuestionAnswering", "CanineForSequenceClassification", "CanineForTokenClassification", - "CanineLayer", "CanineModel", "CaninePreTrainedModel", "load_tf_weights_in_canine", @@ -1729,7 +1726,6 @@ "ConvBertForQuestionAnswering", "ConvBertForSequenceClassification", "ConvBertForTokenClassification", - "ConvBertLayer", "ConvBertModel", "ConvBertPreTrainedModel", "load_tf_weights_in_convbert", @@ -1958,7 +1954,6 @@ "QDQBertForQuestionAnswering", "QDQBertForSequenceClassification", "QDQBertForTokenClassification", - "QDQBertLayer", "QDQBertLMHeadModel", "QDQBertModel", "QDQBertPreTrainedModel", @@ -2210,7 +2205,6 @@ "FNetForQuestionAnswering", "FNetForSequenceClassification", "FNetForTokenClassification", - "FNetLayer", "FNetModel", "FNetPreTrainedModel", ] @@ -2311,7 +2305,6 @@ "GPTNeoXForQuestionAnswering", "GPTNeoXForSequenceClassification", "GPTNeoXForTokenClassification", - "GPTNeoXLayer", "GPTNeoXModel", "GPTNeoXPreTrainedModel", ] @@ -2319,7 +2312,6 @@ _import_structure["models.gpt_neox_japanese"].extend( [ "GPTNeoXJapaneseForCausalLM", - "GPTNeoXJapaneseLayer", "GPTNeoXJapaneseModel", "GPTNeoXJapanesePreTrainedModel", ] @@ -2551,7 +2543,6 @@ "LongformerForTokenClassification", "LongformerModel", "LongformerPreTrainedModel", - "LongformerSelfAttention", ] ) _import_structure["models.longt5"].extend( @@ -2584,7 +2575,6 @@ "LxmertModel", "LxmertPreTrainedModel", "LxmertVisualFeatureEncoder", - "LxmertXLayer", ] ) _import_structure["models.m2m_100"].extend( @@ -2608,7 +2598,9 @@ "Mamba2PreTrainedModel", ] ) - _import_structure["models.marian"].extend(["MarianForCausalLM", "MarianModel", "MarianMTModel"]) + _import_structure["models.marian"].extend( + ["MarianForCausalLM", "MarianModel", "MarianMTModel", "MarianPreTrainedModel"] + ) _import_structure["models.markuplm"].extend( [ "MarkupLMForQuestionAnswering", @@ -2691,7 +2683,6 @@ "MobileBertForQuestionAnswering", "MobileBertForSequenceClassification", "MobileBertForTokenClassification", - "MobileBertLayer", "MobileBertModel", "MobileBertPreTrainedModel", "load_tf_weights_in_mobilebert", @@ -2737,7 +2728,6 @@ "MPNetForQuestionAnswering", 
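Context for the long run of removals in this `__init__.py` hunk: internal building blocks such as `BertLayer`, `ConvBertLayer`, and `LongformerSelfAttention` are no longer re-exported from the top-level `transformers` namespace. The classes themselves are untouched; as the test updates at the end of this diff show, they remain importable from their defining modules, and downstream code can migrate the same way:

```python
# Before this change (no longer available at the top level):
# from transformers import LongformerSelfAttention

# After this change, import from the defining module:
from transformers.models.longformer.modeling_longformer import LongformerSelfAttention
```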
"MPNetForSequenceClassification", "MPNetForTokenClassification", - "MPNetLayer", "MPNetModel", "MPNetPreTrainedModel", ] @@ -2827,7 +2817,6 @@ "NystromformerForQuestionAnswering", "NystromformerForSequenceClassification", "NystromformerForTokenClassification", - "NystromformerLayer", "NystromformerModel", "NystromformerPreTrainedModel", ] @@ -2941,7 +2930,6 @@ "PerceiverForMultimodalAutoencoding", "PerceiverForOpticalFlow", "PerceiverForSequenceClassification", - "PerceiverLayer", "PerceiverModel", "PerceiverPreTrainedModel", ] @@ -3077,11 +3065,9 @@ ) _import_structure["models.reformer"].extend( [ - "ReformerAttention", "ReformerForMaskedLM", "ReformerForQuestionAnswering", "ReformerForSequenceClassification", - "ReformerLayer", "ReformerModel", "ReformerModelWithLMHead", "ReformerPreTrainedModel", @@ -3102,7 +3088,6 @@ "RemBertForQuestionAnswering", "RemBertForSequenceClassification", "RemBertForTokenClassification", - "RemBertLayer", "RemBertModel", "RemBertPreTrainedModel", "load_tf_weights_in_rembert", @@ -3149,7 +3134,6 @@ "RoCBertForQuestionAnswering", "RoCBertForSequenceClassification", "RoCBertForTokenClassification", - "RoCBertLayer", "RoCBertModel", "RoCBertPreTrainedModel", "load_tf_weights_in_roc_bert", @@ -3163,7 +3147,6 @@ "RoFormerForQuestionAnswering", "RoFormerForSequenceClassification", "RoFormerForTokenClassification", - "RoFormerLayer", "RoFormerModel", "RoFormerPreTrainedModel", "load_tf_weights_in_roformer", @@ -3220,7 +3203,6 @@ "SegformerDecodeHead", "SegformerForImageClassification", "SegformerForSemanticSegmentation", - "SegformerLayer", "SegformerModel", "SegformerPreTrainedModel", ] @@ -3279,7 +3261,6 @@ [ "SplinterForPreTraining", "SplinterForQuestionAnswering", - "SplinterLayer", "SplinterModel", "SplinterPreTrainedModel", ] @@ -3292,7 +3273,6 @@ "SqueezeBertForSequenceClassification", "SqueezeBertForTokenClassification", "SqueezeBertModel", - "SqueezeBertModule", "SqueezeBertPreTrainedModel", ] ) @@ -3491,7 +3471,6 @@ "ViltForMaskedLM", "ViltForQuestionAnswering", "ViltForTokenClassification", - "ViltLayer", "ViltModel", "ViltPreTrainedModel", ] @@ -3511,7 +3490,6 @@ "VisualBertForQuestionAnswering", "VisualBertForRegionToPhraseAlignment", "VisualBertForVisualReasoning", - "VisualBertLayer", "VisualBertModel", "VisualBertPreTrainedModel", ] @@ -3527,7 +3505,6 @@ _import_structure["models.vit_mae"].extend( [ "ViTMAEForPreTraining", - "ViTMAELayer", "ViTMAEModel", "ViTMAEPreTrainedModel", ] @@ -3707,7 +3684,6 @@ "YosoForQuestionAnswering", "YosoForSequenceClassification", "YosoForTokenClassification", - "YosoLayer", "YosoModel", "YosoPreTrainedModel", ] @@ -3854,7 +3830,6 @@ ) _import_structure["models.bert"].extend( [ - "TFBertEmbeddings", "TFBertForMaskedLM", "TFBertForMultipleChoice", "TFBertForNextSentencePrediction", @@ -3920,7 +3895,6 @@ "TFConvBertForQuestionAnswering", "TFConvBertForSequenceClassification", "TFConvBertForTokenClassification", - "TFConvBertLayer", "TFConvBertModel", "TFConvBertPreTrainedModel", ] @@ -4151,7 +4125,6 @@ "TFLongformerForTokenClassification", "TFLongformerModel", "TFLongformerPreTrainedModel", - "TFLongformerSelfAttention", ] ) _import_structure["models.lxmert"].extend( @@ -4252,7 +4225,6 @@ "TFRemBertForQuestionAnswering", "TFRemBertForSequenceClassification", "TFRemBertForTokenClassification", - "TFRemBertLayer", "TFRemBertModel", "TFRemBertPreTrainedModel", ] @@ -4298,7 +4270,6 @@ "TFRoFormerForQuestionAnswering", "TFRoFormerForSequenceClassification", "TFRoFormerForTokenClassification", - "TFRoFormerLayer", 
"TFRoFormerModel", "TFRoFormerPreTrainedModel", ] @@ -5827,7 +5798,8 @@ from .models.llama import LlamaTokenizer from .models.m2m_100 import M2M100Tokenizer from .models.marian import MarianTokenizer - from .models.mbart import MBart50Tokenizer, MBartTokenizer + from .models.mbart import MBartTokenizer + from .models.mbart50 import MBart50Tokenizer from .models.mluke import MLukeTokenizer from .models.mt5 import MT5Tokenizer from .models.nllb import NllbTokenizer @@ -6298,7 +6270,6 @@ BertForQuestionAnswering, BertForSequenceClassification, BertForTokenClassification, - BertLayer, BertLMHeadModel, BertModel, BertPreTrainedModel, @@ -6318,7 +6289,6 @@ BigBirdForQuestionAnswering, BigBirdForSequenceClassification, BigBirdForTokenClassification, - BigBirdLayer, BigBirdModel, BigBirdPreTrainedModel, load_tf_weights_in_big_bird, @@ -6413,7 +6383,6 @@ CanineForQuestionAnswering, CanineForSequenceClassification, CanineForTokenClassification, - CanineLayer, CanineModel, CaninePreTrainedModel, load_tf_weights_in_canine, @@ -6486,7 +6455,6 @@ ConvBertForQuestionAnswering, ConvBertForSequenceClassification, ConvBertForTokenClassification, - ConvBertLayer, ConvBertModel, ConvBertPreTrainedModel, load_tf_weights_in_convbert, @@ -6671,7 +6639,6 @@ QDQBertForQuestionAnswering, QDQBertForSequenceClassification, QDQBertForTokenClassification, - QDQBertLayer, QDQBertLMHeadModel, QDQBertModel, QDQBertPreTrainedModel, @@ -6870,7 +6837,6 @@ FNetForQuestionAnswering, FNetForSequenceClassification, FNetForTokenClassification, - FNetLayer, FNetModel, FNetPreTrainedModel, ) @@ -6958,13 +6924,11 @@ GPTNeoXForQuestionAnswering, GPTNeoXForSequenceClassification, GPTNeoXForTokenClassification, - GPTNeoXLayer, GPTNeoXModel, GPTNeoXPreTrainedModel, ) from .models.gpt_neox_japanese import ( GPTNeoXJapaneseForCausalLM, - GPTNeoXJapaneseLayer, GPTNeoXJapaneseModel, GPTNeoXJapanesePreTrainedModel, ) @@ -7140,7 +7104,6 @@ LongformerForTokenClassification, LongformerModel, LongformerPreTrainedModel, - LongformerSelfAttention, ) from .models.longt5 import ( LongT5EncoderModel, @@ -7167,7 +7130,6 @@ LxmertModel, LxmertPreTrainedModel, LxmertVisualFeatureEncoder, - LxmertXLayer, ) from .models.m2m_100 import ( M2M100ForConditionalGeneration, @@ -7184,7 +7146,7 @@ Mamba2Model, Mamba2PreTrainedModel, ) - from .models.marian import MarianForCausalLM, MarianModel, MarianMTModel + from .models.marian import MarianForCausalLM, MarianModel, MarianMTModel, MarianPreTrainedModel from .models.markuplm import ( MarkupLMForQuestionAnswering, MarkupLMForSequenceClassification, @@ -7250,7 +7212,6 @@ MobileBertForQuestionAnswering, MobileBertForSequenceClassification, MobileBertForTokenClassification, - MobileBertLayer, MobileBertModel, MobileBertPreTrainedModel, load_tf_weights_in_mobilebert, @@ -7286,7 +7247,6 @@ MPNetForQuestionAnswering, MPNetForSequenceClassification, MPNetForTokenClassification, - MPNetLayer, MPNetModel, MPNetPreTrainedModel, ) @@ -7358,7 +7318,6 @@ NystromformerForQuestionAnswering, NystromformerForSequenceClassification, NystromformerForTokenClassification, - NystromformerLayer, NystromformerModel, NystromformerPreTrainedModel, ) @@ -7446,7 +7405,6 @@ PerceiverForMultimodalAutoencoding, PerceiverForOpticalFlow, PerceiverForSequenceClassification, - PerceiverLayer, PerceiverModel, PerceiverPreTrainedModel, ) @@ -7548,11 +7506,9 @@ RecurrentGemmaPreTrainedModel, ) from .models.reformer import ( - ReformerAttention, ReformerForMaskedLM, ReformerForQuestionAnswering, ReformerForSequenceClassification, - ReformerLayer, 
ReformerModel, ReformerModelWithLMHead, ReformerPreTrainedModel, @@ -7569,7 +7525,6 @@ RemBertForQuestionAnswering, RemBertForSequenceClassification, RemBertForTokenClassification, - RemBertLayer, RemBertModel, RemBertPreTrainedModel, load_tf_weights_in_rembert, @@ -7608,7 +7563,6 @@ RoCBertForQuestionAnswering, RoCBertForSequenceClassification, RoCBertForTokenClassification, - RoCBertLayer, RoCBertModel, RoCBertPreTrainedModel, load_tf_weights_in_roc_bert, @@ -7620,7 +7574,6 @@ RoFormerForQuestionAnswering, RoFormerForSequenceClassification, RoFormerForTokenClassification, - RoFormerLayer, RoFormerModel, RoFormerPreTrainedModel, load_tf_weights_in_roformer, @@ -7665,7 +7618,6 @@ SegformerDecodeHead, SegformerForImageClassification, SegformerForSemanticSegmentation, - SegformerLayer, SegformerModel, SegformerPreTrainedModel, ) @@ -7710,7 +7662,6 @@ from .models.splinter import ( SplinterForPreTraining, SplinterForQuestionAnswering, - SplinterLayer, SplinterModel, SplinterPreTrainedModel, ) @@ -7721,7 +7672,6 @@ SqueezeBertForSequenceClassification, SqueezeBertForTokenClassification, SqueezeBertModel, - SqueezeBertModule, SqueezeBertPreTrainedModel, ) from .models.stablelm import ( @@ -7870,7 +7820,6 @@ ViltForMaskedLM, ViltForQuestionAnswering, ViltForTokenClassification, - ViltLayer, ViltModel, ViltPreTrainedModel, ) @@ -7886,7 +7835,6 @@ VisualBertForQuestionAnswering, VisualBertForRegionToPhraseAlignment, VisualBertForVisualReasoning, - VisualBertLayer, VisualBertModel, VisualBertPreTrainedModel, ) @@ -7898,7 +7846,6 @@ ) from .models.vit_mae import ( ViTMAEForPreTraining, - ViTMAELayer, ViTMAEModel, ViTMAEPreTrainedModel, ) @@ -8040,7 +7987,6 @@ YosoForQuestionAnswering, YosoForSequenceClassification, YosoForTokenClassification, - YosoLayer, YosoModel, YosoPreTrainedModel, ) @@ -8174,7 +8120,6 @@ TFBartPretrainedModel, ) from .models.bert import ( - TFBertEmbeddings, TFBertForMaskedLM, TFBertForMultipleChoice, TFBertForNextSentencePrediction, @@ -8228,7 +8173,6 @@ TFConvBertForQuestionAnswering, TFConvBertForSequenceClassification, TFConvBertForTokenClassification, - TFConvBertLayer, TFConvBertModel, TFConvBertPreTrainedModel, ) @@ -8413,7 +8357,6 @@ TFLongformerForTokenClassification, TFLongformerModel, TFLongformerPreTrainedModel, - TFLongformerSelfAttention, ) from .models.lxmert import ( TFLxmertForPreTraining, @@ -8503,7 +8446,6 @@ TFRemBertForQuestionAnswering, TFRemBertForSequenceClassification, TFRemBertForTokenClassification, - TFRemBertLayer, TFRemBertModel, TFRemBertPreTrainedModel, ) @@ -8541,7 +8483,6 @@ TFRoFormerForQuestionAnswering, TFRoFormerForSequenceClassification, TFRoFormerForTokenClassification, - TFRoFormerLayer, TFRoFormerModel, TFRoFormerPreTrainedModel, ) diff --git a/src/transformers/models/albert/__init__.py b/src/transformers/models/albert/__init__.py index 1d0a4a4d02845c..57b5747909e091 100644 --- a/src/transformers/models/albert/__init__.py +++ b/src/transformers/models/albert/__init__.py @@ -11,165 +11,21 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
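The albert package rewrite below is the template for the migration: its ~140 lines of try/except `OptionalDependencyNotAvailable` gating collapse into star imports under `TYPE_CHECKING` plus a `_LazyModule` built from `define_import_structure` (added to `import_utils.py` later in this diff). Under the new convention a submodule only declares what it exports; a minimal sketch, using a hypothetical module name:

```python
# Hypothetical file src/transformers/models/albert/sparsity_albert.py.
# Listing a name in __all__ (or decorating it with @export) is what lets
# define_import_structure() expose it through the package's _LazyModule;
# no per-backend try/except is needed in the package __init__.py anymore.


class AlbertSparsifier:
    def __init__(self, model):
        self.model = model


__all__ = ["AlbertSparsifier"]
```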
- from typing import TYPE_CHECKING -from ...utils import ( - OptionalDependencyNotAvailable, - _LazyModule, - is_flax_available, - is_sentencepiece_available, - is_tf_available, - is_tokenizers_available, - is_torch_available, -) - +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure -_import_structure = { - "configuration_albert": ["AlbertConfig", "AlbertOnnxConfig"], -} - -try: - if not is_sentencepiece_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["tokenization_albert"] = ["AlbertTokenizer"] - -try: - if not is_tokenizers_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["tokenization_albert_fast"] = ["AlbertTokenizerFast"] - -try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_albert"] = [ - "AlbertForMaskedLM", - "AlbertForMultipleChoice", - "AlbertForPreTraining", - "AlbertForQuestionAnswering", - "AlbertForSequenceClassification", - "AlbertForTokenClassification", - "AlbertModel", - "AlbertPreTrainedModel", - "load_tf_weights_in_albert", - ] - -try: - if not is_tf_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_tf_albert"] = [ - "TFAlbertForMaskedLM", - "TFAlbertForMultipleChoice", - "TFAlbertForPreTraining", - "TFAlbertForQuestionAnswering", - "TFAlbertForSequenceClassification", - "TFAlbertForTokenClassification", - "TFAlbertMainLayer", - "TFAlbertModel", - "TFAlbertPreTrainedModel", - ] - -try: - if not is_flax_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_flax_albert"] = [ - "FlaxAlbertForMaskedLM", - "FlaxAlbertForMultipleChoice", - "FlaxAlbertForPreTraining", - "FlaxAlbertForQuestionAnswering", - "FlaxAlbertForSequenceClassification", - "FlaxAlbertForTokenClassification", - "FlaxAlbertModel", - "FlaxAlbertPreTrainedModel", - ] if TYPE_CHECKING: - from .configuration_albert import AlbertConfig, AlbertOnnxConfig - - try: - if not is_sentencepiece_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .tokenization_albert import AlbertTokenizer - - try: - if not is_tokenizers_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .tokenization_albert_fast import AlbertTokenizerFast - - try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_albert import ( - AlbertForMaskedLM, - AlbertForMultipleChoice, - AlbertForPreTraining, - AlbertForQuestionAnswering, - AlbertForSequenceClassification, - AlbertForTokenClassification, - AlbertModel, - AlbertPreTrainedModel, - load_tf_weights_in_albert, - ) - - try: - if not is_tf_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_tf_albert import ( - TFAlbertForMaskedLM, - TFAlbertForMultipleChoice, - TFAlbertForPreTraining, - TFAlbertForQuestionAnswering, - TFAlbertForSequenceClassification, - TFAlbertForTokenClassification, - TFAlbertMainLayer, - TFAlbertModel, - TFAlbertPreTrainedModel, - ) - - try: - if not is_flax_available(): - raise OptionalDependencyNotAvailable() 
- except OptionalDependencyNotAvailable: - pass - else: - from .modeling_flax_albert import ( - FlaxAlbertForMaskedLM, - FlaxAlbertForMultipleChoice, - FlaxAlbertForPreTraining, - FlaxAlbertForQuestionAnswering, - FlaxAlbertForSequenceClassification, - FlaxAlbertForTokenClassification, - FlaxAlbertModel, - FlaxAlbertPreTrainedModel, - ) + from .configuration_albert import * + from .modeling_albert import * + from .modeling_flax_albert import * + from .modeling_tf_albert import * + from .tokenization_albert import * + from .tokenization_albert_fast import * else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/albert/configuration_albert.py b/src/transformers/models/albert/configuration_albert.py index bae88486e10209..e1e2d4547cc4e2 100644 --- a/src/transformers/models/albert/configuration_albert.py +++ b/src/transformers/models/albert/configuration_albert.py @@ -165,3 +165,6 @@ def inputs(self) -> Mapping[str, Mapping[int, str]]: ("token_type_ids", dynamic_axis), ] ) + + +__all__ = ["AlbertConfig", "AlbertOnnxConfig"] diff --git a/src/transformers/models/albert/modeling_albert.py b/src/transformers/models/albert/modeling_albert.py index 6ccb266009e193..dca1fe7f600295 100755 --- a/src/transformers/models/albert/modeling_albert.py +++ b/src/transformers/models/albert/modeling_albert.py @@ -1466,3 +1466,16 @@ def forward( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + + +__all__ = [ + "load_tf_weights_in_albert", + "AlbertPreTrainedModel", + "AlbertModel", + "AlbertForPreTraining", + "AlbertForMaskedLM", + "AlbertForSequenceClassification", + "AlbertForTokenClassification", + "AlbertForQuestionAnswering", + "AlbertForMultipleChoice", +] diff --git a/src/transformers/models/albert/modeling_flax_albert.py b/src/transformers/models/albert/modeling_flax_albert.py index b2c01ded3619ca..b5b49219aebf63 100644 --- a/src/transformers/models/albert/modeling_flax_albert.py +++ b/src/transformers/models/albert/modeling_flax_albert.py @@ -1119,3 +1119,14 @@ class FlaxAlbertForQuestionAnswering(FlaxAlbertPreTrainedModel): FlaxQuestionAnsweringModelOutput, _CONFIG_FOR_DOC, ) + +__all__ = [ + "FlaxAlbertPreTrainedModel", + "FlaxAlbertModel", + "FlaxAlbertForPreTraining", + "FlaxAlbertForMaskedLM", + "FlaxAlbertForSequenceClassification", + "FlaxAlbertForMultipleChoice", + "FlaxAlbertForTokenClassification", + "FlaxAlbertForQuestionAnswering", +] diff --git a/src/transformers/models/albert/modeling_tf_albert.py b/src/transformers/models/albert/modeling_tf_albert.py index 3a50eeb20ea750..24a25658a4d41a 100644 --- a/src/transformers/models/albert/modeling_tf_albert.py +++ b/src/transformers/models/albert/modeling_tf_albert.py @@ -1558,3 +1558,16 @@ def build(self, input_shape=None): if getattr(self, "classifier", None) is not None: with tf.name_scope(self.classifier.name): self.classifier.build([None, None, self.config.hidden_size]) + + +__all__ = [ + "TFAlbertPreTrainedModel", + "TFAlbertModel", + "TFAlbertForPreTraining", + "TFAlbertForMaskedLM", + "TFAlbertForSequenceClassification", + "TFAlbertForTokenClassification", + "TFAlbertForQuestionAnswering", + "TFAlbertForMultipleChoice", + "TFAlbertMainLayer", +] diff --git a/src/transformers/models/albert/tokenization_albert.py b/src/transformers/models/albert/tokenization_albert.py index 
4068c7aad87635..4971d0511f47bd 100644 --- a/src/transformers/models/albert/tokenization_albert.py +++ b/src/transformers/models/albert/tokenization_albert.py @@ -23,6 +23,7 @@ from ...tokenization_utils import AddedToken, PreTrainedTokenizer from ...utils import logging +from ...utils.import_utils import export logger = logging.get_logger(__name__) @@ -32,6 +33,7 @@ SPIECE_UNDERLINE = "▁" +@export(backends=("sentencepiece",)) class AlbertTokenizer(PreTrainedTokenizer): """ Construct an ALBERT tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece). @@ -343,3 +345,6 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = fi.write(content_spiece_model) return (out_vocab_file,) + + +__all__ = ["AlbertTokenizer"] diff --git a/src/transformers/models/albert/tokenization_albert_fast.py b/src/transformers/models/albert/tokenization_albert_fast.py index eadfdcecfc5c28..6e7b110b0afad7 100644 --- a/src/transformers/models/albert/tokenization_albert_fast.py +++ b/src/transformers/models/albert/tokenization_albert_fast.py @@ -207,3 +207,6 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = copyfile(self.vocab_file, out_vocab_file) return (out_vocab_file,) + + +__all__ = ["AlbertTokenizerFast"] diff --git a/src/transformers/models/align/__init__.py b/src/transformers/models/align/__init__.py index 650b25c3e5d1ee..aaa64dfb6064b1 100644 --- a/src/transformers/models/align/__init__.py +++ b/src/transformers/models/align/__init__.py @@ -13,57 +13,16 @@ # limitations under the License. from typing import TYPE_CHECKING -from ...utils import ( - OptionalDependencyNotAvailable, - _LazyModule, - is_torch_available, -) +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure -_import_structure = { - "configuration_align": [ - "AlignConfig", - "AlignTextConfig", - "AlignVisionConfig", - ], - "processing_align": ["AlignProcessor"], -} - -try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_align"] = [ - "AlignModel", - "AlignPreTrainedModel", - "AlignTextModel", - "AlignVisionModel", - ] - if TYPE_CHECKING: - from .configuration_align import ( - AlignConfig, - AlignTextConfig, - AlignVisionConfig, - ) - from .processing_align import AlignProcessor - - try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_align import ( - AlignModel, - AlignPreTrainedModel, - AlignTextModel, - AlignVisionModel, - ) - + from .configuration_align import * + from .modeling_align import * + from .processing_align import * else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/align/configuration_align.py b/src/transformers/models/align/configuration_align.py index efec77b4b31280..99fa81b4a9350d 100644 --- a/src/transformers/models/align/configuration_align.py +++ b/src/transformers/models/align/configuration_align.py @@ -378,3 +378,6 @@ def from_text_vision_configs(cls, text_config: AlignTextConfig, vision_config: A """ return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs) + + +__all__ = ["AlignTextConfig", "AlignVisionConfig", 
"AlignConfig"] diff --git a/src/transformers/models/align/modeling_align.py b/src/transformers/models/align/modeling_align.py index 1b744d0f208d46..dea035618a3341 100644 --- a/src/transformers/models/align/modeling_align.py +++ b/src/transformers/models/align/modeling_align.py @@ -1636,3 +1636,6 @@ def forward( text_model_output=text_outputs, vision_model_output=vision_outputs, ) + + +__all__ = ["AlignPreTrainedModel", "AlignTextModel", "AlignVisionModel", "AlignModel"] diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index 5fdaf051404845..a5846a87d23696 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -162,3 +162,6 @@ def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names image_processor_input_names = self.image_processor.model_input_names return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + +__all__ = ["AlignProcessor"] diff --git a/src/transformers/models/altclip/__init__.py b/src/transformers/models/altclip/__init__.py index 4e3cb99bbb16c9..a30de8a2527567 100755 --- a/src/transformers/models/altclip/__init__.py +++ b/src/transformers/models/altclip/__init__.py @@ -13,55 +13,16 @@ # limitations under the License. from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available - - -_import_structure = { - "configuration_altclip": [ - "AltCLIPConfig", - "AltCLIPTextConfig", - "AltCLIPVisionConfig", - ], - "processing_altclip": ["AltCLIPProcessor"], -} - -try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_altclip"] = [ - "AltCLIPPreTrainedModel", - "AltCLIPModel", - "AltCLIPTextModel", - "AltCLIPVisionModel", - ] +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure if TYPE_CHECKING: - from .configuration_altclip import ( - AltCLIPConfig, - AltCLIPTextConfig, - AltCLIPVisionConfig, - ) - from .processing_altclip import AltCLIPProcessor - - try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_altclip import ( - AltCLIPModel, - AltCLIPPreTrainedModel, - AltCLIPTextModel, - AltCLIPVisionModel, - ) - - + from .configuration_altclip import * + from .modeling_altclip import * + from .processing_altclip import * else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/altclip/configuration_altclip.py b/src/transformers/models/altclip/configuration_altclip.py index 1cefeccd347ab8..7333fa63a35280 100755 --- a/src/transformers/models/altclip/configuration_altclip.py +++ b/src/transformers/models/altclip/configuration_altclip.py @@ -398,3 +398,6 @@ def from_text_vision_configs(cls, text_config: AltCLIPTextConfig, vision_config: """ return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs) + + +__all__ = ["AltCLIPTextConfig", "AltCLIPVisionConfig", "AltCLIPConfig"] diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index 
0d344cc54b137f..4ed0930605e899 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -1694,3 +1694,6 @@ def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_l mask = input_ids.ne(padding_idx).int() incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask return incremental_indices.long() + padding_idx + + +__all__ = ["AltCLIPPreTrainedModel", "AltCLIPVisionModel", "AltCLIPTextModel", "AltCLIPModel"] diff --git a/src/transformers/models/altclip/processing_altclip.py b/src/transformers/models/altclip/processing_altclip.py index 2814b2d7f26e89..5343498842832c 100644 --- a/src/transformers/models/altclip/processing_altclip.py +++ b/src/transformers/models/altclip/processing_altclip.py @@ -130,3 +130,6 @@ def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names image_processor_input_names = self.image_processor.model_input_names return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + +__all__ = ["AltCLIPProcessor"] diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index ddf7608155e843..5c84f97319ecbb 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1262,13 +1262,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class BertLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class BertLMHeadModel(metaclass=DummyObject): _backends = ["torch"] @@ -1368,13 +1361,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class BigBirdLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class BigBirdModel(metaclass=DummyObject): _backends = ["torch"] @@ -1862,13 +1848,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class CanineLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class CanineModel(metaclass=DummyObject): _backends = ["torch"] @@ -2230,13 +2209,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class ConvBertLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class ConvBertModel(metaclass=DummyObject): _backends = ["torch"] @@ -3144,13 +3116,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class QDQBertLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class QDQBertLMHeadModel(metaclass=DummyObject): _backends = ["torch"] @@ -4133,13 +4098,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class FNetLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class FNetModel(metaclass=DummyObject): _backends = ["torch"] @@ -4572,13 +4530,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class GPTNeoXLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class GPTNeoXModel(metaclass=DummyObject): _backends = ["torch"] @@ -4600,13 +4551,6 @@ 
def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class GPTNeoXJapaneseLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class GPTNeoXJapaneseModel(metaclass=DummyObject): _backends = ["torch"] @@ -5437,13 +5381,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class LongformerSelfAttention(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class LongT5EncoderModel(metaclass=DummyObject): _backends = ["torch"] @@ -5584,13 +5521,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class LxmertXLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class M2M100ForConditionalGeneration(metaclass=DummyObject): _backends = ["torch"] @@ -5675,6 +5605,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class MarianPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class MarkupLMForQuestionAnswering(metaclass=DummyObject): _backends = ["torch"] @@ -6011,13 +5948,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class MobileBertLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class MobileBertModel(metaclass=DummyObject): _backends = ["torch"] @@ -6184,13 +6114,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class MPNetLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class MPNetModel(metaclass=DummyObject): _backends = ["torch"] @@ -6562,13 +6485,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class NystromformerLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class NystromformerModel(metaclass=DummyObject): _backends = ["torch"] @@ -6993,13 +6909,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class PerceiverLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class PerceiverModel(metaclass=DummyObject): _backends = ["torch"] @@ -7469,13 +7378,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class ReformerAttention(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class ReformerForMaskedLM(metaclass=DummyObject): _backends = ["torch"] @@ -7497,13 +7399,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class ReformerLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class ReformerModel(metaclass=DummyObject): _backends = ["torch"] @@ -7588,13 +7483,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class RemBertLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class RemBertModel(metaclass=DummyObject): _backends = ["torch"] @@ -7802,13 +7690,6 @@ def __init__(self, *args, **kwargs): 
requires_backends(self, ["torch"]) -class RoCBertLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class RoCBertModel(metaclass=DummyObject): _backends = ["torch"] @@ -7869,13 +7750,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class RoFormerLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class RoFormerModel(metaclass=DummyObject): _backends = ["torch"] @@ -8097,13 +7971,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class SegformerLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class SegformerModel(metaclass=DummyObject): _backends = ["torch"] @@ -8314,13 +8181,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class SplinterLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class SplinterModel(metaclass=DummyObject): _backends = ["torch"] @@ -8377,13 +8237,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class SqueezeBertModule(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class SqueezeBertPreTrainedModel(metaclass=DummyObject): _backends = ["torch"] @@ -9092,13 +8945,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class ViltLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class ViltModel(metaclass=DummyObject): _backends = ["torch"] @@ -9176,13 +9022,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class VisualBertLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class VisualBertModel(metaclass=DummyObject): _backends = ["torch"] @@ -9232,13 +9071,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class ViTMAELayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class ViTMAEModel(metaclass=DummyObject): _backends = ["torch"] @@ -9957,13 +9789,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class YosoLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class YosoModel(metaclass=DummyObject): _backends = ["torch"] diff --git a/src/transformers/utils/dummy_sentencepiece_objects.py b/src/transformers/utils/dummy_sentencepiece_objects.py index 8977b4f51b6308..7931e0fe6584bb 100644 --- a/src/transformers/utils/dummy_sentencepiece_objects.py +++ b/src/transformers/utils/dummy_sentencepiece_objects.py @@ -128,14 +128,14 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["sentencepiece"]) -class MBart50Tokenizer(metaclass=DummyObject): +class MBartTokenizer(metaclass=DummyObject): _backends = ["sentencepiece"] def __init__(self, *args, **kwargs): requires_backends(self, ["sentencepiece"]) -class MBartTokenizer(metaclass=DummyObject): +class MBart50Tokenizer(metaclass=DummyObject): _backends = ["sentencepiece"] def __init__(self, *args, **kwargs): diff --git 
a/src/transformers/utils/dummy_tf_objects.py b/src/transformers/utils/dummy_tf_objects.py index 942a7afced4bc3..6e1674c9173e78 100644 --- a/src/transformers/utils/dummy_tf_objects.py +++ b/src/transformers/utils/dummy_tf_objects.py @@ -478,13 +478,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["tf"]) -class TFBertEmbeddings(metaclass=DummyObject): - _backends = ["tf"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["tf"]) - - class TFBertForMaskedLM(metaclass=DummyObject): _backends = ["tf"] @@ -772,13 +765,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["tf"]) -class TFConvBertLayer(metaclass=DummyObject): - _backends = ["tf"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["tf"]) - - class TFConvBertModel(metaclass=DummyObject): _backends = ["tf"] @@ -1717,13 +1703,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["tf"]) -class TFLongformerSelfAttention(metaclass=DummyObject): - _backends = ["tf"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["tf"]) - - class TFLxmertForPreTraining(metaclass=DummyObject): _backends = ["tf"] @@ -2179,13 +2158,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["tf"]) -class TFRemBertLayer(metaclass=DummyObject): - _backends = ["tf"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["tf"]) - - class TFRemBertModel(metaclass=DummyObject): _backends = ["tf"] @@ -2389,13 +2361,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["tf"]) -class TFRoFormerLayer(metaclass=DummyObject): - _backends = ["tf"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["tf"]) - - class TFRoFormerModel(metaclass=DummyObject): _backends = ["tf"] diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index 8ae133d0ffe0f9..092e43b489f4af 100755 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -15,6 +15,7 @@ Import utilities: Utilities related to imports and our lazy inits. """ +import importlib.machinery import importlib.metadata import importlib.util import json @@ -27,7 +28,7 @@ from functools import lru_cache from itertools import chain from types import ModuleType -from typing import Any, Optional, Tuple, Union +from typing import Any, Dict, FrozenSet, Optional, Set, Tuple, Union from packaging import version @@ -1375,6 +1376,11 @@ def is_liger_kernel_available(): Please note that you may need to restart your runtime after installation. """ +# docstyle-ignore +TORCHAUDIO_IMPORT_ERROR = """ +{0} requires the torchaudio library but it was not found in your environment. Please install it and restart your +runtime. 
+""" # docstyle-ignore PANDAS_IMPORT_ERROR = """ @@ -1550,6 +1556,7 @@ def is_liger_kernel_available(): ("tf", (is_tf_available, TENSORFLOW_IMPORT_ERROR)), ("tensorflow_text", (is_tensorflow_text_available, TENSORFLOW_TEXT_IMPORT_ERROR)), ("timm", (is_timm_available, TIMM_IMPORT_ERROR)), + ("torchaudio", (is_torchaudio_available, TORCHAUDIO_IMPORT_ERROR)), ("natten", (is_natten_available, NATTEN_IMPORT_ERROR)), ("nltk", (is_nltk_available, NLTK_IMPORT_ERROR)), ("tokenizers", (is_tokenizers_available, TOKENIZERS_IMPORT_ERROR)), @@ -1608,6 +1615,10 @@ def is_torch_fx_proxy(x): return False +BACKENDS_T = FrozenSet[str] +IMPORT_STRUCTURE_T = Dict[BACKENDS_T, Dict[str, Set[str]]] + + class _LazyModule(ModuleType): """ Module class that surfaces all objects but only performs associated imports when the objects are requested. @@ -1615,21 +1626,71 @@ class _LazyModule(ModuleType): # Very heavily inspired by optuna.integration._IntegrationModule # https://github.com/optuna/optuna/blob/master/optuna/integration/__init__.py - def __init__(self, name, module_file, import_structure, module_spec=None, extra_objects=None): + def __init__( + self, + name: str, + module_file: str, + import_structure: IMPORT_STRUCTURE_T, + module_spec: importlib.machinery.ModuleSpec = None, + extra_objects: Dict[str, object] = None, + ): super().__init__(name) - self._modules = set(import_structure.keys()) - self._class_to_module = {} - for key, values in import_structure.items(): - for value in values: - self._class_to_module[value] = key - # Needed for autocompletion in an IDE - self.__all__ = list(import_structure.keys()) + list(chain(*import_structure.values())) - self.__file__ = module_file - self.__spec__ = module_spec - self.__path__ = [os.path.dirname(module_file)] - self._objects = {} if extra_objects is None else extra_objects - self._name = name - self._import_structure = import_structure + + self._object_missing_backend = {} + if any(isinstance(key, frozenset) for key in import_structure.keys()): + self._modules = set() + self._class_to_module = {} + self.__all__ = [] + + _import_structure = {} + + for backends, module in import_structure.items(): + missing_backends = [] + for backend in backends: + if backend not in BACKENDS_MAPPING: + raise ValueError( + f"Error: the following backend: '{backend}' was specified around object {module} but isn't specified in the backends mapping." + ) + callable, error = BACKENDS_MAPPING[backend] + if not callable(): + missing_backends.append(backend) + self._modules = self._modules.union(set(module.keys())) + + for key, values in module.items(): + if len(missing_backends): + self._object_missing_backend[key] = missing_backends + + for value in values: + self._class_to_module[value] = key + if len(missing_backends): + self._object_missing_backend[value] = missing_backends + _import_structure.setdefault(key, []).extend(values) + + # Needed for autocompletion in an IDE + self.__all__.extend(list(module.keys()) + list(chain(*module.values()))) + + self.__file__ = module_file + self.__spec__ = module_spec + self.__path__ = [os.path.dirname(module_file)] + self._objects = {} if extra_objects is None else extra_objects + self._name = name + self._import_structure = _import_structure + + # This can be removed once every exportable object has an `export()` decorator.
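For reference, the two import-structure shapes `_LazyModule.__init__` now accepts: the frozenset-keyed structure built by `define_import_structure` (handled above) and the legacy flat mapping (handled in the `else` branch that follows). A sketch based on the docstring examples later in this diff:

```python
# New-style structure: required backends -> module name -> set of object names.
new_style = {
    frozenset(): {
        "configuration_albert": {"AlbertConfig", "AlbertOnnxConfig"},
    },
    frozenset({"sentencepiece"}): {
        "tokenization_albert": {"AlbertTokenizer"},
    },
}

# Legacy structure, still accepted during the migration: module name -> list of object names.
legacy = {
    "configuration_albert": ["AlbertConfig", "AlbertOnnxConfig"],
}
```

With the new shape, an object whose backends are missing still imports cleanly: `__getattr__` (further down in this hunk) hands back a `DummyObject` placeholder, and the informative backend error is only raised when the object is instantiated.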
+ else: + self._modules = set(import_structure.keys()) + self._class_to_module = {} + for key, values in import_structure.items(): + for value in values: + self._class_to_module[value] = key + # Needed for autocompletion in an IDE + self.__all__ = list(import_structure.keys()) + list(chain(*import_structure.values())) + self.__file__ = module_file + self.__spec__ = module_spec + self.__path__ = [os.path.dirname(module_file)] + self._objects = {} if extra_objects is None else extra_objects + self._name = name + self._import_structure = import_structure # Needed for autocompletion in an IDE def __dir__(self): @@ -1646,6 +1707,19 @@ def __getattr__(self, name: str) -> Any: return self._objects[name] if name in self._modules: value = self._get_module(name) + elif name in self._object_missing_backend.keys(): + missing_backends = self._object_missing_backend[name] + + class Placeholder(metaclass=DummyObject): + _backends = missing_backends + + def __init__(self, *args, **kwargs): + requires_backends(self, missing_backends) + + Placeholder.__name__ = name + Placeholder.__module__ = self._name + + value = Placeholder elif name in self._class_to_module.keys(): module = self._get_module(self._class_to_module[name]) value = getattr(module, name) @@ -1689,3 +1763,385 @@ def direct_transformers_import(path: str, file="__init__.py") -> ModuleType: spec.loader.exec_module(module) module = sys.modules[name] return module + + +def export(*, backends=()): + """ + This decorator enables two things: + - Attaching a `__backends` tuple to an object, indicating which backends are necessary for it + to execute correctly without instantiating it + - The literal '@export' marker is scanned for in source files to decide which objects to dynamically import + """ + if not isinstance(backends, tuple): + raise ValueError("Backends should be a tuple.") + + for backend in backends: + if backend not in BACKENDS_MAPPING: + raise ValueError(f"Backend should be defined in the BACKENDS_MAPPING. Offending backend: {backend}") + + def inner_fn(fun): + fun.__backends = backends + return fun + + return inner_fn + + +BASE_FILE_REQUIREMENTS = { + lambda e: "modeling_tf_" in e: ("tf",), + lambda e: "modeling_flax_" in e: ("flax",), + lambda e: "modeling_" in e: ("torch",), + lambda e: e.startswith("tokenization_") and e.endswith("_fast"): ("tokenizers",), +} + + +def fetch__all__(file_content): + """ + Returns the content of the __all__ variable in the file content. + Returns an empty list if not defined, otherwise returns a list of strings. + """ + + if "__all__" not in file_content: + return [] + + lines = file_content.splitlines() + for index, line in enumerate(lines): + if line.startswith("__all__"): + start_index = index + + lines = lines[start_index:] + + if not lines[0].startswith("__all__"): + raise ValueError( + "fetch__all__ accepts a list of lines, with the first line being the __all__ variable declaration" + ) + + # __all__ is defined on a single line + if lines[0].endswith("]"): + return [obj.strip("\"' ") for obj in lines[0].split("=")[1].strip(" []").split(",")] + + # __all__ is defined on multiple lines + else: + _all = [] + for __all__line_index in range(1, len(lines)): + if lines[__all__line_index].strip() == "]": + return _all + else: + _all.append(lines[__all__line_index].strip("\"', ")) + + return _all + + +@lru_cache() +def create_import_structure_from_path(module_path): + """ + This method takes the path to a file/a folder and returns the import structure. + If a file is given, it will return the import structure of the parent folder.
+ + Import structures are designed to be digestible by `_LazyModule` objects. They are + created from the __all__ definitions in each file as well as the `@export` decorators + above methods and objects. + + The import structure allows explicit display of the required backends for a given object. + These backends are specified in two ways: + + 1. Through their `@export`, if they are exported with that decorator. This `@export` decorator + accepts a `backends` tuple kwarg mentioning which backends are required to run this object. + + 2. If an object is defined in a file with "default" backends, it will have, at a minimum, those + backends specified. The default backends are defined according to the filename: + + - If a file is named like `modeling_*.py`, it will have a `torch` backend + - If a file is named like `modeling_tf_*.py`, it will have a `tf` backend + - If a file is named like `modeling_flax_*.py`, it will have a `flax` backend + - If a file is named like `tokenization_*_fast.py`, it will have a `tokenizers` backend + + Backends serve the purpose of displaying a clear error message to the user in case the backends are not installed. + Should an object be imported without its required backends being in the environment, any attempt to use the + object will raise an error mentioning which backend(s) should be added to the environment in order to use + that object. + + Here's an example of the import structure this method produces at the src.transformers.models level: + + { + 'albert': { + frozenset(): { + 'configuration_albert': {'AlbertConfig', 'AlbertOnnxConfig'} + }, + frozenset({'tokenizers'}): { + 'tokenization_albert_fast': {'AlbertTokenizerFast'} + }, + }, + 'align': { + frozenset(): { + 'configuration_align': {'AlignConfig', 'AlignTextConfig', 'AlignVisionConfig'}, + 'processing_align': {'AlignProcessor'} + }, + }, + 'altclip': { + frozenset(): { + 'configuration_altclip': {'AltCLIPConfig', 'AltCLIPTextConfig', 'AltCLIPVisionConfig'}, + 'processing_altclip': {'AltCLIPProcessor'}, + } + } + } + """ + import_structure = {} + if os.path.isdir(module_path): + directory = module_path + adjacent_modules = [] + + for f in os.listdir(module_path): + if f != "__pycache__" and os.path.isdir(os.path.join(module_path, f)): + import_structure[f] = create_import_structure_from_path(os.path.join(module_path, f)) + + elif not os.path.isdir(os.path.join(directory, f)): + adjacent_modules.append(f) + + else: + directory = os.path.dirname(module_path) + adjacent_modules = [f for f in os.listdir(directory) if not os.path.isdir(os.path.join(directory, f))] + + # We're only taking a look at files different from __init__.py + # We could theoretically export things directly from the __init__.py + # files, but this is not supported at this time. + if "__init__.py" in adjacent_modules: + adjacent_modules.remove("__init__.py") + + module_requirements = {} + for module_name in adjacent_modules: + # Only modules ending in `.py` are accepted here. + if not module_name.endswith(".py"): + continue + + with open(os.path.join(directory, module_name)) as f: + file_content = f.read() + + # Remove the .py suffix + module_name = module_name[:-3] + + previous_line = "" + previous_index = 0 + + # Some files have some requirements by default. + # For example, any file named `modeling_tf_xxx.py` + # should have TensorFlow as a required backend.
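The parsing loop that follows (continued below) recognizes the different spellings of `@export` by looking at the lines around the decorator. For reference, the forms it must handle, mirrored by the new `import_structure_raw_register.py` fixture at the end of this diff:

```python
from transformers.utils.import_utils import export


# Backends declared inline:
@export(backends=("torch", "tf"))
class A1:
    def __init__(self):
        pass


# Backends spread over several lines:
@export(
    backends=(
        "torch",
        "tf",
    )
)
def a3():
    pass


# No required backends:
@export()
def a0():
    pass
```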
+ base_requirements = () + for string_check, requirements in BASE_FILE_REQUIREMENTS.items(): + if string_check(module_name): + base_requirements = requirements + break + + # Objects that have a `@export` assigned to them will get exported + # with the backends specified in the decorator as well as the file backends. + exported_objects = set() + if "@export" in file_content: + lines = file_content.split("\n") + for index, line in enumerate(lines): + # This allows exporting items with other decorators. We'll take a look + # at the line that follows at the same indentation level. + if line.startswith((" ", "\t", "@", ")")) and not line.startswith("@export"): + continue + + # Skipping line enables putting whatever we want between the + # export() call and the actual class/method definition. + # This is what enables having # Copied from statements, docs, etc. + skip_line = False + + if "@export" in previous_line: + skip_line = False + + # Backends are defined on the same line as export + if "backends" in previous_line: + backends_string = previous_line.split("backends=")[1].split("(")[1].split(")")[0] + backends = tuple(sorted([b.strip("'\",") for b in backends_string.split(", ") if b])) + + # Backends are defined in the lines following export, for example such as: + # @export( + # backends=( + # "sentencepiece", + # "torch", + # "tf", + # ) + # ) + # + # or + # + # @export( + # backends=( + # "sentencepiece", "tf" + # ) + # ) + elif "backends" in lines[previous_index + 1]: + backends = [] + for backend_line in lines[previous_index:index]: + if "backends" in backend_line: + backend_line = backend_line.split("=")[1] + if '"' in backend_line or "'" in backend_line: + if ", " in backend_line: + backends.extend(backend.strip("()\"', ") for backend in backend_line.split(", ")) + else: + backends.append(backend_line.strip("()\"', ")) + + # If the line is only a ')', then we reached the end of the backends and we break. + if backend_line.strip() == ")": + break + backends = tuple(backends) + + # No backends are registered for export + else: + backends = () + + backends = frozenset(backends + base_requirements) + if backends not in module_requirements: + module_requirements[backends] = {} + if module_name not in module_requirements[backends]: + module_requirements[backends][module_name] = set() + + if not line.startswith("class") and not line.startswith("def"): + skip_line = True + else: + start_index = 6 if line.startswith("class") else 4 + object_name = line[start_index:].split("(")[0].strip(":") + module_requirements[backends][module_name].add(object_name) + exported_objects.add(object_name) + + if not skip_line: + previous_line = line + previous_index = index + + # All objects that are in __all__ should be exported by default. + # These objects are exported with the file backends. 
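The `__all__` handling that follows leans on `fetch__all__` defined earlier; a quick sketch of its behavior on a multi-line declaration (assuming this diff is applied):

```python
from transformers.utils.import_utils import fetch__all__

content = '''
TFAlbertModel = "stub"
__all__ = [
    "TFAlbertModel",
]
'''

fetch__all__(content)  # -> ["TFAlbertModel"]
```

Names recovered this way are registered under the file's base requirements, so everything in `modeling_tf_albert.py`'s `__all__` ends up keyed by `frozenset({"tf"})`.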
+ if "__all__" in file_content: + for _all_object in fetch__all__(file_content): + if _all_object not in exported_objects: + backends = frozenset(base_requirements) + if backends not in module_requirements: + module_requirements[backends] = {} + if module_name not in module_requirements[backends]: + module_requirements[backends][module_name] = set() + + module_requirements[backends][module_name].add(_all_object) + + import_structure = {**module_requirements, **import_structure} + return import_structure + + +def spread_import_structure(nested_import_structure): + """ + This method takes as input an unordered import structure and brings the required backends to the top level, + aggregating modules and objects under their required backends. + + Here's an example of an input import structure at the src.transformers.models level: + + { + 'albert': { + frozenset(): { + 'configuration_albert': {'AlbertConfig', 'AlbertOnnxConfig'} + }, + frozenset({'tokenizers'}): { + 'tokenization_albert_fast': {'AlbertTokenizerFast'} + }, + }, + 'align': { + frozenset(): { + 'configuration_align': {'AlignConfig', 'AlignTextConfig', 'AlignVisionConfig'}, + 'processing_align': {'AlignProcessor'} + }, + }, + 'altclip': { + frozenset(): { + 'configuration_altclip': {'AltCLIPConfig', 'AltCLIPTextConfig', 'AltCLIPVisionConfig'}, + 'processing_altclip': {'AltCLIPProcessor'}, + } + } + } + + Here's an example of an output import structure at the src.transformers.models level: + + { + frozenset({'tokenizers'}): { + 'albert.tokenization_albert_fast': {'AlbertTokenizerFast'} + }, + frozenset(): { + 'albert.configuration_albert': {'AlbertConfig', 'AlbertOnnxConfig'}, + 'align.processing_align': {'AlignProcessor'}, + 'align.configuration_align': {'AlignConfig', 'AlignTextConfig', 'AlignVisionConfig'}, + 'altclip.configuration_altclip': {'AltCLIPConfig', 'AltCLIPTextConfig', 'AltCLIPVisionConfig'}, + 'altclip.processing_altclip': {'AltCLIPProcessor'} + } + } + + """ + + def propagate_frozenset(unordered_import_structure): + tuple_first_import_structure = {} + for _key, _value in unordered_import_structure.items(): + if not isinstance(_value, dict): + tuple_first_import_structure[_key] = _value + + elif any(isinstance(v, frozenset) for v in _value.keys()): + # Here we want to switch around key and v + for k, v in _value.items(): + if isinstance(k, frozenset): + if k not in tuple_first_import_structure: + tuple_first_import_structure[k] = {} + tuple_first_import_structure[k][_key] = v + + else: + tuple_first_import_structure[_key] = propagate_frozenset(_value) + + return tuple_first_import_structure + + def flatten_dict(_dict, previous_key=None): + items = [] + for _key, _value in _dict.items(): + _key = f"{previous_key}.{_key}" if previous_key is not None else _key + if isinstance(_value, dict): + items.extend(flatten_dict(_value, _key).items()) + else: + items.append((_key, _value)) + return dict(items) + + # The frozenset keys contain the necessary backends. We want these first, so we propagate them up the + # import structure. + ordered_import_structure = nested_import_structure + + # 6 is a number that gives us sufficient depth to go through all files and foreseeable folder depths + # while not taking too long to parse. + for i in range(6): + ordered_import_structure = propagate_frozenset(ordered_import_structure) + + # We then flatten the dict so that it references a module path.
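A commented trace of the two helpers above on a one-model structure, matching the docstring examples:

```python
nested = {
    "albert": {
        frozenset({"tokenizers"}): {"tokenization_albert_fast": {"AlbertTokenizerFast"}},
    },
}

# propagate_frozenset hoists the backend keys to the top level:
# {frozenset({"tokenizers"}): {"albert": {"tokenization_albert_fast": {"AlbertTokenizerFast"}}}}

# flatten_dict then joins nested module keys into dotted paths:
# {frozenset({"tokenizers"}): {"albert.tokenization_albert_fast": {"AlbertTokenizerFast"}}}
```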
+    flattened_import_structure = {}
+    for key, value in ordered_import_structure.copy().items():
+        if isinstance(key, str):
+            del ordered_import_structure[key]
+        else:
+            flattened_import_structure[key] = flatten_dict(value)
+
+    return flattened_import_structure
+
+
+def define_import_structure(module_path: str) -> IMPORT_STRUCTURE_T:
+    """
+    This method takes a module_path as input and creates an import structure digestible by a _LazyModule.
+
+    Here's an example of an output import structure at the src.transformers.models level:
+
+    {
+        frozenset({'tokenizers'}): {
+            'albert.tokenization_albert_fast': {'AlbertTokenizerFast'}
+        },
+        frozenset(): {
+            'albert.configuration_albert': {'AlbertConfig', 'AlbertOnnxConfig'},
+            'align.processing_align': {'AlignProcessor'},
+            'align.configuration_align': {'AlignConfig', 'AlignTextConfig', 'AlignVisionConfig'},
+            'altclip.configuration_altclip': {'AltCLIPConfig', 'AltCLIPTextConfig', 'AltCLIPVisionConfig'},
+            'altclip.processing_altclip': {'AltCLIPProcessor'}
+        }
+    }
+
+    The import structure is a dict with frozensets of backend requirements as keys, each mapping
+    module paths (as strings) to sets of object names.
+    """
+    import_structure = create_import_structure_from_path(module_path)
+    return spread_import_structure(import_structure)
diff --git a/tests/models/longformer/test_modeling_longformer.py b/tests/models/longformer/test_modeling_longformer.py
index ef133142573de5..e7f2f67cc23236 100644
--- a/tests/models/longformer/test_modeling_longformer.py
+++ b/tests/models/longformer/test_modeling_longformer.py
@@ -34,8 +34,8 @@
         LongformerForSequenceClassification,
         LongformerForTokenClassification,
         LongformerModel,
-        LongformerSelfAttention,
     )
+    from transformers.models.longformer.modeling_longformer import LongformerSelfAttention
 
 
 class LongformerModelTester:
diff --git a/tests/models/longformer/test_modeling_tf_longformer.py b/tests/models/longformer/test_modeling_tf_longformer.py
index 0eda06522681c9..131c077653458f 100644
--- a/tests/models/longformer/test_modeling_tf_longformer.py
+++ b/tests/models/longformer/test_modeling_tf_longformer.py
@@ -37,8 +37,8 @@
         TFLongformerForSequenceClassification,
         TFLongformerForTokenClassification,
         TFLongformerModel,
-        TFLongformerSelfAttention,
     )
+    from transformers.models.longformer.modeling_tf_longformer import TFLongformerSelfAttention
     from transformers.tf_utils import shape_list
 
diff --git a/tests/models/reformer/test_modeling_reformer.py b/tests/models/reformer/test_modeling_reformer.py
index 152c4f2ba33f82..ba0a9232847a74 100644
--- a/tests/models/reformer/test_modeling_reformer.py
+++ b/tests/models/reformer/test_modeling_reformer.py
@@ -40,11 +40,11 @@
         ReformerForMaskedLM,
         ReformerForQuestionAnswering,
         ReformerForSequenceClassification,
-        ReformerLayer,
         ReformerModel,
         ReformerModelWithLMHead,
         ReformerTokenizer,
     )
+    from transformers.models.reformer.modeling_reformer import ReformerLayer
 
 
 class ReformerModelTester:
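All three test hunks above follow the same pattern: internal building blocks such as LongformerSelfAttention, TFLongformerSelfAttention and ReformerLayer are no longer re-exported from the top-level transformers namespace, so the tests now import them from their defining modules. A minimal before/after sketch for downstream code, using only the paths shown in the hunks:

    # Before: relied on the top-level re-export, which this diff removes
    from transformers import LongformerSelfAttention

    # After: import directly from the defining module
    from transformers.models.longformer.modeling_longformer import LongformerSelfAttention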
diff --git a/tests/utils/import_structures/failing_export.py b/tests/utils/import_structures/failing_export.py
new file mode 100644
index 00000000000000..d635619b60758a
--- /dev/null
+++ b/tests/utils/import_structures/failing_export.py
@@ -0,0 +1,23 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# fmt: off
+
+from transformers.utils.import_utils import export
+
+
+@export(backends=("random_item_that_should_not_exist",))
+class A0:
+    def __init__(self):
+        pass
diff --git a/tests/utils/import_structures/import_structure_raw_register.py b/tests/utils/import_structures/import_structure_raw_register.py
new file mode 100644
index 00000000000000..47f2ba84f1ef0d
--- /dev/null
+++ b/tests/utils/import_structures/import_structure_raw_register.py
@@ -0,0 +1,80 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# fmt: off
+
+from transformers.utils.import_utils import export
+
+
+@export()
+class A0:
+    def __init__(self):
+        pass
+
+
+@export()
+def a0():
+    pass
+
+
+@export(backends=("torch", "tf"))
+class A1:
+    def __init__(self):
+        pass
+
+
+@export(backends=("torch", "tf"))
+def a1():
+    pass
+
+
+@export(
+    backends=("torch", "tf")
+)
+class A2:
+    def __init__(self):
+        pass
+
+
+@export(
+    backends=("torch", "tf")
+)
+def a2():
+    pass
+
+
+@export(
+    backends=(
+        "torch",
+        "tf"
+    )
+)
+class A3:
+    def __init__(self):
+        pass
+
+
+@export(
+    backends=(
+        "torch",
+        "tf"
+    )
+)
+def a3():
+    pass
+
+
+@export(backends=())
+class A4:
+    def __init__(self):
+        pass
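The fixture above covers every decorator layout the line-based parser in create_import_structure_from_path has to handle: bare export(), single-line backends, multi-line backends, and an explicitly empty backends tuple. A sketch of the structure it should produce (the relative path assumes the fixture directory from this diff and a matching working directory; expected values mirror test_definition further down):

    from transformers.utils.import_utils import define_import_structure

    structure = define_import_structure("tests/utils/import_structures")
    # A0, a0 and A4 carry no backend requirement; A1 through a3 require torch and tf.
    assert {"A0", "a0", "A4"} <= structure[frozenset()]["import_structure_raw_register"]
    assert {"A1", "a1", "A2", "a2", "A3", "a3"} <= structure[frozenset(("tf", "torch"))]["import_structure_raw_register"]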
diff --git a/tests/utils/import_structures/import_structure_register_with_comments.py b/tests/utils/import_structures/import_structure_register_with_comments.py
new file mode 100644
index 00000000000000..18dfd40193c1ff
--- /dev/null
+++ b/tests/utils/import_structures/import_structure_register_with_comments.py
@@ -0,0 +1,79 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# fmt: off
+
+from transformers.utils.import_utils import export
+
+
+@export()
+# That's a statement
+class B0:
+    def __init__(self):
+        pass
+
+
+@export()
+# That's a statement
+def b0():
+    pass
+
+
+@export(backends=("torch", "tf"))
+# That's a statement
+class B1:
+    def __init__(self):
+        pass
+
+
+@export(backends=("torch", "tf"))
+# That's a statement
+def b1():
+    pass
+
+
+@export(backends=("torch", "tf"))
+# That's a statement
+class B2:
+    def __init__(self):
+        pass
+
+
+@export(backends=("torch", "tf"))
+# That's a statement
+def b2():
+    pass
+
+
+@export(
+    backends=(
+        "torch",
+        "tf"
+    )
+)
+# That's a statement
+class B3:
+    def __init__(self):
+        pass
+
+
+@export(
+    backends=(
+        "torch",
+        "tf"
+    )
+)
+# That's a statement
+def b3():
+    pass
diff --git a/tests/utils/import_structures/import_structure_register_with_duplicates.py b/tests/utils/import_structures/import_structure_register_with_duplicates.py
new file mode 100644
index 00000000000000..01842c71a1ff49
--- /dev/null
+++ b/tests/utils/import_structures/import_structure_register_with_duplicates.py
@@ -0,0 +1,77 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# fmt: off
+
+from transformers.utils.import_utils import export
+
+
+@export(backends=("torch", "torch"))
+class C0:
+    def __init__(self):
+        pass
+
+
+@export(backends=("torch", "torch"))
+def c0():
+    pass
+
+
+@export(backends=("torch", "torch"))
+# That's a statement
+class C1:
+    def __init__(self):
+        pass
+
+
+@export(backends=("torch", "torch"))
+# That's a statement
+def c1():
+    pass
+
+
+@export(backends=("torch", "torch"))
+# That's a statement
+class C2:
+    def __init__(self):
+        pass
+
+
+@export(backends=("torch", "torch"))
+# That's a statement
+def c2():
+    pass
+
+
+@export(
+    backends=(
+        "torch",
+        "torch"
+    )
+)
+# That's a statement
+class C3:
+    def __init__(self):
+        pass
+
+
+@export(
+    backends=(
+        "torch",
+        "torch"
+    )
+)
+# That's a statement
+def c3():
+    pass
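The duplicates fixture works because backend requirements are ultimately stored as a frozenset, so repeating a backend in the decorator is harmless:

    # Duplicate entries collapse once the backend tuple is frozen, which is why
    # test_definition expects all C*/c* objects under frozenset(("torch",)).
    assert frozenset(("torch", "torch")) == frozenset(("torch",))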
+ """ + lines = file_content.split("\n") + for line_index in range(len(lines)): + line = lines[line_index] + if line.startswith("__all__ = "): + # __all__ is defined on a single line + if line.endswith("]"): + return [obj.strip("\"' ") for obj in line.split("=")[1].strip(" []").split(",")] + + # __all__ is defined on multiple lines + else: + _all = [] + for __all__line_index in range(line_index + 1, len(lines)): + if lines[__all__line_index].strip() == "]": + return _all + else: + _all.append(lines[__all__line_index].strip("\"', ")) + + +class TestImportStructures(unittest.TestCase): + base_transformers_path = Path(__file__).parent.parent.parent + models_path = base_transformers_path / "src" / "transformers" / "models" + models_import_structure = spread_import_structure(define_import_structure(models_path)) + + def test_definition(self): + import_structure = define_import_structure(import_structures) + import_structure_definition = { + frozenset(()): { + "import_structure_raw_register": {"A0", "a0", "A4"}, + "import_structure_register_with_comments": {"B0", "b0"}, + }, + frozenset(("tf", "torch")): { + "import_structure_raw_register": {"A1", "a1", "A2", "a2", "A3", "a3"}, + "import_structure_register_with_comments": {"B1", "b1", "B2", "b2", "B3", "b3"}, + }, + frozenset(("torch",)): { + "import_structure_register_with_duplicates": {"C0", "c0", "C1", "c1", "C2", "c2", "C3", "c3"}, + }, + } + + self.assertDictEqual(import_structure, import_structure_definition) + + def test_transformers_specific_model_import(self): + """ + This test ensures that there is equivalence between what is written down in __all__ and what is + written down with register(). + + It doesn't test the backends attributed to register(). + """ + for architecture in os.listdir(self.models_path): + if ( + os.path.isfile(self.models_path / architecture) + or architecture.startswith("_") + or architecture == "deprecated" + ): + continue + + with self.subTest(f"Testing arch {architecture}"): + import_structure = define_import_structure(self.models_path / architecture) + backend_agnostic_import_structure = {} + for requirement, module_object_mapping in import_structure.items(): + for module, objects in module_object_mapping.items(): + if module not in backend_agnostic_import_structure: + backend_agnostic_import_structure[module] = [] + + backend_agnostic_import_structure[module].extend(objects) + + for module, objects in backend_agnostic_import_structure.items(): + with open(self.models_path / architecture / f"{module}.py") as f: + content = f.read() + _all = fetch__all__(content) + + if _all is None: + raise ValueError(f"{module} doesn't have __all__ defined.") + + error_message = ( + f"self.models_path / architecture / f'{module}.py doesn't seem to be defined correctly:\n" + f"Defined in __all__: {sorted(_all)}\nDefined with register: {sorted(objects)}" + ) + self.assertListEqual(sorted(objects), sorted(_all), msg=error_message) + + def test_export_backend_should_be_defined(self): + with self.assertRaisesRegex(ValueError, "Backend should be defined in the BACKENDS_MAPPING"): + pass diff --git a/utils/custom_init_isort.py b/utils/custom_init_isort.py index 7adf804eaf1fb6..82bf07ce43a9c5 100644 --- a/utils/custom_init_isort.py +++ b/utils/custom_init_isort.py @@ -244,7 +244,7 @@ def sort_imports(file: str, check_only: bool = True): code = f.read() # If the file is not a custom init, there is nothing to do. 
- if "_import_structure" not in code: + if "_import_structure" not in code or "define_import_structure" in code: return # Blocks of indent level 0