Showing 14 changed files with 597 additions and 1 deletion.
@@ -0,0 +1,32 @@
ELECTRA
=======

The ELECTRA model was proposed in the paper `ELECTRA: Pre-training Text Encoders as Discriminators Rather Than
Generators <https://openreview.net/pdf?id=r1xMH1BtvB>`__. ELECTRA is a new pretraining approach which trains two
transformer models: the generator and the discriminator. The generator's role is to replace tokens in a sequence, and
it is therefore trained as a masked language model. The discriminator, which is the model we're interested in, tries to
identify which tokens were replaced by the generator in the sequence.

The abstract from the paper is the following:

*Masked language modeling (MLM) pretraining methods such as BERT corrupt the input by replacing some tokens with [MASK]
and then train a model to reconstruct the original tokens. While they produce good results when transferred to
downstream NLP tasks, they generally require large amounts of compute to be effective. As an alternative, we propose a
more sample-efficient pretraining task called replaced token detection. Instead of masking the input, our approach
corrupts it by replacing some tokens with plausible alternatives sampled from a small generator network. Then, instead
of training a model that predicts the original identities of the corrupted tokens, we train a discriminative model that
predicts whether each token in the corrupted input was replaced by a generator sample or not. Thorough experiments
demonstrate this new pretraining task is more efficient than MLM because the task is defined over all input tokens
rather than just the small subset that was masked out. As a result, the contextual representations learned by our
approach substantially outperform the ones learned by BERT given the same model size, data, and compute. The gains are
particularly strong for small models; for example, we train a model on one GPU for 4 days that outperforms GPT (trained
using 30x more compute) on the GLUE natural language understanding benchmark. Our approach also works well at scale,
where it performs comparably to RoBERTa and XLNet while using less than 1/4 of their compute and outperforms them when
using the same amount of compute.*

ElectraAdapterModel
~~~~~~~~~~~~~~~~~~~

.. autoclass:: adapters.ElectraAdapterModel
    :members:
    :inherited-members: ElectraPreTrainedModel
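The replaced-token-detection objective described in the abstract can be made concrete with a toy sketch: given an original token sequence and a generator-corrupted copy, the discriminator's per-token targets are 1 where a token was replaced and 0 where it was kept. This is illustrative only, not the actual ELECTRA training code; the "chef cooked/ate the meal" example follows the paper.

```python
# Toy sketch of the replaced-token-detection objective (illustrative only;
# not the actual ELECTRA implementation).

def rtd_labels(original_tokens, corrupted_tokens):
    """Per-token binary labels: 1 = replaced by the generator, 0 = original."""
    if len(original_tokens) != len(corrupted_tokens):
        raise ValueError("sequences must be the same length")
    return [int(o != c) for o, c in zip(original_tokens, corrupted_tokens)]


original = ["the", "chef", "cooked", "the", "meal"]
corrupted = ["the", "chef", "ate", "the", "meal"]  # generator sampled "ate"
print(rtd_labels(original, corrupted))  # → [0, 0, 1, 0, 0]
```

Because labels are defined for every input position rather than only the masked subset, the discriminator receives a training signal from all tokens, which is the sample-efficiency argument the abstract makes.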
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
@@ -135,6 +135,7 @@ def __init__(
             "xlm-roberta",
             "bert-generation",
             "llama",
+            "electra",
             "xmod",
         ],
     }
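The hunk above registers `"electra"` in a list of supported model types. A minimal sketch of how such a list typically gates a feature (names here are hypothetical, not the library's actual API):

```python
# Hypothetical sketch of gating a feature on a supported-model-type list.
# SUPPORTED_MODEL_TYPES mirrors the list in the diff above; check_supported
# is an illustrative helper, not part of the adapters library.
SUPPORTED_MODEL_TYPES = [
    "xlm-roberta",
    "bert-generation",
    "llama",
    "electra",  # newly added by this commit
    "xmod",
]


def check_supported(model_type: str) -> None:
    if model_type not in SUPPORTED_MODEL_TYPES:
        raise ValueError(f"Model type {model_type!r} is not supported")


check_supported("electra")  # passes once "electra" is in the list
```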
@@ -0,0 +1,39 @@
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.

# Copyright 2020 The Adapter-Hub Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import TYPE_CHECKING

from transformers.utils import _LazyModule


_import_structure = {
    "adapter_model": ["ElectraAdapterModel"],
}


if TYPE_CHECKING:
    from .adapter_model import ElectraAdapterModel

else:
    import sys

    sys.modules[__name__] = _LazyModule(
        __name__,
        globals()["__file__"],
        _import_structure,
    )
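The `__init__.py` above defers the import of `adapter_model` until an attribute is first accessed, via `transformers.utils._LazyModule`. A stdlib-only sketch of that pattern (simplified; the real `_LazyModule` supports extra objects, docstrings, and more):

```python
# Minimal sketch of the lazy-module pattern used above (stdlib only;
# simplified compared to transformers.utils._LazyModule).
import importlib
import types


class LazyModule(types.ModuleType):
    """Module that defers submodule imports until an attribute is accessed."""

    def __init__(self, name, import_structure):
        super().__init__(name)
        # Map each exported attribute to the module that defines it.
        self._attr_to_module = {
            attr: mod for mod, attrs in import_structure.items() for attr in attrs
        }

    def __getattr__(self, name):
        # Called only when normal attribute lookup fails.
        if name not in self._attr_to_module:
            raise AttributeError(f"module {self.__name__!r} has no attribute {name!r}")
        submodule = importlib.import_module(self._attr_to_module[name])
        value = getattr(submodule, name)
        setattr(self, name, value)  # cache so later lookups skip __getattr__
        return value


# Demo with a stdlib module standing in for "adapter_model":
lazy = LazyModule("demo", {"json": ["dumps"]})
print(lazy.dumps({"a": 1}))  # json is imported only here → '{"a": 1}'
```

Replacing `sys.modules[__name__]` with such an object keeps `import adapters.models.electra`-style imports cheap: the heavy model code is loaded only when `ElectraAdapterModel` is actually referenced.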