adapter-hub · hSterz · Sep 6, 2023 · Sep 4, 2023 · Sep 4, 2023 · Sep 6, 2023
diff --git a/docs/classes/models/electra.rst b/docs/classes/models/electra.rst
@@ -0,0 +1,32 @@
+ELECTRA
+=======
+
+The ELECTRA model was proposed in the paper `ELECTRA: Pre-training Text Encoders as Discriminators Rather Than
+Generators <https://openreview.net/pdf?id=r1xMH1BtvB>`__. ELECTRA is a new pretraining approach which trains two
+transformer models: the generator and the discriminator. The generator's role is to replace tokens in a sequence, and
+is therefore trained as a masked language model. The discriminator, which is the model we're interested in, tries to
+identify which tokens were replaced by the generator in the sequence.
+
+The abstract from the paper is the following:
+
+*Masked language modeling (MLM) pretraining methods such as BERT corrupt the input by replacing some tokens with [MASK]
+and then train a model to reconstruct the original tokens. While they produce good results when transferred to
+downstream NLP tasks, they generally require large amounts of compute to be effective. As an alternative, we propose a
+more sample-efficient pretraining task called replaced token detection. Instead of masking the input, our approach
+corrupts it by replacing some tokens with plausible alternatives sampled from a small generator network. Then, instead
+of training a model that predicts the original identities of the corrupted tokens, we train a discriminative model that
+predicts whether each token in the corrupted input was replaced by a generator sample or not. Thorough experiments
+demonstrate this new pretraining task is more efficient than MLM because the task is defined over all input tokens
+rather than just the small subset that was masked out. As a result, the contextual representations learned by our
+approach substantially outperform the ones learned by BERT given the same model size, data, and compute. The gains are
+particularly strong for small models; for example, we train a model on one GPU for 4 days that outperforms GPT (trained
+using 30x more compute) on the GLUE natural language understanding benchmark. Our approach also works well at scale,
+where it performs comparably to RoBERTa and XLNet while using less than 1/4 of their compute and outperforms them when
+using the same amount of compute.*
+
+ElectraAdapterModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: adapters.ElectraAdapterModel
+ :members:
+ :inherited-members: ElectraPreTrainedModel
diff --git a/docs/classes/models/llama.rst b/docs/classes/models/llama.rst
@@ -1,7 +1,7 @@
 LLaMA
 -----------------------------------------------------------------------------------------------------------------------
 
-The LLaMA model was proposed in `LLaMA: Open and Efficient Foundation Language Models<https://arxiv.org/abs/2302.13971>` by 
+The LLaMA model was proposed in `LLaMA: Open and Efficient Foundation Language Models <https://arxiv.org/abs/2302.13971>`__ by 
 Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, 
 Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. It is a collection of foundation language 
 models ranging from 7B to 65B parameters.

diff --git a/docs/index.rst b/docs/index.rst
@@ -66,6 +66,7 @@ Currently, we support the PyTorch versions of all models as listed on the `Model
  classes/models/deberta
  classes/models/deberta_v2
  classes/models/distilbert
+ classes/models/electra
  classes/models/encoderdecoder
  classes/models/gpt2
  classes/models/gptj

diff --git a/docs/model_overview.md b/docs/model_overview.md
@@ -21,6 +21,7 @@ The table below further shows which model architectures support which adaptation
 | [DeBERTa](classes/models/deberta.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
 | [DeBERTa-v2](classes/models/debertaV2.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
 | [DistilBERT](classes/models/distilbert.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+| [ELECTRA](classes/models/electra.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
 | [Encoder Decoder](classes/models/encoderdecoder.html) | (*) | (*) | (*) | (*) | (*) | (*) | |
 | [GPT-2](classes/models/gpt2.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
 | [GPT-J](classes/models/gptj.html) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |

diff --git a/src/adapters/__init__.py b/src/adapters/__init__.py
@@ -99,6 +99,7 @@
  "models.deberta": ["DebertaAdapterModel"],
  "models.deberta_v2": ["DebertaV2AdapterModel"],
  "models.distilbert": ["DistilBertAdapterModel"],
+ "models.electra": ["ElectraAdapterModel"],
  "models.gpt2": ["GPT2AdapterModel"],
  "models.gptj": ["GPTJAdapterModel"],
  "models.llama": ["LlamaAdapterModel"],
@@ -198,6 +199,7 @@
  from .models.deberta import DebertaAdapterModel
  from .models.deberta_v2 import DebertaV2AdapterModel
  from .models.distilbert import DistilBertAdapterModel
+ from .models.electra import ElectraAdapterModel
  from .models.gpt2 import GPT2AdapterModel
  from .models.gptj import GPTJAdapterModel
  from .models.llama import LlamaAdapterModel

diff --git a/src/adapters/composition.py b/src/adapters/composition.py
@@ -135,6 +135,7 @@ def __init__(
  "xlm-roberta",
  "bert-generation",
  "llama",
+ "electra",
  ],
 }
 

diff --git a/src/adapters/head_utils.py b/src/adapters/head_utils.py
@@ -543,6 +543,70 @@
  },
  "layers": ["lm_head"],
  },
+ "ElectraForTokenClassification": {
+ "config": {
+ "head_type": "tagging",
+ "layers": 1,
+ "activation_function": None,
+ },
+ "layers": [None, "classifier"],
+ },
+ "ElectraForSequenceClassification": {
+ "config": {
+ "head_type": "classification",
+ "layers": 2,
+ "activation_function": "gelu",
+ "bias": True,
+ },
+ "layers": [None, "classifier.dense", None, None, "classifier.out_proj"],
+ },
+ "ElectraForQuestionAnswering": {
+ "config": {
+ "head_type": "question_answering",
+ "layers": 1,
+ "activation_function": None,
+ },
+ "layers": [None, "qa_outputs"],
+ },
+ "ElectraForMultipleChoice": {
+ "config": {
+ "head_type": "multiple_choice",
+ "layers": 2,
+ "activation_function": "gelu",
+ "use_pooler": False,
+ },
+ "layers": [None, "sequence_summary.summary", None, None, "classifier"],
+ },
+ "ElectraForMaskedLM": {
+ "config": {
+ "head_type": "masked_lm",
+ "layers": 2,
+ "activation_function": "gelu",
+ "layer_norm": True,
+ "bias": True,
+ },
+ "layers": [
+ "generator_predictions.dense",
+ None,
+ "generator_predictions.LayerNorm",
+ "generator_lm_head",
+ ],
+ },
+ "ElectraForCausalLM": {
+ "config": {
+ "head_type": "causal_lm",
+ "layers": 2,
+ "activation_function": "gelu",
+ "layer_norm": True,
+ "bias": True,
+ },
+ "layers": [
+ "generator_predictions.dense",
+ None,
+ "generator_predictions.LayerNorm",
+ "generator_lm_head",
+ ],
+ },
 }
 
 

diff --git a/src/adapters/models/__init__.py b/src/adapters/models/__init__.py
@@ -42,6 +42,8 @@
  "CLIPModel": CLIPModelAdaptersMixin,
  "CLIPTextModelWithProjection": CLIPTextModelAdaptersMixin,
  "CLIPVisionModelWithProjection": CLIPVisionModelAdaptersMixin,
+ "ElectraLayer": BertLayerAdaptersMixin,
+ "ElectraModel": BertModelAdaptersMixin,
  "MBartEncoder": BartEncoderAdaptersMixin,
  "MBartDecoder": BartDecoderAdaptersMixin,
  "MBartDecoderWrapper": BartDecoderWrapperAdaptersMixin,

diff --git a/src/adapters/models/auto/adapter_model.py b/src/adapters/models/auto/adapter_model.py
@@ -18,6 +18,7 @@
  ("deberta", "DebertaAdapterModel"),
  ("deberta-v2", "DebertaV2AdapterModel"),
  ("distilbert", "DistilBertAdapterModel"),
+ ("electra", "ElectraAdapterModel"),
  ("gpt2", "GPT2AdapterModel"),
  ("gptj", "GPTJAdapterModel"),
  ("llama", "LlamaAdapterModel"),

diff --git a/src/adapters/models/electra/__init__.py b/src/adapters/models/electra/__init__.py
@@ -0,0 +1,39 @@
+# flake8: noqa
+# There's no way to ignore "F401 '...' imported but unused" warnings in this
+# module, but to preserve other warnings. So, don't check this module at all.
+
+# Copyright 2020 The Adapter-Hub Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from transformers.utils import _LazyModule
+
+
+_import_structure = {
+ "adapter_model": ["ElectraAdapterModel"],
+}
+
+
+if TYPE_CHECKING:
+ from .adapter_model import ElectraAdapterModel
+
+else:
+ import sys
+
+ sys.modules[__name__] = _LazyModule(
+ __name__,
+ globals()["__file__"],
+ _import_structure,
+ )