
Adding adapter support for NeoX #523

Open · wants to merge 13 commits into base: legacy
23 changes: 23 additions & 0 deletions src/transformers/__init__.py
@@ -2596,6 +2596,8 @@
"GPT2AdapterModel",
"GPT2ModelWithHeads",
"GPTJAdapterModel",
"GPTNeoXAdapterModel",
"GPTNeoXModelWithHeads",
"HoulsbyConfig",
"HoulsbyInvConfig",
"IA3Config",
@@ -2942,6 +2944,15 @@
"TFGPTJPreTrainedModel",
]
)
_import_structure["models.gpt_neox"].extend(
[
"TFGPTNeoXForCausalLM",
"TFGPTNeoXForQuestionAnswering",
"TFGPTNeoXForSequenceClassification",
"TFGPTNeoXModel",
"TFGPTNeoXPreTrainedModel",
]
)
Member: Is the addition of these imports related to the changes in this PR?

Author: No, I removed them.

Author (@ajesujoba, Apr 22, 2023): Regarding "all points in our contributing guide are addressed": the documentation marks this as optional. I tried it but ran into some dimension mismatches. Is it still optional?

Member: Could you clarify what exactly you're referring to from the contributing guide? The Parallel inference and static head conversion points are still optional (although highly recommended). If Parallel support is not implemented, please make sure to remove the test mixin classes starting with "Parallel..." from the model test class.

Author: Yes, I was referring to the Parallel inference and static head conversion points being optional (although highly recommended). As you recommended, I will remove the test mixin classes starting with "Parallel...". I will also remove some of the other test mixins, such as IA3TestMixin, LoRATestMixin, PrefixTuningTestMixin, and UniPELTTestMixin, since they require adding a classification head via add_classification_head, which the GPT-NeoX model in this version does not have.
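
For illustration, a hedged sketch of the trimmed test class discussed in this thread. Only the mixin names quoted above come from the conversation; the class name, base class, and import path are assumptions, not code from this PR.

import unittest

# Assumed location of the repo's shared adapter test base class.
from tests.test_adapter import AdapterTestBase


class GPTNeoXAdapterModelTest(  # hypothetical class name
    AdapterTestBase,
    # Removed because Parallel support is not implemented:
    #   ParallelAdapterInferenceTestMixin, ParallelTrainingMixin,
    # Removed because this GPT-NeoX version lacks add_classification_head:
    #   IA3TestMixin, LoRATestMixin, PrefixTuningTestMixin, UniPELTTestMixin,
    unittest.TestCase,
):
    pass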

_import_structure["models.groupvit"].extend(
[
"TF_GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -3456,6 +3467,8 @@
["FlaxGPTNeoForCausalLM", "FlaxGPTNeoModel", "FlaxGPTNeoPreTrainedModel"]
)
_import_structure["models.gptj"].extend(["FlaxGPTJForCausalLM", "FlaxGPTJModel", "FlaxGPTJPreTrainedModel"])
_import_structure["models.gpt_neox"].extend(["FlaxGPTNeoXForCausalLM", "FlaxGPTNeoXModel", "FlaxGPTNeoXPreTrainedModel"])

_import_structure["models.longt5"].extend(
["FlaxLongT5ForConditionalGeneration", "FlaxLongT5Model", "FlaxLongT5PreTrainedModel"]
)
@@ -5699,6 +5712,8 @@
ForwardContext,
GPT2AdapterModel,
GPT2ModelWithHeads,
GPTNeoXAdapterModel,
GPTNeoXModelWithHeads,
GPTJAdapterModel,
HoulsbyConfig,
HoulsbyInvConfig,
@@ -6010,6 +6025,13 @@
TFGPTJModel,
TFGPTJPreTrainedModel,
)
from .models.gpt_neox import (
TFGPTNeoXForCausalLM,
TFGPTNeoXForQuestionAnswering,
TFGPTNeoXForSequenceClassification,
TFGPTNeoXModel,
TFGPTNeoXPreTrainedModel,
)
from .models.groupvit import (
TF_GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST,
TFGroupViTModel,
@@ -6399,6 +6421,7 @@
from .models.encoder_decoder import FlaxEncoderDecoderModel
from .models.gpt2 import FlaxGPT2LMHeadModel, FlaxGPT2Model, FlaxGPT2PreTrainedModel
from .models.gpt_neo import FlaxGPTNeoForCausalLM, FlaxGPTNeoModel, FlaxGPTNeoPreTrainedModel
from .models.gpt_neox import FlaxGPTNeoXForCausalLM, FlaxGPTNeoXModel, FlaxGPTNeoXPreTrainedModel
from .models.gptj import FlaxGPTJForCausalLM, FlaxGPTJModel, FlaxGPTJPreTrainedModel
from .models.longt5 import FlaxLongT5ForConditionalGeneration, FlaxLongT5Model, FlaxLongT5PreTrainedModel
from .models.marian import FlaxMarianModel, FlaxMarianMTModel, FlaxMarianPreTrainedModel
5 changes: 5 additions & 0 deletions src/transformers/adapters/__init__.py
@@ -113,6 +113,10 @@
"GPT2AdapterModel",
"GPT2ModelWithHeads",
],
"models.gpt_neox": [
"GPTNeoXAdapterModel",
"GPTNeoXModelWithHeads",
],
"models.gptj": ["GPTJAdapterModel"],
"models.mbart": [
"MBartAdapterModel",
@@ -217,6 +221,7 @@
from .models.debertaV2 import DebertaV2AdapterModel
from .models.distilbert import DistilBertAdapterModel, DistilBertModelWithHeads
from .models.gpt2 import GPT2AdapterModel, GPT2ModelWithHeads
from .models.gpt_neox import GPTNeoXAdapterModel, GPTNeoXModelWithHeads
from .models.gptj import GPTJAdapterModel
from .models.mbart import MBartAdapterModel, MBartModelWithHeads
from .models.roberta import RobertaAdapterModel, RobertaModelWithHeads
7 changes: 7 additions & 0 deletions src/transformers/adapters/head_utils.py
@@ -381,6 +381,13 @@
},
"layers": [None, "classifier"],
},
    # GPT-NeoX
"GPTNeoXForCausalLM": {
"config": {
"head_type": "causal_lm",
},
"layers": ["embed_out"],
},
# GPT-J
"GPTJForSequenceClassification": {
"config": {
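
For context, the entry above drives static-to-flex head conversion: a checkpoint saved as GPTNeoXForCausalLM has its embed_out weights remapped into a "causal_lm" prediction head when loaded into a flex-head model. A minimal sketch of that behavior, assuming this branch is installed and that EleutherAI/pythia-70m, a public GPT-NeoX checkpoint, is compatible with it:

# Minimal sketch, not part of this diff; checkpoint compatibility is assumed.
from transformers.adapters import AutoAdapterModel

model = AutoAdapterModel.from_pretrained("EleutherAI/pythia-70m")
# The static embed_out layer should now appear as a "causal_lm" flex head,
# per the "layers": ["embed_out"] mapping registered above.
print(model.heads)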
32 changes: 32 additions & 0 deletions src/transformers/adapters/mixins/gpt_neox.py
@@ -0,0 +1,32 @@
from typing import Iterable, Tuple

import torch.nn as nn

from ..layer import AdapterLayer
from ..model_mixin import (
EmbeddingAdaptersMixin,
EmbeddingAdaptersWrapperMixin,
InvertibleAdaptersMixin,
ModelAdaptersMixin,
ModelWithHeadsAdaptersMixin,
)


class GPTNeoXDecoderBlockAdaptersMixin:
"""Adds adapters to the TransformerBlock module of DistilBert."""

def _init_adapter_modules(self):
self.attention_adapters = AdapterLayer("mh_adapter", self.config)
self.output_adapters = AdapterLayer("output_adapter", self.config)
self.attention_adapters._init_adapter_modules()
self.output_adapters._init_adapter_modules()


class GPTNeoXModelAdapterMixin(EmbeddingAdaptersMixin, InvertibleAdaptersMixin, ModelAdaptersMixin):
def iter_layers(self) -> Iterable[Tuple[int, nn.Module]]:
for i, layer in enumerate(self.base_model.layers):
yield i, layer


class GPTNeoXModelWithHeadsAdaptersMixin(EmbeddingAdaptersWrapperMixin, ModelWithHeadsAdaptersMixin):
pass
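
To show what these mixins provide once wired into the model classes, a hedged usage sketch follows. The checkpoint name and adapter config are illustrative, and add_causal_lm_head assumes the PR's adapter model registers the usual causal LM head as other decoder-only models do:

# Hedged usage sketch; names flagged below are assumptions, not code from this PR.
from transformers.adapters import GPTNeoXAdapterModel

model = GPTNeoXAdapterModel.from_pretrained("EleutherAI/pythia-70m")  # illustrative checkpoint
model.add_adapter("bottleneck", config="pfeiffer")  # inserts the AdapterLayer modules set up above
model.add_causal_lm_head("bottleneck")  # assumed flex-head API, mirroring e.g. GPT2AdapterModel
model.train_adapter("bottleneck")  # freezes base weights, trains only the adapter

# iter_layers() from GPTNeoXModelAdapterMixin enumerates the decoder blocks:
for i, layer in model.base_model.iter_layers():
    print(i, type(layer).__name__)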
2 changes: 2 additions & 0 deletions src/transformers/adapters/models/auto/adapter_model.py
@@ -20,6 +20,7 @@
("bart", "BartAdapterModel"),
("mbart", "MBartAdapterModel"),
("gpt2", "GPT2AdapterModel"),
("gpt_neox", "GPTNeoXAdapterModel"),
("gptj", "GPTJAdapterModel"),
("t5", "T5AdapterModel"),
("vit", "ViTAdapterModel"),
@@ -34,6 +35,7 @@
("bart", "BartModelWithHeads"),
("mbart", "MBartModelWithHeads"),
("gpt2", "GPT2ModelWithHeads"),
("gpt_neox", "GPTNeoXModelWithHeads"),
("t5", "T5ModelWithHeads"),
]
)
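
With the mappings above, AutoAdapterModel can dispatch on config.model_type == "gpt_neox". A minimal sketch, using a deliberately tiny configuration because the default GPTNeoXConfig is 20B-scale:

from transformers import GPTNeoXConfig
from transformers.adapters import AutoAdapterModel

# Tiny, illustrative sizes so the sketch instantiates quickly.
config = GPTNeoXConfig(
    vocab_size=1024,
    hidden_size=64,
    num_hidden_layers=2,
    num_attention_heads=4,
    intermediate_size=128,
)
model = AutoAdapterModel.from_config(config)
print(type(model).__name__)  # expected: "GPTNeoXAdapterModel"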
42 changes: 42 additions & 0 deletions src/transformers/adapters/models/gpt_neox/__init__.py
@@ -0,0 +1,42 @@
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.

# Copyright 2020 The Adapter-Hub Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import TYPE_CHECKING

from ....utils import _LazyModule


_import_structure = {
"adapter_model": [
"GPTNeoXAdapterModel",
"GPTNeoXModelWithHeads"
],
}


if TYPE_CHECKING:
from .adapter_model import GPTNeoXAdapterModel, GPTNeoXModelWithHeads

else:
import sys

sys.modules[__name__] = _LazyModule(
__name__,
globals()["__file__"],
_import_structure,
)
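
As a usage note, _LazyModule defers executing adapter_model.py until one of its attributes is first accessed; a minimal sketch of the observable behavior:

import transformers.adapters.models.gpt_neox as gpt_neox_pkg

# First attribute access triggers the real import of adapter_model.py.
cls = gpt_neox_pkg.GPTNeoXAdapterModel
print(cls.__name__)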