From c4c51942acd0a598b9d1820a0391f2458e912c71 Mon Sep 17 00:00:00 2001 From: Jesujoba Alabi Date: Wed, 15 Mar 2023 14:45:49 +0100 Subject: [PATCH 01/13] Adding adapter support for NeoX --- src/transformers/__init__.py | 23 ++ src/transformers/adapters/__init__.py | 5 + src/transformers/adapters/head_utils.py | 7 + src/transformers/adapters/mixins/gpt_neox.py | 32 +++ .../adapters/models/auto/adapter_model.py | 2 + .../adapters/models/gpt_neox/__init__.py | 42 ++++ .../adapters/models/gpt_neox/adapter_model.py | 219 ++++++++++++++++++ .../adapters/wrappers/configuration.py | 3 + src/transformers/models/auto/modeling_auto.py | 1 + .../models/auto/tokenization_auto.py | 2 +- .../models/gpt_neox/configuration_gpt_neox.py | 10 +- .../models/gpt_neox/modeling_gpt_neox.py | 49 +++- 12 files changed, 379 insertions(+), 16 deletions(-) create mode 100644 src/transformers/adapters/mixins/gpt_neox.py create mode 100644 src/transformers/adapters/models/gpt_neox/__init__.py create mode 100644 src/transformers/adapters/models/gpt_neox/adapter_model.py diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index a507a51e2..920c1cb38 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -2596,6 +2596,8 @@ "GPT2AdapterModel", "GPT2ModelWithHeads", "GPTJAdapterModel", + "GPTNeoXAdapterModel", + "GPTNeoXModelWithHeads", "HoulsbyConfig", "HoulsbyInvConfig", "IA3Config", @@ -2942,6 +2944,15 @@ "TFGPTJPreTrainedModel", ] ) + _import_structure["models.gpt_neox"].extend( + [ + "TFGPTNeoXForCausalLM", + "TFGPTNeoXForQuestionAnswering", + "TFGPTNeoXForSequenceClassification", + "TFGPTNeoXModel", + "TFGPTNeoXPreTrainedModel", + ] + ) _import_structure["models.groupvit"].extend( [ "TF_GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -3456,6 +3467,8 @@ ["FlaxGPTNeoForCausalLM", "FlaxGPTNeoModel", "FlaxGPTNeoPreTrainedModel"] ) _import_structure["models.gptj"].extend(["FlaxGPTJForCausalLM", "FlaxGPTJModel", "FlaxGPTJPreTrainedModel"]) + _import_structure["models.gpt_neox"].extend(["FlaxGPTNeoXForCausalLM", "FlaxGPTNeoXModel", "FlaxGPTNeoXPreTrainedModel"]) + _import_structure["models.longt5"].extend( ["FlaxLongT5ForConditionalGeneration", "FlaxLongT5Model", "FlaxLongT5PreTrainedModel"] ) @@ -5699,6 +5712,8 @@ ForwardContext, GPT2AdapterModel, GPT2ModelWithHeads, + GPTNeoXAdapterModel, + GPTNeoXModelWithHeads, GPTJAdapterModel, HoulsbyConfig, HoulsbyInvConfig, @@ -6010,6 +6025,13 @@ TFGPTJModel, TFGPTJPreTrainedModel, ) + from .models.gpt_neox import ( + TFGPTNeoXForCausalLM, + TFGPTNeoXForQuestionAnswering, + TFGPTNeoXForSequenceClassification, + TFGPTNeoXModel, + TFGPTNeoXPreTrainedModel, + ) from .models.groupvit import ( TF_GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST, TFGroupViTModel, @@ -6399,6 +6421,7 @@ from .models.encoder_decoder import FlaxEncoderDecoderModel from .models.gpt2 import FlaxGPT2LMHeadModel, FlaxGPT2Model, FlaxGPT2PreTrainedModel from .models.gpt_neo import FlaxGPTNeoForCausalLM, FlaxGPTNeoModel, FlaxGPTNeoPreTrainedModel + from .models.gpt_neox import FlaxGPTNeoXForCausalLM, FlaxGPTNeoXModel, FlaxGPTNeoXPreTrainedModel from .models.gptj import FlaxGPTJForCausalLM, FlaxGPTJModel, FlaxGPTJPreTrainedModel from .models.longt5 import FlaxLongT5ForConditionalGeneration, FlaxLongT5Model, FlaxLongT5PreTrainedModel from .models.marian import FlaxMarianModel, FlaxMarianMTModel, FlaxMarianPreTrainedModel diff --git a/src/transformers/adapters/__init__.py b/src/transformers/adapters/__init__.py index a07b8e846..19eb72c8e 100644 --- a/src/transformers/adapters/__init__.py +++ 
b/src/transformers/adapters/__init__.py @@ -113,6 +113,10 @@ "GPT2AdapterModel", "GPT2ModelWithHeads", ], + "models.gpt_neox": [ + "GPTNeoXAdapterModel", + "GPTNeoXModelWithHeads", + ], "models.gptj": ["GPTJAdapterModel"], "models.mbart": [ "MBartAdapterModel", @@ -217,6 +221,7 @@ from .models.debertaV2 import DebertaV2AdapterModel from .models.distilbert import DistilBertAdapterModel, DistilBertModelWithHeads from .models.gpt2 import GPT2AdapterModel, GPT2ModelWithHeads + from .models.gpt_neox import GPTNeoXAdapterModel, GPTNeoXModelWithHeads from .models.gptj import GPTJAdapterModel from .models.mbart import MBartAdapterModel, MBartModelWithHeads from .models.roberta import RobertaAdapterModel, RobertaModelWithHeads diff --git a/src/transformers/adapters/head_utils.py b/src/transformers/adapters/head_utils.py index 05f3a5456..8ea4c2579 100644 --- a/src/transformers/adapters/head_utils.py +++ b/src/transformers/adapters/head_utils.py @@ -381,6 +381,13 @@ }, "layers": [None, "classifier"], }, + #GPT-NeoX + "GPTNeoXForCausalLM": { + "config": { + "head_type": "causal_lm", + }, + "layers": ["embed_out"], + }, # GPT-J "GPTJForSequenceClassification": { "config": { diff --git a/src/transformers/adapters/mixins/gpt_neox.py b/src/transformers/adapters/mixins/gpt_neox.py new file mode 100644 index 000000000..ff520ee9a --- /dev/null +++ b/src/transformers/adapters/mixins/gpt_neox.py @@ -0,0 +1,32 @@ +from typing import Iterable, Tuple + +import torch.nn as nn + +from ..layer import AdapterLayer +from ..model_mixin import ( + EmbeddingAdaptersMixin, + EmbeddingAdaptersWrapperMixin, + InvertibleAdaptersMixin, + ModelAdaptersMixin, + ModelWithHeadsAdaptersMixin, +) + + +class GPTNeoXDecoderBlockAdaptersMixin: + """Adds adapters to the TransformerBlock module of DistilBert.""" + + def _init_adapter_modules(self): + self.attention_adapters = AdapterLayer("mh_adapter", self.config) + self.output_adapters = AdapterLayer("output_adapter", self.config) + self.attention_adapters._init_adapter_modules() + self.output_adapters._init_adapter_modules() + + +class GPTNeoXModelAdapterMixin(EmbeddingAdaptersMixin, InvertibleAdaptersMixin, ModelAdaptersMixin): + def iter_layers(self) -> Iterable[Tuple[int, nn.Module]]: + for i, layer in enumerate(self.base_model.layers): + yield i, layer + + +class GPTNeoXModelWithHeadsAdaptersMixin(EmbeddingAdaptersWrapperMixin, ModelWithHeadsAdaptersMixin): + pass diff --git a/src/transformers/adapters/models/auto/adapter_model.py b/src/transformers/adapters/models/auto/adapter_model.py index cfd159bad..274cdf4b0 100644 --- a/src/transformers/adapters/models/auto/adapter_model.py +++ b/src/transformers/adapters/models/auto/adapter_model.py @@ -20,6 +20,7 @@ ("bart", "BartAdapterModel"), ("mbart", "MBartAdapterModel"), ("gpt2", "GPT2AdapterModel"), + ("gpt_neox", "GPTNeoXAdapterModel"), ("gptj", "GPTJAdapterModel"), ("t5", "T5AdapterModel"), ("vit", "ViTAdapterModel"), @@ -34,6 +35,7 @@ ("bart", "BartModelWithHeads"), ("mbart", "MBartModelWithHeads"), ("gpt2", "GPT2ModelWithHeads"), + ("gpt_neox", "GPTNeoXModelWithHeads"), ("t5", "T5ModelWithHeads"), ] ) diff --git a/src/transformers/adapters/models/gpt_neox/__init__.py b/src/transformers/adapters/models/gpt_neox/__init__.py new file mode 100644 index 000000000..72aae7460 --- /dev/null +++ b/src/transformers/adapters/models/gpt_neox/__init__.py @@ -0,0 +1,42 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. 
+ +# Copyright 2020 The Adapter-Hub Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING + +from ....utils import _LazyModule + + +_import_structure = { + "adapter_model": [ + "GPTNeoXAdapterModel", + "GPTNeoXModelWithHeads" + ], +} + + +if TYPE_CHECKING: + from .adapter_model import GPTNeoXAdapterModel, GPTNeoXModelWithHeads + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + ) diff --git a/src/transformers/adapters/models/gpt_neox/adapter_model.py b/src/transformers/adapters/models/gpt_neox/adapter_model.py new file mode 100644 index 000000000..7ac4c6291 --- /dev/null +++ b/src/transformers/adapters/models/gpt_neox/adapter_model.py @@ -0,0 +1,219 @@ +import logging + +import torch + +from ....models.gpt_neox.modeling_gpt_neox import GPT_NEOX_START_DOCSTRING, GPTNeoXModel, GPTNeoXPreTrainedModel +from ....utils import add_start_docstrings +from ...composition import adjust_tensors_for_parallel +from ...heads import ( + CausalLMHead, + ClassificationHead, + ModelWithFlexibleHeadsAdaptersMixin, + MultiLabelClassificationHead, + QuestionAnsweringHead, + TaggingHead, +) +from ...model_mixin import EmbeddingAdaptersWrapperMixin + + +logger = logging.getLogger(__name__) + + +@add_start_docstrings( + """ +The GPTNeoX Model that allows the loading of different heads for different tasks. This enables a flexible use of the +models and adapters. Since this class does classification on the last token, it requires to know the position of the +last token. If a :obj:`pad_token_id` is defined in the configuration, it finds the last token that is not a padding +token in each row. If no :obj:`pad_token_id` is defined, it simply takes the last value in each row of the batch. Since +it cannot guess the padding tokens when :obj:`inputs_embeds` are passed instead of :obj:`input_ids`, it does the same +(take the last value in each row of the batch). 
+""", + GPT_NEOX_START_DOCSTRING, +) +class GPTNeoXAdapterModel(EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, GPTNeoXPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.gpt_neox = GPTNeoXModel(config) + + self._init_head_modules() + + self.init_weights() + + # Model parallel + self.model_parallel = False + self.device_map = None + + def forward( + self, + input_ids=None, + past_key_values=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + head=None, + output_adapter_gating_scores=False, + output_adapter_fusion_attentions=False, + **kwargs + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.gpt_neox( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + output_adapter_gating_scores=output_adapter_gating_scores, + output_adapter_fusion_attentions=output_adapter_fusion_attentions, + adapter_input_parallelized=kwargs.pop("adapter_input_parallelized", False), + ) + + batch_size = outputs[0].shape[0] + + if self.config.pad_token_id is None: + # TODO-AH: this may result in unexpected behavior for classification. Find a better way to do this? + sequence_lengths = -1 + else: + if input_ids is not None: + sequence_lengths = torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1 + (sequence_lengths,) = adjust_tensors_for_parallel(outputs[0], sequence_lengths) + else: + sequence_lengths = -1 + logger.warning( + f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be " + "unexpected if using padding tokens in conjunction with `inputs_embeds.`" + ) + + cls_logits = outputs[0][range(batch_size), sequence_lengths] + + outputs = self.forward_head( + outputs, + head_name=head, + cls_output=cls_logits, + attention_mask=attention_mask, + return_dict=return_dict, + **kwargs, + ) + + return outputs + + # Copied from GPTNeoXForCausalLM + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # cut decoder_input_ids if past is used + if past_key_values and past_key_values[0] is not None: + input_ids = input_ids[:, -1:] + + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + "past_key_values": past_key_values, + } + + + head_types = { + "classification": ClassificationHead, + "multilabel_classification": MultiLabelClassificationHead, + "causal_lm": CausalLMHead, + "question_answering": QuestionAnsweringHead, + "tagging": TaggingHead, + } + + def add_classification_head( + self, + head_name, + num_labels=2, + layers=2, + activation_function="tanh", + overwrite_ok=False, + multilabel=False, + id2label=None, + ): + """ + Adds a sequence classification head on top of the model. + + Args: + head_name (str): The name of the head. + num_labels (int, optional): Number of classification labels. Defaults to 2. + layers (int, optional): Number of layers. Defaults to 2. 
+ activation_function (str, optional): Activation function. Defaults to 'tanh'. + overwrite_ok (bool, optional): Force overwrite if a head with the same name exists. Defaults to False. + multilabel (bool, optional): Enable multilabel classification setup. Defaults to False. + """ + + if multilabel: + head = MultiLabelClassificationHead(self, head_name, num_labels, layers, activation_function, id2label) + else: + head = ClassificationHead(self, head_name, num_labels, layers, activation_function, id2label) + self.add_prediction_head(head, overwrite_ok) + + def add_causal_lm_head(self, head_name, overwrite_ok=False): + """ + Adds a causal language modeling head on top of the model. + + Args: + head_name (str): The name of the head. + overwrite_ok (bool, optional): Force overwrite if a head with the same name exists. Defaults to False. + """ + head = CausalLMHead(self, head_name) + self.add_prediction_head(head, overwrite_ok=overwrite_ok) + + def add_qa_head( + self, + head_name, + num_labels=2, + layers=1, + activation_function="tanh", + overwrite_ok=False, + id2label=None, + ): + head = QuestionAnsweringHead(self, head_name, num_labels, layers, activation_function, id2label) + self.add_prediction_head(head, overwrite_ok) + +class GPTNeoXModelWithHeads(GPTNeoXAdapterModel): + def __init__(self, *args, **kwargs): + warnings.warn( + "This class has been renamed to `{}` in v3. " + "Please use the new class instead as this class might be removed in a future version.".format( + self.__class__.__bases__[0].__name__ + ), + FutureWarning, + ) + super().__init__(*args, **kwargs) + + @classmethod + def from_config(cls, config): + warnings.warn( + "This class has been renamed to `{}` in v3. " + "Please use the new class instead as this class might be removed in a future version.".format( + cls.__bases__[0].__name__ + ), + FutureWarning, + ) + return super().from_config(config) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + warnings.warn( + "This class has been renamed to `{}` in v3. 
" + "Please use the new class instead as this class might be removed in a future version.".format( + cls.__bases__[0].__name__ + ), + FutureWarning, + ) + return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) diff --git a/src/transformers/adapters/wrappers/configuration.py b/src/transformers/adapters/wrappers/configuration.py index 56da1694b..d679feff3 100644 --- a/src/transformers/adapters/wrappers/configuration.py +++ b/src/transformers/adapters/wrappers/configuration.py @@ -35,6 +35,9 @@ "hidden_dropout_prob": "resid_pdrop", "attention_probs_dropout_prob": "attn_pdrop", }, + "gpt_neox": { + "attention_probs_dropout_prob": "attention_probs_dropout_prob", + }, "gptj": { "hidden_dropout_prob": "resid_pdrop", "attention_probs_dropout_prob": "attn_pdrop", diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index ce3de79bd..2786dbc23 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -209,6 +209,7 @@ ("funnel", "FunnelForPreTraining"), ("gpt-sw3", "GPT2LMHeadModel"), ("gpt2", "GPT2LMHeadModel"), + ("gpt_neox", "GPTNeoXForCausalLM"), ("ibert", "IBertForMaskedLM"), ("layoutlm", "LayoutLMForMaskedLM"), ("longformer", "LongformerForMaskedLM"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 94da66961..d59be9b43 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -141,7 +141,7 @@ ("gpt-sw3", ("GPTSw3Tokenizer" if is_sentencepiece_available() else None, None)), ("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), ("gpt_neo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), - ("gpt_neox", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), + ("gpt_neox", (GPT2Tokenizer, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), ("gpt_neox_japanese", ("GPTNeoXJapaneseTokenizer", None)), ("gptj", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), ("groupvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)), diff --git a/src/transformers/models/gpt_neox/configuration_gpt_neox.py b/src/transformers/models/gpt_neox/configuration_gpt_neox.py index a5ba1fddd..548ebf489 100644 --- a/src/transformers/models/gpt_neox/configuration_gpt_neox.py +++ b/src/transformers/models/gpt_neox/configuration_gpt_neox.py @@ -88,14 +88,14 @@ class GPTNeoXConfig(PretrainedConfig): def __init__( self, vocab_size=50432, - hidden_size=6144, - num_hidden_layers=44, - num_attention_heads=64, - intermediate_size=24576, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=768, hidden_act="gelu", rotary_pct=0.25, rotary_emb_base=10000, - max_position_embeddings=2048, + max_position_embeddings=1024, initializer_range=0.02, layer_norm_eps=1e-5, use_cache=True, diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index 589eaae78..3bd604925 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -19,9 +19,19 @@ import torch import torch.utils.checkpoint from torch import nn -from torch.nn import CrossEntropyLoss +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN +from 
...adapters.composition import adjust_tensors_for_parallel +from ...adapters.context import ForwardContext +from ...adapters.lora import Linear as LoRALinear +from ...adapters.lora import MergedLinear as LoRAMergedLinear +from ...adapters.mixins.gpt2 import ( + GPT2DecoderBlockAdaptersMixin, + GPT2ModelAdapterMixin, + GPT2ModelWithHeadsAdaptersMixin, +) +from ...adapters.prefix_tuning import PrefixTuningShim from ...file_utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -83,6 +93,7 @@ def __init__(self, config): self.hidden_size = config.hidden_size self.head_size = self.hidden_size // self.num_attention_heads self.rotary_ndims = int(self.head_size * config.rotary_pct) + self.prefix_tuning = PrefixTuningShim("self_prefix", config) max_positions = config.max_position_embeddings self.register_buffer( "bias", @@ -95,7 +106,7 @@ def __init__(self, config): self.rotary_ndims, config.max_position_embeddings, base=config.rotary_emb_base ) self.norm_factor = torch.sqrt(torch.tensor(self.head_size, dtype=torch.float32)).to(torch.get_default_dtype()) - self.query_key_value = nn.Linear(config.hidden_size, 3 * config.hidden_size) + self.query_key_value = LoRALinear(config.hidden_size, 3 * config.hidden_size, "selfattn", config) self.dense = nn.Linear(config.hidden_size, config.hidden_size) def forward( @@ -149,6 +160,9 @@ def forward( value = torch.cat((past_value, value), dim=-2) present = (key, value) if use_cache else None + key, value, attention_mask = self.prefix_tuning(key, value, hidden_states, attention_mask) + (query,) = adjust_tensors_for_parallel(key, query) + # Compute attention attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) @@ -286,8 +300,8 @@ def apply_rotary_pos_emb(q, k, cos, sin, offset: int = 0): class GPTNeoXMLP(nn.Module): def __init__(self, config): super().__init__() - self.dense_h_to_4h = nn.Linear(config.hidden_size, config.intermediate_size) - self.dense_4h_to_h = nn.Linear(config.intermediate_size, config.hidden_size) + self.dense_h_to_4h = LoRALinear(config.hidden_size, config.intermediate_size, "intermediate", config) + self.dense_4h_to_h = LoRALinear(config.intermediate_size, config.hidden_size, "output", config) self.act = ACT2FN[config.hidden_act] def forward(self, hidden_states): @@ -297,7 +311,7 @@ def forward(self, hidden_states): return hidden_states -class GPTNeoXLayer(nn.Module): +class GPTNeoXLayer(GPTNeoXDecoderBlockAdaptersMixin, nn.Module): def __init__(self, config): super().__init__() self.use_parallel_residual = config.use_parallel_residual @@ -305,6 +319,7 @@ def __init__(self, config): self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.attention = GPTNeoXAttention(config) self.mlp = GPTNeoXMLP(config) + self._init_adapter_modules() def forward( self, @@ -331,14 +346,20 @@ def forward( # pseudocode: # x = x + attn(ln1(x)) + mlp(ln2(x)) mlp_output = self.mlp(self.post_attention_layernorm(hidden_states)) - hidden_states = mlp_output + attn_output + hidden_states + # See https://github.com/adapter-hub/adapter-transformers/pull/426#discussion_r994450898 + hidden_states = self.attention_adapters(attn_output, hidden_states, None) + hidden_states = self.output_adapters(mlp_output, hidden_states, None) + #hidden_states = mlp_output + attn_output + hidden_states + else: # pseudocode: # x = x + attn(ln1(x)) # x = x + mlp(ln2(x)) attn_output = attn_output + hidden_states mlp_output = self.mlp(self.post_attention_layernorm(attn_output)) - hidden_states = mlp_output + 
attn_output + # residual connection + hidden_states = self.output_adapters(mlp_output, attn_output, None) + #hidden_states = mlp_output + attn_output if use_cache: outputs = (hidden_states,) + outputs # hidden_states, present, (attn_weights) @@ -413,7 +434,7 @@ def forward( "The bare GPTNeoX Model transformer outputting raw hidden-states without any specific head on top.", GPT_NEOX_START_DOCSTRING, ) -class GPTNeoXModel(GPTNeoXPreTrainedModel): +class GPTNeoXModel(GPTNeoXModelAdapterMixin, GPTNeoXPreTrainedModel): def __init__(self, config): super().__init__(config) self.config = config @@ -424,6 +445,8 @@ def __init__(self, config): self.gradient_checkpointing = False + self._init_adapter_modules() + # Initialize weights and apply final processing self.post_init() @@ -440,6 +463,7 @@ def set_input_embeddings(self, value): output_type=BaseModelOutputWithPast, config_class=_CONFIG_FOR_DOC, ) + @ForwardContext.wrap def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -511,7 +535,7 @@ def forward( if inputs_embeds is None: inputs_embeds = self.embed_in(input_ids) - + inputs_embeds = self.invertible_adapters_forward(inputs_embeds) hidden_states = inputs_embeds presents = () if use_cache else None @@ -552,6 +576,11 @@ def custom_forward(*inputs): output_attentions=output_attentions, ) hidden_states = outputs[0] + (attention_mask,) = adjust_tensors_for_parallel(hidden_states, attention_mask) + # also adjust output shape if necessary + if getattr(ForwardContext.get_context(), "adapters_parallelized", False): + output_shape = hidden_states.size() + if use_cache is True: presents = presents + (outputs[1],) if output_attentions: @@ -576,7 +605,7 @@ def custom_forward(*inputs): @add_start_docstrings( """GPTNeoX Model with a `language modeling` head on top for CLM fine-tuning.""", GPT_NEOX_START_DOCSTRING ) -class GPTNeoXForCausalLM(GPTNeoXPreTrainedModel): +class GPTNeoXForCausalLM(GPTNeoXModelWithHeadsAdaptersMixin, GPTNeoXPreTrainedModel): _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] From b7d59dbc7b7c5e2b29186d020c3c10686fd72ecd Mon Sep 17 00:00:00 2001 From: Jesujoba Alabi Date: Wed, 15 Mar 2023 15:17:31 +0100 Subject: [PATCH 02/13] Adding adapter support for NeoX --- src/transformers/models/gpt_neox/modeling_gpt_neox.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index 3bd604925..e5644485f 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -26,10 +26,10 @@ from ...adapters.context import ForwardContext from ...adapters.lora import Linear as LoRALinear from ...adapters.lora import MergedLinear as LoRAMergedLinear -from ...adapters.mixins.gpt2 import ( - GPT2DecoderBlockAdaptersMixin, - GPT2ModelAdapterMixin, - GPT2ModelWithHeadsAdaptersMixin, +from ...adapters.mixins.gpt_neox import ( + GPTNeoXDecoderBlockAdaptersMixin, + GPTNeoXModelAdapterMixin, + GPTNeoXModelWithHeadsAdaptersMixin, ) from ...adapters.prefix_tuning import PrefixTuningShim from ...file_utils import ( From 492d16a51e2be4587b139ae3720254e7d409d11b Mon Sep 17 00:00:00 2001 From: Jesujoba Alabi Date: Wed, 15 Mar 2023 17:57:47 +0100 Subject: [PATCH 03/13] Adding adapter support for NeoX --- .../adapters/wrappers/configuration.py | 1 + .../models/gpt_neox/configuration_gpt_neox.py | 10 +-- .../models/gpt_neox/modeling_gpt_neox.py | 1 + tests_adapters/test_gpt_neox.py | 
63 +++++++++++++++++++ 4 files changed, 70 insertions(+), 5 deletions(-) create mode 100644 tests_adapters/test_gpt_neox.py diff --git a/src/transformers/adapters/wrappers/configuration.py b/src/transformers/adapters/wrappers/configuration.py index d679feff3..ac68df859 100644 --- a/src/transformers/adapters/wrappers/configuration.py +++ b/src/transformers/adapters/wrappers/configuration.py @@ -36,6 +36,7 @@ "attention_probs_dropout_prob": "attn_pdrop", }, "gpt_neox": { + "hidden_dropout_prob": "hidden_dropout_prob", "attention_probs_dropout_prob": "attention_probs_dropout_prob", }, "gptj": { diff --git a/src/transformers/models/gpt_neox/configuration_gpt_neox.py b/src/transformers/models/gpt_neox/configuration_gpt_neox.py index 548ebf489..a5ba1fddd 100644 --- a/src/transformers/models/gpt_neox/configuration_gpt_neox.py +++ b/src/transformers/models/gpt_neox/configuration_gpt_neox.py @@ -88,14 +88,14 @@ class GPTNeoXConfig(PretrainedConfig): def __init__( self, vocab_size=50432, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=768, + hidden_size=6144, + num_hidden_layers=44, + num_attention_heads=64, + intermediate_size=24576, hidden_act="gelu", rotary_pct=0.25, rotary_emb_base=10000, - max_position_embeddings=1024, + max_position_embeddings=2048, initializer_range=0.02, layer_norm_eps=1e-5, use_cache=True, diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index e5644485f..26ea65b5b 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -314,6 +314,7 @@ def forward(self, hidden_states): class GPTNeoXLayer(GPTNeoXDecoderBlockAdaptersMixin, nn.Module): def __init__(self, config): super().__init__() + self.config = config self.use_parallel_residual = config.use_parallel_residual self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) diff --git a/tests_adapters/test_gpt_neox.py b/tests_adapters/test_gpt_neox.py new file mode 100644 index 000000000..24dfab710 --- /dev/null +++ b/tests_adapters/test_gpt_neox.py @@ -0,0 +1,63 @@ +import unittest + +from transformers import GPTNeoXConfig +from transformers.testing_utils import require_torch + +from .methods import ( + BottleneckAdapterTestMixin, + CompacterTestMixin, + IA3TestMixin, + LoRATestMixin, + PrefixTuningTestMixin, + UniPELTTestMixin, +) +from .test_adapter import AdapterTestBase, make_config +from .test_adapter_backward_compability import CompabilityTestMixin +from .composition.test_parallel import ParallelAdapterInferenceTestMixin, ParallelTrainingMixin +from .test_adapter_conversion import ModelClassConversionTestMixin +from .test_adapter_embeddings import EmbeddingTestMixin +from .test_adapter_fusion_common import AdapterFusionModelTestMixin +from .test_adapter_heads import PredictionHeadModelTestMixin + + +class GPTNeoXAdapterTestBase(AdapterTestBase): + config_class = GPTNeoXConfig + config = make_config( + GPTNeoXConfig, + n_embd=32, + n_layer=4, + n_head=4, + # set pad token to eos token + pad_token_id=50256, + resid_pdrop=0.1, + ) + tokenizer_name = "EleutherAI/gpt-neox-20b" + + +@require_torch +class GPTNeoXAdapterTest( + BottleneckAdapterTestMixin, + CompacterTestMixin, + IA3TestMixin, + LoRATestMixin, + UniPELTTestMixin, + PrefixTuningTestMixin, + EmbeddingTestMixin, + CompabilityTestMixin, + AdapterFusionModelTestMixin, + 
PredictionHeadModelTestMixin, + ParallelAdapterInferenceTestMixin, + ParallelTrainingMixin, + GPTNeoXAdapterTestBase, + unittest.TestCase, +): + pass + + +@require_torch +class GPTNeoXClassConversionTest( + ModelClassConversionTestMixin, + GPTNeoXAdapterTestBase, + unittest.TestCase, +): + pass From 0d25ae97b0d36977b92189f9af5daf29e8037be6 Mon Sep 17 00:00:00 2001 From: Jesujoba Alabi Date: Wed, 15 Mar 2023 18:03:18 +0100 Subject: [PATCH 04/13] restored the None NeoX tokenizer --- src/transformers/models/auto/tokenization_auto.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index d59be9b43..94da66961 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -141,7 +141,7 @@ ("gpt-sw3", ("GPTSw3Tokenizer" if is_sentencepiece_available() else None, None)), ("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), ("gpt_neo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), - ("gpt_neox", (GPT2Tokenizer, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), + ("gpt_neox", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), ("gpt_neox_japanese", ("GPTNeoXJapaneseTokenizer", None)), ("gptj", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), ("groupvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)), From f920882c0abd7bbe196ec141d543a150a8f7c0da Mon Sep 17 00:00:00 2001 From: Jesujoba Alabi Date: Tue, 21 Mar 2023 17:56:49 +0100 Subject: [PATCH 05/13] Adding adapters to GPTNeoX --- src/transformers/models/gpt_neox/modeling_gpt_neox.py | 1 - src/transformers/utils/fx.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index 26ea65b5b..39c29c9d4 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -25,7 +25,6 @@ from ...adapters.composition import adjust_tensors_for_parallel from ...adapters.context import ForwardContext from ...adapters.lora import Linear as LoRALinear -from ...adapters.lora import MergedLinear as LoRAMergedLinear from ...adapters.mixins.gpt_neox import ( GPTNeoXDecoderBlockAdaptersMixin, GPTNeoXModelAdapterMixin, diff --git a/src/transformers/utils/fx.py b/src/transformers/utils/fx.py index 7e951fdb1..29780928e 100755 --- a/src/transformers/utils/fx.py +++ b/src/transformers/utils/fx.py @@ -117,6 +117,7 @@ def _generate_supported_model_class_names( "electra", "gpt2", "gpt_neo", + "gpt_neox", "gptj", "hubert", "layoutlm", From 034dea1c13a6853f19a448d3c221629f10f3be9f Mon Sep 17 00:00:00 2001 From: Jesujoba Alabi Date: Tue, 4 Apr 2023 14:11:40 +0200 Subject: [PATCH 06/13] fixed neox attention_adapters --- src/transformers/models/gpt_neox/modeling_gpt_neox.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index 39c29c9d4..67f03069d 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -355,10 +355,11 @@ def forward( # pseudocode: # x = x + attn(ln1(x)) # x = x + mlp(ln2(x)) - attn_output = attn_output + hidden_states - mlp_output = 
self.mlp(self.post_attention_layernorm(attn_output)) + hidden_states = self.attention_adapters(attn_output, hidden_states, None) #attn_output = attn_output + hidden_states + residual = hidden_states + mlp_output = self.mlp(self.post_attention_layernorm(hidden_states)) # residual connection - hidden_states = self.output_adapters(mlp_output, attn_output, None) + hidden_states = self.output_adapters(mlp_output, residual, None) #hidden_states = mlp_output + attn_output if use_cache: From 8e2c52db52cfdd5f603cf8a3e06dfb08277233d6 Mon Sep 17 00:00:00 2001 From: Jesujoba Alabi Date: Fri, 21 Apr 2023 10:45:49 +0200 Subject: [PATCH 07/13] Fixed PR reviews --- src/transformers/__init__.py | 11 ------- .../adapters/models/gpt_neox/__init__.py | 7 ++-- .../adapters/models/gpt_neox/adapter_model.py | 33 ------------------- .../models/gpt_neox/modeling_gpt_neox.py | 3 +- 4 files changed, 4 insertions(+), 50 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 920c1cb38..1c9026e0c 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -2597,7 +2597,6 @@ "GPT2ModelWithHeads", "GPTJAdapterModel", "GPTNeoXAdapterModel", - "GPTNeoXModelWithHeads", "HoulsbyConfig", "HoulsbyInvConfig", "IA3Config", @@ -2944,15 +2943,6 @@ "TFGPTJPreTrainedModel", ] ) - _import_structure["models.gpt_neox"].extend( - [ - "TFGPTNeoXForCausalLM", - "TFGPTNeoXForQuestionAnswering", - "TFGPTNeoXForSequenceClassification", - "TFGPTNeoXModel", - "TFGPTNeoXPreTrainedModel", - ] - ) _import_structure["models.groupvit"].extend( [ "TF_GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -5713,7 +5703,6 @@ GPT2AdapterModel, GPT2ModelWithHeads, GPTNeoXAdapterModel, - GPTNeoXModelWithHeads, GPTJAdapterModel, HoulsbyConfig, HoulsbyInvConfig, diff --git a/src/transformers/adapters/models/gpt_neox/__init__.py b/src/transformers/adapters/models/gpt_neox/__init__.py index 72aae7460..447dc9d60 100644 --- a/src/transformers/adapters/models/gpt_neox/__init__.py +++ b/src/transformers/adapters/models/gpt_neox/__init__.py @@ -22,15 +22,12 @@ _import_structure = { - "adapter_model": [ - "GPTNeoXAdapterModel", - "GPTNeoXModelWithHeads" - ], + "adapter_model": ["GPTNeoXAdapterModel"], } if TYPE_CHECKING: - from .adapter_model import GPTNeoXAdapterModel, GPTNeoXModelWithHeads + from .adapter_model import GPTNeoXAdapterModel else: import sys diff --git a/src/transformers/adapters/models/gpt_neox/adapter_model.py b/src/transformers/adapters/models/gpt_neox/adapter_model.py index 7ac4c6291..eb405bf38 100644 --- a/src/transformers/adapters/models/gpt_neox/adapter_model.py +++ b/src/transformers/adapters/models/gpt_neox/adapter_model.py @@ -184,36 +184,3 @@ def add_qa_head( ): head = QuestionAnsweringHead(self, head_name, num_labels, layers, activation_function, id2label) self.add_prediction_head(head, overwrite_ok) - -class GPTNeoXModelWithHeads(GPTNeoXAdapterModel): - def __init__(self, *args, **kwargs): - warnings.warn( - "This class has been renamed to `{}` in v3. " - "Please use the new class instead as this class might be removed in a future version.".format( - self.__class__.__bases__[0].__name__ - ), - FutureWarning, - ) - super().__init__(*args, **kwargs) - - @classmethod - def from_config(cls, config): - warnings.warn( - "This class has been renamed to `{}` in v3. 
" - "Please use the new class instead as this class might be removed in a future version.".format( - cls.__bases__[0].__name__ - ), - FutureWarning, - ) - return super().from_config(config) - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - warnings.warn( - "This class has been renamed to `{}` in v3. " - "Please use the new class instead as this class might be removed in a future version.".format( - cls.__bases__[0].__name__ - ), - FutureWarning, - ) - return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index 67f03069d..d9acbaed8 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -25,6 +25,7 @@ from ...adapters.composition import adjust_tensors_for_parallel from ...adapters.context import ForwardContext from ...adapters.lora import Linear as LoRALinear +from ...adapters.lora import MergedLinear as LoRAMergedLinear from ...adapters.mixins.gpt_neox import ( GPTNeoXDecoderBlockAdaptersMixin, GPTNeoXModelAdapterMixin, @@ -105,7 +106,7 @@ def __init__(self, config): self.rotary_ndims, config.max_position_embeddings, base=config.rotary_emb_base ) self.norm_factor = torch.sqrt(torch.tensor(self.head_size, dtype=torch.float32)).to(torch.get_default_dtype()) - self.query_key_value = LoRALinear(config.hidden_size, 3 * config.hidden_size, "selfattn", config) + self.query_key_value = LoRAMergedLinear(config.hidden_size, 3 * config.hidden_size, "selfattn", config, fan_in_fan_out=True,) self.dense = nn.Linear(config.hidden_size, config.hidden_size) def forward( From 25228779ccd35c520bf4a013635afa7e056c0a37 Mon Sep 17 00:00:00 2001 From: Jesujoba Alabi Date: Fri, 21 Apr 2023 15:12:25 +0200 Subject: [PATCH 08/13] updated adapter head --- .../adapters/models/gpt_neox/adapter_model.py | 50 +------------------ 1 file changed, 1 insertion(+), 49 deletions(-) diff --git a/src/transformers/adapters/models/gpt_neox/adapter_model.py b/src/transformers/adapters/models/gpt_neox/adapter_model.py index eb405bf38..55b00923a 100644 --- a/src/transformers/adapters/models/gpt_neox/adapter_model.py +++ b/src/transformers/adapters/models/gpt_neox/adapter_model.py @@ -7,11 +7,7 @@ from ...composition import adjust_tensors_for_parallel from ...heads import ( CausalLMHead, - ClassificationHead, ModelWithFlexibleHeadsAdaptersMixin, - MultiLabelClassificationHead, - QuestionAnsweringHead, - TaggingHead, ) from ...model_mixin import EmbeddingAdaptersWrapperMixin @@ -127,41 +123,9 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attenti head_types = { - "classification": ClassificationHead, - "multilabel_classification": MultiLabelClassificationHead, - "causal_lm": CausalLMHead, - "question_answering": QuestionAnsweringHead, - "tagging": TaggingHead, + "causal_lm": CausalLMHead } - def add_classification_head( - self, - head_name, - num_labels=2, - layers=2, - activation_function="tanh", - overwrite_ok=False, - multilabel=False, - id2label=None, - ): - """ - Adds a sequence classification head on top of the model. - - Args: - head_name (str): The name of the head. - num_labels (int, optional): Number of classification labels. Defaults to 2. - layers (int, optional): Number of layers. Defaults to 2. - activation_function (str, optional): Activation function. Defaults to 'tanh'. 
- overwrite_ok (bool, optional): Force overwrite if a head with the same name exists. Defaults to False. - multilabel (bool, optional): Enable multilabel classification setup. Defaults to False. - """ - - if multilabel: - head = MultiLabelClassificationHead(self, head_name, num_labels, layers, activation_function, id2label) - else: - head = ClassificationHead(self, head_name, num_labels, layers, activation_function, id2label) - self.add_prediction_head(head, overwrite_ok) - def add_causal_lm_head(self, head_name, overwrite_ok=False): """ Adds a causal language modeling head on top of the model. @@ -172,15 +136,3 @@ def add_causal_lm_head(self, head_name, overwrite_ok=False): """ head = CausalLMHead(self, head_name) self.add_prediction_head(head, overwrite_ok=overwrite_ok) - - def add_qa_head( - self, - head_name, - num_labels=2, - layers=1, - activation_function="tanh", - overwrite_ok=False, - id2label=None, - ): - head = QuestionAnsweringHead(self, head_name, num_labels, layers, activation_function, id2label) - self.add_prediction_head(head, overwrite_ok) From 23034e6d1ee386789f2264e8ad33cdc1143438ae Mon Sep 17 00:00:00 2001 From: Jesujoba Alabi Date: Sun, 23 Apr 2023 00:47:35 +0200 Subject: [PATCH 09/13] set LoRAMergedLinear to False for NeoX --- src/transformers/models/gpt_neox/modeling_gpt_neox.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index d9acbaed8..015b62b50 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -106,7 +106,7 @@ def __init__(self, config): self.rotary_ndims, config.max_position_embeddings, base=config.rotary_emb_base ) self.norm_factor = torch.sqrt(torch.tensor(self.head_size, dtype=torch.float32)).to(torch.get_default_dtype()) - self.query_key_value = LoRAMergedLinear(config.hidden_size, 3 * config.hidden_size, "selfattn", config, fan_in_fan_out=True,) + self.query_key_value = LoRAMergedLinear(config.hidden_size, 3 * config.hidden_size, "selfattn", config, fan_in_fan_out=False,) self.dense = nn.Linear(config.hidden_size, config.hidden_size) def forward( From 31606a58d16b4bc89d89e87fb8e02ae24d961966 Mon Sep 17 00:00:00 2001 From: Jesujoba Alabi Date: Sun, 23 Apr 2023 01:13:08 +0200 Subject: [PATCH 10/13] set reformatted the files with black --- src/transformers/__init__.py | 4 +++- src/transformers/adapters/__init__.py | 4 ++-- src/transformers/adapters/head_utils.py | 2 +- .../adapters/models/gpt_neox/adapter_model.py | 7 ++----- .../models/gpt_neox/modeling_gpt_neox.py | 16 ++++++++++++---- 5 files changed, 20 insertions(+), 13 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 1c9026e0c..5674f3591 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -3457,7 +3457,9 @@ ["FlaxGPTNeoForCausalLM", "FlaxGPTNeoModel", "FlaxGPTNeoPreTrainedModel"] ) _import_structure["models.gptj"].extend(["FlaxGPTJForCausalLM", "FlaxGPTJModel", "FlaxGPTJPreTrainedModel"]) - _import_structure["models.gpt_neox"].extend(["FlaxGPTNeoXForCausalLM", "FlaxGPTNeoXModel", "FlaxGPTNeoXPreTrainedModel"]) + _import_structure["models.gpt_neox"].extend( + ["FlaxGPTNeoXForCausalLM", "FlaxGPTNeoXModel", "FlaxGPTNeoXPreTrainedModel"] + ) _import_structure["models.longt5"].extend( ["FlaxLongT5ForConditionalGeneration", "FlaxLongT5Model", "FlaxLongT5PreTrainedModel"] diff --git a/src/transformers/adapters/__init__.py 
b/src/transformers/adapters/__init__.py index 19eb72c8e..3842c5fe3 100644 --- a/src/transformers/adapters/__init__.py +++ b/src/transformers/adapters/__init__.py @@ -114,8 +114,8 @@ "GPT2ModelWithHeads", ], "models.gpt_neox": [ - "GPTNeoXAdapterModel", - "GPTNeoXModelWithHeads", + "GPTNeoXAdapterModel", + "GPTNeoXModelWithHeads", ], "models.gptj": ["GPTJAdapterModel"], "models.mbart": [ diff --git a/src/transformers/adapters/head_utils.py b/src/transformers/adapters/head_utils.py index 8ea4c2579..cf38f3d7f 100644 --- a/src/transformers/adapters/head_utils.py +++ b/src/transformers/adapters/head_utils.py @@ -381,7 +381,7 @@ }, "layers": [None, "classifier"], }, - #GPT-NeoX + # GPT-NeoX "GPTNeoXForCausalLM": { "config": { "head_type": "causal_lm", diff --git a/src/transformers/adapters/models/gpt_neox/adapter_model.py b/src/transformers/adapters/models/gpt_neox/adapter_model.py index 55b00923a..baed7f1aa 100644 --- a/src/transformers/adapters/models/gpt_neox/adapter_model.py +++ b/src/transformers/adapters/models/gpt_neox/adapter_model.py @@ -102,7 +102,7 @@ def forward( ) return outputs - + # Copied from GPTNeoXForCausalLM def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs): input_shape = input_ids.shape @@ -120,11 +120,8 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attenti "attention_mask": attention_mask, "past_key_values": past_key_values, } - - head_types = { - "causal_lm": CausalLMHead - } + head_types = {"causal_lm": CausalLMHead} def add_causal_lm_head(self, head_name, overwrite_ok=False): """ diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index 015b62b50..230f1a91f 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -106,7 +106,13 @@ def __init__(self, config): self.rotary_ndims, config.max_position_embeddings, base=config.rotary_emb_base ) self.norm_factor = torch.sqrt(torch.tensor(self.head_size, dtype=torch.float32)).to(torch.get_default_dtype()) - self.query_key_value = LoRAMergedLinear(config.hidden_size, 3 * config.hidden_size, "selfattn", config, fan_in_fan_out=False,) + self.query_key_value = LoRAMergedLinear( + config.hidden_size, + 3 * config.hidden_size, + "selfattn", + config, + fan_in_fan_out=False, + ) self.dense = nn.Linear(config.hidden_size, config.hidden_size) def forward( @@ -350,18 +356,20 @@ def forward( # See https://github.com/adapter-hub/adapter-transformers/pull/426#discussion_r994450898 hidden_states = self.attention_adapters(attn_output, hidden_states, None) hidden_states = self.output_adapters(mlp_output, hidden_states, None) - #hidden_states = mlp_output + attn_output + hidden_states + # hidden_states = mlp_output + attn_output + hidden_states else: # pseudocode: # x = x + attn(ln1(x)) # x = x + mlp(ln2(x)) - hidden_states = self.attention_adapters(attn_output, hidden_states, None) #attn_output = attn_output + hidden_states + hidden_states = self.attention_adapters( + attn_output, hidden_states, None + ) # attn_output = attn_output + hidden_states residual = hidden_states mlp_output = self.mlp(self.post_attention_layernorm(hidden_states)) # residual connection hidden_states = self.output_adapters(mlp_output, residual, None) - #hidden_states = mlp_output + attn_output + # hidden_states = mlp_output + attn_output if use_cache: outputs = (hidden_states,) + outputs # hidden_states, present, (attn_weights) From 
f228e9a3e2fc55858dd1989295bf861346058834 Mon Sep 17 00:00:00 2001 From: Jesujoba Alabi Date: Sun, 23 Apr 2023 01:20:53 +0200 Subject: [PATCH 11/13] use isort to fix the needed files --- src/transformers/__init__.py | 16 ++++++++-------- .../adapters/models/gpt_neox/adapter_model.py | 5 +---- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 5674f3591..56490a8a3 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -5704,8 +5704,8 @@ ForwardContext, GPT2AdapterModel, GPT2ModelWithHeads, - GPTNeoXAdapterModel, GPTJAdapterModel, + GPTNeoXAdapterModel, HoulsbyConfig, HoulsbyInvConfig, IA3Config, @@ -6009,13 +6009,6 @@ TFGPT2Model, TFGPT2PreTrainedModel, ) - from .models.gptj import ( - TFGPTJForCausalLM, - TFGPTJForQuestionAnswering, - TFGPTJForSequenceClassification, - TFGPTJModel, - TFGPTJPreTrainedModel, - ) from .models.gpt_neox import ( TFGPTNeoXForCausalLM, TFGPTNeoXForQuestionAnswering, @@ -6023,6 +6016,13 @@ TFGPTNeoXModel, TFGPTNeoXPreTrainedModel, ) + from .models.gptj import ( + TFGPTJForCausalLM, + TFGPTJForQuestionAnswering, + TFGPTJForSequenceClassification, + TFGPTJModel, + TFGPTJPreTrainedModel, + ) from .models.groupvit import ( TF_GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST, TFGroupViTModel, diff --git a/src/transformers/adapters/models/gpt_neox/adapter_model.py b/src/transformers/adapters/models/gpt_neox/adapter_model.py index baed7f1aa..8573155f0 100644 --- a/src/transformers/adapters/models/gpt_neox/adapter_model.py +++ b/src/transformers/adapters/models/gpt_neox/adapter_model.py @@ -5,10 +5,7 @@ from ....models.gpt_neox.modeling_gpt_neox import GPT_NEOX_START_DOCSTRING, GPTNeoXModel, GPTNeoXPreTrainedModel from ....utils import add_start_docstrings from ...composition import adjust_tensors_for_parallel -from ...heads import ( - CausalLMHead, - ModelWithFlexibleHeadsAdaptersMixin, -) +from ...heads import CausalLMHead, ModelWithFlexibleHeadsAdaptersMixin from ...model_mixin import EmbeddingAdaptersWrapperMixin From 355b3155da4174a7ce0ee8931f77d419d47b2b87 Mon Sep 17 00:00:00 2001 From: Jesujoba Alabi Date: Sun, 23 Apr 2023 01:29:19 +0200 Subject: [PATCH 12/13] fixed unsued imports --- src/transformers/models/gpt_neox/modeling_gpt_neox.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index 230f1a91f..f39b750ac 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -19,7 +19,7 @@ import torch import torch.utils.checkpoint from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from torch.nn import CrossEntropyLoss from ...activations import ACT2FN from ...adapters.composition import adjust_tensors_for_parallel @@ -587,9 +587,6 @@ def custom_forward(*inputs): ) hidden_states = outputs[0] (attention_mask,) = adjust_tensors_for_parallel(hidden_states, attention_mask) - # also adjust output shape if necessary - if getattr(ForwardContext.get_context(), "adapters_parallelized", False): - output_shape = hidden_states.size() if use_cache is True: presents = presents + (outputs[1],) From 8c5395b02f90af56f575724b0c5a78480590df3f Mon Sep 17 00:00:00 2001 From: Jesujoba Alabi Date: Sun, 23 Apr 2023 01:44:45 +0200 Subject: [PATCH 13/13] Remove TFGPTNeoX --- src/transformers/__init__.py | 7 ------- 1 file changed, 7 deletions(-) diff --git 
a/src/transformers/__init__.py b/src/transformers/__init__.py index 56490a8a3..063fc63f5 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -6009,13 +6009,6 @@ TFGPT2Model, TFGPT2PreTrainedModel, ) - from .models.gpt_neox import ( - TFGPTNeoXForCausalLM, - TFGPTNeoXForQuestionAnswering, - TFGPTNeoXForSequenceClassification, - TFGPTNeoXModel, - TFGPTNeoXPreTrainedModel, - ) from .models.gptj import ( TFGPTJForCausalLM, TFGPTJForQuestionAnswering,
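For reference, a minimal usage sketch of the GPT-NeoX adapter support added by this series (not part of the diff itself): `GPTNeoXAdapterModel` and `add_causal_lm_head` come from the patches above, while the adapter name, the `pfeiffer` config string, and the training calls are assumed from the standard adapter-transformers 3.x API rather than anything introduced here. `EleutherAI/gpt-neox-20b` is just the checkpoint referenced in the new test base; a smaller GPT-NeoX-style checkpoint would normally be substituted when experimenting.

```python
# Hypothetical sketch exercising the new GPT-NeoX adapter support.
# Adapter/head names and the "pfeiffer" config string are illustrative choices.
from transformers import AutoTokenizer
from transformers.adapters import GPTNeoXAdapterModel

model_name = "EleutherAI/gpt-neox-20b"  # checkpoint named in the new test base
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = GPTNeoXAdapterModel.from_pretrained(model_name)

# Add a bottleneck adapter plus the causal LM head wired up by this series.
model.add_adapter("neox_bottleneck", config="pfeiffer")
model.add_causal_lm_head("neox_bottleneck")

# Freeze the base model and train only the adapter (train_adapter also activates it).
model.train_adapter("neox_bottleneck")

inputs = tokenizer("Adapters for GPT-NeoX:", return_tensors="pt")
outputs = model(**inputs)
print(outputs.logits.shape)
```

Because `train_adapter` freezes the base model, only the adapter weights inserted into `attention_adapters`/`output_adapters` (plus the active prediction head) remain trainable, which is the fine-tuning path targeted by the layer changes in `modeling_gpt_neox.py` above.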