From 45a382b044eeb081573c3c5f7df2a0447ca33048 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= <45557362+qgallouedec@users.noreply.github.com>
Date: Mon, 19 Jun 2023 15:59:13 +0200
Subject: [PATCH 1/3] Fix arg sort in docstring

---
 src/transformers/models/gpt_neo/configuration_gpt_neo.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/gpt_neo/configuration_gpt_neo.py b/src/transformers/models/gpt_neo/configuration_gpt_neo.py
index 476537909b58..3c38798853f4 100644
--- a/src/transformers/models/gpt_neo/configuration_gpt_neo.py
+++ b/src/transformers/models/gpt_neo/configuration_gpt_neo.py
@@ -75,10 +75,10 @@ class GPTNeoConfig(PretrainedConfig):
             just in case (e.g., 512 or 1024 or 2048).
         type_vocab_size (`int`, *optional*, defaults to 2):
             The vocabulary size of the `token_type_ids` passed when calling [`GPTNeoModel`].
-        initializer_range (`float`, *optional*, defaults to 0.02):
-            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
             The epsilon used by the layer normalization layers.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.

From 727dedc811d44901cd1cfbd722f91bc5483ab939 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20GALLOU=C3=89DEC?=
Date: Mon, 19 Jun 2023 16:37:07 +0200
Subject: [PATCH 2/3] further order fix

---
 .../models/gpt_neo/configuration_gpt_neo.py | 25 +++++++++++--------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/src/transformers/models/gpt_neo/configuration_gpt_neo.py b/src/transformers/models/gpt_neo/configuration_gpt_neo.py
index 3c38798853f4..546f4006e7ce 100644
--- a/src/transformers/models/gpt_neo/configuration_gpt_neo.py
+++ b/src/transformers/models/gpt_neo/configuration_gpt_neo.py
@@ -47,34 +47,35 @@ class GPTNeoConfig(PretrainedConfig):
             Vocabulary size of the GPT Neo model. Defines the number of different tokens that can be represented by the
             `inputs_ids` passed when calling [`GPTNeoModel`]. Vocabulary size of the model. Defines the different tokens
             that can be represented by the *inputs_ids* passed to the forward method of [`GPTNeoModel`].
-        attention_types (`List`, *optional*, defaults to `[[["global", "local"], 12]]`):
-            The type of attention for each layer in a `List` of the following format `[[["attention_type"],
-            num_layerss]]` e.g. for a 24 layer model `[[["global"], 24]]` or `[[["global", "local"], 12]]` Choose the
-            value of `attention_type` from `["global", "local"]`
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
         hidden_size (`int`, *optional*, defaults to 2048):
             Dimensionality of the encoder layers and the pooler layer.
         num_layers (`int`, *optional*, defaults to 24):
             Number of hidden layers in the Transformer encoder.
+        attention_types (`List`, *optional*, defaults to `[[["global", "local"], 12]]`):
+            The type of attention for each layer in a `List` of the following format `[[["attention_type"],
+            num_layerss]]` e.g. for a 24 layer model `[[["global"], 24]]` or `[[["global", "local"], 12]]` Choose the
+            value of `attention_type` from `["global", "local"]`
         num_heads (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer encoder.
         intermediate_size (`int`, *optional*, defaults to 8192):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        window_size (`int`, *optional*, defaults to 256):
+            The size of the sliding window for local attention.
         activation_function (`str` or `function`, *optional*, defaults to `"gelu_new"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
             `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        resid_dropout (`float`, *optional*, defaults to 0.0):
+            Residual dropout used in the attention pattern.
         embed_dropout (`float`, *optional*, defaults to 0.0):
             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
         classifier_dropout (`float`, *optional*, defaults to 0.1):
             Argument used when doing token classification, used in the model [`GPTNeoForTokenClassification`].
-            The dropout ratio for the hidden layer.
-        max_position_embeddings (`int`, *optional*, defaults to 2048):
-            The maximum sequence length that this model might ever be used with. Typically set this to something large
-            just in case (e.g., 512 or 1024 or 2048).
-        type_vocab_size (`int`, *optional*, defaults to 2):
-            The vocabulary size of the `token_type_ids` passed when calling [`GPTNeoModel`].
         layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
             The epsilon used by the layer normalization layers.
         initializer_range (`float`, *optional*, defaults to 0.02):
@@ -82,6 +83,10 @@ class GPTNeoConfig(PretrainedConfig):
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
+        bos_token_id (`int`, *optional*, defaults to 50256):
+            The id of the beginning of sentence token in the vocabulary.
+        eos_token_id (`int`, *optional*, defaults to 50256):
+            The id of the end of sentence token in the vocabulary.

     Example:

From cf33a629d620fc784a86b9a9edb917016c369d4e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20GALLOU=C3=89DEC?=
Date: Mon, 19 Jun 2023 18:55:51 +0200
Subject: [PATCH 3/3] make style

---
 src/transformers/models/gpt_neo/configuration_gpt_neo.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/gpt_neo/configuration_gpt_neo.py b/src/transformers/models/gpt_neo/configuration_gpt_neo.py
index 546f4006e7ce..ea1c37af2199 100644
--- a/src/transformers/models/gpt_neo/configuration_gpt_neo.py
+++ b/src/transformers/models/gpt_neo/configuration_gpt_neo.py
@@ -74,8 +74,8 @@ class GPTNeoConfig(PretrainedConfig):
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
         classifier_dropout (`float`, *optional*, defaults to 0.1):
-            Argument used when doing token classification, used in the model [`GPTNeoForTokenClassification`].
-            The dropout ratio for the hidden layer.
+            Argument used when doing token classification, used in the model [`GPTNeoForTokenClassification`]. The
+            dropout ratio for the hidden layer.
         layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
             The epsilon used by the layer normalization layers.
         initializer_range (`float`, *optional*, defaults to 0.02):
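For reference, the reordered docstring entries mirror the keyword arguments of `GPTNeoConfig.__init__`. The snippet below is a minimal sketch, not part of the patch: it assumes the `transformers` library (with its PyTorch backend) is installed and simply passes the documented defaults explicitly, in the corrected docstring order, then builds a model from the resulting configuration.

```python
# Minimal sketch: exercise the keyword arguments whose docstring entries are reordered above,
# passing the documented default values explicitly in the corrected order.
from transformers import GPTNeoConfig, GPTNeoModel

config = GPTNeoConfig(
    max_position_embeddings=2048,
    hidden_size=2048,
    num_layers=24,
    attention_types=[[["global", "local"], 12]],  # alternating global/local attention, 24 layers total
    num_heads=16,
    intermediate_size=8192,
    window_size=256,  # sliding window used by the local attention layers
    activation_function="gelu_new",
    resid_dropout=0.0,
    embed_dropout=0.0,
    attention_dropout=0.0,
    classifier_dropout=0.1,
    layer_norm_epsilon=1e-5,
    initializer_range=0.02,
    use_cache=True,
    bos_token_id=50256,
    eos_token_id=50256,
)

# Build a randomly initialized model from the configuration and inspect it.
model = GPTNeoModel(config)
print(model.config)
```

Keeping the docstring order aligned with the `__init__` signature makes a call like this straightforward to cross-check against the documentation, which is the point of the reordering above.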