
Commit 475c567: fix configs
Parent: 80c6438

8 files changed (+2, -26 lines)

src/transformers/models/gemma/configuration_gemma.py

Lines changed: 0 additions & 5 deletions
@@ -54,9 +54,6 @@ class GemmaConfig(PretrainedConfig):
             The attention head dimension.
         hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
             The legacy activation function. It is overwritten by the `hidden_activation`.
-        hidden_activation (`str` or `function`, *optional*):
-            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
-            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
         max_position_embeddings (`int`, *optional*, defaults to 8192):
             The maximum sequence length that this model might ever be used with.
         initializer_range (`float`, *optional*, defaults to 0.02):
@@ -117,7 +114,6 @@ def __init__(
         num_key_value_heads=16,
         head_dim=256,
         hidden_act="gelu_pytorch_tanh",
-        hidden_activation=None,
         max_position_embeddings=8192,
         initializer_range=0.02,
         rms_norm_eps=1e-6,
@@ -140,7 +136,6 @@ def __init__(
         self.head_dim = head_dim
         self.num_key_value_heads = num_key_value_heads
         self.hidden_act = hidden_act
-        self.hidden_activation = hidden_activation
         self.initializer_range = initializer_range
         self.rms_norm_eps = rms_norm_eps
         self.use_cache = use_cache

src/transformers/models/gemma/modular_gemma.py

Lines changed: 0 additions & 5 deletions
@@ -79,9 +79,6 @@ class GemmaConfig(PretrainedConfig):
             The attention head dimension.
         hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
             The legacy activation function. It is overwritten by the `hidden_activation`.
-        hidden_activation (`str` or `function`, *optional*):
-            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
-            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
         max_position_embeddings (`int`, *optional*, defaults to 8192):
             The maximum sequence length that this model might ever be used with.
         initializer_range (`float`, *optional*, defaults to 0.02):
@@ -142,7 +139,6 @@ def __init__(
         num_key_value_heads=16,
         head_dim=256,
         hidden_act="gelu_pytorch_tanh",
-        hidden_activation=None,
         max_position_embeddings=8192,
         initializer_range=0.02,
         rms_norm_eps=1e-6,
@@ -165,7 +161,6 @@ def __init__(
         self.head_dim = head_dim
         self.num_key_value_heads = num_key_value_heads
         self.hidden_act = hidden_act
-        self.hidden_activation = hidden_activation
         self.initializer_range = initializer_range
         self.rms_norm_eps = rms_norm_eps
         self.use_cache = use_cache

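For reference, this is how a Gemma config is built after the change above; a minimal sketch assuming an installation that includes this commit, where `hidden_act` is now the only activation knob and the removed `hidden_activation` argument no longer needs to be passed:

from transformers import GemmaConfig

# Minimal sketch: after this commit the activation is configured through
# `hidden_act` alone; the legacy `hidden_activation` argument is gone.
config = GemmaConfig(
    hidden_act="gelu_pytorch_tanh",  # default, shown explicitly for clarity
    max_position_embeddings=8192,
    initializer_range=0.02,
)
print(config.hidden_act)  # "gelu_pytorch_tanh"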
src/transformers/models/lxmert/configuration_lxmert.py

Lines changed: 0 additions & 4 deletions
@@ -66,8 +66,6 @@ class LxmertConfig(PretrainedConfig):
             The vocabulary size of the *token_type_ids* passed into [`BertModel`].
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
-            The epsilon used by the layer normalization layers.
         l_layers (`int`, *optional*, defaults to 9):
             Number of hidden layers in the Transformer language encoder.
         x_layers (`int`, *optional*, defaults to 5):
@@ -119,7 +117,6 @@ def __init__(
         max_position_embeddings=512,
         type_vocab_size=2,
         initializer_range=0.02,
-        layer_norm_eps=1e-12,
         l_layers=9,
         x_layers=5,
         r_layers=5,
@@ -145,7 +142,6 @@ def __init__(
         self.max_position_embeddings = max_position_embeddings
         self.type_vocab_size = type_vocab_size
         self.initializer_range = initializer_range
-        self.layer_norm_eps = layer_norm_eps
         self.num_qa_labels = num_qa_labels
         self.num_object_labels = num_object_labels
         self.num_attr_labels = num_attr_labels

src/transformers/models/lxmert/modeling_lxmert.py

Lines changed: 1 addition & 2 deletions
@@ -187,8 +187,7 @@ def __init__(self, config):
         self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size, padding_idx=0)
         self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size, padding_idx=0)

-        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
-        # any TensorFlow checkpoint file
+        # self.LayerNorm is not snake-cased due to old tensorflow checkpoint name matching
         self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=1e-12)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)

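The removed `layer_norm_eps` attribute was unused: as the hunk above shows, the LXMERT embeddings hardcode the epsilon rather than reading it from the config. A minimal sketch of the equivalent layer (the hidden size of 768 is only an illustrative value):

import torch
from torch import nn

# The epsilon is fixed at 1e-12 in modeling_lxmert.py instead of coming from
# LxmertConfig, which is why `layer_norm_eps` could be dropped from the config.
layer_norm = nn.LayerNorm(768, eps=1e-12)  # 768 = illustrative hidden_size
hidden_states = torch.randn(1, 20, 768)
normalized = layer_norm(hidden_states)
print(normalized.shape)  # torch.Size([1, 20, 768])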
src/transformers/models/xlnet/configuration_xlnet.py

Lines changed: 0 additions & 4 deletions
@@ -49,8 +49,6 @@ class XLNetConfig(PretrainedConfig):
         ff_activation (`str` or `Callable`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the If string, `"gelu"`, `"relu"`, `"silu"` and
             `"gelu_new"` are supported.
-        untie_r (`bool`, *optional*, defaults to `True`):
-            Whether or not to untie relative position biases
         attn_type (`str`, *optional*, defaults to `"bi"`):
             The attention type used by the model. Set `"bi"` for XLNet, `"uni"` for Transformer-XL.
         initializer_range (`float`, *optional*, defaults to 0.02):
@@ -150,7 +148,6 @@ def __init__(
         n_head=16,
         d_inner=4096,
         ff_activation="gelu",
-        untie_r=True,
         attn_type="bi",
         initializer_range=0.02,
         layer_norm_eps=1e-12,
@@ -188,7 +185,6 @@ def __init__(
         self.d_head = d_model // n_head
         self.ff_activation = ff_activation
         self.d_inner = d_inner
-        self.untie_r = untie_r
         self.attn_type = attn_type

         self.initializer_range = initializer_range

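With `untie_r` removed from the signature, an XLNet config is built from the remaining arguments; a minimal sketch assuming an install that includes this commit, with values mirroring the defaults shown in the diff above:

from transformers import XLNetConfig

# Sketch only: `untie_r` is no longer an explicit __init__ argument, so the
# config is constructed from the remaining knobs shown in the hunks above.
config = XLNetConfig(
    n_head=16,
    d_inner=4096,
    ff_activation="gelu",
    attn_type="bi",
    initializer_range=0.02,
    layer_norm_eps=1e-12,
)
print(config.ff_activation)  # "gelu"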
tests/models/lxmert/test_modeling_lxmert.py

Lines changed: 0 additions & 3 deletions
@@ -54,7 +54,6 @@ def __init__(
         max_position_embeddings=512,
         type_vocab_size=2,
         initializer_range=0.02,
-        layer_norm_eps=1e-12,
         pad_token_id=0,
         num_qa_labels=30,
         num_object_labels=16,
@@ -94,7 +93,6 @@ def __init__(
         self.max_position_embeddings = max_position_embeddings
         self.type_vocab_size = type_vocab_size
         self.initializer_range = initializer_range
-        self.layer_norm_eps = layer_norm_eps
         self.pad_token_id = pad_token_id
         self.num_qa_labels = num_qa_labels
         self.num_object_labels = num_object_labels
@@ -194,7 +192,6 @@ def get_config(self):
             max_position_embeddings=self.max_position_embeddings,
             type_vocab_size=self.type_vocab_size,
             initializer_range=self.initializer_range,
-            layer_norm_eps=self.layer_norm_eps,
             pad_token_id=self.pad_token_id,
             num_qa_labels=self.num_qa_labels,
             num_object_labels=self.num_object_labels,

tests/models/xlnet/test_modeling_xlnet.py

Lines changed: 0 additions & 3 deletions
@@ -56,7 +56,6 @@ def __init__(
         d_inner=128,
         num_hidden_layers=2,
         type_sequence_label_size=2,
-        untie_r=True,
         bi_data=False,
         same_length=False,
         initializer_range=0.05,
@@ -83,7 +82,6 @@ def __init__(
         self.d_inner = 128
         self.num_hidden_layers = 5
         self.type_sequence_label_size = 2
-        self.untie_r = True
         self.bi_data = False
         self.same_length = False
         self.initializer_range = 0.05
@@ -152,7 +150,6 @@ def get_config(self):
             n_head=self.num_attention_heads,
             d_inner=self.d_inner,
             n_layer=self.num_hidden_layers,
-            untie_r=self.untie_r,
             mem_len=self.mem_len,
             clamp_len=self.clamp_len,
             same_length=self.same_length,

utils/check_config_attributes.py

Lines changed: 1 addition & 0 deletions
@@ -308,6 +308,7 @@
     "MarkupLMConfig": ["position_embedding_type"],
     "SmolLM3Config": ["no_rope_layer_interval"],
     "Gemma3nVisionConfig": ["architecture", "do_pooling", "model_args"],  # this is for use in `timm`
+    "GemmaConfig": ["tie_word_embeddings"],
 }

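The new entry tells the config-attribute checker to skip `tie_word_embeddings` on `GemmaConfig` even though the modeling code does not reference it directly. A simplified sketch of the idea behind that allow-list (the helper name is hypothetical, not the script's actual function):

# Simplified sketch of how an allow-list like the one edited above exempts
# specific attributes from the "every config argument must be used" check.
SPECIAL_CASES_TO_ALLOW = {
    "GemmaConfig": ["tie_word_embeddings"],
}

def is_attribute_allowed(config_class_name: str, attribute: str) -> bool:
    # Hypothetical helper: an attribute listed for its config class is
    # skipped instead of being reported as unused.
    return attribute in SPECIAL_CASES_TO_ALLOW.get(config_class_name, [])

assert is_attribute_allowed("GemmaConfig", "tie_word_embeddings")
assert not is_attribute_allowed("GemmaConfig", "hidden_size")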