diff --git a/src/transformers/models/albert/modeling_tf_albert.py b/src/transformers/models/albert/modeling_tf_albert.py index 4703ab231e91df..ad35b6182a4e21 100644 --- a/src/transformers/models/albert/modeling_tf_albert.py +++ b/src/transformers/models/albert/modeling_tf_albert.py @@ -146,7 +146,7 @@ def __init__(self, config: AlbertConfig, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def build(self, input_shape=None): + def build(self, input_shape: tf.TensorShape): with tf.name_scope("word_embeddings"): self.weight = self.add_weight( name="weight", @@ -168,12 +168,7 @@ def build(self, input_shape=None): initializer=get_initializer(self.initializer_range), ) - if self.built: - return - self.built = True - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.embedding_size]) + super().build(input_shape) # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call def call( @@ -251,7 +246,6 @@ def __init__(self, config: AlbertConfig, **kwargs): # Two different dropout probabilities; see https://github.com/google-research/albert/blob/master/modeling.py#L971-L993 self.attention_dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.output_dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.config = config def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] @@ -313,26 +307,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "query", None) is not None: - with tf.name_scope(self.query.name): - self.query.build(self.config.hidden_size) - if getattr(self, "key", None) is not None: - with tf.name_scope(self.key.name): - self.key.build(self.config.hidden_size) - if getattr(self, "value", None) is not None: - with tf.name_scope(self.value.name): - self.value.build(self.config.hidden_size) - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - class TFAlbertLayer(tf.keras.layers.Layer): def __init__(self, config: AlbertConfig, **kwargs): @@ -355,7 +329,6 @@ def __init__(self, config: AlbertConfig, **kwargs): epsilon=config.layer_norm_eps, name="full_layer_layer_norm" ) self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.config = config def call( self, @@ -383,23 +356,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "attention", None) is not None: - with tf.name_scope(self.attention.name): - self.attention.build(None) - if getattr(self, "ffn", None) is not None: - with tf.name_scope(self.ffn.name): - self.ffn.build(self.config.hidden_size) - if getattr(self, "ffn_output", None) is not None: - with tf.name_scope(self.ffn_output.name): - self.ffn_output.build(self.config.intermediate_size) - if getattr(self, "full_layer_layer_norm", None) is not None: - with tf.name_scope(self.full_layer_layer_norm.name): - self.full_layer_layer_norm.build([None, 
None, self.config.hidden_size]) - class TFAlbertLayerGroup(tf.keras.layers.Layer): def __init__(self, config: AlbertConfig, **kwargs): @@ -443,15 +399,6 @@ def call( return tuple(v for v in [hidden_states, layer_hidden_states, layer_attentions] if v is not None) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "albert_layers", None) is not None: - for layer in self.albert_layers: - with tf.name_scope(layer.name): - layer.build(None) - class TFAlbertTransformer(tf.keras.layers.Layer): def __init__(self, config: AlbertConfig, **kwargs): @@ -469,7 +416,6 @@ def __init__(self, config: AlbertConfig, **kwargs): self.albert_layer_groups = [ TFAlbertLayerGroup(config, name=f"albert_layer_groups_._{i}") for i in range(config.num_hidden_groups) ] - self.config = config def call( self, @@ -511,18 +457,6 @@ def call( last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embedding_hidden_mapping_in", None) is not None: - with tf.name_scope(self.embedding_hidden_mapping_in.name): - self.embedding_hidden_mapping_in.build(self.config.embedding_size) - if getattr(self, "albert_layer_groups", None) is not None: - for layer in self.albert_layer_groups: - with tf.name_scope(layer.name): - layer.build(None) - class TFAlbertPreTrainedModel(TFPreTrainedModel): """ @@ -554,21 +488,13 @@ def __init__(self, config: AlbertConfig, input_embeddings: tf.keras.layers.Layer # an output-only bias for each token. self.decoder = input_embeddings - def build(self, input_shape=None): + def build(self, input_shape: tf.TensorShape): self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") self.decoder_bias = self.add_weight( shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias" ) - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.embedding_size]) + super().build(input_shape) def get_output_embeddings(self) -> tf.keras.layers.Layer: return self.decoder @@ -724,20 +650,6 @@ def call( attentions=encoder_outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embeddings", None) is not None: - with tf.name_scope(self.embeddings.name): - self.embeddings.build(None) - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - if getattr(self, "pooler", None) is not None: - with tf.name_scope(self.pooler.name): - self.pooler.build(None) # TODO Matt might be wrong - @dataclass class TFAlbertForPreTrainingOutput(ModelOutput): @@ -913,14 +825,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "albert", None) is not None: - with tf.name_scope(self.albert.name): - self.albert.build(None) - @add_start_docstrings( """ @@ -1017,20 +921,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "albert", None) is not None: - with tf.name_scope(self.albert.name): - self.albert.build(None) - if getattr(self, 
"predictions", None) is not None: - with tf.name_scope(self.predictions.name): - self.predictions.build(None) - if getattr(self, "sop_classifier", None) is not None: - with tf.name_scope(self.sop_classifier.name): - self.sop_classifier.build(None) - class TFAlbertSOPHead(tf.keras.layers.Layer): def __init__(self, config: AlbertConfig, **kwargs): @@ -1042,7 +932,6 @@ def __init__(self, config: AlbertConfig, **kwargs): kernel_initializer=get_initializer(config.initializer_range), name="classifier", ) - self.config = config def call(self, pooled_output: tf.Tensor, training: bool) -> tf.Tensor: dropout_pooled_output = self.dropout(inputs=pooled_output, training=training) @@ -1050,14 +939,6 @@ def call(self, pooled_output: tf.Tensor, training: bool) -> tf.Tensor: return logits - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) - @add_start_docstrings("""Albert Model with a `language modeling` head on top.""", ALBERT_START_DOCSTRING) class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss): @@ -1154,17 +1035,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "albert", None) is not None: - with tf.name_scope(self.albert.name): - self.albert.build(None) - if getattr(self, "predictions", None) is not None: - with tf.name_scope(self.predictions.name): - self.predictions.build(None) - @add_start_docstrings( """ @@ -1188,7 +1058,6 @@ def __init__(self, config: AlbertConfig, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1248,17 +1117,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "albert", None) is not None: - with tf.name_scope(self.albert.name): - self.albert.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1287,7 +1145,6 @@ def __init__(self, config: AlbertConfig, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1343,17 +1200,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "albert", None) is not None: - with tf.name_scope(self.albert.name): - self.albert.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1375,7 +1221,6 @@ def __init__(self, config: AlbertConfig, *inputs, **kwargs): self.qa_outputs = tf.keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) - self.config = config @unpack_inputs 
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1450,17 +1295,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "albert", None) is not None: - with tf.name_scope(self.albert.name): - self.albert.build(None) - if getattr(self, "qa_outputs", None) is not None: - with tf.name_scope(self.qa_outputs.name): - self.qa_outputs.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1482,7 +1316,6 @@ def __init__(self, config: AlbertConfig, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @@ -1561,14 +1394,3 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "albert", None) is not None: - with tf.name_scope(self.albert.name): - self.albert.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) diff --git a/src/transformers/models/bart/modeling_tf_bart.py b/src/transformers/models/bart/modeling_tf_bart.py index ed2b31d2e7037c..b13ecea213c49e 100644 --- a/src/transformers/models/bart/modeling_tf_bart.py +++ b/src/transformers/models/bart/modeling_tf_bart.py @@ -296,23 +296,6 @@ def call( return attn_output, attn_weights, past_key_value - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "k_proj", None) is not None: - with tf.name_scope(self.k_proj.name): - self.k_proj.build(self.embed_dim) - if getattr(self, "q_proj", None) is not None: - with tf.name_scope(self.q_proj.name): - self.q_proj.build(self.embed_dim) - if getattr(self, "v_proj", None) is not None: - with tf.name_scope(self.v_proj.name): - self.v_proj.build(self.embed_dim) - if getattr(self, "out_proj", None) is not None: - with tf.name_scope(self.out_proj.name): - self.out_proj.build(self.embed_dim) - class TFBartEncoderLayer(tf.keras.layers.Layer): def __init__(self, config: BartConfig, **kwargs): @@ -328,7 +311,6 @@ def __init__(self, config: BartConfig, **kwargs): self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") - self.config = config def call( self, @@ -370,26 +352,6 @@ def call( return hidden_states, self_attn_weights - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attn", None) is not None: - with tf.name_scope(self.self_attn.name): - self.self_attn.build(None) - if getattr(self, "self_attn_layer_norm", None) is not None: - with tf.name_scope(self.self_attn_layer_norm.name): - self.self_attn_layer_norm.build([None, None, self.embed_dim]) - if getattr(self, "fc1", None) is not None: - with tf.name_scope(self.fc1.name): - self.fc1.build(self.embed_dim) - if getattr(self, "fc2", None) is not None: - with tf.name_scope(self.fc2.name): - self.fc2.build(self.config.encoder_ffn_dim) - if getattr(self, "final_layer_norm", None) is not None: - with tf.name_scope(self.final_layer_norm.name): - 
self.final_layer_norm.build([None, None, self.embed_dim]) - class TFBartDecoderLayer(tf.keras.layers.Layer): def __init__(self, config: BartConfig, **kwargs): @@ -418,7 +380,6 @@ def __init__(self, config: BartConfig, **kwargs): self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") - self.config = config def call( self, @@ -500,32 +461,6 @@ def call( present_key_value, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attn", None) is not None: - with tf.name_scope(self.self_attn.name): - self.self_attn.build(None) - if getattr(self, "self_attn_layer_norm", None) is not None: - with tf.name_scope(self.self_attn_layer_norm.name): - self.self_attn_layer_norm.build([None, None, self.embed_dim]) - if getattr(self, "encoder_attn", None) is not None: - with tf.name_scope(self.encoder_attn.name): - self.encoder_attn.build(None) - if getattr(self, "encoder_attn_layer_norm", None) is not None: - with tf.name_scope(self.encoder_attn_layer_norm.name): - self.encoder_attn_layer_norm.build([None, None, self.embed_dim]) - if getattr(self, "fc1", None) is not None: - with tf.name_scope(self.fc1.name): - self.fc1.build(self.embed_dim) - if getattr(self, "fc2", None) is not None: - with tf.name_scope(self.fc2.name): - self.fc2.build(self.config.decoder_ffn_dim) - if getattr(self, "final_layer_norm", None) is not None: - with tf.name_scope(self.final_layer_norm.name): - self.final_layer_norm.build([None, None, self.embed_dim]) - class TFBartClassificationHead(tf.keras.layers.Layer): """Head for sentence-level classification tasks.""" @@ -536,7 +471,6 @@ def __init__(self, inner_dim: int, num_classes: int, pooler_dropout: float, name self.dropout = tf.keras.layers.Dropout(pooler_dropout) self.out_proj = tf.keras.layers.Dense(num_classes, name="out_proj") self.input_dim = inner_dim - self.inner_dim = inner_dim def call(self, inputs): hidden_states = self.dropout(inputs) @@ -546,17 +480,6 @@ def call(self, inputs): hidden_states = self.out_proj(hidden_states) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.input_dim) - if getattr(self, "out_proj", None) is not None: - with tf.name_scope(self.out_proj.name): - self.out_proj.build(self.inner_dim) - class TFBartPretrainedModel(TFPreTrainedModel): config_class = BartConfig @@ -882,21 +805,6 @@ def call( last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embed_positions", None) is not None: - with tf.name_scope(self.embed_positions.name): - self.embed_positions.build(None) - if getattr(self, "layernorm_embedding", None) is not None: - with tf.name_scope(self.layernorm_embedding.name): - self.layernorm_embedding.build([None, None, self.embed_dim]) - if getattr(self, "layers", None) is not None: - for layer in self.layers: - with tf.name_scope(layer.name): - layer.build(None) - @keras_serializable class TFBartDecoder(tf.keras.layers.Layer): @@ -1120,21 +1028,6 @@ def call( cross_attentions=all_cross_attns, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embed_positions", None) is 
not None: - with tf.name_scope(self.embed_positions.name): - self.embed_positions.build(None) - if getattr(self, "layernorm_embedding", None) is not None: - with tf.name_scope(self.layernorm_embedding.name): - self.layernorm_embedding.build([None, None, self.config.d_model]) - if getattr(self, "layers", None) is not None: - for layer in self.layers: - with tf.name_scope(layer.name): - layer.build(None) - @keras_serializable class TFBartMainLayer(tf.keras.layers.Layer): @@ -1252,20 +1145,6 @@ def call( encoder_attentions=encoder_outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - if getattr(self, "decoder", None) is not None: - with tf.name_scope(self.decoder.name): - self.decoder.build(None) - if getattr(self, "shared", None) is not None: - with tf.name_scope(self.shared.name): - self.shared.build(None) # TODO Matt might be wrong - @add_start_docstrings( "The bare BART Model outputting raw hidden-states without any specific head on top.", @@ -1354,14 +1233,6 @@ def serving_output(self, output): encoder_attentions=enc_attns, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "model", None) is not None: - with tf.name_scope(self.model.name): - self.model.build(None) - class BiasLayer(tf.keras.layers.Layer): """ @@ -1565,17 +1436,6 @@ def prepare_inputs_for_generation( def prepare_decoder_input_ids_from_labels(self, labels: tf.Tensor): return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "model", None) is not None: - with tf.name_scope(self.model.name): - self.model.build(None) - if getattr(self, "bias_layer", None) is not None: - with tf.name_scope(self.bias_layer.name): - self.bias_layer.build(None) # TODO Matt might be wrong - @add_start_docstrings( """ @@ -1703,14 +1563,3 @@ def serving_output(self, output): encoder_hidden_states=enc_hs, encoder_attentions=enc_attns, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "model", None) is not None: - with tf.name_scope(self.model.name): - self.model.build(None) - if getattr(self, "classification_head", None) is not None: - with tf.name_scope(self.classification_head.name): - self.classification_head.build(None) diff --git a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py index 60631541d054b0..fd0a07b415f4f2 100644 --- a/src/transformers/models/bert/modeling_tf_bert.py +++ b/src/transformers/models/bert/modeling_tf_bert.py @@ -156,7 +156,7 @@ def __init__(self, config: BertConfig, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def build(self, input_shape=None): + def build(self, input_shape: tf.TensorShape): with tf.name_scope("word_embeddings"): self.weight = self.add_weight( name="weight", @@ -178,12 +178,7 @@ def build(self, input_shape=None): initializer=get_initializer(self.initializer_range), ) - if self.built: - return - self.built = True - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) + super().build(input_shape) def call( 
self, @@ -253,7 +248,6 @@ def __init__(self, config: BertConfig, **kwargs): self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.is_decoder = config.is_decoder - self.config = config def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] @@ -343,20 +337,6 @@ def call( outputs = outputs + (past_key_value,) return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "query", None) is not None: - with tf.name_scope(self.query.name): - self.query.build(self.config.hidden_size) - if getattr(self, "key", None) is not None: - with tf.name_scope(self.key.name): - self.key.build(self.config.hidden_size) - if getattr(self, "value", None) is not None: - with tf.name_scope(self.value.name): - self.value.build(self.config.hidden_size) - class TFBertSelfOutput(tf.keras.layers.Layer): def __init__(self, config: BertConfig, **kwargs): @@ -367,7 +347,6 @@ def __init__(self, config: BertConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -376,17 +355,6 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - class TFBertAttention(tf.keras.layers.Layer): def __init__(self, config: BertConfig, **kwargs): @@ -427,17 +395,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attention", None) is not None: - with tf.name_scope(self.self_attention.name): - self.self_attention.build(None) - if getattr(self, "dense_output", None) is not None: - with tf.name_scope(self.dense_output.name): - self.dense_output.build(None) - class TFBertIntermediate(tf.keras.layers.Layer): def __init__(self, config: BertConfig, **kwargs): @@ -451,7 +408,6 @@ def __init__(self, config: BertConfig, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -459,14 +415,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - class TFBertOutput(tf.keras.layers.Layer): def __init__(self, config: BertConfig, **kwargs): @@ -477,7 +425,6 @@ def __init__(self, config: BertConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - 
self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -486,17 +433,6 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.intermediate_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - class TFBertLayer(tf.keras.layers.Layer): def __init__(self, config: BertConfig, **kwargs): @@ -583,20 +519,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "attention", None) is not None: - with tf.name_scope(self.attention.name): - self.attention.build(None) - if getattr(self, "intermediate", None) is not None: - with tf.name_scope(self.intermediate.name): - self.intermediate.build(None) - if getattr(self, "bert_output", None) is not None: - with tf.name_scope(self.bert_output.name): - self.bert_output.build(None) - class TFBertEncoder(tf.keras.layers.Layer): def __init__(self, config: BertConfig, **kwargs): @@ -666,15 +588,6 @@ def call( cross_attentions=all_cross_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "layer", None) is not None: - for layer in self.layer: - with tf.name_scope(layer.name): - layer.build(None) - class TFBertPooler(tf.keras.layers.Layer): def __init__(self, config: BertConfig, **kwargs): @@ -686,7 +599,6 @@ def __init__(self, config: BertConfig, **kwargs): activation="tanh", name="dense", ) - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: # We "pool" the model by simply taking the hidden state corresponding @@ -696,14 +608,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return pooled_output - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - class TFBertPredictionHeadTransform(tf.keras.layers.Layer): def __init__(self, config: BertConfig, **kwargs): @@ -721,7 +625,6 @@ def __init__(self, config: BertConfig, **kwargs): self.transform_act_fn = config.hidden_act self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -730,17 +633,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - class TFBertLMPredictionHead(tf.keras.layers.Layer): def __init__(self, config: BertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): @@ -755,15 +647,10 @@ def __init__(self, config: BertConfig, input_embeddings: tf.keras.layers.Layer, # an output-only bias for each token. 
self.input_embeddings = input_embeddings - def build(self, input_shape=None): + def build(self, input_shape: tf.TensorShape): self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") - if self.built: - return - self.built = True - if getattr(self, "transform", None) is not None: - with tf.name_scope(self.transform.name): - self.transform.build(None) + super().build(input_shape) def get_output_embeddings(self) -> tf.keras.layers.Layer: return self.input_embeddings @@ -1004,20 +891,6 @@ def call( cross_attentions=encoder_outputs.cross_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embeddings", None) is not None: - with tf.name_scope(self.embeddings.name): - self.embeddings.build(None) - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - if getattr(self, "pooler", None) is not None: - with tf.name_scope(self.pooler.name): - self.pooler.build(None) - class TFBertPreTrainedModel(TFPreTrainedModel): """ @@ -1230,14 +1103,6 @@ def call( ) return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "bert", None) is not None: - with tf.name_scope(self.bert.name): - self.bert.build(None) - @add_start_docstrings( """ @@ -1350,20 +1215,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "bert", None) is not None: - with tf.name_scope(self.bert.name): - self.bert.build(None) - if getattr(self, "nsp", None) is not None: - with tf.name_scope(self.nsp.name): - self.nsp.build(None) # TODO Matt might be wrong - if getattr(self, "mlm", None) is not None: - with tf.name_scope(self.mlm.name): - self.mlm.build(None) # TODO Matt might be wrong - @add_start_docstrings("""Bert Model with a `language modeling` head on top.""", BERT_START_DOCSTRING) class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss): @@ -1450,17 +1301,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "bert", None) is not None: - with tf.name_scope(self.bert.name): - self.bert.build(None) - if getattr(self, "mlm", None) is not None: - with tf.name_scope(self.mlm.name): - self.mlm.build(None) # TODO Matt might be wrong - class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss): # names with a '.' 
represents the authorized unexpected/missing layers when a TF model is loaded from a PT model @@ -1586,17 +1426,6 @@ def call( cross_attentions=outputs.cross_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "bert", None) is not None: - with tf.name_scope(self.bert.name): - self.bert.build(None) - if getattr(self, "mlm", None) is not None: - with tf.name_scope(self.mlm.name): - self.mlm.build(None) - @add_start_docstrings( """Bert Model with a `next sentence prediction (classification)` head on top.""", @@ -1679,17 +1508,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "bert", None) is not None: - with tf.name_scope(self.bert.name): - self.bert.build(None) - if getattr(self, "nsp", None) is not None: - with tf.name_scope(self.nsp.name): - self.nsp.build(None) # TODO Matt might be wrong - @add_start_docstrings( """ @@ -1718,7 +1536,6 @@ def __init__(self, config: BertConfig, *inputs, **kwargs): kernel_initializer=get_initializer(config.initializer_range), name="classifier", ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1777,17 +1594,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "bert", None) is not None: - with tf.name_scope(self.bert.name): - self.bert.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1809,7 +1615,6 @@ def __init__(self, config: BertConfig, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @@ -1888,17 +1693,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "bert", None) is not None: - with tf.name_scope(self.bert.name): - self.bert.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1933,7 +1727,6 @@ def __init__(self, config: BertConfig, *inputs, **kwargs): kernel_initializer=get_initializer(config.initializer_range), name="classifier", ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1990,17 +1783,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "bert", None) is not None: - with tf.name_scope(self.bert.name): - self.bert.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -2030,7 +1812,6 @@ def __init__(self, config: BertConfig, *inputs, **kwargs): kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs", ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, 
sequence_length")) @@ -2103,14 +1884,3 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "bert", None) is not None: - with tf.name_scope(self.bert.name): - self.bert.build(None) - if getattr(self, "qa_outputs", None) is not None: - with tf.name_scope(self.qa_outputs.name): - self.qa_outputs.build(self.config.hidden_size) diff --git a/src/transformers/models/blenderbot/modeling_tf_blenderbot.py b/src/transformers/models/blenderbot/modeling_tf_blenderbot.py index 9e9c8c5664cad6..fdd85a7f87832c 100644 --- a/src/transformers/models/blenderbot/modeling_tf_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_tf_blenderbot.py @@ -291,23 +291,6 @@ def call( return attn_output, attn_weights, past_key_value - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "k_proj", None) is not None: - with tf.name_scope(self.k_proj.name): - self.k_proj.build(self.embed_dim) - if getattr(self, "q_proj", None) is not None: - with tf.name_scope(self.q_proj.name): - self.q_proj.build(self.embed_dim) - if getattr(self, "v_proj", None) is not None: - with tf.name_scope(self.v_proj.name): - self.v_proj.build(self.embed_dim) - if getattr(self, "out_proj", None) is not None: - with tf.name_scope(self.out_proj.name): - self.out_proj.build(self.embed_dim) - # Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartEncoderLayer with MBart->Blenderbot class TFBlenderbotEncoderLayer(tf.keras.layers.Layer): @@ -324,7 +307,6 @@ def __init__(self, config: BlenderbotConfig, **kwargs): self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") - self.config = config def call( self, @@ -366,26 +348,6 @@ def call( return hidden_states, self_attn_weights - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attn", None) is not None: - with tf.name_scope(self.self_attn.name): - self.self_attn.build(None) - if getattr(self, "self_attn_layer_norm", None) is not None: - with tf.name_scope(self.self_attn_layer_norm.name): - self.self_attn_layer_norm.build([None, None, self.embed_dim]) - if getattr(self, "fc1", None) is not None: - with tf.name_scope(self.fc1.name): - self.fc1.build(self.embed_dim) - if getattr(self, "fc2", None) is not None: - with tf.name_scope(self.fc2.name): - self.fc2.build(self.config.encoder_ffn_dim) - if getattr(self, "final_layer_norm", None) is not None: - with tf.name_scope(self.final_layer_norm.name): - self.final_layer_norm.build([None, None, self.embed_dim]) - # Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartDecoderLayer with MBart->Blenderbot class TFBlenderbotDecoderLayer(tf.keras.layers.Layer): @@ -415,7 +377,6 @@ def __init__(self, config: BlenderbotConfig, **kwargs): self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") - self.config = config def call( self, @@ -497,32 +458,6 @@ def call( present_key_value, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attn", None) is not None: - with tf.name_scope(self.self_attn.name): - 
self.self_attn.build(None) - if getattr(self, "self_attn_layer_norm", None) is not None: - with tf.name_scope(self.self_attn_layer_norm.name): - self.self_attn_layer_norm.build([None, None, self.embed_dim]) - if getattr(self, "encoder_attn", None) is not None: - with tf.name_scope(self.encoder_attn.name): - self.encoder_attn.build(None) - if getattr(self, "encoder_attn_layer_norm", None) is not None: - with tf.name_scope(self.encoder_attn_layer_norm.name): - self.encoder_attn_layer_norm.build([None, None, self.embed_dim]) - if getattr(self, "fc1", None) is not None: - with tf.name_scope(self.fc1.name): - self.fc1.build(self.embed_dim) - if getattr(self, "fc2", None) is not None: - with tf.name_scope(self.fc2.name): - self.fc2.build(self.config.decoder_ffn_dim) - if getattr(self, "final_layer_norm", None) is not None: - with tf.name_scope(self.final_layer_norm.name): - self.final_layer_norm.build([None, None, self.embed_dim]) - class TFBlenderbotPreTrainedModel(TFPreTrainedModel): config_class = BlenderbotConfig @@ -841,21 +776,6 @@ def call( last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embed_positions", None) is not None: - with tf.name_scope(self.embed_positions.name): - self.embed_positions.build(None) - if getattr(self, "layer_norm", None) is not None: - with tf.name_scope(self.layer_norm.name): - self.layer_norm.build([None, None, self.config.d_model]) - if getattr(self, "layers", None) is not None: - for layer in self.layers: - with tf.name_scope(layer.name): - layer.build(None) - @keras_serializable class TFBlenderbotDecoder(tf.keras.layers.Layer): @@ -1086,21 +1006,6 @@ def call( cross_attentions=all_cross_attns, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embed_positions", None) is not None: - with tf.name_scope(self.embed_positions.name): - self.embed_positions.build(None) - if getattr(self, "layer_norm", None) is not None: - with tf.name_scope(self.layer_norm.name): - self.layer_norm.build([None, None, self.config.d_model]) - if getattr(self, "layers", None) is not None: - for layer in self.layers: - with tf.name_scope(layer.name): - layer.build(None) - @keras_serializable class TFBlenderbotMainLayer(tf.keras.layers.Layer): @@ -1209,20 +1114,6 @@ def call( encoder_attentions=encoder_outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - if getattr(self, "decoder", None) is not None: - with tf.name_scope(self.decoder.name): - self.decoder.build(None) - if getattr(self, "shared", None) is not None: - with tf.name_scope(self.shared.name): - self.shared.build(None) # TODO Matt might be wrong - @add_start_docstrings( "The bare BLENDERBOT Model outputting raw hidden-states without any specific head on top.", @@ -1326,14 +1217,6 @@ def serving_output(self, output): encoder_attentions=enc_attns, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "model", None) is not None: - with tf.name_scope(self.model.name): - self.model.build(None) - # Copied from transformers.models.bart.modeling_tf_bart.BiasLayer class BiasLayer(tf.keras.layers.Layer): @@ -1553,14 +1436,3 @@ def prepare_inputs_for_generation( "cross_attn_head_mask": cross_attn_head_mask, "use_cache": 
use_cache, # change this to avoid caching (presumably for debugging) } - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "model", None) is not None: - with tf.name_scope(self.model.name): - self.model.build(None) - if getattr(self, "bias_layer", None) is not None: - with tf.name_scope(self.bias_layer.name): - self.bias_layer.build(None) # TODO Matt might be wrong diff --git a/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py index b16ff7abf0f21d..1627079b2e96b1 100644 --- a/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py @@ -291,23 +291,6 @@ def call( return attn_output, attn_weights, past_key_value - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "k_proj", None) is not None: - with tf.name_scope(self.k_proj.name): - self.k_proj.build(self.embed_dim) - if getattr(self, "q_proj", None) is not None: - with tf.name_scope(self.q_proj.name): - self.q_proj.build(self.embed_dim) - if getattr(self, "v_proj", None) is not None: - with tf.name_scope(self.v_proj.name): - self.v_proj.build(self.embed_dim) - if getattr(self, "out_proj", None) is not None: - with tf.name_scope(self.out_proj.name): - self.out_proj.build(self.embed_dim) - # Copied from transformers.models.bart.modeling_tf_bart.TFBartEncoderLayer with Bart->BlenderbotSmall class TFBlenderbotSmallEncoderLayer(tf.keras.layers.Layer): @@ -324,7 +307,6 @@ def __init__(self, config: BlenderbotSmallConfig, **kwargs): self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") - self.config = config def call( self, @@ -366,26 +348,6 @@ def call( return hidden_states, self_attn_weights - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attn", None) is not None: - with tf.name_scope(self.self_attn.name): - self.self_attn.build(None) - if getattr(self, "self_attn_layer_norm", None) is not None: - with tf.name_scope(self.self_attn_layer_norm.name): - self.self_attn_layer_norm.build([None, None, self.embed_dim]) - if getattr(self, "fc1", None) is not None: - with tf.name_scope(self.fc1.name): - self.fc1.build(self.embed_dim) - if getattr(self, "fc2", None) is not None: - with tf.name_scope(self.fc2.name): - self.fc2.build(self.config.encoder_ffn_dim) - if getattr(self, "final_layer_norm", None) is not None: - with tf.name_scope(self.final_layer_norm.name): - self.final_layer_norm.build([None, None, self.embed_dim]) - # Copied from transformers.models.bart.modeling_tf_bart.TFBartDecoderLayer with Bart->BlenderbotSmall class TFBlenderbotSmallDecoderLayer(tf.keras.layers.Layer): @@ -415,7 +377,6 @@ def __init__(self, config: BlenderbotSmallConfig, **kwargs): self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") - self.config = config def call( self, @@ -497,32 +458,6 @@ def call( present_key_value, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attn", None) is not None: - with 
tf.name_scope(self.self_attn.name): - self.self_attn.build(None) - if getattr(self, "self_attn_layer_norm", None) is not None: - with tf.name_scope(self.self_attn_layer_norm.name): - self.self_attn_layer_norm.build([None, None, self.embed_dim]) - if getattr(self, "encoder_attn", None) is not None: - with tf.name_scope(self.encoder_attn.name): - self.encoder_attn.build(None) - if getattr(self, "encoder_attn_layer_norm", None) is not None: - with tf.name_scope(self.encoder_attn_layer_norm.name): - self.encoder_attn_layer_norm.build([None, None, self.embed_dim]) - if getattr(self, "fc1", None) is not None: - with tf.name_scope(self.fc1.name): - self.fc1.build(self.embed_dim) - if getattr(self, "fc2", None) is not None: - with tf.name_scope(self.fc2.name): - self.fc2.build(self.config.decoder_ffn_dim) - if getattr(self, "final_layer_norm", None) is not None: - with tf.name_scope(self.final_layer_norm.name): - self.final_layer_norm.build([None, None, self.embed_dim]) - class TFBlenderbotSmallPreTrainedModel(TFPreTrainedModel): config_class = BlenderbotSmallConfig @@ -847,21 +782,6 @@ def call( last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embed_positions", None) is not None: - with tf.name_scope(self.embed_positions.name): - self.embed_positions.build(None) - if getattr(self, "layernorm_embedding", None) is not None: - with tf.name_scope(self.layernorm_embedding.name): - self.layernorm_embedding.build([None, None, self.embed_dim]) - if getattr(self, "layers", None) is not None: - for layer in self.layers: - with tf.name_scope(layer.name): - layer.build(None) - @keras_serializable class TFBlenderbotSmallDecoder(tf.keras.layers.Layer): @@ -1095,21 +1015,6 @@ def call( cross_attentions=all_cross_attns, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embed_positions", None) is not None: - with tf.name_scope(self.embed_positions.name): - self.embed_positions.build(None) - if getattr(self, "layernorm_embedding", None) is not None: - with tf.name_scope(self.layernorm_embedding.name): - self.layernorm_embedding.build([None, None, self.config.d_model]) - if getattr(self, "layers", None) is not None: - for layer in self.layers: - with tf.name_scope(layer.name): - layer.build(None) - @keras_serializable class TFBlenderbotSmallMainLayer(tf.keras.layers.Layer): @@ -1218,20 +1123,6 @@ def call( encoder_attentions=encoder_outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - if getattr(self, "decoder", None) is not None: - with tf.name_scope(self.decoder.name): - self.decoder.build(None) - if getattr(self, "shared", None) is not None: - with tf.name_scope(self.shared.name): - self.shared.build(None) # TODO Matt might be wrong - @add_start_docstrings( "The bare BLENDERBOT_SMALL Model outputting raw hidden-states without any specific head on top.", @@ -1319,14 +1210,6 @@ def serving_output(self, output): encoder_attentions=enc_attns, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "model", None) is not None: - with tf.name_scope(self.model.name): - self.model.build(None) - # Copied from transformers.models.bart.modeling_tf_bart.BiasLayer class BiasLayer(tf.keras.layers.Layer): @@ -1531,14 
+1414,3 @@ def prepare_inputs_for_generation( "cross_attn_head_mask": cross_attn_head_mask, "use_cache": use_cache, # change this to avoid caching (presumably for debugging) } - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "model", None) is not None: - with tf.name_scope(self.model.name): - self.model.build(None) - if getattr(self, "bias_layer", None) is not None: - with tf.name_scope(self.bias_layer.name): - self.bias_layer.build(None) # TODO Matt might be wrong diff --git a/src/transformers/models/blip/modeling_tf_blip.py b/src/transformers/models/blip/modeling_tf_blip.py index f6800edbee3cb1..54d15b3088c682 100644 --- a/src/transformers/models/blip/modeling_tf_blip.py +++ b/src/transformers/models/blip/modeling_tf_blip.py @@ -254,7 +254,7 @@ def __init__(self, config: BlipVisionConfig, **kwargs): self.num_patches = (self.image_size // self.patch_size) ** 2 self.num_positions = self.num_patches + 1 - def build(self, input_shape=None): + def build(self, input_shape): self.class_embedding = self.add_weight( shape=(1, 1, self.embed_dim), initializer=get_initializer(self.config.initializer_range), @@ -268,13 +268,7 @@ def build(self, input_shape=None): trainable=True, name="position_embedding", ) - - if self.built: - return - self.built = True - if getattr(self, "patch_embedding", None) is not None: - with tf.name_scope(self.patch_embedding.name): - self.patch_embedding.build(3) + super().build(input_shape) def call(self, pixel_values: tf.Tensor) -> tf.Tensor: # Input is channels-first, we transpose. PyTorch transposes after the conv because PyTorch @@ -418,20 +412,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dropout", None) is not None: - with tf.name_scope(self.dropout.name): - self.dropout.build(None) - if getattr(self, "qkv", None) is not None: - with tf.name_scope(self.qkv.name): - self.qkv.build(self.embed_dim) - if getattr(self, "projection", None) is not None: - with tf.name_scope(self.projection.name): - self.projection.build(self.embed_dim) - class TFBlipMLP(tf.keras.layers.Layer): def __init__(self, config: BlipConfig, **kwargs): @@ -448,7 +428,6 @@ def __init__(self, config: BlipConfig, **kwargs): self.fc2 = tf.keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(in_proj_std), name="fc2" ) - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.fc1(inputs=hidden_states) @@ -456,17 +435,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.fc2(inputs=hidden_states) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "fc1", None) is not None: - with tf.name_scope(self.fc1.name): - self.fc1.build(self.config.hidden_size) - if getattr(self, "fc2", None) is not None: - with tf.name_scope(self.fc2.name): - self.fc2.build(self.config.intermediate_size) - class TFBlipEncoderLayer(tf.keras.layers.Layer): def __init__(self, config: BlipConfig, **kwargs): @@ -517,23 +485,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attn", None) is not None: - with tf.name_scope(self.self_attn.name): - self.self_attn.build(None) - if getattr(self, "layer_norm1", None) is not None: - with tf.name_scope(self.layer_norm1.name): - self.layer_norm1.build([None, None, self.embed_dim]) - if getattr(self, "mlp", 
                None) is not None:
-            with tf.name_scope(self.mlp.name):
-                self.mlp.build(None)
-        if getattr(self, "layer_norm2", None) is not None:
-            with tf.name_scope(self.layer_norm2.name):
-                self.layer_norm2.build([None, None, self.embed_dim])
-
 class TFBlipPreTrainedModel(TFPreTrainedModel):
     """
@@ -694,15 +645,6 @@ def call(
             last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
         )
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "layers", None) is not None:
-            for layer in self.layers:
-                with tf.name_scope(layer.name):
-                    layer.build(None)
-
 class TFBlipVisionModel(TFBlipPreTrainedModel):
     main_input_name = "pixel_values"
@@ -782,20 +724,6 @@ def call(
     def get_input_embeddings(self):
         return self.embeddings
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "embeddings", None) is not None:
-            with tf.name_scope(self.embeddings.name):
-                self.embeddings.build(None)
-        if getattr(self, "encoder", None) is not None:
-            with tf.name_scope(self.encoder.name):
-                self.encoder.build(None)
-        if getattr(self, "post_layernorm", None) is not None:
-            with tf.name_scope(self.post_layernorm.name):
-                self.post_layernorm.build(None)
-
 class TFBlipMainLayer(tf.keras.layers.Layer):
     config_class = BlipConfig
@@ -847,22 +775,7 @@ def build(self, input_shape=None):
             initializer=tf.keras.initializers.Constant(self.config.logit_scale_init_value),
             trainable=True,
         )
-
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "text_model", None) is not None:
-            with tf.name_scope(self.text_model.name):
-                self.text_model.build(None)
-        if getattr(self, "vision_model", None) is not None:
-            with tf.name_scope(self.vision_model.name):
-                self.vision_model.build(None)
-        if getattr(self, "visual_projection", None) is not None:
-            with tf.name_scope(self.visual_projection.name):
-                self.visual_projection.build(self.vision_embed_dim)
-        if getattr(self, "text_projection", None) is not None:
-            with tf.name_scope(self.text_projection.name):
-                self.text_projection.build(self.text_embed_dim)
+        super().build(input_shape)
     @unpack_inputs
     def call(
@@ -1082,14 +995,6 @@ def get_image_features(
         return image_features
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "blip", None) is not None:
-            with tf.name_scope(self.blip.name):
-                self.blip.build(None)
-
 @add_start_docstrings(
     """
@@ -1263,17 +1168,6 @@ def generate(
         return outputs
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "vision_model", None) is not None:
-            with tf.name_scope(self.vision_model.name):
-                self.vision_model.build(None)
-        if getattr(self, "text_decoder", None) is not None:
-            with tf.name_scope(self.text_decoder.name):
-                self.text_decoder.build(None)
-
 @add_start_docstrings(
     """
@@ -1515,20 +1409,6 @@ def generate(
         return outputs
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "vision_model", None) is not None:
-            with tf.name_scope(self.vision_model.name):
-                self.vision_model.build(None)
-        if getattr(self, "text_encoder", None) is not None:
-            with tf.name_scope(self.text_encoder.name):
-                self.text_encoder.build(None)
-        if getattr(self, "text_decoder", None) is not None:
-            with tf.name_scope(self.text_decoder.name):
-                self.text_decoder.build(None)
-
 @add_start_docstrings(
     """
@@ -1577,7 +1457,6 @@ def __init__(self, config: BlipConfig, *args, **kwargs):
            if not hasattr(config, "decoder_start_token_id")
            else config.decoder_start_token_id
        )
-        self.config = config
     def get_input_embeddings(self) -> tf.keras.layers.Layer:
         return self.vision_model.embeddings.patch_embedding
@@ -1679,23 +1558,3 @@ def call(
             attentions=vision_outputs.attentions,
             question_embeds=question_embeds,
         )
-
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "vision_model", None) is not None:
-            with tf.name_scope(self.vision_model.name):
-                self.vision_model.build(None)
-        if getattr(self, "text_encoder", None) is not None:
-            with tf.name_scope(self.text_encoder.name):
-                self.text_encoder.build(None)
-        if getattr(self, "vision_proj", None) is not None:
-            with tf.name_scope(self.vision_proj.name):
-                self.vision_proj.build(self.config.vision_self.config.hidden_size)
-        if getattr(self, "text_proj", None) is not None:
-            with tf.name_scope(self.text_proj.name):
-                self.text_proj.build(self.config.text_self.config.hidden_size)
-        if getattr(self, "itm_head", None) is not None:
-            with tf.name_scope(self.itm_head.name):
-                self.itm_head.build(self.config.text_self.config.hidden_size)
diff --git a/src/transformers/models/blip/modeling_tf_blip_text.py b/src/transformers/models/blip/modeling_tf_blip_text.py
index 46f8026eba4dd8..b7307c062f7911 100644
--- a/src/transformers/models/blip/modeling_tf_blip_text.py
+++ b/src/transformers/models/blip/modeling_tf_blip_text.py
@@ -127,23 +127,6 @@ def call(self, input_ids=None, position_ids=None, inputs_embeds=None, past_key_v
         embeddings = self.dropout(embeddings, training=training)
         return embeddings
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "word_embeddings", None) is not None:
-            with tf.name_scope(self.word_embeddings.name):
-                self.word_embeddings.build(None)
-        if getattr(self, "position_embeddings", None) is not None:
-            with tf.name_scope(self.position_embeddings.name):
-                self.position_embeddings.build(None)
-        if getattr(self, "LayerNorm", None) is not None:
-            with tf.name_scope(self.LayerNorm.name):
-                self.LayerNorm.build([None, None, self.config.hidden_size])
-        if getattr(self, "dropout", None) is not None:
-            with tf.name_scope(self.dropout.name):
-                self.dropout.build(None)
-
 # Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L97
 class TFBlipTextSelfAttention(tf.keras.layers.Layer):
@@ -267,20 +250,6 @@ def call(
             outputs = outputs + (past_key_value,)
         return outputs
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "query", None) is not None:
-            with tf.name_scope(self.query.name):
-                self.query.build(self.config.hidden_size)
-        if getattr(self, "key", None) is not None:
-            with tf.name_scope(self.key.name):
-                self.key.build(None)  # TODO Matt might be wrong
-        if getattr(self, "value", None) is not None:
-            with tf.name_scope(self.value.name):
-                self.value.build(None)  # TODO Matt might be wrong
-
 class TFBlipTextSelfOutput(tf.keras.layers.Layer):
     def __init__(self, config: BlipTextConfig, **kwargs):
@@ -291,7 +260,6 @@ def __init__(self, config: BlipTextConfig, **kwargs):
         )
         self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
         self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
-        self.config = config
     def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: Optional[bool] = None) -> tf.Tensor:
         hidden_states = self.dense(inputs=hidden_states)
@@ -300,17 +268,6 @@ def call(self, hidden_states: tf.Tensor,
            input_tensor: tf.Tensor, training: Opti
         return hidden_states
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "dense", None) is not None:
-            with tf.name_scope(self.dense.name):
-                self.dense.build(self.config.hidden_size)
-        if getattr(self, "LayerNorm", None) is not None:
-            with tf.name_scope(self.LayerNorm.name):
-                self.LayerNorm.build([None, None, self.config.hidden_size])
-
 # Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#242
 class TFBlipTextAttention(tf.keras.layers.Layer):
@@ -345,17 +302,6 @@ def call(
         outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
         return outputs
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "self", None) is not None:
-            with tf.name_scope(self.self.name):
-                self.self.build(None)
-        if getattr(self, "self_output", None) is not None:
-            with tf.name_scope(self.self_output.name):
-                self.self_output.build(None)
-
 # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->BlipText
 class TFBlipTextIntermediate(tf.keras.layers.Layer):
@@ -370,7 +316,6 @@ def __init__(self, config: BlipTextConfig, **kwargs):
             self.intermediate_act_fn = get_tf_activation(config.hidden_act)
         else:
             self.intermediate_act_fn = config.hidden_act
-        self.config = config
     def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
         hidden_states = self.dense(inputs=hidden_states)
@@ -378,14 +323,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
         return hidden_states
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "dense", None) is not None:
-            with tf.name_scope(self.dense.name):
-                self.dense.build(self.config.hidden_size)
-
 class TFBlipTextOutput(tf.keras.layers.Layer):
     def __init__(self, config: BlipTextConfig, **kwargs):
@@ -396,7 +333,6 @@ def __init__(self, config: BlipTextConfig, **kwargs):
         )
         self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
         self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
-        self.config = config
     def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
         hidden_states = self.dense(inputs=hidden_states)
@@ -405,17 +341,6 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool
         return hidden_states
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "dense", None) is not None:
-            with tf.name_scope(self.dense.name):
-                self.dense.build(self.config.intermediate_size)
-        if getattr(self, "LayerNorm", None) is not None:
-            with tf.name_scope(self.LayerNorm.name):
-                self.LayerNorm.build([None, None, self.config.hidden_size])
-
 class TFBlipTextLayer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
@@ -475,20 +400,6 @@ def call(
         return outputs
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "attention", None) is not None:
-            with tf.name_scope(self.attention.name):
-                self.attention.build(None)
-        if getattr(self, "intermediate", None) is not None:
-            with tf.name_scope(self.intermediate.name):
-                self.intermediate.build(None)
-        if getattr(self, "self_output", None) is not None:
-            with tf.name_scope(self.self_output.name):
-                self.self_output.build(None)
-
 # Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L386
 @keras_serializable
@@ -570,15 +481,6 @@ def call(
             cross_attentions=all_cross_attentions,
         )
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "layer", None) is not None:
-            for layer in self.layer:
-                with tf.name_scope(layer.name):
-                    layer.build(None)
-
 # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->BlipText
 class TFBlipTextPooler(tf.keras.layers.Layer):
@@ -591,7 +493,6 @@ def __init__(self, config: BlipTextConfig, **kwargs):
             activation="tanh",
             name="dense",
         )
-        self.config = config
     def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
         # We "pool" the model by simply taking the hidden state corresponding
@@ -601,14 +502,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
         return pooled_output
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "dense", None) is not None:
-            with tf.name_scope(self.dense.name):
-                self.dense.build(self.config.hidden_size)
-
 # Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->BlipText
 class TFBlipTextPredictionHeadTransform(tf.keras.layers.Layer):
@@ -627,7 +520,6 @@ def __init__(self, config: BlipTextConfig, **kwargs):
             self.transform_act_fn = config.hidden_act
         self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-        self.config = config
     def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
         hidden_states = self.dense(inputs=hidden_states)
@@ -636,17 +528,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
         return hidden_states
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "dense", None) is not None:
-            with tf.name_scope(self.dense.name):
-                self.dense.build(self.config.hidden_size)
-        if getattr(self, "LayerNorm", None) is not None:
-            with tf.name_scope(self.LayerNorm.name):
-                self.LayerNorm.build([None, None, self.config.hidden_size])
-
 class TFBlipTextLMPredictionHead(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
@@ -665,16 +546,7 @@ def __init__(self, config, **kwargs):
     def build(self, input_shape=None):
         self.bias = self.add_weight(name="bias", shape=(self.config.vocab_size,), initializer="zeros", trainable=True)
-
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "transform", None) is not None:
-            with tf.name_scope(self.transform.name):
-                self.transform.build(None)
-        if getattr(self, "decoder", None) is not None:
-            with tf.name_scope(self.decoder.name):
-                self.decoder.build(self.config.hidden_size)
+        super().build(input_shape)
     def call(self, hidden_states):
         hidden_states = self.transform(hidden_states)
@@ -691,14 +563,6 @@ def call(self, sequence_output: tf.Tensor) -> tf.Tensor:
         prediction_scores = self.predictions(sequence_output)
         return prediction_scores
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "predictions", None) is not None:
-            with tf.name_scope(self.predictions.name):
-                self.predictions.build(None)
-
 # Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L548
 class TFBlipTextPreTrainedModel(TFPreTrainedModel):
@@ -938,20 +802,6 @@ def call(
             cross_attentions=encoder_outputs.cross_attentions,
         )
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "embeddings", None) is not None:
-            with tf.name_scope(self.embeddings.name):
-                self.embeddings.build(None)
-        if getattr(self, "encoder", None) is not None:
-            with tf.name_scope(self.encoder.name):
-                self.encoder.build(None)
-        if getattr(self, "pooler", None) is not None:
-            with tf.name_scope(self.pooler.name):
-                self.pooler.build(None)
-
 # Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L811
 class TFBlipTextLMHeadModel(TFBlipTextPreTrainedModel):
@@ -1092,14 +942,3 @@ def _reorder_cache(self, past_key_values, beam_idx):
         for layer_past in past_key_values:
             reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
         return reordered_past
-
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "bert", None) is not None:
-            with tf.name_scope(self.bert.name):
-                self.bert.build(None)
-        if getattr(self, "cls", None) is not None:
-            with tf.name_scope(self.cls.name):
-                self.cls.build(None)
diff --git a/src/transformers/models/camembert/modeling_tf_camembert.py b/src/transformers/models/camembert/modeling_tf_camembert.py
index 84f2220ebae511..8def74a5b3045e 100644
--- a/src/transformers/models/camembert/modeling_tf_camembert.py
+++ b/src/transformers/models/camembert/modeling_tf_camembert.py
@@ -184,7 +184,7 @@ def __init__(self, config, **kwargs):
         self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
         self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
-    def build(self, input_shape=None):
+    def build(self, input_shape: tf.TensorShape):
         with tf.name_scope("word_embeddings"):
             self.weight = self.add_weight(
                 name="weight",
@@ -206,12 +206,7 @@ def build(self, input_shape=None):
                 initializer=get_initializer(self.initializer_range),
             )
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "LayerNorm", None) is not None:
-            with tf.name_scope(self.LayerNorm.name):
-                self.LayerNorm.build([None, None, self.config.hidden_size])
+        super().build(input_shape)
     def create_position_ids_from_input_ids(self, input_ids, past_key_values_length=0):
         """
@@ -284,7 +279,6 @@ def __init__(self, config: CamembertConfig, **kwargs):
             activation="tanh",
             name="dense",
         )
-        self.config = config
     def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
         # We "pool" the model by simply taking the hidden state corresponding
@@ -294,14 +288,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
         return pooled_output
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "dense", None) is not None:
-            with tf.name_scope(self.dense.name):
-                self.dense.build(self.config.hidden_size)
-
 # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Camembert
 class TFCamembertSelfAttention(tf.keras.layers.Layer):
@@ -331,7 +317,6 @@ def __init__(self, config: CamembertConfig, **kwargs):
         self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
         self.is_decoder = config.is_decoder
-        self.config = config
     def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
         # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
@@ -421,20 +406,6 @@ def call(
             outputs = outputs + (past_key_value,)
         return outputs
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "query", None) is not None:
-            with tf.name_scope(self.query.name):
-                self.query.build(self.config.hidden_size)
-        if getattr(self, "key", None) is not None:
-            with tf.name_scope(self.key.name):
-                self.key.build(self.config.hidden_size)
-        if getattr(self, "value", None) is not None:
-            with tf.name_scope(self.value.name):
-                self.value.build(self.config.hidden_size)
-
 # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Camembert
 class TFCamembertSelfOutput(tf.keras.layers.Layer):
@@ -446,7 +417,6 @@ def __init__(self, config: CamembertConfig, **kwargs):
         )
         self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
         self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
-        self.config = config
     def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
         hidden_states = self.dense(inputs=hidden_states)
@@ -455,17 +425,6 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool
         return hidden_states
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "dense", None) is not None:
-            with tf.name_scope(self.dense.name):
-                self.dense.build(self.config.hidden_size)
-        if getattr(self, "LayerNorm", None) is not None:
-            with tf.name_scope(self.LayerNorm.name):
-                self.LayerNorm.build([None, None, self.config.hidden_size])
-
 # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Camembert
 class TFCamembertAttention(tf.keras.layers.Layer):
@@ -507,17 +466,6 @@ def call(
         return outputs
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "self_attention", None) is not None:
-            with tf.name_scope(self.self_attention.name):
-                self.self_attention.build(None)
-        if getattr(self, "dense_output", None) is not None:
-            with tf.name_scope(self.dense_output.name):
-                self.dense_output.build(None)
-
 # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Camembert
 class TFCamembertIntermediate(tf.keras.layers.Layer):
@@ -532,7 +480,6 @@ def __init__(self, config: CamembertConfig, **kwargs):
             self.intermediate_act_fn = get_tf_activation(config.hidden_act)
         else:
             self.intermediate_act_fn = config.hidden_act
-        self.config = config
     def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
         hidden_states = self.dense(inputs=hidden_states)
@@ -540,14 +487,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
         return hidden_states
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "dense", None) is not None:
-            with tf.name_scope(self.dense.name):
-                self.dense.build(self.config.hidden_size)
-
 # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Camembert
 class TFCamembertOutput(tf.keras.layers.Layer):
@@ -559,7 +498,6 @@ def __init__(self, config: CamembertConfig, **kwargs):
         )
         self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
         self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
-        self.config = config
     def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
         hidden_states = self.dense(inputs=hidden_states)
@@ -568,17 +506,6 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool
         return hidden_states
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "dense", None) is not None:
-            with tf.name_scope(self.dense.name):
-                self.dense.build(self.config.intermediate_size)
-        if getattr(self, "LayerNorm", None) is not None:
-            with tf.name_scope(self.LayerNorm.name):
-                self.LayerNorm.build([None, None, self.config.hidden_size])
-
 # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Camembert
 class TFCamembertLayer(tf.keras.layers.Layer):
@@ -666,20 +593,6 @@ def call(
         return outputs
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "attention", None) is not None:
-            with tf.name_scope(self.attention.name):
-                self.attention.build(None)
-        if getattr(self, "intermediate", None) is not None:
-            with tf.name_scope(self.intermediate.name):
-                self.intermediate.build(None)
-        if getattr(self, "bert_output", None) is not None:
-            with tf.name_scope(self.bert_output.name):
-                self.bert_output.build(None)
-
 # Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Camembert
 class TFCamembertEncoder(tf.keras.layers.Layer):
@@ -750,15 +663,6 @@ def call(
             cross_attentions=all_cross_attentions,
         )
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "layer", None) is not None:
-            for layer in self.layer:
-                with tf.name_scope(layer.name):
-                    layer.build(None)
-
 @keras_serializable
 # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaMainLayer with Roberta->Camembert
@@ -957,20 +861,6 @@ def call(
             cross_attentions=encoder_outputs.cross_attentions,
         )
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "encoder", None) is not None:
-            with tf.name_scope(self.encoder.name):
-                self.encoder.build(None)
-        if getattr(self, "pooler", None) is not None:
-            with tf.name_scope(self.pooler.name):
-                self.pooler.build(None)
-        if getattr(self, "embeddings", None) is not None:
-            with tf.name_scope(self.embeddings.name):
-                self.embeddings.build(None)
-
 class TFCamembertPreTrainedModel(TFPreTrainedModel):
     """
@@ -1055,14 +945,6 @@ def call(
         return outputs
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "roberta", None) is not None:
-            with tf.name_scope(self.roberta.name):
-                self.roberta.build(None)
-
 # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaLMHead with Roberta->Camembert
 class TFCamembertLMHead(tf.keras.layers.Layer):
@@ -1083,18 +965,10 @@ def __init__(self, config, input_embeddings, **kwargs):
         # an output-only bias for each token.
         self.decoder = input_embeddings
-    def build(self, input_shape=None):
+    def build(self, input_shape):
         self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "dense", None) is not None:
-            with tf.name_scope(self.dense.name):
-                self.dense.build(self.config.hidden_size)
-        if getattr(self, "layer_norm", None) is not None:
-            with tf.name_scope(self.layer_norm.name):
-                self.layer_norm.build([None, None, self.config.hidden_size])
+        super().build(input_shape)
     def get_output_embeddings(self):
         return self.decoder
@@ -1206,17 +1080,6 @@ def call(
             attentions=outputs.attentions,
         )
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "roberta", None) is not None:
-            with tf.name_scope(self.roberta.name):
-                self.roberta.build(None)
-        if getattr(self, "lm_head", None) is not None:
-            with tf.name_scope(self.lm_head.name):
-                self.lm_head.build(None)
-
 # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaClassificationHead
 class TFCamembertClassificationHead(tf.keras.layers.Layer):
@@ -1237,7 +1100,6 @@ def __init__(self, config, **kwargs):
         self.out_proj = tf.keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
         )
-        self.config = config
     def call(self, features, training=False):
         x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
@@ -1247,17 +1109,6 @@ def call(self, features, training=False):
         x = self.out_proj(x)
         return x
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "dense", None) is not None:
-            with tf.name_scope(self.dense.name):
-                self.dense.build(self.config.hidden_size)
-        if getattr(self, "out_proj", None) is not None:
-            with tf.name_scope(self.out_proj.name):
-                self.out_proj.build(self.config.hidden_size)
-
 @add_start_docstrings(
     """
@@ -1335,17 +1186,6 @@ def call(
             attentions=outputs.attentions,
         )
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "roberta", None) is not None:
-            with tf.name_scope(self.roberta.name):
-                self.roberta.build(None)
-        if getattr(self, "classifier", None) is not None:
-            with tf.name_scope(self.classifier.name):
-                self.classifier.build(None)
-
 @add_start_docstrings(
     """
@@ -1372,7 +1212,6 @@ def __init__(self, config, *inputs, **kwargs):
         self.classifier = tf.keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )
-        self.config = config
     @unpack_inputs
     @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1431,17 +1270,6 @@ def call(
             attentions=outputs.attentions,
         )
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "roberta", None) is not None:
-            with tf.name_scope(self.roberta.name):
-                self.roberta.build(None)
-        if getattr(self, "classifier", None) is not None:
-            with tf.name_scope(self.classifier.name):
-                self.classifier.build(self.config.hidden_size)
-
 @add_start_docstrings(
     """
@@ -1464,7 +1292,6 @@ def __init__(self, config, *inputs, **kwargs):
         self.classifier = tf.keras.layers.Dense(
             1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )
-        self.config = config
     @unpack_inputs
     @add_start_docstrings_to_model_forward(
@@ -1536,17 +1363,6 @@ def call(
             attentions=outputs.attentions,
         )
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "roberta", None) is not None:
-            with tf.name_scope(self.roberta.name):
-                self.roberta.build(None)
-        if getattr(self, "classifier", None) is not None:
-            with tf.name_scope(self.classifier.name):
-                self.classifier.build(self.config.hidden_size)
-
 @add_start_docstrings(
     """
@@ -1568,7 +1384,6 @@ def __init__(self, config, *inputs, **kwargs):
         self.qa_outputs = tf.keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
         )
-        self.config = config
     @unpack_inputs
     @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1641,17 +1456,6 @@ def call(
             attentions=outputs.attentions,
         )
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "roberta", None) is not None:
-            with tf.name_scope(self.roberta.name):
-                self.roberta.build(None)
-        if getattr(self, "qa_outputs", None) is not None:
-            with tf.name_scope(self.qa_outputs.name):
-                self.qa_outputs.build(self.config.hidden_size)
-
 @add_start_docstrings(
     """CamemBERT Model with a `language modeling` head on top for CLM fine-tuning.""", CAMEMBERT_START_DOCSTRING
@@ -1777,14 +1581,3 @@ def call(
             attentions=outputs.attentions,
             cross_attentions=outputs.cross_attentions,
         )
-
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "roberta", None) is not None:
-            with tf.name_scope(self.roberta.name):
-                self.roberta.build(None)
-        if getattr(self, "lm_head", None) is not None:
-            with tf.name_scope(self.lm_head.name):
-                self.lm_head.build(None)
diff --git a/src/transformers/models/clip/modeling_tf_clip.py b/src/transformers/models/clip/modeling_tf_clip.py
index b5aab718526f2e..da63c5ff2150e1 100644
--- a/src/transformers/models/clip/modeling_tf_clip.py
+++ b/src/transformers/models/clip/modeling_tf_clip.py
@@ -169,12 +169,7 @@ def build(self, input_shape: tf.TensorShape = None):
             name="embeddings",
         )
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "patch_embedding", None) is not None:
-            with tf.name_scope(self.patch_embedding.name):
-                self.patch_embedding.build(self.config.num_channels)
+        super().build(input_shape)
     def call(self, pixel_values: tf.Tensor) -> tf.Tensor:
         """`pixel_values` is expected to be of NCHW format."""
@@ -357,23 +352,6 @@ def call(
         return outputs
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "q_proj", None) is not None:
-            with tf.name_scope(self.q_proj.name):
-                self.q_proj.build(self.embed_dim)
-        if getattr(self, "k_proj", None) is not None:
-            with tf.name_scope(self.k_proj.name):
-                self.k_proj.build(self.embed_dim)
-        if getattr(self, "v_proj", None) is not None:
-            with tf.name_scope(self.v_proj.name):
-                self.v_proj.build(self.embed_dim)
-        if getattr(self, "out_proj", None) is not None:
-            with tf.name_scope(self.out_proj.name):
-                self.out_proj.build(self.embed_dim)
-
 class TFCLIPMLP(tf.keras.layers.Layer):
     def __init__(self, config: CLIPConfig, **kwargs):
@@ -391,7 +369,6 @@ def __init__(self, config: CLIPConfig, **kwargs):
         self.fc2 = tf.keras.layers.Dense(
             units=config.hidden_size, kernel_initializer=get_initializer(in_proj_std), name="fc2"
         )
-        self.config = config
     def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
         hidden_states = self.fc1(inputs=hidden_states)
@@ -399,17 +376,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
         hidden_states = self.fc2(inputs=hidden_states)
         return hidden_states
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "fc1", None) is not None:
-            with tf.name_scope(self.fc1.name):
-                self.fc1.build(self.config.hidden_size)
-        if getattr(self, "fc2", None) is not None:
-            with tf.name_scope(self.fc2.name):
-                self.fc2.build(self.config.intermediate_size)
-
 class TFCLIPEncoderLayer(tf.keras.layers.Layer):
     def __init__(self, config: CLIPConfig, **kwargs):
@@ -462,23 +428,6 @@ def call(
         return outputs
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "self_attn", None) is not None:
-            with tf.name_scope(self.self_attn.name):
-                self.self_attn.build(None)
-        if getattr(self, "layer_norm1", None) is not None:
-            with tf.name_scope(self.layer_norm1.name):
-                self.layer_norm1.build([None, None, self.embed_dim])
-        if getattr(self, "mlp", None) is not None:
-            with tf.name_scope(self.mlp.name):
-                self.mlp.build(None)
-        if getattr(self, "layer_norm2", None) is not None:
-            with tf.name_scope(self.layer_norm2.name):
-                self.layer_norm2.build([None, None, self.embed_dim])
-
 class TFCLIPEncoder(tf.keras.layers.Layer):
     """
@@ -534,15 +483,6 @@ def call(
             last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
         )
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "layers", None) is not None:
-            for layer in self.layers:
-                with tf.name_scope(layer.name):
-                    layer.build(None)
-
 class TFCLIPTextTransformer(tf.keras.layers.Layer):
     def __init__(self, config: CLIPTextConfig, **kwargs):
@@ -647,20 +587,6 @@ def _build_causal_attention_mask(self, batch_size, seq_length, dtype=tf.float32)
         return tf.broadcast_to(input=to_mask, shape=(batch_size, 1, seq_length, seq_length))
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "embeddings", None) is not None:
-            with tf.name_scope(self.embeddings.name):
-                self.embeddings.build(None)
-        if getattr(self, "encoder", None) is not None:
-            with tf.name_scope(self.encoder.name):
-                self.encoder.build(None)
-        if getattr(self, "final_layer_norm", None) is not None:
-            with tf.name_scope(self.final_layer_norm.name):
-                self.final_layer_norm.build([None, None, self.embed_dim])
-
 @keras_serializable
 class TFCLIPTextMainLayer(tf.keras.layers.Layer):
@@ -709,14 +635,6 @@ def call(
         return text_model_outputs
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "text_model", None) is not None:
-            with tf.name_scope(self.text_model.name):
-                self.text_model.build(None)
-
 class TFCLIPVisionTransformer(tf.keras.layers.Layer):
     def __init__(self, config: CLIPVisionConfig, **kwargs):
@@ -763,23 +681,6 @@ def call(
             attentions=encoder_outputs.attentions,
         )
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "embeddings", None) is not None:
-            with tf.name_scope(self.embeddings.name):
-                self.embeddings.build(None)
-        if getattr(self, "pre_layernorm", None) is not None:
-            with tf.name_scope(self.pre_layernorm.name):
-                self.pre_layernorm.build([None, None, self.embed_dim])
-        if getattr(self, "encoder", None) is not None:
-            with tf.name_scope(self.encoder.name):
-                self.encoder.build(None)
-        if getattr(self, "post_layernorm", None) is not None:
-            with tf.name_scope(self.post_layernorm.name):
-                self.post_layernorm.build([None, None, self.embed_dim])
-
 @keras_serializable
 class TFCLIPVisionMainLayer(tf.keras.layers.Layer):
@@ -815,14 +716,6 @@ def call(
         return vision_model_outputs
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "vision_model", None) is not None:
-            with tf.name_scope(self.vision_model.name):
-                self.vision_model.build(None)
-
 @keras_serializable
 class TFCLIPMainLayer(tf.keras.layers.Layer):
@@ -877,21 +770,7 @@ def build(self, input_shape: tf.TensorShape = None):
             name="logit_scale",
         )
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "text_model", None) is not None:
-            with tf.name_scope(self.text_model.name):
-                self.text_model.build(None)
-        if getattr(self, "vision_model", None) is not None:
-            with tf.name_scope(self.vision_model.name):
-                self.vision_model.build(None)
-        if getattr(self, "visual_projection", None) is not None:
-            with tf.name_scope(self.visual_projection.name):
-                self.visual_projection.build(self.vision_embed_dim)
-        if getattr(self, "text_projection", None) is not None:
-            with tf.name_scope(self.text_projection.name):
-                self.text_projection.build(self.text_embed_dim)
+        super().build(input_shape)
     @unpack_inputs
     def get_text_features(
@@ -1233,14 +1112,6 @@ def call(
         return outputs
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "clip", None) is not None:
-            with tf.name_scope(self.clip.name):
-                self.clip.build(None)
-
 class TFCLIPVisionModel(TFCLIPPreTrainedModel):
     config_class = CLIPVisionConfig
@@ -1295,14 +1166,6 @@ def call(
         return outputs
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "clip", None) is not None:
-            with tf.name_scope(self.clip.name):
-                self.clip.build(None)
-
 @add_start_docstrings(CLIP_START_DOCSTRING)
 class TFCLIPModel(TFCLIPPreTrainedModel):
@@ -1454,11 +1317,3 @@ def serving_output(self, output: TFCLIPOutput) -> TFCLIPOutput:
         # TensorFlow cannot trace through nested dataclasses. Reference:
         # https://github.com/huggingface/transformers/pull/16886
         return output
-
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "clip", None) is not None:
-            with tf.name_scope(self.clip.name):
-                self.clip.build(None)
diff --git a/src/transformers/models/convbert/modeling_tf_convbert.py b/src/transformers/models/convbert/modeling_tf_convbert.py
index a514d40287bd5b..4beb01cb78b0ac 100644
--- a/src/transformers/models/convbert/modeling_tf_convbert.py
+++ b/src/transformers/models/convbert/modeling_tf_convbert.py
@@ -81,7 +81,7 @@ def __init__(self, config: ConvBertConfig, **kwargs):
         self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
         self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
-    def build(self, input_shape=None):
+    def build(self, input_shape: tf.TensorShape):
         with tf.name_scope("word_embeddings"):
             self.weight = self.add_weight(
                 name="weight",
@@ -103,12 +103,7 @@ def build(self, input_shape=None):
                 initializer=get_initializer(self.initializer_range),
             )
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "LayerNorm", None) is not None:
-            with tf.name_scope(self.LayerNorm.name):
-                self.LayerNorm.build([None, None, self.config.embedding_size])
+        super().build(input_shape)
     # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call
     def call(
@@ -213,7 +208,6 @@ def __init__(self, config, **kwargs):
         )
         self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob)
-        self.config = config
     def transpose_for_scores(self, x, batch_size):
         # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
@@ -303,29 +297,6 @@ def call(self, hidden_states, attention_mask, head_mask, output_attentions, trai
         return outputs
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "query", None) is not None:
-            with tf.name_scope(self.query.name):
-                self.query.build(self.config.hidden_size)
-        if getattr(self, "key", None) is not None:
-            with tf.name_scope(self.key.name):
-                self.key.build(self.config.hidden_size)
-        if getattr(self, "value", None) is not None:
-            with tf.name_scope(self.value.name):
-                self.value.build(self.config.hidden_size)
-        if getattr(self, "key_conv_attn_layer", None) is not None:
-            with tf.name_scope(self.key_conv_attn_layer.name):
-                self.key_conv_attn_layer.build(None)
-        if getattr(self, "conv_kernel_layer", None) is not None:
-            with tf.name_scope(self.conv_kernel_layer.name):
-                self.conv_kernel_layer.build(self.all_head_size)
-        if getattr(self, "conv_out_layer", None) is not None:
-            with tf.name_scope(self.conv_out_layer.name):
-                self.conv_out_layer.build(self.config.hidden_size)
-
 class TFConvBertSelfOutput(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
@@ -336,7 +307,6 @@ def __init__(self, config, **kwargs):
         )
         self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
         self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
-        self.config = config
     def call(self, hidden_states, input_tensor, training=False):
         hidden_states = self.dense(hidden_states)
@@ -345,17 +315,6 @@ def call(self, hidden_states, input_tensor, training=False):
         return hidden_states
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "dense", None) is not None:
-            with tf.name_scope(self.dense.name):
-                self.dense.build(self.config.hidden_size)
-        if getattr(self, "LayerNorm", None) is not None:
-            with tf.name_scope(self.LayerNorm.name):
-                self.LayerNorm.build([None, None, self.config.hidden_size])
-
 class TFConvBertAttention(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
@@ -376,17 +335,6 @@ def call(self, input_tensor, attention_mask, head_mask, output_attentions, train
         return outputs
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "self_attention", None) is not None:
-            with tf.name_scope(self.self_attention.name):
-                self.self_attention.build(None)
-        if getattr(self, "dense_output", None) is not None:
-            with tf.name_scope(self.dense_output.name):
-                self.dense_output.build(None)
-
 class GroupedLinearLayer(tf.keras.layers.Layer):
     def __init__(self, input_size, output_size, num_groups, kernel_initializer, **kwargs):
@@ -467,7 +415,6 @@ def __init__(self, config, **kwargs):
         )
         self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
         self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
-        self.config = config
     def call(self, hidden_states, input_tensor, training=False):
         hidden_states = self.dense(hidden_states)
@@ -476,14 +423,6 @@ def call(self, hidden_states, input_tensor, training=False):
         return hidden_states
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "LayerNorm", None) is not None:
-            with tf.name_scope(self.LayerNorm.name):
-                self.LayerNorm.build([None, None, self.config.hidden_size])
-
 class TFConvBertLayer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
@@ -504,20 +443,6 @@ def call(self, hidden_states, attention_mask, head_mask, output_attentions, trai
         return outputs
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "attention", None) is not None:
-            with tf.name_scope(self.attention.name):
-                self.attention.build(None)
-        if getattr(self, "intermediate", None) is not None:
-            with tf.name_scope(self.intermediate.name):
-                self.intermediate.build(None)
-        if getattr(self, "bert_output", None) is not None:
-            with tf.name_scope(self.bert_output.name):
-                self.bert_output.build(None)
-
 class TFConvBertEncoder(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
@@ -561,15 +486,6 @@ def call(
             last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
         )
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "layer", None) is not None:
-            for layer in self.layer:
-                with tf.name_scope(layer.name):
-                    layer.build(None)
-
 class TFConvBertPredictionHeadTransform(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
@@ -585,7 +501,6 @@ def __init__(self, config, **kwargs):
             self.transform_act_fn = config.hidden_act
         self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-        self.config = config
     def call(self, hidden_states):
         hidden_states = self.dense(hidden_states)
@@ -594,17 +509,6 @@ def call(self, hidden_states):
         return hidden_states
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "dense", None) is not None:
-            with tf.name_scope(self.dense.name):
-                self.dense.build(self.config.hidden_size)
-        if getattr(self, "LayerNorm", None) is not None:
-            with tf.name_scope(self.LayerNorm.name):
-                self.LayerNorm.build([None, None, self.config.hidden_size])
-
 @keras_serializable
 class TFConvBertMainLayer(tf.keras.layers.Layer):
@@ -712,17 +616,6 @@ def call(
         return hidden_states
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "embeddings", None) is not None:
-            with tf.name_scope(self.embeddings.name):
-                self.embeddings.build(None)
-        if getattr(self, "encoder", None) is not None:
-            with tf.name_scope(self.encoder.name):
-                self.encoder.build(None)
-
 class TFConvBertPreTrainedModel(TFPreTrainedModel):
     """
@@ -877,14 +770,6 @@ def call(
         return outputs
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "convbert", None) is not None:
-            with tf.name_scope(self.convbert.name):
-                self.convbert.build(None)
-
 class TFConvBertMaskedLMHead(tf.keras.layers.Layer):
     def __init__(self, config, input_embeddings, **kwargs):
@@ -929,7 +814,6 @@ def __init__(self, config, **kwargs):
         self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
         self.dense = tf.keras.layers.Dense(config.embedding_size, name="dense")
-        self.config = config
     def call(self, generator_hidden_states, training=False):
         hidden_states = self.dense(generator_hidden_states)
@@ -938,17 +822,6 @@ def call(self, generator_hidden_states, training=False):
         return hidden_states
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "LayerNorm", None) is not None:
-            with tf.name_scope(self.LayerNorm.name):
-                self.LayerNorm.build([None, None, self.config.embedding_size])
-        if getattr(self, "dense", None) is not None:
-            with tf.name_scope(self.dense.name):
-                self.dense.build(self.config.hidden_size)
-
 @add_start_docstrings("""ConvBERT Model with a `language modeling` head on top.""", CONVBERT_START_DOCSTRING)
 class TFConvBertForMaskedLM(TFConvBertPreTrainedModel, TFMaskedLanguageModelingLoss):
@@ -1028,20 +901,6 @@ def call(
             attentions=generator_hidden_states.attentions,
         )
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "convbert", None) is not None:
-            with tf.name_scope(self.convbert.name):
-                self.convbert.build(None)
-        if getattr(self, "generator_predictions", None) is not None:
-            with tf.name_scope(self.generator_predictions.name):
-                self.generator_predictions.build(None)
-        if getattr(self, "generator_lm_head", None) is not None:
-            with tf.name_scope(self.generator_lm_head.name):
-                self.generator_lm_head.build(None)
-
 class TFConvBertClassificationHead(tf.keras.layers.Layer):
     """Head for sentence-level classification tasks."""
@@ -1072,17 +931,6 @@ def call(self, hidden_states, **kwargs):
         return x
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "dense", None) is not None:
-            with tf.name_scope(self.dense.name):
-                self.dense.build(self.config.hidden_size)
-        if getattr(self, "out_proj", None) is not None:
-            with tf.name_scope(self.out_proj.name):
-                self.out_proj.build(self.config.hidden_size)
-
 @add_start_docstrings(
     """
@@ -1151,17 +999,6 @@ def call(
             attentions=outputs.attentions,
         )
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "convbert", None) is not None:
-            with tf.name_scope(self.convbert.name):
-                self.convbert.build(None)
-        if getattr(self, "classifier", None) is not None:
-            with tf.name_scope(self.classifier.name):
-                self.classifier.build(None)
-
 @add_start_docstrings(
     """
@@ -1181,7 +1018,6 @@ def __init__(self, config, *inputs, **kwargs):
         self.classifier = tf.keras.layers.Dense(
             1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )
-        self.config = config
     @unpack_inputs
     @add_start_docstrings_to_model_forward(
@@ -1256,20 +1092,6 @@ def call(
             attentions=outputs.attentions,
         )
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "convbert", None) is not None:
-            with tf.name_scope(self.convbert.name):
-                self.convbert.build(None)
-        if getattr(self, "sequence_summary", None) is not None:
-            with tf.name_scope(self.sequence_summary.name):
-                self.sequence_summary.build(None)
-        if getattr(self, "classifier", None) is not None:
-            with tf.name_scope(self.classifier.name):
-                self.classifier.build(self.config.hidden_size)
-
 @add_start_docstrings(
     """
@@ -1291,7 +1113,6 @@ def __init__(self, config, *inputs, **kwargs):
         self.classifier = tf.keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )
-        self.config = config
     @unpack_inputs
     @add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1346,17 +1167,6 @@ def call(
             attentions=outputs.attentions,
        )
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "convbert", None) is not None:
-            with tf.name_scope(self.convbert.name):
-                self.convbert.build(None)
-        if getattr(self, "classifier", None) is not None:
-            with tf.name_scope(self.classifier.name):
-                self.classifier.build(self.config.hidden_size)
-
 @add_start_docstrings(
     """
@@ -1374,7 +1184,6 @@ def __init__(self, config, *inputs, **kwargs):
         self.qa_outputs = tf.keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
         )
-        self.config = config
     @unpack_inputs
     @add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1443,14 +1252,3 @@ def call(
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
         )
-
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "convbert", None) is not None:
-            with tf.name_scope(self.convbert.name):
-                self.convbert.build(None)
-        if getattr(self, "qa_outputs", None) is not None:
-            with tf.name_scope(self.qa_outputs.name):
-                self.qa_outputs.build(self.config.hidden_size)
diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py
index f1f46b6ed51195..59a36b3983768c 100644
--- a/src/transformers/models/convnext/modeling_tf_convnext.py
+++ b/src/transformers/models/convnext/modeling_tf_convnext.py
@@ -81,7 +81,6 @@ def __init__(self, config: ConvNextConfig, **kwargs):
         )
         self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm")
         self.num_channels = config.num_channels
-        self.config = config
     def call(self, pixel_values):
         if isinstance(pixel_values, dict):
@@ -102,17 +101,6 @@ def call(self, pixel_values):
         embeddings = self.layernorm(embeddings)
         return embeddings
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "patch_embeddings", None) is not None:
-            with tf.name_scope(self.patch_embeddings.name):
-                self.patch_embeddings.build(self.config.num_channels)
-        if getattr(self, "layernorm", None) is not None:
-            with tf.name_scope(self.layernorm.name):
-                self.layernorm.build([None, None, self.config.hidden_sizes[0]])
-
 class TFConvNextLayer(tf.keras.layers.Layer):
     """This corresponds to the `Block` class in the original implementation.
@@ -179,25 +167,7 @@ def build(self, input_shape: tf.TensorShape = None):
             if self.config.layer_scale_init_value > 0
             else None
         )
-
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "dwconv", None) is not None:
-            with tf.name_scope(self.dwconv.name):
-                self.dwconv.build(self.dim)
-        if getattr(self, "layernorm", None) is not None:
-            with tf.name_scope(self.layernorm.name):
-                self.layernorm.build([None, None, self.dim])
-        if getattr(self, "pwconv1", None) is not None:
-            with tf.name_scope(self.pwconv1.name):
-                self.pwconv1.build(self.dim)
-        if getattr(self, "pwconv2", None) is not None:
-            with tf.name_scope(self.pwconv2.name):
-                self.pwconv2.build(4 * self.dim)
-        if getattr(self, "drop_path", None) is not None:
-            with tf.name_scope(self.drop_path.name):
-                self.drop_path.build(None)
+        super().build(input_shape)
     def call(self, hidden_states, training=False):
         input = hidden_states
@@ -283,15 +253,6 @@ def call(self, hidden_states):
             hidden_states = layer(hidden_states)
         return hidden_states
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "layers", None) is not None:
-            for layer in self.layers:
-                with tf.name_scope(layer.name):
-                    layer.build(None)
-
 class TFConvNextEncoder(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
@@ -392,20 +353,6 @@ def call(
             hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states,
         )
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "embeddings", None) is not None:
-            with tf.name_scope(self.embeddings.name):
-                self.embeddings.build(None)
-        if getattr(self, "encoder", None) is not None:
-            with tf.name_scope(self.encoder.name):
-                self.encoder.build(None)
-        if getattr(self, "layernorm", None) is not None:
-            with tf.name_scope(self.layernorm.name):
-                self.layernorm.build([None, None, self.config.hidden_sizes[-1]])
-
 class TFConvNextPreTrainedModel(TFPreTrainedModel):
     """
@@ -538,14 +485,6 @@ def call(
             hidden_states=outputs.hidden_states,
         )
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "convnext", None) is not None:
-            with tf.name_scope(self.convnext.name):
-                self.convnext.build(None)
-
 @add_start_docstrings(
     """
@@ -638,14 +577,3 @@ def call(
             logits=logits,
             hidden_states=outputs.hidden_states,
         )
-
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "convnext", None) is not None:
-            with tf.name_scope(self.convnext.name):
-                self.convnext.build(None)
-        if getattr(self, "classifier", None) is not None:
-            with tf.name_scope(self.classifier.name):
-                self.classifier.build(None)
diff --git a/src/transformers/models/convnextv2/modeling_tf_convnextv2.py b/src/transformers/models/convnextv2/modeling_tf_convnextv2.py
index c5cd4e444dc2e2..863e59406f1cfb 100644
--- a/src/transformers/models/convnextv2/modeling_tf_convnextv2.py
+++ b/src/transformers/models/convnextv2/modeling_tf_convnextv2.py
@@ -133,7 +133,6 @@ def __init__(self, config: ConvNextV2Config, **kwargs):
         )
         self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm")
         self.num_channels = config.num_channels
-        self.config = config
     def call(self, pixel_values):
         if isinstance(pixel_values, dict):
@@ -154,17 +153,6 @@ def call(self, pixel_values):
         embeddings = self.layernorm(embeddings)
         return embeddings
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "patch_embeddings", None) is not None:
-            with tf.name_scope(self.patch_embeddings.name):
-                self.patch_embeddings.build(self.config.num_channels)
-        if getattr(self, "layernorm", None) is not None:
-            with tf.name_scope(self.layernorm.name):
-                self.layernorm.build([None, None, self.config.hidden_sizes[0]])
-
 class TFConvNextV2Layer(tf.keras.layers.Layer):
     """This corresponds to the `Block` class in the original implementation.
@@ -235,29 +223,6 @@ def call(self, hidden_states, training=False):
         x = input + x
         return x
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "dwconv", None) is not None:
-            with tf.name_scope(self.dwconv.name):
-                self.dwconv.build(self.dim)
-        if getattr(self, "layernorm", None) is not None:
-            with tf.name_scope(self.layernorm.name):
-                self.layernorm.build([None, None, self.dim])
-        if getattr(self, "pwconv1", None) is not None:
-            with tf.name_scope(self.pwconv1.name):
-                self.pwconv1.build(self.dim)
-        if getattr(self, "grn", None) is not None:
-            with tf.name_scope(self.grn.name):
-                self.grn.build(None)
-        if getattr(self, "pwconv2", None) is not None:
-            with tf.name_scope(self.pwconv2.name):
-                self.pwconv2.build(4 * self.dim)
-        if getattr(self, "drop_path", None) is not None:
-            with tf.name_scope(self.drop_path.name):
-                self.drop_path.build(None)
-
 # Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextStage with ConvNext->ConvNextV2
 class TFConvNextV2Stage(tf.keras.layers.Layer):
@@ -329,15 +294,6 @@ def call(self, hidden_states):
             hidden_states = layer(hidden_states)
         return hidden_states
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "layers", None) is not None:
-            for layer in self.layers:
-                with tf.name_scope(layer.name):
-                    layer.build(None)
-
 class TFConvNextV2Encoder(tf.keras.layers.Layer):
     def __init__(self, config: ConvNextV2Config, **kwargs):
@@ -445,20 +401,6 @@ def call(
             hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states,
         )
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "embeddings", None) is not None:
-            with tf.name_scope(self.embeddings.name):
-                self.embeddings.build(None)
-        if getattr(self, "encoder", None) is not None:
-            with tf.name_scope(self.encoder.name):
-                self.encoder.build(None)
-        if getattr(self, "layernorm", None) is not None:
-            with tf.name_scope(self.layernorm.name):
-                self.layernorm.build([None, None, self.config.hidden_sizes[-1]])
-
 class TFConvNextV2PreTrainedModel(TFPreTrainedModel):
     """
@@ -577,14 +519,6 @@ def call(
             hidden_states=outputs.hidden_states,
         )
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "convnextv2", None) is not None:
-            with tf.name_scope(self.convnextv2.name):
-                self.convnextv2.build(None)
-
 @add_start_docstrings(
     """
@@ -659,14 +593,3 @@ def call(
             logits=logits,
             hidden_states=outputs.hidden_states,
         )
-
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "convnextv2", None) is not None:
-            with tf.name_scope(self.convnextv2.name):
-                self.convnextv2.build(None)
-        if getattr(self, "classifier", None) is not None:
-            with tf.name_scope(self.classifier.name):
-                self.classifier.build(None)
diff --git a/src/transformers/models/ctrl/modeling_tf_ctrl.py b/src/transformers/models/ctrl/modeling_tf_ctrl.py
index 7cc41ccfd95689..70a5c17462595a 100644
--- a/src/transformers/models/ctrl/modeling_tf_ctrl.py
+++ b/src/transformers/models/ctrl/modeling_tf_ctrl.py
@@ -142,23 +142,6 @@ def call(self, v, k, q, mask, layer_past, attention_mask, head_mask, use_cache,
         return outputs
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "Wq", None) is not None:
-            with tf.name_scope(self.Wq.name):
-                self.Wq.build(self.d_model_size)
-        if getattr(self, "Wk", None) is not None:
-            with tf.name_scope(self.Wk.name):
-                self.Wk.build(self.d_model_size)
-        if getattr(self, "Wv", None) is not None:
-            with tf.name_scope(self.Wv.name):
-                self.Wv.build(self.d_model_size)
-        if getattr(self, "dense", None) is not None:
-            with tf.name_scope(self.dense.name):
-                self.dense.build(self.d_model_size)
-
 class TFPointWiseFeedForwardLayer(tf.keras.layers.Layer):
     def __init__(self, d_model_size, dff, **kwargs):
@@ -192,7 +175,6 @@ def __init__(
         self.dropout1 = tf.keras.layers.Dropout(rate)
         self.dropout2 = tf.keras.layers.Dropout(rate)
-        self.d_model_size = d_model_size
     def call(self, x, mask, layer_past, attention_mask, head_mask, use_cache, output_attentions, training=False):
         normed = self.layernorm1(x)
@@ -220,23 +202,6 @@ def call(self, x, mask, layer_past, attention_mask, head_mask, use_cache, output
         outputs = (out2,) + attn_outputs[1:]
         return outputs
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "multi_head_attention", None) is not None:
-            with tf.name_scope(self.multi_head_attention.name):
-                self.multi_head_attention.build(None)
-        if getattr(self, "ffn", None) is not None:
-            with tf.name_scope(self.ffn.name):
-                self.ffn.build(None)
-        if getattr(self, "layernorm1", None) is not None:
-            with tf.name_scope(self.layernorm1.name):
-                self.layernorm1.build([None, None, self.d_model_size])
-        if getattr(self, "layernorm2", None) is not None:
-            with tf.name_scope(self.layernorm2.name):
-                self.layernorm2.build([None, None, self.d_model_size])
-
 @keras_serializable
 class TFCTRLMainLayer(tf.keras.layers.Layer):
@@ -431,21 +396,6 @@ def call(
             attentions=all_attentions,
         )
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "w", None) is not None:
-            with tf.name_scope(self.w.name):
-                self.w.build(None)
-        if getattr(self, "layernorm", None) is not None:
-            with tf.name_scope(self.layernorm.name):
-                self.layernorm.build([None, None, self.config.n_embd])
-        if getattr(self, "h", None) is not None:
-            for layer in self.h:
-                with tf.name_scope(layer.name):
-                    layer.build(None)
-
 class TFCTRLPreTrainedModel(TFPreTrainedModel):
     """
@@ -613,14 +563,6 @@ def call(
         )
         return outputs
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "transformer", None) is not None:
-            with tf.name_scope(self.transformer.name):
-                self.transformer.build(None)
-
 class TFCTRLBiasLayer(tf.keras.layers.Layer):
     """
@@ -768,17 +710,6 @@ def call(
             attentions=transformer_outputs.attentions,
         )
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "transformer", None) is not None:
-            with tf.name_scope(self.transformer.name):
-                self.transformer.build(None)
-        if getattr(self, "bias_layer", None) is not None:
-            with tf.name_scope(self.bias_layer.name):
-                self.bias_layer.build(None)
-
 @add_start_docstrings(
     """
@@ -806,7 +737,6 @@ def __init__(self, config, *inputs, **kwargs):
             use_bias=False,
         )
         self.transformer = TFCTRLMainLayer(config, name="transformer")
-        self.config = config
     def get_output_embeddings(self):
        # Remove after transformers
        v4.32. Fix this model's `test_model_common_attributes` test too.
@@ -906,14 +836,3 @@ def call(
             hidden_states=transformer_outputs.hidden_states,
             attentions=transformer_outputs.attentions,
         )
-
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "classifier", None) is not None:
-            with tf.name_scope(self.classifier.name):
-                self.classifier.build(self.config.n_embd)
-        if getattr(self, "transformer", None) is not None:
-            with tf.name_scope(self.transformer.name):
-                self.transformer.build(None)
diff --git a/src/transformers/models/cvt/modeling_tf_cvt.py b/src/transformers/models/cvt/modeling_tf_cvt.py
index 5ecdc1b2658155..d610792e48f8e5 100644
--- a/src/transformers/models/cvt/modeling_tf_cvt.py
+++ b/src/transformers/models/cvt/modeling_tf_cvt.py
@@ -131,14 +131,6 @@ def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor:
         hidden_state = self.dropout(hidden_state, training=training)
         return hidden_state
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "convolution_embeddings", None) is not None:
-            with tf.name_scope(self.convolution_embeddings.name):
-                self.convolution_embeddings.build(None)
-
 class TFCvtConvEmbeddings(tf.keras.layers.Layer):
     """Image to Convolution Embeddings. This convolutional operation aims to model local spatial contexts."""
@@ -168,7 +160,6 @@ def __init__(
         # Using the same default epsilon as PyTorch
         self.normalization = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="normalization")
         self.num_channels = num_channels
-        self.embed_dim = embed_dim
     def call(self, pixel_values: tf.Tensor) -> tf.Tensor:
         if isinstance(pixel_values, dict):
@@ -186,17 +177,6 @@ def call(self, pixel_values: tf.Tensor) -> tf.Tensor:
         pixel_values = tf.reshape(pixel_values, shape=(batch_size, height, width, num_channels))
         return pixel_values
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "projection", None) is not None:
-            with tf.name_scope(self.projection.name):
-                self.projection.build(self.num_channels)
-        if getattr(self, "normalization", None) is not None:
-            with tf.name_scope(self.normalization.name):
-                self.normalization.build([None, None, self.embed_dim])
-
 class TFCvtSelfAttentionConvProjection(tf.keras.layers.Layer):
     """Convolutional projection layer."""
@@ -216,24 +196,12 @@ def __init__(self, config: CvtConfig, embed_dim: int, kernel_size: int, stride:
         )
         # Using the same default epsilon as PyTorch, TF uses (1 - pytorch momentum)
         self.normalization = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization")
-        self.embed_dim = embed_dim
     def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
         hidden_state = self.convolution(self.padding(hidden_state))
         hidden_state = self.normalization(hidden_state, training=training)
         return hidden_state
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "convolution", None) is not None:
-            with tf.name_scope(self.convolution.name):
-                self.convolution.build(self.embed_dim)
-        if getattr(self, "normalization", None) is not None:
-            with tf.name_scope(self.normalization.name):
-                self.normalization.build(None)
-
 class TFCvtSelfAttentionLinearProjection(tf.keras.layers.Layer):
     """Linear projection layer used to flatten tokens into 1D."""
@@ -392,29 +360,6 @@ def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool
         context = tf.reshape(context, (batch_size, hidden_size, self.num_heads * head_dim))
         return context
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "convolution_projection_query", None) is not None:
-            with tf.name_scope(self.convolution_projection_query.name):
-                self.convolution_projection_query.build(None)
-        if getattr(self, "convolution_projection_key", None) is not None:
-            with tf.name_scope(self.convolution_projection_key.name):
-                self.convolution_projection_key.build(None)
-        if getattr(self, "convolution_projection_value", None) is not None:
-            with tf.name_scope(self.convolution_projection_value.name):
-                self.convolution_projection_value.build(None)
-        if getattr(self, "projection_query", None) is not None:
-            with tf.name_scope(self.projection_query.name):
-                self.projection_query.build(self.embed_dim)
-        if getattr(self, "projection_key", None) is not None:
-            with tf.name_scope(self.projection_key.name):
-                self.projection_key.build(self.embed_dim)
-        if getattr(self, "projection_value", None) is not None:
-            with tf.name_scope(self.projection_value.name):
-                self.projection_value.build(self.embed_dim)
-
 class TFCvtSelfOutput(tf.keras.layers.Layer):
     """Output of the Attention layer ."""
@@ -425,21 +370,12 @@ def __init__(self, config: CvtConfig, embed_dim: int, drop_rate: float, **kwargs
             units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )
         self.dropout = tf.keras.layers.Dropout(drop_rate)
-        self.embed_dim = embed_dim
     def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
         hidden_state = self.dense(inputs=hidden_state)
         hidden_state = self.dropout(inputs=hidden_state, training=training)
         return hidden_state
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "dense", None) is not None:
-            with tf.name_scope(self.dense.name):
-                self.dense.build(self.embed_dim)
-
 class TFCvtAttention(tf.keras.layers.Layer):
     """Attention layer. First chunk of the convolutional transformer block."""
@@ -487,17 +423,6 @@ def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool
         attention_output = self.dense_output(self_output, training=training)
         return attention_output
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "attention", None) is not None:
-            with tf.name_scope(self.attention.name):
-                self.attention.build(None)
-        if getattr(self, "dense_output", None) is not None:
-            with tf.name_scope(self.dense_output.name):
-                self.dense_output.build(None)
-
 class TFCvtIntermediate(tf.keras.layers.Layer):
     """Intermediate dense layer. Second chunk of the convolutional transformer block."""
@@ -510,20 +435,11 @@ def __init__(self, config: CvtConfig, embed_dim: int, mlp_ratio: int, **kwargs):
             activation="gelu",
             name="dense",
         )
-        self.embed_dim = embed_dim
     def call(self, hidden_state: tf.Tensor) -> tf.Tensor:
         hidden_state = self.dense(hidden_state)
         return hidden_state
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "dense", None) is not None:
-            with tf.name_scope(self.dense.name):
-                self.dense.build(self.embed_dim)
-
 class TFCvtOutput(tf.keras.layers.Layer):
     """
@@ -536,8 +452,6 @@ def __init__(self, config: CvtConfig, embed_dim: int, drop_rate: int, **kwargs):
             units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )
         self.dropout = tf.keras.layers.Dropout(drop_rate)
-        self.mlp_ratio = config.mlp_ratio
-        self.embed_dim = embed_dim
     def call(self, hidden_state: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
         hidden_state = self.dense(inputs=hidden_state)
@@ -545,14 +459,6 @@ def call(self, hidden_state: tf.Tensor, input_tensor: tf.Tensor, training: bool
         hidden_state = hidden_state + input_tensor
         return hidden_state
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "dense", None) is not None:
-            with tf.name_scope(self.dense.name):
-                self.dense.build(int(self.embed_dim * self.mlp_ratio))
-
 class TFCvtLayer(tf.keras.layers.Layer):
     """
@@ -608,7 +514,6 @@ def __init__(
         # Using the same default epsilon as PyTorch
         self.layernorm_before = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_before")
         self.layernorm_after = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_after")
-        self.embed_dim = embed_dim
     def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor:
         # in Cvt, layernorm is applied before self-attention
@@ -627,29 +532,6 @@ def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool
         layer_output = self.drop_path(layer_output, training=training)
         return layer_output
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "attention", None) is not None:
-            with tf.name_scope(self.attention.name):
-                self.attention.build(None)
-        if getattr(self, "intermediate", None) is not None:
-            with tf.name_scope(self.intermediate.name):
-                self.intermediate.build(None)
-        if getattr(self, "dense_output", None) is not None:
-            with tf.name_scope(self.dense_output.name):
-                self.dense_output.build(None)
-        if getattr(self, "drop_path", None) is not None:
-            with tf.name_scope(self.drop_path.name):
-                self.drop_path.build(None)
-        if getattr(self, "layernorm_before", None) is not None:
-            with tf.name_scope(self.layernorm_before.name):
-                self.layernorm_before.build([None, None, self.embed_dim])
-        if getattr(self, "layernorm_after", None) is not None:
-            with tf.name_scope(self.layernorm_after.name):
-                self.layernorm_after.build([None, None, self.embed_dim])
-
 class TFCvtStage(tf.keras.layers.Layer):
     """
@@ -734,18 +616,6 @@ def call(self, hidden_state: tf.Tensor, training: bool = False):
         hidden_state = tf.reshape(hidden_state, shape=(batch_size, height, width, num_channels))
         return hidden_state, cls_token
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "embedding", None) is not None:
-            with tf.name_scope(self.embedding.name):
-                self.embedding.build(None)
-        if getattr(self, "layers", None) is not
None: - for layer in self.layers: - with tf.name_scope(layer.name): - layer.build(None) - class TFCvtEncoder(tf.keras.layers.Layer): """ @@ -798,15 +668,6 @@ def call( hidden_states=all_hidden_states, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "stages", None) is not None: - for layer in self.stages: - with tf.name_scope(layer.name): - layer.build(None) - @keras_serializable class TFCvtMainLayer(tf.keras.layers.Layer): @@ -848,14 +709,6 @@ def call( hidden_states=encoder_outputs.hidden_states, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - class TFCvtPreTrainedModel(TFPreTrainedModel): """ @@ -975,14 +828,6 @@ def call( hidden_states=outputs.hidden_states, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "cvt", None) is not None: - with tf.name_scope(self.cvt.name): - self.cvt.build(None) - @add_start_docstrings( """ @@ -1008,7 +853,6 @@ def __init__(self, config: CvtConfig, *inputs, **kwargs): bias_initializer="zeros", name="classifier", ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(TFCVT_INPUTS_DOCSTRING) @@ -1078,17 +922,3 @@ def call( return ((loss,) + output) if loss is not None else output return TFImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "cvt", None) is not None: - with tf.name_scope(self.cvt.name): - self.cvt.build(None) - if getattr(self, "layernorm", None) is not None: - with tf.name_scope(self.layernorm.name): - self.layernorm.build([None, None, self.config.embed_dim[-1]]) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(None) diff --git a/src/transformers/models/data2vec/modeling_tf_data2vec_vision.py b/src/transformers/models/data2vec/modeling_tf_data2vec_vision.py index 021527e8b35ecc..96d3993dd7cd5f 100644 --- a/src/transformers/models/data2vec/modeling_tf_data2vec_vision.py +++ b/src/transformers/models/data2vec/modeling_tf_data2vec_vision.py @@ -137,7 +137,7 @@ def __init__(self, config: Data2VecVisionConfig, **kwargs): self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - def build(self, input_shape=None): + def build(self, input_shape: tf.TensorShape): self.cls_token = self.add_weight( shape=(1, 1, self.config.hidden_size), initializer=tf.random_normal_initializer(stddev=self.config.initializer_range), @@ -164,12 +164,7 @@ def build(self, input_shape=None): else: self.position_embeddings = None - if self.built: - return - self.built = True - if getattr(self, "patch_embeddings", None) is not None: - with tf.name_scope(self.patch_embeddings.name): - self.patch_embeddings.build(None) + super().build(input_shape) def call(self, pixel_values: tf.Tensor, bool_masked_pos: tf.Tensor | None = None) -> tf.Tensor: embeddings = self.patch_embeddings(pixel_values) @@ -253,14 +248,6 @@ def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor: return tf.reshape(tensor=projection, shape=(batch_size, num_patches, -1)) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "projection", None) is not None: - with tf.name_scope(self.projection.name): - 
self.projection.build(self.num_channels) - class TFData2VecVisionSelfAttention(tf.keras.layers.Layer): def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None, **kwargs): @@ -297,7 +284,6 @@ def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = ) else: self.relative_position_bias = None - self.config = config def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] @@ -358,20 +344,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "query", None) is not None: - with tf.name_scope(self.query.name): - self.query.build(self.config.hidden_size) - if getattr(self, "key", None) is not None: - with tf.name_scope(self.key.name): - self.key.build(self.config.hidden_size) - if getattr(self, "value", None) is not None: - with tf.name_scope(self.value.name): - self.value.build(self.config.hidden_size) - class TFData2VecVisionSelfOutput(tf.keras.layers.Layer): """ @@ -386,7 +358,6 @@ def __init__(self, config: Data2VecVisionConfig, **kwargs): units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, gamma=None, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -394,14 +365,6 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, gamma=None, tr return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - class TFData2VecVisionAttention(tf.keras.layers.Layer): def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None, **kwargs): @@ -435,17 +398,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "attention", None) is not None: - with tf.name_scope(self.attention.name): - self.attention.build(None) - if getattr(self, "dense_output", None) is not None: - with tf.name_scope(self.dense_output.name): - self.dense_output.build(None) - # Copied from transformers.models.vit.modeling_tf_vit.TFViTIntermediate with ViT->Data2VecVision class TFData2VecVisionIntermediate(tf.keras.layers.Layer): @@ -460,7 +412,6 @@ def __init__(self, config: Data2VecVisionConfig, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -468,14 +419,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - class TFData2VecVisionOutput(tf.keras.layers.Layer): def __init__(self, config: Data2VecVisionConfig, **kwargs): @@ -485,7 +428,6 @@ def __init__(self, config: Data2VecVisionConfig, **kwargs): units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) 
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.config = config def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -493,14 +435,6 @@ def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.intermediate_size) - class TFData2VecVisionLayer(tf.keras.layers.Layer): """This corresponds to the Block class in the timm implementation.""" @@ -549,27 +483,7 @@ def build(self, input_shape: tf.TensorShape = None): else: self.lambda_1, self.lambda_2 = None, None - if self.built: - return - self.built = True - if getattr(self, "attention", None) is not None: - with tf.name_scope(self.attention.name): - self.attention.build(None) - if getattr(self, "intermediate", None) is not None: - with tf.name_scope(self.intermediate.name): - self.intermediate.build(None) - if getattr(self, "data2vec_output", None) is not None: - with tf.name_scope(self.data2vec_output.name): - self.data2vec_output.build(None) - if getattr(self, "layernorm_before", None) is not None: - with tf.name_scope(self.layernorm_before.name): - self.layernorm_before.build([None, None, self.config.hidden_size]) - if getattr(self, "layernorm_after", None) is not None: - with tf.name_scope(self.layernorm_after.name): - self.layernorm_after.build([None, None, self.config.hidden_size]) - if getattr(self, "drop_path", None) is not None: - with tf.name_scope(self.drop_path.name): - self.drop_path.build(None) + super().build(input_shape) def call( self, @@ -736,15 +650,6 @@ def call( attentions=all_self_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "layer", None) is not None: - for layer in self.layer: - with tf.name_scope(layer.name): - layer.build(None) - @keras_serializable class TFData2VecVisionMainLayer(tf.keras.layers.Layer): @@ -836,23 +741,6 @@ def call( attentions=encoder_outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embeddings", None) is not None: - with tf.name_scope(self.embeddings.name): - self.embeddings.build(None) - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - if getattr(self, "layernorm", None) is not None: - with tf.name_scope(self.layernorm.name): - self.layernorm.build(None) - if getattr(self, "pooler", None) is not None: - with tf.name_scope(self.pooler.name): - self.pooler.build(None) - class TFData2VecVisionPooler(tf.keras.layers.Layer): def __init__(self, config: Data2VecVisionConfig, **kwargs): @@ -874,14 +762,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return pooled_output - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "layernorm", None) is not None: - with tf.name_scope(self.layernorm.name): - self.layernorm.build(None) - class TFData2VecVisionPreTrainedModel(TFPreTrainedModel): """ @@ -1016,14 +896,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "data2vec_vision", None) is not None: - with tf.name_scope(self.data2vec_vision.name): - self.data2vec_vision.build(None) - @add_start_docstrings( """ @@ -1045,7 
+917,6 @@ def __init__(self, config: Data2VecVisionConfig, *inputs, **kwargs): kernel_initializer=get_initializer(config.initializer_range), name="classifier", ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(DATA2VEC_VISION_INPUTS_DOCSTRING) @@ -1097,17 +968,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "data2vec_vision", None) is not None: - with tf.name_scope(self.data2vec_vision.name): - self.data2vec_vision.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) - class TFData2VecVisionConvModule(tf.keras.layers.Layer): """ @@ -1146,17 +1006,6 @@ def call(self, input: tf.Tensor) -> tf.Tensor: output = self.activation(output) return output - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "conv", None) is not None: - with tf.name_scope(self.conv.name): - self.conv.build(self.in_channels) - if getattr(self, "bn", None) is not None: - with tf.name_scope(self.bn.name): - self.bn.build(None) - # Copied from: # https://gist.github.com/Rocketknight1/43abbe6e73f1008e6e459486e01e0ceb @@ -1371,23 +1220,6 @@ def call(self, encoder_hidden_states: tf.Tensor) -> tf.Tensor: return output - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.channels) - if getattr(self, "psp_modules", None) is not None: - with tf.name_scope(self.psp_modules.name): - self.psp_modules.build(None) - if getattr(self, "bottleneck", None) is not None: - with tf.name_scope(self.bottleneck.name): - self.bottleneck.build(None) - if getattr(self, "fpn_bottleneck", None) is not None: - with tf.name_scope(self.fpn_bottleneck.name): - self.fpn_bottleneck.build(None) - class TFData2VecVisionFCNHead(tf.keras.layers.Layer): """ @@ -1466,14 +1298,6 @@ def call(self, encoder_hidden_states: tf.Tensor) -> tf.Tensor: output = self.classifier(output) return output - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.channels) - @add_start_docstrings( """ @@ -1633,25 +1457,3 @@ def reshape_features(x): hidden_states=outputs.hidden_states if output_hidden_states else None, attentions=outputs.attentions, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "data2vec_vision", None) is not None: - with tf.name_scope(self.data2vec_vision.name): - self.data2vec_vision.build(None) - if getattr(self, "decode_head", None) is not None: - with tf.name_scope(self.decode_head.name): - self.decode_head.build(None) - if getattr(self, "auxiliary_head", None) is not None: - with tf.name_scope(self.auxiliary_head.name): - self.auxiliary_head.build(None) - if getattr(self, "fpn1", None) is not None: - for layer in self.fpn1: - with tf.name_scope(layer.name): - layer.build(None) - if getattr(self, "fpn2", None) is not None: - for layer in self.fpn2: - with tf.name_scope(layer.name): - layer.build(None) diff --git a/src/transformers/models/deberta/modeling_tf_deberta.py b/src/transformers/models/deberta/modeling_tf_deberta.py index d29e1d079ce86d..a470cb5493e01b 100644 --- 
a/src/transformers/models/deberta/modeling_tf_deberta.py +++ b/src/transformers/models/deberta/modeling_tf_deberta.py @@ -167,7 +167,6 @@ def __init__(self, config: DebertaConfig, **kwargs): self.dense = tf.keras.layers.Dense(config.hidden_size, name="dense") self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="dropout") - self.config = config def call(self, hidden_states, input_tensor, training: bool = False): hidden_states = self.dense(hidden_states) @@ -175,20 +174,6 @@ def call(self, hidden_states, input_tensor, training: bool = False): hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - if getattr(self, "dropout", None) is not None: - with tf.name_scope(self.dropout.name): - self.dropout.build(None) - class TFDebertaAttention(tf.keras.layers.Layer): def __init__(self, config: DebertaConfig, **kwargs): @@ -226,17 +211,6 @@ def call( return output - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self", None) is not None: - with tf.name_scope(self.self.name): - self.self.build(None) - if getattr(self, "dense_output", None) is not None: - with tf.name_scope(self.dense_output.name): - self.dense_output.build(None) - class TFDebertaIntermediate(tf.keras.layers.Layer): def __init__(self, config: DebertaConfig, **kwargs): @@ -250,7 +224,6 @@ def __init__(self, config: DebertaConfig, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -258,14 +231,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - class TFDebertaOutput(tf.keras.layers.Layer): def __init__(self, config: DebertaConfig, **kwargs): @@ -276,7 +241,6 @@ def __init__(self, config: DebertaConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="dropout") - self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -285,20 +249,6 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.intermediate_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - if getattr(self, "dropout", None) is not None: - with 
tf.name_scope(self.dropout.name): - self.dropout.build(None) - class TFDebertaLayer(tf.keras.layers.Layer): def __init__(self, config: DebertaConfig, **kwargs): @@ -336,20 +286,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "attention", None) is not None: - with tf.name_scope(self.attention.name): - self.attention.build(None) - if getattr(self, "intermediate", None) is not None: - with tf.name_scope(self.intermediate.name): - self.intermediate.build(None) - if getattr(self, "bert_output", None) is not None: - with tf.name_scope(self.bert_output.name): - self.bert_output.build(None) - class TFDebertaEncoder(tf.keras.layers.Layer): def __init__(self, config: DebertaConfig, **kwargs): @@ -363,21 +299,14 @@ def __init__(self, config: DebertaConfig, **kwargs): if self.max_relative_positions < 1: self.max_relative_positions = config.max_position_embeddings - def build(self, input_shape=None): + def build(self, input_shape): if self.relative_attention: self.rel_embeddings = self.add_weight( name="rel_embeddings.weight", shape=[self.max_relative_positions * 2, self.config.hidden_size], initializer=get_initializer(self.config.initializer_range), ) - return - if self.built: - return - self.built = True - if getattr(self, "layer", None) is not None: - for layer in self.layer: - with tf.name_scope(layer.name): - layer.build(None) + return super().build(input_shape) def get_rel_embedding(self): rel_embeddings = self.rel_embeddings if self.relative_attention else None @@ -806,7 +735,7 @@ def __init__(self, config, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="dropout") - def build(self, input_shape=None): + def build(self, input_shape: tf.TensorShape): with tf.name_scope("word_embeddings"): self.weight = self.add_weight( name="weight", @@ -834,15 +763,7 @@ def build(self, input_shape=None): else: self.position_embeddings = None - if self.built: - return - self.built = True - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - if getattr(self, "dropout", None) is not None: - with tf.name_scope(self.dropout.name): - self.dropout.build(None) + super().build(input_shape) def call( self, @@ -917,7 +838,6 @@ def __init__(self, config: DebertaConfig, **kwargs): else: self.transform_act_fn = config.hidden_act self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -926,17 +846,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.embedding_size]) - class TFDebertaLMPredictionHead(tf.keras.layers.Layer): def __init__(self, config: DebertaConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): @@ -951,15 +860,10 @@ def __init__(self, config: DebertaConfig, input_embeddings: tf.keras.layers.Laye # an output-only bias for each token. 
self.input_embeddings = input_embeddings - def build(self, input_shape=None): + def build(self, input_shape: tf.TensorShape): self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") - if self.built: - return - self.built = True - if getattr(self, "transform", None) is not None: - with tf.name_scope(self.transform.name): - self.transform.build(None) + super().build(input_shape) def get_output_embeddings(self) -> tf.keras.layers.Layer: return self.input_embeddings @@ -996,14 +900,6 @@ def call(self, sequence_output: tf.Tensor) -> tf.Tensor: return prediction_scores - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "predictions", None) is not None: - with tf.name_scope(self.predictions.name): - self.predictions.build(None) - # @keras_serializable class TFDebertaMainLayer(tf.keras.layers.Layer): @@ -1088,17 +984,6 @@ def call( attentions=encoder_outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embeddings", None) is not None: - with tf.name_scope(self.embeddings.name): - self.embeddings.build(None) - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - class TFDebertaPreTrainedModel(TFPreTrainedModel): """ @@ -1239,14 +1124,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "deberta", None) is not None: - with tf.name_scope(self.deberta.name): - self.deberta.build(None) - @add_start_docstrings("""DeBERTa Model with a `language modeling` head on top.""", DEBERTA_START_DOCSTRING) class TFDebertaForMaskedLM(TFDebertaPreTrainedModel, TFMaskedLanguageModelingLoss): @@ -1317,17 +1194,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "deberta", None) is not None: - with tf.name_scope(self.deberta.name): - self.deberta.build(None) - if getattr(self, "mlm", None) is not None: - with tf.name_scope(self.mlm.name): - self.mlm.build(None) - @add_start_docstrings( """ @@ -1410,23 +1276,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "deberta", None) is not None: - with tf.name_scope(self.deberta.name): - self.deberta.build(None) - if getattr(self, "pooler", None) is not None: - with tf.name_scope(self.pooler.name): - self.pooler.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.output_dim) - if getattr(self, "dropout", None) is not None: - with tf.name_scope(self.dropout.name): - self.dropout.build(None) # TODO Matt might be wrong - @add_start_docstrings( """ @@ -1446,7 +1295,6 @@ def __init__(self, config: DebertaConfig, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1499,17 +1347,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "deberta", None) is not None: - with tf.name_scope(self.deberta.name): - self.deberta.build(None) - if 
getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1528,7 +1365,6 @@ def __init__(self, config: DebertaConfig, *inputs, **kwargs): self.qa_outputs = tf.keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1595,14 +1431,3 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "deberta", None) is not None: - with tf.name_scope(self.deberta.name): - self.deberta.build(None) - if getattr(self, "qa_outputs", None) is not None: - with tf.name_scope(self.qa_outputs.name): - self.qa_outputs.build(self.config.hidden_size) diff --git a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py index 387d82b0fe6f04..67414267546427 100644 --- a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py @@ -150,7 +150,6 @@ def __init__(self, config: DebertaV2Config, **kwargs): self.dense = tf.keras.layers.Dense(config.hidden_size, name="dense") self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = TFDebertaV2StableDropout(config.hidden_dropout_prob, name="dropout") - self.config = config def call(self, hidden_states, input_tensor, training: bool = False): hidden_states = self.dense(hidden_states) @@ -158,20 +157,6 @@ def call(self, hidden_states, input_tensor, training: bool = False): hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - if getattr(self, "dropout", None) is not None: - with tf.name_scope(self.dropout.name): - self.dropout.build(None) - # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaAttention with Deberta->DebertaV2 class TFDebertaV2Attention(tf.keras.layers.Layer): @@ -210,17 +195,6 @@ def call( return output - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self", None) is not None: - with tf.name_scope(self.self.name): - self.self.build(None) - if getattr(self, "dense_output", None) is not None: - with tf.name_scope(self.dense_output.name): - self.dense_output.build(None) - # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaIntermediate with Deberta->DebertaV2 class TFDebertaV2Intermediate(tf.keras.layers.Layer): @@ -235,7 +209,6 @@ def __init__(self, config: DebertaV2Config, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -243,14 +216,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return hidden_states - def build(self, 
input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaOutput with Deberta->DebertaV2 class TFDebertaV2Output(tf.keras.layers.Layer): @@ -262,7 +227,6 @@ def __init__(self, config: DebertaV2Config, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = TFDebertaV2StableDropout(config.hidden_dropout_prob, name="dropout") - self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -271,20 +235,6 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.intermediate_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - if getattr(self, "dropout", None) is not None: - with tf.name_scope(self.dropout.name): - self.dropout.build(None) - # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaLayer with Deberta->DebertaV2 class TFDebertaV2Layer(tf.keras.layers.Layer): @@ -323,20 +273,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "attention", None) is not None: - with tf.name_scope(self.attention.name): - self.attention.build(None) - if getattr(self, "intermediate", None) is not None: - with tf.name_scope(self.intermediate.name): - self.intermediate.build(None) - if getattr(self, "bert_output", None) is not None: - with tf.name_scope(self.bert_output.name): - self.bert_output.build(None) - class TFDebertaV2ConvLayer(tf.keras.layers.Layer): def __init__(self, config: DebertaV2Config, **kwargs): @@ -418,24 +354,14 @@ def __init__(self, config: DebertaV2Config, **kwargs): self.conv = TFDebertaV2ConvLayer(config, name="conv") if getattr(config, "conv_kernel_size", 0) > 0 else None - def build(self, input_shape=None): + def build(self, input_shape): if self.relative_attention: self.rel_embeddings = self.add_weight( name="rel_embeddings.weight", shape=[self.pos_ebd_size, self.config.hidden_size], initializer=get_initializer(self.config.initializer_range), ) - return - if self.built: - return - self.built = True - if getattr(self, "conv", None) is not None: - with tf.name_scope(self.conv.name): - self.conv.build(None) - if getattr(self, "layer", None) is not None: - for layer in self.layer: - with tf.name_scope(layer.name): - layer.build(None) + return super().build(input_shape) def get_rel_embedding(self): rel_embeddings = self.rel_embeddings if self.relative_attention else None @@ -899,7 +825,7 @@ def __init__(self, config, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = TFDebertaV2StableDropout(config.hidden_dropout_prob, name="dropout") - def build(self, input_shape=None): + def build(self, input_shape: tf.TensorShape): with tf.name_scope("word_embeddings"): self.weight = self.add_weight( name="weight", @@ -927,15 +853,7 @@ def build(self, input_shape=None): else: 
self.position_embeddings = None - if self.built: - return - self.built = True - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - if getattr(self, "dropout", None) is not None: - with tf.name_scope(self.dropout.name): - self.dropout.build(None) + super().build(input_shape) def call( self, @@ -1011,7 +929,6 @@ def __init__(self, config: DebertaV2Config, **kwargs): else: self.transform_act_fn = config.hidden_act self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -1020,17 +937,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.embedding_size]) - # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaLMPredictionHead with Deberta->DebertaV2 class TFDebertaV2LMPredictionHead(tf.keras.layers.Layer): @@ -1046,15 +952,10 @@ def __init__(self, config: DebertaV2Config, input_embeddings: tf.keras.layers.La # an output-only bias for each token. self.input_embeddings = input_embeddings - def build(self, input_shape=None): + def build(self, input_shape: tf.TensorShape): self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") - if self.built: - return - self.built = True - if getattr(self, "transform", None) is not None: - with tf.name_scope(self.transform.name): - self.transform.build(None) + super().build(input_shape) def get_output_embeddings(self) -> tf.keras.layers.Layer: return self.input_embeddings @@ -1092,14 +993,6 @@ def call(self, sequence_output: tf.Tensor) -> tf.Tensor: return prediction_scores - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "predictions", None) is not None: - with tf.name_scope(self.predictions.name): - self.predictions.build(None) - # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaMainLayer with Deberta->DebertaV2 class TFDebertaV2MainLayer(tf.keras.layers.Layer): @@ -1184,17 +1077,6 @@ def call( attentions=encoder_outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embeddings", None) is not None: - with tf.name_scope(self.embeddings.name): - self.embeddings.build(None) - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaPreTrainedModel with Deberta->DebertaV2 class TFDebertaV2PreTrainedModel(TFPreTrainedModel): @@ -1337,14 +1219,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "deberta", None) is not None: - with tf.name_scope(self.deberta.name): - self.deberta.build(None) - @add_start_docstrings("""DeBERTa Model with a `language modeling` head on top.""", DEBERTA_START_DOCSTRING) # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaForMaskedLM with Deberta->DebertaV2 @@ 
-1416,17 +1290,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "deberta", None) is not None: - with tf.name_scope(self.deberta.name): - self.deberta.build(None) - if getattr(self, "mlm", None) is not None: - with tf.name_scope(self.mlm.name): - self.mlm.build(None) - @add_start_docstrings( """ @@ -1510,23 +1373,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "deberta", None) is not None: - with tf.name_scope(self.deberta.name): - self.deberta.build(None) - if getattr(self, "pooler", None) is not None: - with tf.name_scope(self.pooler.name): - self.pooler.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.output_dim) - if getattr(self, "dropout", None) is not None: - with tf.name_scope(self.dropout.name): - self.dropout.build(None) # TODO Matt might be wrong - @add_start_docstrings( """ @@ -1547,7 +1393,6 @@ def __init__(self, config: DebertaV2Config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1600,17 +1445,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "deberta", None) is not None: - with tf.name_scope(self.deberta.name): - self.deberta.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1630,7 +1464,6 @@ def __init__(self, config: DebertaV2Config, *inputs, **kwargs): self.qa_outputs = tf.keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1698,17 +1531,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "deberta", None) is not None: - with tf.name_scope(self.deberta.name): - self.deberta.build(None) - if getattr(self, "qa_outputs", None) is not None: - with tf.name_scope(self.qa_outputs.name): - self.qa_outputs.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1808,17 +1630,3 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "deberta", None) is not None: - with tf.name_scope(self.deberta.name): - self.deberta.build(None) - if getattr(self, "pooler", None) is not None: - with tf.name_scope(self.pooler.name): - self.pooler.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.output_dim) diff --git a/src/transformers/models/deit/modeling_tf_deit.py b/src/transformers/models/deit/modeling_tf_deit.py index b3477015693f93..a8ba5c9a8a6634 100644 --- a/src/transformers/models/deit/modeling_tf_deit.py +++ b/src/transformers/models/deit/modeling_tf_deit.py @@ -113,7 
+113,7 @@ def __init__(self, config: DeiTConfig, use_mask_token: bool = False, **kwargs) - self.patch_embeddings = TFDeiTPatchEmbeddings(config=config, name="patch_embeddings") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob, name="dropout") - def build(self, input_shape=None): + def build(self, input_shape: tf.TensorShape): self.cls_token = self.add_weight( shape=(1, 1, self.config.hidden_size), initializer=tf.keras.initializers.zeros(), @@ -141,16 +141,7 @@ def build(self, input_shape=None): trainable=True, name="position_embeddings", ) - - if self.built: - return - self.built = True - if getattr(self, "patch_embeddings", None) is not None: - with tf.name_scope(self.patch_embeddings.name): - self.patch_embeddings.build(None) - if getattr(self, "dropout", None) is not None: - with tf.name_scope(self.dropout.name): - self.dropout.build(None) + super().build(input_shape) def call( self, pixel_values: tf.Tensor, bool_masked_pos: tf.Tensor | None = None, training: bool = False @@ -212,14 +203,6 @@ def call(self, pixel_values: tf.Tensor) -> tf.Tensor: x = tf.reshape(x, (batch_size, height * width, num_channels)) return x - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "projection", None) is not None: - with tf.name_scope(self.projection.name): - self.projection.build(self.num_channels) - # Copied from transformers.models.vit.modeling_tf_vit.TFViTSelfAttention with ViT->DeiT class TFDeiTSelfAttention(tf.keras.layers.Layer): @@ -247,7 +230,6 @@ def __init__(self, config: DeiTConfig, **kwargs): units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" ) self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) - self.config = config def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] @@ -297,20 +279,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "query", None) is not None: - with tf.name_scope(self.query.name): - self.query.build(self.config.hidden_size) - if getattr(self, "key", None) is not None: - with tf.name_scope(self.key.name): - self.key.build(self.config.hidden_size) - if getattr(self, "value", None) is not None: - with tf.name_scope(self.value.name): - self.value.build(self.config.hidden_size) - # Copied from transformers.models.vit.modeling_tf_vit.TFViTSelfOutput with ViT->DeiT class TFDeiTSelfOutput(tf.keras.layers.Layer): @@ -326,7 +294,6 @@ def __init__(self, config: DeiTConfig, **kwargs): units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -334,14 +301,6 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - # Copied from transformers.models.vit.modeling_tf_vit.TFViTAttention with ViT->DeiT class TFDeiTAttention(tf.keras.layers.Layer): @@ -371,17 +330,6 @@ 
def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attention", None) is not None: - with tf.name_scope(self.self_attention.name): - self.self_attention.build(None) - if getattr(self, "dense_output", None) is not None: - with tf.name_scope(self.dense_output.name): - self.dense_output.build(None) - # Copied from transformers.models.vit.modeling_tf_vit.TFViTIntermediate with ViT->DeiT class TFDeiTIntermediate(tf.keras.layers.Layer): @@ -396,7 +344,6 @@ def __init__(self, config: DeiTConfig, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -404,14 +351,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - # Copied from transformers.models.vit.modeling_tf_vit.TFViTOutput with ViT->DeiT class TFDeiTOutput(tf.keras.layers.Layer): @@ -422,7 +361,6 @@ def __init__(self, config: DeiTConfig, **kwargs): units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -431,14 +369,6 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.intermediate_size) - class TFDeiTLayer(tf.keras.layers.Layer): """This corresponds to the Block class in the timm implementation.""" @@ -456,7 +386,6 @@ def __init__(self, config: DeiTConfig, **kwargs): self.layernorm_after = tf.keras.layers.LayerNormalization( epsilon=config.layer_norm_eps, name="layernorm_after" ) - self.config = config def call( self, @@ -490,26 +419,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "attention", None) is not None: - with tf.name_scope(self.attention.name): - self.attention.build(None) - if getattr(self, "intermediate", None) is not None: - with tf.name_scope(self.intermediate.name): - self.intermediate.build(None) - if getattr(self, "deit_output", None) is not None: - with tf.name_scope(self.deit_output.name): - self.deit_output.build(None) - if getattr(self, "layernorm_before", None) is not None: - with tf.name_scope(self.layernorm_before.name): - self.layernorm_before.build([None, None, self.config.hidden_size]) - if getattr(self, "layernorm_after", None) is not None: - with tf.name_scope(self.layernorm_after.name): - self.layernorm_after.build([None, None, self.config.hidden_size]) - # Copied from transformers.models.vit.modeling_tf_vit.TFViTEncoder with ViT->DeiT class TFDeiTEncoder(tf.keras.layers.Layer): @@ -556,15 +465,6 @@ def call( last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions ) - def build(self, input_shape=None): - if self.built: - return - 
self.built = True - if getattr(self, "layer", None) is not None: - for layer in self.layer: - with tf.name_scope(layer.name): - layer.build(None) - @keras_serializable class TFDeiTMainLayer(tf.keras.layers.Layer): @@ -656,23 +556,6 @@ def call( attentions=encoder_outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embeddings", None) is not None: - with tf.name_scope(self.embeddings.name): - self.embeddings.build(None) - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - if getattr(self, "layernorm", None) is not None: - with tf.name_scope(self.layernorm.name): - self.layernorm.build([None, None, self.config.hidden_size]) - if getattr(self, "pooler", None) is not None: - with tf.name_scope(self.pooler.name): - self.pooler.build(None) - # Copied from transformers.models.vit.modeling_tf_vit.TFViTPreTrainedModel with ViT->DeiT all-casing class TFDeiTPreTrainedModel(TFPreTrainedModel): @@ -764,14 +647,6 @@ def call( ) return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "deit", None) is not None: - with tf.name_scope(self.deit.name): - self.deit.build(None) - # Copied from transformers.models.vit.modeling_tf_vit.TFViTPooler with ViT->DeiT class TFDeiTPooler(tf.keras.layers.Layer): @@ -784,7 +659,6 @@ def __init__(self, config: DeiTConfig, **kwargs): activation="tanh", name="dense", ) - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: # We "pool" the model by simply taking the hidden state corresponding @@ -794,14 +668,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return pooled_output - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - class TFDeitPixelShuffle(tf.keras.layers.Layer): """TF layer implementation of torch.nn.PixelShuffle""" @@ -956,17 +822,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "deit", None) is not None: - with tf.name_scope(self.deit.name): - self.deit.build(None) - if getattr(self, "decoder", None) is not None: - with tf.name_scope(self.decoder.name): - self.decoder.build(None) - @add_start_docstrings( """ @@ -1064,17 +919,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "deit", None) is not None: - with tf.name_scope(self.deit.name): - self.deit.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(None) - @add_start_docstrings( """ @@ -1154,17 +998,3 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "deit", None) is not None: - with tf.name_scope(self.deit.name): - self.deit.build(None) - if getattr(self, "cls_classifier", None) is not None: - with tf.name_scope(self.cls_classifier.name): - self.cls_classifier.build(None) - if getattr(self, "distillation_classifier", None) is not None: - with tf.name_scope(self.distillation_classifier.name): - self.distillation_classifier.build(None) diff --git a/src/transformers/models/distilbert/modeling_tf_distilbert.py 
b/src/transformers/models/distilbert/modeling_tf_distilbert.py index f2b41f8ebd0ea8..6b0e1b0f3febcf 100644 --- a/src/transformers/models/distilbert/modeling_tf_distilbert.py +++ b/src/transformers/models/distilbert/modeling_tf_distilbert.py @@ -84,7 +84,7 @@ def __init__(self, config, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.dropout) - def build(self, input_shape=None): + def build(self, input_shape: tf.TensorShape): with tf.name_scope("word_embeddings"): self.weight = self.add_weight( name="weight", @@ -99,12 +99,7 @@ def build(self, input_shape=None): initializer=get_initializer(initializer_range=self.initializer_range), ) - if self.built: - return - self.built = True - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.dim]) + super().build(input_shape) def call(self, input_ids=None, position_ids=None, inputs_embeds=None, training=False): """ @@ -157,7 +152,6 @@ def __init__(self, config, **kwargs): ) self.pruned_heads = set() - self.config = config def prune_heads(self, heads): raise NotImplementedError @@ -218,23 +212,6 @@ def unshape(x): else: return (context,) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "q_lin", None) is not None: - with tf.name_scope(self.q_lin.name): - self.q_lin.build(self.config.dim) - if getattr(self, "k_lin", None) is not None: - with tf.name_scope(self.k_lin.name): - self.k_lin.build(self.config.dim) - if getattr(self, "v_lin", None) is not None: - with tf.name_scope(self.v_lin.name): - self.v_lin.build(self.config.dim) - if getattr(self, "out_lin", None) is not None: - with tf.name_scope(self.out_lin.name): - self.out_lin.build(self.config.dim) - class TFFFN(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -247,7 +224,6 @@ def __init__(self, config, **kwargs): config.dim, kernel_initializer=get_initializer(config.initializer_range), name="lin2" ) self.activation = get_tf_activation(config.activation) - self.config = config def call(self, input, training=False): x = self.lin1(input) @@ -256,17 +232,6 @@ def call(self, input, training=False): x = self.dropout(x, training=training) return x - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "lin1", None) is not None: - with tf.name_scope(self.lin1.name): - self.lin1.build(self.config.dim) - if getattr(self, "lin2", None) is not None: - with tf.name_scope(self.lin2.name): - self.lin2.build(self.config.hidden_dim) - class TFTransformerBlock(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -288,7 +253,6 @@ def __init__(self, config, **kwargs): self.ffn = TFFFN(config, name="ffn") self.output_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="output_layer_norm") - self.config = config def call(self, x, attn_mask, head_mask, output_attentions, training=False): # removed: src_enc=None, src_len=None """ @@ -317,23 +281,6 @@ def call(self, x, attn_mask, head_mask, output_attentions, training=False): # r output = (sa_weights,) + output return output - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "attention", None) is not None: - with tf.name_scope(self.attention.name): - self.attention.build(None) - if getattr(self, "sa_layer_norm", None) is not None: - with tf.name_scope(self.sa_layer_norm.name): - 
self.sa_layer_norm.build([None, None, self.config.dim]) - if getattr(self, "ffn", None) is not None: - with tf.name_scope(self.ffn.name): - self.ffn.build(None) - if getattr(self, "output_layer_norm", None) is not None: - with tf.name_scope(self.output_layer_norm.name): - self.output_layer_norm.build([None, None, self.config.dim]) - class TFTransformer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -389,15 +336,6 @@ def call(self, x, attn_mask, head_mask, output_attentions, output_hidden_states, last_hidden_state=hidden_state, hidden_states=all_hidden_states, attentions=all_attentions ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "layer", None) is not None: - for layer in self.layer: - with tf.name_scope(layer.name): - layer.build(None) - @keras_serializable class TFDistilBertMainLayer(tf.keras.layers.Layer): @@ -474,17 +412,6 @@ def call( return tfmr_output # last-layer hidden-state, (all hidden_states), (all attentions) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embeddings", None) is not None: - with tf.name_scope(self.embeddings.name): - self.embeddings.build(None) - if getattr(self, "transformer", None) is not None: - with tf.name_scope(self.transformer.name): - self.transformer.build(None) - # INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL # class TFDistilBertPreTrainedModel(TFPreTrainedModel): @@ -621,14 +548,6 @@ def call( ) return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "distilbert", None) is not None: - with tf.name_scope(self.distilbert.name): - self.distilbert.build(None) - class TFDistilBertLMHead(tf.keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): @@ -748,23 +667,6 @@ def call( attentions=distilbert_output.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "distilbert", None) is not None: - with tf.name_scope(self.distilbert.name): - self.distilbert.build(None) - if getattr(self, "vocab_transform", None) is not None: - with tf.name_scope(self.vocab_transform.name): - self.vocab_transform.build(self.config.dim) - if getattr(self, "vocab_layer_norm", None) is not None: - with tf.name_scope(self.vocab_layer_norm.name): - self.vocab_layer_norm.build([None, None, self.config.dim]) - if getattr(self, "vocab_projector", None) is not None: - with tf.name_scope(self.vocab_projector.name): - self.vocab_projector.build(None) - @add_start_docstrings( """ @@ -789,7 +691,6 @@ def __init__(self, config, *inputs, **kwargs): config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -845,20 +746,6 @@ def call( attentions=distilbert_output.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "distilbert", None) is not None: - with tf.name_scope(self.distilbert.name): - self.distilbert.build(None) - if getattr(self, "pre_classifier", None) is not None: - with tf.name_scope(self.pre_classifier.name): - self.pre_classifier.build(self.config.dim) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.dim) - 
@add_start_docstrings( """ @@ -877,7 +764,6 @@ def __init__(self, config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -928,17 +814,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "distilbert", None) is not None: - with tf.name_scope(self.distilbert.name): - self.distilbert.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -962,7 +837,6 @@ def __init__(self, config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward( @@ -1034,20 +908,6 @@ def call( attentions=distilbert_output.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "distilbert", None) is not None: - with tf.name_scope(self.distilbert.name): - self.distilbert.build(None) - if getattr(self, "pre_classifier", None) is not None: - with tf.name_scope(self.pre_classifier.name): - self.pre_classifier.build(self.config.dim) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.dim) - @add_start_docstrings( """ @@ -1066,7 +926,6 @@ def __init__(self, config, *inputs, **kwargs): ) assert config.num_labels == 2, f"Incorrect number of labels {config.num_labels} instead of 2" self.dropout = tf.keras.layers.Dropout(config.qa_dropout) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1132,14 +991,3 @@ def call( hidden_states=distilbert_output.hidden_states, attentions=distilbert_output.attentions, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "distilbert", None) is not None: - with tf.name_scope(self.distilbert.name): - self.distilbert.build(None) - if getattr(self, "qa_outputs", None) is not None: - with tf.name_scope(self.qa_outputs.name): - self.qa_outputs.build(self.config.dim) diff --git a/src/transformers/models/dpr/modeling_tf_dpr.py b/src/transformers/models/dpr/modeling_tf_dpr.py index 6197f74570cc8f..53efa41fda5dee 100644 --- a/src/transformers/models/dpr/modeling_tf_dpr.py +++ b/src/transformers/models/dpr/modeling_tf_dpr.py @@ -599,14 +599,6 @@ def call( pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, attentions=outputs.attentions ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "ctx_encoder", None) is not None: - with tf.name_scope(self.ctx_encoder.name): - self.ctx_encoder.build(None) - @add_start_docstrings( "The bare DPRQuestionEncoder transformer outputting pooler outputs as question representations.", @@ -687,14 +679,6 @@ def call( pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, attentions=outputs.attentions ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "question_encoder", None) is not None: - with 
tf.name_scope(self.question_encoder.name): - self.question_encoder.build(None) - @add_start_docstrings( "The bare DPRReader transformer outputting span predictions.", @@ -768,11 +752,3 @@ def call( return_dict=return_dict, training=training, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "span_predictor", None) is not None: - with tf.name_scope(self.span_predictor.name): - self.span_predictor.build(None) diff --git a/src/transformers/models/efficientformer/modeling_tf_efficientformer.py b/src/transformers/models/efficientformer/modeling_tf_efficientformer.py index 68346378892a15..c44a1534287407 100644 --- a/src/transformers/models/efficientformer/modeling_tf_efficientformer.py +++ b/src/transformers/models/efficientformer/modeling_tf_efficientformer.py @@ -100,17 +100,6 @@ def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor: embeddings = self.norm(embeddings, training=training) return embeddings - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "projection", None) is not None: - with tf.name_scope(self.projection.name): - self.projection.build(self.num_channels) - if getattr(self, "norm", None) is not None: - with tf.name_scope(self.norm.name): - self.norm.build(None) - class TFEfficientFormerSelfAttention(tf.keras.layers.Layer): def __init__( @@ -141,7 +130,6 @@ def __init__( units=dim, kernel_initializer=get_initializer(config.initializer_range), name="projection" ) self.resolution = resolution - self.dim = dim def build(self, input_shape: tf.TensorShape) -> None: points = list(itertools.product(range(self.resolution), range(self.resolution))) @@ -172,15 +160,7 @@ def build(self, input_shape: tf.TensorShape) -> None: self.attention_bias_idxs.assign(tf.reshape(tf.cast(idxs, dtype=tf.int32), (num_points, num_points))) - if self.built: - return - self.built = True - if getattr(self, "qkv", None) is not None: - with tf.name_scope(self.qkv.name): - self.qkv.build(self.dim) - if getattr(self, "projection", None) is not None: - with tf.name_scope(self.projection.name): - self.projection.build(self.total_expanded_key_dim) + super().build(input_shape) def call( self, hidden_states: tf.Tensor, output_attentions: bool = False, training: bool = False @@ -245,8 +225,6 @@ def __init__(self, config: EfficientFormerConfig, out_channels: int, **kwargs): ) self.activation = tf.keras.layers.Activation(activation=tf.keras.activations.relu, name="activation") - self.out_channels = out_channels - self.config = config def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor: features = self.batchnorm_before(self.convolution1(self.padding(pixel_values)), training=training) @@ -255,26 +233,6 @@ def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor: features = self.activation(features) return features - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "convolution1", None) is not None: - with tf.name_scope(self.convolution1.name): - self.convolution1.build(self.config.num_channels) - if getattr(self, "batchnorm_before", None) is not None: - with tf.name_scope(self.batchnorm_before.name): - self.batchnorm_before.build(None) - if getattr(self, "convolution2", None) is not None: - with tf.name_scope(self.convolution2.name): - self.convolution2.build(self.out_channels // 2) - if getattr(self, "batchnorm_after", None) is not None: - with tf.name_scope(self.batchnorm_after.name): - 
self.batchnorm_after.build(None) - if getattr(self, "activation", None) is not None: - with tf.name_scope(self.activation.name): - self.activation.build(None) - class TFEfficientFormerPooling(tf.keras.layers.Layer): def __init__(self, pool_size: int, **kwargs): @@ -309,8 +267,6 @@ def __init__( self.linear_out = tf.keras.layers.Dense( units=out_features, kernel_initializer=get_initializer(config.initializer_range), name="linear_out" ) - self.hidden_features = hidden_features - self.in_features = in_features def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.linear_in(inputs=hidden_states) @@ -321,17 +277,6 @@ def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "linear_in", None) is not None: - with tf.name_scope(self.linear_in.name): - self.linear_in.build(self.in_features) - if getattr(self, "linear_out", None) is not None: - with tf.name_scope(self.linear_out.name): - self.linear_out.build(self.hidden_features) - class TFEfficientFormerConvMlp(tf.keras.layers.Layer): def __init__( @@ -373,8 +318,6 @@ def __init__( self.batchnorm_after = tf.keras.layers.BatchNormalization( axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="batchnorm_after" ) - self.hidden_features = hidden_features - self.in_features = in_features def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_state = self.convolution1(hidden_state) @@ -386,23 +329,6 @@ def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_state = self.dropout(hidden_state, training=training) return hidden_state - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "convolution1", None) is not None: - with tf.name_scope(self.convolution1.name): - self.convolution1.build(self.in_features) - if getattr(self, "convolution2", None) is not None: - with tf.name_scope(self.convolution2.name): - self.convolution2.build(self.hidden_features) - if getattr(self, "batchnorm_before", None) is not None: - with tf.name_scope(self.batchnorm_before.name): - self.batchnorm_before.build(None) - if getattr(self, "batchnorm_after", None) is not None: - with tf.name_scope(self.batchnorm_after.name): - self.batchnorm_after.build(None) - # Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextDropPath with ConvNext->EfficientFormer class TFEfficientFormerDropPath(tf.keras.layers.Layer): @@ -464,7 +390,7 @@ def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0 ) self.config = config - def build(self, input_shape=None): + def build(self, input_shape: tf.TensorShape): self.layer_scale_1 = None self.layer_scale_2 = None @@ -481,25 +407,7 @@ def build(self, input_shape=None): trainable=True, name="layer_scale_2", ) - - if self.built: - return - self.built = True - if getattr(self, "token_mixer", None) is not None: - with tf.name_scope(self.token_mixer.name): - self.token_mixer.build(None) - if getattr(self, "layernorm1", None) is not None: - with tf.name_scope(self.layernorm1.name): - self.layernorm1.build([None, None, self.dim]) - if getattr(self, "layernorm2", None) is not None: - with tf.name_scope(self.layernorm2.name): - self.layernorm2.build([None, None, self.dim]) - if getattr(self, "mlp", None) is not None: - with tf.name_scope(self.mlp.name): - self.mlp.build(None) - if getattr(self, "drop_path", None) is not 
None: - with tf.name_scope(self.drop_path.name): - self.drop_path.build(None) + super().build(input_shape) def call( self, hidden_states: tf.Tensor, output_attentions: bool = False, training: bool = False @@ -568,15 +476,6 @@ def call( return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "blocks", None) is not None: - for layer in self.blocks: - with tf.name_scope(layer.name): - layer.build(None) - class TFEfficientFormerMeta4D(tf.keras.layers.Layer): def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0.0, **kwargs): @@ -596,7 +495,7 @@ def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0 ) self.config = config - def build(self, input_shape=None): + def build(self, input_shape: tf.TensorShape): self.layer_scale_1 = None self.layer_scale_2 = None @@ -613,19 +512,7 @@ def build(self, input_shape=None): trainable=True, name="layer_scale_2", ) - - if self.built: - return - self.built = True - if getattr(self, "token_mixer", None) is not None: - with tf.name_scope(self.token_mixer.name): - self.token_mixer.build(None) - if getattr(self, "mlp", None) is not None: - with tf.name_scope(self.mlp.name): - self.mlp.build(None) - if getattr(self, "drop_path", None) is not None: - with tf.name_scope(self.drop_path.name): - self.drop_path.build(None) + super().build(input_shape) def call(self, hidden_states: tf.Tensor, training: bool = False) -> Tuple[tf.Tensor]: outputs = self.token_mixer(hidden_states) @@ -673,15 +560,6 @@ def call(self, hidden_states: tf.Tensor, training: bool = False) -> Tuple[tf.Ten hidden_states = layer_module(hidden_states=hidden_states, training=training) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "blocks", None) is not None: - for layer in self.blocks: - with tf.name_scope(layer.name): - layer.build(None) - class TFEfficientFormerIntermediateStage(tf.keras.layers.Layer): def __init__(self, config: EfficientFormerConfig, index: int, **kwargs): @@ -692,14 +570,6 @@ def call(self, hidden_states: tf.Tensor, training: bool = False) -> Tuple[tf.Ten hidden_states = self.meta4D_layers(hidden_states=hidden_states, training=training) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "meta4D_layers", None) is not None: - with tf.name_scope(self.meta4D_layers.name): - self.meta4D_layers.build(None) - class TFEfficientFormerLastStage(tf.keras.layers.Layer): def __init__(self, config: EfficientFormerConfig, **kwargs): @@ -719,20 +589,6 @@ def call( return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "meta4D_layers", None) is not None: - with tf.name_scope(self.meta4D_layers.name): - self.meta4D_layers.build(None) - if getattr(self, "flat", None) is not None: - with tf.name_scope(self.flat.name): - self.flat.build(None) - if getattr(self, "meta3D_layers", None) is not None: - with tf.name_scope(self.meta3D_layers.name): - self.meta3D_layers.build(None) - class TFEfficientFormerEncoder(tf.keras.layers.Layer): def __init__(self, config: EfficientFormerConfig, **kwargs): @@ -802,14 +658,6 @@ def call( attentions=all_self_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "last_stage", None) is not None: - with tf.name_scope(self.last_stage.name): - self.last_stage.build(None) - 
@keras_serializable class TFEfficientFormerMainLayer(tf.keras.layers.Layer): @@ -880,20 +728,6 @@ def call( attentions=encoder_outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "patch_embed", None) is not None: - with tf.name_scope(self.patch_embed.name): - self.patch_embed.build(None) - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - if getattr(self, "layernorm", None) is not None: - with tf.name_scope(self.layernorm.name): - self.layernorm.build([None, None, self.config.hidden_sizes[-1]]) - class TFEfficientFormerPreTrainedModel(TFPreTrainedModel): """ @@ -970,14 +804,6 @@ def call( ) return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "efficientformer", None) is not None: - with tf.name_scope(self.efficientformer.name): - self.efficientformer.build(None) - @add_start_docstrings( """ @@ -1047,17 +873,6 @@ def call( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "efficientformer", None) is not None: - with tf.name_scope(self.efficientformer.name): - self.efficientformer.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(None) - @dataclass class TFEfficientFormerForImageClassificationWithTeacherOutput(ModelOutput): @@ -1169,17 +984,3 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "efficientformer", None) is not None: - with tf.name_scope(self.efficientformer.name): - self.efficientformer.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(None) - if getattr(self, "distillation_classifier", None) is not None: - with tf.name_scope(self.distillation_classifier.name): - self.distillation_classifier.build(None) diff --git a/src/transformers/models/electra/modeling_tf_electra.py b/src/transformers/models/electra/modeling_tf_electra.py index 6a94c6c359e2e5..41c64eed369d6a 100644 --- a/src/transformers/models/electra/modeling_tf_electra.py +++ b/src/transformers/models/electra/modeling_tf_electra.py @@ -103,7 +103,6 @@ def __init__(self, config: ElectraConfig, **kwargs): self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.is_decoder = config.is_decoder - self.config = config def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] @@ -193,20 +192,6 @@ def call( outputs = outputs + (past_key_value,) return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "query", None) is not None: - with tf.name_scope(self.query.name): - self.query.build(self.config.hidden_size) - if getattr(self, "key", None) is not None: - with tf.name_scope(self.key.name): - self.key.build(self.config.hidden_size) - if getattr(self, "value", None) is not None: - with tf.name_scope(self.value.name): - self.value.build(self.config.hidden_size) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Electra class 
TFElectraSelfOutput(tf.keras.layers.Layer): @@ -218,7 +203,6 @@ def __init__(self, config: ElectraConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -227,17 +211,6 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Electra class TFElectraAttention(tf.keras.layers.Layer): @@ -279,17 +252,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attention", None) is not None: - with tf.name_scope(self.self_attention.name): - self.self_attention.build(None) - if getattr(self, "dense_output", None) is not None: - with tf.name_scope(self.dense_output.name): - self.dense_output.build(None) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Electra class TFElectraIntermediate(tf.keras.layers.Layer): @@ -304,7 +266,6 @@ def __init__(self, config: ElectraConfig, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -312,14 +273,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Electra class TFElectraOutput(tf.keras.layers.Layer): @@ -331,7 +284,6 @@ def __init__(self, config: ElectraConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -340,17 +292,6 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.intermediate_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Electra class TFElectraLayer(tf.keras.layers.Layer): @@ -438,20 +379,6 @@ def call( return outputs - def build(self, input_shape=None): - if 
self.built: - return - self.built = True - if getattr(self, "attention", None) is not None: - with tf.name_scope(self.attention.name): - self.attention.build(None) - if getattr(self, "intermediate", None) is not None: - with tf.name_scope(self.intermediate.name): - self.intermediate.build(None) - if getattr(self, "bert_output", None) is not None: - with tf.name_scope(self.bert_output.name): - self.bert_output.build(None) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Electra class TFElectraEncoder(tf.keras.layers.Layer): @@ -522,15 +449,6 @@ def call( cross_attentions=all_cross_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "layer", None) is not None: - for layer in self.layer: - with tf.name_scope(layer.name): - layer.build(None) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Electra class TFElectraPooler(tf.keras.layers.Layer): @@ -567,7 +485,7 @@ def __init__(self, config: ElectraConfig, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def build(self, input_shape=None): + def build(self, input_shape: tf.TensorShape): with tf.name_scope("word_embeddings"): self.weight = self.add_weight( name="weight", @@ -589,12 +507,7 @@ def build(self, input_shape=None): initializer=get_initializer(self.initializer_range), ) - if self.built: - return - self.built = True - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.embedding_size]) + super().build(input_shape) # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call def call( @@ -653,17 +566,6 @@ def call(self, discriminator_hidden_states, training=False): return logits - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "dense_prediction", None) is not None: - with tf.name_scope(self.dense_prediction.name): - self.dense_prediction.build(self.config.hidden_size) - class TFElectraGeneratorPredictions(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -671,7 +573,6 @@ def __init__(self, config, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dense = tf.keras.layers.Dense(config.embedding_size, name="dense") - self.config = config def call(self, generator_hidden_states, training=False): hidden_states = self.dense(generator_hidden_states) @@ -680,17 +581,6 @@ def call(self, generator_hidden_states, training=False): return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.embedding_size]) - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - class TFElectraPreTrainedModel(TFPreTrainedModel): """ @@ -891,17 +781,6 @@ def call( return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embeddings", None) is not None: - with tf.name_scope(self.embeddings.name): - self.embeddings.build(None) 
- if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - @dataclass class TFElectraForPreTrainingOutput(ModelOutput): @@ -1098,14 +977,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "electra", None) is not None: - with tf.name_scope(self.electra.name): - self.electra.build(None) - @add_start_docstrings( """ @@ -1178,17 +1049,6 @@ def call( attentions=discriminator_hidden_states.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "electra", None) is not None: - with tf.name_scope(self.electra.name): - self.electra.build(None) - if getattr(self, "discriminator_predictions", None) is not None: - with tf.name_scope(self.discriminator_predictions.name): - self.discriminator_predictions.build(None) - class TFElectraMaskedLMHead(tf.keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): @@ -1317,20 +1177,6 @@ def call( attentions=generator_hidden_states.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "electra", None) is not None: - with tf.name_scope(self.electra.name): - self.electra.build(None) - if getattr(self, "generator_predictions", None) is not None: - with tf.name_scope(self.generator_predictions.name): - self.generator_predictions.build(None) - if getattr(self, "generator_lm_head", None) is not None: - with tf.name_scope(self.generator_lm_head.name): - self.generator_lm_head.build(None) - class TFElectraClassificationHead(tf.keras.layers.Layer): """Head for sentence-level classification tasks.""" @@ -1350,7 +1196,6 @@ def __init__(self, config, **kwargs): self.out_proj = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" ) - self.config = config def call(self, inputs, **kwargs): x = inputs[:, 0, :] # take <s> token (equiv.
to [CLS]) @@ -1362,17 +1207,6 @@ def call(self, inputs, **kwargs): return x - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "out_proj", None) is not None: - with tf.name_scope(self.out_proj.name): - self.out_proj.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1444,17 +1278,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "electra", None) is not None: - with tf.name_scope(self.electra.name): - self.electra.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(None) - @add_start_docstrings( """ @@ -1474,7 +1297,6 @@ def __init__(self, config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @@ -1548,20 +1370,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "electra", None) is not None: - with tf.name_scope(self.electra.name): - self.electra.build(None) - if getattr(self, "sequence_summary", None) is not None: - with tf.name_scope(self.sequence_summary.name): - self.sequence_summary.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1583,7 +1391,6 @@ def __init__(self, config, **kwargs): self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1641,17 +1448,6 @@ def call( attentions=discriminator_hidden_states.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "electra", None) is not None: - with tf.name_scope(self.electra.name): - self.electra.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1669,7 +1465,6 @@ def __init__(self, config, *inputs, **kwargs): self.qa_outputs = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1746,14 +1541,3 @@ def call( hidden_states=discriminator_hidden_states.hidden_states, attentions=discriminator_hidden_states.attentions, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "electra", None) is not None: - with tf.name_scope(self.electra.name): - self.electra.build(None) - if getattr(self, "qa_outputs", None) is not None: - with tf.name_scope(self.qa_outputs.name): - self.qa_outputs.build(self.config.hidden_size) diff --git a/src/transformers/models/esm/modeling_tf_esm.py 
b/src/transformers/models/esm/modeling_tf_esm.py index cb59a226702cfa..3e9223087ba9fc 100644 --- a/src/transformers/models/esm/modeling_tf_esm.py +++ b/src/transformers/models/esm/modeling_tf_esm.py @@ -149,15 +149,10 @@ def __init__( self.in_features = in_features self.regression = Dense(1, use_bias=bias, activation="sigmoid", name="regression") - def build(self, input_shape=None): + def build(self, input_shape): + super().build(input_shape) with tf.name_scope("regression"): self.regression.build((None, self.in_features)) - if self.built: - return - self.built = True - if getattr(self, "regression", None) is not None: - with tf.name_scope(self.regression.name): - self.regression.build(self.in_features) def call(self, tokens, attentions): # remove eos token attentions @@ -273,17 +268,6 @@ def create_position_ids_from_inputs_embeds(self, inputs_embeds): ) return tf.broadcast_to(tf.expand_dims(position_ids, 0), input_shape) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "word_embeddings", None) is not None: - with tf.name_scope(self.word_embeddings.name): - self.word_embeddings.build(None) - if getattr(self, "position_embeddings", None) is not None: - with tf.name_scope(self.position_embeddings.name): - self.position_embeddings.build(None) - class TFEsmSelfAttention(Layer): def __init__(self, config, position_embedding_type=None, name=None): @@ -322,7 +306,6 @@ def __init__(self, config, position_embedding_type=None, name=None): self.rotary_embeddings = TFRotaryEmbedding(dim=self.attention_head_size, name="rotary_embeddings") self.is_decoder = config.is_decoder - self.config = config def transpose_for_scores(self, x: tf.Tensor) -> tf.Tensor: new_x_shape = shape_list(x)[:-1] + [self.num_attention_heads, self.attention_head_size] @@ -432,20 +415,6 @@ def call( outputs = outputs + (past_key_value,) return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "query", None) is not None: - with tf.name_scope(self.query.name): - self.query.build(self.config.hidden_size) - if getattr(self, "key", None) is not None: - with tf.name_scope(self.key.name): - self.key.build(self.config.hidden_size) - if getattr(self, "value", None) is not None: - with tf.name_scope(self.value.name): - self.value.build(self.config.hidden_size) - class TFEsmSelfOutput(Layer): def __init__(self, config, name=None): @@ -454,7 +423,6 @@ def __init__(self, config, name=None): config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.dropout = Dropout(config.hidden_dropout_prob) - self.config = config def call(self, hidden_states, input_tensor, training=False): hidden_states = self.dense(hidden_states) @@ -462,14 +430,6 @@ def call(self, hidden_states, input_tensor, training=False): hidden_states += input_tensor return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - class TFEsmAttention(Layer): def __init__(self, config, name=None): @@ -478,7 +438,6 @@ def __init__(self, config, name=None): self.output_layer = TFEsmSelfOutput(config, name="output") self.pruned_heads = set() self.LayerNorm = LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.config = config def prune_heads(self, heads): raise NotImplementedError @@ -509,20 +468,6 @@ def call( outputs = (attention_output,) + 
self_outputs[1:] # add attentions if we output them return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self", None) is not None: - with tf.name_scope(self.self.name): - self.self.build(None) - if getattr(self, "output_layer", None) is not None: - with tf.name_scope(self.output_layer.name): - self.output_layer.build(None) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - class TFEsmIntermediate(tf.keras.layers.Layer): def __init__(self, config: EsmConfig, **kwargs): @@ -533,21 +478,12 @@ def __init__(self, config: EsmConfig, **kwargs): kernel_initializer=get_initializer(config.initializer_range), name="dense", ) - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = tf.nn.gelu(hidden_states) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - class TFEsmOutput(Layer): def __init__(self, config, name=None): @@ -556,7 +492,6 @@ def __init__(self, config, name=None): config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.dropout = Dropout(config.hidden_dropout_prob) - self.config = config def call(self, hidden_states, input_tensor, training=False): hidden_states = self.dense(hidden_states) @@ -564,14 +499,6 @@ def call(self, hidden_states, input_tensor, training=False): hidden_states += input_tensor return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.intermediate_size) - class TFEsmLayer(Layer): def __init__(self, config, name=None): @@ -588,7 +515,6 @@ def __init__(self, config, name=None): self.intermediate = TFEsmIntermediate(config, name="intermediate") self.output_layer = TFEsmOutput(config, name="output") self.LayerNorm = LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.config = config def call( self, @@ -660,23 +586,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "attention", None) is not None: - with tf.name_scope(self.attention.name): - self.attention.build(None) - if getattr(self, "intermediate", None) is not None: - with tf.name_scope(self.intermediate.name): - self.intermediate.build(None) - if getattr(self, "output_layer", None) is not None: - with tf.name_scope(self.output_layer.name): - self.output_layer.build(None) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - class TFEsmEncoder(Layer): def __init__(self, config, name=None): @@ -756,18 +665,6 @@ def call( cross_attentions=all_cross_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "emb_layer_norm_after", None) is not None: - with tf.name_scope(self.emb_layer_norm_after.name): - self.emb_layer_norm_after.build([None, None, self.config.hidden_size]) - if getattr(self, "layer", None) is not None: - for layer in self.layer: - with tf.name_scope(layer.name): - layer.build(None) - # Copied from 
transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Esm class TFEsmPooler(tf.keras.layers.Layer): @@ -780,7 +677,6 @@ def __init__(self, config: EsmConfig, **kwargs): activation="tanh", name="dense", ) - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: # We "pool" the model by simply taking the hidden state corresponding @@ -790,14 +686,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return pooled_output - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - class TFEsmPreTrainedModel(TFPreTrainedModel): """ @@ -899,24 +787,10 @@ def __init__(self, config, add_pooling_layer=True, name=None, **kwargs): in_features=self.config.num_hidden_layers * self.config.num_attention_heads, bias=True, name="contact_head" ) - def build(self, input_shape=None): + def build(self, input_shape): + super().build(input_shape) with tf.name_scope("contact_head"): self.contact_head.build(input_shape) - if self.built: - return - self.built = True - if getattr(self, "embeddings", None) is not None: - with tf.name_scope(self.embeddings.name): - self.embeddings.build(None) - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - if getattr(self, "pooler", None) is not None: - with tf.name_scope(self.pooler.name): - self.pooler.build(None) - if getattr(self, "contact_head", None) is not None: - with tf.name_scope(self.contact_head.name): - self.contact_head.build(None) def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -1167,14 +1041,6 @@ def call( def predict_contacts(self, tokens, attention_mask): return self.esm.predict_contacts(tokens, attention_mask) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "esm", None) is not None: - with tf.name_scope(self.esm.name): - self.esm.build(None) - @add_start_docstrings("""ESM Model with a `language modeling` head on top.""", ESM_START_DOCSTRING) class TFEsmForMaskedLM(TFEsmPreTrainedModel, TFMaskedLanguageModelingLoss): @@ -1274,17 +1140,6 @@ def call( def predict_contacts(self, tokens, attention_mask): return self.esm.predict_contacts(tokens, attention_mask) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "esm", None) is not None: - with tf.name_scope(self.esm.name): - self.esm.build(None) - if getattr(self, "lm_head", None) is not None: - with tf.name_scope(self.lm_head.name): - self.lm_head.build(None) - class TFEsmLMHead(Layer): """ESM Head for masked language modeling.""" @@ -1307,19 +1162,11 @@ def __init__(self, config, name=None): ) self.config = config - def build(self, input_shape=None): + def build(self, input_shape): + super().build(input_shape) # Separate bias to match the PT model and allow weight cross-loading to work # Put it in the build so it gets the right name when adding it as a weight self.bias = self.add_weight("bias", shape=(self.config.vocab_size,), initializer="zeros", trainable=True) - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "layer_norm", None) is not None: - with tf.name_scope(self.layer_norm.name): - self.layer_norm.build([None, None, self.config.hidden_size]) def get_bias(self): return {"bias": 
self.bias} @@ -1410,17 +1257,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "esm", None) is not None: - with tf.name_scope(self.esm.name): - self.esm.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(None) - @add_start_docstrings( """ @@ -1440,7 +1276,6 @@ def __init__(self, config): self.esm = TFEsmMainLayer(config, add_pooling_layer=False, name="esm") self.dropout = Dropout(config.hidden_dropout_prob) self.classifier = Dense(config.num_labels, name="classifier") - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(ESM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1498,17 +1333,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "esm", None) is not None: - with tf.name_scope(self.esm.name): - self.esm.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) - class TFEsmClassificationHead(Layer): """Head for sentence-level classification tasks.""" @@ -1528,7 +1352,6 @@ def __init__(self, config, name=None): activation="linear", name="out_proj", ) - self.config = config def call(self, features, training=False): x = features[:, 0, :] # take <s> token (equiv. to [CLS]) x = self.dropout(x, training=training) x = self.dense(x) x = self.dropout(x, training=training) x = self.out_proj(x) return x - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "out_proj", None) is not None: - with tf.name_scope(self.out_proj.name): - self.out_proj.build(self.config.hidden_size) - def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0): """ diff --git a/src/transformers/models/flaubert/modeling_tf_flaubert.py b/src/transformers/models/flaubert/modeling_tf_flaubert.py index 820e88f9e12323..375e19360f2a43 100644 --- a/src/transformers/models/flaubert/modeling_tf_flaubert.py +++ b/src/transformers/models/flaubert/modeling_tf_flaubert.py @@ -290,14 +290,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "transformer", None) is not None: - with tf.name_scope(self.transformer.name): - self.transformer.build(None) - # Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMMultiHeadAttention with XLM->Flaubert class TFFlaubertMultiHeadAttention(tf.keras.layers.Layer): @@ -462,7 +454,7 @@ def __init__(self, config, **kwargs): tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name=f"layer_norm2_._{i}") ) - def build(self, input_shape=None): + def build(self, input_shape): with tf.name_scope("position_embeddings"): self.position_embeddings = self.add_weight( name="embeddings", @@ -478,15 +470,7 @@ def build(self, input_shape=None): initializer=get_initializer(self.embed_init_std), ) - if self.built: - return - self.built = True - if getattr(self, "embeddings", None) is not None: - with tf.name_scope(self.embeddings.name): - self.embeddings.build(None) - if getattr(self, "layer_norm_emb", None) is not None: - with tf.name_scope(self.layer_norm_emb.name): - self.layer_norm_emb.build([None, None, self.dim]) + super().build(input_shape)
def get_input_embeddings(self): return self.embeddings @@ -857,17 +841,6 @@ def call( logits=outputs, hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "transformer", None) is not None: - with tf.name_scope(self.transformer.name): - self.transformer.build(None) - if getattr(self, "pred_layer", None) is not None: - with tf.name_scope(self.pred_layer.name): - self.pred_layer.build(None) - @add_start_docstrings( """ @@ -947,17 +920,6 @@ def call( attentions=transformer_outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "transformer", None) is not None: - with tf.name_scope(self.transformer.name): - self.transformer.build(None) - if getattr(self, "sequence_summary", None) is not None: - with tf.name_scope(self.sequence_summary.name): - self.sequence_summary.build(None) - @add_start_docstrings( """ @@ -974,7 +936,6 @@ def __init__(self, config, *inputs, **kwargs): self.qa_outputs = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.init_std), name="qa_outputs" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1051,17 +1012,6 @@ def call( attentions=transformer_outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "transformer", None) is not None: - with tf.name_scope(self.transformer.name): - self.transformer.build(None) - if getattr(self, "qa_outputs", None) is not None: - with tf.name_scope(self.qa_outputs.name): - self.qa_outputs.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1081,7 +1031,6 @@ def __init__(self, config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.init_std), name="classifier" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1144,17 +1093,6 @@ def call( attentions=transformer_outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "transformer", None) is not None: - with tf.name_scope(self.transformer.name): - self.transformer.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1173,7 +1111,6 @@ def __init__(self, config, *inputs, **kwargs): self.logits_proj = tf.keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj" ) - self.config = config @property def dummy_inputs(self): @@ -1277,17 +1214,3 @@ def call( hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "transformer", None) is not None: - with tf.name_scope(self.transformer.name): - self.transformer.build(None) - if getattr(self, "sequence_summary", None) is not None: - with tf.name_scope(self.sequence_summary.name): - self.sequence_summary.build(None) - if getattr(self, "logits_proj", None) is not None: - with tf.name_scope(self.logits_proj.name): - self.logits_proj.build(self.config.num_labels) diff --git 
a/src/transformers/models/funnel/modeling_tf_funnel.py b/src/transformers/models/funnel/modeling_tf_funnel.py index 8c1d894409dc59..ccd07b5954b78d 100644 --- a/src/transformers/models/funnel/modeling_tf_funnel.py +++ b/src/transformers/models/funnel/modeling_tf_funnel.py @@ -90,7 +90,7 @@ def __init__(self, config, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout) - def build(self, input_shape=None): + def build(self, input_shape): with tf.name_scope("word_embeddings"): self.weight = self.add_weight( name="weight", @@ -98,12 +98,7 @@ def build(self, input_shape=None): initializer=get_initializer(initializer_range=self.initializer_std), ) - if self.built: - return - self.built = True - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.d_model]) + super().build(input_shape) def call(self, input_ids=None, inputs_embeds=None, training=False): """ @@ -412,7 +407,7 @@ def __init__(self, config, block_index, **kwargs): self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") self.scale = 1.0 / (d_head**0.5) - def build(self, input_shape=None): + def build(self, input_shape): n_head, d_head, d_model = self.n_head, self.d_head, self.d_model initializer = get_initializer(self.initializer_range) @@ -431,25 +426,7 @@ def build(self, input_shape=None): self.seg_embed = self.add_weight( shape=(2, n_head, d_head), initializer=initializer, trainable=True, name="seg_embed" ) - - if self.built: - return - self.built = True - if getattr(self, "q_head", None) is not None: - with tf.name_scope(self.q_head.name): - self.q_head.build(d_model) - if getattr(self, "k_head", None) is not None: - with tf.name_scope(self.k_head.name): - self.k_head.build(d_model) - if getattr(self, "v_head", None) is not None: - with tf.name_scope(self.v_head.name): - self.v_head.build(d_model) - if getattr(self, "post_proj", None) is not None: - with tf.name_scope(self.post_proj.name): - self.post_proj.build(n_head * d_head) - if getattr(self, "layer_norm", None) is not None: - with tf.name_scope(self.layer_norm.name): - self.layer_norm.build([None, None, d_model]) + super().build(input_shape) def relative_positional_attention(self, position_embeds, q_head, context_len, cls_mask=None): """Relative attention score for the positional encodings""" @@ -580,7 +557,6 @@ def __init__(self, config, **kwargs): self.linear_2 = tf.keras.layers.Dense(config.d_model, kernel_initializer=initializer, name="linear_2") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout) self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") - self.config = config def call(self, hidden, training=False): h = self.linear_1(hidden) @@ -590,20 +566,6 @@ def call(self, hidden, training=False): h = self.dropout(h, training=training) return self.layer_norm(hidden + h) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "linear_1", None) is not None: - with tf.name_scope(self.linear_1.name): - self.linear_1.build(self.config.d_model) - if getattr(self, "linear_2", None) is not None: - with tf.name_scope(self.linear_2.name): - self.linear_2.build(self.config.d_inner) - if getattr(self, "layer_norm", None) is not None: - with tf.name_scope(self.layer_norm.name): - self.layer_norm.build([None, None, 
self.config.d_model]) - class TFFunnelLayer(tf.keras.layers.Layer): def __init__(self, config, block_index, **kwargs): @@ -618,17 +580,6 @@ def call(self, query, key, value, attention_inputs, output_attentions=False, tra output = self.ffn(attn[0], training=training) return (output, attn[1]) if output_attentions else (output,) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "attention", None) is not None: - with tf.name_scope(self.attention.name): - self.attention.build(None) - if getattr(self, "ffn", None) is not None: - with tf.name_scope(self.ffn.name): - self.ffn.build(None) - class TFFunnelEncoder(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -699,15 +650,6 @@ def call( return tuple(v for v in [hidden, all_hidden_states, all_attentions] if v is not None) return TFBaseModelOutput(last_hidden_state=hidden, hidden_states=all_hidden_states, attentions=all_attentions) - def build(self, input_shape=None): - if self.built: - return - self.built = True - for block in self.blocks: - for layer in block: - with tf.name_scope(layer.name): - layer.build(None) - def upsample(x, stride, target_len, separate_cls=True, truncate_seq=False): """ @@ -783,15 +725,6 @@ def call( return tuple(v for v in [hidden, all_hidden_states, all_attentions] if v is not None) return TFBaseModelOutput(last_hidden_state=hidden, hidden_states=all_hidden_states, attentions=all_attentions) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "layers", None) is not None: - for layer in self.layers: - with tf.name_scope(layer.name): - layer.build(None) - @keras_serializable class TFFunnelBaseLayer(tf.keras.layers.Layer): @@ -962,20 +895,6 @@ def call( attentions=(encoder_outputs.attentions + decoder_outputs.attentions) if output_attentions else None, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embeddings", None) is not None: - with tf.name_scope(self.embeddings.name): - self.embeddings.build(None) - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - if getattr(self, "decoder", None) is not None: - with tf.name_scope(self.decoder.name): - self.decoder.build(None) - class TFFunnelDiscriminatorPredictions(tf.keras.layers.Layer): """Prediction module for the discriminator, made up of two dense layers.""" @@ -986,7 +905,6 @@ def __init__(self, config, **kwargs): self.dense = tf.keras.layers.Dense(config.d_model, kernel_initializer=initializer, name="dense") self.activation_function = get_tf_activation(config.hidden_act) self.dense_prediction = tf.keras.layers.Dense(1, kernel_initializer=initializer, name="dense_prediction") - self.config = config def call(self, discriminator_hidden_states): hidden_states = self.dense(discriminator_hidden_states) @@ -994,17 +912,6 @@ def call(self, discriminator_hidden_states): logits = tf.squeeze(self.dense_prediction(hidden_states)) return logits - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.d_model) - if getattr(self, "dense_prediction", None) is not None: - with tf.name_scope(self.dense_prediction.name): - self.dense_prediction.build(self.config.d_model) - class TFFunnelMaskedLMHead(tf.keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): @@ -1051,7 +958,6 @@ def __init__(self, 
config, n_labels, **kwargs): ) self.dropout = tf.keras.layers.Dropout(config.hidden_dropout) self.linear_out = tf.keras.layers.Dense(n_labels, kernel_initializer=initializer, name="linear_out") - self.config = config def call(self, hidden, training=False): hidden = self.linear_hidden(hidden) @@ -1059,17 +965,6 @@ def call(self, hidden, training=False): hidden = self.dropout(hidden, training=training) return self.linear_out(hidden) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "linear_hidden", None) is not None: - with tf.name_scope(self.linear_hidden.name): - self.linear_hidden.build(self.config.d_model) - if getattr(self, "linear_out", None) is not None: - with tf.name_scope(self.linear_out.name): - self.linear_out.build(self.config.d_model) - class TFFunnelPreTrainedModel(TFPreTrainedModel): """ @@ -1252,14 +1147,6 @@ def serving_output(self, output): attentions=output.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "funnel", None) is not None: - with tf.name_scope(self.funnel.name): - self.funnel.build(None) - @add_start_docstrings( "The bare Funnel Transformer Model transformer outputting raw hidden-states without any specific head on top.", @@ -1308,14 +1195,6 @@ def serving_output(self, output): attentions=output.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "funnel", None) is not None: - with tf.name_scope(self.funnel.name): - self.funnel.build(None) - @add_start_docstrings( """ @@ -1389,17 +1268,6 @@ def serving_output(self, output): logits=output.logits, hidden_states=output.hidden_states, attentions=output.attentions ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "funnel", None) is not None: - with tf.name_scope(self.funnel.name): - self.funnel.build(None) - if getattr(self, "discriminator_predictions", None) is not None: - with tf.name_scope(self.discriminator_predictions.name): - self.discriminator_predictions.build(None) - @add_start_docstrings("""Funnel Model with a `language modeling` head on top.""", FUNNEL_START_DOCSTRING) class TFFunnelForMaskedLM(TFFunnelPreTrainedModel, TFMaskedLanguageModelingLoss): @@ -1472,17 +1340,6 @@ def serving_output(self, output: TFMaskedLMOutput) -> TFMaskedLMOutput: # different dimensions return TFMaskedLMOutput(logits=output.logits, hidden_states=output.hidden_states, attentions=output.attentions) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "funnel", None) is not None: - with tf.name_scope(self.funnel.name): - self.funnel.build(None) - if getattr(self, "lm_head", None) is not None: - with tf.name_scope(self.lm_head.name): - self.lm_head.build(None) - @add_start_docstrings( """ @@ -1558,17 +1415,6 @@ def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassi logits=output.logits, hidden_states=output.hidden_states, attentions=output.attentions ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "funnel", None) is not None: - with tf.name_scope(self.funnel.name): - self.funnel.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(None) - @add_start_docstrings( """ @@ -1664,17 +1510,6 @@ def serving_output(self, output: TFMultipleChoiceModelOutput) -> TFMultipleChoic logits=output.logits, 
hidden_states=output.hidden_states, attentions=output.attentions ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "funnel", None) is not None: - with tf.name_scope(self.funnel.name): - self.funnel.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(None) - @add_start_docstrings( """ @@ -1693,7 +1528,6 @@ def __init__(self, config: FunnelConfig, *inputs, **kwargs) -> None: self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1753,17 +1587,6 @@ def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOu logits=output.logits, hidden_states=output.hidden_states, attentions=output.attentions ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "funnel", None) is not None: - with tf.name_scope(self.funnel.name): - self.funnel.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1781,7 +1604,6 @@ def __init__(self, config: FunnelConfig, *inputs, **kwargs) -> None: self.qa_outputs = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1857,14 +1679,3 @@ def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAn hidden_states=output.hidden_states, attentions=output.attentions, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "funnel", None) is not None: - with tf.name_scope(self.funnel.name): - self.funnel.build(None) - if getattr(self, "qa_outputs", None) is not None: - with tf.name_scope(self.qa_outputs.name): - self.qa_outputs.build(self.config.hidden_size) diff --git a/src/transformers/models/gpt2/modeling_tf_gpt2.py b/src/transformers/models/gpt2/modeling_tf_gpt2.py index 5e27dd5f07c2dc..a71c6b3dfce48c 100644 --- a/src/transformers/models/gpt2/modeling_tf_gpt2.py +++ b/src/transformers/models/gpt2/modeling_tf_gpt2.py @@ -510,24 +510,6 @@ def call( cross_attentions=all_cross_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "wte", None) is not None: - with tf.name_scope(self.wte.name): - self.wte.build(None) - if getattr(self, "wpe", None) is not None: - with tf.name_scope(self.wpe.name): - self.wpe.build(None) - if getattr(self, "ln_f", None) is not None: - with tf.name_scope(self.ln_f.name): - self.ln_f.build([None, None, self.embed_dim]) - if getattr(self, "h", None) is not None: - for layer in self.h: - with tf.name_scope(layer.name): - layer.build(None) - class TFGPT2PreTrainedModel(TFPreTrainedModel): """ @@ -770,14 +752,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "transformer", None) is not None: - with tf.name_scope(self.transformer.name): - self.transformer.build(None) - @add_start_docstrings( """ @@ -910,14 +884,6 @@ def call( 
cross_attentions=transformer_outputs.cross_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "transformer", None) is not None: - with tf.name_scope(self.transformer.name): - self.transformer.build(None) - @add_start_docstrings( """ @@ -1047,17 +1013,6 @@ def input_signature(self): "mc_token_ids": tf.TensorSpec((None, None), tf.int32, name="mc_token_ids"), } - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "transformer", None) is not None: - with tf.name_scope(self.transformer.name): - self.transformer.build(None) - if getattr(self, "multiple_choice_head", None) is not None: - with tf.name_scope(self.multiple_choice_head.name): - self.multiple_choice_head.build(None) - @add_start_docstrings( """ @@ -1085,7 +1040,6 @@ def __init__(self, config, *inputs, **kwargs): use_bias=False, ) self.transformer = TFGPT2MainLayer(config, name="transformer") - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING) @@ -1174,14 +1128,3 @@ def call( hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "score", None) is not None: - with tf.name_scope(self.score.name): - self.score.build(self.config.n_embd) - if getattr(self, "transformer", None) is not None: - with tf.name_scope(self.transformer.name): - self.transformer.build(None) diff --git a/src/transformers/models/gptj/modeling_tf_gptj.py b/src/transformers/models/gptj/modeling_tf_gptj.py index 43eaec56bf231e..08c106c49f7036 100644 --- a/src/transformers/models/gptj/modeling_tf_gptj.py +++ b/src/transformers/models/gptj/modeling_tf_gptj.py @@ -267,23 +267,6 @@ def call( return outputs # a, present, (attentions) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "q_proj", None) is not None: - with tf.name_scope(self.q_proj.name): - self.q_proj.build(self.embed_dim) - if getattr(self, "k_proj", None) is not None: - with tf.name_scope(self.k_proj.name): - self.k_proj.build(self.embed_dim) - if getattr(self, "v_proj", None) is not None: - with tf.name_scope(self.v_proj.name): - self.v_proj.build(self.embed_dim) - if getattr(self, "out_proj", None) is not None: - with tf.name_scope(self.out_proj.name): - self.out_proj.build(self.embed_dim) - class TFGPTJMLP(tf.keras.layers.Layer): def __init__(self, intermediate_size: int, config: GPTJConfig, **kwargs): @@ -300,7 +283,6 @@ def __init__(self, intermediate_size: int, config: GPTJConfig, **kwargs): self.act = get_tf_activation(config.activation_function) self.dropout = tf.keras.layers.Dropout(config.embd_pdrop) self.embed_dim = config.n_embd - self.intermediate_size = intermediate_size def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.fc_in(hidden_states) @@ -309,17 +291,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dropout(hidden_states) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "fc_in", None) is not None: - with tf.name_scope(self.fc_in.name): - self.fc_in.build(self.embed_dim) - if getattr(self, "fc_out", None) is not None: - with tf.name_scope(self.fc_out.name): - self.fc_out.build(self.intermediate_size) - class TFGPTJBlock(tf.keras.layers.Layer): def __init__(self, config: GPTJConfig, **kwargs): @@ -328,7 +299,6 @@ 
def __init__(self, config: GPTJConfig, **kwargs): self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1") self.attn = TFGPTJAttention(config, name="attn") self.mlp = TFGPTJMLP(inner_dim, config, name="mlp") - self.config = config def call( self, @@ -363,20 +333,6 @@ def call( outputs = (hidden_states,) + outputs[1:] return outputs # hidden_states, present, (attentions) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "ln_1", None) is not None: - with tf.name_scope(self.ln_1.name): - self.ln_1.build([None, None, self.config.n_embd]) - if getattr(self, "attn", None) is not None: - with tf.name_scope(self.attn.name): - self.attn.build(None) - if getattr(self, "mlp", None) is not None: - with tf.name_scope(self.mlp.name): - self.mlp.build(None) - @keras_serializable class TFGPTJMainLayer(tf.keras.layers.Layer): @@ -546,21 +502,6 @@ def call( attentions=all_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "wte", None) is not None: - with tf.name_scope(self.wte.name): - self.wte.build(None) - if getattr(self, "ln_f", None) is not None: - with tf.name_scope(self.ln_f.name): - self.ln_f.build([None, None, self.embed_dim]) - if getattr(self, "h", None) is not None: - for layer in self.h: - with tf.name_scope(layer.name): - layer.build(None) - class TFGPTJPreTrainedModel(TFPreTrainedModel): """ @@ -733,14 +674,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "transformer", None) is not None: - with tf.name_scope(self.transformer.name): - self.transformer.build(None) - @add_start_docstrings( """ @@ -755,7 +688,6 @@ def __init__(self, config, *inputs, **kwargs): self.lm_head = tf.keras.layers.Dense( config.vocab_size, kernel_initializer=get_initializer(config.initializer_range), name="lm_head" ) - self.config = config def get_output_embeddings(self): return self.lm_head @@ -854,17 +786,6 @@ def call( attentions=transformer_outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "transformer", None) is not None: - with tf.name_scope(self.transformer.name): - self.transformer.build(None) - if getattr(self, "lm_head", None) is not None: - with tf.name_scope(self.lm_head.name): - self.lm_head.build(self.config.n_embd) - @add_start_docstrings( """ @@ -894,7 +815,6 @@ def __init__(self, config, *inputs, **kwargs): kernel_initializer=get_initializer(config.initializer_range), name="score", ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(GPTJ_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -988,17 +908,6 @@ def call( attentions=transformer_outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "transformer", None) is not None: - with tf.name_scope(self.transformer.name): - self.transformer.build(None) - if getattr(self, "score", None) is not None: - with tf.name_scope(self.score.name): - self.score.build(self.config.n_embd) - @add_start_docstrings( """ @@ -1017,7 +926,6 @@ def __init__(self, config, *inputs, **kwargs): self.qa_outputs = tf.keras.layers.Dense( self.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(GPTJ_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ 
-1090,14 +998,3 @@ def call( hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "transformer", None) is not None: - with tf.name_scope(self.transformer.name): - self.transformer.build(None) - if getattr(self, "qa_outputs", None) is not None: - with tf.name_scope(self.qa_outputs.name): - self.qa_outputs.build(self.config.hidden_size) diff --git a/src/transformers/models/groupvit/modeling_tf_groupvit.py b/src/transformers/models/groupvit/modeling_tf_groupvit.py index bdd9c9d839c844..e61539e3040cf9 100644 --- a/src/transformers/models/groupvit/modeling_tf_groupvit.py +++ b/src/transformers/models/groupvit/modeling_tf_groupvit.py @@ -271,7 +271,6 @@ def __init__(self, config: GroupViTVisionConfig, **kwargs): self.norm2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm2") self.mlp = TFGroupViTMLP(config, name="mlp") self.norm_post = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm_post") - self.config = config def call(self, query: tf.Tensor, key: tf.Tensor, training: bool = False) -> tf.Tensor: x = query @@ -280,23 +279,6 @@ def call(self, query: tf.Tensor, key: tf.Tensor, training: bool = False) -> tf.T x = self.norm_post(x) return x - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "attn", None) is not None: - with tf.name_scope(self.attn.name): - self.attn.build(None) - if getattr(self, "norm2", None) is not None: - with tf.name_scope(self.norm2.name): - self.norm2.build([None, None, self.config.hidden_size]) - if getattr(self, "mlp", None) is not None: - with tf.name_scope(self.mlp.name): - self.mlp.build(None) - if getattr(self, "norm_post", None) is not None: - with tf.name_scope(self.norm_post.name): - self.norm_post.build([None, None, self.config.hidden_size]) - class TFGroupViTAssignAttention(tf.keras.layers.Layer): def __init__(self, config: GroupViTVisionConfig, **kwargs): @@ -308,7 +290,6 @@ def __init__(self, config: GroupViTVisionConfig, **kwargs): self.v_proj = tf.keras.layers.Dense(config.hidden_size, name="v_proj") self.proj = tf.keras.layers.Dense(config.hidden_size, name="proj") self.assign_eps = config.assign_eps - self.config = config def get_attn(self, attn: tf.Tensor, gumbel: bool = True, hard: bool = True, training: bool = False) -> tf.Tensor: if gumbel and training: @@ -346,23 +327,6 @@ def call(self, query: tf.Tensor, key: tf.Tensor, training: bool = False): return out, soft_attn - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "q_proj", None) is not None: - with tf.name_scope(self.q_proj.name): - self.q_proj.build(self.config.hidden_size) - if getattr(self, "k_proj", None) is not None: - with tf.name_scope(self.k_proj.name): - self.k_proj.build(self.config.hidden_size) - if getattr(self, "v_proj", None) is not None: - with tf.name_scope(self.v_proj.name): - self.v_proj.build(self.config.hidden_size) - if getattr(self, "proj", None) is not None: - with tf.name_scope(self.proj.name): - self.proj.build(self.config.hidden_size) - class TFGroupViTTokenAssign(tf.keras.layers.Layer): def __init__(self, config: GroupViTVisionConfig, num_group_token: int, num_output_group: int, **kwargs): @@ -389,7 +353,6 @@ def __init__(self, config: GroupViTVisionConfig, num_group_token: int, num_outpu self.mlp_channels = TFGroupViTMLP( config, config.hidden_size, channels_dim, 
config.hidden_size, name="mlp_channels" ) - self.config = config def project_group_token(self, group_tokens: tf.Tensor) -> tf.Tensor: """ @@ -423,35 +386,6 @@ def call(self, image_tokens: tf.Tensor, group_tokens: tf.Tensor, training: bool return new_image_tokens, attention - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "norm_tokens", None) is not None: - with tf.name_scope(self.norm_tokens.name): - self.norm_tokens.build([None, None, self.config.hidden_size]) - if getattr(self, "mlp_inter", None) is not None: - with tf.name_scope(self.mlp_inter.name): - self.mlp_inter.build(None) - if getattr(self, "norm_post_tokens", None) is not None: - with tf.name_scope(self.norm_post_tokens.name): - self.norm_post_tokens.build([None, None, self.config.hidden_size]) - if getattr(self, "norm_x", None) is not None: - with tf.name_scope(self.norm_x.name): - self.norm_x.build([None, None, self.config.hidden_size]) - if getattr(self, "pre_assign_attn", None) is not None: - with tf.name_scope(self.pre_assign_attn.name): - self.pre_assign_attn.build(None) - if getattr(self, "assign", None) is not None: - with tf.name_scope(self.assign.name): - self.assign.build(None) - if getattr(self, "norm_new_x", None) is not None: - with tf.name_scope(self.norm_new_x.name): - self.norm_new_x.build([None, None, self.config.hidden_size]) - if getattr(self, "mlp_channels", None) is not None: - with tf.name_scope(self.mlp_channels.name): - self.mlp_channels.build(None) - # Adapted from transformers.models.vit.modeling_tf_vit.TFViTPatchEmbeddings with ViT->GroupViT class TFGroupViTPatchEmbeddings(tf.keras.layers.Layer): @@ -523,14 +457,6 @@ def call( return embeddings - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "projection", None) is not None: - with tf.name_scope(self.projection.name): - self.projection.build(self.num_channels) - # Adapted from transformers.vit.modeling_tf_vit.TFViTEmbeddings class TFGroupViTVisionEmbeddings(tf.keras.layers.Layer): @@ -547,7 +473,7 @@ def __init__(self, config: GroupViTVisionConfig, **kwargs): self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") self.config = config - def build(self, input_shape=None): + def build(self, input_shape: tf.TensorShape): num_patches = self.patch_embeddings.num_patches self.position_embeddings = self.add_weight( shape=(1, num_patches, self.config.hidden_size), @@ -556,18 +482,7 @@ def build(self, input_shape=None): name="position_embeddings", ) - if self.built: - return - self.built = True - if getattr(self, "patch_embeddings", None) is not None: - with tf.name_scope(self.patch_embeddings.name): - self.patch_embeddings.build(None) - if getattr(self, "dropout", None) is not None: - with tf.name_scope(self.dropout.name): - self.dropout.build(None) - if getattr(self, "layernorm", None) is not None: - with tf.name_scope(self.layernorm.name): - self.layernorm.build([None, None, self.config.hidden_size]) + super().build(input_shape) def interpolate_pos_encoding(self, embeddings, height, width) -> tf.Tensor: """ @@ -711,7 +626,7 @@ def __init__( else: self.group_projector = None - def build(self, input_shape=None): + def build(self, input_shape: tf.TensorShape): if self.num_group_token > 0: self.group_token = self.add_weight( shape=(1, self.num_group_token, self.config.hidden_size), @@ -721,14 +636,7 @@ def build(self, input_shape=None): ) else: self.group_token = None - - if self.built: - return - self.built = 
True - if getattr(self, "layers", None) is not None: - for layer in self.layers: - with tf.name_scope(layer.name): - layer.build(None) + super().build(input_shape) @property def with_group_token(self): @@ -812,8 +720,6 @@ def __init__( output_size = output_size if output_size is not None else hidden_size self.fc1 = tf.keras.layers.Dense(intermediate_size, name="fc1") self.fc2 = tf.keras.layers.Dense(output_size, name="fc2") - self.intermediate_size = intermediate_size - self.hidden_size = hidden_size def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.fc1(hidden_states) @@ -821,17 +727,6 @@ def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.fc2(hidden_states) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "fc1", None) is not None: - with tf.name_scope(self.fc1.name): - self.fc1.build(self.hidden_size) - if getattr(self, "fc2", None) is not None: - with tf.name_scope(self.fc2.name): - self.fc2.build(self.intermediate_size) - class TFGroupViTMixerMLP(TFGroupViTMLP): def call(self, x, training: bool = False): @@ -946,23 +841,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "q_proj", None) is not None: - with tf.name_scope(self.q_proj.name): - self.q_proj.build(self.embed_dim) - if getattr(self, "k_proj", None) is not None: - with tf.name_scope(self.k_proj.name): - self.k_proj.build(self.embed_dim) - if getattr(self, "v_proj", None) is not None: - with tf.name_scope(self.v_proj.name): - self.v_proj.build(self.embed_dim) - if getattr(self, "out_proj", None) is not None: - with tf.name_scope(self.out_proj.name): - self.out_proj.build(self.embed_dim) - # Copied from transformers.models.clip.modeling_tf_clip.TFCLIPEncoderLayer with CLIP->GroupViT class TFGroupViTEncoderLayer(tf.keras.layers.Layer): @@ -1016,23 +894,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attn", None) is not None: - with tf.name_scope(self.self_attn.name): - self.self_attn.build(None) - if getattr(self, "layer_norm1", None) is not None: - with tf.name_scope(self.layer_norm1.name): - self.layer_norm1.build([None, None, self.embed_dim]) - if getattr(self, "mlp", None) is not None: - with tf.name_scope(self.mlp.name): - self.mlp.build(None) - if getattr(self, "layer_norm2", None) is not None: - with tf.name_scope(self.layer_norm2.name): - self.layer_norm2.build([None, None, self.embed_dim]) - # Adapted from transformers.models.clip.modeling_tf_clip.TFGroupViTTextEncoder class TFGroupViTTextEncoder(tf.keras.layers.Layer): @@ -1078,15 +939,6 @@ def call( last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "layers", None) is not None: - for layer in self.layers: - with tf.name_scope(layer.name): - layer.build(None) - class TFGroupViTVisionEncoder(tf.keras.layers.Layer): def __init__(self, config: GroupViTVisionConfig, **kwargs) -> None: @@ -1138,15 +990,6 @@ def call( last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_groupings ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "stages", None) is not None: - for layer in self.stages: - with tf.name_scope(layer.name): - 
layer.build(None) - # Copied from transformers.models.clip.modeling_tf_clip.TFCLIPTextTransformer with CLIPText->GroupViTText, CLIPEncoder->GroupViTTextEncoder class TFGroupViTTextTransformer(tf.keras.layers.Layer): @@ -1252,20 +1095,6 @@ def _build_causal_attention_mask(self, batch_size, seq_length, dtype=tf.float32) return tf.broadcast_to(input=to_mask, shape=(batch_size, 1, seq_length, seq_length)) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embeddings", None) is not None: - with tf.name_scope(self.embeddings.name): - self.embeddings.build(None) - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - if getattr(self, "final_layer_norm", None) is not None: - with tf.name_scope(self.final_layer_norm.name): - self.final_layer_norm.build([None, None, self.embed_dim]) - # Adapted from transformers.models.clip.modeling_tf_clip.TFCLIPVisionTransformer class TFGroupViTVisionTransformer(tf.keras.layers.Layer): @@ -1310,20 +1139,6 @@ def call( attentions=encoder_outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embeddings", None) is not None: - with tf.name_scope(self.embeddings.name): - self.embeddings.build(None) - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - if getattr(self, "layernorm", None) is not None: - with tf.name_scope(self.layernorm.name): - self.layernorm.build([None, None, self.embed_dim]) - @keras_serializable # Copied from transformers.models.clip.modeling_tf_clip.TFCLIPTextMainLayer with CLIP->GroupViT @@ -1373,14 +1188,6 @@ def call( return text_model_outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "text_model", None) is not None: - with tf.name_scope(self.text_model.name): - self.text_model.build(None) - @keras_serializable # Copied from transformers.models.clip.modeling_tf_clip.TFCLIPVisionMainLayer with CLIP->GroupViT @@ -1417,14 +1224,6 @@ def call( return vision_model_outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "vision_model", None) is not None: - with tf.name_scope(self.vision_model.name): - self.vision_model.build(None) - @keras_serializable # Adapted from transformers.models.clip.modeling_tf_clip.TFCLIPMainLayer @@ -1472,7 +1271,7 @@ def __init__(self, config: GroupViTConfig, **kwargs): tf.keras.layers.Dense(self.projection_dim, name="text_projection.3"), ] - def build(self, input_shape=None): + def build(self, input_shape: tf.TensorShape): self.logit_scale = self.add_weight( shape=(1,), initializer=tf.keras.initializers.Constant(self.config.logit_scale_init_value), @@ -1480,23 +1279,7 @@ def build(self, input_shape=None): name="logit_scale", ) - if self.built: - return - self.built = True - if getattr(self, "text_model", None) is not None: - with tf.name_scope(self.text_model.name): - self.text_model.build(None) - if getattr(self, "vision_model", None) is not None: - with tf.name_scope(self.vision_model.name): - self.vision_model.build(None) - if getattr(self, "visual_projection", None) is not None: - for layer in self.visual_projection: - with tf.name_scope(layer.name): - layer.build(None) - if getattr(self, "text_projection", None) is not None: - for layer in self.text_projection: - with tf.name_scope(layer.name): - layer.build(None) + super().build(input_shape) @unpack_inputs 
def get_text_features( @@ -1888,14 +1671,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "groupvit", None) is not None: - with tf.name_scope(self.groupvit.name): - self.groupvit.build(None) - class TFGroupViTVisionModel(TFGroupViTPreTrainedModel): config_class = GroupViTVisionConfig @@ -1950,14 +1725,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "groupvit", None) is not None: - with tf.name_scope(self.groupvit.name): - self.groupvit.build(None) - @add_start_docstrings(GROUPVIT_START_DOCSTRING) class TFGroupViTModel(TFGroupViTPreTrainedModel): @@ -2114,11 +1881,3 @@ def serving_output(self, output: TFGroupViTModelOutput) -> TFGroupViTModelOutput # TensorFlow cannot trace through nested dataclasses. Reference: # https://github.com/huggingface/transformers/pull/16886 return output - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "groupvit", None) is not None: - with tf.name_scope(self.groupvit.name): - self.groupvit.build(None) diff --git a/src/transformers/models/hubert/modeling_tf_hubert.py b/src/transformers/models/hubert/modeling_tf_hubert.py index 093d2dea7bdc75..ee52c09eee1366 100644 --- a/src/transformers/models/hubert/modeling_tf_hubert.py +++ b/src/transformers/models/hubert/modeling_tf_hubert.py @@ -469,14 +469,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.activation(hidden_states) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "conv", None) is not None: - with tf.name_scope(self.conv.name): - self.conv.build(None) - # Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2LayerNormConvLayer with Wav2Vec2->Hubert class TFHubertLayerNormConvLayer(tf.keras.layers.Layer): @@ -501,17 +493,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.activation(hidden_states) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "conv", None) is not None: - with tf.name_scope(self.conv.name): - self.conv.build(None) - if getattr(self, "layer_norm", None) is not None: - with tf.name_scope(self.layer_norm.name): - self.layer_norm.build([None, None, self.out_conv_dim]) - # Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2GroupNormConvLayer with Wav2Vec2->Hubert class TFHubertGroupNormConvLayer(tf.keras.layers.Layer): @@ -536,17 +517,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.activation(hidden_states) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "conv", None) is not None: - with tf.name_scope(self.conv.name): - self.conv.build(None) - if getattr(self, "layer_norm", None) is not None: - with tf.name_scope(self.layer_norm.name): - self.layer_norm.build(None) - # Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2PositionalConvEmbedding with Wav2Vec2->Hubert class TFHubertPositionalConvEmbedding(tf.keras.layers.Layer): @@ -568,14 +538,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.activation(hidden_states) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "conv", None) is not None: - with 
tf.name_scope(self.conv.name): - self.conv.build(None) - # Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2SamePadLayer with Wav2Vec2->Hubert class TFHubertSamePadLayer(tf.keras.layers.Layer): @@ -639,7 +601,6 @@ def __init__(self, config: HubertConfig, **kwargs): name="projection", ) self.dropout = tf.keras.layers.Dropout(rate=config.feat_proj_dropout) - self.config = config def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.layer_norm(hidden_states) @@ -647,17 +608,6 @@ def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dropout(hidden_states, training=training) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "projection", None) is not None: - with tf.name_scope(self.projection.name): - self.projection.build(self.config.conv_dim[-1]) - if getattr(self, "layer_norm", None) is not None: - with tf.name_scope(self.layer_norm.name): - self.layer_norm.build(None) # TODO Matt might be wrong - # Copied from transformers.models.bart.modeling_tf_bart.TFBartAttention with TFBart->TFHubert class TFHubertAttention(tf.keras.layers.Layer): @@ -812,23 +762,6 @@ def call( return attn_output, attn_weights, past_key_value - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "k_proj", None) is not None: - with tf.name_scope(self.k_proj.name): - self.k_proj.build(self.embed_dim) - if getattr(self, "q_proj", None) is not None: - with tf.name_scope(self.q_proj.name): - self.q_proj.build(self.embed_dim) - if getattr(self, "v_proj", None) is not None: - with tf.name_scope(self.v_proj.name): - self.v_proj.build(self.embed_dim) - if getattr(self, "out_proj", None) is not None: - with tf.name_scope(self.out_proj.name): - self.out_proj.build(self.embed_dim) - # Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2FeedForward with Wav2Vec2->Hubert class TFHubertFeedForward(tf.keras.layers.Layer): @@ -852,7 +785,6 @@ def __init__(self, config: HubertConfig, **kwargs): name="output_dense", ) self.output_dropout = tf.keras.layers.Dropout(config.hidden_dropout) - self.config = config def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.intermediate_dense(hidden_states) @@ -863,17 +795,6 @@ def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.output_dropout(hidden_states, training=training) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "intermediate_dense", None) is not None: - with tf.name_scope(self.intermediate_dense.name): - self.intermediate_dense.build(self.config.hidden_size) - if getattr(self, "output_dense", None) is not None: - with tf.name_scope(self.output_dense.name): - self.output_dense.build(self.config.intermediate_size) - # Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2EncoderLayer with Wav2Vec2->Hubert class TFHubertEncoderLayer(tf.keras.layers.Layer): @@ -892,7 +813,6 @@ def __init__(self, config: HubertConfig, **kwargs): self.final_layer_norm = tf.keras.layers.LayerNormalization( epsilon=config.layer_norm_eps, name="final_layer_norm" ) - self.config = config def call( self, @@ -919,23 +839,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "attention", None) is not None: 
- with tf.name_scope(self.attention.name): - self.attention.build(None) - if getattr(self, "layer_norm", None) is not None: - with tf.name_scope(self.layer_norm.name): - self.layer_norm.build([None, None, self.config.hidden_size]) - if getattr(self, "feed_forward", None) is not None: - with tf.name_scope(self.feed_forward.name): - self.feed_forward.build(None) - if getattr(self, "final_layer_norm", None) is not None: - with tf.name_scope(self.final_layer_norm.name): - self.final_layer_norm.build([None, None, self.config.hidden_size]) - # Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2EncoderLayerStableLayerNorm with Wav2Vec2->Hubert class TFHubertEncoderLayerStableLayerNorm(tf.keras.layers.Layer): @@ -954,7 +857,6 @@ def __init__(self, config: HubertConfig, **kwargs): self.final_layer_norm = tf.keras.layers.LayerNormalization( epsilon=config.layer_norm_eps, name="final_layer_norm" ) - self.config = config def call( self, @@ -979,23 +881,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "attention", None) is not None: - with tf.name_scope(self.attention.name): - self.attention.build(None) - if getattr(self, "layer_norm", None) is not None: - with tf.name_scope(self.layer_norm.name): - self.layer_norm.build([None, None, self.config.hidden_size]) - if getattr(self, "feed_forward", None) is not None: - with tf.name_scope(self.feed_forward.name): - self.feed_forward.build(None) - if getattr(self, "final_layer_norm", None) is not None: - with tf.name_scope(self.final_layer_norm.name): - self.final_layer_norm.build([None, None, self.config.hidden_size]) - # Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2Encoder with Wav2Vec2->Hubert class TFHubertEncoder(tf.keras.layers.Layer): @@ -1062,21 +947,6 @@ def call( attentions=all_self_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "pos_conv_embed", None) is not None: - with tf.name_scope(self.pos_conv_embed.name): - self.pos_conv_embed.build(None) - if getattr(self, "layer_norm", None) is not None: - with tf.name_scope(self.layer_norm.name): - self.layer_norm.build([None, None, self.config.hidden_size]) - if getattr(self, "layer", None) is not None: - for layer in self.layer: - with tf.name_scope(layer.name): - layer.build(None) - # Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2EncoderStableLayerNorm with Wav2Vec2->Hubert class TFHubertEncoderStableLayerNorm(tf.keras.layers.Layer): @@ -1145,21 +1015,6 @@ def call( attentions=all_self_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "pos_conv_embed", None) is not None: - with tf.name_scope(self.pos_conv_embed.name): - self.pos_conv_embed.build(None) - if getattr(self, "layer_norm", None) is not None: - with tf.name_scope(self.layer_norm.name): - self.layer_norm.build([None, None, self.config.hidden_size]) - if getattr(self, "layer", None) is not None: - for layer in self.layer: - with tf.name_scope(layer.name): - layer.build(None) - @keras_serializable class TFHubertMainLayer(tf.keras.layers.Layer): @@ -1176,20 +1031,12 @@ def __init__(self, config: HubertConfig, **kwargs): else: self.encoder = TFHubertEncoder(config, name="encoder") - def build(self, input_shape=None): + def build(self, input_shape: tf.TensorShape): self.masked_spec_embed = self.add_weight( shape=(self.config.hidden_size,), initializer="uniform", 
trainable=True, name="masked_spec_embed" ) - if self.built: - return - self.built = True - if getattr(self, "feature_extractor", None) is not None: - with tf.name_scope(self.feature_extractor.name): - self.feature_extractor.build(None) - if getattr(self, "feature_projection", None) is not None: - with tf.name_scope(self.feature_projection.name): - self.feature_projection.build(None) + super().build(input_shape) def _get_feat_extract_output_lengths(self, input_lengths: tf.Tensor): """ @@ -1498,14 +1345,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "hubert", None) is not None: - with tf.name_scope(self.hubert.name): - self.hubert.build(None) - @add_start_docstrings( """TFHubert Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""", @@ -1661,14 +1500,3 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "hubert", None) is not None: - with tf.name_scope(self.hubert.name): - self.hubert.build(None) - if getattr(self, "lm_head", None) is not None: - with tf.name_scope(self.lm_head.name): - self.lm_head.build(self.output_hidden_size) diff --git a/src/transformers/models/layoutlm/modeling_tf_layoutlm.py b/src/transformers/models/layoutlm/modeling_tf_layoutlm.py index 5a9a0c87c3c47a..c756609468598c 100644 --- a/src/transformers/models/layoutlm/modeling_tf_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_tf_layoutlm.py @@ -73,7 +73,7 @@ def __init__(self, config: LayoutLMConfig, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def build(self, input_shape=None): + def build(self, input_shape: tf.TensorShape): with tf.name_scope("word_embeddings"): self.weight = self.add_weight( name="weight", @@ -123,12 +123,7 @@ def build(self, input_shape=None): initializer=get_initializer(self.initializer_range), ) - if self.built: - return - self.built = True - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) + super().build(input_shape) def call( self, @@ -221,7 +216,6 @@ def __init__(self, config: LayoutLMConfig, **kwargs): self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.is_decoder = config.is_decoder - self.config = config def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] @@ -311,20 +305,6 @@ def call( outputs = outputs + (past_key_value,) return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "query", None) is not None: - with tf.name_scope(self.query.name): - self.query.build(self.config.hidden_size) - if getattr(self, "key", None) is not None: - with tf.name_scope(self.key.name): - self.key.build(self.config.hidden_size) - if getattr(self, "value", None) is not None: - with tf.name_scope(self.value.name): - self.value.build(self.config.hidden_size) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->LayoutLM class TFLayoutLMSelfOutput(tf.keras.layers.Layer): @@ -336,7 +316,6 @@ def __init__(self, config: 
LayoutLMConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -345,17 +324,6 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->LayoutLM class TFLayoutLMAttention(tf.keras.layers.Layer): @@ -397,17 +365,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attention", None) is not None: - with tf.name_scope(self.self_attention.name): - self.self_attention.build(None) - if getattr(self, "dense_output", None) is not None: - with tf.name_scope(self.dense_output.name): - self.dense_output.build(None) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->LayoutLM class TFLayoutLMIntermediate(tf.keras.layers.Layer): @@ -422,7 +379,6 @@ def __init__(self, config: LayoutLMConfig, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -430,14 +386,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->LayoutLM class TFLayoutLMOutput(tf.keras.layers.Layer): @@ -449,7 +397,6 @@ def __init__(self, config: LayoutLMConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -458,17 +405,6 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.intermediate_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->LayoutLM class TFLayoutLMLayer(tf.keras.layers.Layer): @@ -556,20 +492,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "attention", None) 
is not None: - with tf.name_scope(self.attention.name): - self.attention.build(None) - if getattr(self, "intermediate", None) is not None: - with tf.name_scope(self.intermediate.name): - self.intermediate.build(None) - if getattr(self, "bert_output", None) is not None: - with tf.name_scope(self.bert_output.name): - self.bert_output.build(None) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->LayoutLM class TFLayoutLMEncoder(tf.keras.layers.Layer): @@ -640,15 +562,6 @@ def call( cross_attentions=all_cross_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "layer", None) is not None: - for layer in self.layer: - with tf.name_scope(layer.name): - layer.build(None) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->LayoutLM class TFLayoutLMPooler(tf.keras.layers.Layer): @@ -661,7 +574,6 @@ def __init__(self, config: LayoutLMConfig, **kwargs): activation="tanh", name="dense", ) - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: # We "pool" the model by simply taking the hidden state corresponding @@ -671,14 +583,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return pooled_output - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->LayoutLM class TFLayoutLMPredictionHeadTransform(tf.keras.layers.Layer): @@ -697,7 +601,6 @@ def __init__(self, config: LayoutLMConfig, **kwargs): self.transform_act_fn = config.hidden_act self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -706,17 +609,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertLMPredictionHead with Bert->LayoutLM class TFLayoutLMLMPredictionHead(tf.keras.layers.Layer): @@ -732,15 +624,10 @@ def __init__(self, config: LayoutLMConfig, input_embeddings: tf.keras.layers.Lay # an output-only bias for each token. 
self.input_embeddings = input_embeddings - def build(self, input_shape=None): + def build(self, input_shape: tf.TensorShape): self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") - if self.built: - return - self.built = True - if getattr(self, "transform", None) is not None: - with tf.name_scope(self.transform.name): - self.transform.build(None) + super().build(input_shape) def get_output_embeddings(self) -> tf.keras.layers.Layer: return self.input_embeddings @@ -909,20 +796,6 @@ def call( cross_attentions=encoder_outputs.cross_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embeddings", None) is not None: - with tf.name_scope(self.embeddings.name): - self.embeddings.build(None) - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - if getattr(self, "pooler", None) is not None: - with tf.name_scope(self.pooler.name): - self.pooler.build(None) - class TFLayoutLMPreTrainedModel(TFPreTrainedModel): """ @@ -1113,14 +986,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "layoutlm", None) is not None: - with tf.name_scope(self.layoutlm.name): - self.layoutlm.build(None) - @add_start_docstrings("""LayoutLM Model with a `language modeling` head on top.""", LAYOUTLM_START_DOCSTRING) class TFLayoutLMForMaskedLM(TFLayoutLMPreTrainedModel, TFMaskedLanguageModelingLoss): @@ -1242,17 +1107,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "layoutlm", None) is not None: - with tf.name_scope(self.layoutlm.name): - self.layoutlm.build(None) - if getattr(self, "mlm", None) is not None: - with tf.name_scope(self.mlm.name): - self.mlm.build(None) # TODO Matt might be wrong - @add_start_docstrings( """ @@ -1278,7 +1132,6 @@ def __init__(self, config: LayoutLMConfig, *inputs, **kwargs): kernel_initializer=get_initializer(config.initializer_range), name="classifier", ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1372,17 +1225,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "layoutlm", None) is not None: - with tf.name_scope(self.layoutlm.name): - self.layoutlm.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1414,7 +1256,6 @@ def __init__(self, config: LayoutLMConfig, *inputs, **kwargs): kernel_initializer=get_initializer(config.initializer_range), name="classifier", ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1506,17 +1347,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "layoutlm", None) is not None: - with tf.name_scope(self.layoutlm.name): - self.layoutlm.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1546,7 +1376,6 @@ def __init__(self, config: LayoutLMConfig, 
*inputs, **kwargs): kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs", ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1656,14 +1485,3 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "layoutlm", None) is not None: - with tf.name_scope(self.layoutlm.name): - self.layoutlm.build(None) - if getattr(self, "qa_outputs", None) is not None: - with tf.name_scope(self.qa_outputs.name): - self.qa_outputs.build(self.config.hidden_size) diff --git a/src/transformers/models/layoutlmv3/modeling_tf_layoutlmv3.py b/src/transformers/models/layoutlmv3/modeling_tf_layoutlmv3.py index 62610cbb98bccb..feba69eafc2a71 100644 --- a/src/transformers/models/layoutlmv3/modeling_tf_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/modeling_tf_layoutlmv3.py @@ -87,7 +87,6 @@ def __init__(self, config: LayoutLMv3Config, **kwargs): ) self.hidden_size = config.hidden_size self.num_patches = (config.input_size**2) // (patch_sizes[0] * patch_sizes[1]) - self.config = config def call(self, pixel_values: tf.Tensor) -> tf.Tensor: # When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format. @@ -98,14 +97,6 @@ def call(self, pixel_values: tf.Tensor) -> tf.Tensor: embeddings = tf.reshape(embeddings, (-1, self.num_patches, self.hidden_size)) return embeddings - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "proj", None) is not None: - with tf.name_scope(self.proj.name): - self.proj.build(self.config.num_channels) - class TFLayoutLMv3TextEmbeddings(tf.keras.layers.Layer): """ @@ -160,7 +151,6 @@ def __init__(self, config: LayoutLMv3Config, **kwargs): name="w_position_embeddings", ) self.max_2d_positions = config.max_2d_position_embeddings - self.config = config def calculate_spatial_position_embeddings(self, bbox: tf.Tensor) -> tf.Tensor: try: @@ -270,35 +260,6 @@ def call( embeddings = self.dropout(embeddings, training=training) return embeddings - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "word_embeddings", None) is not None: - with tf.name_scope(self.word_embeddings.name): - self.word_embeddings.build(None) - if getattr(self, "token_type_embeddings", None) is not None: - with tf.name_scope(self.token_type_embeddings.name): - self.token_type_embeddings.build(None) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - if getattr(self, "position_embeddings", None) is not None: - with tf.name_scope(self.position_embeddings.name): - self.position_embeddings.build(None) - if getattr(self, "x_position_embeddings", None) is not None: - with tf.name_scope(self.x_position_embeddings.name): - self.x_position_embeddings.build(None) - if getattr(self, "y_position_embeddings", None) is not None: - with tf.name_scope(self.y_position_embeddings.name): - self.y_position_embeddings.build(None) - if getattr(self, "h_position_embeddings", None) is not None: - with tf.name_scope(self.h_position_embeddings.name): - self.h_position_embeddings.build(None) - if getattr(self, "w_position_embeddings", None) is not None: - with tf.name_scope(self.w_position_embeddings.name): - self.w_position_embeddings.build(None) - class 
TFLayoutLMv3SelfAttention(tf.keras.layers.Layer): def __init__(self, config: LayoutLMv3Config, **kwargs): @@ -333,7 +294,6 @@ def __init__(self, config: LayoutLMv3Config, **kwargs): self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) self.has_relative_attention_bias = config.has_relative_attention_bias self.has_spatial_attention_bias = config.has_spatial_attention_bias - self.config = config def transpose_for_scores(self, x: tf.Tensor): shape = tf.shape(x) @@ -412,20 +372,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "query", None) is not None: - with tf.name_scope(self.query.name): - self.query.build(self.config.hidden_size) - if getattr(self, "key", None) is not None: - with tf.name_scope(self.key.name): - self.key.build(self.config.hidden_size) - if getattr(self, "value", None) is not None: - with tf.name_scope(self.value.name): - self.value.build(self.config.hidden_size) - # Copied from models.roberta.modeling_tf_roberta.TFRobertaSelfOutput class TFLayoutLMv3SelfOutput(tf.keras.layers.Layer): @@ -437,7 +383,6 @@ def __init__(self, config: LayoutLMv3Config, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -446,17 +391,6 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - class TFLayoutLMv3Attention(tf.keras.layers.Layer): def __init__(self, config: LayoutLMv3Config, **kwargs): @@ -487,17 +421,6 @@ def call( outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attention", None) is not None: - with tf.name_scope(self.self_attention.name): - self.self_attention.build(None) - if getattr(self, "self_output", None) is not None: - with tf.name_scope(self.self_output.name): - self.self_output.build(None) - # Copied from models.roberta.modeling_tf_bert.TFRobertaIntermediate class TFLayoutLMv3Intermediate(tf.keras.layers.Layer): @@ -512,7 +435,6 @@ def __init__(self, config: LayoutLMv3Config, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -520,14 +442,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - # Copied from models.roberta.modeling_tf_bert.TFRobertaOutput class TFLayoutLMv3Output(tf.keras.layers.Layer): @@ -539,7 +453,6 @@ def __init__(self, config: 
LayoutLMv3Config, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -548,17 +461,6 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.intermediate_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - class TFLayoutLMv3Layer(tf.keras.layers.Layer): def __init__(self, config: LayoutLMv3Config, **kwargs): @@ -593,20 +495,6 @@ def call( outputs = (layer_output,) + outputs return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "attention", None) is not None: - with tf.name_scope(self.attention.name): - self.attention.build(None) - if getattr(self, "intermediate", None) is not None: - with tf.name_scope(self.intermediate.name): - self.intermediate.build(None) - if getattr(self, "bert_output", None) is not None: - with tf.name_scope(self.bert_output.name): - self.bert_output.build(None) - class TFLayoutLMv3Encoder(tf.keras.layers.Layer): def __init__(self, config: LayoutLMv3Config, **kwargs): @@ -762,15 +650,6 @@ def call( value for value in [hidden_states, all_hidden_states, all_self_attentions] if value is not None ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "layer", None) is not None: - for layer in self.layer: - with tf.name_scope(layer.name): - layer.build(None) - @keras_serializable class TFLayoutLMv3MainLayer(tf.keras.layers.Layer): @@ -797,7 +676,7 @@ def __init__(self, config: LayoutLMv3Config, **kwargs): self.encoder = TFLayoutLMv3Encoder(config, name="encoder") - def build(self, input_shape=None): + def build(self, input_shape: tf.TensorShape): if self.config.visual_embed: image_size = self.config.input_size // self.config.patch_size self.cls_token = self.add_weight( @@ -815,12 +694,7 @@ def build(self, input_shape=None): name="pos_embed", ) - if self.built: - return - self.built = True - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) + super().build(input_shape) def get_input_embeddings(self) -> tf.keras.layers.Layer: return self.embeddings.word_embeddings @@ -1306,14 +1180,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "layoutlmv3", None) is not None: - with tf.name_scope(self.layoutlmv3.name): - self.layoutlmv3.build(None) - class TFLayoutLMv3ClassificationHead(tf.keras.layers.Layer): """ @@ -1340,7 +1206,6 @@ def __init__(self, config: LayoutLMv3Config, **kwargs): kernel_initializer=get_initializer(config.initializer_range), name="out_proj", ) - self.config = config def call(self, inputs: tf.Tensor, training: bool = False) -> tf.Tensor: outputs = self.dropout(inputs, training=training) @@ -1349,20 +1214,6 @@ def call(self, inputs: tf.Tensor, training: bool = False) -> tf.Tensor: outputs = self.out_proj(outputs) return outputs - def build(self, 
input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dropout", None) is not None: - with tf.name_scope(self.dropout.name): - self.dropout.build(None) - if getattr(self, "out_proj", None) is not None: - with tf.name_scope(self.out_proj.name): - self.out_proj.build(self.config.hidden_size) - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(None) # TODO Matt might be wrong - @add_start_docstrings( """ @@ -1466,17 +1317,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "layoutlmv3", None) is not None: - with tf.name_scope(self.layoutlmv3.name): - self.layoutlmv3.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(None) - @add_start_docstrings( """ @@ -1600,17 +1440,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "layoutlmv3", None) is not None: - with tf.name_scope(self.layoutlmv3.name): - self.layoutlmv3.build(None) - if getattr(self, "dropout", None) is not None: - with tf.name_scope(self.dropout.name): - self.dropout.build(None) - @add_start_docstrings( """ @@ -1738,14 +1567,3 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "layoutlmv3", None) is not None: - with tf.name_scope(self.layoutlmv3.name): - self.layoutlmv3.build(None) - if getattr(self, "qa_outputs", None) is not None: - with tf.name_scope(self.qa_outputs.name): - self.qa_outputs.build(None) diff --git a/src/transformers/models/led/modeling_tf_led.py b/src/transformers/models/led/modeling_tf_led.py index bd011844b41fbb..8b8bd721e99e78 100644 --- a/src/transformers/models/led/modeling_tf_led.py +++ b/src/transformers/models/led/modeling_tf_led.py @@ -200,28 +200,7 @@ def build(self, input_shape=None): self.key_global.build((self.config.hidden_size,)) with tf.name_scope("value_global"): self.value_global.build((self.config.hidden_size,)) - - if self.built: - return - self.built = True - if getattr(self, "query", None) is not None: - with tf.name_scope(self.query.name): - self.query.build(self.config.hidden_size) - if getattr(self, "key", None) is not None: - with tf.name_scope(self.key.name): - self.key.build(self.config.hidden_size) - if getattr(self, "value", None) is not None: - with tf.name_scope(self.value.name): - self.value.build(self.config.hidden_size) - if getattr(self, "query_global", None) is not None: - with tf.name_scope(self.query_global.name): - self.query_global.build(self.config.hidden_size) - if getattr(self, "key_global", None) is not None: - with tf.name_scope(self.key_global.name): - self.key_global.build(self.config.hidden_size) - if getattr(self, "value_global", None) is not None: - with tf.name_scope(self.value_global.name): - self.value_global.build(self.config.hidden_size) + super().build(input_shape) def call( self, @@ -1004,7 +983,6 @@ def __init__(self, config, layer_id, **kwargs): super().__init__(**kwargs) self.longformer_self_attn = TFLEDEncoderSelfAttention(config, layer_id=layer_id, name="longformer_self_attn") self.output_dense = tf.keras.layers.Dense(config.d_model, use_bias=True, name="output") - self.config = config def call(self, inputs, training=False): ( @@ -1026,17 +1004,6 @@ def call(self, 
inputs, training=False): return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "longformer_self_attn", None) is not None: - with tf.name_scope(self.longformer_self_attn.name): - self.longformer_self_attn.build(None) - if getattr(self, "output_dense", None) is not None: - with tf.name_scope(self.output_dense.name): - self.output_dense.build(self.config.d_model) - class TFLEDDecoderAttention(tf.keras.layers.Layer): """Multi-headed attention from "Attention Is All You Need""" @@ -1188,23 +1155,6 @@ def call( return attn_output, attn_weights, past_key_value - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "k_proj", None) is not None: - with tf.name_scope(self.k_proj.name): - self.k_proj.build(self.embed_dim) - if getattr(self, "q_proj", None) is not None: - with tf.name_scope(self.q_proj.name): - self.q_proj.build(self.embed_dim) - if getattr(self, "v_proj", None) is not None: - with tf.name_scope(self.v_proj.name): - self.v_proj.build(self.embed_dim) - if getattr(self, "out_proj", None) is not None: - with tf.name_scope(self.out_proj.name): - self.out_proj.build(self.embed_dim) - class TFLEDEncoderLayer(tf.keras.layers.Layer): def __init__(self, config: LEDConfig, layer_id: int, **kwargs): @@ -1218,7 +1168,6 @@ def __init__(self, config: LEDConfig, layer_id: int, **kwargs): self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") - self.config = config def call( self, @@ -1265,26 +1214,6 @@ def call( return (hidden_states,) + layer_outputs[1:] - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attn", None) is not None: - with tf.name_scope(self.self_attn.name): - self.self_attn.build(None) - if getattr(self, "self_attn_layer_norm", None) is not None: - with tf.name_scope(self.self_attn_layer_norm.name): - self.self_attn_layer_norm.build([None, None, self.embed_dim]) - if getattr(self, "fc1", None) is not None: - with tf.name_scope(self.fc1.name): - self.fc1.build(self.embed_dim) - if getattr(self, "fc2", None) is not None: - with tf.name_scope(self.fc2.name): - self.fc2.build(self.config.encoder_ffn_dim) - if getattr(self, "final_layer_norm", None) is not None: - with tf.name_scope(self.final_layer_norm.name): - self.final_layer_norm.build([None, None, self.embed_dim]) - class TFLEDDecoderLayer(tf.keras.layers.Layer): def __init__(self, config: LEDConfig, **kwargs): @@ -1313,7 +1242,6 @@ def __init__(self, config: LEDConfig, **kwargs): self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") - self.config = config def call( self, @@ -1395,32 +1323,6 @@ def call( present_key_value, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attn", None) is not None: - with tf.name_scope(self.self_attn.name): - self.self_attn.build(None) - if getattr(self, "self_attn_layer_norm", None) is not None: - with tf.name_scope(self.self_attn_layer_norm.name): - self.self_attn_layer_norm.build([None, None, self.embed_dim]) - if getattr(self, "encoder_attn", None) is not None: - with tf.name_scope(self.encoder_attn.name): - 
self.encoder_attn.build(None) - if getattr(self, "encoder_attn_layer_norm", None) is not None: - with tf.name_scope(self.encoder_attn_layer_norm.name): - self.encoder_attn_layer_norm.build([None, None, self.embed_dim]) - if getattr(self, "fc1", None) is not None: - with tf.name_scope(self.fc1.name): - self.fc1.build(self.embed_dim) - if getattr(self, "fc2", None) is not None: - with tf.name_scope(self.fc2.name): - self.fc2.build(self.config.decoder_ffn_dim) - if getattr(self, "final_layer_norm", None) is not None: - with tf.name_scope(self.final_layer_norm.name): - self.final_layer_norm.build([None, None, self.embed_dim]) - class TFLEDPreTrainedModel(TFPreTrainedModel): config_class = LEDConfig @@ -1983,21 +1885,6 @@ def _pad_to_window_size( inputs_embeds, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embed_positions", None) is not None: - with tf.name_scope(self.embed_positions.name): - self.embed_positions.build(None) - if getattr(self, "layernorm_embedding", None) is not None: - with tf.name_scope(self.layernorm_embedding.name): - self.layernorm_embedding.build([None, None, self.embed_dim]) - if getattr(self, "layers", None) is not None: - for layer in self.layers: - with tf.name_scope(layer.name): - layer.build(None) - @keras_serializable class TFLEDDecoder(tf.keras.layers.Layer): @@ -2218,21 +2105,6 @@ def call( cross_attentions=all_cross_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embed_positions", None) is not None: - with tf.name_scope(self.embed_positions.name): - self.embed_positions.build(None) - if getattr(self, "layernorm_embedding", None) is not None: - with tf.name_scope(self.layernorm_embedding.name): - self.layernorm_embedding.build([None, None, self.config.d_model]) - if getattr(self, "layers", None) is not None: - for layer in self.layers: - with tf.name_scope(layer.name): - layer.build(None) - @keras_serializable class TFLEDMainLayer(tf.keras.layers.Layer): @@ -2339,20 +2211,6 @@ def call( encoder_global_attentions=encoder_outputs.global_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - if getattr(self, "decoder", None) is not None: - with tf.name_scope(self.decoder.name): - self.decoder.build(None) - if getattr(self, "shared", None) is not None: - with tf.name_scope(self.shared.name): - self.shared.build(None) # TODO Matt might be wrong - @add_start_docstrings( "The bare LED Model outputting raw hidden-states without any specific head on top.", @@ -2439,14 +2297,6 @@ def serving_output(self, output): encoder_global_attentions=enc_g_attns, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "led", None) is not None: - with tf.name_scope(self.led.name): - self.led.build(None) - # Copied from transformers.models.bart.modeling_tf_bart.BiasLayer class BiasLayer(tf.keras.layers.Layer): @@ -2667,14 +2517,3 @@ def hf_compute_loss(self, labels, logits): masked_loss = unmasked_loss * loss_mask reduced_masked_loss = tf.reduce_sum(masked_loss) / tf.reduce_sum(loss_mask) return tf.reshape(reduced_masked_loss, (1,)) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "led", None) is not None: - with tf.name_scope(self.led.name): - self.led.build(None) - if getattr(self, "bias_layer", 
None) is not None:
-            with tf.name_scope(self.bias_layer.name):
-                self.bias_layer.build(None) # TODO Matt might be wrong
diff --git a/src/transformers/models/longformer/modeling_tf_longformer.py b/src/transformers/models/longformer/modeling_tf_longformer.py
index 7b6653cb233fd4..0397c2ba320ec5 100644
--- a/src/transformers/models/longformer/modeling_tf_longformer.py
+++ b/src/transformers/models/longformer/modeling_tf_longformer.py
@@ -434,18 +434,10 @@ def __init__(self, config, input_embeddings, **kwargs):
         # an output-only bias for each token.
         self.decoder = input_embeddings
 
-    def build(self, input_shape=None):
+    def build(self, input_shape):
         self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
 
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "dense", None) is not None:
-            with tf.name_scope(self.dense.name):
-                self.dense.build(self.config.hidden_size)
-        if getattr(self, "layer_norm", None) is not None:
-            with tf.name_scope(self.layer_norm.name):
-                self.layer_norm.build([None, None, self.config.hidden_size])
+        super().build(input_shape)
 
     def get_output_embeddings(self):
         return self.decoder
@@ -492,7 +484,7 @@ def __init__(self, config, **kwargs):
         self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
         self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
 
-    def build(self, input_shape=None):
+    def build(self, input_shape: tf.TensorShape):
         with tf.name_scope("word_embeddings"):
             self.weight = self.add_weight(
                 name="weight",
@@ -514,12 +506,7 @@ def build(self, input_shape=None):
                 initializer=get_initializer(self.initializer_range),
             )
 
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "LayerNorm", None) is not None:
-            with tf.name_scope(self.LayerNorm.name):
-                self.LayerNorm.build([None, None, self.config.hidden_size])
+        super().build(input_shape)
 
     def create_position_ids_from_input_ids(self, input_ids, past_key_values_length=0):
         """
@@ -595,7 +582,6 @@ def __init__(self, config: LongformerConfig, **kwargs):
             self.intermediate_act_fn = get_tf_activation(config.hidden_act)
         else:
             self.intermediate_act_fn = config.hidden_act
-        self.config = config
 
     def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
         hidden_states = self.dense(inputs=hidden_states)
@@ -603,14 +589,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
 
         return hidden_states
 
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "dense", None) is not None:
-            with tf.name_scope(self.dense.name):
-                self.dense.build(self.config.hidden_size)
-
 
 # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Longformer
 class TFLongformerOutput(tf.keras.layers.Layer):
@@ -622,7 +600,6 @@ def __init__(self, config: LongformerConfig, **kwargs):
         )
         self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
         self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
-        self.config = config
 
     def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
         hidden_states = self.dense(inputs=hidden_states)
@@ -631,17 +608,6 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool
 
         return hidden_states
 
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "dense", None) is not None:
-            with tf.name_scope(self.dense.name):
-                self.dense.build(self.config.intermediate_size)
-        if 
getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Longformer class TFLongformerPooler(tf.keras.layers.Layer): @@ -654,7 +620,6 @@ def __init__(self, config: LongformerConfig, **kwargs): activation="tanh", name="dense", ) - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: # We "pool" the model by simply taking the hidden state corresponding @@ -664,14 +629,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return pooled_output - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Longformer class TFLongformerSelfOutput(tf.keras.layers.Layer): @@ -683,7 +640,6 @@ def __init__(self, config: LongformerConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -692,17 +648,6 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - class TFLongformerSelfAttention(tf.keras.layers.Layer): def __init__(self, config, layer_id, **kwargs): @@ -772,28 +717,7 @@ def build(self, input_shape=None): self.key_global.build((self.config.hidden_size,)) with tf.name_scope("value_global"): self.value_global.build((self.config.hidden_size,)) - - if self.built: - return - self.built = True - if getattr(self, "query", None) is not None: - with tf.name_scope(self.query.name): - self.query.build(self.config.hidden_size) - if getattr(self, "key", None) is not None: - with tf.name_scope(self.key.name): - self.key.build(self.config.hidden_size) - if getattr(self, "value", None) is not None: - with tf.name_scope(self.value.name): - self.value.build(self.config.hidden_size) - if getattr(self, "query_global", None) is not None: - with tf.name_scope(self.query_global.name): - self.query_global.build(self.config.hidden_size) - if getattr(self, "key_global", None) is not None: - with tf.name_scope(self.key_global.name): - self.key_global.build(self.config.hidden_size) - if getattr(self, "value_global", None) is not None: - with tf.name_scope(self.value_global.name): - self.value_global.build(self.config.hidden_size) + super().build(input_shape) def call( self, @@ -1600,17 +1524,6 @@ def call(self, inputs, training=False): return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attention", None) is not None: - with tf.name_scope(self.self_attention.name): - self.self_attention.build(None) - if getattr(self, "dense_output", None) is not None: - with tf.name_scope(self.dense_output.name): 
- self.dense_output.build(None) - class TFLongformerLayer(tf.keras.layers.Layer): def __init__(self, config, layer_id=0, **kwargs): @@ -1641,20 +1554,6 @@ def call(self, inputs, training=False): return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "attention", None) is not None: - with tf.name_scope(self.attention.name): - self.attention.build(None) - if getattr(self, "intermediate", None) is not None: - with tf.name_scope(self.intermediate.name): - self.intermediate.build(None) - if getattr(self, "longformer_output", None) is not None: - with tf.name_scope(self.longformer_output.name): - self.longformer_output.build(None) - class TFLongformerEncoder(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -1733,15 +1632,6 @@ def call( global_attentions=all_global_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "layer", None) is not None: - for layer in self.layer: - with tf.name_scope(layer.name): - layer.build(None) - @keras_serializable class TFLongformerMainLayer(tf.keras.layers.Layer): @@ -1969,20 +1859,6 @@ def _merge_to_attention_mask(attention_mask: tf.Tensor, global_attention_mask: t return attention_mask - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embeddings", None) is not None: - with tf.name_scope(self.embeddings.name): - self.embeddings.build(None) - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - if getattr(self, "pooler", None) is not None: - with tf.name_scope(self.pooler.name): - self.pooler.build(None) - class TFLongformerPreTrainedModel(TFPreTrainedModel): """ @@ -2168,14 +2044,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "longformer", None) is not None: - with tf.name_scope(self.longformer.name): - self.longformer.build(None) - @add_start_docstrings( """Longformer Model with a `language modeling` head on top.""", @@ -2260,17 +2128,6 @@ def call( global_attentions=outputs.global_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "longformer", None) is not None: - with tf.name_scope(self.longformer.name): - self.longformer.build(None) - if getattr(self, "lm_head", None) is not None: - with tf.name_scope(self.lm_head.name): - self.lm_head.build(None) - @add_start_docstrings( """ @@ -2293,7 +2150,6 @@ def __init__(self, config, *inputs, **kwargs): kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs", ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(LONGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -2402,17 +2258,6 @@ def call( global_attentions=outputs.global_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "longformer", None) is not None: - with tf.name_scope(self.longformer.name): - self.longformer.build(None) - if getattr(self, "qa_outputs", None) is not None: - with tf.name_scope(self.qa_outputs.name): - self.qa_outputs.build(self.config.hidden_size) - class TFLongformerClassificationHead(tf.keras.layers.Layer): """Head for sentence-level classification tasks.""" @@ -2429,7 +2274,6 @@ def __init__(self, config, **kwargs): self.out_proj = tf.keras.layers.Dense( config.num_labels, 
kernel_initializer=get_initializer(config.initializer_range), name="out_proj" ) - self.config = config def call(self, hidden_states, training=False): hidden_states = hidden_states[:, 0, :] # take token (equiv. to [CLS]) @@ -2439,17 +2283,6 @@ def call(self, hidden_states, training=False): output = self.out_proj(hidden_states) return output - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "out_proj", None) is not None: - with tf.name_scope(self.out_proj.name): - self.out_proj.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -2553,17 +2386,6 @@ def call( global_attentions=outputs.global_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "longformer", None) is not None: - with tf.name_scope(self.longformer.name): - self.longformer.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(None) - @add_start_docstrings( """ @@ -2584,7 +2406,6 @@ def __init__(self, config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) - self.config = config @property def input_signature(self): @@ -2679,17 +2500,6 @@ def call( global_attentions=outputs.global_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "longformer", None) is not None: - with tf.name_scope(self.longformer.name): - self.longformer.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -2712,7 +2522,6 @@ def __init__(self, config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(LONGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -2770,14 +2579,3 @@ def call( attentions=outputs.attentions, global_attentions=outputs.global_attentions, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "longformer", None) is not None: - with tf.name_scope(self.longformer.name): - self.longformer.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) diff --git a/src/transformers/models/lxmert/modeling_tf_lxmert.py b/src/transformers/models/lxmert/modeling_tf_lxmert.py index 8b60d283b2ccd8..f52dec26b33ec8 100644 --- a/src/transformers/models/lxmert/modeling_tf_lxmert.py +++ b/src/transformers/models/lxmert/modeling_tf_lxmert.py @@ -176,7 +176,6 @@ def __init__(self, config, **kwargs): self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) self.feat_dim = config.visual_feat_dim self.pos_dim = config.visual_pos_dim - self.config = config def call(self, visn_input, training=False): feats, boxes = visn_input @@ -190,23 +189,6 @@ def call(self, visn_input, training=False): output = self.dropout(output, training=training) return output - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "visn_fc", None) is not None: - with 
tf.name_scope(self.visn_fc.name): - self.visn_fc.build(self.feat_dim) - if getattr(self, "visn_layer_norm", None) is not None: - with tf.name_scope(self.visn_layer_norm.name): - self.visn_layer_norm.build([None, None, self.config.hidden_size]) - if getattr(self, "box_fc", None) is not None: - with tf.name_scope(self.box_fc.name): - self.box_fc.build(self.pos_dim) - if getattr(self, "box_layer_norm", None) is not None: - with tf.name_scope(self.box_layer_norm.name): - self.box_layer_norm.build([None, None, self.config.hidden_size]) - class TFLxmertEmbeddings(tf.keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings.""" @@ -221,7 +203,7 @@ def __init__(self, config, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def build(self, input_shape=None): + def build(self, input_shape): with tf.name_scope("word_embeddings"): self.weight = self.add_weight( name="weight", @@ -243,12 +225,7 @@ def build(self, input_shape=None): initializer=get_initializer(initializer_range=self.initializer_range), ) - if self.built: - return - self.built = True - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) + super().build(input_shape) def call(self, input_ids=None, token_type_ids=None, inputs_embeds=None, training=False): """ @@ -310,7 +287,6 @@ def __init__(self, config, **kwargs): self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) self.ctx_dim = config.hidden_size - self.config = config def transpose_for_scores(self, x, batch_size): # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] @@ -355,20 +331,6 @@ def call(self, hidden_states, context, attention_mask, output_attentions, traini outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "query", None) is not None: - with tf.name_scope(self.query.name): - self.query.build(self.config.hidden_size) - if getattr(self, "key", None) is not None: - with tf.name_scope(self.key.name): - self.key.build(self.ctx_dim) - if getattr(self, "value", None) is not None: - with tf.name_scope(self.value.name): - self.value.build(self.ctx_dim) - class TFLxmertIntermediate(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -382,21 +344,12 @@ def __init__(self, config, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act - self.config = config def call(self, hidden_states): hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - class TFLxmertOutput(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -409,7 +362,6 @@ def __init__(self, config, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.config = config def call(self, hidden_states, input_tensor, 
training=False): hidden_states = self.dense(hidden_states) @@ -417,17 +369,6 @@ def call(self, hidden_states, input_tensor, training=False): hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.intermediate_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - class TFLxmertAttentionOutput(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -439,7 +380,6 @@ def __init__(self, config, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.config = config def call(self, hidden_states, input_tensor, training=False): hidden_states = self.dense(hidden_states) @@ -447,17 +387,6 @@ def call(self, hidden_states, input_tensor, training=False): hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - class TFLxmertSelfAttentionLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -473,17 +402,6 @@ def call(self, input_tensor, attention_mask, output_attentions, training=False): attention_output = self.attention_output(self_output[0], input_tensor) return (attention_output, attention_probs) if output_attentions else (attention_output,) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self", None) is not None: - with tf.name_scope(self.self.name): - self.self.build(None) - if getattr(self, "attention_output", None) is not None: - with tf.name_scope(self.attention_output.name): - self.attention_output.build(None) - class TFLxmertCrossAttentionLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -506,17 +424,6 @@ def call( outputs = (attention_output, attention_probs) if output_attentions else (attention_output,) return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "att", None) is not None: - with tf.name_scope(self.att.name): - self.att.build(None) - if getattr(self, "attention_output", None) is not None: - with tf.name_scope(self.attention_output.name): - self.attention_output.build(None) - class TFLxmertLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -533,20 +440,6 @@ def call(self, hidden_states, attention_mask, output_attentions, training=False) outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "attention", None) is not None: - with tf.name_scope(self.attention.name): - self.attention.build(None) - if getattr(self, "intermediate", None) is not None: - with tf.name_scope(self.intermediate.name): - self.intermediate.build(None) - if getattr(self, "transformer_output", None) is not None: - with 
tf.name_scope(self.transformer_output.name): - self.transformer_output.build(None) - class TFLxmertXLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -652,32 +545,6 @@ def call( return (lang_output, visn_output, attention_probs[0]) if output_attentions else (lang_output, visn_output) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "visual_attention", None) is not None: - with tf.name_scope(self.visual_attention.name): - self.visual_attention.build(None) - if getattr(self, "lang_self_att", None) is not None: - with tf.name_scope(self.lang_self_att.name): - self.lang_self_att.build(None) - if getattr(self, "visn_self_att", None) is not None: - with tf.name_scope(self.visn_self_att.name): - self.visn_self_att.build(None) - if getattr(self, "lang_inter", None) is not None: - with tf.name_scope(self.lang_inter.name): - self.lang_inter.build(None) - if getattr(self, "lang_output", None) is not None: - with tf.name_scope(self.lang_output.name): - self.lang_output.build(None) - if getattr(self, "visn_inter", None) is not None: - with tf.name_scope(self.visn_inter.name): - self.visn_inter.build(None) - if getattr(self, "visn_output", None) is not None: - with tf.name_scope(self.visn_output.name): - self.visn_output.build(None) - class TFLxmertEncoder(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -767,26 +634,6 @@ def call( cross_encoder_attentions if output_attentions else None, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "visn_fc", None) is not None: - with tf.name_scope(self.visn_fc.name): - self.visn_fc.build(None) - if getattr(self, "layer", None) is not None: - for layer in self.layer: - with tf.name_scope(layer.name): - layer.build(None) - if getattr(self, "x_layers", None) is not None: - for layer in self.x_layers: - with tf.name_scope(layer.name): - layer.build(None) - if getattr(self, "r_layers", None) is not None: - for layer in self.r_layers: - with tf.name_scope(layer.name): - layer.build(None) - @keras_serializable class TFLxmertMainLayer(tf.keras.layers.Layer): @@ -927,20 +774,6 @@ def call( cross_encoder_attentions=cross_encoder_attentions if output_attentions else None, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embeddings", None) is not None: - with tf.name_scope(self.embeddings.name): - self.embeddings.build(None) - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - if getattr(self, "pooler", None) is not None: - with tf.name_scope(self.pooler.name): - self.pooler.build(None) - class TFLxmertPreTrainedModel(TFPreTrainedModel): """ @@ -1136,14 +969,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "lxmert", None) is not None: - with tf.name_scope(self.lxmert.name): - self.lxmert.build(None) - class TFLxmertPooler(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -1154,7 +979,6 @@ def __init__(self, config, **kwargs): activation="tanh", name="dense", ) - self.config = config def call(self, hidden_states): # We "pool" the model by simply taking the hidden state corresponding @@ -1163,14 +987,6 @@ def call(self, hidden_states): pooled_output = self.dense(first_token_tensor) return pooled_output - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is 
not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->Lxmert class TFLxmertPredictionHeadTransform(tf.keras.layers.Layer): @@ -1189,7 +1005,6 @@ def __init__(self, config: LxmertConfig, **kwargs): self.transform_act_fn = config.hidden_act self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -1198,17 +1013,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertLMPredictionHead with Bert->Lxmert class TFLxmertLMPredictionHead(tf.keras.layers.Layer): @@ -1224,15 +1028,10 @@ def __init__(self, config: LxmertConfig, input_embeddings: tf.keras.layers.Layer # an output-only bias for each token. self.input_embeddings = input_embeddings - def build(self, input_shape=None): + def build(self, input_shape: tf.TensorShape): self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") - if self.built: - return - self.built = True - if getattr(self, "transform", None) is not None: - with tf.name_scope(self.transform.name): - self.transform.build(None) + super().build(input_shape) def get_output_embeddings(self) -> tf.keras.layers.Layer: return self.input_embeddings @@ -1282,24 +1081,12 @@ def __init__(self, config, input_embeddings, **kwargs): kernel_initializer=get_initializer(config.initializer_range), name="seq_relationship", ) - self.config = config def call(self, sequence_output, pooled_output): prediction_scores = self.predictions(sequence_output) seq_relationship_score = self.seq_relationship(pooled_output) return prediction_scores, seq_relationship_score - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "predictions", None) is not None: - with tf.name_scope(self.predictions.name): - self.predictions.build(None) - if getattr(self, "seq_relationship", None) is not None: - with tf.name_scope(self.seq_relationship.name): - self.seq_relationship.build(self.config.hidden_size) - class TFLxmertVisualAnswerHead(tf.keras.layers.Layer): def __init__(self, config, num_labels, **kwargs): @@ -1326,20 +1113,6 @@ def call(self, hidden_states): return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(None) # TODO Matt might be wrong - if getattr(self, "layer_norm", None) is not None: - with tf.name_scope(self.layer_norm.name): - self.layer_norm.build(None) # TODO Matt might be wrong - if getattr(self, "dense_1", None) is not None: - with tf.name_scope(self.dense_1.name): - self.dense_1.build(None) # TODO Matt might be wrong - class TFLxmertVisualObjHead(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -1374,17 +1147,6 @@ def call(self, hidden_states): output[key] = 
self.decoder_dict[key](hidden_states) return output - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "transform", None) is not None: - with tf.name_scope(self.transform.name): - self.transform.build(None) - if getattr(self, "decoder_dict", None) is not None: - with tf.name_scope(self.decoder_dict.name): - self.decoder_dict.build(None) # TODO Matt might be wrong - @add_start_docstrings("""Lxmert Model with a `language modeling` head on top.""", LXMERT_START_DOCSTRING) class TFLxmertForPreTraining(TFLxmertPreTrainedModel): @@ -1628,14 +1390,3 @@ def call( vision_attentions=lxmert_output.vision_attentions, cross_encoder_attentions=lxmert_output.cross_encoder_attentions, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "lxmert", None) is not None: - with tf.name_scope(self.lxmert.name): - self.lxmert.build(None) - if getattr(self, "cls", None) is not None: - with tf.name_scope(self.cls.name): - self.cls.build(None) diff --git a/src/transformers/models/marian/modeling_tf_marian.py b/src/transformers/models/marian/modeling_tf_marian.py index 6b01510975ab18..76235b5f0f705c 100644 --- a/src/transformers/models/marian/modeling_tf_marian.py +++ b/src/transformers/models/marian/modeling_tf_marian.py @@ -328,23 +328,6 @@ def call( return attn_output, attn_weights, past_key_value - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "k_proj", None) is not None: - with tf.name_scope(self.k_proj.name): - self.k_proj.build(self.embed_dim) - if getattr(self, "q_proj", None) is not None: - with tf.name_scope(self.q_proj.name): - self.q_proj.build(self.embed_dim) - if getattr(self, "v_proj", None) is not None: - with tf.name_scope(self.v_proj.name): - self.v_proj.build(self.embed_dim) - if getattr(self, "out_proj", None) is not None: - with tf.name_scope(self.out_proj.name): - self.out_proj.build(self.embed_dim) - # Copied from transformers.models.bart.modeling_tf_bart.TFBartEncoderLayer with Bart->Marian class TFMarianEncoderLayer(tf.keras.layers.Layer): @@ -361,7 +344,6 @@ def __init__(self, config: MarianConfig, **kwargs): self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") - self.config = config def call( self, @@ -403,26 +385,6 @@ def call( return hidden_states, self_attn_weights - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attn", None) is not None: - with tf.name_scope(self.self_attn.name): - self.self_attn.build(None) - if getattr(self, "self_attn_layer_norm", None) is not None: - with tf.name_scope(self.self_attn_layer_norm.name): - self.self_attn_layer_norm.build([None, None, self.embed_dim]) - if getattr(self, "fc1", None) is not None: - with tf.name_scope(self.fc1.name): - self.fc1.build(self.embed_dim) - if getattr(self, "fc2", None) is not None: - with tf.name_scope(self.fc2.name): - self.fc2.build(self.config.encoder_ffn_dim) - if getattr(self, "final_layer_norm", None) is not None: - with tf.name_scope(self.final_layer_norm.name): - self.final_layer_norm.build([None, None, self.embed_dim]) - # Copied from transformers.models.bart.modeling_tf_bart.TFBartDecoderLayer with Bart->Marian class TFMarianDecoderLayer(tf.keras.layers.Layer): @@ -452,7 +414,6 @@ def __init__(self, config: MarianConfig, **kwargs): 
self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") - self.config = config def call( self, @@ -534,32 +495,6 @@ def call( present_key_value, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attn", None) is not None: - with tf.name_scope(self.self_attn.name): - self.self_attn.build(None) - if getattr(self, "self_attn_layer_norm", None) is not None: - with tf.name_scope(self.self_attn_layer_norm.name): - self.self_attn_layer_norm.build([None, None, self.embed_dim]) - if getattr(self, "encoder_attn", None) is not None: - with tf.name_scope(self.encoder_attn.name): - self.encoder_attn.build(None) - if getattr(self, "encoder_attn_layer_norm", None) is not None: - with tf.name_scope(self.encoder_attn_layer_norm.name): - self.encoder_attn_layer_norm.build([None, None, self.embed_dim]) - if getattr(self, "fc1", None) is not None: - with tf.name_scope(self.fc1.name): - self.fc1.build(self.embed_dim) - if getattr(self, "fc2", None) is not None: - with tf.name_scope(self.fc2.name): - self.fc2.build(self.config.decoder_ffn_dim) - if getattr(self, "final_layer_norm", None) is not None: - with tf.name_scope(self.final_layer_norm.name): - self.final_layer_norm.build([None, None, self.embed_dim]) - class TFMarianPreTrainedModel(TFPreTrainedModel): config_class = MarianConfig @@ -871,18 +806,6 @@ def call( last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embed_positions", None) is not None: - with tf.name_scope(self.embed_positions.name): - self.embed_positions.build(None) - if getattr(self, "layers", None) is not None: - for layer in self.layers: - with tf.name_scope(layer.name): - layer.build(None) - @keras_serializable class TFMarianDecoder(tf.keras.layers.Layer): @@ -1115,18 +1038,6 @@ def call( cross_attentions=all_cross_attns, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embed_positions", None) is not None: - with tf.name_scope(self.embed_positions.name): - self.embed_positions.build(None) - if getattr(self, "layers", None) is not None: - for layer in self.layers: - with tf.name_scope(layer.name): - layer.build(None) - @keras_serializable class TFMarianMainLayer(tf.keras.layers.Layer): @@ -1238,20 +1149,6 @@ def call( encoder_attentions=encoder_outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - if getattr(self, "decoder", None) is not None: - with tf.name_scope(self.decoder.name): - self.decoder.build(None) - if getattr(self, "shared", None) is not None: - with tf.name_scope(self.shared.name): - self.shared.build(None) # TODO Matt might be wrong - @add_start_docstrings( "The bare MARIAN Model outputting raw hidden-states without any specific head on top.", @@ -1339,14 +1236,6 @@ def serving_output(self, output): encoder_attentions=enc_attns, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "model", None) is not None: - with tf.name_scope(self.model.name): - self.model.build(None) - # Copied from 
transformers.models.bart.modeling_tf_bart.BiasLayer class BiasLayer(tf.keras.layers.Layer): @@ -1554,14 +1443,3 @@ def prepare_inputs_for_generation( def prepare_decoder_input_ids_from_labels(self, labels: tf.Tensor): return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "model", None) is not None: - with tf.name_scope(self.model.name): - self.model.build(None) - if getattr(self, "bias_layer", None) is not None: - with tf.name_scope(self.bias_layer.name): - self.bias_layer.build(None) diff --git a/src/transformers/models/mbart/modeling_tf_mbart.py b/src/transformers/models/mbart/modeling_tf_mbart.py index 6e2ac5db053af8..d786cd0b3ad882 100644 --- a/src/transformers/models/mbart/modeling_tf_mbart.py +++ b/src/transformers/models/mbart/modeling_tf_mbart.py @@ -297,23 +297,6 @@ def call( return attn_output, attn_weights, past_key_value - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "k_proj", None) is not None: - with tf.name_scope(self.k_proj.name): - self.k_proj.build(self.embed_dim) - if getattr(self, "q_proj", None) is not None: - with tf.name_scope(self.q_proj.name): - self.q_proj.build(self.embed_dim) - if getattr(self, "v_proj", None) is not None: - with tf.name_scope(self.v_proj.name): - self.v_proj.build(self.embed_dim) - if getattr(self, "out_proj", None) is not None: - with tf.name_scope(self.out_proj.name): - self.out_proj.build(self.embed_dim) - class TFMBartEncoderLayer(tf.keras.layers.Layer): def __init__(self, config: MBartConfig, **kwargs): @@ -329,7 +312,6 @@ def __init__(self, config: MBartConfig, **kwargs): self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") - self.config = config def call( self, @@ -371,26 +353,6 @@ def call( return hidden_states, self_attn_weights - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attn", None) is not None: - with tf.name_scope(self.self_attn.name): - self.self_attn.build(None) - if getattr(self, "self_attn_layer_norm", None) is not None: - with tf.name_scope(self.self_attn_layer_norm.name): - self.self_attn_layer_norm.build([None, None, self.embed_dim]) - if getattr(self, "fc1", None) is not None: - with tf.name_scope(self.fc1.name): - self.fc1.build(self.embed_dim) - if getattr(self, "fc2", None) is not None: - with tf.name_scope(self.fc2.name): - self.fc2.build(self.config.encoder_ffn_dim) - if getattr(self, "final_layer_norm", None) is not None: - with tf.name_scope(self.final_layer_norm.name): - self.final_layer_norm.build([None, None, self.embed_dim]) - class TFMBartDecoderLayer(tf.keras.layers.Layer): def __init__(self, config: MBartConfig, **kwargs): @@ -419,7 +381,6 @@ def __init__(self, config: MBartConfig, **kwargs): self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") - self.config = config def call( self, @@ -501,32 +462,6 @@ def call( present_key_value, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attn", None) is not None: - with tf.name_scope(self.self_attn.name): 
- self.self_attn.build(None) - if getattr(self, "self_attn_layer_norm", None) is not None: - with tf.name_scope(self.self_attn_layer_norm.name): - self.self_attn_layer_norm.build([None, None, self.embed_dim]) - if getattr(self, "encoder_attn", None) is not None: - with tf.name_scope(self.encoder_attn.name): - self.encoder_attn.build(None) - if getattr(self, "encoder_attn_layer_norm", None) is not None: - with tf.name_scope(self.encoder_attn_layer_norm.name): - self.encoder_attn_layer_norm.build([None, None, self.embed_dim]) - if getattr(self, "fc1", None) is not None: - with tf.name_scope(self.fc1.name): - self.fc1.build(self.embed_dim) - if getattr(self, "fc2", None) is not None: - with tf.name_scope(self.fc2.name): - self.fc2.build(self.config.decoder_ffn_dim) - if getattr(self, "final_layer_norm", None) is not None: - with tf.name_scope(self.final_layer_norm.name): - self.final_layer_norm.build([None, None, self.embed_dim]) - class TFMBartPreTrainedModel(TFPreTrainedModel): config_class = MBartConfig @@ -867,24 +802,6 @@ def call( last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embed_positions", None) is not None: - with tf.name_scope(self.embed_positions.name): - self.embed_positions.build(None) - if getattr(self, "layernorm_embedding", None) is not None: - with tf.name_scope(self.layernorm_embedding.name): - self.layernorm_embedding.build([None, None, self.embed_dim]) - if getattr(self, "layer_norm", None) is not None: - with tf.name_scope(self.layer_norm.name): - self.layer_norm.build([None, None, self.config.d_model]) - if getattr(self, "layers", None) is not None: - for layer in self.layers: - with tf.name_scope(layer.name): - layer.build(None) - @keras_serializable class TFMBartDecoder(tf.keras.layers.Layer): @@ -1124,24 +1041,6 @@ def call( cross_attentions=all_cross_attns, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embed_positions", None) is not None: - with tf.name_scope(self.embed_positions.name): - self.embed_positions.build(None) - if getattr(self, "layernorm_embedding", None) is not None: - with tf.name_scope(self.layernorm_embedding.name): - self.layernorm_embedding.build([None, None, self.config.d_model]) - if getattr(self, "layer_norm", None) is not None: - with tf.name_scope(self.layer_norm.name): - self.layer_norm.build([None, None, self.config.d_model]) - if getattr(self, "layers", None) is not None: - for layer in self.layers: - with tf.name_scope(layer.name): - layer.build(None) - @keras_serializable class TFMBartMainLayer(tf.keras.layers.Layer): @@ -1256,20 +1155,6 @@ def call( encoder_attentions=encoder_outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - if getattr(self, "decoder", None) is not None: - with tf.name_scope(self.decoder.name): - self.decoder.build(None) - if getattr(self, "shared", None) is not None: - with tf.name_scope(self.shared.name): - self.shared.build(None) # TODO Matt might be wrong - @add_start_docstrings( "The bare MBART Model outputting raw hidden-states without any specific head on top.", @@ -1357,14 +1242,6 @@ def serving_output(self, output): encoder_attentions=enc_attns, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if 
getattr(self, "model", None) is not None: - with tf.name_scope(self.model.name): - self.model.build(None) - # Copied from transformers.models.bart.modeling_tf_bart.BiasLayer class BiasLayer(tf.keras.layers.Layer): @@ -1570,14 +1447,3 @@ def prepare_inputs_for_generation( def prepare_decoder_input_ids_from_labels(self, labels: tf.Tensor): return shift_tokens_right(labels, self.config.pad_token_id) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "model", None) is not None: - with tf.name_scope(self.model.name): - self.model.build(None) - if getattr(self, "bias_layer", None) is not None: - with tf.name_scope(self.bias_layer.name): - self.bias_layer.build(None) # TODO Matt might be wrong diff --git a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py index ffc900e3d740f8..f0b537c465d05f 100644 --- a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py @@ -130,7 +130,6 @@ def __init__(self, config, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act - self.config = config def call(self, hidden_states): hidden_states = self.dense(hidden_states) @@ -138,14 +137,6 @@ def call(self, hidden_states): return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.true_hidden_size) - class TFLayerNorm(tf.keras.layers.LayerNormalization): def __init__(self, feat_size, *args, **kwargs): @@ -191,7 +182,7 @@ def __init__(self, config, **kwargs): self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.embedded_input_size = self.embedding_size * (3 if self.trigram_input else 1) - def build(self, input_shape=None): + def build(self, input_shape): with tf.name_scope("word_embeddings"): self.weight = self.add_weight( name="weight", @@ -213,15 +204,7 @@ def build(self, input_shape=None): initializer=get_initializer(initializer_range=self.initializer_range), ) - if self.built: - return - self.built = True - if getattr(self, "embedding_transformation", None) is not None: - with tf.name_scope(self.embedding_transformation.name): - self.embedding_transformation.build(self.embedded_input_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build(None) + super().build(input_shape) def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False): """ @@ -299,7 +282,6 @@ def __init__(self, config, **kwargs): ) self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) - self.config = config def transpose_for_scores(self, x, batch_size): # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] @@ -351,22 +333,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "query", None) is not None: - with tf.name_scope(self.query.name): - self.query.build(self.config.true_hidden_size) - if getattr(self, "key", None) is not None: - with tf.name_scope(self.key.name): - self.key.build(self.config.true_hidden_size) - if getattr(self, "value", None) is not None: - with tf.name_scope(self.value.name): - self.value.build( - 
self.config.true_hidden_size if self.config.use_bottleneck_attention else self.config.hidden_size - ) - class TFMobileBertSelfOutput(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -380,7 +346,6 @@ def __init__(self, config, **kwargs): ) if not self.use_bottleneck: self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.config = config def call(self, hidden_states, residual_tensor, training=False): hidden_states = self.dense(hidden_states) @@ -389,17 +354,6 @@ def call(self, hidden_states, residual_tensor, training=False): hidden_states = self.LayerNorm(hidden_states + residual_tensor) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.true_hidden_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build(None) - class TFMobileBertAttention(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -429,17 +383,6 @@ def call( outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self", None) is not None: - with tf.name_scope(self.self.name): - self.self.build(None) - if getattr(self, "mobilebert_output", None) is not None: - with tf.name_scope(self.mobilebert_output.name): - self.mobilebert_output.build(None) - class TFOutputBottleneck(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -449,7 +392,6 @@ def __init__(self, config, **kwargs): config.hidden_size, epsilon=config.layer_norm_eps, name="LayerNorm" ) self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.config = config def call(self, hidden_states, residual_tensor, training=False): layer_outputs = self.dense(hidden_states) @@ -457,17 +399,6 @@ def call(self, hidden_states, residual_tensor, training=False): layer_outputs = self.LayerNorm(layer_outputs + residual_tensor) return layer_outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.true_hidden_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build(None) - class TFMobileBertOutput(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -483,7 +414,6 @@ def __init__(self, config, **kwargs): self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) else: self.bottleneck = TFOutputBottleneck(config, name="bottleneck") - self.config = config def call(self, hidden_states, residual_tensor_1, residual_tensor_2, training=False): hidden_states = self.dense(hidden_states) @@ -495,17 +425,6 @@ def call(self, hidden_states, residual_tensor_1, residual_tensor_2, training=Fal hidden_states = self.bottleneck(hidden_states, residual_tensor_2) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.intermediate_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build(None) - class TFBottleneckLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -514,24 +433,12 @@ def 
__init__(self, config, **kwargs): self.LayerNorm = NORM2FN[config.normalization_type]( config.intra_bottleneck_size, epsilon=config.layer_norm_eps, name="LayerNorm" ) - self.config = config def call(self, inputs): hidden_states = self.dense(inputs) hidden_states = self.LayerNorm(hidden_states) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build(None) - class TFBottleneck(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -568,14 +475,6 @@ def call(self, hidden_states): else: return (hidden_states, hidden_states, hidden_states, bottlenecked_hidden_states) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "bottleneck_input", None) is not None: - with tf.name_scope(self.bottleneck_input.name): - self.bottleneck_input.build(None) - class TFFFNOutput(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -584,24 +483,12 @@ def __init__(self, config, **kwargs): self.LayerNorm = NORM2FN[config.normalization_type]( config.true_hidden_size, epsilon=config.layer_norm_eps, name="LayerNorm" ) - self.config = config def call(self, hidden_states, residual_tensor): hidden_states = self.dense(hidden_states) hidden_states = self.LayerNorm(hidden_states + residual_tensor) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.intermediate_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build(None) - class TFFFNLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -614,17 +501,6 @@ def call(self, hidden_states): layer_outputs = self.mobilebert_output(intermediate_output, hidden_states) return layer_outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "intermediate", None) is not None: - with tf.name_scope(self.intermediate.name): - self.intermediate.build(None) - if getattr(self, "mobilebert_output", None) is not None: - with tf.name_scope(self.mobilebert_output.name): - self.mobilebert_output.build(None) - class TFMobileBertLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -685,20 +561,6 @@ def call(self, hidden_states, attention_mask, head_mask, output_attentions, trai return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "attention", None) is not None: - with tf.name_scope(self.attention.name): - self.attention.build(None) - if getattr(self, "intermediate", None) is not None: - with tf.name_scope(self.intermediate.name): - self.intermediate.build(None) - if getattr(self, "mobilebert_output", None) is not None: - with tf.name_scope(self.mobilebert_output.name): - self.mobilebert_output.build(None) - class TFMobileBertEncoder(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -742,15 +604,6 @@ def call( last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "layer", None) is not None: - for layer in 
self.layer: - with tf.name_scope(layer.name): - layer.build(None) - class TFMobileBertPooler(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -786,7 +639,6 @@ def __init__(self, config, **kwargs): else: self.transform_act_fn = config.hidden_act self.LayerNorm = NORM2FN["layer_norm"](config.hidden_size, epsilon=config.layer_norm_eps, name="LayerNorm") - self.config = config def call(self, hidden_states): hidden_states = self.dense(hidden_states) @@ -794,17 +646,6 @@ def call(self, hidden_states): hidden_states = self.LayerNorm(hidden_states) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build(None) - class TFMobileBertLMPredictionHead(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -812,7 +653,7 @@ def __init__(self, config, **kwargs): self.transform = TFMobileBertPredictionHeadTransform(config, name="transform") self.config = config - def build(self, input_shape=None): + def build(self, input_shape): self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") self.dense = self.add_weight( shape=(self.config.hidden_size - self.config.embedding_size, self.config.vocab_size), @@ -826,13 +667,7 @@ def build(self, input_shape=None): trainable=True, name="decoder/weight", ) - - if self.built: - return - self.built = True - if getattr(self, "transform", None) is not None: - with tf.name_scope(self.transform.name): - self.transform.build(None) + super().build(input_shape) def get_output_embeddings(self): return self @@ -980,20 +815,6 @@ def call( attentions=encoder_outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embeddings", None) is not None: - with tf.name_scope(self.embeddings.name): - self.embeddings.build(None) - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - if getattr(self, "pooler", None) is not None: - with tf.name_scope(self.pooler.name): - self.pooler.build(None) - class TFMobileBertPreTrainedModel(TFPreTrainedModel): """ @@ -1178,14 +999,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "mobilebert", None) is not None: - with tf.name_scope(self.mobilebert.name): - self.mobilebert.build(None) - @add_start_docstrings( """ @@ -1276,20 +1089,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "mobilebert", None) is not None: - with tf.name_scope(self.mobilebert.name): - self.mobilebert.build(None) - if getattr(self, "predictions", None) is not None: - with tf.name_scope(self.predictions.name): - self.predictions.build(None) # TODO Matt might be wrong - if getattr(self, "seq_relationship", None) is not None: - with tf.name_scope(self.seq_relationship.name): - self.seq_relationship.build(None) # TODO Matt might be wrong - @add_start_docstrings("""MobileBert Model with a `language modeling` head on top.""", MOBILEBERT_START_DOCSTRING) class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel, TFMaskedLanguageModelingLoss): @@ -1370,36 +1169,16 @@ def call( attentions=outputs.attentions, ) - def 
build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "mobilebert", None) is not None: - with tf.name_scope(self.mobilebert.name): - self.mobilebert.build(None) - if getattr(self, "predictions", None) is not None: - with tf.name_scope(self.predictions.name): - self.predictions.build(None) # TODO Matt might be wrong - class TFMobileBertOnlyNSPHead(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.seq_relationship = tf.keras.layers.Dense(2, name="seq_relationship") - self.config = config def call(self, pooled_output): seq_relationship_score = self.seq_relationship(pooled_output) return seq_relationship_score - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "seq_relationship", None) is not None: - with tf.name_scope(self.seq_relationship.name): - self.seq_relationship.build(self.config.hidden_size) - @add_start_docstrings( """MobileBert Model with a `next sentence prediction (classification)` head on top.""", @@ -1482,17 +1261,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "mobilebert", None) is not None: - with tf.name_scope(self.mobilebert.name): - self.mobilebert.build(None) - if getattr(self, "cls", None) is not None: - with tf.name_scope(self.cls.name): - self.cls.build(None) # TODO Matt might be wrong - @add_start_docstrings( """ @@ -1523,7 +1291,6 @@ def __init__(self, config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1584,17 +1351,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "mobilebert", None) is not None: - with tf.name_scope(self.mobilebert.name): - self.mobilebert.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1621,7 +1377,6 @@ def __init__(self, config, *inputs, **kwargs): self.qa_outputs = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1695,17 +1450,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "mobilebert", None) is not None: - with tf.name_scope(self.mobilebert.name): - self.mobilebert.build(None) - if getattr(self, "qa_outputs", None) is not None: - with tf.name_scope(self.qa_outputs.name): - self.qa_outputs.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1732,7 +1476,6 @@ def __init__(self, config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward( @@ -1808,17 +1551,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, 
"mobilebert", None) is not None: - with tf.name_scope(self.mobilebert.name): - self.mobilebert.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1850,7 +1582,6 @@ def __init__(self, config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1908,14 +1639,3 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "mobilebert", None) is not None: - with tf.name_scope(self.mobilebert.name): - self.mobilebert.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) diff --git a/src/transformers/models/mobilevit/modeling_tf_mobilevit.py b/src/transformers/models/mobilevit/modeling_tf_mobilevit.py index 61519c1a53c6fb..723a18a1a60034 100644 --- a/src/transformers/models/mobilevit/modeling_tf_mobilevit.py +++ b/src/transformers/models/mobilevit/modeling_tf_mobilevit.py @@ -144,14 +144,6 @@ def call(self, features: tf.Tensor, training: bool = False) -> tf.Tensor: features = self.activation(features) return features - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "convolution", None) is not None: - with tf.name_scope(self.convolution.name): - self.convolution.build(self.in_channels) - class TFMobileViTInvertedResidual(tf.keras.layers.Layer): """ @@ -202,20 +194,6 @@ def call(self, features: tf.Tensor, training: bool = False) -> tf.Tensor: return residual + features if self.use_residual else features - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "expand_1x1", None) is not None: - with tf.name_scope(self.expand_1x1.name): - self.expand_1x1.build(None) - if getattr(self, "conv_3x3", None) is not None: - with tf.name_scope(self.conv_3x3.name): - self.conv_3x3.build(None) - if getattr(self, "reduce_1x1", None) is not None: - with tf.name_scope(self.reduce_1x1.name): - self.reduce_1x1.build(None) - class TFMobileViTMobileNetLayer(tf.keras.layers.Layer): def __init__( @@ -268,7 +246,6 @@ def __init__(self, config: MobileViTConfig, hidden_size: int, **kwargs) -> None: self.value = tf.keras.layers.Dense(self.all_head_size, use_bias=config.qkv_bias, name="value") self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) - self.hidden_size = hidden_size def transpose_for_scores(self, x: tf.Tensor) -> tf.Tensor: batch_size = tf.shape(x)[0] @@ -299,41 +276,18 @@ def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: context_layer = tf.reshape(context_layer, shape=(batch_size, -1, self.all_head_size)) return context_layer - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "query", None) is not None: - with tf.name_scope(self.query.name): - self.query.build(self.hidden_size) - if getattr(self, "key", None) is not None: - with tf.name_scope(self.key.name): - self.key.build(self.hidden_size) - if getattr(self, "value", None) is not None: - with tf.name_scope(self.value.name): - 
self.value.build(self.hidden_size) - class TFMobileViTSelfOutput(tf.keras.layers.Layer): def __init__(self, config: MobileViTConfig, hidden_size: int, **kwargs) -> None: super().__init__(**kwargs) self.dense = tf.keras.layers.Dense(hidden_size, name="dense") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.hidden_size = hidden_size def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states, training=training) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.hidden_size) - class TFMobileViTAttention(tf.keras.layers.Layer): def __init__(self, config: MobileViTConfig, hidden_size: int, **kwargs) -> None: @@ -349,17 +303,6 @@ def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: attention_output = self.dense_output(self_outputs, training=training) return attention_output - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "attention", None) is not None: - with tf.name_scope(self.attention.name): - self.attention.build(None) - if getattr(self, "dense_output", None) is not None: - with tf.name_scope(self.dense_output.name): - self.dense_output.build(None) - class TFMobileViTIntermediate(tf.keras.layers.Layer): def __init__(self, config: MobileViTConfig, hidden_size: int, intermediate_size: int, **kwargs) -> None: @@ -369,28 +312,18 @@ def __init__(self, config: MobileViTConfig, hidden_size: int, intermediate_size: self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act - self.hidden_size = hidden_size def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.hidden_size) - class TFMobileViTOutput(tf.keras.layers.Layer): def __init__(self, config: MobileViTConfig, hidden_size: int, intermediate_size: int, **kwargs) -> None: super().__init__(**kwargs) self.dense = tf.keras.layers.Dense(hidden_size, name="dense") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.intermediate_size = intermediate_size def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(hidden_states) @@ -398,14 +331,6 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool hidden_states = hidden_states + input_tensor return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.intermediate_size) - class TFMobileViTTransformerLayer(tf.keras.layers.Layer): def __init__(self, config: MobileViTConfig, hidden_size: int, intermediate_size: int, **kwargs) -> None: @@ -419,7 +344,6 @@ def __init__(self, config: MobileViTConfig, hidden_size: int, intermediate_size: self.layernorm_after = tf.keras.layers.LayerNormalization( epsilon=config.layer_norm_eps, name="layernorm_after" ) - self.hidden_size = hidden_size def call(self, hidden_states: 
tf.Tensor, training: bool = False) -> tf.Tensor: attention_output = self.attention(self.layernorm_before(hidden_states), training=training) @@ -430,26 +354,6 @@ def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: layer_output = self.mobilevit_output(layer_output, hidden_states, training=training) return layer_output - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "attention", None) is not None: - with tf.name_scope(self.attention.name): - self.attention.build(None) - if getattr(self, "intermediate", None) is not None: - with tf.name_scope(self.intermediate.name): - self.intermediate.build(None) - if getattr(self, "mobilevit_output", None) is not None: - with tf.name_scope(self.mobilevit_output.name): - self.mobilevit_output.build(None) - if getattr(self, "layernorm_before", None) is not None: - with tf.name_scope(self.layernorm_before.name): - self.layernorm_before.build([None, None, self.hidden_size]) - if getattr(self, "layernorm_after", None) is not None: - with tf.name_scope(self.layernorm_after.name): - self.layernorm_after.build([None, None, self.hidden_size]) - class TFMobileViTTransformer(tf.keras.layers.Layer): def __init__(self, config: MobileViTConfig, hidden_size: int, num_stages: int, **kwargs) -> None: @@ -539,7 +443,6 @@ def __init__( kernel_size=config.conv_kernel_size, name="fusion", ) - self.hidden_size = hidden_size def unfolding(self, features: tf.Tensor) -> Tuple[tf.Tensor, Dict]: patch_width, patch_height = self.patch_width, self.patch_height @@ -638,29 +541,6 @@ def call(self, features: tf.Tensor, training: bool = False) -> tf.Tensor: features = self.fusion(tf.concat([residual, features], axis=-1), training=training) return features - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "conv_kxk", None) is not None: - with tf.name_scope(self.conv_kxk.name): - self.conv_kxk.build(None) - if getattr(self, "conv_1x1", None) is not None: - with tf.name_scope(self.conv_1x1.name): - self.conv_1x1.build(None) - if getattr(self, "transformer", None) is not None: - with tf.name_scope(self.transformer.name): - self.transformer.build(None) - if getattr(self, "layernorm", None) is not None: - with tf.name_scope(self.layernorm.name): - self.layernorm.build([None, None, self.hidden_size]) - if getattr(self, "conv_projection", None) is not None: - with tf.name_scope(self.conv_projection.name): - self.conv_projection.build(None) - if getattr(self, "fusion", None) is not None: - with tf.name_scope(self.fusion.name): - self.fusion.build(None) - class TFMobileViTEncoder(tf.keras.layers.Layer): def __init__(self, config: MobileViTConfig, **kwargs) -> None: @@ -862,20 +742,6 @@ def call( hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "conv_stem", None) is not None: - with tf.name_scope(self.conv_stem.name): - self.conv_stem.build(None) - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - if getattr(self, "pooler", None) is not None: - with tf.name_scope(self.pooler.name): - self.pooler.build(None) # TODO Matt might be wrong - class TFMobileViTPreTrainedModel(TFPreTrainedModel): """ @@ -976,14 +842,6 @@ def call( output = self.mobilevit(pixel_values, output_hidden_states, return_dict, training=training) return output - def build(self, input_shape=None): - 
if self.built: - return - self.built = True - if getattr(self, "mobilevit", None) is not None: - with tf.name_scope(self.mobilevit.name): - self.mobilevit.build(None) - @add_start_docstrings( """ @@ -1044,17 +902,6 @@ def call( return TFImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "mobilevit", None) is not None: - with tf.name_scope(self.mobilevit.name): - self.mobilevit.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(None) - class TFMobileViTASPPPooling(tf.keras.layers.Layer): def __init__(self, config: MobileViTConfig, in_channels: int, out_channels: int, **kwargs) -> None: @@ -1080,17 +927,6 @@ def call(self, features: tf.Tensor, training: bool = False) -> tf.Tensor: features = tf.image.resize(features, size=spatial_size, method="bilinear") return features - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "global_pool", None) is not None: - with tf.name_scope(self.global_pool.name): - self.global_pool.build(None) - if getattr(self, "conv_1x1", None) is not None: - with tf.name_scope(self.conv_1x1.name): - self.conv_1x1.build(None) - class TFMobileViTASPP(tf.keras.layers.Layer): """ @@ -1162,14 +998,6 @@ def call(self, features: tf.Tensor, training: bool = False) -> tf.Tensor: pooled_features = self.dropout(pooled_features, training=training) return pooled_features - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "project", None) is not None: - with tf.name_scope(self.project.name): - self.project.build(None) - class TFMobileViTDeepLabV3(tf.keras.layers.Layer): """ @@ -1199,17 +1027,6 @@ def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: features = self.classifier(features, training=training) return features - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "aspp", None) is not None: - with tf.name_scope(self.aspp.name): - self.aspp.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(None) - @add_start_docstrings( """ @@ -1322,14 +1139,3 @@ def call( logits=logits, hidden_states=outputs.hidden_states if output_hidden_states else None, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "mobilevit", None) is not None: - with tf.name_scope(self.mobilevit.name): - self.mobilevit.build(None) - if getattr(self, "segmentation_head", None) is not None: - with tf.name_scope(self.segmentation_head.name): - self.segmentation_head.build(None) diff --git a/src/transformers/models/mpnet/modeling_tf_mpnet.py b/src/transformers/models/mpnet/modeling_tf_mpnet.py index 454f854949eb35..2982899340d203 100644 --- a/src/transformers/models/mpnet/modeling_tf_mpnet.py +++ b/src/transformers/models/mpnet/modeling_tf_mpnet.py @@ -91,7 +91,7 @@ def __init__(self, config, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def build(self, input_shape=None): + def build(self, input_shape: tf.TensorShape): with tf.name_scope("word_embeddings"): self.weight = self.add_weight( name="weight", @@ -106,12 +106,7 @@ def build(self, 
input_shape=None): initializer=get_initializer(initializer_range=self.initializer_range), ) - if self.built: - return - self.built = True - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) + super().build(input_shape) def create_position_ids_from_input_ids(self, input_ids): """ @@ -170,7 +165,6 @@ def __init__(self, config: MPNetConfig, **kwargs): activation="tanh", name="dense", ) - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: # We "pool" the model by simply taking the hidden state corresponding @@ -180,14 +174,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return pooled_output - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - class TFMPNetSelfAttention(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -217,7 +203,6 @@ def __init__(self, config, **kwargs): config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="o" ) self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) - self.config = config def transpose_for_scores(self, x, batch_size): # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] @@ -262,23 +247,6 @@ def call(self, hidden_states, attention_mask, head_mask, output_attentions, posi outputs = (o, attention_probs) if output_attentions else (o,) return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "q", None) is not None: - with tf.name_scope(self.q.name): - self.q.build(self.config.hidden_size) - if getattr(self, "k", None) is not None: - with tf.name_scope(self.k.name): - self.k.build(self.config.hidden_size) - if getattr(self, "v", None) is not None: - with tf.name_scope(self.v.name): - self.v.build(self.config.hidden_size) - if getattr(self, "o", None) is not None: - with tf.name_scope(self.o.name): - self.o.build(self.config.hidden_size) - class TFMPNetAttention(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -287,7 +255,6 @@ def __init__(self, config, **kwargs): self.attn = TFMPNetSelfAttention(config, name="attn") self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.config = config def prune_heads(self, heads): raise NotImplementedError @@ -300,17 +267,6 @@ def call(self, input_tensor, attention_mask, head_mask, output_attentions, posit outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "attn", None) is not None: - with tf.name_scope(self.attn.name): - self.attn.build(None) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->MPNet class TFMPNetIntermediate(tf.keras.layers.Layer): @@ -325,7 +281,6 @@ def __init__(self, config: MPNetConfig, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act - self.config = config def 
call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -333,14 +288,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->MPNet class TFMPNetOutput(tf.keras.layers.Layer): @@ -352,7 +299,6 @@ def __init__(self, config: MPNetConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -361,17 +307,6 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.intermediate_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - class TFMPNetLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -394,20 +329,6 @@ def call(self, hidden_states, attention_mask, head_mask, output_attentions, posi return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "attention", None) is not None: - with tf.name_scope(self.attention.name): - self.attention.build(None) - if getattr(self, "intermediate", None) is not None: - with tf.name_scope(self.intermediate.name): - self.intermediate.build(None) - if getattr(self, "out", None) is not None: - with tf.name_scope(self.out.name): - self.out.build(None) - class TFMPNetEncoder(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -423,7 +344,7 @@ def __init__(self, config, **kwargs): self.layer = [TFMPNetLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)] self.relative_attention_num_buckets = config.relative_attention_num_buckets - def build(self, input_shape=None): + def build(self, input_shape): with tf.name_scope("relative_attention_bias"): self.relative_attention_bias = self.add_weight( name="embeddings", @@ -431,14 +352,7 @@ def build(self, input_shape=None): initializer=get_initializer(self.initializer_range), ) - return - if self.built: - return - self.built = True - if getattr(self, "layer", None) is not None: - for layer in self.layer: - with tf.name_scope(layer.name): - layer.build(None) + return super().build(input_shape) def call( self, @@ -647,20 +561,6 @@ def call( attentions=encoder_outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - if getattr(self, "pooler", None) is not None: - with tf.name_scope(self.pooler.name): - self.pooler.build(None) - if getattr(self, "embeddings", None) is not None: - with tf.name_scope(self.embeddings.name): - self.embeddings.build(None) - MPNET_START_DOCSTRING = r""" @@ -793,14 +693,6 @@ def call( ) return outputs - def 
build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "mpnet", None) is not None: - with tf.name_scope(self.mpnet.name): - self.mpnet.build(None) - class TFMPNetLMHead(tf.keras.layers.Layer): """MPNet head for masked and permuted language modeling""" @@ -820,18 +712,10 @@ def __init__(self, config, input_embeddings, **kwargs): # an output-only bias for each token. self.decoder = input_embeddings - def build(self, input_shape=None): + def build(self, input_shape): self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "layer_norm", None) is not None: - with tf.name_scope(self.layer_norm.name): - self.layer_norm.build([None, None, self.config.hidden_size]) + super().build(input_shape) def get_output_embeddings(self): return self.decoder @@ -932,17 +816,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "mpnet", None) is not None: - with tf.name_scope(self.mpnet.name): - self.mpnet.build(None) - if getattr(self, "lm_head", None) is not None: - with tf.name_scope(self.lm_head.name): - self.lm_head.build(None) - class TFMPNetClassificationHead(tf.keras.layers.Layer): """Head for sentence-level classification tasks.""" @@ -959,7 +832,6 @@ def __init__(self, config, **kwargs): self.out_proj = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" ) - self.config = config def call(self, features, training=False): x = features[:, 0, :] # take token (equiv. 
to [CLS]) @@ -969,17 +841,6 @@ def call(self, features, training=False): x = self.out_proj(x) return x - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "out_proj", None) is not None: - with tf.name_scope(self.out_proj.name): - self.out_proj.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1052,17 +913,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "mpnet", None) is not None: - with tf.name_scope(self.mpnet.name): - self.mpnet.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(None) - @add_start_docstrings( """ @@ -1080,7 +930,6 @@ def __init__(self, config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @@ -1150,17 +999,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "mpnet", None) is not None: - with tf.name_scope(self.mpnet.name): - self.mpnet.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1181,7 +1019,6 @@ def __init__(self, config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1236,17 +1073,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "mpnet", None) is not None: - with tf.name_scope(self.mpnet.name): - self.mpnet.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1266,7 +1092,6 @@ def __init__(self, config, *inputs, **kwargs): self.qa_outputs = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1334,14 +1159,3 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "mpnet", None) is not None: - with tf.name_scope(self.mpnet.name): - self.mpnet.build(None) - if getattr(self, "qa_outputs", None) is not None: - with tf.name_scope(self.qa_outputs.name): - self.qa_outputs.build(self.config.hidden_size) diff --git a/src/transformers/models/openai/modeling_tf_openai.py b/src/transformers/models/openai/modeling_tf_openai.py index e5153b584811fa..2fd5dbc79769b7 100644 --- a/src/transformers/models/openai/modeling_tf_openai.py +++ b/src/transformers/models/openai/modeling_tf_openai.py @@ -153,17 
+153,6 @@ def call(self, x, attention_mask, head_mask, output_attentions, training=False): outputs = [a] + attn_outputs[1:] return outputs # a, (attentions) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "c_attn", None) is not None: - with tf.name_scope(self.c_attn.name): - self.c_attn.build(None) - if getattr(self, "c_proj", None) is not None: - with tf.name_scope(self.c_proj.name): - self.c_proj.build(None) - class TFMLP(tf.keras.layers.Layer): def __init__(self, n_state, config, **kwargs): @@ -180,17 +169,6 @@ def call(self, x, training=False): h2 = self.dropout(h2, training=training) return h2 - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "c_fc", None) is not None: - with tf.name_scope(self.c_fc.name): - self.c_fc.build(None) - if getattr(self, "c_proj", None) is not None: - with tf.name_scope(self.c_proj.name): - self.c_proj.build(None) - class TFBlock(tf.keras.layers.Layer): def __init__(self, config, scale=False, **kwargs): @@ -213,23 +191,6 @@ def call(self, x, attention_mask, head_mask, output_attentions, training=False): outputs = [h] + output_attn[1:] return outputs # x, (attentions) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "attn", None) is not None: - with tf.name_scope(self.attn.name): - self.attn.build(None) - if getattr(self, "ln_1", None) is not None: - with tf.name_scope(self.ln_1.name): - self.ln_1.build([None, None, self.nx]) - if getattr(self, "mlp", None) is not None: - with tf.name_scope(self.mlp.name): - self.mlp.build(None) - if getattr(self, "ln_2", None) is not None: - with tf.name_scope(self.ln_2.name): - self.ln_2.build([None, None, self.nx]) - @keras_serializable class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): @@ -253,7 +214,7 @@ def __init__(self, config, *inputs, **kwargs): self.drop = tf.keras.layers.Dropout(config.embd_pdrop) self.h = [TFBlock(config, scale=True, name=f"h_._{i}") for i in range(config.n_layer)] - def build(self, input_shape=None): + def build(self, input_shape): with tf.name_scope("positions_embed"): self.positions_embed = self.add_weight( name="embeddings", @@ -261,16 +222,7 @@ def build(self, input_shape=None): initializer=get_initializer(self.initializer_range), ) - if self.built: - return - self.built = True - if getattr(self, "tokens_embed", None) is not None: - with tf.name_scope(self.tokens_embed.name): - self.tokens_embed.build(None) - if getattr(self, "h", None) is not None: - for layer in self.h: - with tf.name_scope(layer.name): - layer.build(None) + super().build(input_shape) def get_input_embeddings(self): return self.tokens_embed @@ -577,14 +529,6 @@ def call( ) return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "transformer", None) is not None: - with tf.name_scope(self.transformer.name): - self.transformer.build(None) - @add_start_docstrings( """ @@ -670,14 +614,6 @@ def call( def prepare_inputs_for_generation(self, inputs, **kwargs): return {"input_ids": inputs} - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "transformer", None) is not None: - with tf.name_scope(self.transformer.name): - self.transformer.build(None) - @add_start_docstrings( """ @@ -799,17 +735,6 @@ def input_signature(self): "mc_token_ids": tf.TensorSpec((None, None), tf.int32, name="token_type_ids"), } - def build(self, input_shape=None): - if self.built: - return - 
self.built = True - if getattr(self, "transformer", None) is not None: - with tf.name_scope(self.transformer.name): - self.transformer.build(None) - if getattr(self, "multiple_choice_head", None) is not None: - with tf.name_scope(self.multiple_choice_head.name): - self.multiple_choice_head.build(None) - @add_start_docstrings( """ @@ -837,7 +762,6 @@ def __init__(self, config, *inputs, **kwargs): use_bias=False, ) self.transformer = TFOpenAIGPTMainLayer(config, name="transformer") - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(OPENAI_GPT_INPUTS_DOCSTRING) @@ -925,14 +849,3 @@ def call( hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "score", None) is not None: - with tf.name_scope(self.score.name): - self.score.build(self.config.n_embd) - if getattr(self, "transformer", None) is not None: - with tf.name_scope(self.transformer.name): - self.transformer.build(None) diff --git a/src/transformers/models/opt/modeling_tf_opt.py b/src/transformers/models/opt/modeling_tf_opt.py index 7e6914d236ad39..6c48d6e629273c 100644 --- a/src/transformers/models/opt/modeling_tf_opt.py +++ b/src/transformers/models/opt/modeling_tf_opt.py @@ -268,23 +268,6 @@ def call( return attn_output, attn_weights, past_key_value - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "k_proj", None) is not None: - with tf.name_scope(self.k_proj.name): - self.k_proj.build(self.embed_dim) - if getattr(self, "q_proj", None) is not None: - with tf.name_scope(self.q_proj.name): - self.q_proj.build(self.embed_dim) - if getattr(self, "v_proj", None) is not None: - with tf.name_scope(self.v_proj.name): - self.v_proj.build(self.embed_dim) - if getattr(self, "out_proj", None) is not None: - with tf.name_scope(self.out_proj.name): - self.out_proj.build(self.embed_dim) - class TFOPTDecoderLayer(tf.keras.layers.Layer): def __init__(self, config: OPTConfig, **kwargs): @@ -305,7 +288,6 @@ def __init__(self, config: OPTConfig, **kwargs): self.fc1 = tf.keras.layers.Dense(config.ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") - self.config = config def call( self, @@ -372,26 +354,6 @@ def call( return (hidden_states, self_attn_weights, present_key_value) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attn_layer_norm", None) is not None: - with tf.name_scope(self.self_attn_layer_norm.name): - self.self_attn_layer_norm.build([None, None, self.embed_dim]) - if getattr(self, "fc1", None) is not None: - with tf.name_scope(self.fc1.name): - self.fc1.build(self.embed_dim) - if getattr(self, "fc2", None) is not None: - with tf.name_scope(self.fc2.name): - self.fc2.build(self.config.ffn_dim) - if getattr(self, "final_layer_norm", None) is not None: - with tf.name_scope(self.final_layer_norm.name): - self.final_layer_norm.build([None, None, self.embed_dim]) - if getattr(self, "self_attn", None) is not None: - with tf.name_scope(self.self_attn.name): - self.self_attn.build(None) # TODO Matt might be wrong - OPT_START_DOCSTRING = r""" This model inherits from [`TFPreTrainedModel`]. 
Check the superclass documentation for the generic methods the @@ -734,21 +696,6 @@ def call( attentions=all_self_attns, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embed_tokens", None) is not None: - with tf.name_scope(self.embed_tokens.name): - self.embed_tokens.build(None) - if getattr(self, "embed_positions", None) is not None: - with tf.name_scope(self.embed_positions.name): - self.embed_positions.build(None) - if getattr(self, "layers", None) is not None: - for layer in self.layers: - with tf.name_scope(layer.name): - layer.build(None) - @keras_serializable class TFOPTMainLayer(tf.keras.layers.Layer): @@ -810,14 +757,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "decoder", None) is not None: - with tf.name_scope(self.decoder.name): - self.decoder.build(None) - @add_start_docstrings( "The bare TF OPT Model outputting raw hidden-states without any specific head on top.", @@ -902,14 +841,6 @@ def serving_output(self, output): attentions=attns, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "model", None) is not None: - with tf.name_scope(self.model.name): - self.model.build(None) - @add_start_docstrings( """ @@ -1075,11 +1006,3 @@ def serving_output(self, output): loss=output.loss, logits=output.logits, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "model", None) is not None: - with tf.name_scope(self.model.name): - self.model.build(None) diff --git a/src/transformers/models/pegasus/modeling_tf_pegasus.py b/src/transformers/models/pegasus/modeling_tf_pegasus.py index c18bc69e4b3797..52171b884ca825 100644 --- a/src/transformers/models/pegasus/modeling_tf_pegasus.py +++ b/src/transformers/models/pegasus/modeling_tf_pegasus.py @@ -330,23 +330,6 @@ def call( return attn_output, attn_weights, past_key_value - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "k_proj", None) is not None: - with tf.name_scope(self.k_proj.name): - self.k_proj.build(self.embed_dim) - if getattr(self, "q_proj", None) is not None: - with tf.name_scope(self.q_proj.name): - self.q_proj.build(self.embed_dim) - if getattr(self, "v_proj", None) is not None: - with tf.name_scope(self.v_proj.name): - self.v_proj.build(self.embed_dim) - if getattr(self, "out_proj", None) is not None: - with tf.name_scope(self.out_proj.name): - self.out_proj.build(self.embed_dim) - # Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartEncoderLayer with MBart->Pegasus class TFPegasusEncoderLayer(tf.keras.layers.Layer): @@ -363,7 +346,6 @@ def __init__(self, config: PegasusConfig, **kwargs): self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") - self.config = config def call( self, @@ -405,26 +387,6 @@ def call( return hidden_states, self_attn_weights - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attn", None) is not None: - with tf.name_scope(self.self_attn.name): - self.self_attn.build(None) - if getattr(self, "self_attn_layer_norm", None) is not None: - with tf.name_scope(self.self_attn_layer_norm.name): - self.self_attn_layer_norm.build([None, None, self.embed_dim]) - 
if getattr(self, "fc1", None) is not None: - with tf.name_scope(self.fc1.name): - self.fc1.build(self.embed_dim) - if getattr(self, "fc2", None) is not None: - with tf.name_scope(self.fc2.name): - self.fc2.build(self.config.encoder_ffn_dim) - if getattr(self, "final_layer_norm", None) is not None: - with tf.name_scope(self.final_layer_norm.name): - self.final_layer_norm.build([None, None, self.embed_dim]) - # Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartDecoderLayer with MBart->Pegasus class TFPegasusDecoderLayer(tf.keras.layers.Layer): @@ -454,7 +416,6 @@ def __init__(self, config: PegasusConfig, **kwargs): self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") - self.config = config def call( self, @@ -536,32 +497,6 @@ def call( present_key_value, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attn", None) is not None: - with tf.name_scope(self.self_attn.name): - self.self_attn.build(None) - if getattr(self, "self_attn_layer_norm", None) is not None: - with tf.name_scope(self.self_attn_layer_norm.name): - self.self_attn_layer_norm.build([None, None, self.embed_dim]) - if getattr(self, "encoder_attn", None) is not None: - with tf.name_scope(self.encoder_attn.name): - self.encoder_attn.build(None) - if getattr(self, "encoder_attn_layer_norm", None) is not None: - with tf.name_scope(self.encoder_attn_layer_norm.name): - self.encoder_attn_layer_norm.build([None, None, self.embed_dim]) - if getattr(self, "fc1", None) is not None: - with tf.name_scope(self.fc1.name): - self.fc1.build(self.embed_dim) - if getattr(self, "fc2", None) is not None: - with tf.name_scope(self.fc2.name): - self.fc2.build(self.config.decoder_ffn_dim) - if getattr(self, "final_layer_norm", None) is not None: - with tf.name_scope(self.final_layer_norm.name): - self.final_layer_norm.build([None, None, self.embed_dim]) - class TFPegasusPreTrainedModel(TFPreTrainedModel): config_class = PegasusConfig @@ -877,21 +812,6 @@ def call( last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embed_positions", None) is not None: - with tf.name_scope(self.embed_positions.name): - self.embed_positions.build(None) - if getattr(self, "layer_norm", None) is not None: - with tf.name_scope(self.layer_norm.name): - self.layer_norm.build([None, None, self.config.d_model]) - if getattr(self, "layers", None) is not None: - for layer in self.layers: - with tf.name_scope(layer.name): - layer.build(None) - @keras_serializable class TFPegasusDecoder(tf.keras.layers.Layer): @@ -1127,21 +1047,6 @@ def call( cross_attentions=all_cross_attns, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embed_positions", None) is not None: - with tf.name_scope(self.embed_positions.name): - self.embed_positions.build(None) - if getattr(self, "layer_norm", None) is not None: - with tf.name_scope(self.layer_norm.name): - self.layer_norm.build([None, None, self.config.d_model]) - if getattr(self, "layers", None) is not None: - for layer in self.layers: - with tf.name_scope(layer.name): - layer.build(None) - @keras_serializable class TFPegasusMainLayer(tf.keras.layers.Layer): @@ -1253,20 +1158,6 @@ def call( 
encoder_attentions=encoder_outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - if getattr(self, "decoder", None) is not None: - with tf.name_scope(self.decoder.name): - self.decoder.build(None) - if getattr(self, "shared", None) is not None: - with tf.name_scope(self.shared.name): - self.shared.build(None) # TODO Matt might be wrong - @add_start_docstrings( "The bare PEGASUS Model outputting raw hidden-states without any specific head on top.", @@ -1354,14 +1245,6 @@ def serving_output(self, output): encoder_attentions=enc_attns, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "model", None) is not None: - with tf.name_scope(self.model.name): - self.model.build(None) - # Copied from transformers.models.bart.modeling_tf_bart.BiasLayer class BiasLayer(tf.keras.layers.Layer): @@ -1569,14 +1452,3 @@ def prepare_inputs_for_generation( def prepare_decoder_input_ids_from_labels(self, labels: tf.Tensor): return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "model", None) is not None: - with tf.name_scope(self.model.name): - self.model.build(None) - if getattr(self, "bias_layer", None) is not None: - with tf.name_scope(self.bias_layer.name): - self.bias_layer.build(None) # TODO Matt might be wrong diff --git a/src/transformers/models/rag/modeling_tf_rag.py b/src/transformers/models/rag/modeling_tf_rag.py index 002fcffbccf307..d1151bcd5a64b3 100644 --- a/src/transformers/models/rag/modeling_tf_rag.py +++ b/src/transformers/models/rag/modeling_tf_rag.py @@ -1292,14 +1292,6 @@ def hf_compute_loss(self, labels, y_pred, smooth_epsilon=0.0, from_logits=True, return loss - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "rag", None) is not None: - with tf.name_scope(self.rag.name): - self.rag.build(None) - @add_start_docstrings_to_model_forward( """ @@ -1751,11 +1743,3 @@ def _cat_and_pad(tensors, pad_token_id): output = tf.convert_to_tensor(output) return tf.cast(output, tensors[0][0][0].dtype) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "rag", None) is not None: - with tf.name_scope(self.rag.name): - self.rag.build(None) diff --git a/src/transformers/models/regnet/modeling_tf_regnet.py b/src/transformers/models/regnet/modeling_tf_regnet.py index e66d4811a14143..f847d00fa09a3b 100644 --- a/src/transformers/models/regnet/modeling_tf_regnet.py +++ b/src/transformers/models/regnet/modeling_tf_regnet.py @@ -84,17 +84,6 @@ def call(self, hidden_state): hidden_state = self.activation(hidden_state) return hidden_state - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "convolution", None) is not None: - with tf.name_scope(self.convolution.name): - self.convolution.build(self.in_channels) - if getattr(self, "normalization", None) is not None: - with tf.name_scope(self.normalization.name): - self.normalization.build(None) - class TFRegNetEmbeddings(tf.keras.layers.Layer): """ @@ -127,14 +116,6 @@ def call(self, pixel_values): hidden_state = self.embedder(pixel_values) return hidden_state - def build(self, input_shape=None): - if self.built: - return - self.built = True - if 
getattr(self, "embedder", None) is not None: - with tf.name_scope(self.embedder.name): - self.embedder.build(None) - class TFRegNetShortCut(tf.keras.layers.Layer): """ @@ -153,17 +134,6 @@ def __init__(self, in_channels: int, out_channels: int, stride: int = 2, **kwarg def call(self, inputs: tf.Tensor, training: bool = False) -> tf.Tensor: return self.normalization(self.convolution(inputs), training=training) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "convolution", None) is not None: - with tf.name_scope(self.convolution.name): - self.convolution.build(self.in_channels) - if getattr(self, "normalization", None) is not None: - with tf.name_scope(self.normalization.name): - self.normalization.build(None) - class TFRegNetSELayer(tf.keras.layers.Layer): """ @@ -186,18 +156,6 @@ def call(self, hidden_state): hidden_state = hidden_state * pooled return hidden_state - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "pooler", None) is not None: - with tf.name_scope(self.pooler.name): - self.pooler.build(None) - if getattr(self, "attention", None) is not None: - for layer in self.attention: - with tf.name_scope(layer.name): - layer.build(None) - class TFRegNetXLayer(tf.keras.layers.Layer): """ @@ -232,18 +190,6 @@ def call(self, hidden_state): hidden_state = self.activation(hidden_state) return hidden_state - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "shortcut", None) is not None: - with tf.name_scope(self.shortcut.name): - self.shortcut.build(None) - if getattr(self, "layers", None) is not None: - for layer in self.layers: - with tf.name_scope(layer.name): - layer.build(None) - class TFRegNetYLayer(tf.keras.layers.Layer): """ @@ -278,18 +224,6 @@ def call(self, hidden_state): hidden_state = self.activation(hidden_state) return hidden_state - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "shortcut", None) is not None: - with tf.name_scope(self.shortcut.name): - self.shortcut.build(None) - if getattr(self, "layers", None) is not None: - for layer in self.layers: - with tf.name_scope(layer.name): - layer.build(None) - class TFRegNetStage(tf.keras.layers.Layer): """ @@ -313,15 +247,6 @@ def call(self, hidden_state): hidden_state = layer_module(hidden_state) return hidden_state - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "layers", None) is not None: - for layer in self.layers: - with tf.name_scope(layer.name): - layer.build(None) - class TFRegNetEncoder(tf.keras.layers.Layer): def __init__(self, config: RegNetConfig, **kwargs): @@ -412,20 +337,6 @@ def call( hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embedder", None) is not None: - with tf.name_scope(self.embedder.name): - self.embedder.build(None) - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - if getattr(self, "pooler", None) is not None: - with tf.name_scope(self.pooler.name): - self.pooler.build(None) - class TFRegNetPreTrainedModel(TFPreTrainedModel): """ @@ -511,14 +422,6 @@ def call( hidden_states=outputs.hidden_states, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "regnet", None) is not None: - 
with tf.name_scope(self.regnet.name): - self.regnet.build(None) - @add_start_docstrings( """ @@ -580,15 +483,3 @@ def call( return ((loss,) + output) if loss is not None else output return TFSequenceClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "regnet", None) is not None: - with tf.name_scope(self.regnet.name): - self.regnet.build(None) - if getattr(self, "classifier", None) is not None: - for layer in self.classifier: - with tf.name_scope(layer.name): - layer.build(None) diff --git a/src/transformers/models/rembert/modeling_tf_rembert.py b/src/transformers/models/rembert/modeling_tf_rembert.py index 62a1ef7670d12a..1595fd8118debd 100644 --- a/src/transformers/models/rembert/modeling_tf_rembert.py +++ b/src/transformers/models/rembert/modeling_tf_rembert.py @@ -80,7 +80,7 @@ def __init__(self, config: RemBertConfig, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def build(self, input_shape=None): + def build(self, input_shape: tf.TensorShape): with tf.name_scope("word_embeddings"): self.weight = self.add_weight( name="weight", @@ -102,12 +102,7 @@ def build(self, input_shape=None): initializer=get_initializer(self.initializer_range), ) - if self.built: - return - self.built = True - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.input_embedding_size]) + super().build(input_shape) def call( self, @@ -177,7 +172,6 @@ def __init__(self, config: RemBertConfig, **kwargs): self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.is_decoder = config.is_decoder - self.config = config def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] @@ -267,20 +261,6 @@ def call( outputs = outputs + (past_key_value,) return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "query", None) is not None: - with tf.name_scope(self.query.name): - self.query.build(self.config.hidden_size) - if getattr(self, "key", None) is not None: - with tf.name_scope(self.key.name): - self.key.build(self.config.hidden_size) - if getattr(self, "value", None) is not None: - with tf.name_scope(self.value.name): - self.value.build(self.config.hidden_size) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->RemBert class TFRemBertSelfOutput(tf.keras.layers.Layer): @@ -292,7 +272,6 @@ def __init__(self, config: RemBertConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -301,17 +280,6 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - 
self.dense.build(self.config.hidden_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->RemBert class TFRemBertAttention(tf.keras.layers.Layer): @@ -353,17 +321,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attention", None) is not None: - with tf.name_scope(self.self_attention.name): - self.self_attention.build(None) - if getattr(self, "dense_output", None) is not None: - with tf.name_scope(self.dense_output.name): - self.dense_output.build(None) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->RemBert class TFRemBertIntermediate(tf.keras.layers.Layer): @@ -378,7 +335,6 @@ def __init__(self, config: RemBertConfig, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -386,14 +342,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->RemBert class TFRemBertOutput(tf.keras.layers.Layer): @@ -405,7 +353,6 @@ def __init__(self, config: RemBertConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -414,17 +361,6 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.intermediate_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->RemBert class TFRemBertLayer(tf.keras.layers.Layer): @@ -512,20 +448,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "attention", None) is not None: - with tf.name_scope(self.attention.name): - self.attention.build(None) - if getattr(self, "intermediate", None) is not None: - with tf.name_scope(self.intermediate.name): - self.intermediate.build(None) - if getattr(self, "bert_output", None) is not None: - with tf.name_scope(self.bert_output.name): - self.bert_output.build(None) - class TFRemBertEncoder(tf.keras.layers.Layer): def __init__(self, config: RemBertConfig, **kwargs): @@ -602,18 +524,6 @@ def call( cross_attentions=all_cross_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embedding_hidden_mapping_in", None) is not None: - with 
tf.name_scope(self.embedding_hidden_mapping_in.name): - self.embedding_hidden_mapping_in.build(self.config.input_embedding_size) - if getattr(self, "layer", None) is not None: - for layer in self.layer: - with tf.name_scope(layer.name): - layer.build(None) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->RemBert class TFRemBertPooler(tf.keras.layers.Layer): @@ -626,7 +536,6 @@ def __init__(self, config: RemBertConfig, **kwargs): activation="tanh", name="dense", ) - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: # We "pool" the model by simply taking the hidden state corresponding @@ -636,14 +545,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return pooled_output - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - class TFRemBertLMPredictionHead(tf.keras.layers.Layer): def __init__(self, config: RemBertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): @@ -661,7 +562,7 @@ def __init__(self, config: RemBertConfig, input_embeddings: tf.keras.layers.Laye self.activation = config.hidden_act self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - def build(self, input_shape=None): + def build(self, input_shape: tf.TensorShape): self.decoder = self.add_weight( name="decoder/weight", shape=[self.config.vocab_size, self.output_embedding_size], @@ -671,15 +572,7 @@ def build(self, input_shape=None): shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias" ) - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.output_embedding_size]) + super().build(input_shape) def get_output_embeddings(self) -> tf.keras.layers.Layer: return self @@ -907,20 +800,6 @@ def call( cross_attentions=encoder_outputs.cross_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embeddings", None) is not None: - with tf.name_scope(self.embeddings.name): - self.embeddings.build(None) - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - if getattr(self, "pooler", None) is not None: - with tf.name_scope(self.pooler.name): - self.pooler.build(None) - class TFRemBertPreTrainedModel(TFPreTrainedModel): """ @@ -1103,14 +982,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "rembert", None) is not None: - with tf.name_scope(self.rembert.name): - self.rembert.build(None) - @add_start_docstrings("""RemBERT Model with a `language modeling` head on top.""", REMBERT_START_DOCSTRING) class TFRemBertForMaskedLM(TFRemBertPreTrainedModel, TFMaskedLanguageModelingLoss): @@ -1183,17 +1054,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "rembert", None) is not None: - with tf.name_scope(self.rembert.name): - self.rembert.build(None) - if getattr(self, "mlm", None) is not None: - with tf.name_scope(self.mlm.name): - self.mlm.build(None) # TODO Matt might be 
wrong - @add_start_docstrings( """RemBERT Model with a `language modeling` head on top for CLM fine-tuning.""", REMBERT_START_DOCSTRING @@ -1310,17 +1170,6 @@ def call( cross_attentions=outputs.cross_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "rembert", None) is not None: - with tf.name_scope(self.rembert.name): - self.rembert.build(None) - if getattr(self, "mlm", None) is not None: - with tf.name_scope(self.mlm.name): - self.mlm.build(None) # TODO Matt might be wrong - @add_start_docstrings( """ @@ -1341,7 +1190,6 @@ def __init__(self, config: RemBertConfig, *inputs, **kwargs): kernel_initializer=get_initializer(config.initializer_range), name="classifier", ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(REMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1398,17 +1246,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "rembert", None) is not None: - with tf.name_scope(self.rembert.name): - self.rembert.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1426,7 +1263,6 @@ def __init__(self, config: RemBertConfig, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(REMBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @@ -1506,17 +1342,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "rembert", None) is not None: - with tf.name_scope(self.rembert.name): - self.rembert.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1536,7 +1361,6 @@ def __init__(self, config: RemBertConfig, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(REMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1591,17 +1415,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "rembert", None) is not None: - with tf.name_scope(self.rembert.name): - self.rembert.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1620,7 +1433,6 @@ def __init__(self, config: RemBertConfig, *inputs, **kwargs): self.qa_outputs = tf.keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(REMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1689,14 +1501,3 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "rembert", None) is not 
None: - with tf.name_scope(self.rembert.name): - self.rembert.build(None) - if getattr(self, "qa_outputs", None) is not None: - with tf.name_scope(self.qa_outputs.name): - self.qa_outputs.build(self.config.hidden_size) diff --git a/src/transformers/models/resnet/modeling_tf_resnet.py b/src/transformers/models/resnet/modeling_tf_resnet.py index 01c52edd4cd5d1..54d0edb399452f 100644 --- a/src/transformers/models/resnet/modeling_tf_resnet.py +++ b/src/transformers/models/resnet/modeling_tf_resnet.py @@ -82,17 +82,6 @@ def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_state = self.activation(hidden_state) return hidden_state - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "conv", None) is not None: - with tf.name_scope(self.conv.name): - self.conv.build(self.in_channels) - if getattr(self, "normalization", None) is not None: - with tf.name_scope(self.normalization.name): - self.normalization.build(None) - class TFResNetEmbeddings(tf.keras.layers.Layer): """ @@ -124,17 +113,6 @@ def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_state = self.pooler(hidden_state) return hidden_state - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embedder", None) is not None: - with tf.name_scope(self.embedder.name): - self.embedder.build(None) - if getattr(self, "pooler", None) is not None: - with tf.name_scope(self.pooler.name): - self.pooler.build(None) - class TFResNetShortCut(tf.keras.layers.Layer): """ @@ -157,17 +135,6 @@ def call(self, x: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_state = self.normalization(hidden_state, training=training) return hidden_state - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "convolution", None) is not None: - with tf.name_scope(self.convolution.name): - self.convolution.build(self.in_channels) - if getattr(self, "normalization", None) is not None: - with tf.name_scope(self.normalization.name): - self.normalization.build(None) - class TFResNetBasicLayer(tf.keras.layers.Layer): """ @@ -197,20 +164,6 @@ def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_state = self.activation(hidden_state) return hidden_state - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "shortcut", None) is not None: - with tf.name_scope(self.shortcut.name): - self.shortcut.build(None) - if getattr(self, "conv1", None) is not None: - with tf.name_scope(self.conv1.name): - self.conv1.build(None) # TODO Matt might be wrong - if getattr(self, "conv2", None) is not None: - with tf.name_scope(self.conv2.name): - self.conv2.build(None) # TODO Matt might be wrong - class TFResNetBottleNeckLayer(tf.keras.layers.Layer): """ @@ -252,23 +205,6 @@ def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_state = self.activation(hidden_state) return hidden_state - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "shortcut", None) is not None: - with tf.name_scope(self.shortcut.name): - self.shortcut.build(None) - if getattr(self, "conv0", None) is not None: - with tf.name_scope(self.conv0.name): - self.conv0.build(None) # TODO Matt might be wrong - if getattr(self, "conv1", None) is not None: - with tf.name_scope(self.conv1.name): - self.conv1.build(None) # TODO Matt might be wrong - if getattr(self, "conv2", 
None) is not None: - with tf.name_scope(self.conv2.name): - self.conv2.build(None) # TODO Matt might be wrong - class TFResNetStage(tf.keras.layers.Layer): """ @@ -337,15 +273,6 @@ def call( return TFBaseModelOutputWithNoAttention(last_hidden_state=hidden_state, hidden_states=hidden_states) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "stages", None) is not None: - for layer in self.stages: - with tf.name_scope(layer.name): - layer.build(None) - class TFResNetPreTrainedModel(TFPreTrainedModel): """ @@ -446,17 +373,6 @@ def call( hidden_states=hidden_states, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embedder", None) is not None: - with tf.name_scope(self.embedder.name): - self.embedder.build(None) - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - @add_start_docstrings( "The bare ResNet model outputting raw features without any specific head on top.", @@ -496,14 +412,6 @@ def call( ) return resnet_outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "resnet", None) is not None: - with tf.name_scope(self.resnet.name): - self.resnet.build(None) - @add_start_docstrings( """ @@ -567,14 +475,3 @@ def call( return (loss,) + output if loss is not None else output return TFImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "resnet", None) is not None: - with tf.name_scope(self.resnet.name): - self.resnet.build(None) - if getattr(self, "classifier_layer", None) is not None: - with tf.name_scope(self.classifier_layer.name): - self.classifier_layer.build(None) # TODO Matt might be wrong diff --git a/src/transformers/models/roberta/modeling_tf_roberta.py b/src/transformers/models/roberta/modeling_tf_roberta.py index 5195005de8e60e..9b6c491d2761e6 100644 --- a/src/transformers/models/roberta/modeling_tf_roberta.py +++ b/src/transformers/models/roberta/modeling_tf_roberta.py @@ -89,7 +89,7 @@ def __init__(self, config, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def build(self, input_shape=None): + def build(self, input_shape: tf.TensorShape): with tf.name_scope("word_embeddings"): self.weight = self.add_weight( name="weight", @@ -111,12 +111,7 @@ def build(self, input_shape=None): initializer=get_initializer(self.initializer_range), ) - if self.built: - return - self.built = True - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) + super().build(input_shape) def create_position_ids_from_input_ids(self, input_ids, past_key_values_length=0): """ @@ -189,7 +184,6 @@ def __init__(self, config: RobertaConfig, **kwargs): activation="tanh", name="dense", ) - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: # We "pool" the model by simply taking the hidden state corresponding @@ -199,14 +193,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return pooled_output - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - 
self.dense.build(self.config.hidden_size) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Roberta class TFRobertaSelfAttention(tf.keras.layers.Layer): @@ -236,7 +222,6 @@ def __init__(self, config: RobertaConfig, **kwargs): self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.is_decoder = config.is_decoder - self.config = config def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] @@ -326,20 +311,6 @@ def call( outputs = outputs + (past_key_value,) return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "query", None) is not None: - with tf.name_scope(self.query.name): - self.query.build(self.config.hidden_size) - if getattr(self, "key", None) is not None: - with tf.name_scope(self.key.name): - self.key.build(self.config.hidden_size) - if getattr(self, "value", None) is not None: - with tf.name_scope(self.value.name): - self.value.build(self.config.hidden_size) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Roberta class TFRobertaSelfOutput(tf.keras.layers.Layer): @@ -351,7 +322,6 @@ def __init__(self, config: RobertaConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -360,17 +330,6 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Roberta class TFRobertaAttention(tf.keras.layers.Layer): @@ -412,17 +371,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attention", None) is not None: - with tf.name_scope(self.self_attention.name): - self.self_attention.build(None) - if getattr(self, "dense_output", None) is not None: - with tf.name_scope(self.dense_output.name): - self.dense_output.build(None) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Roberta class TFRobertaIntermediate(tf.keras.layers.Layer): @@ -437,7 +385,6 @@ def __init__(self, config: RobertaConfig, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -445,14 +392,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - # Copied from 
transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Roberta class TFRobertaOutput(tf.keras.layers.Layer): @@ -464,7 +403,6 @@ def __init__(self, config: RobertaConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -473,17 +411,6 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.intermediate_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Roberta class TFRobertaLayer(tf.keras.layers.Layer): @@ -571,20 +498,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "attention", None) is not None: - with tf.name_scope(self.attention.name): - self.attention.build(None) - if getattr(self, "intermediate", None) is not None: - with tf.name_scope(self.intermediate.name): - self.intermediate.build(None) - if getattr(self, "bert_output", None) is not None: - with tf.name_scope(self.bert_output.name): - self.bert_output.build(None) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Roberta class TFRobertaEncoder(tf.keras.layers.Layer): @@ -655,15 +568,6 @@ def call( cross_attentions=all_cross_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "layer", None) is not None: - for layer in self.layer: - with tf.name_scope(layer.name): - layer.build(None) - @keras_serializable class TFRobertaMainLayer(tf.keras.layers.Layer): @@ -861,20 +765,6 @@ def call( cross_attentions=encoder_outputs.cross_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - if getattr(self, "pooler", None) is not None: - with tf.name_scope(self.pooler.name): - self.pooler.build(None) - if getattr(self, "embeddings", None) is not None: - with tf.name_scope(self.embeddings.name): - self.embeddings.build(None) - class TFRobertaPreTrainedModel(TFPreTrainedModel): """ @@ -1056,14 +946,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "roberta", None) is not None: - with tf.name_scope(self.roberta.name): - self.roberta.build(None) - class TFRobertaLMHead(tf.keras.layers.Layer): """Roberta Head for masked language modeling.""" @@ -1083,18 +965,10 @@ def __init__(self, config, input_embeddings, **kwargs): # an output-only bias for each token. 
self.decoder = input_embeddings - def build(self, input_shape=None): + def build(self, input_shape): self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "layer_norm", None) is not None: - with tf.name_scope(self.layer_norm.name): - self.layer_norm.build([None, None, self.config.hidden_size]) + super().build(input_shape) def get_output_embeddings(self): return self.decoder @@ -1202,17 +1076,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "roberta", None) is not None: - with tf.name_scope(self.roberta.name): - self.roberta.build(None) - if getattr(self, "lm_head", None) is not None: - with tf.name_scope(self.lm_head.name): - self.lm_head.build(None) - class TFRobertaForCausalLM(TFRobertaPreTrainedModel, TFCausalLanguageModelingLoss): # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model @@ -1335,17 +1198,6 @@ def call( cross_attentions=outputs.cross_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "roberta", None) is not None: - with tf.name_scope(self.roberta.name): - self.roberta.build(None) - if getattr(self, "lm_head", None) is not None: - with tf.name_scope(self.lm_head.name): - self.lm_head.build(None) - class TFRobertaClassificationHead(tf.keras.layers.Layer): """Head for sentence-level classification tasks.""" @@ -1365,7 +1217,6 @@ def __init__(self, config, **kwargs): self.out_proj = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" ) - self.config = config def call(self, features, training=False): x = features[:, 0, :] # take token (equiv. 
to [CLS]) @@ -1375,17 +1226,6 @@ def call(self, features, training=False): x = self.out_proj(x) return x - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "out_proj", None) is not None: - with tf.name_scope(self.out_proj.name): - self.out_proj.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1462,17 +1302,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "roberta", None) is not None: - with tf.name_scope(self.roberta.name): - self.roberta.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(None) - @add_start_docstrings( """ @@ -1494,7 +1323,6 @@ def __init__(self, config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @@ -1564,17 +1392,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "roberta", None) is not None: - with tf.name_scope(self.roberta.name): - self.roberta.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1600,7 +1417,6 @@ def __init__(self, config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1659,17 +1475,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "roberta", None) is not None: - with tf.name_scope(self.roberta.name): - self.roberta.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1690,7 +1495,6 @@ def __init__(self, config, *inputs, **kwargs): self.qa_outputs = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1762,14 +1566,3 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "roberta", None) is not None: - with tf.name_scope(self.roberta.name): - self.roberta.build(None) - if getattr(self, "qa_outputs", None) is not None: - with tf.name_scope(self.qa_outputs.name): - self.qa_outputs.build(self.config.hidden_size) diff --git a/src/transformers/models/roberta_prelayernorm/modeling_tf_roberta_prelayernorm.py b/src/transformers/models/roberta_prelayernorm/modeling_tf_roberta_prelayernorm.py index 983dee75a6a2ba..2f98a5f5d0cff4 100644 --- 
a/src/transformers/models/roberta_prelayernorm/modeling_tf_roberta_prelayernorm.py +++ b/src/transformers/models/roberta_prelayernorm/modeling_tf_roberta_prelayernorm.py @@ -94,7 +94,7 @@ def __init__(self, config, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def build(self, input_shape=None): + def build(self, input_shape: tf.TensorShape): with tf.name_scope("word_embeddings"): self.weight = self.add_weight( name="weight", @@ -116,12 +116,7 @@ def build(self, input_shape=None): initializer=get_initializer(self.initializer_range), ) - if self.built: - return - self.built = True - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) + super().build(input_shape) def create_position_ids_from_input_ids(self, input_ids, past_key_values_length=0): """ @@ -194,7 +189,6 @@ def __init__(self, config: RobertaPreLayerNormConfig, **kwargs): activation="tanh", name="dense", ) - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: # We "pool" the model by simply taking the hidden state corresponding @@ -204,14 +198,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return pooled_output - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->RobertaPreLayerNorm class TFRobertaPreLayerNormSelfAttention(tf.keras.layers.Layer): @@ -241,7 +227,6 @@ def __init__(self, config: RobertaPreLayerNormConfig, **kwargs): self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.is_decoder = config.is_decoder - self.config = config def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] @@ -331,20 +316,6 @@ def call( outputs = outputs + (past_key_value,) return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "query", None) is not None: - with tf.name_scope(self.query.name): - self.query.build(self.config.hidden_size) - if getattr(self, "key", None) is not None: - with tf.name_scope(self.key.name): - self.key.build(self.config.hidden_size) - if getattr(self, "value", None) is not None: - with tf.name_scope(self.value.name): - self.value.build(self.config.hidden_size) - class TFRobertaPreLayerNormSelfOutput(tf.keras.layers.Layer): def __init__(self, config: RobertaPreLayerNormConfig, **kwargs): @@ -354,7 +325,6 @@ def __init__(self, config: RobertaPreLayerNormConfig, **kwargs): units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -363,14 +333,6 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - 
with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - class TFRobertaPreLayerNormAttention(tf.keras.layers.Layer): def __init__(self, config: RobertaPreLayerNormConfig, **kwargs): @@ -379,7 +341,6 @@ def __init__(self, config: RobertaPreLayerNormConfig, **kwargs): self.self_attention = TFRobertaPreLayerNormSelfAttention(config, name="self") self.dense_output = TFRobertaPreLayerNormSelfOutput(config, name="output") self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.config = config # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention.prune_heads def prune_heads(self, heads): @@ -415,20 +376,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attention", None) is not None: - with tf.name_scope(self.self_attention.name): - self.self_attention.build(None) - if getattr(self, "dense_output", None) is not None: - with tf.name_scope(self.dense_output.name): - self.dense_output.build(None) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - class TFRobertaPreLayerNormIntermediate(tf.keras.layers.Layer): def __init__(self, config: RobertaPreLayerNormConfig, **kwargs): @@ -443,7 +390,6 @@ def __init__(self, config: RobertaPreLayerNormConfig, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.LayerNorm(inputs=hidden_states) @@ -452,17 +398,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - class TFRobertaPreLayerNormOutput(tf.keras.layers.Layer): def __init__(self, config: RobertaPreLayerNormConfig, **kwargs): @@ -472,7 +407,6 @@ def __init__(self, config: RobertaPreLayerNormConfig, **kwargs): units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -481,14 +415,6 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.intermediate_size) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->RobertaPreLayerNorm class TFRobertaPreLayerNormLayer(tf.keras.layers.Layer): @@ -576,20 +502,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "attention", None) is not None: - with tf.name_scope(self.attention.name): - self.attention.build(None) - if getattr(self, "intermediate", None) is not None: - with 
tf.name_scope(self.intermediate.name): - self.intermediate.build(None) - if getattr(self, "bert_output", None) is not None: - with tf.name_scope(self.bert_output.name): - self.bert_output.build(None) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->RobertaPreLayerNorm class TFRobertaPreLayerNormEncoder(tf.keras.layers.Layer): @@ -660,15 +572,6 @@ def call( cross_attentions=all_cross_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "layer", None) is not None: - for layer in self.layer: - with tf.name_scope(layer.name): - layer.build(None) - @keras_serializable class TFRobertaPreLayerNormMainLayer(tf.keras.layers.Layer): @@ -862,23 +765,6 @@ def call( cross_attentions=encoder_outputs.cross_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - if getattr(self, "pooler", None) is not None: - with tf.name_scope(self.pooler.name): - self.pooler.build(None) - if getattr(self, "embeddings", None) is not None: - with tf.name_scope(self.embeddings.name): - self.embeddings.build(None) - # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaPreTrainedModel with Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm class TFRobertaPreLayerNormPreTrainedModel(TFPreTrainedModel): @@ -1062,14 +948,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "roberta_prelayernorm", None) is not None: - with tf.name_scope(self.roberta_prelayernorm.name): - self.roberta_prelayernorm.build(None) - # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaLMHead with Roberta->RobertaPreLayerNorm class TFRobertaPreLayerNormLMHead(tf.keras.layers.Layer): @@ -1090,18 +968,10 @@ def __init__(self, config, input_embeddings, **kwargs): # an output-only bias for each token. 
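The embeddings and LM-head hunks in these files keep a different kind of build(): one that creates the layer's own weights with add_weight() and then defers to super().build(input_shape), rather than reaching into sublayers. A minimal sketch of that shape, with an invented vocab size:

import tensorflow as tf


class ToyBiasHead(tf.keras.layers.Layer):
    def __init__(self, vocab_size=32, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size

    def build(self, input_shape):
        # The layer's own variable is created here ...
        self.bias = self.add_weight(
            shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias"
        )
        # ... and the base class just marks the layer as built, so this runs only once.
        super().build(input_shape)

    def call(self, hidden_states):
        return hidden_states + self.bias


head = ToyBiasHead(vocab_size=32)
logits = head(tf.zeros((2, 32)))  # the first call triggers build() and creates `bias`
assert head.weights[0].shape.as_list() == [32]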
self.decoder = input_embeddings - def build(self, input_shape=None): + def build(self, input_shape): self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "layer_norm", None) is not None: - with tf.name_scope(self.layer_norm.name): - self.layer_norm.build([None, None, self.config.hidden_size]) + super().build(input_shape) def get_output_embeddings(self): return self.decoder @@ -1215,17 +1085,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "roberta_prelayernorm", None) is not None: - with tf.name_scope(self.roberta_prelayernorm.name): - self.roberta_prelayernorm.build(None) - if getattr(self, "lm_head", None) is not None: - with tf.name_scope(self.lm_head.name): - self.lm_head.build(None) - # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForCausalLM with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm class TFRobertaPreLayerNormForCausalLM(TFRobertaPreLayerNormPreTrainedModel, TFCausalLanguageModelingLoss): @@ -1355,17 +1214,6 @@ def call( cross_attentions=outputs.cross_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "roberta_prelayernorm", None) is not None: - with tf.name_scope(self.roberta_prelayernorm.name): - self.roberta_prelayernorm.build(None) - if getattr(self, "lm_head", None) is not None: - with tf.name_scope(self.lm_head.name): - self.lm_head.build(None) - # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaClassificationHead with Roberta->RobertaPreLayerNorm class TFRobertaPreLayerNormClassificationHead(tf.keras.layers.Layer): @@ -1386,7 +1234,6 @@ def __init__(self, config, **kwargs): self.out_proj = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" ) - self.config = config def call(self, features, training=False): x = features[:, 0, :] # take token (equiv. 
to [CLS]) @@ -1396,17 +1243,6 @@ def call(self, features, training=False): x = self.out_proj(x) return x - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "out_proj", None) is not None: - with tf.name_scope(self.out_proj.name): - self.out_proj.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1486,17 +1322,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "roberta_prelayernorm", None) is not None: - with tf.name_scope(self.roberta_prelayernorm.name): - self.roberta_prelayernorm.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(None) - @add_start_docstrings( """ @@ -1519,7 +1344,6 @@ def __init__(self, config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward( @@ -1591,17 +1415,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "roberta_prelayernorm", None) is not None: - with tf.name_scope(self.roberta_prelayernorm.name): - self.roberta_prelayernorm.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1629,7 +1442,6 @@ def __init__(self, config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1687,17 +1499,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "roberta_prelayernorm", None) is not None: - with tf.name_scope(self.roberta_prelayernorm.name): - self.roberta_prelayernorm.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1720,7 +1521,6 @@ def __init__(self, config, *inputs, **kwargs): self.qa_outputs = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1791,14 +1591,3 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "roberta_prelayernorm", None) is not None: - with tf.name_scope(self.roberta_prelayernorm.name): - self.roberta_prelayernorm.build(None) - if getattr(self, "qa_outputs", None) is not None: - with tf.name_scope(self.qa_outputs.name): - self.qa_outputs.build(self.config.hidden_size) diff --git a/src/transformers/models/roformer/modeling_tf_roformer.py b/src/transformers/models/roformer/modeling_tf_roformer.py index 
f71a0c9df23628..cea286c828b4df 100644 --- a/src/transformers/models/roformer/modeling_tf_roformer.py +++ b/src/transformers/models/roformer/modeling_tf_roformer.py @@ -142,7 +142,7 @@ def __init__(self, config: RoFormerConfig, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def build(self, input_shape=None): + def build(self, input_shape: tf.TensorShape): with tf.name_scope("word_embeddings"): self.weight = self.add_weight( name="weight", @@ -157,12 +157,7 @@ def build(self, input_shape=None): initializer=get_initializer(self.initializer_range), ) - if self.built: - return - self.built = True - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.embedding_size]) + super().build(input_shape) def call( self, @@ -223,7 +218,6 @@ def __init__(self, config: RoFormerConfig, **kwargs): ) self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.rotary_value = config.rotary_value - self.config = config def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] @@ -313,20 +307,6 @@ def apply_rotary_position_embeddings(sinusoidal_pos, query_layer, key_layer, val return query_layer, key_layer, value_layer return query_layer, key_layer - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "query", None) is not None: - with tf.name_scope(self.query.name): - self.query.build(self.config.hidden_size) - if getattr(self, "key", None) is not None: - with tf.name_scope(self.key.name): - self.key.build(self.config.hidden_size) - if getattr(self, "value", None) is not None: - with tf.name_scope(self.value.name): - self.value.build(self.config.hidden_size) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->RoFormer class TFRoFormerSelfOutput(tf.keras.layers.Layer): @@ -338,7 +318,6 @@ def __init__(self, config: RoFormerConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -347,17 +326,6 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - class TFRoFormerAttention(tf.keras.layers.Layer): def __init__(self, config: RoFormerConfig, **kwargs): @@ -393,17 +361,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attention", None) is not None: - with tf.name_scope(self.self_attention.name): - self.self_attention.build(None) - if getattr(self, "dense_output", None) is not None: - with tf.name_scope(self.dense_output.name): - 
self.dense_output.build(None) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->RoFormer class TFRoFormerIntermediate(tf.keras.layers.Layer): @@ -418,7 +375,6 @@ def __init__(self, config: RoFormerConfig, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -426,14 +382,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->RoFormer class TFRoFormerOutput(tf.keras.layers.Layer): @@ -445,7 +393,6 @@ def __init__(self, config: RoFormerConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -454,17 +401,6 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.intermediate_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - class TFRoFormerLayer(tf.keras.layers.Layer): def __init__(self, config: RoFormerConfig, **kwargs): @@ -500,20 +436,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "attention", None) is not None: - with tf.name_scope(self.attention.name): - self.attention.build(None) - if getattr(self, "intermediate", None) is not None: - with tf.name_scope(self.intermediate.name): - self.intermediate.build(None) - if getattr(self, "roformer_output", None) is not None: - with tf.name_scope(self.roformer_output.name): - self.roformer_output.build(None) - class TFRoFormerEncoder(tf.keras.layers.Layer): def __init__(self, config: RoFormerConfig, **kwargs): @@ -569,18 +491,6 @@ def call( last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embed_positions", None) is not None: - with tf.name_scope(self.embed_positions.name): - self.embed_positions.build(None) - if getattr(self, "layer", None) is not None: - for layer in self.layer: - with tf.name_scope(layer.name): - layer.build(None) - class TFRoFormerPredictionHeadTransform(tf.keras.layers.Layer): def __init__(self, config: RoFormerConfig, **kwargs): @@ -598,7 +508,6 @@ def __init__(self, config: RoFormerConfig, **kwargs): self.transform_act_fn = config.hidden_act self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -607,17 
+516,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.embedding_size]) - class TFRoFormerLMPredictionHead(tf.keras.layers.Layer): def __init__(self, config: RoFormerConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): @@ -632,15 +530,10 @@ def __init__(self, config: RoFormerConfig, input_embeddings: tf.keras.layers.Lay # an output-only bias for each token. self.input_embeddings = input_embeddings - def build(self, input_shape=None): + def build(self, input_shape: tf.TensorShape): self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") - if self.built: - return - self.built = True - if getattr(self, "transform", None) is not None: - with tf.name_scope(self.transform.name): - self.transform.build(None) + super().build(input_shape) def get_output_embeddings(self) -> tf.keras.layers.Layer: return self.input_embeddings @@ -794,17 +687,6 @@ def call( attentions=encoder_outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embeddings", None) is not None: - with tf.name_scope(self.embeddings.name): - self.embeddings.build(None) - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - class TFRoFormerPreTrainedModel(TFPreTrainedModel): """ @@ -952,14 +834,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "roformer", None) is not None: - with tf.name_scope(self.roformer.name): - self.roformer.build(None) - @add_start_docstrings("""RoFormer Model with a `language modeling` head on top.""", ROFORMER_START_DOCSTRING) class TFRoFormerForMaskedLM(TFRoFormerPreTrainedModel, TFMaskedLanguageModelingLoss): @@ -1030,17 +904,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "roformer", None) is not None: - with tf.name_scope(self.roformer.name): - self.roformer.build(None) - if getattr(self, "mlm", None) is not None: - with tf.name_scope(self.mlm.name): - self.mlm.build(None) # TODO Matt might be wrong - @add_start_docstrings( """RoFormer Model with a `language modeling` head on top for CLM fine-tuning.""", ROFORMER_START_DOCSTRING @@ -1114,17 +977,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "roformer", None) is not None: - with tf.name_scope(self.roformer.name): - self.roformer.build(None) - if getattr(self, "mlm", None) is not None: - with tf.name_scope(self.mlm.name): - self.mlm.build(None) # TODO Matt might be wrong - class TFRoFormerClassificationHead(tf.keras.layers.Layer): """Head for sentence-level classification tasks.""" @@ -1144,7 +996,6 @@ def __init__(self, config: RoFormerConfig, *inputs, **kwargs): self.classifier_act_fn = get_tf_activation(config.hidden_act) else: self.classifier_act_fn = config.hidden_act - self.config = config def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = hidden_states[:, 
0, :] # take token (equiv. to [CLS]) @@ -1156,17 +1007,6 @@ def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "out_proj", None) is not None: - with tf.name_scope(self.out_proj.name): - self.out_proj.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1235,17 +1075,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "roformer", None) is not None: - with tf.name_scope(self.roformer.name): - self.roformer.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(None) - @add_start_docstrings( """ @@ -1263,7 +1092,6 @@ def __init__(self, config: RoFormerConfig, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward( @@ -1339,20 +1167,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "roformer", None) is not None: - with tf.name_scope(self.roformer.name): - self.roformer.build(None) - if getattr(self, "sequence_summary", None) is not None: - with tf.name_scope(self.sequence_summary.name): - self.sequence_summary.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1372,7 +1186,6 @@ def __init__(self, config: RoFormerConfig, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(ROFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1425,17 +1238,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "roformer", None) is not None: - with tf.name_scope(self.roformer.name): - self.roformer.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1454,7 +1256,6 @@ def __init__(self, config: RoFormerConfig, *inputs, **kwargs): self.qa_outputs = tf.keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(ROFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1520,14 +1321,3 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "roformer", None) is not None: - with tf.name_scope(self.roformer.name): - self.roformer.build(None) - if getattr(self, "qa_outputs", None) is not None: - with tf.name_scope(self.qa_outputs.name): - self.qa_outputs.build(self.config.hidden_size) diff --git a/src/transformers/models/sam/modeling_tf_sam.py 
b/src/transformers/models/sam/modeling_tf_sam.py index 8626edcdaec37a..a0a48b5aa7cdc7 100644 --- a/src/transformers/models/sam/modeling_tf_sam.py +++ b/src/transformers/models/sam/modeling_tf_sam.py @@ -150,14 +150,6 @@ def call(self, pixel_values): embeddings = self.projection(tf.transpose(pixel_values, perm=[0, 2, 3, 1])) return embeddings - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "projection", None) is not None: - with tf.name_scope(self.projection.name): - self.projection.build(self.num_channels) - class TFSamMLPBlock(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -165,7 +157,6 @@ def __init__(self, config, **kwargs): self.lin1 = tf.keras.layers.Dense(config.mlp_dim, name="lin1") self.lin2 = tf.keras.layers.Dense(config.hidden_size, name="lin2") self.act = ACT2FN[config.hidden_act] - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.lin1(hidden_states) @@ -173,17 +164,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.lin2(hidden_states) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "lin1", None) is not None: - with tf.name_scope(self.lin1.name): - self.lin1.build(self.config.hidden_size) - if getattr(self, "lin2", None) is not None: - with tf.name_scope(self.lin2.name): - self.lin2.build(self.config.mlp_dim) - class TFSamLayerNorm(tf.keras.layers.Layer): r"""LayerNorm that supports two data formats: channels_last (default) or channels_first. @@ -277,23 +257,6 @@ def call(self, query: tf.Tensor, key: tf.Tensor, value: tf.Tensor) -> tf.Tensor: return out - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "q_proj", None) is not None: - with tf.name_scope(self.q_proj.name): - self.q_proj.build(self.hidden_size) - if getattr(self, "k_proj", None) is not None: - with tf.name_scope(self.k_proj.name): - self.k_proj.build(self.hidden_size) - if getattr(self, "v_proj", None) is not None: - with tf.name_scope(self.v_proj.name): - self.v_proj.build(self.hidden_size) - if getattr(self, "out_proj", None) is not None: - with tf.name_scope(self.out_proj.name): - self.out_proj.build(self.internal_dim) - class TFSamTwoWayAttentionBlock(tf.keras.layers.Layer): def __init__(self, config, attention_downsample_rate: int = 2, skip_first_layer_pe: bool = False, **kwargs): @@ -382,35 +345,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attn", None) is not None: - with tf.name_scope(self.self_attn.name): - self.self_attn.build(None) - if getattr(self, "layer_norm1", None) is not None: - with tf.name_scope(self.layer_norm1.name): - self.layer_norm1.build([None, None, self.hidden_size]) - if getattr(self, "cross_attn_token_to_image", None) is not None: - with tf.name_scope(self.cross_attn_token_to_image.name): - self.cross_attn_token_to_image.build(None) - if getattr(self, "layer_norm2", None) is not None: - with tf.name_scope(self.layer_norm2.name): - self.layer_norm2.build([None, None, self.hidden_size]) - if getattr(self, "mlp", None) is not None: - with tf.name_scope(self.mlp.name): - self.mlp.build(None) - if getattr(self, "layer_norm3", None) is not None: - with tf.name_scope(self.layer_norm3.name): - self.layer_norm3.build([None, None, self.hidden_size]) - if getattr(self, "layer_norm4", None) is not None: - with 
tf.name_scope(self.layer_norm4.name): - self.layer_norm4.build([None, None, self.hidden_size]) - if getattr(self, "cross_attn_image_to_token", None) is not None: - with tf.name_scope(self.cross_attn_image_to_token.name): - self.cross_attn_image_to_token.build(None) - class TFSamTwoWayTransformer(tf.keras.layers.Layer): def __init__(self, config: SamMaskDecoderConfig, **kwargs): @@ -478,17 +412,6 @@ def call( queries = self.layer_norm_final_attn(queries) return queries, keys, all_attentions - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "final_attn_token_to_image", None) is not None: - with tf.name_scope(self.final_attn_token_to_image.name): - self.final_attn_token_to_image.build(None) - if getattr(self, "layer_norm_final_attn", None) is not None: - with tf.name_scope(self.layer_norm_final_attn.name): - self.layer_norm_final_attn.build([None, None, self.config.hidden_size]) - class TFSamFeedForward(tf.keras.layers.Layer): def __init__( @@ -504,8 +427,6 @@ def __init__( for i in range(num_layers - 2) ] self.sigmoid_output = sigmoid_output - self.hidden_dim = hidden_dim - self.input_dim = input_dim def call(self, hidden_states): hidden_states = self.proj_in(hidden_states) @@ -518,21 +439,6 @@ def call(self, hidden_states): hidden_states = tf.sigmoid(hidden_states) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "proj_in", None) is not None: - with tf.name_scope(self.proj_in.name): - self.proj_in.build(self.input_dim) - if getattr(self, "proj_out", None) is not None: - with tf.name_scope(self.proj_out.name): - self.proj_out.build(self.hidden_dim) - if getattr(self, "layers", None) is not None: - for layer in self.layers: - with tf.name_scope(layer.name): - layer.build(None) - class TFSamMaskDecoder(tf.keras.layers.Layer): def __init__(self, config: SamMaskDecoderConfig, **kwargs): @@ -577,30 +483,12 @@ def __init__(self, config: SamMaskDecoderConfig, **kwargs): name="iou_prediction_head", ) - def build(self, input_shape=None): + def build(self, input_shape): self.iou_token = self.add_weight(shape=(1, self.hidden_size), name="iou_token.weight", trainable=True) self.mask_tokens = self.add_weight( shape=(self.num_mask_tokens, self.hidden_size), name="mask_tokens.weight", trainable=True ) - - if self.built: - return - self.built = True - if getattr(self, "transformer", None) is not None: - with tf.name_scope(self.transformer.name): - self.transformer.build(None) - if getattr(self, "upscale_conv1", None) is not None: - with tf.name_scope(self.upscale_conv1.name): - self.upscale_conv1.build(self.hidden_size) - if getattr(self, "upscale_conv2", None) is not None: - with tf.name_scope(self.upscale_conv2.name): - self.upscale_conv2.build(self.hidden_size // 4) - if getattr(self, "upscale_layer_norm", None) is not None: - with tf.name_scope(self.upscale_layer_norm.name): - self.upscale_layer_norm.build(None) - if getattr(self, "iou_prediction_head", None) is not None: - with tf.name_scope(self.iou_prediction_head.name): - self.iou_prediction_head.build(None) + super().build(input_shape) def call( self, @@ -727,7 +615,6 @@ def __init__(self, config: SamPromptEncoderConfig, **kwargs): self.conv3 = tf.keras.layers.Conv2D(config.hidden_size, kernel_size=1, name="conv3") self.layer_norm1 = TFSamLayerNorm(self.mask_input_channels, config.layer_norm_eps, name="layer_norm1") self.layer_norm2 = TFSamLayerNorm(self.mask_input_channels * 4, config.layer_norm_eps, name="layer_norm2") - 
self.config = config def call(self, masks): masks = tf.transpose(masks, perm=(0, 2, 3, 1)) # Convert to channels-last @@ -742,7 +629,7 @@ def call(self, masks): dense_embeddings = tf.transpose(dense_embeddings, perm=(0, 3, 1, 2)) # Convert back to channels-first return dense_embeddings - def build(self, input_shape=None): + def build(self, input_shape): # This class needs an explicit build method because it isn't called with the standard dummy inputs conv1_shape = [None, None, None, 1] conv2_shape = [None, None, None, self.mask_input_channels] @@ -759,25 +646,7 @@ def build(self, input_shape=None): self.layer_norm1.build(layer_norm1_shape) with tf.name_scope("layer_norm2"): self.layer_norm2.build(layer_norm2_shape) - - if self.built: - return - self.built = True - if getattr(self, "conv1", None) is not None: - with tf.name_scope(self.conv1.name): - self.conv1.build(1) - if getattr(self, "conv2", None) is not None: - with tf.name_scope(self.conv2.name): - self.conv2.build(self.mask_input_channels) - if getattr(self, "conv3", None) is not None: - with tf.name_scope(self.conv3.name): - self.conv3.build(self.config.mask_input_channels) - if getattr(self, "layer_norm1", None) is not None: - with tf.name_scope(self.layer_norm1.name): - self.layer_norm1.build(None) - if getattr(self, "layer_norm2", None) is not None: - with tf.name_scope(self.layer_norm2.name): - self.layer_norm2.build(None) + super().build(input_shape) class TFSamPromptEncoder(tf.keras.layers.Layer): @@ -795,7 +664,7 @@ def __init__(self, config: SamPromptEncoderConfig, shared_patch_embedding, **kwa self.not_a_point_embed = None self.config = config - def build(self, input_shape=None): + def build(self, input_shape): self.no_mask_embed = self.add_weight( name="no_mask_embed.weight", shape=(1, self.hidden_size), @@ -822,13 +691,7 @@ def build(self, input_shape=None): self.mask_embed.build( (None, self.config.mask_input_channels, self.config.image_size, self.config.image_size) ) - - if self.built: - return - self.built = True - if getattr(self, "mask_embed", None) is not None: - with tf.name_scope(self.mask_embed.name): - self.mask_embed.build(None) + super().build(input_shape) def _embed_points(self, points: tf.Tensor, labels: tf.Tensor, pad: bool) -> tf.Tensor: """Embeds point prompts.""" @@ -949,7 +812,7 @@ def __init__(self, config, window_size, **kwargs): raise ValueError("Input size must be provided if using relative positional encoding.") self.config = config - def build(self, input_shape=None): + def build(self, input_shape): if self.input_size is not None: # initialize relative positional embeddings self.rel_pos_h = self.add_weight( @@ -958,16 +821,7 @@ def build(self, input_shape=None): self.rel_pos_w = self.add_weight( shape=(2 * self.input_size[1] - 1, self.head_dim), initializer="zeros", name="rel_pos_w" ) - - if self.built: - return - self.built = True - if getattr(self, "qkv", None) is not None: - with tf.name_scope(self.qkv.name): - self.qkv.build(self.config.hidden_size) - if getattr(self, "proj", None) is not None: - with tf.name_scope(self.proj.name): - self.proj.build(self.config.hidden_size) + super().build(input_shape) def get_rel_pos(self, q_size: int, k_size: int, rel_pos: tf.Tensor) -> tf.Tensor: """ @@ -1095,7 +949,6 @@ def __init__(self, config, window_size, **kwargs): self.layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2") self.mlp = TFSamMLPBlock(config, name="mlp") self.window_size = window_size - self.config = config def window_partition(self, 
hidden_states: tf.Tensor, window_size: int) -> Tuple[tf.Tensor, Tuple[int, int]]: batch_size, height, width, channel = shape_list(hidden_states) @@ -1163,23 +1016,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "layer_norm1", None) is not None: - with tf.name_scope(self.layer_norm1.name): - self.layer_norm1.build([None, None, self.config.hidden_size]) - if getattr(self, "attn", None) is not None: - with tf.name_scope(self.attn.name): - self.attn.build(None) - if getattr(self, "layer_norm2", None) is not None: - with tf.name_scope(self.layer_norm2.name): - self.layer_norm2.build([None, None, self.config.hidden_size]) - if getattr(self, "mlp", None) is not None: - with tf.name_scope(self.mlp.name): - self.mlp.build(None) - class TFSamVisionNeck(tf.keras.layers.Layer): def __init__(self, config: SamVisionConfig, **kwargs): @@ -1211,23 +1047,6 @@ def call(self, hidden_states): hidden_states = tf.transpose(hidden_states, perm=[0, 3, 1, 2]) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "conv1", None) is not None: - with tf.name_scope(self.conv1.name): - self.conv1.build(self.config.hidden_size) - if getattr(self, "layer_norm1", None) is not None: - with tf.name_scope(self.layer_norm1.name): - self.layer_norm1.build(None) - if getattr(self, "conv2", None) is not None: - with tf.name_scope(self.conv2.name): - self.conv2.build(self.config.output_channels) - if getattr(self, "layer_norm2", None) is not None: - with tf.name_scope(self.layer_norm2.name): - self.layer_norm2.build(None) - class TFSamVisionEncoder(tf.keras.layers.Layer): def __init__(self, config: SamVisionConfig, **kwargs): @@ -1250,7 +1069,7 @@ def __init__(self, config: SamVisionConfig, **kwargs): self.neck = TFSamVisionNeck(config, name="neck") - def build(self, input_shape=None): + def build(self, input_shape): if self.config.use_abs_pos: # Initialize absolute positional embedding with pretrain image size. 
self.pos_embed = self.add_weight( @@ -1264,16 +1083,7 @@ def build(self, input_shape=None): trainable=True, name="pos_embed", ) - - if self.built: - return - self.built = True - if getattr(self, "patch_embed", None) is not None: - with tf.name_scope(self.patch_embed.name): - self.patch_embed.build(None) - if getattr(self, "neck", None) is not None: - with tf.name_scope(self.neck.name): - self.neck.build(None) + super().build(input_shape) def get_input_embeddings(self): return self.patch_embed @@ -1653,20 +1463,3 @@ def serving_output(self, output: TFSamImageSegmentationOutput) -> TFSamImageSegm vision_attentions=attns if self.config.output_attentions else None, mask_decoder_attentions=output.mask_decoder_attentions if self.config.output_attentions else None, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "shared_image_embedding", None) is not None: - with tf.name_scope(self.shared_image_embedding.name): - self.shared_image_embedding.build(None) - if getattr(self, "vision_encoder", None) is not None: - with tf.name_scope(self.vision_encoder.name): - self.vision_encoder.build(None) - if getattr(self, "prompt_encoder", None) is not None: - with tf.name_scope(self.prompt_encoder.name): - self.prompt_encoder.build(None) - if getattr(self, "mask_decoder", None) is not None: - with tf.name_scope(self.mask_decoder.name): - self.mask_decoder.build(None) diff --git a/src/transformers/models/segformer/modeling_tf_segformer.py b/src/transformers/models/segformer/modeling_tf_segformer.py index cda04240dba58e..86eb3249382996 100644 --- a/src/transformers/models/segformer/modeling_tf_segformer.py +++ b/src/transformers/models/segformer/modeling_tf_segformer.py @@ -88,7 +88,6 @@ def __init__(self, patch_size, stride, num_channels, hidden_size, **kwargs): self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm") self.num_channels = num_channels - self.hidden_size = hidden_size def call(self, pixel_values: tf.Tensor) -> Tuple[tf.Tensor, int, int]: embeddings = self.proj(self.padding(pixel_values)) @@ -101,17 +100,6 @@ def call(self, pixel_values: tf.Tensor) -> Tuple[tf.Tensor, int, int]: embeddings = self.layer_norm(embeddings) return embeddings, height, width - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "proj", None) is not None: - with tf.name_scope(self.proj.name): - self.proj.build(self.num_channels) - if getattr(self, "layer_norm", None) is not None: - with tf.name_scope(self.layer_norm.name): - self.layer_norm.build([None, None, self.hidden_size]) - class TFSegformerEfficientSelfAttention(tf.keras.layers.Layer): """SegFormer's efficient self-attention mechanism. 
Employs the sequence reduction process introduced in the [PvT @@ -209,41 +197,18 @@ def call( outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "query", None) is not None: - with tf.name_scope(self.query.name): - self.query.build(self.hidden_size) - if getattr(self, "key", None) is not None: - with tf.name_scope(self.key.name): - self.key.build(self.hidden_size) - if getattr(self, "value", None) is not None: - with tf.name_scope(self.value.name): - self.value.build(self.hidden_size) - class TFSegformerSelfOutput(tf.keras.layers.Layer): def __init__(self, config: SegformerConfig, hidden_size: int, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.Dense(hidden_size, name="dense") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.hidden_size = hidden_size def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states, training=training) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.hidden_size) - class TFSegformerAttention(tf.keras.layers.Layer): def __init__( @@ -273,17 +238,6 @@ def call( outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self", None) is not None: - with tf.name_scope(self.self.name): - self.self.build(None) - if getattr(self, "dense_output", None) is not None: - with tf.name_scope(self.dense_output.name): - self.dense_output.build(None) - class TFSegformerDWConv(tf.keras.layers.Layer): def __init__(self, dim: int = 768, **kwargs): @@ -291,7 +245,6 @@ def __init__(self, dim: int = 768, **kwargs): self.depthwise_convolution = tf.keras.layers.Conv2D( filters=dim, kernel_size=3, strides=1, padding="same", groups=dim, name="dwconv" ) - self.dim = dim def call(self, hidden_states: tf.Tensor, height: int, width: int) -> tf.Tensor: batch_size = shape_list(hidden_states)[0] @@ -305,14 +258,6 @@ def call(self, hidden_states: tf.Tensor, height: int, width: int) -> tf.Tensor: hidden_states = tf.reshape(hidden_states, (batch_size, new_height * new_width, num_channels)) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "depthwise_convolution", None) is not None: - with tf.name_scope(self.depthwise_convolution.name): - self.depthwise_convolution.build(self.dim) - class TFSegformerMixFFN(tf.keras.layers.Layer): def __init__( @@ -333,8 +278,6 @@ def __init__( self.intermediate_act_fn = config.hidden_act self.dense2 = tf.keras.layers.Dense(out_features, name="dense2") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.hidden_features = hidden_features - self.in_features = in_features def call(self, hidden_states: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor: hidden_states = self.dense1(hidden_states) @@ -345,20 +288,6 @@ def call(self, hidden_states: tf.Tensor, height: int, width: int, training: bool hidden_states = self.dropout(hidden_states, training=training) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - 
if getattr(self, "dense1", None) is not None: - with tf.name_scope(self.dense1.name): - self.dense1.build(self.in_features) - if getattr(self, "depthwise_convolution", None) is not None: - with tf.name_scope(self.depthwise_convolution.name): - self.depthwise_convolution.build(None) - if getattr(self, "dense2", None) is not None: - with tf.name_scope(self.dense2.name): - self.dense2.build(self.hidden_features) - class TFSegformerLayer(tf.keras.layers.Layer): """This corresponds to the Block class in the original implementation.""" @@ -386,7 +315,6 @@ def __init__( self.layer_norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm_2") mlp_hidden_size = int(hidden_size * mlp_ratio) self.mlp = TFSegformerMixFFN(config, in_features=hidden_size, hidden_features=mlp_hidden_size, name="mlp") - self.hidden_size = hidden_size def call( self, @@ -420,23 +348,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "layer_norm_1", None) is not None: - with tf.name_scope(self.layer_norm_1.name): - self.layer_norm_1.build([None, None, self.hidden_size]) - if getattr(self, "attention", None) is not None: - with tf.name_scope(self.attention.name): - self.attention.build(None) - if getattr(self, "layer_norm_2", None) is not None: - with tf.name_scope(self.layer_norm_2.name): - self.layer_norm_2.build([None, None, self.hidden_size]) - if getattr(self, "mlp", None) is not None: - with tf.name_scope(self.mlp.name): - self.mlp.build(None) - class TFSegformerEncoder(tf.keras.layers.Layer): def __init__(self, config: SegformerConfig, **kwargs): @@ -540,15 +451,6 @@ def call( last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attentions ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "layer_norms", None) is not None: - for layer in self.layer_norms: - with tf.name_scope(layer.name): - layer.build(None) - @keras_serializable class TFSegformerMainLayer(tf.keras.layers.Layer): @@ -609,14 +511,6 @@ def call( attentions=encoder_outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - class TFSegformerPreTrainedModel(TFPreTrainedModel): """ @@ -713,14 +607,6 @@ def call( ) return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "segformer", None) is not None: - with tf.name_scope(self.segformer.name): - self.segformer.build(None) - @add_start_docstrings( """ @@ -738,7 +624,6 @@ def __init__(self, config: SegformerConfig, *inputs, **kwargs): # Classifier head self.classifier = tf.keras.layers.Dense(config.num_labels, name="classifier") - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -785,17 +670,6 @@ def call( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "segformer", None) is not None: - with tf.name_scope(self.segformer.name): - self.segformer.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_sizes[-1]) - class TFSegformerMLP(tf.keras.layers.Layer): """ @@ -815,14 
+689,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.proj(hidden_states) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "proj", None) is not None: - with tf.name_scope(self.proj.name): - self.proj.build(self.input_dim) - class TFSegformerDecodeHead(TFSegformerPreTrainedModel): def __init__(self, config: SegformerConfig, **kwargs): @@ -878,20 +744,6 @@ def call(self, encoder_hidden_states: tf.Tensor, training: bool = False) -> tf.T return logits - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "linear_fuse", None) is not None: - with tf.name_scope(self.linear_fuse.name): - self.linear_fuse.build(self.config.decoder_hidden_size * self.config.num_encoder_blocks) - if getattr(self, "batch_norm", None) is not None: - with tf.name_scope(self.batch_norm.name): - self.batch_norm.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.decoder_hidden_size) - @add_start_docstrings( """SegFormer Model transformer with an all-MLP decode head on top e.g. for ADE20k, CityScapes.""", @@ -1002,14 +854,3 @@ def call( hidden_states=outputs.hidden_states if output_hidden_states else None, attentions=outputs.attentions, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "segformer", None) is not None: - with tf.name_scope(self.segformer.name): - self.segformer.build(None) - if getattr(self, "decode_head", None) is not None: - with tf.name_scope(self.decode_head.name): - self.decode_head.build(None) diff --git a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py index d6566e78d62773..026d2241b461ea 100755 --- a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py @@ -166,15 +166,6 @@ def call(self, input_features: tf.Tensor) -> tf.Tensor: hidden_states = glu(hidden_states, axis=2) # GLU over the Channel dimension return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "conv_layers", None) is not None: - for layer in self.conv_layers: - with tf.name_scope(layer.name): - layer.build(None) - class TFSpeech2TextSinusoidalPositionalEmbedding(tf.keras.layers.Layer): """This module produces sinusoidal positional embeddings of any length.""" @@ -388,23 +379,6 @@ def call( return attn_output, attn_weights, past_key_value - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "k_proj", None) is not None: - with tf.name_scope(self.k_proj.name): - self.k_proj.build(self.embed_dim) - if getattr(self, "q_proj", None) is not None: - with tf.name_scope(self.q_proj.name): - self.q_proj.build(self.embed_dim) - if getattr(self, "v_proj", None) is not None: - with tf.name_scope(self.v_proj.name): - self.v_proj.build(self.embed_dim) - if getattr(self, "out_proj", None) is not None: - with tf.name_scope(self.out_proj.name): - self.out_proj.build(self.embed_dim) - class TFSpeech2TextEncoderLayer(tf.keras.layers.Layer): def __init__(self, config: Speech2TextConfig, **kwargs): @@ -420,7 +394,6 @@ def __init__(self, config: Speech2TextConfig, **kwargs): self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") self.fc2 = 
tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") - self.config = config def call( self, hidden_states: tf.Tensor, attention_mask: tf.Tensor, layer_head_mask: tf.Tensor, training: bool = False @@ -461,26 +434,6 @@ def call( return hidden_states, self_attn_weights - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attn", None) is not None: - with tf.name_scope(self.self_attn.name): - self.self_attn.build(None) - if getattr(self, "self_attn_layer_norm", None) is not None: - with tf.name_scope(self.self_attn_layer_norm.name): - self.self_attn_layer_norm.build([None, None, self.embed_dim]) - if getattr(self, "fc1", None) is not None: - with tf.name_scope(self.fc1.name): - self.fc1.build(self.embed_dim) - if getattr(self, "fc2", None) is not None: - with tf.name_scope(self.fc2.name): - self.fc2.build(self.config.encoder_ffn_dim) - if getattr(self, "final_layer_norm", None) is not None: - with tf.name_scope(self.final_layer_norm.name): - self.final_layer_norm.build([None, None, self.embed_dim]) - class TFSpeech2TextDecoderLayer(tf.keras.layers.Layer): def __init__(self, config: Speech2TextConfig, **kwargs): @@ -510,7 +463,6 @@ def __init__(self, config: Speech2TextConfig, **kwargs): self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") - self.config = config def call( self, @@ -594,32 +546,6 @@ def call( present_key_value, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attn", None) is not None: - with tf.name_scope(self.self_attn.name): - self.self_attn.build(None) - if getattr(self, "self_attn_layer_norm", None) is not None: - with tf.name_scope(self.self_attn_layer_norm.name): - self.self_attn_layer_norm.build([None, None, self.embed_dim]) - if getattr(self, "encoder_attn", None) is not None: - with tf.name_scope(self.encoder_attn.name): - self.encoder_attn.build(None) - if getattr(self, "encoder_attn_layer_norm", None) is not None: - with tf.name_scope(self.encoder_attn_layer_norm.name): - self.encoder_attn_layer_norm.build([None, None, self.embed_dim]) - if getattr(self, "fc1", None) is not None: - with tf.name_scope(self.fc1.name): - self.fc1.build(self.embed_dim) - if getattr(self, "fc2", None) is not None: - with tf.name_scope(self.fc2.name): - self.fc2.build(self.config.decoder_ffn_dim) - if getattr(self, "final_layer_norm", None) is not None: - with tf.name_scope(self.final_layer_norm.name): - self.final_layer_norm.build([None, None, self.embed_dim]) - class TFSpeech2TextPreTrainedModel(TFPreTrainedModel): config_class = Speech2TextConfig @@ -944,24 +870,6 @@ def call( last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "conv", None) is not None: - with tf.name_scope(self.conv.name): - self.conv.build(None) - if getattr(self, "embed_positions", None) is not None: - with tf.name_scope(self.embed_positions.name): - self.embed_positions.build(None) - if getattr(self, "layer_norm", None) is not None: - with tf.name_scope(self.layer_norm.name): - self.layer_norm.build([None, None, self.config.d_model]) - if getattr(self, "layers", None) is not None: - for layer in 
self.layers: - with tf.name_scope(layer.name): - layer.build(None) - @keras_serializable class TFSpeech2TextDecoder(tf.keras.layers.Layer): @@ -1184,24 +1092,6 @@ def call( cross_attentions=all_cross_attns, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embed_tokens", None) is not None: - with tf.name_scope(self.embed_tokens.name): - self.embed_tokens.build(None) - if getattr(self, "embed_positions", None) is not None: - with tf.name_scope(self.embed_positions.name): - self.embed_positions.build(None) - if getattr(self, "layer_norm", None) is not None: - with tf.name_scope(self.layer_norm.name): - self.layer_norm.build([None, None, self.config.d_model]) - if getattr(self, "layers", None) is not None: - for layer in self.layers: - with tf.name_scope(layer.name): - layer.build(None) - @keras_serializable class TFSpeech2TextMainLayer(tf.keras.layers.Layer): @@ -1307,17 +1197,6 @@ def call( encoder_attentions=encoder_outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - if getattr(self, "decoder", None) is not None: - with tf.name_scope(self.decoder.name): - self.decoder.build(None) - @add_start_docstrings( "The bare Speech2Text Model outputting raw hidden-states without any specific head on top.", @@ -1400,14 +1279,6 @@ def serving_output(self, output): encoder_attentions=enc_attns, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "model", None) is not None: - with tf.name_scope(self.model.name): - self.model.build(None) - @add_start_docstrings( "The Speech2Text Model with a language modeling head. 
Can be used for summarization.", @@ -1420,7 +1291,6 @@ def __init__(self, config: Speech2TextConfig): self.lm_head = tf.keras.layers.Dense(self.config.vocab_size, use_bias=False, name="lm_head") # TODO (Joao): investigate why Speech2Text has numerical issues in XLA generate self.supports_xla_generation = False - self.config = config def get_encoder(self): return self.model.encoder @@ -1590,14 +1460,3 @@ def prepare_inputs_for_generation( "cross_attn_head_mask": cross_attn_head_mask, "use_cache": use_cache, # change this to avoid caching (presumably for debugging) } - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "model", None) is not None: - with tf.name_scope(self.model.name): - self.model.build(None) - if getattr(self, "lm_head", None) is not None: - with tf.name_scope(self.lm_head.name): - self.lm_head.build(self.config.d_model) diff --git a/src/transformers/models/swin/modeling_tf_swin.py b/src/transformers/models/swin/modeling_tf_swin.py index 52eb0614785005..5d53561442457f 100644 --- a/src/transformers/models/swin/modeling_tf_swin.py +++ b/src/transformers/models/swin/modeling_tf_swin.py @@ -283,7 +283,6 @@ def __init__(self, config: SwinConfig, use_mask_token: bool = False, **kwargs) - self.norm = tf.keras.layers.LayerNormalization(name="norm", epsilon=1e-5) self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob, name="dropout") - self.config = config def build(self, input_shape: tf.TensorShape) -> None: if self.use_mask_token: @@ -297,19 +296,7 @@ def build(self, input_shape: tf.TensorShape) -> None: ) else: self.position_embeddings = None - - if self.built: - return - self.built = True - if getattr(self, "patch_embeddings", None) is not None: - with tf.name_scope(self.patch_embeddings.name): - self.patch_embeddings.build(None) - if getattr(self, "norm", None) is not None: - with tf.name_scope(self.norm.name): - self.norm.build([None, None, self.config.embed_dim]) - if getattr(self, "dropout", None) is not None: - with tf.name_scope(self.dropout.name): - self.dropout.build(None) + super().build(input_shape) def call( self, pixel_values: tf.Tensor, bool_masked_pos: bool = None, training: bool = False @@ -394,14 +381,6 @@ def call(self, pixel_values: tf.Tensor, training: bool = False) -> Tuple[tf.Tens embeddings = tf.transpose(embeddings, (0, 2, 1)) return embeddings, output_dimensions - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "projection", None) is not None: - with tf.name_scope(self.projection.name): - self.projection.build(self.num_channels) - class TFSwinPatchMerging(tf.keras.layers.Layer): """ @@ -464,14 +443,6 @@ def call(self, input_feature: tf.Tensor, input_dimensions: Tuple[int, int], trai return input_feature - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "reduction", None) is not None: - with tf.name_scope(self.reduction.name): - self.reduction.build(4 * self.dim) - class TFSwinDropPath(tf.keras.layers.Layer): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" @@ -550,19 +521,7 @@ def build(self, input_shape: tf.TensorShape) -> None: relative_coords = tf.stack([stack_0, stack_1], axis=2) self.relative_position_index.assign(tf.cast(tf.reduce_sum(relative_coords, axis=-1), tf.int32)) - - if self.built: - return - self.built = True - if getattr(self, "query", None) is not None: - with tf.name_scope(self.query.name): - self.query.build(self.all_head_size) 
- if getattr(self, "key", None) is not None: - with tf.name_scope(self.key.name): - self.key.build(self.all_head_size) - if getattr(self, "value", None) is not None: - with tf.name_scope(self.value.name): - self.value.build(self.all_head_size) + super().build(input_shape) def transpose_for_scores(self, x: tf.Tensor) -> tf.Tensor: new_x_shape = shape_list(x)[:-1] + [self.num_attention_heads, self.attention_head_size] @@ -638,24 +597,12 @@ def __init__(self, config: SwinConfig, dim: int, **kwargs) -> None: super().__init__(**kwargs) self.dense = tf.keras.layers.Dense(dim, name="dense") self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob, name="dropout") - self.dim = dim def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states, training=training) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.dim) - if getattr(self, "dropout", None) is not None: - with tf.name_scope(self.dropout.name): - self.dropout.build(None) - class TFSwinAttention(tf.keras.layers.Layer): def __init__(self, config: SwinConfig, dim: int, num_heads: int, **kwargs) -> None: @@ -684,17 +631,6 @@ def call( outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self", None) is not None: - with tf.name_scope(self.self.name): - self.self.build(None) - if getattr(self, "self_output", None) is not None: - with tf.name_scope(self.self_output.name): - self.self_output.build(None) - class TFSwinIntermediate(tf.keras.layers.Layer): def __init__(self, config: SwinConfig, dim: int, **kwargs) -> None: @@ -704,43 +640,24 @@ def __init__(self, config: SwinConfig, dim: int, **kwargs) -> None: self.intermediate_act_fn = ACT2FN[config.hidden_act] else: self.intermediate_act_fn = config.hidden_act - self.dim = dim def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.dim) - class TFSwinOutput(tf.keras.layers.Layer): def __init__(self, config: SwinConfig, dim: int, **kwargs) -> None: super().__init__(**kwargs) self.dense = tf.keras.layers.Dense(dim, name="dense") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob, "dropout") - self.config = config - self.dim = dim def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states, training=training) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(int(self.config.mlp_ratio * self.dim)) - class TFSwinLayer(tf.keras.layers.Layer): def __init__( @@ -767,7 +684,6 @@ def __init__( ) self.intermediate = TFSwinIntermediate(config, dim, name="intermediate") self.swin_output = TFSwinOutput(config, dim, name="output") - self.dim = dim def get_attn_mask(self, height: int, 
width: int, window_size: int, shift_size: int) -> tf.Tensor | None: img_mask = tf.zeros((height, width)) @@ -873,29 +789,6 @@ def call( layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,) return layer_outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "layernorm_before", None) is not None: - with tf.name_scope(self.layernorm_before.name): - self.layernorm_before.build([None, None, self.dim]) - if getattr(self, "attention", None) is not None: - with tf.name_scope(self.attention.name): - self.attention.build(None) - if getattr(self, "drop_path", None) is not None: - with tf.name_scope(self.drop_path.name): - self.drop_path.build(None) - if getattr(self, "layernorm_after", None) is not None: - with tf.name_scope(self.layernorm_after.name): - self.layernorm_after.build([None, None, self.dim]) - if getattr(self, "intermediate", None) is not None: - with tf.name_scope(self.intermediate.name): - self.intermediate.build(None) - if getattr(self, "swin_output", None) is not None: - with tf.name_scope(self.swin_output.name): - self.swin_output.build(None) - class TFSwinStage(tf.keras.layers.Layer): def __init__( @@ -968,15 +861,6 @@ def call( stage_outputs += layer_outputs[1:] return stage_outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "blocks", None) is not None: - for layer in self.blocks: - with tf.name_scope(layer.name): - layer.build(None) - class TFSwinEncoder(tf.keras.layers.Layer): def __init__(self, config: SwinConfig, grid_size: Tuple[int, int], **kwargs): @@ -1057,15 +941,6 @@ def call( reshaped_hidden_states=all_reshaped_hidden_states, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "layers", None) is not None: - for layer in self.layers: - with tf.name_scope(layer.name): - layer.build(None) - class TFSwinPreTrainedModel(TFPreTrainedModel): """ @@ -1285,20 +1160,6 @@ def call( reshaped_hidden_states=encoder_outputs.reshaped_hidden_states, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embeddings", None) is not None: - with tf.name_scope(self.embeddings.name): - self.embeddings.build(None) - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - if getattr(self, "layernorm", None) is not None: - with tf.name_scope(self.layernorm.name): - self.layernorm.build([None, None, self.num_features]) - @add_start_docstrings( "The bare Swin Model transformer outputting raw hidden-states without any specific head on top.", @@ -1356,14 +1217,6 @@ def call( return swin_outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "swin", None) is not None: - with tf.name_scope(self.swin.name): - self.swin.build(None) - class TFSwinPixelShuffle(tf.keras.layers.Layer): """TF layer implementation of torch.nn.PixelShuffle""" @@ -1519,17 +1372,6 @@ def call( reshaped_hidden_states=outputs.reshaped_hidden_states, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "swin", None) is not None: - with tf.name_scope(self.swin.name): - self.swin.build(None) - if getattr(self, "decoder", None) is not None: - with tf.name_scope(self.decoder.name): - self.decoder.build(None) - @add_start_docstrings( """ @@ -1604,14 +1446,3 @@ def call( attentions=outputs.attentions, 
reshaped_hidden_states=outputs.reshaped_hidden_states, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "swin", None) is not None: - with tf.name_scope(self.swin.name): - self.swin.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(None) diff --git a/src/transformers/models/t5/modeling_tf_t5.py b/src/transformers/models/t5/modeling_tf_t5.py index 3fef06dd631f64..26eb7b9b6a2929 100644 --- a/src/transformers/models/t5/modeling_tf_t5.py +++ b/src/transformers/models/t5/modeling_tf_t5.py @@ -111,7 +111,6 @@ def __init__(self, config, **kwargs): ) # Update init weights as in flax self.dropout = tf.keras.layers.Dropout(config.dropout_rate) self.act = get_tf_activation(config.dense_act_fn) - self.config = config def call(self, hidden_states, training=False): hidden_states = self.wi(hidden_states) @@ -120,17 +119,6 @@ def call(self, hidden_states, training=False): hidden_states = self.wo(hidden_states) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "wi", None) is not None: - with tf.name_scope(self.wi.name): - self.wi.build(self.config.d_model) - if getattr(self, "wo", None) is not None: - with tf.name_scope(self.wo.name): - self.wo.build(self.config.d_ff) - class TFT5DenseGatedActDense(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -152,7 +140,6 @@ def __init__(self, config, **kwargs): ) # Update init weights as in flax self.dropout = tf.keras.layers.Dropout(config.dropout_rate) self.act = get_tf_activation(config.dense_act_fn) - self.config = config def call(self, hidden_states, training=False): hidden_gelu = self.act(self.wi_0(hidden_states)) @@ -162,20 +149,6 @@ def call(self, hidden_states, training=False): hidden_states = self.wo(hidden_states) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "wi_0", None) is not None: - with tf.name_scope(self.wi_0.name): - self.wi_0.build(self.config.d_model) - if getattr(self, "wi_1", None) is not None: - with tf.name_scope(self.wi_1.name): - self.wi_1.build(self.config.d_model) - if getattr(self, "wo", None) is not None: - with tf.name_scope(self.wo.name): - self.wo.build(self.config.d_ff) - class TFT5LayerFF(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -194,14 +167,6 @@ def call(self, hidden_states, training=False): hidden_states = hidden_states + self.dropout(dense_output, training=training) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "layer_norm", None) is not None: - with tf.name_scope(self.layer_norm.name): - self.layer_norm.build(None) - class TFT5Attention(tf.keras.layers.Layer): NEW_ID = itertools.count() @@ -254,7 +219,7 @@ def __init__(self, config, has_relative_attention_bias=False, **kwargs): self.pruned_heads = set() - def build(self, input_shape=None): + def build(self, input_shape): if self.has_relative_attention_bias: with tf.name_scope("relative_attention_bias"): self.relative_attention_bias = self.add_weight( @@ -263,22 +228,7 @@ def build(self, input_shape=None): initializer=self.relative_attention_bias_initializer, # Add initializer ) - return - if self.built: - return - self.built = True - if getattr(self, "q", None) is not None: - with tf.name_scope(self.q.name): - self.q.build(self.d_model) - if getattr(self, "k", None) is not None: - 
with tf.name_scope(self.k.name): - self.k.build(self.d_model) - if getattr(self, "v", None) is not None: - with tf.name_scope(self.v.name): - self.v.build(self.d_model) - if getattr(self, "o", None) is not None: - with tf.name_scope(self.o.name): - self.o.build(self.inner_dim) + return super().build(input_shape) def prune_heads(self, heads): raise NotImplementedError @@ -519,17 +469,6 @@ def call( outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "SelfAttention", None) is not None: - with tf.name_scope(self.SelfAttention.name): - self.SelfAttention.build(None) - if getattr(self, "layer_norm", None) is not None: - with tf.name_scope(self.layer_norm.name): - self.layer_norm.build(None) - class TFT5LayerCrossAttention(tf.keras.layers.Layer): def __init__(self, config, **kwargs): @@ -572,17 +511,6 @@ def call( outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "EncDecAttention", None) is not None: - with tf.name_scope(self.EncDecAttention.name): - self.EncDecAttention.build(None) - if getattr(self, "layer_norm", None) is not None: - with tf.name_scope(self.layer_norm.name): - self.layer_norm.build(None) - class TFT5Block(tf.keras.layers.Layer): def __init__(self, config, has_relative_attention_bias=False, **kwargs): @@ -921,18 +849,6 @@ def call( attentions=all_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "block", None) is not None: - for layer in self.block: - with tf.name_scope(layer.name): - layer.build(None) - if getattr(self, "final_layer_norm", None) is not None: - with tf.name_scope(self.final_layer_norm.name): - self.final_layer_norm.build(None) # TODO Matt might be wrong - #################################################### # TFT5PreTrainedModel is a sub-class of tf.keras.Model @@ -1308,20 +1224,6 @@ def call( encoder_attentions=encoder_outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "shared", None) is not None: - with tf.name_scope(self.shared.name): - self.shared.build(None) - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - if getattr(self, "decoder", None) is not None: - with tf.name_scope(self.decoder.name): - self.decoder.build(None) - @add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING) class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModelingLoss): @@ -1572,20 +1474,6 @@ def prepare_inputs_for_generation( def prepare_decoder_input_ids_from_labels(self, labels: tf.Tensor): return self._shift_right(labels) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "shared", None) is not None: - with tf.name_scope(self.shared.name): - self.shared.build(None) - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - if getattr(self, "decoder", None) is not None: - with tf.name_scope(self.decoder.name): - self.decoder.build(None) - @add_start_docstrings( "The bare T5 Model transformer outputting encoder's raw hidden-stateswithout any specific head on top.", @@ -1664,14 +1552,3 @@ def call( 
hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "shared", None) is not None: - with tf.name_scope(self.shared.name): - self.shared.build(None) - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) diff --git a/src/transformers/models/tapas/modeling_tf_tapas.py b/src/transformers/models/tapas/modeling_tf_tapas.py index ed338d4bd125a7..a41b56e1a6caef 100644 --- a/src/transformers/models/tapas/modeling_tf_tapas.py +++ b/src/transformers/models/tapas/modeling_tf_tapas.py @@ -160,7 +160,7 @@ def __init__(self, config: TapasConfig, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def build(self, input_shape=None): + def build(self, input_shape: tf.TensorShape): with tf.name_scope("word_embeddings"): self.weight = self.add_weight( name="weight", @@ -186,12 +186,7 @@ def build(self, input_shape=None): ), ) - if self.built: - return - self.built = True - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) + super().build(input_shape) def call( self, @@ -284,7 +279,6 @@ def __init__(self, config: TapasConfig, **kwargs): self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.is_decoder = config.is_decoder - self.config = config def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] @@ -374,20 +368,6 @@ def call( outputs = outputs + (past_key_value,) return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "query", None) is not None: - with tf.name_scope(self.query.name): - self.query.build(self.config.hidden_size) - if getattr(self, "key", None) is not None: - with tf.name_scope(self.key.name): - self.key.build(self.config.hidden_size) - if getattr(self, "value", None) is not None: - with tf.name_scope(self.value.name): - self.value.build(self.config.hidden_size) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Tapas class TFTapasSelfOutput(tf.keras.layers.Layer): @@ -399,7 +379,6 @@ def __init__(self, config: TapasConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -408,17 +387,6 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Tapas class 
TFTapasAttention(tf.keras.layers.Layer): @@ -460,17 +428,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attention", None) is not None: - with tf.name_scope(self.self_attention.name): - self.self_attention.build(None) - if getattr(self, "dense_output", None) is not None: - with tf.name_scope(self.dense_output.name): - self.dense_output.build(None) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Tapas class TFTapasIntermediate(tf.keras.layers.Layer): @@ -485,7 +442,6 @@ def __init__(self, config: TapasConfig, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -493,14 +449,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Tapas class TFTapasOutput(tf.keras.layers.Layer): @@ -512,7 +460,6 @@ def __init__(self, config: TapasConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -521,17 +468,6 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.intermediate_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Tapas class TFTapasLayer(tf.keras.layers.Layer): @@ -619,20 +555,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "attention", None) is not None: - with tf.name_scope(self.attention.name): - self.attention.build(None) - if getattr(self, "intermediate", None) is not None: - with tf.name_scope(self.intermediate.name): - self.intermediate.build(None) - if getattr(self, "bert_output", None) is not None: - with tf.name_scope(self.bert_output.name): - self.bert_output.build(None) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Tapas class TFTapasEncoder(tf.keras.layers.Layer): @@ -703,15 +625,6 @@ def call( cross_attentions=all_cross_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "layer", None) is not None: - for layer in self.layer: - with tf.name_scope(layer.name): - layer.build(None) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Tapas class TFTapasPooler(tf.keras.layers.Layer): @@ -724,7 +637,6 @@ def __init__(self, config: TapasConfig, **kwargs): activation="tanh", name="dense", 
) - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: # We "pool" the model by simply taking the hidden state corresponding @@ -734,14 +646,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return pooled_output - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->Tapas class TFTapasPredictionHeadTransform(tf.keras.layers.Layer): @@ -760,7 +664,6 @@ def __init__(self, config: TapasConfig, **kwargs): self.transform_act_fn = config.hidden_act self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -769,17 +672,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertLMPredictionHead with Bert->Tapas class TFTapasLMPredictionHead(tf.keras.layers.Layer): @@ -795,15 +687,10 @@ def __init__(self, config: TapasConfig, input_embeddings: tf.keras.layers.Layer, # an output-only bias for each token. self.input_embeddings = input_embeddings - def build(self, input_shape=None): + def build(self, input_shape: tf.TensorShape): self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") - if self.built: - return - self.built = True - if getattr(self, "transform", None) is not None: - with tf.name_scope(self.transform.name): - self.transform.build(None) + super().build(input_shape) def get_output_embeddings(self) -> tf.keras.layers.Layer: return self.input_embeddings @@ -965,20 +852,6 @@ def call( attentions=encoder_outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embeddings", None) is not None: - with tf.name_scope(self.embeddings.name): - self.embeddings.build(None) - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - if getattr(self, "pooler", None) is not None: - with tf.name_scope(self.pooler.name): - self.pooler.build(None) - class TFTapasPreTrainedModel(TFPreTrainedModel): """ @@ -1160,14 +1033,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "tapas", None) is not None: - with tf.name_scope(self.tapas.name): - self.tapas.build(None) - @add_start_docstrings("""Tapas Model with a `language modeling` head on top.""", TAPAS_START_DOCSTRING) class TFTapasForMaskedLM(TFTapasPreTrainedModel, TFMaskedLanguageModelingLoss): @@ -1264,17 +1129,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "tapas", None) is not None: - with tf.name_scope(self.tapas.name): - self.tapas.build(None) - if getattr(self, "lm_head", None) is not None: - 
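Several Tapas layers above (the embeddings and the LM prediction head) keep a short build() that only registers the layer's own weights and then defers to `super().build(input_shape)`. A hedged sketch of that pattern, with invented names (`ToyLMHead`, `vocab_size`) and assuming ordinary Keras behaviour: `add_weight` can be wrapped in an explicit `tf.name_scope` so the variable's full name stays checkpoint-friendly, and `super().build()` marks the layer as built.

```python
import tensorflow as tf


class ToyLMHead(tf.keras.layers.Layer):
    # Sketch only; names and shapes are illustrative, not the transformers code.
    def __init__(self, vocab_size: int, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size

    def build(self, input_shape):
        # Registering the weight inside an explicit name scope pins its
        # fully-qualified name (useful when it must match a checkpoint layout).
        with tf.name_scope("predictions"):
            self.bias = self.add_weight(
                name="bias", shape=(self.vocab_size,), initializer="zeros", trainable=True
            )
        super().build(input_shape)  # sets self.built = True

    def call(self, logits):
        return tf.nn.bias_add(logits, self.bias)


head = ToyLMHead(vocab_size=32, name="lm_head")
out = head(tf.zeros((2, 7, 32)))  # build() runs once before the first call
print(head.bias.name)
```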
with tf.name_scope(self.lm_head.name): - self.lm_head.build(None) - class TFTapasComputeTokenLogits(tf.keras.layers.Layer): def __init__(self, config: TapasConfig, **kwargs): @@ -1698,20 +1552,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "tapas", None) is not None: - with tf.name_scope(self.tapas.name): - self.tapas.build(None) - if getattr(self, "compute_token_logits", None) is not None: - with tf.name_scope(self.compute_token_logits.name): - self.compute_token_logits.build(None) # TODO Matt might be wrong - if getattr(self, "compute_column_logits", None) is not None: - with tf.name_scope(self.compute_column_logits.name): - self.compute_column_logits.build(None) # TODO Matt might be wrong - @add_start_docstrings( """ @@ -1730,7 +1570,6 @@ def __init__(self, config: TapasConfig, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(TAPAS_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @@ -1815,20 +1654,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "tapas", None) is not None: - with tf.name_scope(self.tapas.name): - self.tapas.build(None) - if getattr(self, "dropout", None) is not None: - with tf.name_scope(self.dropout.name): - self.dropout.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) - """ TAPAS utilities.""" diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py index ea3c1debda73ba..34349c8661757c 100644 --- a/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py +++ b/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py @@ -220,22 +220,12 @@ def __init__( self.visual_projection = Dense(self.projection_dim, use_bias=False, name="visual_projection") self.text_projection = Dense(self.projection_dim, use_bias=False, name="text_projection") self.logit_scale = None - self.config = config def build(self, input_shape=None): # Build in the build() method to make sure the names are right initializer = tf.keras.initializers.Constant(self.config.logit_scale_init_value) self.logit_scale = self.add_weight(shape=(1,), initializer=initializer, name="logit_scale") - - if self.built: - return - self.built = True - if getattr(self, "visual_projection", None) is not None: - with tf.name_scope(self.visual_projection.name): - self.visual_projection.build(None) - if getattr(self, "text_projection", None) is not None: - with tf.name_scope(self.text_projection.name): - self.text_projection.build(None) + super().build(input_shape) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): diff --git a/src/transformers/models/vit/modeling_tf_vit.py b/src/transformers/models/vit/modeling_tf_vit.py index ef5d6e2acadd1f..727db8dfc6c081 100644 --- a/src/transformers/models/vit/modeling_tf_vit.py +++ b/src/transformers/models/vit/modeling_tf_vit.py @@ -66,7 +66,7 @@ def __init__(self, config: ViTConfig, **kwargs): self.dropout = 
tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config - def build(self, input_shape=None): + def build(self, input_shape: tf.TensorShape): num_patches = self.patch_embeddings.num_patches self.cls_token = self.add_weight( shape=(1, 1, self.config.hidden_size), @@ -81,12 +81,7 @@ def build(self, input_shape=None): name="position_embeddings", ) - if self.built: - return - self.built = True - if getattr(self, "patch_embeddings", None) is not None: - with tf.name_scope(self.patch_embeddings.name): - self.patch_embeddings.build(None) + super().build(input_shape) def interpolate_pos_encoding(self, embeddings, height, width) -> tf.Tensor: """ @@ -210,14 +205,6 @@ def call( return embeddings - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "projection", None) is not None: - with tf.name_scope(self.projection.name): - self.projection.build(self.num_channels) - class TFViTSelfAttention(tf.keras.layers.Layer): def __init__(self, config: ViTConfig, **kwargs): @@ -244,7 +231,6 @@ def __init__(self, config: ViTConfig, **kwargs): units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" ) self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) - self.config = config def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] @@ -294,20 +280,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "query", None) is not None: - with tf.name_scope(self.query.name): - self.query.build(self.config.hidden_size) - if getattr(self, "key", None) is not None: - with tf.name_scope(self.key.name): - self.key.build(self.config.hidden_size) - if getattr(self, "value", None) is not None: - with tf.name_scope(self.value.name): - self.value.build(self.config.hidden_size) - class TFViTSelfOutput(tf.keras.layers.Layer): """ @@ -322,7 +294,6 @@ def __init__(self, config: ViTConfig, **kwargs): units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -330,14 +301,6 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - class TFViTAttention(tf.keras.layers.Layer): def __init__(self, config: ViTConfig, **kwargs): @@ -366,17 +329,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attention", None) is not None: - with tf.name_scope(self.self_attention.name): - self.self_attention.build(None) - if getattr(self, "dense_output", None) is not None: - with tf.name_scope(self.dense_output.name): - self.dense_output.build(None) - class TFViTIntermediate(tf.keras.layers.Layer): def __init__(self, config: ViTConfig, **kwargs): @@ -390,7 +342,6 @@ def __init__(self, config: ViTConfig, **kwargs): self.intermediate_act_fn = 
get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -398,14 +349,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - class TFViTOutput(tf.keras.layers.Layer): def __init__(self, config: ViTConfig, **kwargs): @@ -415,7 +358,6 @@ def __init__(self, config: ViTConfig, **kwargs): units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -424,14 +366,6 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.intermediate_size) - class TFViTLayer(tf.keras.layers.Layer): """This corresponds to the Block class in the timm implementation.""" @@ -449,7 +383,6 @@ def __init__(self, config: ViTConfig, **kwargs): self.layernorm_after = tf.keras.layers.LayerNormalization( epsilon=config.layer_norm_eps, name="layernorm_after" ) - self.config = config def call( self, @@ -483,26 +416,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "attention", None) is not None: - with tf.name_scope(self.attention.name): - self.attention.build(None) - if getattr(self, "intermediate", None) is not None: - with tf.name_scope(self.intermediate.name): - self.intermediate.build(None) - if getattr(self, "vit_output", None) is not None: - with tf.name_scope(self.vit_output.name): - self.vit_output.build(None) - if getattr(self, "layernorm_before", None) is not None: - with tf.name_scope(self.layernorm_before.name): - self.layernorm_before.build([None, None, self.config.hidden_size]) - if getattr(self, "layernorm_after", None) is not None: - with tf.name_scope(self.layernorm_after.name): - self.layernorm_after.build([None, None, self.config.hidden_size]) - class TFViTEncoder(tf.keras.layers.Layer): def __init__(self, config: ViTConfig, **kwargs): @@ -548,15 +461,6 @@ def call( last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "layer", None) is not None: - for layer in self.layer: - with tf.name_scope(layer.name): - layer.build(None) - @keras_serializable class TFViTMainLayer(tf.keras.layers.Layer): @@ -635,23 +539,6 @@ def call( attentions=encoder_outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embeddings", None) is not None: - with tf.name_scope(self.embeddings.name): - self.embeddings.build(None) - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - if getattr(self, "layernorm", None) is not None: - with 
tf.name_scope(self.layernorm.name): - self.layernorm.build([None, None, self.config.hidden_size]) - if getattr(self, "pooler", None) is not None: - with tf.name_scope(self.pooler.name): - self.pooler.build(None) - class TFViTPreTrainedModel(TFPreTrainedModel): """ @@ -778,14 +665,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "vit", None) is not None: - with tf.name_scope(self.vit.name): - self.vit.build(None) - class TFViTPooler(tf.keras.layers.Layer): def __init__(self, config: ViTConfig, **kwargs): @@ -797,7 +676,6 @@ def __init__(self, config: ViTConfig, **kwargs): activation="tanh", name="dense", ) - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: # We "pool" the model by simply taking the hidden state corresponding @@ -807,14 +685,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return pooled_output - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -844,7 +714,6 @@ def __init__(self, config: ViTConfig, *inputs, **kwargs): kernel_initializer=get_initializer(config.initializer_range), name="classifier", ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING) @@ -895,14 +764,3 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "vit", None) is not None: - with tf.name_scope(self.vit.name): - self.vit.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) diff --git a/src/transformers/models/vit_mae/modeling_tf_vit_mae.py b/src/transformers/models/vit_mae/modeling_tf_vit_mae.py index 79a91f4d22ec9a..21898bbe83bb2c 100644 --- a/src/transformers/models/vit_mae/modeling_tf_vit_mae.py +++ b/src/transformers/models/vit_mae/modeling_tf_vit_mae.py @@ -213,7 +213,7 @@ def __init__(self, config: ViTMAEConfig, **kwargs): self.config = config - def build(self, input_shape=None): + def build(self, input_shape: tf.TensorShape): self.cls_token = self.add_weight( shape=(1, 1, self.config.hidden_size), initializer=tf.random_normal_initializer(stddev=self.config.initializer_range), @@ -233,12 +233,7 @@ def build(self, input_shape=None): )[None, ...] 
self.position_embeddings.assign(pos_embed) - if self.built: - return - self.built = True - if getattr(self, "patch_embeddings", None) is not None: - with tf.name_scope(self.patch_embeddings.name): - self.patch_embeddings.build(None) + super().build(input_shape) def random_masking(self, sequence: tf.Tensor, noise: tf.Tensor | None = None): """ @@ -357,14 +352,6 @@ def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor: return x - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "projection", None) is not None: - with tf.name_scope(self.projection.name): - self.projection.build(self.num_channels) - # Copied from transformers.models.vit.modeling_tf_vit.TFViTSelfAttention with ViT->ViTMAE class TFViTMAESelfAttention(tf.keras.layers.Layer): @@ -392,7 +379,6 @@ def __init__(self, config: ViTMAEConfig, **kwargs): units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" ) self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) - self.config = config def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] @@ -442,20 +428,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "query", None) is not None: - with tf.name_scope(self.query.name): - self.query.build(self.config.hidden_size) - if getattr(self, "key", None) is not None: - with tf.name_scope(self.key.name): - self.key.build(self.config.hidden_size) - if getattr(self, "value", None) is not None: - with tf.name_scope(self.value.name): - self.value.build(self.config.hidden_size) - # Copied from transformers.models.vit.modeling_tf_vit.TFViTSelfOutput with ViT->ViTMAE class TFViTMAESelfOutput(tf.keras.layers.Layer): @@ -471,7 +443,6 @@ def __init__(self, config: ViTMAEConfig, **kwargs): units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -479,14 +450,6 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - # Copied from transformers.models.vit.modeling_tf_vit.TFViTAttention with ViT->ViTMAE class TFViTMAEAttention(tf.keras.layers.Layer): @@ -516,17 +479,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attention", None) is not None: - with tf.name_scope(self.self_attention.name): - self.self_attention.build(None) - if getattr(self, "dense_output", None) is not None: - with tf.name_scope(self.dense_output.name): - self.dense_output.build(None) - # Copied from transformers.models.vit.modeling_tf_vit.TFViTIntermediate with ViT->ViTMAE class TFViTMAEIntermediate(tf.keras.layers.Layer): @@ -541,7 +493,6 @@ def __init__(self, config: ViTMAEConfig, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: 
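The ViT-MAE embeddings hunk just above keeps the pattern of creating `position_embeddings` in build() and filling it from a precomputed table via `assign(...)` before calling `super().build(input_shape)`. A small sketch of that idea, assuming plain Keras and using invented names (`ToyFixedPositionEmbeddings`; a random table stands in for the real sin-cos construction):

```python
import numpy as np
import tensorflow as tf


class ToyFixedPositionEmbeddings(tf.keras.layers.Layer):
    # Sketch with invented names, not the transformers implementation.
    def __init__(self, num_positions: int, hidden_size: int, **kwargs):
        super().__init__(**kwargs)
        self.num_positions = num_positions
        self.hidden_size = hidden_size

    def build(self, input_shape):
        self.position_embeddings = self.add_weight(
            name="position_embeddings",
            shape=(1, self.num_positions, self.hidden_size),
            initializer="zeros",
            trainable=False,
        )
        # Stand-in table; a real model would compute its sin-cos grid here.
        table = np.random.RandomState(0).normal(size=(1, self.num_positions, self.hidden_size))
        self.position_embeddings.assign(table.astype(np.float32))
        super().build(input_shape)

    def call(self, embeddings):
        return embeddings + self.position_embeddings[:, : tf.shape(embeddings)[1], :]


layer = ToyFixedPositionEmbeddings(num_positions=16, hidden_size=8, name="embeddings")
out = layer(tf.zeros((2, 5, 8)))  # adds the first 5 rows of the fixed table
```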
self.intermediate_act_fn = config.hidden_act - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -549,14 +500,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - # Copied from transformers.models.vit.modeling_tf_vit.TFViTOutput with ViT->ViTMAE class TFViTMAEOutput(tf.keras.layers.Layer): @@ -567,7 +510,6 @@ def __init__(self, config: ViTMAEConfig, **kwargs): units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -576,14 +518,6 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.intermediate_size) - # Copied from transformers.models.vit.modeling_tf_vit.TFViTLayer with ViT->ViTMAE class TFViTMAELayer(tf.keras.layers.Layer): @@ -602,7 +536,6 @@ def __init__(self, config: ViTMAEConfig, **kwargs): self.layernorm_after = tf.keras.layers.LayerNormalization( epsilon=config.layer_norm_eps, name="layernorm_after" ) - self.config = config def call( self, @@ -636,26 +569,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "attention", None) is not None: - with tf.name_scope(self.attention.name): - self.attention.build(None) - if getattr(self, "intermediate", None) is not None: - with tf.name_scope(self.intermediate.name): - self.intermediate.build(None) - if getattr(self, "vit_output", None) is not None: - with tf.name_scope(self.vit_output.name): - self.vit_output.build(None) - if getattr(self, "layernorm_before", None) is not None: - with tf.name_scope(self.layernorm_before.name): - self.layernorm_before.build([None, None, self.config.hidden_size]) - if getattr(self, "layernorm_after", None) is not None: - with tf.name_scope(self.layernorm_after.name): - self.layernorm_after.build([None, None, self.config.hidden_size]) - # Copied from transformers.models.vit.modeling_tf_vit.TFViTEncoder with ViT->ViTMAE class TFViTMAEEncoder(tf.keras.layers.Layer): @@ -702,15 +615,6 @@ def call( last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "layer", None) is not None: - for layer in self.layer: - with tf.name_scope(layer.name): - layer.build(None) - @keras_serializable class TFViTMAEMainLayer(tf.keras.layers.Layer): @@ -783,20 +687,6 @@ def call( attentions=encoder_outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embeddings", None) is not None: - with tf.name_scope(self.embeddings.name): - self.embeddings.build(None) - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - if getattr(self, 
"layernorm", None) is not None: - with tf.name_scope(self.layernorm.name): - self.layernorm.build([None, None, self.config.hidden_size]) - class TFViTMAEPreTrainedModel(TFPreTrainedModel): """ @@ -939,14 +829,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "vit", None) is not None: - with tf.name_scope(self.vit.name): - self.vit.build(None) - class TFViTMAEDecoder(tf.keras.layers.Layer): def __init__(self, config, num_patches, **kwargs): @@ -971,7 +853,7 @@ def __init__(self, config, num_patches, **kwargs): self.config = config self.num_patches = num_patches - def build(self, input_shape=None): + def build(self, input_shape: tf.TensorShape): self.mask_token = self.add_weight( shape=(1, 1, self.config.decoder_hidden_size), initializer=tf.random_normal_initializer(stddev=self.config.initializer_range), @@ -991,22 +873,7 @@ def build(self, input_shape=None): )[None, ...] self.decoder_pos_embed.assign(decoder_pos_embed) - if self.built: - return - self.built = True - if getattr(self, "decoder_embed", None) is not None: - with tf.name_scope(self.decoder_embed.name): - self.decoder_embed.build(self.config.hidden_size) - if getattr(self, "decoder_norm", None) is not None: - with tf.name_scope(self.decoder_norm.name): - self.decoder_norm.build([None, None, self.config.decoder_hidden_size]) - if getattr(self, "decoder_pred", None) is not None: - with tf.name_scope(self.decoder_pred.name): - self.decoder_pred.build(self.config.decoder_hidden_size) - if getattr(self, "decoder_layers", None) is not None: - for layer in self.decoder_layers: - with tf.name_scope(layer.name): - layer.build(None) + super().build(input_shape) def call( self, @@ -1261,14 +1128,3 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "vit", None) is not None: - with tf.name_scope(self.vit.name): - self.vit.build(None) - if getattr(self, "decoder", None) is not None: - with tf.name_scope(self.decoder.name): - self.decoder.build(None) diff --git a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py index df979ceff033f8..10563579fdd55f 100644 --- a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py @@ -502,14 +502,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.activation(hidden_states) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "conv", None) is not None: - with tf.name_scope(self.conv.name): - self.conv.build(None) - class TFWav2Vec2LayerNormConvLayer(tf.keras.layers.Layer): def __init__(self, config: Wav2Vec2Config, layer_id: int = 0, **kwargs: Any) -> None: @@ -533,17 +525,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.activation(hidden_states) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "conv", None) is not None: - with tf.name_scope(self.conv.name): - self.conv.build(None) - if getattr(self, "layer_norm", None) is not None: - with tf.name_scope(self.layer_norm.name): - self.layer_norm.build([None, None, self.out_conv_dim]) - class TFWav2Vec2GroupNormConvLayer(tf.keras.layers.Layer): def __init__(self, config: Wav2Vec2Config, layer_id: int = 0, **kwargs: Any) 
-> None: @@ -569,17 +550,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.activation(hidden_states) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "conv", None) is not None: - with tf.name_scope(self.conv.name): - self.conv.build(None) - if getattr(self, "layer_norm", None) is not None: - with tf.name_scope(self.layer_norm.name): - self.layer_norm.build(None) - class TFWav2Vec2PositionalConvEmbedding(tf.keras.layers.Layer): def __init__(self, config: Wav2Vec2Config, **kwargs: Any) -> None: @@ -600,14 +570,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.activation(hidden_states) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "conv", None) is not None: - with tf.name_scope(self.conv.name): - self.conv.build(None) - class TFWav2Vec2SamePadLayer(tf.keras.layers.Layer): def __init__(self, num_conv_pos_embeddings, **kwargs): @@ -670,7 +632,6 @@ def __init__(self, config: Wav2Vec2Config, **kwargs): name="projection", ) self.dropout = tf.keras.layers.Dropout(rate=config.feat_proj_dropout) - self.config = config def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: norm_hidden_states = self.layer_norm(hidden_states) @@ -678,17 +639,6 @@ def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dropout(hidden_states, training=training) return hidden_states, norm_hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "layer_norm", None) is not None: - with tf.name_scope(self.layer_norm.name): - self.layer_norm.build([None, None, self.config.conv_dim[-1]]) - if getattr(self, "projection", None) is not None: - with tf.name_scope(self.projection.name): - self.projection.build(self.config.conv_dim[-1]) - # Copied from transformers.models.bart.modeling_tf_bart.TFBartAttention with TFBart->TFWav2Vec2 class TFWav2Vec2Attention(tf.keras.layers.Layer): @@ -843,23 +793,6 @@ def call( return attn_output, attn_weights, past_key_value - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "k_proj", None) is not None: - with tf.name_scope(self.k_proj.name): - self.k_proj.build(self.embed_dim) - if getattr(self, "q_proj", None) is not None: - with tf.name_scope(self.q_proj.name): - self.q_proj.build(self.embed_dim) - if getattr(self, "v_proj", None) is not None: - with tf.name_scope(self.v_proj.name): - self.v_proj.build(self.embed_dim) - if getattr(self, "out_proj", None) is not None: - with tf.name_scope(self.out_proj.name): - self.out_proj.build(self.embed_dim) - class TFWav2Vec2FeedForward(tf.keras.layers.Layer): def __init__(self, config: Wav2Vec2Config, **kwargs): @@ -882,7 +815,6 @@ def __init__(self, config: Wav2Vec2Config, **kwargs): name="output_dense", ) self.output_dropout = tf.keras.layers.Dropout(config.hidden_dropout) - self.config = config def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.intermediate_dense(hidden_states) @@ -893,17 +825,6 @@ def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.output_dropout(hidden_states, training=training) return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "intermediate_dense", None) is not None: - with 
tf.name_scope(self.intermediate_dense.name): - self.intermediate_dense.build(self.config.hidden_size) - if getattr(self, "output_dense", None) is not None: - with tf.name_scope(self.output_dense.name): - self.output_dense.build(self.config.intermediate_size) - class TFWav2Vec2EncoderLayer(tf.keras.layers.Layer): def __init__(self, config: Wav2Vec2Config, **kwargs): @@ -921,7 +842,6 @@ def __init__(self, config: Wav2Vec2Config, **kwargs): self.final_layer_norm = tf.keras.layers.LayerNormalization( epsilon=config.layer_norm_eps, name="final_layer_norm" ) - self.config = config def call( self, @@ -948,23 +868,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "attention", None) is not None: - with tf.name_scope(self.attention.name): - self.attention.build(None) - if getattr(self, "layer_norm", None) is not None: - with tf.name_scope(self.layer_norm.name): - self.layer_norm.build([None, None, self.config.hidden_size]) - if getattr(self, "feed_forward", None) is not None: - with tf.name_scope(self.feed_forward.name): - self.feed_forward.build(None) - if getattr(self, "final_layer_norm", None) is not None: - with tf.name_scope(self.final_layer_norm.name): - self.final_layer_norm.build([None, None, self.config.hidden_size]) - class TFWav2Vec2EncoderLayerStableLayerNorm(tf.keras.layers.Layer): def __init__(self, config: Wav2Vec2Config, **kwargs): @@ -982,7 +885,6 @@ def __init__(self, config: Wav2Vec2Config, **kwargs): self.final_layer_norm = tf.keras.layers.LayerNormalization( epsilon=config.layer_norm_eps, name="final_layer_norm" ) - self.config = config def call( self, @@ -1007,23 +909,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "attention", None) is not None: - with tf.name_scope(self.attention.name): - self.attention.build(None) - if getattr(self, "layer_norm", None) is not None: - with tf.name_scope(self.layer_norm.name): - self.layer_norm.build([None, None, self.config.hidden_size]) - if getattr(self, "feed_forward", None) is not None: - with tf.name_scope(self.feed_forward.name): - self.feed_forward.build(None) - if getattr(self, "final_layer_norm", None) is not None: - with tf.name_scope(self.final_layer_norm.name): - self.final_layer_norm.build([None, None, self.config.hidden_size]) - class TFWav2Vec2Encoder(tf.keras.layers.Layer): def __init__(self, config: Wav2Vec2Config, **kwargs): @@ -1089,21 +974,6 @@ def call( attentions=all_self_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "pos_conv_embed", None) is not None: - with tf.name_scope(self.pos_conv_embed.name): - self.pos_conv_embed.build(None) - if getattr(self, "layer_norm", None) is not None: - with tf.name_scope(self.layer_norm.name): - self.layer_norm.build([None, None, self.config.hidden_size]) - if getattr(self, "layer", None) is not None: - for layer in self.layer: - with tf.name_scope(layer.name): - layer.build(None) - class TFWav2Vec2EncoderStableLayerNorm(tf.keras.layers.Layer): def __init__(self, config: Wav2Vec2Config, **kwargs): @@ -1171,21 +1041,6 @@ def call( attentions=all_self_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "pos_conv_embed", None) is not None: - with tf.name_scope(self.pos_conv_embed.name): - self.pos_conv_embed.build(None) - if getattr(self, "layer_norm", None) is not None: - with 
tf.name_scope(self.layer_norm.name): - self.layer_norm.build([None, None, self.config.hidden_size]) - if getattr(self, "layer", None) is not None: - for layer in self.layer: - with tf.name_scope(layer.name): - layer.build(None) - @keras_serializable class TFWav2Vec2MainLayer(tf.keras.layers.Layer): @@ -1202,20 +1057,12 @@ def __init__(self, config: Wav2Vec2Config, **kwargs): else: self.encoder = TFWav2Vec2Encoder(config, name="encoder") - def build(self, input_shape=None): + def build(self, input_shape: tf.TensorShape): self.masked_spec_embed = self.add_weight( shape=(self.config.hidden_size,), initializer="uniform", trainable=True, name="masked_spec_embed" ) - if self.built: - return - self.built = True - if getattr(self, "feature_extractor", None) is not None: - with tf.name_scope(self.feature_extractor.name): - self.feature_extractor.build(None) - if getattr(self, "feature_projection", None) is not None: - with tf.name_scope(self.feature_projection.name): - self.feature_projection.build(None) + super().build(input_shape) def _get_feat_extract_output_lengths(self, input_lengths: tf.Tensor): """ @@ -1572,14 +1419,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "wav2vec2", None) is not None: - with tf.name_scope(self.wav2vec2.name): - self.wav2vec2.build(None) - @add_start_docstrings( """TFWav2Vec2 Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""", @@ -1736,17 +1575,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "wav2vec2", None) is not None: - with tf.name_scope(self.wav2vec2.name): - self.wav2vec2.build(None) - if getattr(self, "lm_head", None) is not None: - with tf.name_scope(self.lm_head.name): - self.lm_head.build(self.output_hidden_size) - class TFWav2Vec2ForSequenceClassification(TFWav2Vec2PreTrainedModel): def __init__(self, config): @@ -1844,17 +1672,3 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "wav2vec2", None) is not None: - with tf.name_scope(self.wav2vec2.name): - self.wav2vec2.build(None) - if getattr(self, "projector", None) is not None: - with tf.name_scope(self.projector.name): - self.projector.build(self.config.hidden_size) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.classifier_proj_size) diff --git a/src/transformers/models/whisper/modeling_tf_whisper.py b/src/transformers/models/whisper/modeling_tf_whisper.py index 9818889aa83189..6789758076142b 100644 --- a/src/transformers/models/whisper/modeling_tf_whisper.py +++ b/src/transformers/models/whisper/modeling_tf_whisper.py @@ -313,23 +313,6 @@ def call( return attn_output, attn_weights, past_key_value - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "k_proj", None) is not None: - with tf.name_scope(self.k_proj.name): - self.k_proj.build(self.embed_dim) - if getattr(self, "v_proj", None) is not None: - with tf.name_scope(self.v_proj.name): - self.v_proj.build(self.embed_dim) - if getattr(self, "q_proj", None) is not None: - with tf.name_scope(self.q_proj.name): - self.q_proj.build(self.embed_dim) - if getattr(self, "out_proj", None) is not None: - with tf.name_scope(self.out_proj.name): - 
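The Wav2Vec2 main layer above trims its build() down to creating `masked_spec_embed` and calling `super().build(input_shape)`, leaving the feature extractor and encoder to build on their first call. A rough sketch of a layer with that shape, under the same assumption of default Keras build semantics and with invented names (`ToyMainLayer`, a single Dense standing in for the encoder):

```python
import tensorflow as tf


class ToyMainLayer(tf.keras.layers.Layer):
    # Sketch only, not the transformers code: one own weight is created in
    # build(), while sublayers are left to build themselves on the first call.
    def __init__(self, hidden_size: int, **kwargs):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.encoder = tf.keras.layers.Dense(hidden_size, name="encoder")

    def build(self, input_shape):
        self.masked_spec_embed = self.add_weight(
            shape=(self.hidden_size,), initializer="uniform", trainable=True, name="masked_spec_embed"
        )
        super().build(input_shape)

    def call(self, hidden_states):
        # build() has already created masked_spec_embed by the time we get here;
        # the encoder Dense builds itself on this first invocation.
        return self.encoder(hidden_states) + self.masked_spec_embed


layer = ToyMainLayer(hidden_size=8, name="toy_wav2vec2")
out = layer(tf.zeros((2, 3, 8)))  # creates masked_spec_embed and the encoder weights
```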
self.out_proj.build(self.embed_dim) - # Copied from transformers.models.speech_to_text.modeling_tf_speech_to_text.TFSpeech2TextEncoderLayer with Speech2Text->Whisper class TFWhisperEncoderLayer(tf.keras.layers.Layer): @@ -346,7 +329,6 @@ def __init__(self, config: WhisperConfig, **kwargs): self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") - self.config = config def call( self, hidden_states: tf.Tensor, attention_mask: tf.Tensor, layer_head_mask: tf.Tensor, training: bool = False @@ -387,26 +369,6 @@ def call( return hidden_states, self_attn_weights - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attn", None) is not None: - with tf.name_scope(self.self_attn.name): - self.self_attn.build(None) - if getattr(self, "self_attn_layer_norm", None) is not None: - with tf.name_scope(self.self_attn_layer_norm.name): - self.self_attn_layer_norm.build([None, None, self.embed_dim]) - if getattr(self, "fc1", None) is not None: - with tf.name_scope(self.fc1.name): - self.fc1.build(self.embed_dim) - if getattr(self, "fc2", None) is not None: - with tf.name_scope(self.fc2.name): - self.fc2.build(self.config.encoder_ffn_dim) - if getattr(self, "final_layer_norm", None) is not None: - with tf.name_scope(self.final_layer_norm.name): - self.final_layer_norm.build([None, None, self.embed_dim]) - # Copied from transformers.models.speech_to_text.modeling_tf_speech_to_text.TFSpeech2TextDecoderLayer with Speech2Text->Whisper class TFWhisperDecoderLayer(tf.keras.layers.Layer): @@ -437,7 +399,6 @@ def __init__(self, config: WhisperConfig, **kwargs): self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") - self.config = config def call( self, @@ -521,32 +482,6 @@ def call( present_key_value, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attn", None) is not None: - with tf.name_scope(self.self_attn.name): - self.self_attn.build(None) - if getattr(self, "self_attn_layer_norm", None) is not None: - with tf.name_scope(self.self_attn_layer_norm.name): - self.self_attn_layer_norm.build([None, None, self.embed_dim]) - if getattr(self, "encoder_attn", None) is not None: - with tf.name_scope(self.encoder_attn.name): - self.encoder_attn.build(None) - if getattr(self, "encoder_attn_layer_norm", None) is not None: - with tf.name_scope(self.encoder_attn_layer_norm.name): - self.encoder_attn_layer_norm.build([None, None, self.embed_dim]) - if getattr(self, "fc1", None) is not None: - with tf.name_scope(self.fc1.name): - self.fc1.build(self.embed_dim) - if getattr(self, "fc2", None) is not None: - with tf.name_scope(self.fc2.name): - self.fc2.build(self.config.decoder_ffn_dim) - if getattr(self, "final_layer_norm", None) is not None: - with tf.name_scope(self.final_layer_norm.name): - self.final_layer_norm.build([None, None, self.embed_dim]) - class TFWhisperPreTrainedModel(TFPreTrainedModel): config_class = WhisperConfig @@ -814,27 +749,6 @@ def call( last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "conv1", None) is not 
None: - with tf.name_scope(self.conv1.name): - self.conv1.build(None) - if getattr(self, "conv2", None) is not None: - with tf.name_scope(self.conv2.name): - self.conv2.build(None) - if getattr(self, "embed_positions", None) is not None: - with tf.name_scope(self.embed_positions.name): - self.embed_positions.build(None) - if getattr(self, "layer_norm", None) is not None: - with tf.name_scope(self.layer_norm.name): - self.layer_norm.build([None, None, self.config.d_model]) - if getattr(self, "encoder_layers", None) is not None: - for layer in self.encoder_layers: - with tf.name_scope(layer.name): - layer.build(None) - @keras_serializable class TFWhisperDecoder(tf.keras.layers.Layer): @@ -1074,24 +988,6 @@ def call( cross_attentions=all_cross_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "embed_tokens", None) is not None: - with tf.name_scope(self.embed_tokens.name): - self.embed_tokens.build(None) - if getattr(self, "embed_positions", None) is not None: - with tf.name_scope(self.embed_positions.name): - self.embed_positions.build(None) - if getattr(self, "layer_norm", None) is not None: - with tf.name_scope(self.layer_norm.name): - self.layer_norm.build([None, None, self.config.d_model]) - if getattr(self, "decoder_layers", None) is not None: - for layer in self.decoder_layers: - with tf.name_scope(layer.name): - layer.build(None) - @add_start_docstrings( "The bare Whisper Model outputting raw hidden-states without any specific head on top.", @@ -1215,17 +1111,6 @@ def call( encoder_attentions=encoder_outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - if getattr(self, "decoder", None) is not None: - with tf.name_scope(self.decoder.name): - self.decoder.build(None) - @add_start_docstrings( "The bare Whisper Model outputting raw hidden-states without any specific head on top.", @@ -1334,14 +1219,6 @@ def serving_output(self, output): encoder_attentions=enc_attns, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "model", None) is not None: - with tf.name_scope(self.model.name): - self.model.build(None) - @add_start_docstrings( "The Whisper Model with a language modeling head. 
Can be used for automatic speech recognition.", @@ -1748,11 +1625,3 @@ def prepare_inputs_for_generation( "decoder_attention_mask": decoder_attention_mask, "decoder_position_ids": decoder_position_ids, } - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "model", None) is not None: - with tf.name_scope(self.model.name): - self.model.build(None) diff --git a/src/transformers/models/xglm/modeling_tf_xglm.py b/src/transformers/models/xglm/modeling_tf_xglm.py index fd7a2538f280ae..e2890edeb665af 100644 --- a/src/transformers/models/xglm/modeling_tf_xglm.py +++ b/src/transformers/models/xglm/modeling_tf_xglm.py @@ -301,23 +301,6 @@ def call( return attn_output, attn_weights, past_key_value - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "k_proj", None) is not None: - with tf.name_scope(self.k_proj.name): - self.k_proj.build(self.embed_dim) - if getattr(self, "q_proj", None) is not None: - with tf.name_scope(self.q_proj.name): - self.q_proj.build(self.embed_dim) - if getattr(self, "v_proj", None) is not None: - with tf.name_scope(self.v_proj.name): - self.v_proj.build(self.embed_dim) - if getattr(self, "out_proj", None) is not None: - with tf.name_scope(self.out_proj.name): - self.out_proj.build(self.embed_dim) - class TFXGLMDecoderLayer(tf.keras.layers.Layer): def __init__(self, config: XGLMConfig, **kwargs: Any) -> None: @@ -350,7 +333,6 @@ def __init__(self, config: XGLMConfig, **kwargs: Any) -> None: self.fc1 = tf.keras.layers.Dense(config.ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") - self.config = config # Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartDecoderLayer.call def call( @@ -433,26 +415,6 @@ def call( present_key_value, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attn", None) is not None: - with tf.name_scope(self.self_attn.name): - self.self_attn.build(None) - if getattr(self, "self_attn_layer_norm", None) is not None: - with tf.name_scope(self.self_attn_layer_norm.name): - self.self_attn_layer_norm.build([None, None, self.embed_dim]) - if getattr(self, "fc1", None) is not None: - with tf.name_scope(self.fc1.name): - self.fc1.build(self.embed_dim) - if getattr(self, "fc2", None) is not None: - with tf.name_scope(self.fc2.name): - self.fc2.build(self.config.ffn_dim) - if getattr(self, "final_layer_norm", None) is not None: - with tf.name_scope(self.final_layer_norm.name): - self.final_layer_norm.build([None, None, self.embed_dim]) - @keras_serializable class TFXGLMMainLayer(tf.keras.layers.Layer): @@ -647,18 +609,6 @@ def call( cross_attentions=all_cross_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "layer_norm", None) is not None: - with tf.name_scope(self.layer_norm.name): - self.layer_norm.build([None, None, self.config.d_model]) - if getattr(self, "layers", None) is not None: - for layer in self.layers: - with tf.name_scope(layer.name): - layer.build(None) - class TFXGLMPreTrainedModel(TFPreTrainedModel): config_class = XGLMConfig @@ -842,14 +792,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "model", None) is not None: - with tf.name_scope(self.model.name): - self.model.build(None) - @add_start_docstrings( 
""" @@ -880,7 +822,6 @@ def __init__( kernel_initializer=get_initializer(config.init_std), name="lm_head", ) - self.config = config def get_output_embeddings(self): return self.lm_head @@ -983,14 +924,3 @@ def call( attentions=outputs.attentions, cross_attentions=outputs.cross_attentions, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "model", None) is not None: - with tf.name_scope(self.model.name): - self.model.build(None) - if getattr(self, "lm_head", None) is not None: - with tf.name_scope(self.lm_head.name): - self.lm_head.build(self.config.hidden_size) diff --git a/src/transformers/models/xlm/modeling_tf_xlm.py b/src/transformers/models/xlm/modeling_tf_xlm.py index 0500e9719c1c7e..9343f6cb524be0 100644 --- a/src/transformers/models/xlm/modeling_tf_xlm.py +++ b/src/transformers/models/xlm/modeling_tf_xlm.py @@ -316,7 +316,7 @@ def __init__(self, config, **kwargs): if self.attentions[int(layer)].n_heads == config.n_heads: self.prune_heads({int(layer): list(map(int, heads))}) - def build(self, input_shape=None): + def build(self, input_shape): with tf.name_scope("position_embeddings"): self.position_embeddings = self.add_weight( name="embeddings", @@ -332,15 +332,7 @@ def build(self, input_shape=None): initializer=get_initializer(self.embed_init_std), ) - if self.built: - return - self.built = True - if getattr(self, "embeddings", None) is not None: - with tf.name_scope(self.embeddings.name): - self.embeddings.build(None) - if getattr(self, "layer_norm_emb", None) is not None: - with tf.name_scope(self.layer_norm_emb.name): - self.layer_norm_emb.build([None, None, self.dim]) + super().build(input_shape) def get_input_embeddings(self): return self.embeddings @@ -742,14 +734,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "transformer", None) is not None: - with tf.name_scope(self.transformer.name): - self.transformer.build(None) - class TFXLMPredLayer(tf.keras.layers.Layer): """ @@ -887,17 +871,6 @@ def call( logits=outputs, hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "transformer", None) is not None: - with tf.name_scope(self.transformer.name): - self.transformer.build(None) - if getattr(self, "pred_layer", None) is not None: - with tf.name_scope(self.pred_layer.name): - self.pred_layer.build(None) - @add_start_docstrings( """ @@ -976,17 +949,6 @@ def call( attentions=transformer_outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "transformer", None) is not None: - with tf.name_scope(self.transformer.name): - self.transformer.build(None) - if getattr(self, "sequence_summary", None) is not None: - with tf.name_scope(self.sequence_summary.name): - self.sequence_summary.build(None) - @add_start_docstrings( """ @@ -1004,7 +966,6 @@ def __init__(self, config, *inputs, **kwargs): self.logits_proj = tf.keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj" ) - self.config = config @property def dummy_inputs(self): @@ -1107,20 +1068,6 @@ def call( attentions=transformer_outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "transformer", None) is not None: - with tf.name_scope(self.transformer.name): - 
self.transformer.build(None) - if getattr(self, "sequence_summary", None) is not None: - with tf.name_scope(self.sequence_summary.name): - self.sequence_summary.build(None) - if getattr(self, "logits_proj", None) is not None: - with tf.name_scope(self.logits_proj.name): - self.logits_proj.build(self.config.num_labels) - @add_start_docstrings( """ @@ -1139,7 +1086,6 @@ def __init__(self, config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.init_std), name="classifier" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1202,17 +1148,6 @@ def call( attentions=transformer_outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "transformer", None) is not None: - with tf.name_scope(self.transformer.name): - self.transformer.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1228,7 +1163,6 @@ def __init__(self, config, *inputs, **kwargs): self.qa_outputs = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.init_std), name="qa_outputs" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1304,14 +1238,3 @@ def call( hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "transformer", None) is not None: - with tf.name_scope(self.transformer.name): - self.transformer.build(None) - if getattr(self, "qa_outputs", None) is not None: - with tf.name_scope(self.qa_outputs.name): - self.qa_outputs.build(self.config.hidden_size) diff --git a/src/transformers/models/xlm_roberta/modeling_tf_xlm_roberta.py b/src/transformers/models/xlm_roberta/modeling_tf_xlm_roberta.py index 66320260ff39a2..65f3be9e2f277f 100644 --- a/src/transformers/models/xlm_roberta/modeling_tf_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/modeling_tf_xlm_roberta.py @@ -178,7 +178,7 @@ def __init__(self, config, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def build(self, input_shape=None): + def build(self, input_shape: tf.TensorShape): with tf.name_scope("word_embeddings"): self.weight = self.add_weight( name="weight", @@ -200,12 +200,7 @@ def build(self, input_shape=None): initializer=get_initializer(self.initializer_range), ) - if self.built: - return - self.built = True - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) + super().build(input_shape) def create_position_ids_from_input_ids(self, input_ids, past_key_values_length=0): """ @@ -278,7 +273,6 @@ def __init__(self, config: XLMRobertaConfig, **kwargs): activation="tanh", name="dense", ) - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: # We "pool" the model by simply taking the hidden state corresponding @@ -288,14 +282,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return pooled_output - def build(self, input_shape=None): - if self.built: - return - 
self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->XLMRoberta class TFXLMRobertaSelfAttention(tf.keras.layers.Layer): @@ -325,7 +311,6 @@ def __init__(self, config: XLMRobertaConfig, **kwargs): self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.is_decoder = config.is_decoder - self.config = config def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] @@ -415,20 +400,6 @@ def call( outputs = outputs + (past_key_value,) return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "query", None) is not None: - with tf.name_scope(self.query.name): - self.query.build(self.config.hidden_size) - if getattr(self, "key", None) is not None: - with tf.name_scope(self.key.name): - self.key.build(self.config.hidden_size) - if getattr(self, "value", None) is not None: - with tf.name_scope(self.value.name): - self.value.build(self.config.hidden_size) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->XLMRoberta class TFXLMRobertaSelfOutput(tf.keras.layers.Layer): @@ -440,7 +411,6 @@ def __init__(self, config: XLMRobertaConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -449,17 +419,6 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->XLMRoberta class TFXLMRobertaAttention(tf.keras.layers.Layer): @@ -501,17 +460,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "self_attention", None) is not None: - with tf.name_scope(self.self_attention.name): - self.self_attention.build(None) - if getattr(self, "dense_output", None) is not None: - with tf.name_scope(self.dense_output.name): - self.dense_output.build(None) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->XLMRoberta class TFXLMRobertaIntermediate(tf.keras.layers.Layer): @@ -526,7 +474,6 @@ def __init__(self, config: XLMRobertaConfig, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act - self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -534,14 +481,6 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if 
getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->XLMRoberta class TFXLMRobertaOutput(tf.keras.layers.Layer): @@ -553,7 +492,6 @@ def __init__(self, config: XLMRobertaConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -562,17 +500,6 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool return hidden_states - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.intermediate_size) - if getattr(self, "LayerNorm", None) is not None: - with tf.name_scope(self.LayerNorm.name): - self.LayerNorm.build([None, None, self.config.hidden_size]) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->XLMRoberta class TFXLMRobertaLayer(tf.keras.layers.Layer): @@ -660,20 +587,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "attention", None) is not None: - with tf.name_scope(self.attention.name): - self.attention.build(None) - if getattr(self, "intermediate", None) is not None: - with tf.name_scope(self.intermediate.name): - self.intermediate.build(None) - if getattr(self, "bert_output", None) is not None: - with tf.name_scope(self.bert_output.name): - self.bert_output.build(None) - # Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->XLMRoberta class TFXLMRobertaEncoder(tf.keras.layers.Layer): @@ -744,15 +657,6 @@ def call( cross_attentions=all_cross_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "layer", None) is not None: - for layer in self.layer: - with tf.name_scope(layer.name): - layer.build(None) - @keras_serializable # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaMainLayer with Roberta->XLMRoberta @@ -951,20 +855,6 @@ def call( cross_attentions=encoder_outputs.cross_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "encoder", None) is not None: - with tf.name_scope(self.encoder.name): - self.encoder.build(None) - if getattr(self, "pooler", None) is not None: - with tf.name_scope(self.pooler.name): - self.pooler.build(None) - if getattr(self, "embeddings", None) is not None: - with tf.name_scope(self.embeddings.name): - self.embeddings.build(None) - # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaPreTrainedModel with Roberta->XLMRoberta class TFXLMRobertaPreTrainedModel(TFPreTrainedModel): @@ -1050,14 +940,6 @@ def call( return outputs - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "roberta", None) is not None: - with tf.name_scope(self.roberta.name): - self.roberta.build(None) - # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaLMHead with Roberta->XLMRoberta class TFXLMRobertaLMHead(tf.keras.layers.Layer): @@ -1078,18 +960,10 @@ def __init__(self, config, 
input_embeddings, **kwargs): # an output-only bias for each token. self.decoder = input_embeddings - def build(self, input_shape=None): + def build(self, input_shape): self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "layer_norm", None) is not None: - with tf.name_scope(self.layer_norm.name): - self.layer_norm.build([None, None, self.config.hidden_size]) + super().build(input_shape) def get_output_embeddings(self): return self.decoder @@ -1198,17 +1072,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "roberta", None) is not None: - with tf.name_scope(self.roberta.name): - self.roberta.build(None) - if getattr(self, "lm_head", None) is not None: - with tf.name_scope(self.lm_head.name): - self.lm_head.build(None) - @add_start_docstrings( "XLM-RoBERTa Model with a `language modeling` head on top for CLM fine-tuning.", @@ -1336,17 +1199,6 @@ def call( cross_attentions=outputs.cross_attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "roberta", None) is not None: - with tf.name_scope(self.roberta.name): - self.roberta.build(None) - if getattr(self, "lm_head", None) is not None: - with tf.name_scope(self.lm_head.name): - self.lm_head.build(None) - # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaClassificationHead with Roberta->XLMRoberta class TFXLMRobertaClassificationHead(tf.keras.layers.Layer): @@ -1367,7 +1219,6 @@ def __init__(self, config, **kwargs): self.out_proj = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" ) - self.config = config def call(self, features, training=False): x = features[:, 0, :] # take token (equiv. 
to [CLS]) @@ -1377,17 +1228,6 @@ def call(self, features, training=False): x = self.out_proj(x) return x - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "dense", None) is not None: - with tf.name_scope(self.dense.name): - self.dense.build(self.config.hidden_size) - if getattr(self, "out_proj", None) is not None: - with tf.name_scope(self.out_proj.name): - self.out_proj.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1465,17 +1305,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "roberta", None) is not None: - with tf.name_scope(self.roberta.name): - self.roberta.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(None) - @add_start_docstrings( """ @@ -1498,7 +1327,6 @@ def __init__(self, config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward( @@ -1570,17 +1398,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "roberta", None) is not None: - with tf.name_scope(self.roberta.name): - self.roberta.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1607,7 +1424,6 @@ def __init__(self, config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1666,17 +1482,6 @@ def call( attentions=outputs.attentions, ) - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "roberta", None) is not None: - with tf.name_scope(self.roberta.name): - self.roberta.build(None) - if getattr(self, "classifier", None) is not None: - with tf.name_scope(self.classifier.name): - self.classifier.build(self.config.hidden_size) - @add_start_docstrings( """ @@ -1698,7 +1503,6 @@ def __init__(self, config, *inputs, **kwargs): self.qa_outputs = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) - self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1770,14 +1574,3 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) - - def build(self, input_shape=None): - if self.built: - return - self.built = True - if getattr(self, "roberta", None) is not None: - with tf.name_scope(self.roberta.name): - self.roberta.build(None) - if getattr(self, "qa_outputs", None) is not None: - with tf.name_scope(self.qa_outputs.name): - self.qa_outputs.build(self.config.hidden_size) diff --git a/src/transformers/models/xlnet/modeling_tf_xlnet.py b/src/transformers/models/xlnet/modeling_tf_xlnet.py index 48cd42eaf1c5dd..a0e6a8c2aa5072 100644 --- a/src/transformers/models/xlnet/modeling_tf_xlnet.py +++ b/src/transformers/models/xlnet/modeling_tf_xlnet.py @@ -85,9 +85,8 @@ def __init__(self, config, **kwargs): 
         self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
         self.dropout = tf.keras.layers.Dropout(config.dropout)
-        self.config = config
 
-    def build(self, input_shape=None):
+    def build(self, input_shape):
         initializer = get_initializer(self.initializer_range)
         self.q = self.add_weight(
             shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="q"
         )
@@ -116,13 +115,7 @@ def build(self, input_shape=None):
         self.seg_embed = self.add_weight(
             shape=(2, self.n_head, self.d_head), initializer=initializer, trainable=True, name="seg_embed"
         )
-
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "layer_norm", None) is not None:
-            with tf.name_scope(self.layer_norm.name):
-                self.layer_norm.build([None, None, self.config.d_model])
+        super().build(input_shape)
 
     def prune_heads(self, heads):
         raise NotImplementedError
@@ -351,7 +344,6 @@ def __init__(self, config, **kwargs):
             self.activation_function = get_tf_activation(config.ff_activation)
         else:
             self.activation_function = config.ff_activation
-        self.config = config
 
     def call(self, inp, training=False):
         output = inp
@@ -363,20 +355,6 @@ def call(self, inp, training=False):
         output = self.layer_norm(output + inp)
         return output
 
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "layer_norm", None) is not None:
-            with tf.name_scope(self.layer_norm.name):
-                self.layer_norm.build([None, None, self.config.d_model])
-        if getattr(self, "layer_1", None) is not None:
-            with tf.name_scope(self.layer_1.name):
-                self.layer_1.build(self.config.d_model)
-        if getattr(self, "layer_2", None) is not None:
-            with tf.name_scope(self.layer_2.name):
-                self.layer_2.build(self.config.d_inner)
-
 
 class TFXLNetLayer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
@@ -421,17 +399,6 @@ def call(
             outputs = (output_h, output_g) + outputs[2:]  # Add again attentions if there are there
         return outputs
 
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "rel_attn", None) is not None:
-            with tf.name_scope(self.rel_attn.name):
-                self.rel_attn.build(None)
-        if getattr(self, "ff", None) is not None:
-            with tf.name_scope(self.ff.name):
-                self.ff.build(None)
-
 
 class TFXLNetLMHead(tf.keras.layers.Layer):
     def __init__(self, config, input_embeddings, **kwargs):
@@ -504,22 +471,12 @@ def set_input_embeddings(self, value):
         self.word_embedding.weight = value
         self.word_embedding.vocab_size = shape_list(value)[0]
 
-    def build(self, input_shape=None):
+    def build(self, input_shape):
         initializer = get_initializer(self.initializer_range)
         self.mask_emb = self.add_weight(
             shape=(1, 1, self.d_model), initializer=initializer, trainable=True, name="mask_emb"
         )
-
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "word_embedding", None) is not None:
-            with tf.name_scope(self.word_embedding.name):
-                self.word_embedding.build(None)
-        if getattr(self, "layer", None) is not None:
-            for layer in self.layer:
-                with tf.name_scope(layer.name):
-                    layer.build(None)
+        super().build(input_shape)
 
     def _prune_heads(self, heads_to_prune):
         raise NotImplementedError
@@ -1220,14 +1177,6 @@ def call(
 
         return outputs
 
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "transformer", None) is not None:
-            with tf.name_scope(self.transformer.name):
-                self.transformer.build(None)
-
 
 @add_start_docstrings(
     """
@@ -1387,17 +1336,6 @@ def call(
             attentions=transformer_outputs.attentions,
         )
 
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "transformer", None) is not None:
-            with tf.name_scope(self.transformer.name):
-                self.transformer.build(None)
-        if getattr(self, "lm_loss", None) is not None:
-            with tf.name_scope(self.lm_loss.name):
-                self.lm_loss.build(None)
-
 
 @add_start_docstrings(
     """
@@ -1418,7 +1356,6 @@ def __init__(self, config, *inputs, **kwargs):
         self.logits_proj = tf.keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj"
         )
-        self.config = config
 
     @unpack_inputs
     @add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1486,20 +1423,6 @@ def call(
             attentions=transformer_outputs.attentions,
         )
 
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "transformer", None) is not None:
-            with tf.name_scope(self.transformer.name):
-                self.transformer.build(None)
-        if getattr(self, "sequence_summary", None) is not None:
-            with tf.name_scope(self.sequence_summary.name):
-                self.sequence_summary.build(None)
-        if getattr(self, "logits_proj", None) is not None:
-            with tf.name_scope(self.logits_proj.name):
-                self.logits_proj.build(self.config.d_model)
-
 
 @add_start_docstrings(
     """
@@ -1519,7 +1442,6 @@ def __init__(self, config, *inputs, **kwargs):
         self.logits_proj = tf.keras.layers.Dense(
             1, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj"
         )
-        self.config = config
 
     @unpack_inputs
     @add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@@ -1602,20 +1524,6 @@ def call(
             attentions=transformer_outputs.attentions,
         )
 
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "transformer", None) is not None:
-            with tf.name_scope(self.transformer.name):
-                self.transformer.build(None)
-        if getattr(self, "sequence_summary", None) is not None:
-            with tf.name_scope(self.sequence_summary.name):
-                self.sequence_summary.build(None)
-        if getattr(self, "logits_proj", None) is not None:
-            with tf.name_scope(self.logits_proj.name):
-                self.logits_proj.build(self.config.d_model)
-
 
 @add_start_docstrings(
     """
@@ -1633,7 +1541,6 @@ def __init__(self, config, *inputs, **kwargs):
         self.classifier = tf.keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )
-        self.config = config
 
     @unpack_inputs
     @add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1697,17 +1604,6 @@ def call(
             attentions=transformer_outputs.attentions,
         )
 
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "transformer", None) is not None:
-            with tf.name_scope(self.transformer.name):
-                self.transformer.build(None)
-        if getattr(self, "classifier", None) is not None:
-            with tf.name_scope(self.classifier.name):
-                self.classifier.build(self.config.hidden_size)
-
 
 @add_start_docstrings(
     """
@@ -1723,7 +1619,6 @@ def __init__(self, config, *inputs, **kwargs):
         self.qa_outputs = tf.keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
         )
-        self.config = config
 
     @unpack_inputs
     @add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1802,14 +1697,3 @@ def call(
             hidden_states=transformer_outputs.hidden_states,
             attentions=transformer_outputs.attentions,
         )
-
-    def build(self, input_shape=None):
-        if self.built:
-            return
-        self.built = True
-        if getattr(self, "transformer", None) is not None:
-            with tf.name_scope(self.transformer.name):
-                self.transformer.build(None)
-        if getattr(self, "qa_outputs", None) is not None:
-            with tf.name_scope(self.qa_outputs.name):
-                self.qa_outputs.build(self.config.hidden_size)