diff --git a/src/transformers/models/albert/modeling_tf_albert.py b/src/transformers/models/albert/modeling_tf_albert.py index ad35b6182a4e21..9a94747ae0295c 100644 --- a/src/transformers/models/albert/modeling_tf_albert.py +++ b/src/transformers/models/albert/modeling_tf_albert.py @@ -146,7 +146,7 @@ def __init__(self, config: AlbertConfig, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): with tf.name_scope("word_embeddings"): self.weight = self.add_weight( name="weight", @@ -168,7 +168,14 @@ def build(self, input_shape: tf.TensorShape): initializer=get_initializer(self.initializer_range), ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.embedding_size]) + # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call def call( @@ -246,6 +253,7 @@ def __init__(self, config: AlbertConfig, **kwargs): # Two different dropout probabilities; see https://github.com/google-research/albert/blob/master/modeling.py#L971-L993 self.attention_dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.output_dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] @@ -306,6 +314,25 @@ def call( outputs = (attention_output,) + self_outputs[1:] return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build(self.config.hidden_size) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build(self.config.hidden_size) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build(self.config.hidden_size) + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) class TFAlbertLayer(tf.keras.layers.Layer): @@ -329,6 +356,7 @@ def __init__(self, config: AlbertConfig, **kwargs): epsilon=config.layer_norm_eps, name="full_layer_layer_norm" ) self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config def call( self, @@ -355,6 +383,22 @@ def call( outputs = (hidden_states,) + attention_outputs[1:] return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "ffn", None) is not None: + with tf.name_scope(self.ffn.name): + self.ffn.build(self.config.hidden_size) + if getattr(self, "ffn_output", None) is not None: + with tf.name_scope(self.ffn_output.name): + self.ffn_output.build(self.config.intermediate_size) + if getattr(self, "full_layer_layer_norm", None) is not None: + with 
tf.name_scope(self.full_layer_layer_norm.name): + self.full_layer_layer_norm.build([None, None, self.config.hidden_size]) class TFAlbertLayerGroup(tf.keras.layers.Layer): @@ -398,6 +442,14 @@ def call( layer_hidden_states = layer_hidden_states + (hidden_states,) return tuple(v for v in [hidden_states, layer_hidden_states, layer_attentions] if v is not None) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "albert_layers", None) is not None: + for layer in self.albert_layers: + with tf.name_scope(layer.name): + layer.build(None) class TFAlbertTransformer(tf.keras.layers.Layer): @@ -416,6 +468,7 @@ def __init__(self, config: AlbertConfig, **kwargs): self.albert_layer_groups = [ TFAlbertLayerGroup(config, name=f"albert_layer_groups_._{i}") for i in range(config.num_hidden_groups) ] + self.config = config def call( self, @@ -456,6 +509,17 @@ def call( return TFBaseModelOutput( last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embedding_hidden_mapping_in", None) is not None: + with tf.name_scope(self.embedding_hidden_mapping_in.name): + self.embedding_hidden_mapping_in.build(self.config.embedding_size) + if getattr(self, "albert_layer_groups", None) is not None: + for layer in self.albert_layer_groups: + with tf.name_scope(layer.name): + layer.build(None) class TFAlbertPreTrainedModel(TFPreTrainedModel): @@ -488,13 +552,23 @@ def __init__(self, config: AlbertConfig, input_embeddings: tf.keras.layers.Layer # an output-only bias for each token. self.decoder = input_embeddings - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") self.decoder_bias = self.add_weight( shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias" ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.embedding_size]) + def get_output_embeddings(self) -> tf.keras.layers.Layer: return self.decoder @@ -649,6 +723,19 @@ def call( hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "pooler", None) is not None: + with tf.name_scope(self.pooler.name): + self.pooler.build(None) # TODO Matt might be wrong @dataclass @@ -824,6 +911,13 @@ def call( ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "albert", None) is not None: + with tf.name_scope(self.albert.name): + self.albert.build(None) @add_start_docstrings( @@ -920,6 +1014,19 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "albert", None) is not 
None: + with tf.name_scope(self.albert.name): + self.albert.build(None) + if getattr(self, "predictions", None) is not None: + with tf.name_scope(self.predictions.name): + self.predictions.build(None) + if getattr(self, "sop_classifier", None) is not None: + with tf.name_scope(self.sop_classifier.name): + self.sop_classifier.build(None) class TFAlbertSOPHead(tf.keras.layers.Layer): @@ -932,12 +1039,20 @@ def __init__(self, config: AlbertConfig, **kwargs): kernel_initializer=get_initializer(config.initializer_range), name="classifier", ) + self.config = config def call(self, pooled_output: tf.Tensor, training: bool) -> tf.Tensor: dropout_pooled_output = self.dropout(inputs=pooled_output, training=training) logits = self.classifier(inputs=dropout_pooled_output) return logits + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) @add_start_docstrings("""Albert Model with a `language modeling` head on top.""", ALBERT_START_DOCSTRING) @@ -1034,6 +1149,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "albert", None) is not None: + with tf.name_scope(self.albert.name): + self.albert.build(None) + if getattr(self, "predictions", None) is not None: + with tf.name_scope(self.predictions.name): + self.predictions.build(None) @add_start_docstrings( @@ -1058,6 +1183,7 @@ def __init__(self, config: AlbertConfig, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1116,6 +1242,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "albert", None) is not None: + with tf.name_scope(self.albert.name): + self.albert.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) @add_start_docstrings( @@ -1145,6 +1281,7 @@ def __init__(self, config: AlbertConfig, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1199,6 +1336,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "albert", None) is not None: + with tf.name_scope(self.albert.name): + self.albert.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) @add_start_docstrings( @@ -1221,6 +1368,7 @@ def __init__(self, config: AlbertConfig, *inputs, **kwargs): self.qa_outputs = tf.keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) + self.config = config @unpack_inputs 
@add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1294,6 +1442,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "albert", None) is not None: + with tf.name_scope(self.albert.name): + self.albert.build(None) + if getattr(self, "qa_outputs", None) is not None: + with tf.name_scope(self.qa_outputs.name): + self.qa_outputs.build(self.config.hidden_size) @add_start_docstrings( @@ -1316,6 +1474,7 @@ def __init__(self, config: AlbertConfig, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @@ -1394,3 +1553,13 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "albert", None) is not None: + with tf.name_scope(self.albert.name): + self.albert.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) diff --git a/src/transformers/models/bart/modeling_tf_bart.py b/src/transformers/models/bart/modeling_tf_bart.py index b13ecea213c49e..1ae43ca08366c9 100644 --- a/src/transformers/models/bart/modeling_tf_bart.py +++ b/src/transformers/models/bart/modeling_tf_bart.py @@ -295,6 +295,22 @@ def call( attn_weights: tf.Tensor = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) return attn_output, attn_weights, past_key_value + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "k_proj", None) is not None: + with tf.name_scope(self.k_proj.name): + self.k_proj.build(self.embed_dim) + if getattr(self, "q_proj", None) is not None: + with tf.name_scope(self.q_proj.name): + self.q_proj.build(self.embed_dim) + if getattr(self, "v_proj", None) is not None: + with tf.name_scope(self.v_proj.name): + self.v_proj.build(self.embed_dim) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build(self.embed_dim) class TFBartEncoderLayer(tf.keras.layers.Layer): @@ -311,6 +327,7 @@ def __init__(self, config: BartConfig, **kwargs): self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.config = config def call( self, @@ -351,6 +368,25 @@ def call( hidden_states = self.final_layer_norm(hidden_states) return hidden_states, self_attn_weights + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "self_attn_layer_norm", None) is not None: + with tf.name_scope(self.self_attn_layer_norm.name): + self.self_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build(self.embed_dim) + if getattr(self, "fc2", None) is not None: + with tf.name_scope(self.fc2.name): + self.fc2.build(self.config.encoder_ffn_dim) + if 
getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.embed_dim]) class TFBartDecoderLayer(tf.keras.layers.Layer): @@ -380,6 +416,7 @@ def __init__(self, config: BartConfig, **kwargs): self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.config = config def call( self, @@ -460,6 +497,31 @@ def call( cross_attn_weights, present_key_value, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "self_attn_layer_norm", None) is not None: + with tf.name_scope(self.self_attn_layer_norm.name): + self.self_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "encoder_attn", None) is not None: + with tf.name_scope(self.encoder_attn.name): + self.encoder_attn.build(None) + if getattr(self, "encoder_attn_layer_norm", None) is not None: + with tf.name_scope(self.encoder_attn_layer_norm.name): + self.encoder_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build(self.embed_dim) + if getattr(self, "fc2", None) is not None: + with tf.name_scope(self.fc2.name): + self.fc2.build(self.config.decoder_ffn_dim) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.embed_dim]) class TFBartClassificationHead(tf.keras.layers.Layer): @@ -471,6 +533,7 @@ def __init__(self, inner_dim: int, num_classes: int, pooler_dropout: float, name self.dropout = tf.keras.layers.Dropout(pooler_dropout) self.out_proj = tf.keras.layers.Dense(num_classes, name="out_proj") self.input_dim = inner_dim + self.inner_dim = inner_dim def call(self, inputs): hidden_states = self.dropout(inputs) @@ -479,6 +542,16 @@ def call(self, inputs): hidden_states = self.dropout(hidden_states) hidden_states = self.out_proj(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.input_dim) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build(self.inner_dim) class TFBartPretrainedModel(TFPreTrainedModel): @@ -804,6 +877,20 @@ def call( return TFBaseModelOutput( last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embed_positions", None) is not None: + with tf.name_scope(self.embed_positions.name): + self.embed_positions.build(None) + if getattr(self, "layernorm_embedding", None) is not None: + with tf.name_scope(self.layernorm_embedding.name): + self.layernorm_embedding.build([None, None, self.embed_dim]) + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) @keras_serializable @@ -1027,6 +1114,20 @@ def call( attentions=all_self_attns, cross_attentions=all_cross_attns, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if 
getattr(self, "embed_positions", None) is not None: + with tf.name_scope(self.embed_positions.name): + self.embed_positions.build(None) + if getattr(self, "layernorm_embedding", None) is not None: + with tf.name_scope(self.layernorm_embedding.name): + self.layernorm_embedding.build([None, None, self.config.d_model]) + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) @keras_serializable @@ -1144,6 +1245,19 @@ def call( encoder_hidden_states=encoder_outputs.hidden_states, encoder_attentions=encoder_outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "decoder", None) is not None: + with tf.name_scope(self.decoder.name): + self.decoder.build(None) + if getattr(self, "shared", None) is not None: + with tf.name_scope(self.shared.name): + self.shared.build(None) # TODO Matt might be wrong @add_start_docstrings( @@ -1232,6 +1346,13 @@ def serving_output(self, output): encoder_hidden_states=enc_hs, encoder_attentions=enc_attns, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "model", None) is not None: + with tf.name_scope(self.model.name): + self.model.build(None) class BiasLayer(tf.keras.layers.Layer): @@ -1435,6 +1556,16 @@ def prepare_inputs_for_generation( def prepare_decoder_input_ids_from_labels(self, labels: tf.Tensor): return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "model", None) is not None: + with tf.name_scope(self.model.name): + self.model.build(None) + if getattr(self, "bias_layer", None) is not None: + with tf.name_scope(self.bias_layer.name): + self.bias_layer.build(None) # TODO Matt might be wrong @add_start_docstrings( @@ -1563,3 +1694,13 @@ def serving_output(self, output): encoder_hidden_states=enc_hs, encoder_attentions=enc_attns, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "model", None) is not None: + with tf.name_scope(self.model.name): + self.model.build(None) + if getattr(self, "classification_head", None) is not None: + with tf.name_scope(self.classification_head.name): + self.classification_head.build(None) diff --git a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py index fd0a07b415f4f2..66b3284e749122 100644 --- a/src/transformers/models/bert/modeling_tf_bert.py +++ b/src/transformers/models/bert/modeling_tf_bert.py @@ -156,7 +156,7 @@ def __init__(self, config: BertConfig, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): with tf.name_scope("word_embeddings"): self.weight = self.add_weight( name="weight", @@ -178,7 +178,14 @@ def build(self, input_shape: tf.TensorShape): initializer=get_initializer(self.initializer_range), ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + def call( self, @@ 
-248,6 +255,7 @@ def __init__(self, config: BertConfig, **kwargs): self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.is_decoder = config.is_decoder + self.config = config def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] @@ -336,6 +344,19 @@ def call( if self.is_decoder: outputs = outputs + (past_key_value,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build(self.config.hidden_size) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build(self.config.hidden_size) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build(self.config.hidden_size) class TFBertSelfOutput(tf.keras.layers.Layer): @@ -347,6 +368,7 @@ def __init__(self, config: BertConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -354,6 +376,16 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) class TFBertAttention(tf.keras.layers.Layer): @@ -394,6 +426,16 @@ def call( outputs = (attention_output,) + self_outputs[1:] return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attention", None) is not None: + with tf.name_scope(self.self_attention.name): + self.self_attention.build(None) + if getattr(self, "dense_output", None) is not None: + with tf.name_scope(self.dense_output.name): + self.dense_output.build(None) class TFBertIntermediate(tf.keras.layers.Layer): @@ -408,12 +450,20 @@ def __init__(self, config: BertConfig, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) class TFBertOutput(tf.keras.layers.Layer): @@ -425,6 +475,7 @@ def __init__(self, config: BertConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = 
False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -432,6 +483,16 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.intermediate_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) class TFBertLayer(tf.keras.layers.Layer): @@ -518,6 +579,19 @@ def call( outputs = outputs + (present_key_value,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "bert_output", None) is not None: + with tf.name_scope(self.bert_output.name): + self.bert_output.build(None) class TFBertEncoder(tf.keras.layers.Layer): @@ -587,6 +661,14 @@ def call( attentions=all_attentions, cross_attentions=all_cross_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) class TFBertPooler(tf.keras.layers.Layer): @@ -599,6 +681,7 @@ def __init__(self, config: BertConfig, **kwargs): activation="tanh", name="dense", ) + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: # We "pool" the model by simply taking the hidden state corresponding @@ -607,6 +690,13 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: pooled_output = self.dense(inputs=first_token_tensor) return pooled_output + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) class TFBertPredictionHeadTransform(tf.keras.layers.Layer): @@ -625,6 +715,7 @@ def __init__(self, config: BertConfig, **kwargs): self.transform_act_fn = config.hidden_act self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -632,6 +723,16 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.LayerNorm(inputs=hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) class TFBertLMPredictionHead(tf.keras.layers.Layer): @@ -647,10 +748,17 @@ def __init__(self, config: BertConfig, input_embeddings: tf.keras.layers.Layer, # an output-only bias for each token. 
self.input_embeddings = input_embeddings - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "transform", None) is not None: + with tf.name_scope(self.transform.name): + self.transform.build(None) + def get_output_embeddings(self) -> tf.keras.layers.Layer: return self.input_embeddings @@ -890,6 +998,19 @@ def call( attentions=encoder_outputs.attentions, cross_attentions=encoder_outputs.cross_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "pooler", None) is not None: + with tf.name_scope(self.pooler.name): + self.pooler.build(None) class TFBertPreTrainedModel(TFPreTrainedModel): @@ -1102,6 +1223,13 @@ def call( training=training, ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "bert", None) is not None: + with tf.name_scope(self.bert.name): + self.bert.build(None) @add_start_docstrings( @@ -1214,6 +1342,19 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "bert", None) is not None: + with tf.name_scope(self.bert.name): + self.bert.build(None) + if getattr(self, "nsp", None) is not None: + with tf.name_scope(self.nsp.name): + self.nsp.build(None) # TODO Matt might be wrong + if getattr(self, "mlm", None) is not None: + with tf.name_scope(self.mlm.name): + self.mlm.build(None) # TODO Matt might be wrong @add_start_docstrings("""Bert Model with a `language modeling` head on top.""", BERT_START_DOCSTRING) @@ -1300,6 +1441,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "bert", None) is not None: + with tf.name_scope(self.bert.name): + self.bert.build(None) + if getattr(self, "mlm", None) is not None: + with tf.name_scope(self.mlm.name): + self.mlm.build(None) # TODO Matt might be wrong class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss): @@ -1425,6 +1576,16 @@ def call( attentions=outputs.attentions, cross_attentions=outputs.cross_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "bert", None) is not None: + with tf.name_scope(self.bert.name): + self.bert.build(None) + if getattr(self, "mlm", None) is not None: + with tf.name_scope(self.mlm.name): + self.mlm.build(None) # TODO Matt might be wrong @add_start_docstrings( @@ -1507,6 +1668,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "bert", None) is not None: + with tf.name_scope(self.bert.name): + self.bert.build(None) + if getattr(self, "nsp", None) is not None: + with tf.name_scope(self.nsp.name): + self.nsp.build(None) # TODO Matt might be wrong @add_start_docstrings( @@ -1536,6 +1707,7 @@ def __init__(self, config: BertConfig, 
*inputs, **kwargs): kernel_initializer=get_initializer(config.initializer_range), name="classifier", ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1593,6 +1765,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "bert", None) is not None: + with tf.name_scope(self.bert.name): + self.bert.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) @add_start_docstrings( @@ -1615,6 +1797,7 @@ def __init__(self, config: BertConfig, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @@ -1692,6 +1875,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "bert", None) is not None: + with tf.name_scope(self.bert.name): + self.bert.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) @add_start_docstrings( @@ -1727,6 +1920,7 @@ def __init__(self, config: BertConfig, *inputs, **kwargs): kernel_initializer=get_initializer(config.initializer_range), name="classifier", ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1782,6 +1976,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "bert", None) is not None: + with tf.name_scope(self.bert.name): + self.bert.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) @add_start_docstrings( @@ -1812,6 +2016,7 @@ def __init__(self, config: BertConfig, *inputs, **kwargs): kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs", ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1884,3 +2089,13 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "bert", None) is not None: + with tf.name_scope(self.bert.name): + self.bert.build(None) + if getattr(self, "qa_outputs", None) is not None: + with tf.name_scope(self.qa_outputs.name): + self.qa_outputs.build(self.config.hidden_size) diff --git a/src/transformers/models/blenderbot/modeling_tf_blenderbot.py b/src/transformers/models/blenderbot/modeling_tf_blenderbot.py index fdd85a7f87832c..ff4ba08bdca600 100644 --- a/src/transformers/models/blenderbot/modeling_tf_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_tf_blenderbot.py @@ -290,6 +290,22 @@ def call( attn_weights: tf.Tensor = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) return attn_output, attn_weights, past_key_value + def build(self, 
input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "k_proj", None) is not None: + with tf.name_scope(self.k_proj.name): + self.k_proj.build(self.embed_dim) + if getattr(self, "q_proj", None) is not None: + with tf.name_scope(self.q_proj.name): + self.q_proj.build(self.embed_dim) + if getattr(self, "v_proj", None) is not None: + with tf.name_scope(self.v_proj.name): + self.v_proj.build(self.embed_dim) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build(self.embed_dim) # Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartEncoderLayer with MBart->Blenderbot @@ -307,6 +323,7 @@ def __init__(self, config: BlenderbotConfig, **kwargs): self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.config = config def call( self, @@ -347,6 +364,25 @@ def call( hidden_states = residual + hidden_states return hidden_states, self_attn_weights + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "self_attn_layer_norm", None) is not None: + with tf.name_scope(self.self_attn_layer_norm.name): + self.self_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build(self.embed_dim) + if getattr(self, "fc2", None) is not None: + with tf.name_scope(self.fc2.name): + self.fc2.build(self.config.encoder_ffn_dim) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.embed_dim]) # Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartDecoderLayer with MBart->Blenderbot @@ -377,6 +413,7 @@ def __init__(self, config: BlenderbotConfig, **kwargs): self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.config = config def call( self, @@ -457,6 +494,31 @@ def call( cross_attn_weights, present_key_value, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "self_attn_layer_norm", None) is not None: + with tf.name_scope(self.self_attn_layer_norm.name): + self.self_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "encoder_attn", None) is not None: + with tf.name_scope(self.encoder_attn.name): + self.encoder_attn.build(None) + if getattr(self, "encoder_attn_layer_norm", None) is not None: + with tf.name_scope(self.encoder_attn_layer_norm.name): + self.encoder_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build(self.embed_dim) + if getattr(self, "fc2", None) is not None: + with tf.name_scope(self.fc2.name): + self.fc2.build(self.config.decoder_ffn_dim) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.embed_dim]) 
class TFBlenderbotPreTrainedModel(TFPreTrainedModel): @@ -775,6 +837,20 @@ def call( return TFBaseModelOutput( last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embed_positions", None) is not None: + with tf.name_scope(self.embed_positions.name): + self.embed_positions.build(None) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.d_model]) + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) @keras_serializable @@ -1005,6 +1081,20 @@ def call( attentions=all_self_attns, cross_attentions=all_cross_attns, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embed_positions", None) is not None: + with tf.name_scope(self.embed_positions.name): + self.embed_positions.build(None) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.d_model]) + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) @keras_serializable @@ -1113,6 +1203,19 @@ def call( encoder_hidden_states=encoder_outputs.hidden_states, encoder_attentions=encoder_outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "decoder", None) is not None: + with tf.name_scope(self.decoder.name): + self.decoder.build(None) + if getattr(self, "shared", None) is not None: + with tf.name_scope(self.shared.name): + self.shared.build(None) # TODO Matt might be wrong @add_start_docstrings( @@ -1216,6 +1319,13 @@ def serving_output(self, output): encoder_hidden_states=enc_hs, encoder_attentions=enc_attns, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "model", None) is not None: + with tf.name_scope(self.model.name): + self.model.build(None) # Copied from transformers.models.bart.modeling_tf_bart.BiasLayer @@ -1436,3 +1546,13 @@ def prepare_inputs_for_generation( "cross_attn_head_mask": cross_attn_head_mask, "use_cache": use_cache, # change this to avoid caching (presumably for debugging) } + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "model", None) is not None: + with tf.name_scope(self.model.name): + self.model.build(None) + if getattr(self, "bias_layer", None) is not None: + with tf.name_scope(self.bias_layer.name): + self.bias_layer.build(None) # TODO Matt might be wrong diff --git a/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py index 1627079b2e96b1..0750d385ef029a 100644 --- a/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py @@ -290,6 +290,22 @@ def call( attn_weights: tf.Tensor = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) return attn_output, attn_weights, past_key_value + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "k_proj", None) is not None: + 
with tf.name_scope(self.k_proj.name): + self.k_proj.build(self.embed_dim) + if getattr(self, "q_proj", None) is not None: + with tf.name_scope(self.q_proj.name): + self.q_proj.build(self.embed_dim) + if getattr(self, "v_proj", None) is not None: + with tf.name_scope(self.v_proj.name): + self.v_proj.build(self.embed_dim) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build(self.embed_dim) # Copied from transformers.models.bart.modeling_tf_bart.TFBartEncoderLayer with Bart->BlenderbotSmall @@ -307,6 +323,7 @@ def __init__(self, config: BlenderbotSmallConfig, **kwargs): self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.config = config def call( self, @@ -347,6 +364,25 @@ def call( hidden_states = self.final_layer_norm(hidden_states) return hidden_states, self_attn_weights + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "self_attn_layer_norm", None) is not None: + with tf.name_scope(self.self_attn_layer_norm.name): + self.self_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build(self.embed_dim) + if getattr(self, "fc2", None) is not None: + with tf.name_scope(self.fc2.name): + self.fc2.build(self.config.encoder_ffn_dim) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.embed_dim]) # Copied from transformers.models.bart.modeling_tf_bart.TFBartDecoderLayer with Bart->BlenderbotSmall @@ -377,6 +413,7 @@ def __init__(self, config: BlenderbotSmallConfig, **kwargs): self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.config = config def call( self, @@ -457,6 +494,31 @@ def call( cross_attn_weights, present_key_value, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "self_attn_layer_norm", None) is not None: + with tf.name_scope(self.self_attn_layer_norm.name): + self.self_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "encoder_attn", None) is not None: + with tf.name_scope(self.encoder_attn.name): + self.encoder_attn.build(None) + if getattr(self, "encoder_attn_layer_norm", None) is not None: + with tf.name_scope(self.encoder_attn_layer_norm.name): + self.encoder_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build(self.embed_dim) + if getattr(self, "fc2", None) is not None: + with tf.name_scope(self.fc2.name): + self.fc2.build(self.config.decoder_ffn_dim) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.embed_dim]) class TFBlenderbotSmallPreTrainedModel(TFPreTrainedModel): @@ -781,6 +843,20 @@ def call( 
return TFBaseModelOutput( last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embed_positions", None) is not None: + with tf.name_scope(self.embed_positions.name): + self.embed_positions.build(None) + if getattr(self, "layernorm_embedding", None) is not None: + with tf.name_scope(self.layernorm_embedding.name): + self.layernorm_embedding.build([None, None, self.embed_dim]) + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) @keras_serializable @@ -1014,6 +1090,20 @@ def call( attentions=all_self_attns, cross_attentions=all_cross_attns, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embed_positions", None) is not None: + with tf.name_scope(self.embed_positions.name): + self.embed_positions.build(None) + if getattr(self, "layernorm_embedding", None) is not None: + with tf.name_scope(self.layernorm_embedding.name): + self.layernorm_embedding.build([None, None, self.config.d_model]) + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) @keras_serializable @@ -1122,6 +1212,19 @@ def call( encoder_hidden_states=encoder_outputs.hidden_states, encoder_attentions=encoder_outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "decoder", None) is not None: + with tf.name_scope(self.decoder.name): + self.decoder.build(None) + if getattr(self, "shared", None) is not None: + with tf.name_scope(self.shared.name): + self.shared.build(None) # TODO Matt might be wrong @add_start_docstrings( @@ -1209,6 +1312,13 @@ def serving_output(self, output): encoder_hidden_states=enc_hs, encoder_attentions=enc_attns, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "model", None) is not None: + with tf.name_scope(self.model.name): + self.model.build(None) # Copied from transformers.models.bart.modeling_tf_bart.BiasLayer @@ -1414,3 +1524,13 @@ def prepare_inputs_for_generation( "cross_attn_head_mask": cross_attn_head_mask, "use_cache": use_cache, # change this to avoid caching (presumably for debugging) } + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "model", None) is not None: + with tf.name_scope(self.model.name): + self.model.build(None) + if getattr(self, "bias_layer", None) is not None: + with tf.name_scope(self.bias_layer.name): + self.bias_layer.build(None) # TODO Matt might be wrong diff --git a/src/transformers/models/blip/modeling_tf_blip.py b/src/transformers/models/blip/modeling_tf_blip.py index a291c3f07ece63..532c466ef1f5fb 100644 --- a/src/transformers/models/blip/modeling_tf_blip.py +++ b/src/transformers/models/blip/modeling_tf_blip.py @@ -254,7 +254,7 @@ def __init__(self, config: BlipVisionConfig, **kwargs): self.num_patches = (self.image_size // self.patch_size) ** 2 self.num_positions = self.num_patches + 1 - def build(self, input_shape): + def build(self, input_shape=None): self.class_embedding = self.add_weight( shape=(1, 1, self.embed_dim), initializer=get_initializer(self.config.initializer_range), @@ -268,7 +268,14 @@ def build(self, input_shape): 
trainable=True, name="position_embedding", ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "patch_embedding", None) is not None: + with tf.name_scope(self.patch_embedding.name): + self.patch_embedding.build(3) + def call(self, pixel_values: tf.Tensor) -> tf.Tensor: # Input is channels-first, we transpose. PyTorch transposes after the conv because PyTorch @@ -411,6 +418,19 @@ def call( outputs = (output, attention_probs) if output_attentions else (output, None) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dropout", None) is not None: + with tf.name_scope(self.dropout.name): + self.dropout.build(None) + if getattr(self, "qkv", None) is not None: + with tf.name_scope(self.qkv.name): + self.qkv.build(self.embed_dim) + if getattr(self, "projection", None) is not None: + with tf.name_scope(self.projection.name): + self.projection.build(self.embed_dim) class TFBlipMLP(tf.keras.layers.Layer): @@ -428,12 +448,23 @@ def __init__(self, config: BlipConfig, **kwargs): self.fc2 = tf.keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(in_proj_std), name="fc2" ) + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.fc1(inputs=hidden_states) hidden_states = self.activation_fn(hidden_states) hidden_states = self.fc2(inputs=hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build(self.config.hidden_size) + if getattr(self, "fc2", None) is not None: + with tf.name_scope(self.fc2.name): + self.fc2.build(self.config.intermediate_size) class TFBlipEncoderLayer(tf.keras.layers.Layer): @@ -484,6 +515,22 @@ def call( outputs += (attn_weights,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "layer_norm1", None) is not None: + with tf.name_scope(self.layer_norm1.name): + self.layer_norm1.build([None, None, self.embed_dim]) + if getattr(self, "mlp", None) is not None: + with tf.name_scope(self.mlp.name): + self.mlp.build(None) + if getattr(self, "layer_norm2", None) is not None: + with tf.name_scope(self.layer_norm2.name): + self.layer_norm2.build([None, None, self.embed_dim]) class TFBlipPreTrainedModel(TFPreTrainedModel): @@ -644,6 +691,14 @@ def call( return TFBaseModelOutput( last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) class TFBlipVisionModel(TFBlipPreTrainedModel): @@ -724,6 +779,19 @@ def call( def get_input_embeddings(self): return self.embeddings + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "post_layernorm", None) is not None: + with tf.name_scope(self.post_layernorm.name): + self.post_layernorm.build([None, None, 
self.embed_dim]) class TFBlipMainLayer(tf.keras.layers.Layer): @@ -776,7 +844,23 @@ def build(self, input_shape=None): initializer=tf.keras.initializers.Constant(self.config.logit_scale_init_value), trainable=True, ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "text_model", None) is not None: + with tf.name_scope(self.text_model.name): + self.text_model.build(None) + if getattr(self, "vision_model", None) is not None: + with tf.name_scope(self.vision_model.name): + self.vision_model.build(None) + if getattr(self, "visual_projection", None) is not None: + with tf.name_scope(self.visual_projection.name): + self.visual_projection.build(self.vision_embed_dim) + if getattr(self, "text_projection", None) is not None: + with tf.name_scope(self.text_projection.name): + self.text_projection.build(self.text_embed_dim) + @unpack_inputs def call( @@ -995,6 +1079,13 @@ def get_image_features( image_features = self.blip.visual_projection(pooled_output) return image_features + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "blip", None) is not None: + with tf.name_scope(self.blip.name): + self.blip.build(None) @add_start_docstrings( @@ -1168,6 +1259,16 @@ def generate( ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "vision_model", None) is not None: + with tf.name_scope(self.vision_model.name): + self.vision_model.build(None) + if getattr(self, "text_decoder", None) is not None: + with tf.name_scope(self.text_decoder.name): + self.text_decoder.build(None) @add_start_docstrings( @@ -1409,6 +1510,19 @@ def generate( ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "vision_model", None) is not None: + with tf.name_scope(self.vision_model.name): + self.vision_model.build(None) + if getattr(self, "text_encoder", None) is not None: + with tf.name_scope(self.text_encoder.name): + self.text_encoder.build(None) + if getattr(self, "text_decoder", None) is not None: + with tf.name_scope(self.text_decoder.name): + self.text_decoder.build(None) @add_start_docstrings( @@ -1458,6 +1572,7 @@ def __init__(self, config: BlipConfig, *args, **kwargs): if not hasattr(config, "decoder_start_token_id") else config.decoder_start_token_id ) + self.config = config def get_input_embeddings(self) -> tf.keras.layers.Layer: return self.vision_model.embeddings.patch_embedding @@ -1559,3 +1674,22 @@ def call( attentions=vision_outputs.attentions, question_embeds=question_embeds, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "vision_model", None) is not None: + with tf.name_scope(self.vision_model.name): + self.vision_model.build(None) + if getattr(self, "text_encoder", None) is not None: + with tf.name_scope(self.text_encoder.name): + self.text_encoder.build(None) + if getattr(self, "vision_proj", None) is not None: + with tf.name_scope(self.vision_proj.name): + self.vision_proj.build(self.config.vision_config.hidden_size) + if getattr(self, "text_proj", None) is not None: + with tf.name_scope(self.text_proj.name): + self.text_proj.build(self.config.text_config.hidden_size) + if getattr(self, "itm_head", None) is not None: + with tf.name_scope(self.itm_head.name): + self.itm_head.build(self.config.text_config.hidden_size) diff --git a/src/transformers/models/blip/modeling_tf_blip_text.py
b/src/transformers/models/blip/modeling_tf_blip_text.py index b7307c062f7911..e41772a1b51982 100644 --- a/src/transformers/models/blip/modeling_tf_blip_text.py +++ b/src/transformers/models/blip/modeling_tf_blip_text.py @@ -126,6 +126,22 @@ def call(self, input_ids=None, position_ids=None, inputs_embeds=None, past_key_v embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings, training=training) return embeddings + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "word_embeddings", None) is not None: + with tf.name_scope(self.word_embeddings.name): + self.word_embeddings.build(None) + if getattr(self, "position_embeddings", None) is not None: + with tf.name_scope(self.position_embeddings.name): + self.position_embeddings.build(None) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + if getattr(self, "dropout", None) is not None: + with tf.name_scope(self.dropout.name): + self.dropout.build(None) # Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L97 @@ -249,6 +265,19 @@ def call( outputs = outputs + (past_key_value,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build(self.config.hidden_size) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build(None) # TODO Matt might be wrong + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build(None) # TODO Matt might be wrong class TFBlipTextSelfOutput(tf.keras.layers.Layer): @@ -260,6 +289,7 @@ def __init__(self, config: BlipTextConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: Optional[bool] = None) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -267,6 +297,16 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: Opti hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) # Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#242 @@ -301,6 +341,16 @@ def call( attention_output = self.self_output(self_outputs[0], hidden_states, training=training) outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self", None) is not None: + with tf.name_scope(self.self.name): + self.self.build(None) + if getattr(self, "self_output", None) is not None: + with tf.name_scope(self.self_output.name): + self.self_output.build(None) # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->BlipText @@ -316,12 +366,20 @@ def __init__(self, config: BlipTextConfig, 
**kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) class TFBlipTextOutput(tf.keras.layers.Layer): @@ -333,6 +391,7 @@ def __init__(self, config: BlipTextConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -340,6 +399,16 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.intermediate_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) class TFBlipTextLayer(tf.keras.layers.Layer): @@ -399,6 +468,19 @@ def call( outputs = outputs + (present_key_value,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "self_output", None) is not None: + with tf.name_scope(self.self_output.name): + self.self_output.build(None) # Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L386 @@ -480,6 +562,14 @@ def call( attentions=all_self_attentions, cross_attentions=all_cross_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->BlipText @@ -493,6 +583,7 @@ def __init__(self, config: BlipTextConfig, **kwargs): activation="tanh", name="dense", ) + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: # We "pool" the model by simply taking the hidden state corresponding @@ -501,6 +592,13 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: pooled_output = self.dense(inputs=first_token_tensor) return pooled_output + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) # Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->BlipText @@ -520,6 +618,7 @@ def __init__(self, config: BlipTextConfig, **kwargs): self.transform_act_fn = config.hidden_act self.LayerNorm = 
tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -527,6 +626,16 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.LayerNorm(inputs=hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) class TFBlipTextLMPredictionHead(tf.keras.layers.Layer): @@ -546,7 +655,17 @@ def __init__(self, config, **kwargs): def build(self, input_shape=None): self.bias = self.add_weight(name="bias", shape=(self.config.vocab_size,), initializer="zeros", trainable=True) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "transform", None) is not None: + with tf.name_scope(self.transform.name): + self.transform.build(None) + if getattr(self, "decoder", None) is not None: + with tf.name_scope(self.decoder.name): + self.decoder.build(self.config.hidden_size) + def call(self, hidden_states): hidden_states = self.transform(hidden_states) @@ -562,6 +681,13 @@ def __init__(self, config, **kwargs): def call(self, sequence_output: tf.Tensor) -> tf.Tensor: prediction_scores = self.predictions(sequence_output) return prediction_scores + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "predictions", None) is not None: + with tf.name_scope(self.predictions.name): + self.predictions.build(None) # Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L548 @@ -801,6 +927,19 @@ def call( attentions=encoder_outputs.attentions, cross_attentions=encoder_outputs.cross_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "pooler", None) is not None: + with tf.name_scope(self.pooler.name): + self.pooler.build(None) # Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L811 @@ -942,3 +1081,13 @@ def _reorder_cache(self, past_key_values, beam_idx): for layer_past in past_key_values: reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) return reordered_past + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "bert", None) is not None: + with tf.name_scope(self.bert.name): + self.bert.build(None) + if getattr(self, "cls", None) is not None: + with tf.name_scope(self.cls.name): + self.cls.build(None) diff --git a/src/transformers/models/camembert/modeling_tf_camembert.py b/src/transformers/models/camembert/modeling_tf_camembert.py index 8def74a5b3045e..6ffcbc6a48aef3 100644 --- a/src/transformers/models/camembert/modeling_tf_camembert.py +++ b/src/transformers/models/camembert/modeling_tf_camembert.py @@ -184,7 +184,7 @@ def __init__(self, config, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = 
tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): with tf.name_scope("word_embeddings"): self.weight = self.add_weight( name="weight", @@ -206,7 +206,14 @@ def build(self, input_shape: tf.TensorShape): initializer=get_initializer(self.initializer_range), ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + def create_position_ids_from_input_ids(self, input_ids, past_key_values_length=0): """ @@ -279,6 +286,7 @@ def __init__(self, config: CamembertConfig, **kwargs): activation="tanh", name="dense", ) + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: # We "pool" the model by simply taking the hidden state corresponding @@ -287,6 +295,13 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: pooled_output = self.dense(inputs=first_token_tensor) return pooled_output + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Camembert @@ -317,6 +332,7 @@ def __init__(self, config: CamembertConfig, **kwargs): self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.is_decoder = config.is_decoder + self.config = config def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] @@ -405,6 +421,19 @@ def call( if self.is_decoder: outputs = outputs + (past_key_value,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build(self.config.hidden_size) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build(self.config.hidden_size) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build(self.config.hidden_size) # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Camembert @@ -417,6 +446,7 @@ def __init__(self, config: CamembertConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -424,6 +454,16 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Camembert 
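Every hunk in this patch applies the same refactor: the old `super().build(input_shape)` call is replaced by an idempotent `build()` that constructs each sublayer explicitly inside its own `tf.name_scope`, passing a known input dimension so the weights can be created without a dummy forward pass. A minimal sketch of that pattern follows, using a hypothetical `TFExampleLayer` with standard Keras sublayers; it is an illustration under those assumptions, not code taken from this diff.

import tensorflow as tf


class TFExampleLayer(tf.keras.layers.Layer):
    """Hypothetical layer used only to illustrate the build-in-name-scope pattern."""

    def __init__(self, hidden_size: int, **kwargs):
        super().__init__(**kwargs)
        self.dense = tf.keras.layers.Dense(hidden_size, name="dense")
        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNorm")
        self.hidden_size = hidden_size

    def build(self, input_shape=None):
        # Make repeated calls a no-op, so a parent layer can safely build its children.
        if self.built:
            return
        self.built = True
        # Build each child under its own name scope so weight names keep the nested
        # "<layer>/<sublayer>/kernel" layout that existing checkpoints expect.
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                # Dense only needs the last input dimension to create its kernel.
                self.dense.build([None, None, self.hidden_size])
        if getattr(self, "LayerNorm", None) is not None:
            with tf.name_scope(self.LayerNorm.name):
                self.LayerNorm.build([None, None, self.hidden_size])

When a parent layer cannot know a child's input width, it simply calls `child.build(None)` and lets the child resolve its own shapes, which is what the hunks above and below do for composite sublayers.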
@@ -465,6 +505,16 @@ def call( outputs = (attention_output,) + self_outputs[1:] return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attention", None) is not None: + with tf.name_scope(self.self_attention.name): + self.self_attention.build(None) + if getattr(self, "dense_output", None) is not None: + with tf.name_scope(self.dense_output.name): + self.dense_output.build(None) # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Camembert @@ -480,12 +530,20 @@ def __init__(self, config: CamembertConfig, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Camembert @@ -498,6 +556,7 @@ def __init__(self, config: CamembertConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -505,6 +564,16 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.intermediate_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Camembert @@ -592,6 +661,19 @@ def call( outputs = outputs + (present_key_value,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "bert_output", None) is not None: + with tf.name_scope(self.bert_output.name): + self.bert_output.build(None) # Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Camembert @@ -662,6 +744,14 @@ def call( attentions=all_attentions, cross_attentions=all_cross_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) @keras_serializable @@ -860,6 +950,19 @@ def call( attentions=encoder_outputs.attentions, cross_attentions=encoder_outputs.cross_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "encoder", None) is not None: 
+ with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "pooler", None) is not None: + with tf.name_scope(self.pooler.name): + self.pooler.build(None) + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) class TFCamembertPreTrainedModel(TFPreTrainedModel): @@ -944,6 +1047,13 @@ def call( ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta", None) is not None: + with tf.name_scope(self.roberta.name): + self.roberta.build(None) # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaLMHead with Roberta->Camembert @@ -965,10 +1075,20 @@ def __init__(self, config, input_embeddings, **kwargs): # an output-only bias for each token. self.decoder = input_embeddings - def build(self, input_shape): + def build(self, input_shape=None): self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.hidden_size]) + def get_output_embeddings(self): return self.decoder @@ -1079,6 +1199,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta", None) is not None: + with tf.name_scope(self.roberta.name): + self.roberta.build(None) + if getattr(self, "lm_head", None) is not None: + with tf.name_scope(self.lm_head.name): + self.lm_head.build(None) # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaClassificationHead @@ -1100,6 +1230,7 @@ def __init__(self, config, **kwargs): self.out_proj = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" ) + self.config = config def call(self, features, training=False): x = features[:, 0, :] # take <s> token (equiv.
to [CLS]) @@ -1108,6 +1239,16 @@ def call(self, features, training=False): x = self.dropout(x, training=training) x = self.out_proj(x) return x + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build(self.config.hidden_size) @add_start_docstrings( @@ -1185,6 +1326,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta", None) is not None: + with tf.name_scope(self.roberta.name): + self.roberta.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(None) @add_start_docstrings( @@ -1212,6 +1363,7 @@ def __init__(self, config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1269,6 +1421,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta", None) is not None: + with tf.name_scope(self.roberta.name): + self.roberta.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) @add_start_docstrings( @@ -1292,6 +1454,7 @@ def __init__(self, config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward( @@ -1362,6 +1525,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta", None) is not None: + with tf.name_scope(self.roberta.name): + self.roberta.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) @add_start_docstrings( @@ -1384,6 +1557,7 @@ def __init__(self, config, *inputs, **kwargs): self.qa_outputs = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1455,6 +1629,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta", None) is not None: + with tf.name_scope(self.roberta.name): + self.roberta.build(None) + if getattr(self, "qa_outputs", None) is not None: + with tf.name_scope(self.qa_outputs.name): + self.qa_outputs.build(self.config.hidden_size) @add_start_docstrings( @@ -1581,3 +1765,13 @@ def call( attentions=outputs.attentions, cross_attentions=outputs.cross_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = 
True + if getattr(self, "roberta", None) is not None: + with tf.name_scope(self.roberta.name): + self.roberta.build(None) + if getattr(self, "lm_head", None) is not None: + with tf.name_scope(self.lm_head.name): + self.lm_head.build(None) diff --git a/src/transformers/models/clip/modeling_tf_clip.py b/src/transformers/models/clip/modeling_tf_clip.py index da63c5ff2150e1..76c090927d4b69 100644 --- a/src/transformers/models/clip/modeling_tf_clip.py +++ b/src/transformers/models/clip/modeling_tf_clip.py @@ -169,7 +169,14 @@ def build(self, input_shape: tf.TensorShape = None): name="embeddings", ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "patch_embedding", None) is not None: + with tf.name_scope(self.patch_embedding.name): + self.patch_embedding.build(self.config.num_channels) + def call(self, pixel_values: tf.Tensor) -> tf.Tensor: """`pixel_values` is expected to be of NCHW format.""" @@ -351,6 +358,22 @@ def call( outputs = (attention_output, _attention_probs) if output_attentions else (attention_output,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "q_proj", None) is not None: + with tf.name_scope(self.q_proj.name): + self.q_proj.build(self.embed_dim) + if getattr(self, "k_proj", None) is not None: + with tf.name_scope(self.k_proj.name): + self.k_proj.build(self.embed_dim) + if getattr(self, "v_proj", None) is not None: + with tf.name_scope(self.v_proj.name): + self.v_proj.build(self.embed_dim) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build(self.embed_dim) class TFCLIPMLP(tf.keras.layers.Layer): @@ -369,12 +392,23 @@ def __init__(self, config: CLIPConfig, **kwargs): self.fc2 = tf.keras.layers.Dense( units=config.hidden_size, kernel_initializer=get_initializer(in_proj_std), name="fc2" ) + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.fc1(inputs=hidden_states) hidden_states = self.activation_fn(hidden_states) hidden_states = self.fc2(inputs=hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build(self.config.hidden_size) + if getattr(self, "fc2", None) is not None: + with tf.name_scope(self.fc2.name): + self.fc2.build(self.config.intermediate_size) class TFCLIPEncoderLayer(tf.keras.layers.Layer): @@ -427,6 +461,22 @@ def call( outputs = (hidden_states,) + attention_outputs[1:] # add attentions if we output them return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "layer_norm1", None) is not None: + with tf.name_scope(self.layer_norm1.name): + self.layer_norm1.build([None, None, self.embed_dim]) + if getattr(self, "mlp", None) is not None: + with tf.name_scope(self.mlp.name): + self.mlp.build(None) + if getattr(self, "layer_norm2", None) is not None: + with tf.name_scope(self.layer_norm2.name): + self.layer_norm2.build([None, None, self.embed_dim]) class TFCLIPEncoder(tf.keras.layers.Layer): @@ -482,6 +532,14 @@ def call( return TFBaseModelOutput( last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions ) + def build(self, input_shape=None): + if self.built: + return + 
self.built = True + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) class TFCLIPTextTransformer(tf.keras.layers.Layer): @@ -586,6 +644,19 @@ def _build_causal_attention_mask(self, batch_size, seq_length, dtype=tf.float32) to_mask = tf.linalg.set_diag(to_mask, diagonal=diag) return tf.broadcast_to(input=to_mask, shape=(batch_size, 1, seq_length, seq_length)) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.embed_dim]) @keras_serializable @@ -634,6 +705,13 @@ def call( ) return text_model_outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "text_model", None) is not None: + with tf.name_scope(self.text_model.name): + self.text_model.build(None) class TFCLIPVisionTransformer(tf.keras.layers.Layer): @@ -680,6 +758,22 @@ def call( hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "pre_layernorm", None) is not None: + with tf.name_scope(self.pre_layernorm.name): + self.pre_layernorm.build([None, None, self.embed_dim]) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "post_layernorm", None) is not None: + with tf.name_scope(self.post_layernorm.name): + self.post_layernorm.build([None, None, self.embed_dim]) @keras_serializable @@ -715,6 +809,13 @@ def call( ) return vision_model_outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "vision_model", None) is not None: + with tf.name_scope(self.vision_model.name): + self.vision_model.build(None) @keras_serializable @@ -770,7 +871,23 @@ def build(self, input_shape: tf.TensorShape = None): name="logit_scale", ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "text_model", None) is not None: + with tf.name_scope(self.text_model.name): + self.text_model.build(None) + if getattr(self, "vision_model", None) is not None: + with tf.name_scope(self.vision_model.name): + self.vision_model.build(None) + if getattr(self, "visual_projection", None) is not None: + with tf.name_scope(self.visual_projection.name): + self.visual_projection.build(self.vision_embed_dim) + if getattr(self, "text_projection", None) is not None: + with tf.name_scope(self.text_projection.name): + self.text_projection.build(self.text_embed_dim) + @unpack_inputs def get_text_features( @@ -1111,6 +1228,13 @@ def call( ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "clip", None) is not None: + with tf.name_scope(self.clip.name): + self.clip.build(None) class TFCLIPVisionModel(TFCLIPPreTrainedModel): @@ -1165,6 +1289,13 @@ def call( ) return outputs + def build(self, input_shape=None): + if self.built: + return + 
self.built = True + if getattr(self, "clip", None) is not None: + with tf.name_scope(self.clip.name): + self.clip.build(None) @add_start_docstrings(CLIP_START_DOCSTRING) @@ -1317,3 +1448,10 @@ def serving_output(self, output: TFCLIPOutput) -> TFCLIPOutput: # TensorFlow cannot trace through nested dataclasses. Reference: # https://github.com/huggingface/transformers/pull/16886 return output + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "clip", None) is not None: + with tf.name_scope(self.clip.name): + self.clip.build(None) diff --git a/src/transformers/models/convbert/modeling_tf_convbert.py b/src/transformers/models/convbert/modeling_tf_convbert.py index 4beb01cb78b0ac..08c6cc3d56439b 100644 --- a/src/transformers/models/convbert/modeling_tf_convbert.py +++ b/src/transformers/models/convbert/modeling_tf_convbert.py @@ -81,7 +81,7 @@ def __init__(self, config: ConvBertConfig, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): with tf.name_scope("word_embeddings"): self.weight = self.add_weight( name="weight", @@ -103,7 +103,14 @@ def build(self, input_shape: tf.TensorShape): initializer=get_initializer(self.initializer_range), ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.embedding_size]) + # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call def call( @@ -208,6 +215,7 @@ def __init__(self, config, **kwargs): ) self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) + self.config = config def transpose_for_scores(self, x, batch_size): # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] @@ -296,6 +304,28 @@ def call(self, hidden_states, attention_mask, head_mask, output_attentions, trai outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build(self.config.hidden_size) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build(self.config.hidden_size) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build(self.config.hidden_size) + if getattr(self, "key_conv_attn_layer", None) is not None: + with tf.name_scope(self.key_conv_attn_layer.name): + self.key_conv_attn_layer.build(None) + if getattr(self, "conv_kernel_layer", None) is not None: + with tf.name_scope(self.conv_kernel_layer.name): + self.conv_kernel_layer.build(self.all_head_size) + if getattr(self, "conv_out_layer", None) is not None: + with tf.name_scope(self.conv_out_layer.name): + self.conv_out_layer.build(self.config.hidden_size) class TFConvBertSelfOutput(tf.keras.layers.Layer): @@ -307,6 +337,7 @@ def __init__(self, config, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.config = config def call(self, hidden_states, 
input_tensor, training=False): hidden_states = self.dense(hidden_states) @@ -314,6 +345,16 @@ def call(self, hidden_states, input_tensor, training=False): hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) class TFConvBertAttention(tf.keras.layers.Layer): @@ -334,6 +375,16 @@ def call(self, input_tensor, attention_mask, head_mask, output_attentions, train outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attention", None) is not None: + with tf.name_scope(self.self_attention.name): + self.self_attention.build(None) + if getattr(self, "dense_output", None) is not None: + with tf.name_scope(self.dense_output.name): + self.dense_output.build(None) class GroupedLinearLayer(tf.keras.layers.Layer): @@ -415,6 +466,7 @@ def __init__(self, config, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.config = config def call(self, hidden_states, input_tensor, training=False): hidden_states = self.dense(hidden_states) @@ -422,6 +474,13 @@ def call(self, hidden_states, input_tensor, training=False): hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) class TFConvBertLayer(tf.keras.layers.Layer): @@ -442,6 +501,19 @@ def call(self, hidden_states, attention_mask, head_mask, output_attentions, trai outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "bert_output", None) is not None: + with tf.name_scope(self.bert_output.name): + self.bert_output.build(None) class TFConvBertEncoder(tf.keras.layers.Layer): @@ -485,6 +557,14 @@ def call( return TFBaseModelOutput( last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) class TFConvBertPredictionHeadTransform(tf.keras.layers.Layer): @@ -501,6 +581,7 @@ def __init__(self, config, **kwargs): self.transform_act_fn = config.hidden_act self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.config = config def call(self, hidden_states): hidden_states = self.dense(hidden_states) @@ -508,6 +589,16 @@ def call(self, hidden_states): 
hidden_states = self.LayerNorm(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) @keras_serializable @@ -615,6 +706,16 @@ def call( ) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) class TFConvBertPreTrainedModel(TFPreTrainedModel): @@ -769,6 +870,13 @@ def call( ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "convbert", None) is not None: + with tf.name_scope(self.convbert.name): + self.convbert.build(None) class TFConvBertMaskedLMHead(tf.keras.layers.Layer): @@ -814,6 +922,7 @@ def __init__(self, config, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dense = tf.keras.layers.Dense(config.embedding_size, name="dense") + self.config = config def call(self, generator_hidden_states, training=False): hidden_states = self.dense(generator_hidden_states) @@ -821,6 +930,16 @@ def call(self, generator_hidden_states, training=False): hidden_states = self.LayerNorm(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.embedding_size]) + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) @add_start_docstrings("""ConvBERT Model with a `language modeling` head on top.""", CONVBERT_START_DOCSTRING) @@ -900,6 +1019,19 @@ def call( hidden_states=generator_hidden_states.hidden_states, attentions=generator_hidden_states.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "convbert", None) is not None: + with tf.name_scope(self.convbert.name): + self.convbert.build(None) + if getattr(self, "generator_predictions", None) is not None: + with tf.name_scope(self.generator_predictions.name): + self.generator_predictions.build(None) + if getattr(self, "generator_lm_head", None) is not None: + with tf.name_scope(self.generator_lm_head.name): + self.generator_lm_head.build(None) class TFConvBertClassificationHead(tf.keras.layers.Layer): @@ -930,6 +1062,16 @@ def call(self, hidden_states, **kwargs): x = self.out_proj(x) return x + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build(self.config.hidden_size) @add_start_docstrings( @@ -998,6 +1140,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if 
getattr(self, "convbert", None) is not None: + with tf.name_scope(self.convbert.name): + self.convbert.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(None) @add_start_docstrings( @@ -1018,6 +1170,7 @@ def __init__(self, config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward( @@ -1091,6 +1244,19 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "convbert", None) is not None: + with tf.name_scope(self.convbert.name): + self.convbert.build(None) + if getattr(self, "sequence_summary", None) is not None: + with tf.name_scope(self.sequence_summary.name): + self.sequence_summary.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) @add_start_docstrings( @@ -1113,6 +1279,7 @@ def __init__(self, config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1166,6 +1333,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "convbert", None) is not None: + with tf.name_scope(self.convbert.name): + self.convbert.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) @add_start_docstrings( @@ -1184,6 +1361,7 @@ def __init__(self, config, *inputs, **kwargs): self.qa_outputs = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1252,3 +1430,13 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "convbert", None) is not None: + with tf.name_scope(self.convbert.name): + self.convbert.build(None) + if getattr(self, "qa_outputs", None) is not None: + with tf.name_scope(self.qa_outputs.name): + self.qa_outputs.build(self.config.hidden_size) diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py index 59a36b3983768c..c79558511bce1e 100644 --- a/src/transformers/models/convnext/modeling_tf_convnext.py +++ b/src/transformers/models/convnext/modeling_tf_convnext.py @@ -81,6 +81,7 @@ def __init__(self, config: ConvNextConfig, **kwargs): ) self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm") self.num_channels = config.num_channels + self.config = config def call(self, pixel_values): if isinstance(pixel_values, dict): @@ -100,6 +101,17 @@ def call(self, pixel_values): embeddings = self.patch_embeddings(pixel_values) embeddings = self.layernorm(embeddings) return embeddings + def build(self, 
input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "patch_embeddings", None) is not None: + with tf.name_scope(self.patch_embeddings.name): + self.patch_embeddings.build( + self.config.num_channels) + if getattr(self, "layernorm", None) is not None: + with tf.name_scope(self.layernorm.name): + self.layernorm.build([None, None, self.config.hidden_sizes[0]]) class TFConvNextLayer(tf.keras.layers.Layer): @@ -167,7 +179,26 @@ def build(self, input_shape: tf.TensorShape = None): if self.config.layer_scale_init_value > 0 else None ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "dwconv", None) is not None: + with tf.name_scope(self.dwconv.name): + self.dwconv.build(self.dim) + if getattr(self, "layernorm", None) is not None: + with tf.name_scope(self.layernorm.name): + self.layernorm.build([None, None, self.dim]) + if getattr(self, "pwconv1", None) is not None: + with tf.name_scope(self.pwconv1.name): + self.pwconv1.build(self.dim) + if getattr(self, "pwconv2", None) is not None: + with tf.name_scope(self.pwconv2.name): + self.pwconv2.build(4 * self.dim) + if getattr(self, "drop_path", None) is not None: + with tf.name_scope(self.drop_path.name): + self.drop_path.build(None) + def call(self, hidden_states, training=False): input = hidden_states @@ -252,6 +283,14 @@ def call(self, hidden_states): for layer in self.layers: hidden_states = layer(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) class TFConvNextEncoder(tf.keras.layers.Layer): @@ -352,6 +391,19 @@ def call( pooler_output=pooled_output, hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "layernorm", None) is not None: + with tf.name_scope(self.layernorm.name): + self.layernorm.build([None, None, self.config.hidden_sizes[-1]]) class TFConvNextPreTrainedModel(TFPreTrainedModel): @@ -484,6 +536,13 @@ def call( pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "convnext", None) is not None: + with tf.name_scope(self.convnext.name): + self.convnext.build(None) @add_start_docstrings( @@ -577,3 +636,13 @@ def call( logits=logits, hidden_states=outputs.hidden_states, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "convnext", None) is not None: + with tf.name_scope(self.convnext.name): + self.convnext.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(None) diff --git a/src/transformers/models/convnextv2/modeling_tf_convnextv2.py b/src/transformers/models/convnextv2/modeling_tf_convnextv2.py index 863e59406f1cfb..fc0432bf97324e 100644 --- a/src/transformers/models/convnextv2/modeling_tf_convnextv2.py +++ b/src/transformers/models/convnextv2/modeling_tf_convnextv2.py @@ -133,6 +133,7 @@ def __init__(self, config: ConvNextV2Config, 
**kwargs): ) self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layernorm") self.num_channels = config.num_channels + self.config = config def call(self, pixel_values): if isinstance(pixel_values, dict): @@ -152,6 +153,17 @@ def call(self, pixel_values): embeddings = self.patch_embeddings(pixel_values) embeddings = self.layernorm(embeddings) return embeddings + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "patch_embeddings", None) is not None: + with tf.name_scope(self.patch_embeddings.name): + self.patch_embeddings.build( + self.config.num_channels) + if getattr(self, "layernorm", None) is not None: + with tf.name_scope(self.layernorm.name): + self.layernorm.build([None, None, self.config.hidden_sizes[0]]) class TFConvNextV2Layer(tf.keras.layers.Layer): @@ -222,6 +234,28 @@ def call(self, hidden_states, training=False): x = self.drop_path(x, training=training) x = input + x return x + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dwconv", None) is not None: + with tf.name_scope(self.dwconv.name): + self.dwconv.build(self.dim) + if getattr(self, "layernorm", None) is not None: + with tf.name_scope(self.layernorm.name): + self.layernorm.build([None, None, self.dim]) + if getattr(self, "pwconv1", None) is not None: + with tf.name_scope(self.pwconv1.name): + self.pwconv1.build(self.dim) + if getattr(self, "grn", None) is not None: + with tf.name_scope(self.grn.name): + self.grn.build(None) + if getattr(self, "pwconv2", None) is not None: + with tf.name_scope(self.pwconv2.name): + self.pwconv2.build(4 * self.dim) + if getattr(self, "drop_path", None) is not None: + with tf.name_scope(self.drop_path.name): + self.drop_path.build(None) # Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextStage with ConvNext->ConvNextV2 @@ -293,6 +327,14 @@ def call(self, hidden_states): for layer in self.layers: hidden_states = layer(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) class TFConvNextV2Encoder(tf.keras.layers.Layer): @@ -400,6 +442,19 @@ def call( pooler_output=pooled_output, hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "layernorm", None) is not None: + with tf.name_scope(self.layernorm.name): + self.layernorm.build([None, None, self.config.hidden_sizes[-1]]) class TFConvNextV2PreTrainedModel(TFPreTrainedModel): @@ -518,6 +573,13 @@ def call( pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "convnextv2", None) is not None: + with tf.name_scope(self.convnextv2.name): + self.convnextv2.build(None) @add_start_docstrings( @@ -593,3 +655,13 @@ def call( logits=logits, hidden_states=outputs.hidden_states, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "convnextv2", None) is not None: + with 
tf.name_scope(self.convnextv2.name): + self.convnextv2.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(None) diff --git a/src/transformers/models/ctrl/modeling_tf_ctrl.py b/src/transformers/models/ctrl/modeling_tf_ctrl.py index 70a5c17462595a..4cacd67f7b0291 100644 --- a/src/transformers/models/ctrl/modeling_tf_ctrl.py +++ b/src/transformers/models/ctrl/modeling_tf_ctrl.py @@ -141,6 +141,22 @@ def call(self, v, k, q, mask, layer_past, attention_mask, head_mask, use_cache, outputs = outputs + (attn,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "Wq", None) is not None: + with tf.name_scope(self.Wq.name): + self.Wq.build(self.d_model_size) + if getattr(self, "Wk", None) is not None: + with tf.name_scope(self.Wk.name): + self.Wk.build(self.d_model_size) + if getattr(self, "Wv", None) is not None: + with tf.name_scope(self.Wv.name): + self.Wv.build(self.d_model_size) + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.d_model_size) class TFPointWiseFeedForwardLayer(tf.keras.layers.Layer): @@ -175,6 +191,7 @@ def __init__( self.dropout1 = tf.keras.layers.Dropout(rate) self.dropout2 = tf.keras.layers.Dropout(rate) + self.d_model_size = d_model_size def call(self, x, mask, layer_past, attention_mask, head_mask, use_cache, output_attentions, training=False): normed = self.layernorm1(x) @@ -201,6 +218,22 @@ def call(self, x, mask, layer_past, attention_mask, head_mask, use_cache, output outputs = (out2,) + attn_outputs[1:] return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "multi_head_attention", None) is not None: + with tf.name_scope(self.multi_head_attention.name): + self.multi_head_attention.build(None) + if getattr(self, "ffn", None) is not None: + with tf.name_scope(self.ffn.name): + self.ffn.build(None) + if getattr(self, "layernorm1", None) is not None: + with tf.name_scope(self.layernorm1.name): + self.layernorm1.build([None, None, self.d_model_size]) + if getattr(self, "layernorm2", None) is not None: + with tf.name_scope(self.layernorm2.name): + self.layernorm2.build([None, None, self.d_model_size]) @keras_serializable @@ -395,6 +428,20 @@ def call( hidden_states=all_hidden_states, attentions=all_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "w", None) is not None: + with tf.name_scope(self.w.name): + self.w.build(None) + if getattr(self, "layernorm", None) is not None: + with tf.name_scope(self.layernorm.name): + self.layernorm.build([None, None, self.config.n_embd]) + if getattr(self, "h", None) is not None: + for layer in self.h: + with tf.name_scope(layer.name): + layer.build(None) class TFCTRLPreTrainedModel(TFPreTrainedModel): @@ -562,6 +609,13 @@ def call( training=training, ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) class TFCTRLBiasLayer(tf.keras.layers.Layer): @@ -709,6 +763,16 @@ def call( hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with 
tf.name_scope(self.transformer.name): + self.transformer.build(None) + if getattr(self, "bias_layer", None) is not None: + with tf.name_scope(self.bias_layer.name): + self.bias_layer.build(None) @add_start_docstrings( @@ -737,6 +801,7 @@ def __init__(self, config, *inputs, **kwargs): use_bias=False, ) self.transformer = TFCTRLMainLayer(config, name="transformer") + self.config = config def get_output_embeddings(self): # Remove after transformers v4.32. Fix this model's `test_model_common_attributes` test too. @@ -836,3 +901,13 @@ def call( hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.n_embd) + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) diff --git a/src/transformers/models/cvt/modeling_tf_cvt.py b/src/transformers/models/cvt/modeling_tf_cvt.py index d610792e48f8e5..da40fe783deae0 100644 --- a/src/transformers/models/cvt/modeling_tf_cvt.py +++ b/src/transformers/models/cvt/modeling_tf_cvt.py @@ -130,6 +130,13 @@ def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_state = self.convolution_embeddings(pixel_values) hidden_state = self.dropout(hidden_state, training=training) return hidden_state + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "convolution_embeddings", None) is not None: + with tf.name_scope(self.convolution_embeddings.name): + self.convolution_embeddings.build(None) class TFCvtConvEmbeddings(tf.keras.layers.Layer): @@ -160,6 +167,7 @@ def __init__( # Using the same default epsilon as PyTorch self.normalization = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="normalization") self.num_channels = num_channels + self.embed_dim = embed_dim def call(self, pixel_values: tf.Tensor) -> tf.Tensor: if isinstance(pixel_values, dict): @@ -176,6 +184,16 @@ def call(self, pixel_values: tf.Tensor) -> tf.Tensor: # "batch_size, (height*width), num_channels -> batch_size, height, width, num_channels" pixel_values = tf.reshape(pixel_values, shape=(batch_size, height, width, num_channels)) return pixel_values + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "projection", None) is not None: + with tf.name_scope(self.projection.name): + self.projection.build(self.num_channels) + if getattr(self, "normalization", None) is not None: + with tf.name_scope(self.normalization.name): + self.normalization.build([None, None, self.embed_dim]) class TFCvtSelfAttentionConvProjection(tf.keras.layers.Layer): @@ -196,11 +214,23 @@ def __init__(self, config: CvtConfig, embed_dim: int, kernel_size: int, stride: ) # Using the same default epsilon as PyTorch, TF uses (1 - pytorch momentum) self.normalization = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization") + self.embed_dim = embed_dim def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_state = self.convolution(self.padding(hidden_state)) hidden_state = self.normalization(hidden_state, training=training) return hidden_state + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "convolution", None) is not None: + with tf.name_scope(self.convolution.name): + 
self.convolution.build( + self.embed_dim) + if getattr(self, "normalization", None) is not None: + with tf.name_scope(self.normalization.name): + self.normalization.build(None) class TFCvtSelfAttentionLinearProjection(tf.keras.layers.Layer): @@ -359,6 +389,28 @@ def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool context = tf.transpose(context, perm=(0, 2, 1, 3)) context = tf.reshape(context, (batch_size, hidden_size, self.num_heads * head_dim)) return context + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "convolution_projection_query", None) is not None: + with tf.name_scope(self.convolution_projection_query.name): + self.convolution_projection_query.build(None) + if getattr(self, "convolution_projection_key", None) is not None: + with tf.name_scope(self.convolution_projection_key.name): + self.convolution_projection_key.build(None) + if getattr(self, "convolution_projection_value", None) is not None: + with tf.name_scope(self.convolution_projection_value.name): + self.convolution_projection_value.build(None) + if getattr(self, "projection_query", None) is not None: + with tf.name_scope(self.projection_query.name): + self.projection_query.build(self.embed_dim) + if getattr(self, "projection_key", None) is not None: + with tf.name_scope(self.projection_key.name): + self.projection_key.build(self.embed_dim) + if getattr(self, "projection_value", None) is not None: + with tf.name_scope(self.projection_value.name): + self.projection_value.build(self.embed_dim) class TFCvtSelfOutput(tf.keras.layers.Layer): @@ -370,11 +422,19 @@ def __init__(self, config: CvtConfig, embed_dim: int, drop_rate: float, **kwargs units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.dropout = tf.keras.layers.Dropout(drop_rate) + self.embed_dim = embed_dim def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_state = self.dense(inputs=hidden_state) hidden_state = self.dropout(inputs=hidden_state, training=training) return hidden_state + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.embed_dim) class TFCvtAttention(tf.keras.layers.Layer): @@ -422,6 +482,16 @@ def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool self_output = self.attention(hidden_state, height, width, training=training) attention_output = self.dense_output(self_output, training=training) return attention_output + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "dense_output", None) is not None: + with tf.name_scope(self.dense_output.name): + self.dense_output.build(None) class TFCvtIntermediate(tf.keras.layers.Layer): @@ -435,10 +505,18 @@ def __init__(self, config: CvtConfig, embed_dim: int, mlp_ratio: int, **kwargs): activation="gelu", name="dense", ) + self.embed_dim = embed_dim def call(self, hidden_state: tf.Tensor) -> tf.Tensor: hidden_state = self.dense(hidden_state) return hidden_state + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.embed_dim) class TFCvtOutput(tf.keras.layers.Layer): @@ -452,12 +530,21 
@@ def __init__(self, config: CvtConfig, embed_dim: int, drop_rate: int, **kwargs): units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.dropout = tf.keras.layers.Dropout(drop_rate) + self.mlp_ratio = config.mlp_ratio + self.embed_dim = embed_dim def call(self, hidden_state: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_state = self.dense(inputs=hidden_state) hidden_state = self.dropout(inputs=hidden_state, training=training) hidden_state = hidden_state + input_tensor return hidden_state + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(int(self.embed_dim * self.mlp_ratio)) class TFCvtLayer(tf.keras.layers.Layer): @@ -514,6 +601,7 @@ def __init__( # Using the same default epsilon as PyTorch self.layernorm_before = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_before") self.layernorm_after = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_after") + self.embed_dim = embed_dim def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor: # in Cvt, layernorm is applied before self-attention @@ -531,6 +619,28 @@ def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool layer_output = self.dense_output(layer_output, hidden_state) layer_output = self.drop_path(layer_output, training=training) return layer_output + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "dense_output", None) is not None: + with tf.name_scope(self.dense_output.name): + self.dense_output.build(None) + if getattr(self, "drop_path", None) is not None: + with tf.name_scope(self.drop_path.name): + self.drop_path.build(None) + if getattr(self, "layernorm_before", None) is not None: + with tf.name_scope(self.layernorm_before.name): + self.layernorm_before.build([None, None, self.embed_dim]) + if getattr(self, "layernorm_after", None) is not None: + with tf.name_scope(self.layernorm_after.name): + self.layernorm_after.build([None, None, self.embed_dim]) class TFCvtStage(tf.keras.layers.Layer): @@ -615,6 +725,17 @@ def call(self, hidden_state: tf.Tensor, training: bool = False): # "batch_size, (height*width), num_channels -> batch_size, height, width, num_channels" hidden_state = tf.reshape(hidden_state, shape=(batch_size, height, width, num_channels)) return hidden_state, cls_token + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embedding", None) is not None: + with tf.name_scope(self.embedding.name): + self.embedding.build(None) + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) class TFCvtEncoder(tf.keras.layers.Layer): @@ -667,6 +788,14 @@ def call( cls_token_value=cls_token, hidden_states=all_hidden_states, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "stages", None) is not None: + for layer in self.stages: + with tf.name_scope(layer.name): + layer.build(None) @keras_serializable @@ -708,6 +837,13 @@ def call( 
cls_token_value=encoder_outputs.cls_token_value, hidden_states=encoder_outputs.hidden_states, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) class TFCvtPreTrainedModel(TFPreTrainedModel): @@ -827,6 +963,13 @@ def call( cls_token_value=outputs.cls_token_value, hidden_states=outputs.hidden_states, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "cvt", None) is not None: + with tf.name_scope(self.cvt.name): + self.cvt.build(None) @add_start_docstrings( @@ -853,6 +996,7 @@ def __init__(self, config: CvtConfig, *inputs, **kwargs): bias_initializer="zeros", name="classifier", ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(TFCVT_INPUTS_DOCSTRING) @@ -922,3 +1066,16 @@ def call( return ((loss,) + output) if loss is not None else output return TFImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "cvt", None) is not None: + with tf.name_scope(self.cvt.name): + self.cvt.build(None) + if getattr(self, "layernorm", None) is not None: + with tf.name_scope(self.layernorm.name): + self.layernorm.build([None, None, self.config.embed_dim[-1]]) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(None) diff --git a/src/transformers/models/data2vec/modeling_tf_data2vec_vision.py b/src/transformers/models/data2vec/modeling_tf_data2vec_vision.py index 96d3993dd7cd5f..a976b32e62010c 100644 --- a/src/transformers/models/data2vec/modeling_tf_data2vec_vision.py +++ b/src/transformers/models/data2vec/modeling_tf_data2vec_vision.py @@ -137,7 +137,7 @@ def __init__(self, config: Data2VecVisionConfig, **kwargs): self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): self.cls_token = self.add_weight( shape=(1, 1, self.config.hidden_size), initializer=tf.random_normal_initializer(stddev=self.config.initializer_range), @@ -164,7 +164,14 @@ def build(self, input_shape: tf.TensorShape): else: self.position_embeddings = None - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "patch_embeddings", None) is not None: + with tf.name_scope(self.patch_embeddings.name): + self.patch_embeddings.build(None) + def call(self, pixel_values: tf.Tensor, bool_masked_pos: tf.Tensor | None = None) -> tf.Tensor: embeddings = self.patch_embeddings(pixel_values) @@ -247,6 +254,13 @@ def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor: num_patches = (width // self.patch_size[1]) * (height // self.patch_size[0]) return tf.reshape(tensor=projection, shape=(batch_size, num_patches, -1)) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "projection", None) is not None: + with tf.name_scope(self.projection.name): + self.projection.build(self.num_channels) class TFData2VecVisionSelfAttention(tf.keras.layers.Layer): @@ -284,6 +298,7 @@ def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = ) else: self.relative_position_bias = None + self.config = config def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: # Reshape from [batch_size, 
seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] @@ -343,6 +358,19 @@ def call( outputs = (attention_output, attention_probs) if output_attentions else (attention_output,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build(self.config.hidden_size) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build(self.config.hidden_size) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build(self.config.hidden_size) class TFData2VecVisionSelfOutput(tf.keras.layers.Layer): @@ -358,12 +386,20 @@ def __init__(self, config: Data2VecVisionConfig, **kwargs): units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, gamma=None, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dropout(inputs=hidden_states, training=training) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) class TFData2VecVisionAttention(tf.keras.layers.Layer): @@ -397,6 +433,16 @@ def call( outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "dense_output", None) is not None: + with tf.name_scope(self.dense_output.name): + self.dense_output.build(None) # Copied from transformers.models.vit.modeling_tf_vit.TFViTIntermediate with ViT->Data2VecVision @@ -412,12 +458,20 @@ def __init__(self, config: Data2VecVisionConfig, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) class TFData2VecVisionOutput(tf.keras.layers.Layer): @@ -428,12 +482,20 @@ def __init__(self, config: Data2VecVisionConfig, **kwargs): units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dropout(inputs=hidden_states, training=training) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.intermediate_size) class 
TFData2VecVisionLayer(tf.keras.layers.Layer): @@ -483,7 +545,29 @@ def build(self, input_shape: tf.TensorShape = None): else: self.lambda_1, self.lambda_2 = None, None - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "data2vec_output", None) is not None: + with tf.name_scope(self.data2vec_output.name): + self.data2vec_output.build(None) + if getattr(self, "layernorm_before", None) is not None: + with tf.name_scope(self.layernorm_before.name): + self.layernorm_before.build([None, None, self.config.hidden_size]) + if getattr(self, "layernorm_after", None) is not None: + with tf.name_scope(self.layernorm_after.name): + self.layernorm_after.build([None, None, self.config.hidden_size]) + if getattr(self, "drop_path", None) is not None: + with tf.name_scope(self.drop_path.name): + self.drop_path.build(None) + def call( self, @@ -649,6 +733,14 @@ def call( hidden_states=all_hidden_states, attentions=all_self_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) @keras_serializable @@ -740,6 +832,22 @@ def call( hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "layernorm", None) is not None: + with tf.name_scope(self.layernorm.name): + self.layernorm.build(None) + if getattr(self, "pooler", None) is not None: + with tf.name_scope(self.pooler.name): + self.pooler.build(None) class TFData2VecVisionPooler(tf.keras.layers.Layer): @@ -761,6 +869,13 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: pooled_output = hidden_states[:, 0] return pooled_output + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layernorm", None) is not None: + with tf.name_scope(self.layernorm.name): + self.layernorm.build(None) class TFData2VecVisionPreTrainedModel(TFPreTrainedModel): @@ -895,6 +1010,13 @@ def call( ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "data2vec_vision", None) is not None: + with tf.name_scope(self.data2vec_vision.name): + self.data2vec_vision.build(None) @add_start_docstrings( @@ -917,6 +1039,7 @@ def __init__(self, config: Data2VecVisionConfig, *inputs, **kwargs): kernel_initializer=get_initializer(config.initializer_range), name="classifier", ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(DATA2VEC_VISION_INPUTS_DOCSTRING) @@ -967,6 +1090,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "data2vec_vision", None) is not None: + with tf.name_scope(self.data2vec_vision.name): + self.data2vec_vision.build(None) + if getattr(self, "classifier", None) is not None: + 
with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) class TFData2VecVisionConvModule(tf.keras.layers.Layer): @@ -1005,6 +1138,16 @@ def call(self, input: tf.Tensor) -> tf.Tensor: output = self.bn(output) output = self.activation(output) return output + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "conv", None) is not None: + with tf.name_scope(self.conv.name): + self.conv.build(self.in_channels) + if getattr(self, "bn", None) is not None: + with tf.name_scope(self.bn.name): + self.bn.build(None) # Copied from: @@ -1219,6 +1362,22 @@ def call(self, encoder_hidden_states: tf.Tensor) -> tf.Tensor: output = self.classifier(output) return output + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.channels) + if getattr(self, "psp_modules", None) is not None: + with tf.name_scope(self.psp_modules.name): + self.psp_modules.build(None) + if getattr(self, "bottleneck", None) is not None: + with tf.name_scope(self.bottleneck.name): + self.bottleneck.build(None) + if getattr(self, "fpn_bottleneck", None) is not None: + with tf.name_scope(self.fpn_bottleneck.name): + self.fpn_bottleneck.build(None) class TFData2VecVisionFCNHead(tf.keras.layers.Layer): @@ -1297,6 +1456,13 @@ def call(self, encoder_hidden_states: tf.Tensor) -> tf.Tensor: output = self.conv_cat(tf.concat([hidden_states, output], axis=-1)) output = self.classifier(output) return output + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.channels) @add_start_docstrings( @@ -1457,3 +1623,24 @@ def reshape_features(x): hidden_states=outputs.hidden_states if output_hidden_states else None, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "data2vec_vision", None) is not None: + with tf.name_scope(self.data2vec_vision.name): + self.data2vec_vision.build(None) + if getattr(self, "decode_head", None) is not None: + with tf.name_scope(self.decode_head.name): + self.decode_head.build(None) + if getattr(self, "auxiliary_head", None) is not None: + with tf.name_scope(self.auxiliary_head.name): + self.auxiliary_head.build(None) + if getattr(self, "fpn1", None) is not None: + for layer in self.fpn1: + with tf.name_scope(layer.name): + layer.build(None) + if getattr(self, "fpn2", None) is not None: + for layer in self.fpn2: + with tf.name_scope(layer.name): + layer.build(None) diff --git a/src/transformers/models/deberta/modeling_tf_deberta.py b/src/transformers/models/deberta/modeling_tf_deberta.py index a470cb5493e01b..bbf67fe6131efd 100644 --- a/src/transformers/models/deberta/modeling_tf_deberta.py +++ b/src/transformers/models/deberta/modeling_tf_deberta.py @@ -167,12 +167,26 @@ def __init__(self, config: DebertaConfig, **kwargs): self.dense = tf.keras.layers.Dense(config.hidden_size, name="dense") self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="dropout") + self.config = config def call(self, hidden_states, input_tensor, training: bool = False): hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states, 
training=training) hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + if getattr(self, "dropout", None) is not None: + with tf.name_scope(self.dropout.name): + self.dropout.build(None) class TFDebertaAttention(tf.keras.layers.Layer): @@ -210,6 +224,16 @@ def call( output = (attention_output,) + self_outputs[1:] return output + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self", None) is not None: + with tf.name_scope(self.self.name): + self.self.build(None) + if getattr(self, "dense_output", None) is not None: + with tf.name_scope(self.dense_output.name): + self.dense_output.build(None) class TFDebertaIntermediate(tf.keras.layers.Layer): @@ -224,12 +248,20 @@ def __init__(self, config: DebertaConfig, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) class TFDebertaOutput(tf.keras.layers.Layer): @@ -241,6 +273,7 @@ def __init__(self, config: DebertaConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="dropout") + self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -248,6 +281,19 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.intermediate_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + if getattr(self, "dropout", None) is not None: + with tf.name_scope(self.dropout.name): + self.dropout.build(None) class TFDebertaLayer(tf.keras.layers.Layer): @@ -285,6 +331,19 @@ def call( outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "bert_output", None) is not None: + with tf.name_scope(self.bert_output.name): + self.bert_output.build(None) class 
TFDebertaEncoder(tf.keras.layers.Layer): @@ -299,14 +358,22 @@ def __init__(self, config: DebertaConfig, **kwargs): if self.max_relative_positions < 1: self.max_relative_positions = config.max_position_embeddings - def build(self, input_shape): + def build(self, input_shape=None): if self.relative_attention: self.rel_embeddings = self.add_weight( name="rel_embeddings.weight", shape=[self.max_relative_positions * 2, self.config.hidden_size], initializer=get_initializer(self.config.initializer_range), ) - return super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) + def get_rel_embedding(self): rel_embeddings = self.rel_embeddings if self.relative_attention else None @@ -735,7 +802,7 @@ def __init__(self, config, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="dropout") - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): with tf.name_scope("word_embeddings"): self.weight = self.add_weight( name="weight", @@ -763,7 +830,17 @@ def build(self, input_shape: tf.TensorShape): else: self.position_embeddings = None - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + if getattr(self, "dropout", None) is not None: + with tf.name_scope(self.dropout.name): + self.dropout.build(None) + def call( self, @@ -838,6 +915,7 @@ def __init__(self, config: DebertaConfig, **kwargs): else: self.transform_act_fn = config.hidden_act self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -845,6 +923,16 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.LayerNorm(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.embedding_size]) class TFDebertaLMPredictionHead(tf.keras.layers.Layer): @@ -860,10 +948,17 @@ def __init__(self, config: DebertaConfig, input_embeddings: tf.keras.layers.Laye # an output-only bias for each token.
self.input_embeddings = input_embeddings - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "transform", None) is not None: + with tf.name_scope(self.transform.name): + self.transform.build(None) + def get_output_embeddings(self) -> tf.keras.layers.Layer: return self.input_embeddings @@ -899,6 +994,13 @@ def call(self, sequence_output: tf.Tensor) -> tf.Tensor: prediction_scores = self.predictions(hidden_states=sequence_output) return prediction_scores + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "predictions", None) is not None: + with tf.name_scope(self.predictions.name): + self.predictions.build(None) # @keras_serializable @@ -983,6 +1085,16 @@ def call( hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) class TFDebertaPreTrainedModel(TFPreTrainedModel): @@ -1123,6 +1235,13 @@ def call( ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "deberta", None) is not None: + with tf.name_scope(self.deberta.name): + self.deberta.build(None) @add_start_docstrings("""DeBERTa Model with a `language modeling` head on top.""", DEBERTA_START_DOCSTRING) @@ -1193,6 +1312,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "deberta", None) is not None: + with tf.name_scope(self.deberta.name): + self.deberta.build(None) + if getattr(self, "mlm", None) is not None: + with tf.name_scope(self.mlm.name): + self.mlm.build(None) @add_start_docstrings( @@ -1275,6 +1404,22 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "deberta", None) is not None: + with tf.name_scope(self.deberta.name): + self.deberta.build(None) + if getattr(self, "pooler", None) is not None: + with tf.name_scope(self.pooler.name): + self.pooler.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.output_dim) + if getattr(self, "dropout", None) is not None: + with tf.name_scope(self.dropout.name): + self.dropout.build(None) # TODO Matt might be wrong @add_start_docstrings( @@ -1295,6 +1440,7 @@ def __init__(self, config: DebertaConfig, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1346,6 +1492,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "deberta", None) is not None: + with 
tf.name_scope(self.deberta.name): + self.deberta.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) @add_start_docstrings( @@ -1365,6 +1521,7 @@ def __init__(self, config: DebertaConfig, *inputs, **kwargs): self.qa_outputs = tf.keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1431,3 +1588,13 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "deberta", None) is not None: + with tf.name_scope(self.deberta.name): + self.deberta.build(None) + if getattr(self, "qa_outputs", None) is not None: + with tf.name_scope(self.qa_outputs.name): + self.qa_outputs.build(self.config.hidden_size) diff --git a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py index 67414267546427..6b40622b1e8806 100644 --- a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py @@ -150,12 +150,26 @@ def __init__(self, config: DebertaV2Config, **kwargs): self.dense = tf.keras.layers.Dense(config.hidden_size, name="dense") self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = TFDebertaV2StableDropout(config.hidden_dropout_prob, name="dropout") + self.config = config def call(self, hidden_states, input_tensor, training: bool = False): hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states, training=training) hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + if getattr(self, "dropout", None) is not None: + with tf.name_scope(self.dropout.name): + self.dropout.build(None) # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaAttention with Deberta->DebertaV2 @@ -194,6 +208,16 @@ def call( output = (attention_output,) + self_outputs[1:] return output + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self", None) is not None: + with tf.name_scope(self.self.name): + self.self.build(None) + if getattr(self, "dense_output", None) is not None: + with tf.name_scope(self.dense_output.name): + self.dense_output.build(None) # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaIntermediate with Deberta->DebertaV2 @@ -209,12 +233,20 @@ def __init__(self, config: DebertaV2Config, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + 
self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaOutput with Deberta->DebertaV2 @@ -227,6 +259,7 @@ def __init__(self, config: DebertaV2Config, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = TFDebertaV2StableDropout(config.hidden_dropout_prob, name="dropout") + self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -234,6 +267,19 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.intermediate_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + if getattr(self, "dropout", None) is not None: + with tf.name_scope(self.dropout.name): + self.dropout.build(None) # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaLayer with Deberta->DebertaV2 @@ -272,6 +318,19 @@ def call( outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "bert_output", None) is not None: + with tf.name_scope(self.bert_output.name): + self.bert_output.build(None) class TFDebertaV2ConvLayer(tf.keras.layers.Layer): @@ -354,14 +413,25 @@ def __init__(self, config: DebertaV2Config, **kwargs): self.conv = TFDebertaV2ConvLayer(config, name="conv") if getattr(config, "conv_kernel_size", 0) > 0 else None - def build(self, input_shape): + def build(self, input_shape=None): if self.relative_attention: self.rel_embeddings = self.add_weight( name="rel_embeddings.weight", shape=[self.pos_ebd_size, self.config.hidden_size], initializer=get_initializer(self.config.initializer_range), ) - return super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "conv", None) is not None: + with tf.name_scope(self.conv.name): + self.conv.build(None) + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) + def get_rel_embedding(self): rel_embeddings = self.rel_embeddings if self.relative_attention else None @@ -825,7 +895,7 @@ def __init__(self, config, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = TFDebertaV2StableDropout(config.hidden_dropout_prob, name="dropout") - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): with tf.name_scope("word_embeddings"): self.weight = self.add_weight( name="weight", @@ -853,7 +923,17 @@ def build(self, input_shape: tf.TensorShape): else: self.position_embeddings = None -
super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + if getattr(self, "dropout", None) is not None: + with tf.name_scope(self.dropout.name): + self.dropout.build(None) + def call( self, @@ -929,6 +1009,7 @@ def __init__(self, config: DebertaV2Config, **kwargs): else: self.transform_act_fn = config.hidden_act self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -936,6 +1017,16 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.LayerNorm(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.embedding_size]) # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaLMPredictionHead with Deberta->DebertaV2 @@ -952,10 +1043,17 @@ def __init__(self, config: DebertaV2Config, input_embeddings: tf.keras.layers.La # an output-only bias for each token. self.input_embeddings = input_embeddings - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "transform", None) is not None: + with tf.name_scope(self.transform.name): + self.transform.build(None) + def get_output_embeddings(self) -> tf.keras.layers.Layer: return self.input_embeddings @@ -992,6 +1090,13 @@ def call(self, sequence_output: tf.Tensor) -> tf.Tensor: prediction_scores = self.predictions(hidden_states=sequence_output) return prediction_scores + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "predictions", None) is not None: + with tf.name_scope(self.predictions.name): + self.predictions.build(None) # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaMainLayer with Deberta->DebertaV2 @@ -1076,6 +1181,16 @@ def call( hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) # Copied from transformers.models.deberta.modeling_tf_deberta.TFDebertaPreTrainedModel with Deberta->DebertaV2 @@ -1218,6 +1333,13 @@ def call( ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "deberta", None) is not None: + with tf.name_scope(self.deberta.name): + self.deberta.build(None) @add_start_docstrings("""DeBERTa Model with a `language modeling` head on top.""", DEBERTA_START_DOCSTRING) @@ -1289,6 +1411,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if 
self.built: + return + self.built = True + if getattr(self, "deberta", None) is not None: + with tf.name_scope(self.deberta.name): + self.deberta.build(None) + if getattr(self, "mlm", None) is not None: + with tf.name_scope(self.mlm.name): + self.mlm.build(None) @add_start_docstrings( @@ -1372,6 +1504,22 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "deberta", None) is not None: + with tf.name_scope(self.deberta.name): + self.deberta.build(None) + if getattr(self, "pooler", None) is not None: + with tf.name_scope(self.pooler.name): + self.pooler.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.output_dim) + if getattr(self, "dropout", None) is not None: + with tf.name_scope(self.dropout.name): + self.dropout.build(None) # TODO Matt might be wrong @add_start_docstrings( @@ -1393,6 +1541,7 @@ def __init__(self, config: DebertaV2Config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1444,6 +1593,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "deberta", None) is not None: + with tf.name_scope(self.deberta.name): + self.deberta.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) @add_start_docstrings( @@ -1464,6 +1623,7 @@ def __init__(self, config: DebertaV2Config, *inputs, **kwargs): self.qa_outputs = tf.keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1530,6 +1690,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "deberta", None) is not None: + with tf.name_scope(self.deberta.name): + self.deberta.build(None) + if getattr(self, "qa_outputs", None) is not None: + with tf.name_scope(self.qa_outputs.name): + self.qa_outputs.build(self.config.hidden_size) @add_start_docstrings( @@ -1630,3 +1800,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "deberta", None) is not None: + with tf.name_scope(self.deberta.name): + self.deberta.build(None) + if getattr(self, "pooler", None) is not None: + with tf.name_scope(self.pooler.name): + self.pooler.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.output_dim) diff --git a/src/transformers/models/deit/modeling_tf_deit.py b/src/transformers/models/deit/modeling_tf_deit.py index a8ba5c9a8a6634..49379069de577f 100644 --- a/src/transformers/models/deit/modeling_tf_deit.py +++ b/src/transformers/models/deit/modeling_tf_deit.py @@ -113,7 +113,7 @@ def 
__init__(self, config: DeiTConfig, use_mask_token: bool = False, **kwargs) - self.patch_embeddings = TFDeiTPatchEmbeddings(config=config, name="patch_embeddings") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob, name="dropout") - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): self.cls_token = self.add_weight( shape=(1, 1, self.config.hidden_size), initializer=tf.keras.initializers.zeros(), @@ -141,7 +141,17 @@ def build(self, input_shape: tf.TensorShape): trainable=True, name="position_embeddings", ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "patch_embeddings", None) is not None: + with tf.name_scope(self.patch_embeddings.name): + self.patch_embeddings.build(None) + if getattr(self, "dropout", None) is not None: + with tf.name_scope(self.dropout.name): + self.dropout.build(None) + def call( self, pixel_values: tf.Tensor, bool_masked_pos: tf.Tensor | None = None, training: bool = False @@ -202,6 +212,13 @@ def call(self, pixel_values: tf.Tensor) -> tf.Tensor: batch_size, height, width, num_channels = shape_list(x) x = tf.reshape(x, (batch_size, height * width, num_channels)) return x + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "projection", None) is not None: + with tf.name_scope(self.projection.name): + self.projection.build(self.num_channels) # Copied from transformers.models.vit.modeling_tf_vit.TFViTSelfAttention with ViT->DeiT @@ -230,6 +247,7 @@ def __init__(self, config: DeiTConfig, **kwargs): units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" ) self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) + self.config = config def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] @@ -278,6 +296,19 @@ def call( outputs = (attention_output, attention_probs) if output_attentions else (attention_output,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build(self.config.hidden_size) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build(self.config.hidden_size) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build(self.config.hidden_size) # Copied from transformers.models.vit.modeling_tf_vit.TFViTSelfOutput with ViT->DeiT @@ -294,12 +325,20 @@ def __init__(self, config: DeiTConfig, **kwargs): units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dropout(inputs=hidden_states, training=training) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) # Copied from transformers.models.vit.modeling_tf_vit.TFViTAttention with ViT->DeiT @@ -329,6 +368,16 @@ def call( outputs = 
(attention_output,) + self_outputs[1:] # add attentions if we output them return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attention", None) is not None: + with tf.name_scope(self.self_attention.name): + self.self_attention.build(None) + if getattr(self, "dense_output", None) is not None: + with tf.name_scope(self.dense_output.name): + self.dense_output.build(None) # Copied from transformers.models.vit.modeling_tf_vit.TFViTIntermediate with ViT->DeiT @@ -344,12 +393,20 @@ def __init__(self, config: DeiTConfig, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) # Copied from transformers.models.vit.modeling_tf_vit.TFViTOutput with ViT->DeiT @@ -361,6 +418,7 @@ def __init__(self, config: DeiTConfig, **kwargs): units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -368,6 +426,13 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool hidden_states = hidden_states + input_tensor return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.intermediate_size) class TFDeiTLayer(tf.keras.layers.Layer): @@ -386,6 +451,7 @@ def __init__(self, config: DeiTConfig, **kwargs): self.layernorm_after = tf.keras.layers.LayerNormalization( epsilon=config.layer_norm_eps, name="layernorm_after" ) + self.config = config def call( self, @@ -418,6 +484,25 @@ def call( outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "deit_output", None) is not None: + with tf.name_scope(self.deit_output.name): + self.deit_output.build(None) + if getattr(self, "layernorm_before", None) is not None: + with tf.name_scope(self.layernorm_before.name): + self.layernorm_before.build([None, None, self.config.hidden_size]) + if getattr(self, "layernorm_after", None) is not None: + with tf.name_scope(self.layernorm_after.name): + self.layernorm_after.build([None, None, self.config.hidden_size]) # Copied from transformers.models.vit.modeling_tf_vit.TFViTEncoder with ViT->DeiT @@ -464,6 +549,14 @@ def call( return TFBaseModelOutput( last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + 
if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) @keras_serializable @@ -555,6 +648,22 @@ def call( hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "layernorm", None) is not None: + with tf.name_scope(self.layernorm.name): + self.layernorm.build([None, None, self.config.hidden_size]) + if getattr(self, "pooler", None) is not None: + with tf.name_scope(self.pooler.name): + self.pooler.build(None) # Copied from transformers.models.vit.modeling_tf_vit.TFViTPreTrainedModel with ViT->DeiT all-casing @@ -646,6 +755,13 @@ def call( training=training, ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "deit", None) is not None: + with tf.name_scope(self.deit.name): + self.deit.build(None) # Copied from transformers.models.vit.modeling_tf_vit.TFViTPooler with ViT->DeiT @@ -659,6 +775,7 @@ def __init__(self, config: DeiTConfig, **kwargs): activation="tanh", name="dense", ) + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: # We "pool" the model by simply taking the hidden state corresponding @@ -667,6 +784,13 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: pooled_output = self.dense(inputs=first_token_tensor) return pooled_output + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) class TFDeitPixelShuffle(tf.keras.layers.Layer): @@ -821,6 +945,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "deit", None) is not None: + with tf.name_scope(self.deit.name): + self.deit.build(None) + if getattr(self, "decoder", None) is not None: + with tf.name_scope(self.decoder.name): + self.decoder.build(None) @add_start_docstrings( @@ -918,6 +1052,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "deit", None) is not None: + with tf.name_scope(self.deit.name): + self.deit.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(None) @add_start_docstrings( @@ -998,3 +1142,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "deit", None) is not None: + with tf.name_scope(self.deit.name): + self.deit.build(None) + if getattr(self, "cls_classifier", None) is not None: + with tf.name_scope(self.cls_classifier.name): + self.cls_classifier.build(None) + if getattr(self, "distillation_classifier", None) is not None: + with tf.name_scope(self.distillation_classifier.name): + self.distillation_classifier.build(None) diff --git a/src/transformers/models/distilbert/modeling_tf_distilbert.py 
b/src/transformers/models/distilbert/modeling_tf_distilbert.py index 6b0e1b0f3febcf..537a166725d8d3 100644 --- a/src/transformers/models/distilbert/modeling_tf_distilbert.py +++ b/src/transformers/models/distilbert/modeling_tf_distilbert.py @@ -84,7 +84,7 @@ def __init__(self, config, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.dropout) - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): with tf.name_scope("word_embeddings"): self.weight = self.add_weight( name="weight", @@ -99,7 +99,14 @@ def build(self, input_shape: tf.TensorShape): initializer=get_initializer(initializer_range=self.initializer_range), ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.dim]) + def call(self, input_ids=None, position_ids=None, inputs_embeds=None, training=False): """ @@ -152,6 +159,7 @@ def __init__(self, config, **kwargs): ) self.pruned_heads = set() + self.config = config def prune_heads(self, heads): raise NotImplementedError @@ -211,6 +219,22 @@ def unshape(x): return (context, weights) else: return (context,) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "q_lin", None) is not None: + with tf.name_scope(self.q_lin.name): + self.q_lin.build(self.config.dim) + if getattr(self, "k_lin", None) is not None: + with tf.name_scope(self.k_lin.name): + self.k_lin.build(self.config.dim) + if getattr(self, "v_lin", None) is not None: + with tf.name_scope(self.v_lin.name): + self.v_lin.build(self.config.dim) + if getattr(self, "out_lin", None) is not None: + with tf.name_scope(self.out_lin.name): + self.out_lin.build(self.config.dim) class TFFFN(tf.keras.layers.Layer): @@ -224,6 +248,7 @@ def __init__(self, config, **kwargs): config.dim, kernel_initializer=get_initializer(config.initializer_range), name="lin2" ) self.activation = get_tf_activation(config.activation) + self.config = config def call(self, input, training=False): x = self.lin1(input) @@ -231,6 +256,16 @@ def call(self, input, training=False): x = self.lin2(x) x = self.dropout(x, training=training) return x + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "lin1", None) is not None: + with tf.name_scope(self.lin1.name): + self.lin1.build(self.config.dim) + if getattr(self, "lin2", None) is not None: + with tf.name_scope(self.lin2.name): + self.lin2.build(self.config.hidden_dim) class TFTransformerBlock(tf.keras.layers.Layer): @@ -253,6 +288,7 @@ def __init__(self, config, **kwargs): self.ffn = TFFFN(config, name="ffn") self.output_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="output_layer_norm") + self.config = config def call(self, x, attn_mask, head_mask, output_attentions, training=False): # removed: src_enc=None, src_len=None """ @@ -280,6 +316,22 @@ def call(self, x, attn_mask, head_mask, output_attentions, training=False): # r if output_attentions: output = (sa_weights,) + output return output + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "sa_layer_norm", None) is not None: + with tf.name_scope(self.sa_layer_norm.name): + 
self.sa_layer_norm.build([None, None, self.config.dim]) + if getattr(self, "ffn", None) is not None: + with tf.name_scope(self.ffn.name): + self.ffn.build(None) + if getattr(self, "output_layer_norm", None) is not None: + with tf.name_scope(self.output_layer_norm.name): + self.output_layer_norm.build([None, None, self.config.dim]) class TFTransformer(tf.keras.layers.Layer): @@ -335,6 +387,14 @@ def call(self, x, attn_mask, head_mask, output_attentions, output_hidden_states, return TFBaseModelOutput( last_hidden_state=hidden_state, hidden_states=all_hidden_states, attentions=all_attentions ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) @keras_serializable @@ -411,6 +471,16 @@ def call( ) return tfmr_output # last-layer hidden-state, (all hidden_states), (all attentions) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) # INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL # @@ -547,6 +617,13 @@ def call( training=training, ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "distilbert", None) is not None: + with tf.name_scope(self.distilbert.name): + self.distilbert.build(None) class TFDistilBertLMHead(tf.keras.layers.Layer): @@ -666,6 +743,22 @@ def call( hidden_states=distilbert_output.hidden_states, attentions=distilbert_output.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "distilbert", None) is not None: + with tf.name_scope(self.distilbert.name): + self.distilbert.build(None) + if getattr(self, "vocab_transform", None) is not None: + with tf.name_scope(self.vocab_transform.name): + self.vocab_transform.build(self.config.dim) + if getattr(self, "vocab_layer_norm", None) is not None: + with tf.name_scope(self.vocab_layer_norm.name): + self.vocab_layer_norm.build([None, None, self.config.dim]) + if getattr(self, "vocab_projector", None) is not None: + with tf.name_scope(self.vocab_projector.name): + self.vocab_projector.build(None) @add_start_docstrings( @@ -691,6 +784,7 @@ def __init__(self, config, *inputs, **kwargs): config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -745,6 +839,19 @@ def call( hidden_states=distilbert_output.hidden_states, attentions=distilbert_output.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "distilbert", None) is not None: + with tf.name_scope(self.distilbert.name): + self.distilbert.build(None) + if getattr(self, "pre_classifier", None) is not None: + with tf.name_scope(self.pre_classifier.name): + self.pre_classifier.build(self.config.dim) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.dim) @add_start_docstrings( @@ -764,6 +871,7 @@ def __init__(self, config, *inputs, 
**kwargs): self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -813,6 +921,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "distilbert", None) is not None: + with tf.name_scope(self.distilbert.name): + self.distilbert.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) @add_start_docstrings( @@ -837,6 +955,7 @@ def __init__(self, config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward( @@ -907,6 +1026,19 @@ def call( hidden_states=distilbert_output.hidden_states, attentions=distilbert_output.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "distilbert", None) is not None: + with tf.name_scope(self.distilbert.name): + self.distilbert.build(None) + if getattr(self, "pre_classifier", None) is not None: + with tf.name_scope(self.pre_classifier.name): + self.pre_classifier.build(self.config.dim) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.dim) @add_start_docstrings( @@ -926,6 +1058,7 @@ def __init__(self, config, *inputs, **kwargs): ) assert config.num_labels == 2, f"Incorrect number of labels {config.num_labels} instead of 2" self.dropout = tf.keras.layers.Dropout(config.qa_dropout) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -991,3 +1124,13 @@ def call( hidden_states=distilbert_output.hidden_states, attentions=distilbert_output.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "distilbert", None) is not None: + with tf.name_scope(self.distilbert.name): + self.distilbert.build(None) + if getattr(self, "qa_outputs", None) is not None: + with tf.name_scope(self.qa_outputs.name): + self.qa_outputs.build(self.config.dim) diff --git a/src/transformers/models/dpr/modeling_tf_dpr.py b/src/transformers/models/dpr/modeling_tf_dpr.py index 53efa41fda5dee..a533c89cdfca37 100644 --- a/src/transformers/models/dpr/modeling_tf_dpr.py +++ b/src/transformers/models/dpr/modeling_tf_dpr.py @@ -598,6 +598,13 @@ def call( return TFDPRContextEncoderOutput( pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, attentions=outputs.attentions ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "ctx_encoder", None) is not None: + with tf.name_scope(self.ctx_encoder.name): + self.ctx_encoder.build(None) @add_start_docstrings( @@ -678,6 +685,13 @@ def call( return TFDPRQuestionEncoderOutput( pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, attentions=outputs.attentions ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "question_encoder", None) is not None: + with tf.name_scope(self.question_encoder.name): + 
self.question_encoder.build(None) @add_start_docstrings( @@ -752,3 +766,10 @@ def call( return_dict=return_dict, training=training, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "span_predictor", None) is not None: + with tf.name_scope(self.span_predictor.name): + self.span_predictor.build(None) diff --git a/src/transformers/models/efficientformer/modeling_tf_efficientformer.py b/src/transformers/models/efficientformer/modeling_tf_efficientformer.py index c44a1534287407..7a8de49703fc04 100644 --- a/src/transformers/models/efficientformer/modeling_tf_efficientformer.py +++ b/src/transformers/models/efficientformer/modeling_tf_efficientformer.py @@ -99,6 +99,17 @@ def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor: embeddings = self.projection(self.padding(pixel_values)) embeddings = self.norm(embeddings, training=training) return embeddings + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "projection", None) is not None: + with tf.name_scope(self.projection.name): + self.projection.build( + self.num_channels) + if getattr(self, "norm", None) is not None: + with tf.name_scope(self.norm.name): + self.norm.build(None) class TFEfficientFormerSelfAttention(tf.keras.layers.Layer): @@ -130,6 +141,7 @@ def __init__( units=dim, kernel_initializer=get_initializer(config.initializer_range), name="projection" ) self.resolution = resolution + self.dim = dim def build(self, input_shape: tf.TensorShape) -> None: points = list(itertools.product(range(self.resolution), range(self.resolution))) @@ -160,7 +172,17 @@ def build(self, input_shape: tf.TensorShape) -> None: self.attention_bias_idxs.assign(tf.reshape(tf.cast(idxs, dtype=tf.int32), (num_points, num_points))) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "qkv", None) is not None: + with tf.name_scope(self.qkv.name): + self.qkv.build(self.dim) + if getattr(self, "projection", None) is not None: + with tf.name_scope(self.projection.name): + self.projection.build(self.total_expanded_key_dim) + def call( self, hidden_states: tf.Tensor, output_attentions: bool = False, training: bool = False @@ -225,6 +247,8 @@ def __init__(self, config: EfficientFormerConfig, out_channels: int, **kwargs): ) self.activation = tf.keras.layers.Activation(activation=tf.keras.activations.relu, name="activation") + self.out_channels = out_channels + self.config = config def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor: features = self.batchnorm_before(self.convolution1(self.padding(pixel_values)), training=training) @@ -232,6 +256,25 @@ def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor: features = self.batchnorm_after(self.convolution2(self.padding(features)), training=training) features = self.activation(features) return features + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "convolution1", None) is not None: + with tf.name_scope(self.convolution1.name): + self.convolution1.build(self.config.num_channels) + if getattr(self, "batchnorm_before", None) is not None: + with tf.name_scope(self.batchnorm_before.name): + self.batchnorm_before.build(None) + if getattr(self, "convolution2", None) is not None: + with tf.name_scope(self.convolution2.name): + self.convolution2.build(self.out_channels // 2) + if getattr(self, "batchnorm_after", None) is not None: + with 
tf.name_scope(self.batchnorm_after.name): + self.batchnorm_after.build(None) + if getattr(self, "activation", None) is not None: + with tf.name_scope(self.activation.name): + self.activation.build(None) class TFEfficientFormerPooling(tf.keras.layers.Layer): @@ -267,6 +310,8 @@ def __init__( self.linear_out = tf.keras.layers.Dense( units=out_features, kernel_initializer=get_initializer(config.initializer_range), name="linear_out" ) + self.hidden_features = hidden_features + self.in_features = in_features def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.linear_in(inputs=hidden_states) @@ -276,6 +321,16 @@ def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dropout(inputs=hidden_states, training=training) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "linear_in", None) is not None: + with tf.name_scope(self.linear_in.name): + self.linear_in.build(self.in_features) + if getattr(self, "linear_out", None) is not None: + with tf.name_scope(self.linear_out.name): + self.linear_out.build(self.hidden_features) class TFEfficientFormerConvMlp(tf.keras.layers.Layer): @@ -318,6 +373,8 @@ def __init__( self.batchnorm_after = tf.keras.layers.BatchNormalization( axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="batchnorm_after" ) + self.hidden_features = hidden_features + self.in_features = in_features def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_state = self.convolution1(hidden_state) @@ -328,6 +385,22 @@ def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_state = self.batchnorm_after(hidden_state, training=training) hidden_state = self.dropout(hidden_state, training=training) return hidden_state + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "convolution1", None) is not None: + with tf.name_scope(self.convolution1.name): + self.convolution1.build(self.in_features) + if getattr(self, "convolution2", None) is not None: + with tf.name_scope(self.convolution2.name): + self.convolution2.build(self.hidden_features) + if getattr(self, "batchnorm_before", None) is not None: + with tf.name_scope(self.batchnorm_before.name): + self.batchnorm_before.build(None) + if getattr(self, "batchnorm_after", None) is not None: + with tf.name_scope(self.batchnorm_after.name): + self.batchnorm_after.build(None) # Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextDropPath with ConvNext->EfficientFormer @@ -390,7 +463,7 @@ def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0 ) self.config = config - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): self.layer_scale_1 = None self.layer_scale_2 = None @@ -407,7 +480,26 @@ def build(self, input_shape: tf.TensorShape): trainable=True, name="layer_scale_2", ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "token_mixer", None) is not None: + with tf.name_scope(self.token_mixer.name): + self.token_mixer.build(None) + if getattr(self, "layernorm1", None) is not None: + with tf.name_scope(self.layernorm1.name): + self.layernorm1.build([None, None, self.dim]) + if getattr(self, "layernorm2", None) is not None: + with tf.name_scope(self.layernorm2.name): + self.layernorm2.build([None, None, self.dim]) + if getattr(self, "mlp", None) is not None: + with 
tf.name_scope(self.mlp.name): + self.mlp.build(None) + if getattr(self, "drop_path", None) is not None: + with tf.name_scope(self.drop_path.name): + self.drop_path.build(None) + def call( self, hidden_states: tf.Tensor, output_attentions: bool = False, training: bool = False @@ -475,6 +567,14 @@ def call( return outputs return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "blocks", None) is not None: + for layer in self.blocks: + with tf.name_scope(layer.name): + layer.build(None) class TFEfficientFormerMeta4D(tf.keras.layers.Layer): @@ -495,7 +595,7 @@ def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0 ) self.config = config - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): self.layer_scale_1 = None self.layer_scale_2 = None @@ -512,7 +612,20 @@ def build(self, input_shape: tf.TensorShape): trainable=True, name="layer_scale_2", ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "token_mixer", None) is not None: + with tf.name_scope(self.token_mixer.name): + self.token_mixer.build(None) + if getattr(self, "mlp", None) is not None: + with tf.name_scope(self.mlp.name): + self.mlp.build(None) + if getattr(self, "drop_path", None) is not None: + with tf.name_scope(self.drop_path.name): + self.drop_path.build(None) + def call(self, hidden_states: tf.Tensor, training: bool = False) -> Tuple[tf.Tensor]: outputs = self.token_mixer(hidden_states) @@ -559,6 +672,14 @@ def call(self, hidden_states: tf.Tensor, training: bool = False) -> Tuple[tf.Ten for layer_module in self.blocks: hidden_states = layer_module(hidden_states=hidden_states, training=training) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "blocks", None) is not None: + for layer in self.blocks: + with tf.name_scope(layer.name): + layer.build(None) class TFEfficientFormerIntermediateStage(tf.keras.layers.Layer): @@ -569,6 +690,13 @@ def __init__(self, config: EfficientFormerConfig, index: int, **kwargs): def call(self, hidden_states: tf.Tensor, training: bool = False) -> Tuple[tf.Tensor]: hidden_states = self.meta4D_layers(hidden_states=hidden_states, training=training) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "meta4D_layers", None) is not None: + with tf.name_scope(self.meta4D_layers.name): + self.meta4D_layers.build(None) class TFEfficientFormerLastStage(tf.keras.layers.Layer): @@ -588,6 +716,19 @@ def call( ) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "meta4D_layers", None) is not None: + with tf.name_scope(self.meta4D_layers.name): + self.meta4D_layers.build(None) + if getattr(self, "flat", None) is not None: + with tf.name_scope(self.flat.name): + self.flat.build(None) + if getattr(self, "meta3D_layers", None) is not None: + with tf.name_scope(self.meta3D_layers.name): + self.meta3D_layers.build(None) class TFEfficientFormerEncoder(tf.keras.layers.Layer): @@ -657,6 +798,13 @@ def call( hidden_states=all_hidden_states, attentions=all_self_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "last_stage", None) is not None: + with tf.name_scope(self.last_stage.name): + self.last_stage.build(None) @keras_serializable @@ -727,6 +875,19 @@ def call( 
hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "patch_embed", None) is not None: + with tf.name_scope(self.patch_embed.name): + self.patch_embed.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "layernorm", None) is not None: + with tf.name_scope(self.layernorm.name): + self.layernorm.build([None, None, self.config.hidden_sizes[-1]]) class TFEfficientFormerPreTrainedModel(TFPreTrainedModel): @@ -803,6 +964,13 @@ def call( training=training, ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "efficientformer", None) is not None: + with tf.name_scope(self.efficientformer.name): + self.efficientformer.build(None) @add_start_docstrings( @@ -872,6 +1040,16 @@ def call( return TFImageClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "efficientformer", None) is not None: + with tf.name_scope(self.efficientformer.name): + self.efficientformer.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(None) @dataclass @@ -984,3 +1162,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "efficientformer", None) is not None: + with tf.name_scope(self.efficientformer.name): + self.efficientformer.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(None) + if getattr(self, "distillation_classifier", None) is not None: + with tf.name_scope(self.distillation_classifier.name): + self.distillation_classifier.build(None) diff --git a/src/transformers/models/electra/modeling_tf_electra.py b/src/transformers/models/electra/modeling_tf_electra.py index 41c64eed369d6a..3de7db1e75ec73 100644 --- a/src/transformers/models/electra/modeling_tf_electra.py +++ b/src/transformers/models/electra/modeling_tf_electra.py @@ -103,6 +103,7 @@ def __init__(self, config: ElectraConfig, **kwargs): self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.is_decoder = config.is_decoder + self.config = config def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] @@ -191,6 +192,19 @@ def call( if self.is_decoder: outputs = outputs + (past_key_value,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build(self.config.hidden_size) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build(self.config.hidden_size) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build(self.config.hidden_size) # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Electra @@ -203,6 +217,7 @@ def __init__(self, config: 
ElectraConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -210,6 +225,16 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Electra @@ -251,6 +276,16 @@ def call( outputs = (attention_output,) + self_outputs[1:] return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attention", None) is not None: + with tf.name_scope(self.self_attention.name): + self.self_attention.build(None) + if getattr(self, "dense_output", None) is not None: + with tf.name_scope(self.dense_output.name): + self.dense_output.build(None) # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Electra @@ -266,12 +301,20 @@ def __init__(self, config: ElectraConfig, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Electra @@ -284,6 +327,7 @@ def __init__(self, config: ElectraConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -291,6 +335,16 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.intermediate_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Electra @@ -378,6 +432,19 @@ def call( outputs = outputs + (present_key_value,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is 
not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "bert_output", None) is not None: + with tf.name_scope(self.bert_output.name): + self.bert_output.build(None) # Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Electra @@ -448,6 +515,14 @@ def call( attentions=all_attentions, cross_attentions=all_cross_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Electra @@ -485,7 +560,7 @@ def __init__(self, config: ElectraConfig, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): with tf.name_scope("word_embeddings"): self.weight = self.add_weight( name="weight", @@ -507,7 +582,14 @@ def build(self, input_shape: tf.TensorShape): initializer=get_initializer(self.initializer_range), ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.embedding_size]) + # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call def call( @@ -565,6 +647,16 @@ def call(self, discriminator_hidden_states, training=False): logits = tf.squeeze(self.dense_prediction(hidden_states), -1) return logits + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "dense_prediction", None) is not None: + with tf.name_scope(self.dense_prediction.name): + self.dense_prediction.build(self.config.hidden_size) class TFElectraGeneratorPredictions(tf.keras.layers.Layer): @@ -573,6 +665,7 @@ def __init__(self, config, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dense = tf.keras.layers.Dense(config.embedding_size, name="dense") + self.config = config def call(self, generator_hidden_states, training=False): hidden_states = self.dense(generator_hidden_states) @@ -580,6 +673,16 @@ def call(self, generator_hidden_states, training=False): hidden_states = self.LayerNorm(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.embedding_size]) + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) class TFElectraPreTrainedModel(TFPreTrainedModel): @@ -780,6 +883,16 @@ def call( ) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with 
tf.name_scope(self.encoder.name): + self.encoder.build(None) @dataclass @@ -976,6 +1089,13 @@ def call( ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "electra", None) is not None: + with tf.name_scope(self.electra.name): + self.electra.build(None) @add_start_docstrings( @@ -1048,6 +1168,16 @@ def call( hidden_states=discriminator_hidden_states.hidden_states, attentions=discriminator_hidden_states.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "electra", None) is not None: + with tf.name_scope(self.electra.name): + self.electra.build(None) + if getattr(self, "discriminator_predictions", None) is not None: + with tf.name_scope(self.discriminator_predictions.name): + self.discriminator_predictions.build(None) class TFElectraMaskedLMHead(tf.keras.layers.Layer): @@ -1176,6 +1306,19 @@ def call( hidden_states=generator_hidden_states.hidden_states, attentions=generator_hidden_states.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "electra", None) is not None: + with tf.name_scope(self.electra.name): + self.electra.build(None) + if getattr(self, "generator_predictions", None) is not None: + with tf.name_scope(self.generator_predictions.name): + self.generator_predictions.build(None) + if getattr(self, "generator_lm_head", None) is not None: + with tf.name_scope(self.generator_lm_head.name): + self.generator_lm_head.build(None) class TFElectraClassificationHead(tf.keras.layers.Layer): @@ -1196,6 +1339,7 @@ def __init__(self, config, **kwargs): self.out_proj = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" ) + self.config = config def call(self, inputs, **kwargs): x = inputs[:, 0, :] # take token (equiv. 
to [CLS]) @@ -1206,6 +1350,16 @@ def call(self, inputs, **kwargs): x = self.out_proj(x) return x + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build(self.config.hidden_size) @add_start_docstrings( @@ -1277,6 +1431,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "electra", None) is not None: + with tf.name_scope(self.electra.name): + self.electra.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(None) @add_start_docstrings( @@ -1297,6 +1461,7 @@ def __init__(self, config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @@ -1369,6 +1534,19 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "electra", None) is not None: + with tf.name_scope(self.electra.name): + self.electra.build(None) + if getattr(self, "sequence_summary", None) is not None: + with tf.name_scope(self.sequence_summary.name): + self.sequence_summary.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) @add_start_docstrings( @@ -1391,6 +1569,7 @@ def __init__(self, config, **kwargs): self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1447,6 +1626,16 @@ def call( hidden_states=discriminator_hidden_states.hidden_states, attentions=discriminator_hidden_states.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "electra", None) is not None: + with tf.name_scope(self.electra.name): + self.electra.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) @add_start_docstrings( @@ -1465,6 +1654,7 @@ def __init__(self, config, *inputs, **kwargs): self.qa_outputs = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1541,3 +1731,13 @@ def call( hidden_states=discriminator_hidden_states.hidden_states, attentions=discriminator_hidden_states.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "electra", None) is not None: + with tf.name_scope(self.electra.name): + self.electra.build(None) + if getattr(self, "qa_outputs", None) is not None: + with tf.name_scope(self.qa_outputs.name): + 
self.qa_outputs.build(self.config.hidden_size) diff --git a/src/transformers/models/esm/modeling_tf_esm.py b/src/transformers/models/esm/modeling_tf_esm.py index 3e9223087ba9fc..9b912aaa09d2a0 100644 --- a/src/transformers/models/esm/modeling_tf_esm.py +++ b/src/transformers/models/esm/modeling_tf_esm.py @@ -149,10 +149,17 @@ def __init__( self.in_features = in_features self.regression = Dense(1, use_bias=bias, activation="sigmoid", name="regression") - def build(self, input_shape): - super().build(input_shape) + def build(self, input_shape=None): + with tf.name_scope("regression"): self.regression.build((None, self.in_features)) + if self.built: + return + self.built = True + if getattr(self, "regression", None) is not None: + with tf.name_scope(self.regression.name): + self.regression.build(self.in_features) + def call(self, tokens, attentions): # remove eos token attentions @@ -267,6 +274,16 @@ def create_position_ids_from_inputs_embeds(self, inputs_embeds): start=self.padding_idx + 1, limit=sequence_length + self.padding_idx + 1, dtype=tf.int64 ) return tf.broadcast_to(tf.expand_dims(position_ids, 0), input_shape) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "word_embeddings", None) is not None: + with tf.name_scope(self.word_embeddings.name): + self.word_embeddings.build(None) + if getattr(self, "position_embeddings", None) is not None: + with tf.name_scope(self.position_embeddings.name): + self.position_embeddings.build(None) class TFEsmSelfAttention(Layer): @@ -306,6 +323,7 @@ def __init__(self, config, position_embedding_type=None, name=None): self.rotary_embeddings = TFRotaryEmbedding(dim=self.attention_head_size, name="rotary_embeddings") self.is_decoder = config.is_decoder + self.config = config def transpose_for_scores(self, x: tf.Tensor) -> tf.Tensor: new_x_shape = shape_list(x)[:-1] + [self.num_attention_heads, self.attention_head_size] @@ -414,6 +432,19 @@ def call( if self.is_decoder: outputs = outputs + (past_key_value,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build(self.config.hidden_size) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build(self.config.hidden_size) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build(self.config.hidden_size) class TFEsmSelfOutput(Layer): @@ -423,12 +454,20 @@ def __init__(self, config, name=None): config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.dropout = Dropout(config.hidden_dropout_prob) + self.config = config def call(self, hidden_states, input_tensor, training=False): hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states, training=training) hidden_states += input_tensor return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) class TFEsmAttention(Layer): @@ -438,6 +477,7 @@ def __init__(self, config, name=None): self.output_layer = TFEsmSelfOutput(config, name="output") self.pruned_heads = set() self.LayerNorm = LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.config = config def prune_heads(self, heads): raise NotImplementedError @@ -467,6 
+507,19 @@ def call( attention_output = self.output_layer(self_outputs[0], hidden_states) outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self", None) is not None: + with tf.name_scope(self.self.name): + self.self.build(None) + if getattr(self, "output_layer", None) is not None: + with tf.name_scope(self.output_layer.name): + self.output_layer.build(None) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) class TFEsmIntermediate(tf.keras.layers.Layer): @@ -478,11 +531,19 @@ def __init__(self, config: EsmConfig, **kwargs): kernel_initializer=get_initializer(config.initializer_range), name="dense", ) + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = tf.nn.gelu(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) class TFEsmOutput(Layer): @@ -492,12 +553,20 @@ def __init__(self, config, name=None): config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.dropout = Dropout(config.hidden_dropout_prob) + self.config = config def call(self, hidden_states, input_tensor, training=False): hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states, training=training) hidden_states += input_tensor return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.intermediate_size) class TFEsmLayer(Layer): @@ -515,6 +584,7 @@ def __init__(self, config, name=None): self.intermediate = TFEsmIntermediate(config, name="intermediate") self.output_layer = TFEsmOutput(config, name="output") self.LayerNorm = LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.config = config def call( self, @@ -585,6 +655,22 @@ def call( outputs = outputs + (present_key_value,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "output_layer", None) is not None: + with tf.name_scope(self.output_layer.name): + self.output_layer.build(None) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) class TFEsmEncoder(Layer): @@ -664,6 +750,17 @@ def call( attentions=all_self_attentions, cross_attentions=all_cross_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "emb_layer_norm_after", None) is not None: + with tf.name_scope(self.emb_layer_norm_after.name): + self.emb_layer_norm_after.build([None, None, self.config.hidden_size]) + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) # Copied from 
transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Esm @@ -677,6 +774,7 @@ def __init__(self, config: EsmConfig, **kwargs): activation="tanh", name="dense", ) + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: # We "pool" the model by simply taking the hidden state corresponding @@ -685,6 +783,13 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: pooled_output = self.dense(inputs=first_token_tensor) return pooled_output + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) class TFEsmPreTrainedModel(TFPreTrainedModel): @@ -787,10 +892,26 @@ def __init__(self, config, add_pooling_layer=True, name=None, **kwargs): in_features=self.config.num_hidden_layers * self.config.num_attention_heads, bias=True, name="contact_head" ) - def build(self, input_shape): - super().build(input_shape) + def build(self, input_shape=None): + with tf.name_scope("contact_head"): self.contact_head.build(input_shape) + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "pooler", None) is not None: + with tf.name_scope(self.pooler.name): + self.pooler.build(None) + if getattr(self, "contact_head", None) is not None: + with tf.name_scope(self.contact_head.name): + self.contact_head.build(None) + def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -1040,6 +1161,13 @@ def call( def predict_contacts(self, tokens, attention_mask): return self.esm.predict_contacts(tokens, attention_mask) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "esm", None) is not None: + with tf.name_scope(self.esm.name): + self.esm.build(None) @add_start_docstrings("""ESM Model with a `language modeling` head on top.""", ESM_START_DOCSTRING) @@ -1139,6 +1267,16 @@ def call( def predict_contacts(self, tokens, attention_mask): return self.esm.predict_contacts(tokens, attention_mask) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "esm", None) is not None: + with tf.name_scope(self.esm.name): + self.esm.build(None) + if getattr(self, "lm_head", None) is not None: + with tf.name_scope(self.lm_head.name): + self.lm_head.build(None) class TFEsmLMHead(Layer): @@ -1162,11 +1300,21 @@ def __init__(self, config, name=None): ) self.config = config - def build(self, input_shape): - super().build(input_shape) + def build(self, input_shape=None): + # Separate bias to match the PT model and allow weight cross-loading to work # Put it in the build so it gets the right name when adding it as a weight self.bias = self.add_weight("bias", shape=(self.config.vocab_size,), initializer="zeros", trainable=True) + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.hidden_size]) + def get_bias(self): return {"bias": self.bias} @@ -1256,6 +1404,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + 
def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "esm", None) is not None: + with tf.name_scope(self.esm.name): + self.esm.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(None) @add_start_docstrings( @@ -1276,6 +1434,7 @@ def __init__(self, config): self.esm = TFEsmMainLayer(config, add_pooling_layer=False, name="esm") self.dropout = Dropout(config.hidden_dropout_prob) self.classifier = Dense(config.num_labels, name="classifier") + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(ESM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1332,6 +1491,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "esm", None) is not None: + with tf.name_scope(self.esm.name): + self.esm.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) class TFEsmClassificationHead(Layer): @@ -1352,6 +1521,7 @@ def __init__(self, config, name=None): activation="linear", name="out_proj", ) + self.config = config def call(self, features, training=False): x = features[:, 0, :] # take token (equiv. to [CLS]) @@ -1360,6 +1530,16 @@ def call(self, features, training=False): x = self.dropout(x, training=training) x = self.out_proj(x) return x + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build(self.config.hidden_size) def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0): diff --git a/src/transformers/models/flaubert/modeling_tf_flaubert.py b/src/transformers/models/flaubert/modeling_tf_flaubert.py index 375e19360f2a43..df21bce66afb8f 100644 --- a/src/transformers/models/flaubert/modeling_tf_flaubert.py +++ b/src/transformers/models/flaubert/modeling_tf_flaubert.py @@ -289,6 +289,13 @@ def call( ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) # Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMMultiHeadAttention with XLM->Flaubert @@ -454,7 +461,7 @@ def __init__(self, config, **kwargs): tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name=f"layer_norm2_._{i}") ) - def build(self, input_shape): + def build(self, input_shape=None): with tf.name_scope("position_embeddings"): self.position_embeddings = self.add_weight( name="embeddings", @@ -470,7 +477,17 @@ def build(self, input_shape): initializer=get_initializer(self.embed_init_std), ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "layer_norm_emb", None) is not None: + with tf.name_scope(self.layer_norm_emb.name): + self.layer_norm_emb.build([None, None, self.dim]) + def get_input_embeddings(self): return self.embeddings @@ -840,6 +857,16 @@ def call( return TFFlaubertWithLMHeadModelOutput( 
logits=outputs, hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + if getattr(self, "pred_layer", None) is not None: + with tf.name_scope(self.pred_layer.name): + self.pred_layer.build(None) # TODO Matt might be wrong @add_start_docstrings( @@ -919,6 +946,16 @@ def call( hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + if getattr(self, "sequence_summary", None) is not None: + with tf.name_scope(self.sequence_summary.name): + self.sequence_summary.build(None) @add_start_docstrings( @@ -936,6 +973,7 @@ def __init__(self, config, *inputs, **kwargs): self.qa_outputs = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.init_std), name="qa_outputs" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1011,6 +1049,16 @@ def call( hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + if getattr(self, "qa_outputs", None) is not None: + with tf.name_scope(self.qa_outputs.name): + self.qa_outputs.build(self.config.hidden_size) @add_start_docstrings( @@ -1031,6 +1079,7 @@ def __init__(self, config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.init_std), name="classifier" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1092,6 +1141,16 @@ def call( hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) @add_start_docstrings( @@ -1111,6 +1170,7 @@ def __init__(self, config, *inputs, **kwargs): self.logits_proj = tf.keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj" ) + self.config = config @property def dummy_inputs(self): @@ -1214,3 +1274,16 @@ def call( hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + if getattr(self, "sequence_summary", None) is not None: + with tf.name_scope(self.sequence_summary.name): + self.sequence_summary.build(None) + if getattr(self, "logits_proj", None) is not None: + with tf.name_scope(self.logits_proj.name): + 
self.logits_proj.build(self.config.num_labels) diff --git a/src/transformers/models/funnel/modeling_tf_funnel.py b/src/transformers/models/funnel/modeling_tf_funnel.py index ccd07b5954b78d..bec03b922b75bb 100644 --- a/src/transformers/models/funnel/modeling_tf_funnel.py +++ b/src/transformers/models/funnel/modeling_tf_funnel.py @@ -90,7 +90,7 @@ def __init__(self, config, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout) - def build(self, input_shape): + def build(self, input_shape=None): with tf.name_scope("word_embeddings"): self.weight = self.add_weight( name="weight", @@ -98,7 +98,14 @@ def build(self, input_shape): initializer=get_initializer(initializer_range=self.initializer_std), ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.d_model]) + def call(self, input_ids=None, inputs_embeds=None, training=False): """ @@ -407,7 +414,7 @@ def __init__(self, config, block_index, **kwargs): self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") self.scale = 1.0 / (d_head**0.5) - def build(self, input_shape): + def build(self, input_shape=None): n_head, d_head, d_model = self.n_head, self.d_head, self.d_model initializer = get_initializer(self.initializer_range) @@ -426,7 +433,26 @@ def build(self, input_shape): self.seg_embed = self.add_weight( shape=(2, n_head, d_head), initializer=initializer, trainable=True, name="seg_embed" ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "q_head", None) is not None: + with tf.name_scope(self.q_head.name): + self.q_head.build(d_model) + if getattr(self, "k_head", None) is not None: + with tf.name_scope(self.k_head.name): + self.k_head.build(d_model) + if getattr(self, "v_head", None) is not None: + with tf.name_scope(self.v_head.name): + self.v_head.build(d_model) + if getattr(self, "post_proj", None) is not None: + with tf.name_scope(self.post_proj.name): + self.post_proj.build(n_head * d_head) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, d_model]) + def relative_positional_attention(self, position_embeds, q_head, context_len, cls_mask=None): """Relative attention score for the positional encodings""" @@ -557,6 +583,7 @@ def __init__(self, config, **kwargs): self.linear_2 = tf.keras.layers.Dense(config.d_model, kernel_initializer=initializer, name="linear_2") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout) self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") + self.config = config def call(self, hidden, training=False): h = self.linear_1(hidden) @@ -565,6 +592,19 @@ def call(self, hidden, training=False): h = self.linear_2(h) h = self.dropout(h, training=training) return self.layer_norm(hidden + h) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "linear_1", None) is not None: + with tf.name_scope(self.linear_1.name): + self.linear_1.build(self.config.d_model) + if getattr(self, "linear_2", None) is not None: + with tf.name_scope(self.linear_2.name): + self.linear_2.build(self.config.d_inner) + if getattr(self, "layer_norm", None) is not None: + with 
tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.d_model]) class TFFunnelLayer(tf.keras.layers.Layer): @@ -579,6 +619,16 @@ def call(self, query, key, value, attention_inputs, output_attentions=False, tra ) output = self.ffn(attn[0], training=training) return (output, attn[1]) if output_attentions else (output,) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "ffn", None) is not None: + with tf.name_scope(self.ffn.name): + self.ffn.build(None) class TFFunnelEncoder(tf.keras.layers.Layer): @@ -649,6 +699,14 @@ def call( if not return_dict: return tuple(v for v in [hidden, all_hidden_states, all_attentions] if v is not None) return TFBaseModelOutput(last_hidden_state=hidden, hidden_states=all_hidden_states, attentions=all_attentions) + def build(self, input_shape=None): + if self.built: + return + self.built = True + for block in self.blocks: + for layer in block: + with tf.name_scope(layer.name): + layer.build(None) def upsample(x, stride, target_len, separate_cls=True, truncate_seq=False): @@ -724,6 +782,14 @@ def call( if not return_dict: return tuple(v for v in [hidden, all_hidden_states, all_attentions] if v is not None) return TFBaseModelOutput(last_hidden_state=hidden, hidden_states=all_hidden_states, attentions=all_attentions) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) @keras_serializable @@ -894,6 +960,19 @@ def call( else None, attentions=(encoder_outputs.attentions + decoder_outputs.attentions) if output_attentions else None, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "decoder", None) is not None: + with tf.name_scope(self.decoder.name): + self.decoder.build(None) class TFFunnelDiscriminatorPredictions(tf.keras.layers.Layer): @@ -905,12 +984,23 @@ def __init__(self, config, **kwargs): self.dense = tf.keras.layers.Dense(config.d_model, kernel_initializer=initializer, name="dense") self.activation_function = get_tf_activation(config.hidden_act) self.dense_prediction = tf.keras.layers.Dense(1, kernel_initializer=initializer, name="dense_prediction") + self.config = config def call(self, discriminator_hidden_states): hidden_states = self.dense(discriminator_hidden_states) hidden_states = self.activation_function(hidden_states) logits = tf.squeeze(self.dense_prediction(hidden_states)) return logits + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.d_model) + if getattr(self, "dense_prediction", None) is not None: + with tf.name_scope(self.dense_prediction.name): + self.dense_prediction.build(self.config.d_model) class TFFunnelMaskedLMHead(tf.keras.layers.Layer): @@ -958,12 +1048,23 @@ def __init__(self, config, n_labels, **kwargs): ) self.dropout = tf.keras.layers.Dropout(config.hidden_dropout) self.linear_out = tf.keras.layers.Dense(n_labels, 
kernel_initializer=initializer, name="linear_out") + self.config = config def call(self, hidden, training=False): hidden = self.linear_hidden(hidden) hidden = tf.keras.activations.tanh(hidden) hidden = self.dropout(hidden, training=training) return self.linear_out(hidden) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "linear_hidden", None) is not None: + with tf.name_scope(self.linear_hidden.name): + self.linear_hidden.build(self.config.d_model) + if getattr(self, "linear_out", None) is not None: + with tf.name_scope(self.linear_out.name): + self.linear_out.build(self.config.d_model) class TFFunnelPreTrainedModel(TFPreTrainedModel): @@ -1146,6 +1247,13 @@ def serving_output(self, output): hidden_states=output.hidden_states, attentions=output.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "funnel", None) is not None: + with tf.name_scope(self.funnel.name): + self.funnel.build(None) # TODO Matt might be wrong @add_start_docstrings( @@ -1194,6 +1302,13 @@ def serving_output(self, output): hidden_states=output.hidden_states, attentions=output.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "funnel", None) is not None: + with tf.name_scope(self.funnel.name): + self.funnel.build(None) @add_start_docstrings( @@ -1267,6 +1382,16 @@ def serving_output(self, output): return TFFunnelForPreTrainingOutput( logits=output.logits, hidden_states=output.hidden_states, attentions=output.attentions ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "funnel", None) is not None: + with tf.name_scope(self.funnel.name): + self.funnel.build(None) + if getattr(self, "discriminator_predictions", None) is not None: + with tf.name_scope(self.discriminator_predictions.name): + self.discriminator_predictions.build(None) @add_start_docstrings("""Funnel Model with a `language modeling` head on top.""", FUNNEL_START_DOCSTRING) @@ -1339,6 +1464,16 @@ def serving_output(self, output: TFMaskedLMOutput) -> TFMaskedLMOutput: # hidden_states and attentions not converted to Tensor with tf.convert_to_tensor as they are all of # different dimensions return TFMaskedLMOutput(logits=output.logits, hidden_states=output.hidden_states, attentions=output.attentions) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "funnel", None) is not None: + with tf.name_scope(self.funnel.name): + self.funnel.build(None) + if getattr(self, "lm_head", None) is not None: + with tf.name_scope(self.lm_head.name): + self.lm_head.build(None) @add_start_docstrings( @@ -1414,6 +1549,16 @@ def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassi return TFSequenceClassifierOutput( logits=output.logits, hidden_states=output.hidden_states, attentions=output.attentions ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "funnel", None) is not None: + with tf.name_scope(self.funnel.name): + self.funnel.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(None) @add_start_docstrings( @@ -1509,6 +1654,16 @@ def serving_output(self, output: TFMultipleChoiceModelOutput) -> TFMultipleChoic return TFMultipleChoiceModelOutput( logits=output.logits, hidden_states=output.hidden_states, attentions=output.attentions ) + def 
build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "funnel", None) is not None: + with tf.name_scope(self.funnel.name): + self.funnel.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(None) @add_start_docstrings( @@ -1528,6 +1683,7 @@ def __init__(self, config: FunnelConfig, *inputs, **kwargs) -> None: self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1586,6 +1742,16 @@ def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOu return TFTokenClassifierOutput( logits=output.logits, hidden_states=output.hidden_states, attentions=output.attentions ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "funnel", None) is not None: + with tf.name_scope(self.funnel.name): + self.funnel.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) @add_start_docstrings( @@ -1604,6 +1770,7 @@ def __init__(self, config: FunnelConfig, *inputs, **kwargs) -> None: self.qa_outputs = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1679,3 +1846,13 @@ def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAn hidden_states=output.hidden_states, attentions=output.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "funnel", None) is not None: + with tf.name_scope(self.funnel.name): + self.funnel.build(None) + if getattr(self, "qa_outputs", None) is not None: + with tf.name_scope(self.qa_outputs.name): + self.qa_outputs.build(self.config.hidden_size) diff --git a/src/transformers/models/gpt2/modeling_tf_gpt2.py b/src/transformers/models/gpt2/modeling_tf_gpt2.py index a71c6b3dfce48c..6e5a06a0ab141e 100644 --- a/src/transformers/models/gpt2/modeling_tf_gpt2.py +++ b/src/transformers/models/gpt2/modeling_tf_gpt2.py @@ -509,6 +509,23 @@ def call( attentions=all_attentions, cross_attentions=all_cross_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "wte", None) is not None: + with tf.name_scope(self.wte.name): + self.wte.build(None) + if getattr(self, "wpe", None) is not None: + with tf.name_scope(self.wpe.name): + self.wpe.build(None) + if getattr(self, "ln_f", None) is not None: + with tf.name_scope(self.ln_f.name): + self.ln_f.build([None, None, self.embed_dim]) + if getattr(self, "h", None) is not None: + for layer in self.h: + with tf.name_scope(layer.name): + layer.build(None) class TFGPT2PreTrainedModel(TFPreTrainedModel): @@ -751,6 +768,13 @@ def call( ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) @add_start_docstrings( @@ -883,6 +907,13 @@ def call( attentions=transformer_outputs.attentions, 
cross_attentions=transformer_outputs.cross_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) @add_start_docstrings( @@ -1012,6 +1043,16 @@ def input_signature(self): "attention_mask": tf.TensorSpec((None, None, None), tf.int32, name="attention_mask"), "mc_token_ids": tf.TensorSpec((None, None), tf.int32, name="mc_token_ids"), } + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + if getattr(self, "multiple_choice_head", None) is not None: + with tf.name_scope(self.multiple_choice_head.name): + self.multiple_choice_head.build(None) @add_start_docstrings( @@ -1040,6 +1081,7 @@ def __init__(self, config, *inputs, **kwargs): use_bias=False, ) self.transformer = TFGPT2MainLayer(config, name="transformer") + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING) @@ -1128,3 +1170,13 @@ def call( hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "score", None) is not None: + with tf.name_scope(self.score.name): + self.score.build(self.config.n_embd) + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) diff --git a/src/transformers/models/gptj/modeling_tf_gptj.py b/src/transformers/models/gptj/modeling_tf_gptj.py index 08c106c49f7036..907c4f5f276566 100644 --- a/src/transformers/models/gptj/modeling_tf_gptj.py +++ b/src/transformers/models/gptj/modeling_tf_gptj.py @@ -266,6 +266,22 @@ def call( outputs += (attn_weights,) return outputs # a, present, (attentions) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "q_proj", None) is not None: + with tf.name_scope(self.q_proj.name): + self.q_proj.build(self.embed_dim) + if getattr(self, "k_proj", None) is not None: + with tf.name_scope(self.k_proj.name): + self.k_proj.build(self.embed_dim) + if getattr(self, "v_proj", None) is not None: + with tf.name_scope(self.v_proj.name): + self.v_proj.build(self.embed_dim) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build(self.embed_dim) class TFGPTJMLP(tf.keras.layers.Layer): @@ -283,6 +299,7 @@ def __init__(self, intermediate_size: int, config: GPTJConfig, **kwargs): self.act = get_tf_activation(config.activation_function) self.dropout = tf.keras.layers.Dropout(config.embd_pdrop) self.embed_dim = config.n_embd + self.intermediate_size = intermediate_size def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.fc_in(hidden_states) @@ -290,6 +307,16 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.fc_out(hidden_states) hidden_states = self.dropout(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "fc_in", None) is not None: + with tf.name_scope(self.fc_in.name): + self.fc_in.build(self.embed_dim) + if getattr(self, "fc_out", None) is not None: + with tf.name_scope(self.fc_out.name): + self.fc_out.build(self.intermediate_size) class TFGPTJBlock(tf.keras.layers.Layer): @@ 
-299,6 +326,7 @@ def __init__(self, config: GPTJConfig, **kwargs): self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1") self.attn = TFGPTJAttention(config, name="attn") self.mlp = TFGPTJMLP(inner_dim, config, name="mlp") + self.config = config def call( self, @@ -332,6 +360,19 @@ def call( else: outputs = (hidden_states,) + outputs[1:] return outputs # hidden_states, present, (attentions) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "ln_1", None) is not None: + with tf.name_scope(self.ln_1.name): + self.ln_1.build([None, None, self.config.n_embd]) + if getattr(self, "attn", None) is not None: + with tf.name_scope(self.attn.name): + self.attn.build(None) + if getattr(self, "mlp", None) is not None: + with tf.name_scope(self.mlp.name): + self.mlp.build(None) @keras_serializable @@ -501,6 +542,20 @@ def call( hidden_states=all_hidden_states, attentions=all_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "wte", None) is not None: + with tf.name_scope(self.wte.name): + self.wte.build(None) + if getattr(self, "ln_f", None) is not None: + with tf.name_scope(self.ln_f.name): + self.ln_f.build([None, None, self.embed_dim]) + if getattr(self, "h", None) is not None: + for layer in self.h: + with tf.name_scope(layer.name): + layer.build(None) class TFGPTJPreTrainedModel(TFPreTrainedModel): @@ -673,6 +728,13 @@ def call( ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) @add_start_docstrings( @@ -688,6 +750,7 @@ def __init__(self, config, *inputs, **kwargs): self.lm_head = tf.keras.layers.Dense( config.vocab_size, kernel_initializer=get_initializer(config.initializer_range), name="lm_head" ) + self.config = config def get_output_embeddings(self): return self.lm_head @@ -785,6 +848,16 @@ def call( hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + if getattr(self, "lm_head", None) is not None: + with tf.name_scope(self.lm_head.name): + self.lm_head.build(self.config.n_embd) @add_start_docstrings( @@ -815,6 +888,7 @@ def __init__(self, config, *inputs, **kwargs): kernel_initializer=get_initializer(config.initializer_range), name="score", ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(GPTJ_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -907,6 +981,16 @@ def call( hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + if getattr(self, "score", None) is not None: + with tf.name_scope(self.score.name): + self.score.build(self.config.n_embd) @add_start_docstrings( @@ -926,6 +1010,7 @@ def __init__(self, config, *inputs, **kwargs): self.qa_outputs = tf.keras.layers.Dense( self.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) + self.config = config @unpack_inputs 
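# --- Illustrative sketch (reviewer note, not part of the diff) -----------------------
# The hunks above all add the same shape of build() method: guard on self.built, mark
# the layer as built, then explicitly build each child inside tf.name_scope(child.name)
# so its variables get the nested names a normal first forward pass would produce.
# A minimal standalone version of that pattern, with invented names (ToyBlock, size 8):
import tensorflow as tf

class ToyBlock(tf.keras.layers.Layer):
    def __init__(self, hidden_size=8, **kwargs):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.dense = tf.keras.layers.Dense(hidden_size, name="dense")

    def call(self, inputs):
        return self.dense(inputs)

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                # Dense only needs to know the size of the last input dimension.
                self.dense.build((None, self.hidden_size))

block = ToyBlock(name="toy_block")
block.build(None)  # variables now exist without ever running a forward pass
print([w.shape for w in block.weights])
# --------------------------------------------------------------------------------------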
@add_start_docstrings_to_model_forward(GPTJ_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -998,3 +1083,13 @@ def call( hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + if getattr(self, "qa_outputs", None) is not None: + with tf.name_scope(self.qa_outputs.name): + self.qa_outputs.build(self.config.hidden_size) diff --git a/src/transformers/models/groupvit/modeling_tf_groupvit.py b/src/transformers/models/groupvit/modeling_tf_groupvit.py index e61539e3040cf9..e712be980c5924 100644 --- a/src/transformers/models/groupvit/modeling_tf_groupvit.py +++ b/src/transformers/models/groupvit/modeling_tf_groupvit.py @@ -271,6 +271,7 @@ def __init__(self, config: GroupViTVisionConfig, **kwargs): self.norm2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm2") self.mlp = TFGroupViTMLP(config, name="mlp") self.norm_post = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm_post") + self.config = config def call(self, query: tf.Tensor, key: tf.Tensor, training: bool = False) -> tf.Tensor: x = query @@ -278,6 +279,22 @@ def call(self, query: tf.Tensor, key: tf.Tensor, training: bool = False) -> tf.T x = x + self.mlp(self.norm2(x)) x = self.norm_post(x) return x + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attn", None) is not None: + with tf.name_scope(self.attn.name): + self.attn.build(None) + if getattr(self, "norm2", None) is not None: + with tf.name_scope(self.norm2.name): + self.norm2.build([None, None, self.config.hidden_size]) + if getattr(self, "mlp", None) is not None: + with tf.name_scope(self.mlp.name): + self.mlp.build(None) + if getattr(self, "norm_post", None) is not None: + with tf.name_scope(self.norm_post.name): + self.norm_post.build([None, None, self.config.hidden_size]) class TFGroupViTAssignAttention(tf.keras.layers.Layer): @@ -290,6 +307,7 @@ def __init__(self, config: GroupViTVisionConfig, **kwargs): self.v_proj = tf.keras.layers.Dense(config.hidden_size, name="v_proj") self.proj = tf.keras.layers.Dense(config.hidden_size, name="proj") self.assign_eps = config.assign_eps + self.config = config def get_attn(self, attn: tf.Tensor, gumbel: bool = True, hard: bool = True, training: bool = False) -> tf.Tensor: if gumbel and training: @@ -326,6 +344,22 @@ def call(self, query: tf.Tensor, key: tf.Tensor, training: bool = False): out = self.proj(out) return out, soft_attn + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "q_proj", None) is not None: + with tf.name_scope(self.q_proj.name): + self.q_proj.build(self.config.hidden_size) + if getattr(self, "k_proj", None) is not None: + with tf.name_scope(self.k_proj.name): + self.k_proj.build(self.config.hidden_size) + if getattr(self, "v_proj", None) is not None: + with tf.name_scope(self.v_proj.name): + self.v_proj.build(self.config.hidden_size) + if getattr(self, "proj", None) is not None: + with tf.name_scope(self.proj.name): + self.proj.build(self.config.hidden_size) class TFGroupViTTokenAssign(tf.keras.layers.Layer): @@ -353,6 +387,7 @@ def __init__(self, config: GroupViTVisionConfig, num_group_token: int, num_outpu self.mlp_channels = TFGroupViTMLP( config, config.hidden_size, channels_dim, config.hidden_size, 
name="mlp_channels" ) + self.config = config def project_group_token(self, group_tokens: tf.Tensor) -> tf.Tensor: """ @@ -385,6 +420,34 @@ def call(self, image_tokens: tf.Tensor, group_tokens: tf.Tensor, training: bool new_image_tokens = new_image_tokens + self.mlp_channels(self.norm_new_x(new_image_tokens)) return new_image_tokens, attention + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "norm_tokens", None) is not None: + with tf.name_scope(self.norm_tokens.name): + self.norm_tokens.build([None, None, self.config.hidden_size]) + if getattr(self, "mlp_inter", None) is not None: + with tf.name_scope(self.mlp_inter.name): + self.mlp_inter.build(None) + if getattr(self, "norm_post_tokens", None) is not None: + with tf.name_scope(self.norm_post_tokens.name): + self.norm_post_tokens.build([None, None, self.config.hidden_size]) + if getattr(self, "norm_x", None) is not None: + with tf.name_scope(self.norm_x.name): + self.norm_x.build([None, None, self.config.hidden_size]) + if getattr(self, "pre_assign_attn", None) is not None: + with tf.name_scope(self.pre_assign_attn.name): + self.pre_assign_attn.build(None) + if getattr(self, "assign", None) is not None: + with tf.name_scope(self.assign.name): + self.assign.build(None) + if getattr(self, "norm_new_x", None) is not None: + with tf.name_scope(self.norm_new_x.name): + self.norm_new_x.build([None, None, self.config.hidden_size]) + if getattr(self, "mlp_channels", None) is not None: + with tf.name_scope(self.mlp_channels.name): + self.mlp_channels.build(None) # Adapted from transformers.models.vit.modeling_tf_vit.TFViTPatchEmbeddings with ViT->GroupViT @@ -456,6 +519,13 @@ def call( embeddings = tf.reshape(tensor=projection, shape=(batch_size, num_patches, self.hidden_size)) return embeddings + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "projection", None) is not None: + with tf.name_scope(self.projection.name): + self.projection.build(self.num_channels) # Adapted from transformers.vit.modeling_tf_vit.TFViTEmbeddings @@ -473,7 +543,7 @@ def __init__(self, config: GroupViTVisionConfig, **kwargs): self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") self.config = config - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): num_patches = self.patch_embeddings.num_patches self.position_embeddings = self.add_weight( shape=(1, num_patches, self.config.hidden_size), @@ -482,7 +552,20 @@ def build(self, input_shape: tf.TensorShape): name="position_embeddings", ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "patch_embeddings", None) is not None: + with tf.name_scope(self.patch_embeddings.name): + self.patch_embeddings.build(None) + if getattr(self, "dropout", None) is not None: + with tf.name_scope(self.dropout.name): + self.dropout.build(None) + if getattr(self, "layernorm", None) is not None: + with tf.name_scope(self.layernorm.name): + self.layernorm.build([None, None, self.config.hidden_size]) + def interpolate_pos_encoding(self, embeddings, height, width) -> tf.Tensor: """ @@ -626,7 +709,7 @@ def __init__( else: self.group_projector = None - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): if self.num_group_token > 0: self.group_token = self.add_weight( shape=(1, self.num_group_token, self.config.hidden_size), @@ -636,7 +719,15 @@ def build(self, input_shape: tf.TensorShape): ) 
else: self.group_token = None - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) + @property def with_group_token(self): @@ -720,12 +811,24 @@ def __init__( output_size = output_size if output_size is not None else hidden_size self.fc1 = tf.keras.layers.Dense(intermediate_size, name="fc1") self.fc2 = tf.keras.layers.Dense(output_size, name="fc2") + self.intermediate_size = intermediate_size + self.hidden_size = hidden_size def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.fc1(hidden_states) hidden_states = self.activation_fn(hidden_states) hidden_states = self.fc2(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build(self.hidden_size) + if getattr(self, "fc2", None) is not None: + with tf.name_scope(self.fc2.name): + self.fc2.build(self.intermediate_size) class TFGroupViTMixerMLP(TFGroupViTMLP): @@ -840,6 +943,22 @@ def call( outputs = (attention_output, _attention_probs) if output_attentions else (attention_output,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "q_proj", None) is not None: + with tf.name_scope(self.q_proj.name): + self.q_proj.build(self.embed_dim) + if getattr(self, "k_proj", None) is not None: + with tf.name_scope(self.k_proj.name): + self.k_proj.build(self.embed_dim) + if getattr(self, "v_proj", None) is not None: + with tf.name_scope(self.v_proj.name): + self.v_proj.build(self.embed_dim) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build(self.embed_dim) # Copied from transformers.models.clip.modeling_tf_clip.TFCLIPEncoderLayer with CLIP->GroupViT @@ -893,6 +1012,22 @@ def call( outputs = (hidden_states,) + attention_outputs[1:] # add attentions if we output them return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "layer_norm1", None) is not None: + with tf.name_scope(self.layer_norm1.name): + self.layer_norm1.build([None, None, self.embed_dim]) + if getattr(self, "mlp", None) is not None: + with tf.name_scope(self.mlp.name): + self.mlp.build(None) + if getattr(self, "layer_norm2", None) is not None: + with tf.name_scope(self.layer_norm2.name): + self.layer_norm2.build([None, None, self.embed_dim]) # Adapted from transformers.models.clip.modeling_tf_clip.TFGroupViTTextEncoder @@ -938,6 +1073,14 @@ def call( return TFBaseModelOutput( last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) class TFGroupViTVisionEncoder(tf.keras.layers.Layer): @@ -989,6 +1132,14 @@ def call( return TFBaseModelOutput( last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_groupings ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "stages", None) is not None: + for layer in self.stages: + 
with tf.name_scope(layer.name): + layer.build(None) # Copied from transformers.models.clip.modeling_tf_clip.TFCLIPTextTransformer with CLIPText->GroupViTText, CLIPEncoder->GroupViTTextEncoder @@ -1094,6 +1245,19 @@ def _build_causal_attention_mask(self, batch_size, seq_length, dtype=tf.float32) to_mask = tf.linalg.set_diag(to_mask, diagonal=diag) return tf.broadcast_to(input=to_mask, shape=(batch_size, 1, seq_length, seq_length)) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.embed_dim]) # Adapted from transformers.models.clip.modeling_tf_clip.TFCLIPVisionTransformer @@ -1138,6 +1302,19 @@ def call( hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "layernorm", None) is not None: + with tf.name_scope(self.layernorm.name): + self.layernorm.build([None, None, self.embed_dim]) @keras_serializable @@ -1187,6 +1364,13 @@ def call( ) return text_model_outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "text_model", None) is not None: + with tf.name_scope(self.text_model.name): + self.text_model.build(None) @keras_serializable @@ -1223,6 +1407,13 @@ def call( ) return vision_model_outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "vision_model", None) is not None: + with tf.name_scope(self.vision_model.name): + self.vision_model.build(None) @keras_serializable @@ -1271,7 +1462,7 @@ def __init__(self, config: GroupViTConfig, **kwargs): tf.keras.layers.Dense(self.projection_dim, name="text_projection.3"), ] - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): self.logit_scale = self.add_weight( shape=(1,), initializer=tf.keras.initializers.Constant(self.config.logit_scale_init_value), @@ -1279,7 +1470,25 @@ def build(self, input_shape: tf.TensorShape): name="logit_scale", ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "text_model", None) is not None: + with tf.name_scope(self.text_model.name): + self.text_model.build(None) + if getattr(self, "vision_model", None) is not None: + with tf.name_scope(self.vision_model.name): + self.vision_model.build(None) + if getattr(self, "visual_projection", None) is not None: + for layer in self.visual_projection: + with tf.name_scope(layer.name): + layer.build(None) + if getattr(self, "text_projection", None) is not None: + for layer in self.text_projection: + with tf.name_scope(layer.name): + layer.build(None) + @unpack_inputs def get_text_features( @@ -1670,6 +1879,13 @@ def call( ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "groupvit", None) is not None: + with 
tf.name_scope(self.groupvit.name): + self.groupvit.build(None) class TFGroupViTVisionModel(TFGroupViTPreTrainedModel): @@ -1724,6 +1940,13 @@ def call( ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "groupvit", None) is not None: + with tf.name_scope(self.groupvit.name): + self.groupvit.build(None) @add_start_docstrings(GROUPVIT_START_DOCSTRING) @@ -1881,3 +2104,10 @@ def serving_output(self, output: TFGroupViTModelOutput) -> TFGroupViTModelOutput # TensorFlow cannot trace through nested dataclasses. Reference: # https://github.com/huggingface/transformers/pull/16886 return output + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "groupvit", None) is not None: + with tf.name_scope(self.groupvit.name): + self.groupvit.build(None) diff --git a/src/transformers/models/hubert/modeling_tf_hubert.py b/src/transformers/models/hubert/modeling_tf_hubert.py index ee52c09eee1366..38a258bc35788d 100644 --- a/src/transformers/models/hubert/modeling_tf_hubert.py +++ b/src/transformers/models/hubert/modeling_tf_hubert.py @@ -468,6 +468,13 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.conv(hidden_states) hidden_states = self.activation(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "conv", None) is not None: + with tf.name_scope(self.conv.name): + self.conv.build(None) # Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2LayerNormConvLayer with Wav2Vec2->Hubert @@ -492,6 +499,16 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.layer_norm(hidden_states) hidden_states = self.activation(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "conv", None) is not None: + with tf.name_scope(self.conv.name): + self.conv.build(None) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.out_conv_dim]) # Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2GroupNormConvLayer with Wav2Vec2->Hubert @@ -516,6 +533,16 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.layer_norm(hidden_states) hidden_states = self.activation(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "conv", None) is not None: + with tf.name_scope(self.conv.name): + self.conv.build(None) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build(None) # Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2PositionalConvEmbedding with Wav2Vec2->Hubert @@ -537,6 +564,13 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.padding(hidden_states) hidden_states = self.activation(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "conv", None) is not None: + with tf.name_scope(self.conv.name): + self.conv.build(None) # Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2SamePadLayer with Wav2Vec2->Hubert @@ -601,12 +635,23 @@ def __init__(self, config: HubertConfig, **kwargs): name="projection", ) self.dropout = 
tf.keras.layers.Dropout(rate=config.feat_proj_dropout) + self.config = config def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.layer_norm(hidden_states) hidden_states = self.projection(hidden_states) hidden_states = self.dropout(hidden_states, training=training) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "projection", None) is not None: + with tf.name_scope(self.projection.name): + self.projection.build(self.config.conv_dim[-1]) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build(None) # TODO Matt might be wrong # Copied from transformers.models.bart.modeling_tf_bart.TFBartAttention with TFBart->TFHubert @@ -761,6 +806,22 @@ def call( attn_weights: tf.Tensor = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) return attn_output, attn_weights, past_key_value + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "k_proj", None) is not None: + with tf.name_scope(self.k_proj.name): + self.k_proj.build(self.embed_dim) + if getattr(self, "q_proj", None) is not None: + with tf.name_scope(self.q_proj.name): + self.q_proj.build(self.embed_dim) + if getattr(self, "v_proj", None) is not None: + with tf.name_scope(self.v_proj.name): + self.v_proj.build(self.embed_dim) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build(self.embed_dim) # Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2FeedForward with Wav2Vec2->Hubert @@ -785,6 +846,7 @@ def __init__(self, config: HubertConfig, **kwargs): name="output_dense", ) self.output_dropout = tf.keras.layers.Dropout(config.hidden_dropout) + self.config = config def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.intermediate_dense(hidden_states) @@ -794,6 +856,16 @@ def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.output_dense(hidden_states) hidden_states = self.output_dropout(hidden_states, training=training) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "intermediate_dense", None) is not None: + with tf.name_scope(self.intermediate_dense.name): + self.intermediate_dense.build(self.config.hidden_size) + if getattr(self, "output_dense", None) is not None: + with tf.name_scope(self.output_dense.name): + self.output_dense.build(self.config.intermediate_size) # Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2EncoderLayer with Wav2Vec2->Hubert @@ -813,6 +885,7 @@ def __init__(self, config: HubertConfig, **kwargs): self.final_layer_norm = tf.keras.layers.LayerNormalization( epsilon=config.layer_norm_eps, name="final_layer_norm" ) + self.config = config def call( self, @@ -838,6 +911,22 @@ def call( outputs += (attn_weights,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.hidden_size]) + if getattr(self, "feed_forward", None) is not None: + with tf.name_scope(self.feed_forward.name): + self.feed_forward.build(None) + 
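# --- Illustrative sketch (reviewer note, not part of the diff) -----------------------
# Shape convention used throughout these hunks: Dense.build() is handed just the last
# input dimension (it only reads input_shape[-1]; the bare integer the hunks pass gets
# wrapped into a rank-1 TensorShape), while LayerNormalization.build() is handed a full
# [None, None, hidden_size] placeholder so its default axis=-1 resolves against the
# rank-3 [batch, seq, hidden] inputs it sees at runtime. Sizes below are invented.
import tensorflow as tf

hidden_size = 16
dense = tf.keras.layers.Dense(4 * hidden_size)
dense.build((None, hidden_size))             # kernel: (hidden_size, 4 * hidden_size)

layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12)
layer_norm.build([None, None, hidden_size])  # gamma / beta: (hidden_size,)

print(dense.kernel.shape, layer_norm.gamma.shape, layer_norm.beta.shape)
# --------------------------------------------------------------------------------------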
if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.config.hidden_size]) # Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2EncoderLayerStableLayerNorm with Wav2Vec2->Hubert @@ -857,6 +946,7 @@ def __init__(self, config: HubertConfig, **kwargs): self.final_layer_norm = tf.keras.layers.LayerNormalization( epsilon=config.layer_norm_eps, name="final_layer_norm" ) + self.config = config def call( self, @@ -880,6 +970,22 @@ def call( outputs += (attn_weights,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.hidden_size]) + if getattr(self, "feed_forward", None) is not None: + with tf.name_scope(self.feed_forward.name): + self.feed_forward.build(None) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.config.hidden_size]) # Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2Encoder with Wav2Vec2->Hubert @@ -946,6 +1052,20 @@ def call( hidden_states=all_hidden_states, attentions=all_self_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "pos_conv_embed", None) is not None: + with tf.name_scope(self.pos_conv_embed.name): + self.pos_conv_embed.build(None) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.hidden_size]) + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) # Copied from transformers.models.wav2vec2.modeling_tf_wav2vec2.TFWav2Vec2EncoderStableLayerNorm with Wav2Vec2->Hubert @@ -1014,6 +1134,20 @@ def call( hidden_states=all_hidden_states, attentions=all_self_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "pos_conv_embed", None) is not None: + with tf.name_scope(self.pos_conv_embed.name): + self.pos_conv_embed.build(None) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.hidden_size]) + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) @keras_serializable @@ -1031,12 +1165,22 @@ def __init__(self, config: HubertConfig, **kwargs): else: self.encoder = TFHubertEncoder(config, name="encoder") - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): self.masked_spec_embed = self.add_weight( shape=(self.config.hidden_size,), initializer="uniform", trainable=True, name="masked_spec_embed" ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "feature_extractor", None) is not None: + with tf.name_scope(self.feature_extractor.name): + self.feature_extractor.build(None) + if getattr(self, "feature_projection", None) is not None: + with tf.name_scope(self.feature_projection.name): + self.feature_projection.build(None) + def _get_feat_extract_output_lengths(self, input_lengths: tf.Tensor): """ @@ 
-1344,6 +1488,13 @@ def call( ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "hubert", None) is not None: + with tf.name_scope(self.hubert.name): + self.hubert.build(None) @add_start_docstrings( @@ -1500,3 +1651,13 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "hubert", None) is not None: + with tf.name_scope(self.hubert.name): + self.hubert.build(None) + if getattr(self, "lm_head", None) is not None: + with tf.name_scope(self.lm_head.name): + self.lm_head.build(self.output_hidden_size) diff --git a/src/transformers/models/layoutlm/modeling_tf_layoutlm.py b/src/transformers/models/layoutlm/modeling_tf_layoutlm.py index c756609468598c..7d6ada2c5cca79 100644 --- a/src/transformers/models/layoutlm/modeling_tf_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_tf_layoutlm.py @@ -73,7 +73,7 @@ def __init__(self, config: LayoutLMConfig, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): with tf.name_scope("word_embeddings"): self.weight = self.add_weight( name="weight", @@ -123,7 +123,14 @@ def build(self, input_shape: tf.TensorShape): initializer=get_initializer(self.initializer_range), ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + def call( self, @@ -216,6 +223,7 @@ def __init__(self, config: LayoutLMConfig, **kwargs): self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.is_decoder = config.is_decoder + self.config = config def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] @@ -304,6 +312,19 @@ def call( if self.is_decoder: outputs = outputs + (past_key_value,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build(self.config.hidden_size) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build(self.config.hidden_size) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build(self.config.hidden_size) # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->LayoutLM @@ -316,6 +337,7 @@ def __init__(self, config: LayoutLMConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -323,6 +345,16 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) return hidden_states + def build(self, input_shape=None): + if self.built: + return + 
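# --- Illustrative sketch (reviewer note, not part of the diff) -----------------------
# Some build() methods in this diff (e.g. the Hubert main layer and the LayoutLM
# embeddings above) first create a direct weight with add_weight() and only then hit the
# built-guard before building their sublayers, so they rely on build() running once.
# A minimal stand-in with invented names:
import tensorflow as tf

class ToyMainLayer(tf.keras.layers.Layer):
    def __init__(self, hidden_size=8, **kwargs):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.projection = tf.keras.layers.Dense(hidden_size, name="projection")

    def call(self, inputs):
        return self.projection(inputs) + self.masked_embed

    def build(self, input_shape=None):
        # direct weight, analogous to masked_spec_embed in the Hubert hunk above
        self.masked_embed = self.add_weight(
            shape=(self.hidden_size,), initializer="uniform", trainable=True, name="masked_embed"
        )
        if self.built:
            return
        self.built = True
        if getattr(self, "projection", None) is not None:
            with tf.name_scope(self.projection.name):
                self.projection.build((None, self.hidden_size))

layer = ToyMainLayer(name="toy_main_layer")
layer.build(None)
print([w.name for w in layer.weights])
# --------------------------------------------------------------------------------------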
self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->LayoutLM @@ -364,6 +396,16 @@ def call( outputs = (attention_output,) + self_outputs[1:] return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attention", None) is not None: + with tf.name_scope(self.self_attention.name): + self.self_attention.build(None) + if getattr(self, "dense_output", None) is not None: + with tf.name_scope(self.dense_output.name): + self.dense_output.build(None) # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->LayoutLM @@ -379,12 +421,20 @@ def __init__(self, config: LayoutLMConfig, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->LayoutLM @@ -397,6 +447,7 @@ def __init__(self, config: LayoutLMConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -404,6 +455,16 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.intermediate_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->LayoutLM @@ -491,6 +552,19 @@ def call( outputs = outputs + (present_key_value,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "bert_output", None) is not None: + with tf.name_scope(self.bert_output.name): + self.bert_output.build(None) # Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->LayoutLM @@ -561,6 +635,14 @@ def call( attentions=all_attentions, cross_attentions=all_cross_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if 
getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->LayoutLM @@ -574,6 +656,7 @@ def __init__(self, config: LayoutLMConfig, **kwargs): activation="tanh", name="dense", ) + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: # We "pool" the model by simply taking the hidden state corresponding @@ -582,6 +665,13 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: pooled_output = self.dense(inputs=first_token_tensor) return pooled_output + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) # Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->LayoutLM @@ -601,6 +691,7 @@ def __init__(self, config: LayoutLMConfig, **kwargs): self.transform_act_fn = config.hidden_act self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -608,6 +699,16 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.LayerNorm(inputs=hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) # Copied from transformers.models.bert.modeling_tf_bert.TFBertLMPredictionHead with Bert->LayoutLM @@ -624,10 +725,17 @@ def __init__(self, config: LayoutLMConfig, input_embeddings: tf.keras.layers.Lay # an output-only bias for each token. 
self.input_embeddings = input_embeddings - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "transform", None) is not None: + with tf.name_scope(self.transform.name): + self.transform.build(None) + def get_output_embeddings(self) -> tf.keras.layers.Layer: return self.input_embeddings @@ -795,6 +903,19 @@ def call( attentions=encoder_outputs.attentions, cross_attentions=encoder_outputs.cross_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "pooler", None) is not None: + with tf.name_scope(self.pooler.name): + self.pooler.build(None) class TFLayoutLMPreTrainedModel(TFPreTrainedModel): @@ -985,6 +1106,13 @@ def call( ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layoutlm", None) is not None: + with tf.name_scope(self.layoutlm.name): + self.layoutlm.build(None) @add_start_docstrings("""LayoutLM Model with a `language modeling` head on top.""", LAYOUTLM_START_DOCSTRING) @@ -1106,6 +1234,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layoutlm", None) is not None: + with tf.name_scope(self.layoutlm.name): + self.layoutlm.build(None) + if getattr(self, "mlm", None) is not None: + with tf.name_scope(self.mlm.name): + self.mlm.build(None) # TODO Matt might be wrong @add_start_docstrings( @@ -1132,6 +1270,7 @@ def __init__(self, config: LayoutLMConfig, *inputs, **kwargs): kernel_initializer=get_initializer(config.initializer_range), name="classifier", ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1224,6 +1363,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layoutlm", None) is not None: + with tf.name_scope(self.layoutlm.name): + self.layoutlm.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) @add_start_docstrings( @@ -1256,6 +1405,7 @@ def __init__(self, config: LayoutLMConfig, *inputs, **kwargs): kernel_initializer=get_initializer(config.initializer_range), name="classifier", ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1346,6 +1496,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layoutlm", None) is not None: + with tf.name_scope(self.layoutlm.name): + self.layoutlm.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) @add_start_docstrings( @@ -1376,6 +1536,7 @@ 
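# --- Illustrative sketch (reviewer note, not part of the diff) -----------------------
# Why every child.build() above is wrapped in tf.name_scope(child.name): building a
# layer directly gives its variables bare names, while entering the child's scope first
# nests them the way a build triggered by a real forward pass would, presumably to keep
# existing checkpoint weight names stable. Exact strings can differ between TF/Keras
# versions, so they are printed here rather than asserted.
import tensorflow as tf

plain = tf.keras.layers.Dense(4, name="classifier")
plain.build((None, 8))

scoped = tf.keras.layers.Dense(4, name="classifier")
with tf.name_scope("toy_model"):        # "toy_model" is an invented parent scope
    with tf.name_scope(scoped.name):
        scoped.build((None, 8))

print(plain.kernel.name)    # e.g. kernel:0
print(scoped.kernel.name)   # e.g. toy_model/classifier/kernel:0
# --------------------------------------------------------------------------------------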
def __init__(self, config: LayoutLMConfig, *inputs, **kwargs): kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs", ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1485,3 +1646,13 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layoutlm", None) is not None: + with tf.name_scope(self.layoutlm.name): + self.layoutlm.build(None) + if getattr(self, "qa_outputs", None) is not None: + with tf.name_scope(self.qa_outputs.name): + self.qa_outputs.build(self.config.hidden_size) diff --git a/src/transformers/models/layoutlmv3/modeling_tf_layoutlmv3.py b/src/transformers/models/layoutlmv3/modeling_tf_layoutlmv3.py index feba69eafc2a71..4a548f1be9c83d 100644 --- a/src/transformers/models/layoutlmv3/modeling_tf_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/modeling_tf_layoutlmv3.py @@ -87,6 +87,7 @@ def __init__(self, config: LayoutLMv3Config, **kwargs): ) self.hidden_size = config.hidden_size self.num_patches = (config.input_size**2) // (patch_sizes[0] * patch_sizes[1]) + self.config = config def call(self, pixel_values: tf.Tensor) -> tf.Tensor: # When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format. @@ -96,6 +97,13 @@ def call(self, pixel_values: tf.Tensor) -> tf.Tensor: embeddings = self.proj(pixel_values) embeddings = tf.reshape(embeddings, (-1, self.num_patches, self.hidden_size)) return embeddings + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "proj", None) is not None: + with tf.name_scope(self.proj.name): + self.proj.build(self.config.num_channels) class TFLayoutLMv3TextEmbeddings(tf.keras.layers.Layer): @@ -151,6 +159,7 @@ def __init__(self, config: LayoutLMv3Config, **kwargs): name="w_position_embeddings", ) self.max_2d_positions = config.max_2d_position_embeddings + self.config = config def calculate_spatial_position_embeddings(self, bbox: tf.Tensor) -> tf.Tensor: try: @@ -259,6 +268,34 @@ def call( embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings, training=training) return embeddings + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "word_embeddings", None) is not None: + with tf.name_scope(self.word_embeddings.name): + self.word_embeddings.build(None) + if getattr(self, "token_type_embeddings", None) is not None: + with tf.name_scope(self.token_type_embeddings.name): + self.token_type_embeddings.build(None) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + if getattr(self, "position_embeddings", None) is not None: + with tf.name_scope(self.position_embeddings.name): + self.position_embeddings.build(None) + if getattr(self, "x_position_embeddings", None) is not None: + with tf.name_scope(self.x_position_embeddings.name): + self.x_position_embeddings.build(None) + if getattr(self, "y_position_embeddings", None) is not None: + with tf.name_scope(self.y_position_embeddings.name): + self.y_position_embeddings.build(None) + if getattr(self, "h_position_embeddings", None) is not None: + with tf.name_scope(self.h_position_embeddings.name): + self.h_position_embeddings.build(None) + if getattr(self, "w_position_embeddings", None) is not None: + with 
tf.name_scope(self.w_position_embeddings.name): + self.w_position_embeddings.build(None) class TFLayoutLMv3SelfAttention(tf.keras.layers.Layer): @@ -294,6 +331,7 @@ def __init__(self, config: LayoutLMv3Config, **kwargs): self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) self.has_relative_attention_bias = config.has_relative_attention_bias self.has_spatial_attention_bias = config.has_spatial_attention_bias + self.config = config def transpose_for_scores(self, x: tf.Tensor): shape = tf.shape(x) @@ -371,6 +409,19 @@ def call( outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build(self.config.hidden_size) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build(self.config.hidden_size) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build(self.config.hidden_size) # Copied from models.roberta.modeling_tf_roberta.TFRobertaSelfOutput @@ -383,6 +434,7 @@ def __init__(self, config: LayoutLMv3Config, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -390,6 +442,16 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) class TFLayoutLMv3Attention(tf.keras.layers.Layer): @@ -420,6 +482,16 @@ def call( attention_output = self.self_output(self_outputs[0], hidden_states, training=training) outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attention", None) is not None: + with tf.name_scope(self.self_attention.name): + self.self_attention.build(None) + if getattr(self, "self_output", None) is not None: + with tf.name_scope(self.self_output.name): + self.self_output.build(None) # Copied from models.roberta.modeling_tf_bert.TFRobertaIntermediate @@ -435,12 +507,20 @@ def __init__(self, config: LayoutLMv3Config, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) # Copied from models.roberta.modeling_tf_bert.TFRobertaOutput @@ -453,6 +533,7 
@@ def __init__(self, config: LayoutLMv3Config, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -460,6 +541,16 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.intermediate_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) class TFLayoutLMv3Layer(tf.keras.layers.Layer): @@ -494,6 +585,19 @@ def call( layer_output = self.bert_output(intermediate_output, attention_output, training=training) outputs = (layer_output,) + outputs return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "bert_output", None) is not None: + with tf.name_scope(self.bert_output.name): + self.bert_output.build(None) class TFLayoutLMv3Encoder(tf.keras.layers.Layer): @@ -649,6 +753,14 @@ def call( return tuple( value for value in [hidden_states, all_hidden_states, all_self_attentions] if value is not None ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) @keras_serializable @@ -676,7 +788,7 @@ def __init__(self, config: LayoutLMv3Config, **kwargs): self.encoder = TFLayoutLMv3Encoder(config, name="encoder") - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): if self.config.visual_embed: image_size = self.config.input_size // self.config.patch_size self.cls_token = self.add_weight( @@ -694,7 +806,14 @@ def build(self, input_shape: tf.TensorShape): name="pos_embed", ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + def get_input_embeddings(self) -> tf.keras.layers.Layer: return self.embeddings.word_embeddings @@ -1179,6 +1298,13 @@ def call( ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layoutlmv3", None) is not None: + with tf.name_scope(self.layoutlmv3.name): + self.layoutlmv3.build(None) class TFLayoutLMv3ClassificationHead(tf.keras.layers.Layer): @@ -1206,6 +1332,7 @@ def __init__(self, config: LayoutLMv3Config, **kwargs): kernel_initializer=get_initializer(config.initializer_range), name="out_proj", ) + self.config = config def call(self, inputs: tf.Tensor, training: bool = False) -> tf.Tensor: outputs = self.dropout(inputs, training=training) @@ -1213,6 +1340,19 @@ def call(self, inputs: tf.Tensor, training: bool = False) -> tf.Tensor: outputs = 
self.dropout(outputs, training=training) outputs = self.out_proj(outputs) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dropout", None) is not None: + with tf.name_scope(self.dropout.name): + self.dropout.build(None) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build(self.config.hidden_size) + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(None) # TODO Matt might be wrong @add_start_docstrings( @@ -1316,6 +1456,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layoutlmv3", None) is not None: + with tf.name_scope(self.layoutlmv3.name): + self.layoutlmv3.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(None) @add_start_docstrings( @@ -1439,6 +1589,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layoutlmv3", None) is not None: + with tf.name_scope(self.layoutlmv3.name): + self.layoutlmv3.build(None) + if getattr(self, "dropout", None) is not None: + with tf.name_scope(self.dropout.name): + self.dropout.build(None) @add_start_docstrings( @@ -1567,3 +1727,13 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layoutlmv3", None) is not None: + with tf.name_scope(self.layoutlmv3.name): + self.layoutlmv3.build(None) + if getattr(self, "qa_outputs", None) is not None: + with tf.name_scope(self.qa_outputs.name): + self.qa_outputs.build(None) diff --git a/src/transformers/models/led/modeling_tf_led.py b/src/transformers/models/led/modeling_tf_led.py index 8b8bd721e99e78..224972e06c2e40 100644 --- a/src/transformers/models/led/modeling_tf_led.py +++ b/src/transformers/models/led/modeling_tf_led.py @@ -200,7 +200,29 @@ def build(self, input_shape=None): self.key_global.build((self.config.hidden_size,)) with tf.name_scope("value_global"): self.value_global.build((self.config.hidden_size,)) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build(self.config.hidden_size) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build(self.config.hidden_size) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build(self.config.hidden_size) + if getattr(self, "query_global", None) is not None: + with tf.name_scope(self.query_global.name): + self.query_global.build(self.config.hidden_size) + if getattr(self, "key_global", None) is not None: + with tf.name_scope(self.key_global.name): + self.key_global.build(self.config.hidden_size) + if getattr(self, "value_global", None) is not None: + with tf.name_scope(self.value_global.name): + self.value_global.build(self.config.hidden_size) + def call( self, @@ -983,6 +1005,7 @@ def __init__(self, config, layer_id, **kwargs): super().__init__(**kwargs) self.longformer_self_attn = TFLEDEncoderSelfAttention(config, layer_id=layer_id, name="longformer_self_attn") self.output_dense = 
tf.keras.layers.Dense(config.d_model, use_bias=True, name="output") + self.config = config def call(self, inputs, training=False): ( @@ -1003,6 +1026,16 @@ def call(self, inputs, training=False): outputs = (attention_output,) + self_outputs[1:] return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "longformer_self_attn", None) is not None: + with tf.name_scope(self.longformer_self_attn.name): + self.longformer_self_attn.build(None) + if getattr(self, "output_dense", None) is not None: + with tf.name_scope(self.output_dense.name): + self.output_dense.build(self.config.d_model) class TFLEDDecoderAttention(tf.keras.layers.Layer): @@ -1154,6 +1187,22 @@ def call( attn_weights: tf.Tensor = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) return attn_output, attn_weights, past_key_value + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "k_proj", None) is not None: + with tf.name_scope(self.k_proj.name): + self.k_proj.build(self.embed_dim) + if getattr(self, "q_proj", None) is not None: + with tf.name_scope(self.q_proj.name): + self.q_proj.build(self.embed_dim) + if getattr(self, "v_proj", None) is not None: + with tf.name_scope(self.v_proj.name): + self.v_proj.build(self.embed_dim) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build(self.embed_dim) class TFLEDEncoderLayer(tf.keras.layers.Layer): @@ -1168,6 +1217,7 @@ def __init__(self, config: LEDConfig, layer_id: int, **kwargs): self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.config = config def call( self, @@ -1213,6 +1263,25 @@ def call( hidden_states = self.final_layer_norm(hidden_states) return (hidden_states,) + layer_outputs[1:] + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "self_attn_layer_norm", None) is not None: + with tf.name_scope(self.self_attn_layer_norm.name): + self.self_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build(self.embed_dim) + if getattr(self, "fc2", None) is not None: + with tf.name_scope(self.fc2.name): + self.fc2.build(self.config.encoder_ffn_dim) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.embed_dim]) class TFLEDDecoderLayer(tf.keras.layers.Layer): @@ -1242,6 +1311,7 @@ def __init__(self, config: LEDConfig, **kwargs): self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.config = config def call( self, @@ -1322,6 +1392,31 @@ def call( cross_attn_weights, present_key_value, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "self_attn_layer_norm", None) is not None: + with 
tf.name_scope(self.self_attn_layer_norm.name): + self.self_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "encoder_attn", None) is not None: + with tf.name_scope(self.encoder_attn.name): + self.encoder_attn.build(None) + if getattr(self, "encoder_attn_layer_norm", None) is not None: + with tf.name_scope(self.encoder_attn_layer_norm.name): + self.encoder_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build(self.embed_dim) + if getattr(self, "fc2", None) is not None: + with tf.name_scope(self.fc2.name): + self.fc2.build(self.config.decoder_ffn_dim) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.embed_dim]) class TFLEDPreTrainedModel(TFPreTrainedModel): @@ -1884,6 +1979,20 @@ def _pad_to_window_size( attention_mask, inputs_embeds, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embed_positions", None) is not None: + with tf.name_scope(self.embed_positions.name): + self.embed_positions.build(None) + if getattr(self, "layernorm_embedding", None) is not None: + with tf.name_scope(self.layernorm_embedding.name): + self.layernorm_embedding.build([None, None, self.embed_dim]) + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) @keras_serializable @@ -2104,6 +2213,20 @@ def call( attentions=all_self_attns, cross_attentions=all_cross_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embed_positions", None) is not None: + with tf.name_scope(self.embed_positions.name): + self.embed_positions.build(None) + if getattr(self, "layernorm_embedding", None) is not None: + with tf.name_scope(self.layernorm_embedding.name): + self.layernorm_embedding.build([None, None, self.config.d_model]) + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) @keras_serializable @@ -2210,6 +2333,19 @@ def call( encoder_attentions=encoder_outputs.attentions, encoder_global_attentions=encoder_outputs.global_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "decoder", None) is not None: + with tf.name_scope(self.decoder.name): + self.decoder.build(None) + if getattr(self, "shared", None) is not None: + with tf.name_scope(self.shared.name): + self.shared.build(None) # TODO Matt might be wrong @add_start_docstrings( @@ -2296,6 +2432,13 @@ def serving_output(self, output): encoder_attentions=enc_attns, encoder_global_attentions=enc_g_attns, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "led", None) is not None: + with tf.name_scope(self.led.name): + self.led.build(None) # Copied from transformers.models.bart.modeling_tf_bart.BiasLayer @@ -2517,3 +2660,13 @@ def hf_compute_loss(self, labels, logits): masked_loss = unmasked_loss * loss_mask reduced_masked_loss = tf.reduce_sum(masked_loss) / tf.reduce_sum(loss_mask) return tf.reshape(reduced_masked_loss, (1,)) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "led", None) is not None: + with 
tf.name_scope(self.led.name): + self.led.build(None) + if getattr(self, "bias_layer", None) is not None: + with tf.name_scope(self.bias_layer.name): + self.bias_layer.build(None) # TODO Matt might be wrong diff --git a/src/transformers/models/longformer/modeling_tf_longformer.py b/src/transformers/models/longformer/modeling_tf_longformer.py index 0397c2ba320ec5..dddffa95816492 100644 --- a/src/transformers/models/longformer/modeling_tf_longformer.py +++ b/src/transformers/models/longformer/modeling_tf_longformer.py @@ -434,10 +434,20 @@ def __init__(self, config, input_embeddings, **kwargs): # an output-only bias for each token. self.decoder = input_embeddings - def build(self, input_shape): + def build(self, input_shape=None): self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.hidden_size]) + def get_output_embeddings(self): return self.decoder @@ -484,7 +494,7 @@ def __init__(self, config, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): with tf.name_scope("word_embeddings"): self.weight = self.add_weight( name="weight", @@ -506,7 +516,14 @@ def build(self, input_shape: tf.TensorShape): initializer=get_initializer(self.initializer_range), ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + def create_position_ids_from_input_ids(self, input_ids, past_key_values_length=0): """ @@ -582,12 +599,20 @@ def __init__(self, config: LongformerConfig, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Longformer @@ -600,6 +625,7 @@ def __init__(self, config: LongformerConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -607,6 +633,16 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is 
not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.intermediate_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Longformer @@ -620,6 +656,7 @@ def __init__(self, config: LongformerConfig, **kwargs): activation="tanh", name="dense", ) + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: # We "pool" the model by simply taking the hidden state corresponding @@ -628,6 +665,13 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: pooled_output = self.dense(inputs=first_token_tensor) return pooled_output + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Longformer @@ -640,6 +684,7 @@ def __init__(self, config: LongformerConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -647,6 +692,16 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) class TFLongformerSelfAttention(tf.keras.layers.Layer): @@ -717,7 +772,29 @@ def build(self, input_shape=None): self.key_global.build((self.config.hidden_size,)) with tf.name_scope("value_global"): self.value_global.build((self.config.hidden_size,)) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build(self.config.hidden_size) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build(self.config.hidden_size) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build(self.config.hidden_size) + if getattr(self, "query_global", None) is not None: + with tf.name_scope(self.query_global.name): + self.query_global.build(self.config.hidden_size) + if getattr(self, "key_global", None) is not None: + with tf.name_scope(self.key_global.name): + self.key_global.build(self.config.hidden_size) + if getattr(self, "value_global", None) is not None: + with tf.name_scope(self.value_global.name): + self.value_global.build(self.config.hidden_size) + def call( self, @@ -1523,6 +1600,16 @@ def call(self, inputs, training=False): outputs = (attention_output,) + self_outputs[1:] return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attention", None) is not None: + with tf.name_scope(self.self_attention.name): + 
self.self_attention.build(None) + if getattr(self, "dense_output", None) is not None: + with tf.name_scope(self.dense_output.name): + self.dense_output.build(None) class TFLongformerLayer(tf.keras.layers.Layer): @@ -1553,6 +1640,19 @@ def call(self, inputs, training=False): outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "longformer_output", None) is not None: + with tf.name_scope(self.longformer_output.name): + self.longformer_output.build(None) class TFLongformerEncoder(tf.keras.layers.Layer): @@ -1631,6 +1731,14 @@ def call( attentions=all_attentions, global_attentions=all_global_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) @keras_serializable @@ -1858,6 +1966,19 @@ def _merge_to_attention_mask(attention_mask: tf.Tensor, global_attention_mask: t attention_mask = global_attention_mask + 1 return attention_mask + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "pooler", None) is not None: + with tf.name_scope(self.pooler.name): + self.pooler.build(None) class TFLongformerPreTrainedModel(TFPreTrainedModel): @@ -2043,6 +2164,13 @@ def call( ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "longformer", None) is not None: + with tf.name_scope(self.longformer.name): + self.longformer.build(None) @add_start_docstrings( @@ -2127,6 +2255,16 @@ def call( attentions=outputs.attentions, global_attentions=outputs.global_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "longformer", None) is not None: + with tf.name_scope(self.longformer.name): + self.longformer.build(None) + if getattr(self, "lm_head", None) is not None: + with tf.name_scope(self.lm_head.name): + self.lm_head.build(None) @add_start_docstrings( @@ -2150,6 +2288,7 @@ def __init__(self, config, *inputs, **kwargs): kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs", ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(LONGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -2257,6 +2396,16 @@ def call( attentions=outputs.attentions, global_attentions=outputs.global_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "longformer", None) is not None: + with tf.name_scope(self.longformer.name): + self.longformer.build(None) + if getattr(self, "qa_outputs", None) is not None: + with tf.name_scope(self.qa_outputs.name): + self.qa_outputs.build(self.config.hidden_size) class TFLongformerClassificationHead(tf.keras.layers.Layer): @@ -2274,6 +2423,7 @@ def __init__(self, config, **kwargs): self.out_proj = 
tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" ) + self.config = config def call(self, hidden_states, training=False): hidden_states = hidden_states[:, 0, :] # take token (equiv. to [CLS]) @@ -2282,6 +2432,16 @@ def call(self, hidden_states, training=False): hidden_states = self.dropout(hidden_states, training=training) output = self.out_proj(hidden_states) return output + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build(self.config.hidden_size) @add_start_docstrings( @@ -2385,6 +2545,16 @@ def call( attentions=outputs.attentions, global_attentions=outputs.global_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "longformer", None) is not None: + with tf.name_scope(self.longformer.name): + self.longformer.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(None) @add_start_docstrings( @@ -2406,6 +2576,7 @@ def __init__(self, config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) + self.config = config @property def input_signature(self): @@ -2499,6 +2670,16 @@ def call( attentions=outputs.attentions, global_attentions=outputs.global_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "longformer", None) is not None: + with tf.name_scope(self.longformer.name): + self.longformer.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) @add_start_docstrings( @@ -2522,6 +2703,7 @@ def __init__(self, config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(LONGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -2579,3 +2761,13 @@ def call( attentions=outputs.attentions, global_attentions=outputs.global_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "longformer", None) is not None: + with tf.name_scope(self.longformer.name): + self.longformer.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) diff --git a/src/transformers/models/lxmert/modeling_tf_lxmert.py b/src/transformers/models/lxmert/modeling_tf_lxmert.py index f52dec26b33ec8..532334aa038d7a 100644 --- a/src/transformers/models/lxmert/modeling_tf_lxmert.py +++ b/src/transformers/models/lxmert/modeling_tf_lxmert.py @@ -176,6 +176,7 @@ def __init__(self, config, **kwargs): self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) self.feat_dim = config.visual_feat_dim self.pos_dim = config.visual_pos_dim + self.config = config def call(self, visn_input, training=False): feats, boxes = visn_input @@ -188,6 +189,22 @@ def call(self, visn_input, training=False): output = self.dropout(output, training=training) return 
output + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "visn_fc", None) is not None: + with tf.name_scope(self.visn_fc.name): + self.visn_fc.build(self.feat_dim) + if getattr(self, "visn_layer_norm", None) is not None: + with tf.name_scope(self.visn_layer_norm.name): + self.visn_layer_norm.build([None, None, self.config.hidden_size]) + if getattr(self, "box_fc", None) is not None: + with tf.name_scope(self.box_fc.name): + self.box_fc.build(self.pos_dim) + if getattr(self, "box_layer_norm", None) is not None: + with tf.name_scope(self.box_layer_norm.name): + self.box_layer_norm.build([None, None, self.config.hidden_size]) class TFLxmertEmbeddings(tf.keras.layers.Layer): @@ -203,7 +220,7 @@ def __init__(self, config, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def build(self, input_shape): + def build(self, input_shape=None): with tf.name_scope("word_embeddings"): self.weight = self.add_weight( name="weight", @@ -225,7 +242,14 @@ def build(self, input_shape): initializer=get_initializer(initializer_range=self.initializer_range), ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + def call(self, input_ids=None, token_type_ids=None, inputs_embeds=None, training=False): """ @@ -287,6 +311,7 @@ def __init__(self, config, **kwargs): self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) self.ctx_dim = config.hidden_size + self.config = config def transpose_for_scores(self, x, batch_size): # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] @@ -330,6 +355,19 @@ def call(self, hidden_states, context, attention_mask, output_attentions, traini outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build(self.config.hidden_size) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build(self.ctx_dim) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build(self.ctx_dim) class TFLxmertIntermediate(tf.keras.layers.Layer): @@ -344,11 +382,19 @@ def __init__(self, config, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act + self.config = config def call(self, hidden_states): hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) class TFLxmertOutput(tf.keras.layers.Layer): @@ -362,12 +408,23 @@ def __init__(self, config, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.config = config def call(self, hidden_states, input_tensor, training=False): 
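# What the arguments passed to the sublayer build() calls mean, sketched with
# toy sizes (assumptions for illustration, not transformers code). With
# tf.keras, Dense only reads the last input dimension, and a bare int is
# treated as a single-dimension shape, so build(768) behaves like build((768,)).
import tensorflow as tf

dense = tf.keras.layers.Dense(32, name="dense")
dense.build(768)  # last input dim = 768
print(dense.kernel.shape)  # (768, 32)

# LayerNormalization with the default axis=-1 also only needs the last
# dimension; the leading batch and sequence dimensions can stay None.
norm = tf.keras.layers.LayerNormalization(name="LayerNorm")
norm.build([None, None, 768])
print(norm.gamma.shape, norm.beta.shape)  # (768,) (768,)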
hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states, training) hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.intermediate_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) class TFLxmertAttentionOutput(tf.keras.layers.Layer): @@ -380,12 +437,23 @@ def __init__(self, config, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.config = config def call(self, hidden_states, input_tensor, training=False): hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states, training=training) hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) class TFLxmertSelfAttentionLayer(tf.keras.layers.Layer): @@ -401,6 +469,16 @@ def call(self, input_tensor, attention_mask, output_attentions, training=False): attention_probs = self_output[1] attention_output = self.attention_output(self_output[0], input_tensor) return (attention_output, attention_probs) if output_attentions else (attention_output,) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self", None) is not None: + with tf.name_scope(self.self.name): + self.self.build(None) + if getattr(self, "attention_output", None) is not None: + with tf.name_scope(self.attention_output.name): + self.attention_output.build(None) class TFLxmertCrossAttentionLayer(tf.keras.layers.Layer): @@ -423,6 +501,16 @@ def call( attention_output = self.attention_output(output[0], input_tensor, training=training) outputs = (attention_output, attention_probs) if output_attentions else (attention_output,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "att", None) is not None: + with tf.name_scope(self.att.name): + self.att.build(None) + if getattr(self, "attention_output", None) is not None: + with tf.name_scope(self.attention_output.name): + self.attention_output.build(None) class TFLxmertLayer(tf.keras.layers.Layer): @@ -439,6 +527,19 @@ def call(self, hidden_states, attention_mask, output_attentions, training=False) layer_output = self.transformer_output(intermediate_output, attention_output, training=training) outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "transformer_output", None) is not None: + with 
tf.name_scope(self.transformer_output.name): + self.transformer_output.build(None) class TFLxmertXLayer(tf.keras.layers.Layer): @@ -544,6 +645,31 @@ def call( lang_output, visn_output = self.output_fc(lang_att_output, visn_att_output, training=training) return (lang_output, visn_output, attention_probs[0]) if output_attentions else (lang_output, visn_output) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "visual_attention", None) is not None: + with tf.name_scope(self.visual_attention.name): + self.visual_attention.build(None) + if getattr(self, "lang_self_att", None) is not None: + with tf.name_scope(self.lang_self_att.name): + self.lang_self_att.build(None) + if getattr(self, "visn_self_att", None) is not None: + with tf.name_scope(self.visn_self_att.name): + self.visn_self_att.build(None) + if getattr(self, "lang_inter", None) is not None: + with tf.name_scope(self.lang_inter.name): + self.lang_inter.build(None) + if getattr(self, "lang_output", None) is not None: + with tf.name_scope(self.lang_output.name): + self.lang_output.build(None) + if getattr(self, "visn_inter", None) is not None: + with tf.name_scope(self.visn_inter.name): + self.visn_inter.build(None) + if getattr(self, "visn_output", None) is not None: + with tf.name_scope(self.visn_output.name): + self.visn_output.build(None) class TFLxmertEncoder(tf.keras.layers.Layer): @@ -633,6 +759,25 @@ def call( lang_encoder_outputs, cross_encoder_attentions if output_attentions else None, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "visn_fc", None) is not None: + with tf.name_scope(self.visn_fc.name): + self.visn_fc.build(None) + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) + if getattr(self, "x_layers", None) is not None: + for layer in self.x_layers: + with tf.name_scope(layer.name): + layer.build(None) + if getattr(self, "r_layers", None) is not None: + for layer in self.r_layers: + with tf.name_scope(layer.name): + layer.build(None) @keras_serializable @@ -773,6 +918,19 @@ def call( vision_attentions=vision_attentions if output_attentions else None, cross_encoder_attentions=cross_encoder_attentions if output_attentions else None, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "pooler", None) is not None: + with tf.name_scope(self.pooler.name): + self.pooler.build(None) class TFLxmertPreTrainedModel(TFPreTrainedModel): @@ -968,6 +1126,13 @@ def call( ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "lxmert", None) is not None: + with tf.name_scope(self.lxmert.name): + self.lxmert.build(None) class TFLxmertPooler(tf.keras.layers.Layer): @@ -979,6 +1144,7 @@ def __init__(self, config, **kwargs): activation="tanh", name="dense", ) + self.config = config def call(self, hidden_states): # We "pool" the model by simply taking the hidden state corresponding @@ -986,6 +1152,13 @@ def call(self, hidden_states): first_token_tensor = hidden_states[:, 0] pooled_output = self.dense(first_token_tensor) return pooled_output + def build(self, input_shape=None): + if self.built: + return + 
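# Why every sublayer build above is wrapped in tf.name_scope(sublayer.name):
# calling build() directly does not enter the layer's own name scope, so the
# created weights lose the "<layer_name>/" prefix that name-based checkpoint
# loading expects. A toy comparison (the printed names are what tf.keras
# typically produces and are shown here as assumptions):
import tensorflow as tf

plain = tf.keras.layers.Dense(4, name="dense")
plain.build((None, 8))
print(plain.kernel.name)  # e.g. "kernel:0" -- no "dense/" prefix

scoped = tf.keras.layers.Dense(4, name="dense")
with tf.name_scope(scoped.name):
    scoped.build((None, 8))
print(scoped.kernel.name)  # e.g. "dense/kernel:0" -- matches what __call__ would create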
self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) # Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->Lxmert @@ -1005,6 +1178,7 @@ def __init__(self, config: LxmertConfig, **kwargs): self.transform_act_fn = config.hidden_act self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -1012,6 +1186,16 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.LayerNorm(inputs=hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) # Copied from transformers.models.bert.modeling_tf_bert.TFBertLMPredictionHead with Bert->Lxmert @@ -1028,10 +1212,17 @@ def __init__(self, config: LxmertConfig, input_embeddings: tf.keras.layers.Layer # an output-only bias for each token. self.input_embeddings = input_embeddings - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "transform", None) is not None: + with tf.name_scope(self.transform.name): + self.transform.build(None) + def get_output_embeddings(self) -> tf.keras.layers.Layer: return self.input_embeddings @@ -1081,11 +1272,22 @@ def __init__(self, config, input_embeddings, **kwargs): kernel_initializer=get_initializer(config.initializer_range), name="seq_relationship", ) + self.config = config def call(self, sequence_output, pooled_output): prediction_scores = self.predictions(sequence_output) seq_relationship_score = self.seq_relationship(pooled_output) return prediction_scores, seq_relationship_score + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "predictions", None) is not None: + with tf.name_scope(self.predictions.name): + self.predictions.build(None) + if getattr(self, "seq_relationship", None) is not None: + with tf.name_scope(self.seq_relationship.name): + self.seq_relationship.build(self.config.hidden_size) class TFLxmertVisualAnswerHead(tf.keras.layers.Layer): @@ -1112,6 +1314,19 @@ def call(self, hidden_states): hidden_states = self.dense_1(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(None) # TODO Matt might be wrong + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build(None) # TODO Matt might be wrong + if getattr(self, "dense_1", None) is not None: + with tf.name_scope(self.dense_1.name): + self.dense_1.build(None) # TODO Matt might be wrong class TFLxmertVisualObjHead(tf.keras.layers.Layer): @@ -1146,6 +1361,16 @@ def call(self, hidden_states): for key in self.visual_losses: output[key] = self.decoder_dict[key](hidden_states) return 
output + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transform", None) is not None: + with tf.name_scope(self.transform.name): + self.transform.build(None) + if getattr(self, "decoder_dict", None) is not None: + with tf.name_scope(self.decoder_dict.name): + self.decoder_dict.build(None) # TODO Matt might be wrong @add_start_docstrings("""Lxmert Model with a `language modeling` head on top.""", LXMERT_START_DOCSTRING) @@ -1390,3 +1615,13 @@ def call( vision_attentions=lxmert_output.vision_attentions, cross_encoder_attentions=lxmert_output.cross_encoder_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "lxmert", None) is not None: + with tf.name_scope(self.lxmert.name): + self.lxmert.build(None) + if getattr(self, "cls", None) is not None: + with tf.name_scope(self.cls.name): + self.cls.build(None) diff --git a/src/transformers/models/marian/modeling_tf_marian.py b/src/transformers/models/marian/modeling_tf_marian.py index 76235b5f0f705c..e0f415c91dfe4d 100644 --- a/src/transformers/models/marian/modeling_tf_marian.py +++ b/src/transformers/models/marian/modeling_tf_marian.py @@ -327,6 +327,22 @@ def call( attn_weights: tf.Tensor = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) return attn_output, attn_weights, past_key_value + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "k_proj", None) is not None: + with tf.name_scope(self.k_proj.name): + self.k_proj.build(self.embed_dim) + if getattr(self, "q_proj", None) is not None: + with tf.name_scope(self.q_proj.name): + self.q_proj.build(self.embed_dim) + if getattr(self, "v_proj", None) is not None: + with tf.name_scope(self.v_proj.name): + self.v_proj.build(self.embed_dim) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build(self.embed_dim) # Copied from transformers.models.bart.modeling_tf_bart.TFBartEncoderLayer with Bart->Marian @@ -344,6 +360,7 @@ def __init__(self, config: MarianConfig, **kwargs): self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.config = config def call( self, @@ -384,6 +401,25 @@ def call( hidden_states = self.final_layer_norm(hidden_states) return hidden_states, self_attn_weights + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "self_attn_layer_norm", None) is not None: + with tf.name_scope(self.self_attn_layer_norm.name): + self.self_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build(self.embed_dim) + if getattr(self, "fc2", None) is not None: + with tf.name_scope(self.fc2.name): + self.fc2.build(self.config.encoder_ffn_dim) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.embed_dim]) # Copied from transformers.models.bart.modeling_tf_bart.TFBartDecoderLayer with Bart->Marian @@ -414,6 +450,7 @@ def __init__(self, config: MarianConfig, **kwargs): self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, 
name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.config = config def call( self, @@ -494,6 +531,31 @@ def call( cross_attn_weights, present_key_value, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "self_attn_layer_norm", None) is not None: + with tf.name_scope(self.self_attn_layer_norm.name): + self.self_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "encoder_attn", None) is not None: + with tf.name_scope(self.encoder_attn.name): + self.encoder_attn.build(None) + if getattr(self, "encoder_attn_layer_norm", None) is not None: + with tf.name_scope(self.encoder_attn_layer_norm.name): + self.encoder_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build(self.embed_dim) + if getattr(self, "fc2", None) is not None: + with tf.name_scope(self.fc2.name): + self.fc2.build(self.config.decoder_ffn_dim) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.embed_dim]) class TFMarianPreTrainedModel(TFPreTrainedModel): @@ -805,6 +867,17 @@ def call( return TFBaseModelOutput( last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embed_positions", None) is not None: + with tf.name_scope(self.embed_positions.name): + self.embed_positions.build(None) + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) @keras_serializable @@ -1037,6 +1110,17 @@ def call( attentions=all_self_attns, cross_attentions=all_cross_attns, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embed_positions", None) is not None: + with tf.name_scope(self.embed_positions.name): + self.embed_positions.build(None) + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) @keras_serializable @@ -1148,6 +1232,19 @@ def call( encoder_hidden_states=encoder_outputs.hidden_states, encoder_attentions=encoder_outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "decoder", None) is not None: + with tf.name_scope(self.decoder.name): + self.decoder.build(None) + if getattr(self, "shared", None) is not None: + with tf.name_scope(self.shared.name): + self.shared.build(None) # TODO Matt might be wrong @add_start_docstrings( @@ -1235,6 +1332,13 @@ def serving_output(self, output): encoder_hidden_states=enc_hs, encoder_attentions=enc_attns, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "model", None) is not None: + with tf.name_scope(self.model.name): + self.model.build(None) # Copied from transformers.models.bart.modeling_tf_bart.BiasLayer @@ -1443,3 +1547,13 @@ def prepare_inputs_for_generation( def prepare_decoder_input_ids_from_labels(self, labels: 
tf.Tensor): return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "model", None) is not None: + with tf.name_scope(self.model.name): + self.model.build(None) + if getattr(self, "bias_layer", None) is not None: + with tf.name_scope(self.bias_layer.name): + self.bias_layer.build(None) # TODO Matt might be wrong diff --git a/src/transformers/models/mbart/modeling_tf_mbart.py b/src/transformers/models/mbart/modeling_tf_mbart.py index d786cd0b3ad882..5cf86ff21aa604 100644 --- a/src/transformers/models/mbart/modeling_tf_mbart.py +++ b/src/transformers/models/mbart/modeling_tf_mbart.py @@ -296,6 +296,22 @@ def call( attn_weights: tf.Tensor = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) return attn_output, attn_weights, past_key_value + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "k_proj", None) is not None: + with tf.name_scope(self.k_proj.name): + self.k_proj.build(self.embed_dim) + if getattr(self, "q_proj", None) is not None: + with tf.name_scope(self.q_proj.name): + self.q_proj.build(self.embed_dim) + if getattr(self, "v_proj", None) is not None: + with tf.name_scope(self.v_proj.name): + self.v_proj.build(self.embed_dim) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build(self.embed_dim) class TFMBartEncoderLayer(tf.keras.layers.Layer): @@ -312,6 +328,7 @@ def __init__(self, config: MBartConfig, **kwargs): self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.config = config def call( self, @@ -352,6 +369,25 @@ def call( hidden_states = residual + hidden_states return hidden_states, self_attn_weights + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "self_attn_layer_norm", None) is not None: + with tf.name_scope(self.self_attn_layer_norm.name): + self.self_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build(self.embed_dim) + if getattr(self, "fc2", None) is not None: + with tf.name_scope(self.fc2.name): + self.fc2.build(self.config.encoder_ffn_dim) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.embed_dim]) class TFMBartDecoderLayer(tf.keras.layers.Layer): @@ -381,6 +417,7 @@ def __init__(self, config: MBartConfig, **kwargs): self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.config = config def call( self, @@ -461,6 +498,31 @@ def call( cross_attn_weights, present_key_value, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "self_attn_layer_norm", None) is not None: + with 
tf.name_scope(self.self_attn_layer_norm.name): + self.self_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "encoder_attn", None) is not None: + with tf.name_scope(self.encoder_attn.name): + self.encoder_attn.build(None) + if getattr(self, "encoder_attn_layer_norm", None) is not None: + with tf.name_scope(self.encoder_attn_layer_norm.name): + self.encoder_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build(self.embed_dim) + if getattr(self, "fc2", None) is not None: + with tf.name_scope(self.fc2.name): + self.fc2.build(self.config.decoder_ffn_dim) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.embed_dim]) class TFMBartPreTrainedModel(TFPreTrainedModel): @@ -801,6 +863,23 @@ def call( return TFBaseModelOutput( last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embed_positions", None) is not None: + with tf.name_scope(self.embed_positions.name): + self.embed_positions.build(None) + if getattr(self, "layernorm_embedding", None) is not None: + with tf.name_scope(self.layernorm_embedding.name): + self.layernorm_embedding.build([None, None, self.embed_dim]) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.d_model]) + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) @keras_serializable @@ -1040,6 +1119,23 @@ def call( attentions=all_self_attns, cross_attentions=all_cross_attns, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embed_positions", None) is not None: + with tf.name_scope(self.embed_positions.name): + self.embed_positions.build(None) + if getattr(self, "layernorm_embedding", None) is not None: + with tf.name_scope(self.layernorm_embedding.name): + self.layernorm_embedding.build([None, None, self.config.d_model]) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.d_model]) + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) @keras_serializable @@ -1154,6 +1250,19 @@ def call( encoder_hidden_states=encoder_outputs.hidden_states, encoder_attentions=encoder_outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "decoder", None) is not None: + with tf.name_scope(self.decoder.name): + self.decoder.build(None) + if getattr(self, "shared", None) is not None: + with tf.name_scope(self.shared.name): + self.shared.build(None) # TODO Matt might be wrong @add_start_docstrings( @@ -1241,6 +1350,13 @@ def serving_output(self, output): encoder_hidden_states=enc_hs, encoder_attentions=enc_attns, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "model", None) is not None: + with tf.name_scope(self.model.name): + self.model.build(None) # Copied from transformers.models.bart.modeling_tf_bart.BiasLayer @@ 
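# Sketch of the loop used for the encoder/decoder stacks above: container
# layers keep their children in a plain Python list, so build() walks the list
# and builds each child under its own name scope. Toy sizes and names, not
# transformers code.
import tensorflow as tf


class ToyStack(tf.keras.layers.Layer):
    def __init__(self, num_layers=2, hidden_size=16, **kwargs):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.layers = [tf.keras.layers.Dense(hidden_size, name=f"layers.{i}") for i in range(num_layers)]

    def call(self, hidden_states):
        for layer in self.layers:
            hidden_states = layer(hidden_states)
        return hidden_states

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "layers", None) is not None:
            for layer in self.layers:
                with tf.name_scope(layer.name):
                    layer.build((None, self.hidden_size))


stack = ToyStack(name="toy_stack")
stack.build(None)
print(len(stack.weights))  # 4: kernel and bias for each of the two Dense layers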
-1447,3 +1563,13 @@ def prepare_inputs_for_generation( def prepare_decoder_input_ids_from_labels(self, labels: tf.Tensor): return shift_tokens_right(labels, self.config.pad_token_id) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "model", None) is not None: + with tf.name_scope(self.model.name): + self.model.build(None) + if getattr(self, "bias_layer", None) is not None: + with tf.name_scope(self.bias_layer.name): + self.bias_layer.build(None) # TODO Matt might be wrong diff --git a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py index f0b537c465d05f..b01b308cb9bae1 100644 --- a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py @@ -130,12 +130,20 @@ def __init__(self, config, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act + self.config = config def call(self, hidden_states): hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.true_hidden_size) class TFLayerNorm(tf.keras.layers.LayerNormalization): @@ -182,7 +190,7 @@ def __init__(self, config, **kwargs): self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.embedded_input_size = self.embedding_size * (3 if self.trigram_input else 1) - def build(self, input_shape): + def build(self, input_shape=None): with tf.name_scope("word_embeddings"): self.weight = self.add_weight( name="weight", @@ -204,7 +212,17 @@ def build(self, input_shape): initializer=get_initializer(initializer_range=self.initializer_range), ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "embedding_transformation", None) is not None: + with tf.name_scope(self.embedding_transformation.name): + self.embedding_transformation.build(self.embedded_input_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build(None) + def call(self, input_ids=None, position_ids=None, token_type_ids=None, inputs_embeds=None, training=False): """ @@ -282,6 +300,7 @@ def __init__(self, config, **kwargs): ) self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) + self.config = config def transpose_for_scores(self, x, batch_size): # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] @@ -332,6 +351,20 @@ def call( outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build(self.config.true_hidden_size) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build(self.config.true_hidden_size) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build( + self.config.true_hidden_size if self.config.use_bottleneck_attention else self.config.hidden_size) class TFMobileBertSelfOutput(tf.keras.layers.Layer): @@ -346,6 +379,7 @@ def 
__init__(self, config, **kwargs): ) if not self.use_bottleneck: self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.config = config def call(self, hidden_states, residual_tensor, training=False): hidden_states = self.dense(hidden_states) @@ -353,6 +387,16 @@ def call(self, hidden_states, residual_tensor, training=False): hidden_states = self.dropout(hidden_states, training=training) hidden_states = self.LayerNorm(hidden_states + residual_tensor) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.true_hidden_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build(None) class TFMobileBertAttention(tf.keras.layers.Layer): @@ -382,6 +426,16 @@ def call( attention_output = self.mobilebert_output(self_outputs[0], layer_input, training=training) outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self", None) is not None: + with tf.name_scope(self.self.name): + self.self.build(None) + if getattr(self, "mobilebert_output", None) is not None: + with tf.name_scope(self.mobilebert_output.name): + self.mobilebert_output.build(None) class TFOutputBottleneck(tf.keras.layers.Layer): @@ -392,12 +446,23 @@ def __init__(self, config, **kwargs): config.hidden_size, epsilon=config.layer_norm_eps, name="LayerNorm" ) self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.config = config def call(self, hidden_states, residual_tensor, training=False): layer_outputs = self.dense(hidden_states) layer_outputs = self.dropout(layer_outputs, training=training) layer_outputs = self.LayerNorm(layer_outputs + residual_tensor) return layer_outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.true_hidden_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build(None) class TFMobileBertOutput(tf.keras.layers.Layer): @@ -414,6 +479,7 @@ def __init__(self, config, **kwargs): self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) else: self.bottleneck = TFOutputBottleneck(config, name="bottleneck") + self.config = config def call(self, hidden_states, residual_tensor_1, residual_tensor_2, training=False): hidden_states = self.dense(hidden_states) @@ -424,6 +490,16 @@ def call(self, hidden_states, residual_tensor_1, residual_tensor_2, training=Fal hidden_states = self.LayerNorm(hidden_states + residual_tensor_1) hidden_states = self.bottleneck(hidden_states, residual_tensor_2) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.intermediate_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build(None) class TFBottleneckLayer(tf.keras.layers.Layer): @@ -433,11 +509,22 @@ def __init__(self, config, **kwargs): self.LayerNorm = NORM2FN[config.normalization_type]( config.intra_bottleneck_size, epsilon=config.layer_norm_eps, name="LayerNorm" ) + self.config 
= config def call(self, inputs): hidden_states = self.dense(inputs) hidden_states = self.LayerNorm(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build(None) class TFBottleneck(tf.keras.layers.Layer): @@ -474,6 +561,13 @@ def call(self, hidden_states): return (shared_attention_input, shared_attention_input, hidden_states, bottlenecked_hidden_states) else: return (hidden_states, hidden_states, hidden_states, bottlenecked_hidden_states) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "bottleneck_input", None) is not None: + with tf.name_scope(self.bottleneck_input.name): + self.bottleneck_input.build(None) class TFFFNOutput(tf.keras.layers.Layer): @@ -483,11 +577,22 @@ def __init__(self, config, **kwargs): self.LayerNorm = NORM2FN[config.normalization_type]( config.true_hidden_size, epsilon=config.layer_norm_eps, name="LayerNorm" ) + self.config = config def call(self, hidden_states, residual_tensor): hidden_states = self.dense(hidden_states) hidden_states = self.LayerNorm(hidden_states + residual_tensor) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.intermediate_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build(None) class TFFFNLayer(tf.keras.layers.Layer): @@ -500,6 +605,16 @@ def call(self, hidden_states): intermediate_output = self.intermediate(hidden_states) layer_outputs = self.mobilebert_output(intermediate_output, hidden_states) return layer_outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "mobilebert_output", None) is not None: + with tf.name_scope(self.mobilebert_output.name): + self.mobilebert_output.build(None) class TFMobileBertLayer(tf.keras.layers.Layer): @@ -560,6 +675,19 @@ def call(self, hidden_states, attention_mask, head_mask, output_attentions, trai ) # add attentions if we output them return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "mobilebert_output", None) is not None: + with tf.name_scope(self.mobilebert_output.name): + self.mobilebert_output.build(None) class TFMobileBertEncoder(tf.keras.layers.Layer): @@ -603,6 +731,14 @@ def call( return TFBaseModelOutput( last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) class TFMobileBertPooler(tf.keras.layers.Layer): @@ -639,12 +775,23 @@ def __init__(self, config, 
**kwargs): else: self.transform_act_fn = config.hidden_act self.LayerNorm = NORM2FN["layer_norm"](config.hidden_size, epsilon=config.layer_norm_eps, name="LayerNorm") + self.config = config def call(self, hidden_states): hidden_states = self.dense(hidden_states) hidden_states = self.transform_act_fn(hidden_states) hidden_states = self.LayerNorm(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build(None) class TFMobileBertLMPredictionHead(tf.keras.layers.Layer): @@ -653,7 +800,7 @@ def __init__(self, config, **kwargs): self.transform = TFMobileBertPredictionHeadTransform(config, name="transform") self.config = config - def build(self, input_shape): + def build(self, input_shape=None): self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") self.dense = self.add_weight( shape=(self.config.hidden_size - self.config.embedding_size, self.config.vocab_size), @@ -667,7 +814,14 @@ def build(self, input_shape): trainable=True, name="decoder/weight", ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "transform", None) is not None: + with tf.name_scope(self.transform.name): + self.transform.build(None) + def get_output_embeddings(self): return self @@ -814,6 +968,19 @@ def call( hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "pooler", None) is not None: + with tf.name_scope(self.pooler.name): + self.pooler.build(None) class TFMobileBertPreTrainedModel(TFPreTrainedModel): @@ -998,6 +1165,13 @@ def call( ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "mobilebert", None) is not None: + with tf.name_scope(self.mobilebert.name): + self.mobilebert.build(None) @add_start_docstrings( @@ -1088,6 +1262,19 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "mobilebert", None) is not None: + with tf.name_scope(self.mobilebert.name): + self.mobilebert.build(None) + if getattr(self, "predictions", None) is not None: + with tf.name_scope(self.predictions.name): + self.predictions.build(None) # TODO Matt might be wrong + if getattr(self, "seq_relationship", None) is not None: + with tf.name_scope(self.seq_relationship.name): + self.seq_relationship.build(None) # TODO Matt might be wrong @add_start_docstrings("""MobileBert Model with a `language modeling` head on top.""", MOBILEBERT_START_DOCSTRING) @@ -1168,16 +1355,34 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "mobilebert", None) is not None: + with tf.name_scope(self.mobilebert.name): + self.mobilebert.build(None) + if getattr(self, 
"predictions", None) is not None: + with tf.name_scope(self.predictions.name): + self.predictions.build(None) # TODO Matt might be wrong class TFMobileBertOnlyNSPHead(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.seq_relationship = tf.keras.layers.Dense(2, name="seq_relationship") + self.config = config def call(self, pooled_output): seq_relationship_score = self.seq_relationship(pooled_output) return seq_relationship_score + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "seq_relationship", None) is not None: + with tf.name_scope(self.seq_relationship.name): + self.seq_relationship.build(self.config.hidden_size) @add_start_docstrings( @@ -1260,6 +1465,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "mobilebert", None) is not None: + with tf.name_scope(self.mobilebert.name): + self.mobilebert.build(None) + if getattr(self, "cls", None) is not None: + with tf.name_scope(self.cls.name): + self.cls.build(None) # TODO Matt might be wrong @add_start_docstrings( @@ -1291,6 +1506,7 @@ def __init__(self, config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1350,6 +1566,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "mobilebert", None) is not None: + with tf.name_scope(self.mobilebert.name): + self.mobilebert.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) @add_start_docstrings( @@ -1377,6 +1603,7 @@ def __init__(self, config, *inputs, **kwargs): self.qa_outputs = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1449,6 +1676,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "mobilebert", None) is not None: + with tf.name_scope(self.mobilebert.name): + self.mobilebert.build(None) + if getattr(self, "qa_outputs", None) is not None: + with tf.name_scope(self.qa_outputs.name): + self.qa_outputs.build(self.config.hidden_size) @add_start_docstrings( @@ -1476,6 +1713,7 @@ def __init__(self, config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward( @@ -1550,6 +1788,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "mobilebert", None) is not None: + with tf.name_scope(self.mobilebert.name): + self.mobilebert.build(None) + if getattr(self, "classifier", None) is not None: + with 
tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) @add_start_docstrings( @@ -1582,6 +1830,7 @@ def __init__(self, config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1639,3 +1888,13 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "mobilebert", None) is not None: + with tf.name_scope(self.mobilebert.name): + self.mobilebert.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) diff --git a/src/transformers/models/mobilevit/modeling_tf_mobilevit.py b/src/transformers/models/mobilevit/modeling_tf_mobilevit.py index 723a18a1a60034..ffbac12eb637c4 100644 --- a/src/transformers/models/mobilevit/modeling_tf_mobilevit.py +++ b/src/transformers/models/mobilevit/modeling_tf_mobilevit.py @@ -143,6 +143,13 @@ def call(self, features: tf.Tensor, training: bool = False) -> tf.Tensor: if self.activation is not None: features = self.activation(features) return features + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "convolution", None) is not None: + with tf.name_scope(self.convolution.name): + self.convolution.build(self.in_channels) class TFMobileViTInvertedResidual(tf.keras.layers.Layer): @@ -193,6 +200,19 @@ def call(self, features: tf.Tensor, training: bool = False) -> tf.Tensor: features = self.reduce_1x1(features, training=training) return residual + features if self.use_residual else features + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "expand_1x1", None) is not None: + with tf.name_scope(self.expand_1x1.name): + self.expand_1x1.build(None) + if getattr(self, "conv_3x3", None) is not None: + with tf.name_scope(self.conv_3x3.name): + self.conv_3x3.build(None) + if getattr(self, "reduce_1x1", None) is not None: + with tf.name_scope(self.reduce_1x1.name): + self.reduce_1x1.build(None) class TFMobileViTMobileNetLayer(tf.keras.layers.Layer): @@ -246,6 +266,7 @@ def __init__(self, config: MobileViTConfig, hidden_size: int, **kwargs) -> None: self.value = tf.keras.layers.Dense(self.all_head_size, use_bias=config.qkv_bias, name="value") self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) + self.hidden_size = hidden_size def transpose_for_scores(self, x: tf.Tensor) -> tf.Tensor: batch_size = tf.shape(x)[0] @@ -275,6 +296,19 @@ def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3]) context_layer = tf.reshape(context_layer, shape=(batch_size, -1, self.all_head_size)) return context_layer + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build(self.hidden_size) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build(self.hidden_size) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build(self.hidden_size) class 
TFMobileViTSelfOutput(tf.keras.layers.Layer): @@ -282,11 +316,19 @@ def __init__(self, config: MobileViTConfig, hidden_size: int, **kwargs) -> None: super().__init__(**kwargs) self.dense = tf.keras.layers.Dense(hidden_size, name="dense") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.hidden_size = hidden_size def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states, training=training) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.hidden_size) class TFMobileViTAttention(tf.keras.layers.Layer): @@ -302,6 +344,16 @@ def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: self_outputs = self.attention(hidden_states, training=training) attention_output = self.dense_output(self_outputs, training=training) return attention_output + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "dense_output", None) is not None: + with tf.name_scope(self.dense_output.name): + self.dense_output.build(None) class TFMobileViTIntermediate(tf.keras.layers.Layer): @@ -312,11 +364,19 @@ def __init__(self, config: MobileViTConfig, hidden_size: int, intermediate_size: self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act + self.hidden_size = hidden_size def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.hidden_size) class TFMobileViTOutput(tf.keras.layers.Layer): @@ -324,12 +384,20 @@ def __init__(self, config: MobileViTConfig, hidden_size: int, intermediate_size: super().__init__(**kwargs) self.dense = tf.keras.layers.Dense(hidden_size, name="dense") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.intermediate_size = intermediate_size def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states, training=training) hidden_states = hidden_states + input_tensor return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.intermediate_size) class TFMobileViTTransformerLayer(tf.keras.layers.Layer): @@ -344,6 +412,7 @@ def __init__(self, config: MobileViTConfig, hidden_size: int, intermediate_size: self.layernorm_after = tf.keras.layers.LayerNormalization( epsilon=config.layer_norm_eps, name="layernorm_after" ) + self.hidden_size = hidden_size def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: attention_output = self.attention(self.layernorm_before(hidden_states), training=training) @@ -353,6 +422,25 @@ def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: layer_output = self.intermediate(layer_output) 
layer_output = self.mobilevit_output(layer_output, hidden_states, training=training) return layer_output + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "mobilevit_output", None) is not None: + with tf.name_scope(self.mobilevit_output.name): + self.mobilevit_output.build(None) + if getattr(self, "layernorm_before", None) is not None: + with tf.name_scope(self.layernorm_before.name): + self.layernorm_before.build([None, None, self.hidden_size]) + if getattr(self, "layernorm_after", None) is not None: + with tf.name_scope(self.layernorm_after.name): + self.layernorm_after.build([None, None, self.hidden_size]) class TFMobileViTTransformer(tf.keras.layers.Layer): @@ -443,6 +531,7 @@ def __init__( kernel_size=config.conv_kernel_size, name="fusion", ) + self.hidden_size = hidden_size def unfolding(self, features: tf.Tensor) -> Tuple[tf.Tensor, Dict]: patch_width, patch_height = self.patch_width, self.patch_height @@ -540,6 +629,28 @@ def call(self, features: tf.Tensor, training: bool = False) -> tf.Tensor: features = self.conv_projection(features, training=training) features = self.fusion(tf.concat([residual, features], axis=-1), training=training) return features + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "conv_kxk", None) is not None: + with tf.name_scope(self.conv_kxk.name): + self.conv_kxk.build(None) + if getattr(self, "conv_1x1", None) is not None: + with tf.name_scope(self.conv_1x1.name): + self.conv_1x1.build(None) + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + if getattr(self, "layernorm", None) is not None: + with tf.name_scope(self.layernorm.name): + self.layernorm.build([None, None, self.hidden_size]) + if getattr(self, "conv_projection", None) is not None: + with tf.name_scope(self.conv_projection.name): + self.conv_projection.build(None) + if getattr(self, "fusion", None) is not None: + with tf.name_scope(self.fusion.name): + self.fusion.build(None) class TFMobileViTEncoder(tf.keras.layers.Layer): @@ -741,6 +852,19 @@ def call( pooler_output=pooled_output, hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "conv_stem", None) is not None: + with tf.name_scope(self.conv_stem.name): + self.conv_stem.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "pooler", None) is not None: + with tf.name_scope(self.pooler.name): + self.pooler.build(None) # TODO Matt might be wrong class TFMobileViTPreTrainedModel(TFPreTrainedModel): @@ -841,6 +965,13 @@ def call( ) -> Union[Tuple[tf.Tensor], TFBaseModelOutputWithPooling]: output = self.mobilevit(pixel_values, output_hidden_states, return_dict, training=training) return output + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "mobilevit", None) is not None: + with tf.name_scope(self.mobilevit.name): + self.mobilevit.build(None) @add_start_docstrings( @@ -901,6 +1032,16 @@ def call( return ((loss,) + output) if loss is 
not None else output return TFImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "mobilevit", None) is not None: + with tf.name_scope(self.mobilevit.name): + self.mobilevit.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(None) class TFMobileViTASPPPooling(tf.keras.layers.Layer): @@ -926,6 +1067,16 @@ def call(self, features: tf.Tensor, training: bool = False) -> tf.Tensor: features = self.conv_1x1(features, training=training) features = tf.image.resize(features, size=spatial_size, method="bilinear") return features + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "global_pool", None) is not None: + with tf.name_scope(self.global_pool.name): + self.global_pool.build(None) + if getattr(self, "conv_1x1", None) is not None: + with tf.name_scope(self.conv_1x1.name): + self.conv_1x1.build(None) class TFMobileViTASPP(tf.keras.layers.Layer): @@ -997,6 +1148,13 @@ def call(self, features: tf.Tensor, training: bool = False) -> tf.Tensor: pooled_features = self.project(pyramid, training=training) pooled_features = self.dropout(pooled_features, training=training) return pooled_features + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "project", None) is not None: + with tf.name_scope(self.project.name): + self.project.build(None) class TFMobileViTDeepLabV3(tf.keras.layers.Layer): @@ -1026,6 +1184,16 @@ def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: features = self.dropout(features, training=training) features = self.classifier(features, training=training) return features + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "aspp", None) is not None: + with tf.name_scope(self.aspp.name): + self.aspp.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(None) @add_start_docstrings( @@ -1139,3 +1307,13 @@ def call( logits=logits, hidden_states=outputs.hidden_states if output_hidden_states else None, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "mobilevit", None) is not None: + with tf.name_scope(self.mobilevit.name): + self.mobilevit.build(None) + if getattr(self, "segmentation_head", None) is not None: + with tf.name_scope(self.segmentation_head.name): + self.segmentation_head.build(None) diff --git a/src/transformers/models/mpnet/modeling_tf_mpnet.py b/src/transformers/models/mpnet/modeling_tf_mpnet.py index 2982899340d203..def2347c55eedf 100644 --- a/src/transformers/models/mpnet/modeling_tf_mpnet.py +++ b/src/transformers/models/mpnet/modeling_tf_mpnet.py @@ -91,7 +91,7 @@ def __init__(self, config, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): with tf.name_scope("word_embeddings"): self.weight = self.add_weight( name="weight", @@ -106,7 +106,14 @@ def build(self, input_shape: tf.TensorShape): initializer=get_initializer(initializer_range=self.initializer_range), ) - super().build(input_shape) + + if self.built: + return + self.built 
= True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + def create_position_ids_from_input_ids(self, input_ids): """ @@ -165,6 +172,7 @@ def __init__(self, config: MPNetConfig, **kwargs): activation="tanh", name="dense", ) + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: # We "pool" the model by simply taking the hidden state corresponding @@ -173,6 +181,13 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: pooled_output = self.dense(inputs=first_token_tensor) return pooled_output + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) class TFMPNetSelfAttention(tf.keras.layers.Layer): @@ -203,6 +218,7 @@ def __init__(self, config, **kwargs): config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="o" ) self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) + self.config = config def transpose_for_scores(self, x, batch_size): # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] @@ -246,6 +262,22 @@ def call(self, hidden_states, attention_mask, head_mask, output_attentions, posi outputs = (o, attention_probs) if output_attentions else (o,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "q", None) is not None: + with tf.name_scope(self.q.name): + self.q.build(self.config.hidden_size) + if getattr(self, "k", None) is not None: + with tf.name_scope(self.k.name): + self.k.build(self.config.hidden_size) + if getattr(self, "v", None) is not None: + with tf.name_scope(self.v.name): + self.v.build(self.config.hidden_size) + if getattr(self, "o", None) is not None: + with tf.name_scope(self.o.name): + self.o.build(self.config.hidden_size) class TFMPNetAttention(tf.keras.layers.Layer): @@ -255,6 +287,7 @@ def __init__(self, config, **kwargs): self.attn = TFMPNetSelfAttention(config, name="attn") self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.config = config def prune_heads(self, heads): raise NotImplementedError @@ -266,6 +299,16 @@ def call(self, input_tensor, attention_mask, head_mask, output_attentions, posit attention_output = self.LayerNorm(self.dropout(self_outputs[0]) + input_tensor) outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attn", None) is not None: + with tf.name_scope(self.attn.name): + self.attn.build(None) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->MPNet @@ -281,12 +324,20 @@ def __init__(self, config: MPNetConfig, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return 
hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->MPNet @@ -299,6 +350,7 @@ def __init__(self, config: MPNetConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -306,6 +358,16 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.intermediate_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) class TFMPNetLayer(tf.keras.layers.Layer): @@ -328,6 +390,19 @@ def call(self, hidden_states, attention_mask, head_mask, output_attentions, posi outputs = (layer_output,) + outputs # add attentions if we output them return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "out", None) is not None: + with tf.name_scope(self.out.name): + self.out.build(None) class TFMPNetEncoder(tf.keras.layers.Layer): @@ -344,7 +419,7 @@ def __init__(self, config, **kwargs): self.layer = [TFMPNetLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)] self.relative_attention_num_buckets = config.relative_attention_num_buckets - def build(self, input_shape): + def build(self, input_shape=None): with tf.name_scope("relative_attention_bias"): self.relative_attention_bias = self.add_weight( name="embeddings", @@ -352,7 +427,15 @@ def build(self, input_shape): initializer=get_initializer(self.initializer_range), ) - return super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) + def call( self, @@ -560,6 +643,19 @@ def call( hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "pooler", None) is not None: + with tf.name_scope(self.pooler.name): + self.pooler.build(None) + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) MPNET_START_DOCSTRING = r""" @@ -692,6 +788,13 @@ def call( training=training, ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "mpnet", None) is not
None: + with tf.name_scope(self.mpnet.name): + self.mpnet.build(None) class TFMPNetLMHead(tf.keras.layers.Layer): @@ -712,10 +815,20 @@ def __init__(self, config, input_embeddings, **kwargs): # an output-only bias for each token. self.decoder = input_embeddings - def build(self, input_shape): + def build(self, input_shape=None): self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.hidden_size]) + def get_output_embeddings(self): return self.decoder @@ -815,6 +928,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "mpnet", None) is not None: + with tf.name_scope(self.mpnet.name): + self.mpnet.build(None) + if getattr(self, "lm_head", None) is not None: + with tf.name_scope(self.lm_head.name): + self.lm_head.build(None) class TFMPNetClassificationHead(tf.keras.layers.Layer): @@ -832,6 +955,7 @@ def __init__(self, config, **kwargs): self.out_proj = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" ) + self.config = config def call(self, features, training=False): x = features[:, 0, :] # take token (equiv. to [CLS]) @@ -840,6 +964,16 @@ def call(self, features, training=False): x = self.dropout(x, training=training) x = self.out_proj(x) return x + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build(self.config.hidden_size) @add_start_docstrings( @@ -912,6 +1046,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "mpnet", None) is not None: + with tf.name_scope(self.mpnet.name): + self.mpnet.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(None) @add_start_docstrings( @@ -930,6 +1074,7 @@ def __init__(self, config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @@ -998,6 +1143,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "mpnet", None) is not None: + with tf.name_scope(self.mpnet.name): + self.mpnet.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) @add_start_docstrings( @@ -1019,6 +1174,7 @@ def __init__(self, config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( config.num_labels, 
kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1072,6 +1228,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "mpnet", None) is not None: + with tf.name_scope(self.mpnet.name): + self.mpnet.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) @add_start_docstrings( @@ -1092,6 +1258,7 @@ def __init__(self, config, *inputs, **kwargs): self.qa_outputs = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(MPNET_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1159,3 +1326,13 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "mpnet", None) is not None: + with tf.name_scope(self.mpnet.name): + self.mpnet.build(None) + if getattr(self, "qa_outputs", None) is not None: + with tf.name_scope(self.qa_outputs.name): + self.qa_outputs.build(self.config.hidden_size) diff --git a/src/transformers/models/openai/modeling_tf_openai.py b/src/transformers/models/openai/modeling_tf_openai.py index 2fd5dbc79769b7..fe34c9566ae4b2 100644 --- a/src/transformers/models/openai/modeling_tf_openai.py +++ b/src/transformers/models/openai/modeling_tf_openai.py @@ -152,6 +152,16 @@ def call(self, x, attention_mask, head_mask, output_attentions, training=False): outputs = [a] + attn_outputs[1:] return outputs # a, (attentions) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "c_attn", None) is not None: + with tf.name_scope(self.c_attn.name): + self.c_attn.build(None) + if getattr(self, "c_proj", None) is not None: + with tf.name_scope(self.c_proj.name): + self.c_proj.build(None) class TFMLP(tf.keras.layers.Layer): @@ -168,6 +178,16 @@ def call(self, x, training=False): h2 = self.c_proj(h) h2 = self.dropout(h2, training=training) return h2 + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "c_fc", None) is not None: + with tf.name_scope(self.c_fc.name): + self.c_fc.build(None) + if getattr(self, "c_proj", None) is not None: + with tf.name_scope(self.c_proj.name): + self.c_proj.build(None) class TFBlock(tf.keras.layers.Layer): @@ -190,6 +210,22 @@ def call(self, x, attention_mask, head_mask, output_attentions, training=False): outputs = [h] + output_attn[1:] return outputs # x, (attentions) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attn", None) is not None: + with tf.name_scope(self.attn.name): + self.attn.build(None) + if getattr(self, "ln_1", None) is not None: + with tf.name_scope(self.ln_1.name): + self.ln_1.build([None, None, self.nx]) + if getattr(self, "mlp", None) is not None: + with tf.name_scope(self.mlp.name): + self.mlp.build(None) + if getattr(self, "ln_2", None) is not None: + with tf.name_scope(self.ln_2.name): + self.ln_2.build([None, None, self.nx]) @keras_serializable @@ -214,7 +250,7 @@ def __init__(self, config, *inputs, **kwargs): 
self.drop = tf.keras.layers.Dropout(config.embd_pdrop) self.h = [TFBlock(config, scale=True, name=f"h_._{i}") for i in range(config.n_layer)] - def build(self, input_shape): + def build(self, input_shape=None): with tf.name_scope("positions_embed"): self.positions_embed = self.add_weight( name="embeddings", @@ -222,7 +258,18 @@ def build(self, input_shape): initializer=get_initializer(self.initializer_range), ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "tokens_embed", None) is not None: + with tf.name_scope(self.tokens_embed.name): + self.tokens_embed.build(None) + if getattr(self, "h", None) is not None: + for layer in self.h: + with tf.name_scope(layer.name): + layer.build(None) + def get_input_embeddings(self): return self.tokens_embed @@ -528,6 +575,13 @@ def call( training=training, ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) @add_start_docstrings( @@ -613,6 +667,13 @@ def call( def prepare_inputs_for_generation(self, inputs, **kwargs): return {"input_ids": inputs} + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) @add_start_docstrings( @@ -734,6 +795,16 @@ def input_signature(self): "attention_mask": tf.TensorSpec((None, None, None), tf.int32, name="attention_mask"), "mc_token_ids": tf.TensorSpec((None, None), tf.int32, name="token_type_ids"), } + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + if getattr(self, "multiple_choice_head", None) is not None: + with tf.name_scope(self.multiple_choice_head.name): + self.multiple_choice_head.build(None) @add_start_docstrings( @@ -762,6 +833,7 @@ def __init__(self, config, *inputs, **kwargs): use_bias=False, ) self.transformer = TFOpenAIGPTMainLayer(config, name="transformer") + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(OPENAI_GPT_INPUTS_DOCSTRING) @@ -849,3 +921,13 @@ def call( hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "score", None) is not None: + with tf.name_scope(self.score.name): + self.score.build(self.config.n_embd) + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) diff --git a/src/transformers/models/opt/modeling_tf_opt.py b/src/transformers/models/opt/modeling_tf_opt.py index 6c48d6e629273c..05efa387ed2ab4 100644 --- a/src/transformers/models/opt/modeling_tf_opt.py +++ b/src/transformers/models/opt/modeling_tf_opt.py @@ -267,6 +267,22 @@ def call( attn_weights: tf.Tensor = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) return attn_output, attn_weights, past_key_value + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "k_proj", None) is not None: + with tf.name_scope(self.k_proj.name): + self.k_proj.build(self.embed_dim) + if getattr(self, "q_proj", None) is not None: + with tf.name_scope(self.q_proj.name): + self.q_proj.build(self.embed_dim) + if 
getattr(self, "v_proj", None) is not None: + with tf.name_scope(self.v_proj.name): + self.v_proj.build(self.embed_dim) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build(self.embed_dim) class TFOPTDecoderLayer(tf.keras.layers.Layer): @@ -288,6 +304,7 @@ def __init__(self, config: OPTConfig, **kwargs): self.fc1 = tf.keras.layers.Dense(config.ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.config = config def call( self, @@ -353,6 +370,26 @@ def call( hidden_states = self.final_layer_norm(hidden_states) return (hidden_states, self_attn_weights, present_key_value) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn_layer_norm", None) is not None: + with tf.name_scope(self.self_attn_layer_norm.name): + self.self_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build(self.embed_dim) + if getattr(self, "fc2", None) is not None: + with tf.name_scope(self.fc2.name): + self.fc2.build(self.config.ffn_dim) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) # TODO Matt might be wrong OPT_START_DOCSTRING = r""" @@ -695,6 +732,20 @@ def call( hidden_states=all_hidden_states, attentions=all_self_attns, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embed_tokens", None) is not None: + with tf.name_scope(self.embed_tokens.name): + self.embed_tokens.build(None) + if getattr(self, "embed_positions", None) is not None: + with tf.name_scope(self.embed_positions.name): + self.embed_positions.build(None) + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) @keras_serializable @@ -756,6 +807,13 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "decoder", None) is not None: + with tf.name_scope(self.decoder.name): + self.decoder.build(None) @add_start_docstrings( @@ -840,6 +898,13 @@ def serving_output(self, output): hidden_states=hs, attentions=attns, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "model", None) is not None: + with tf.name_scope(self.model.name): + self.model.build(None) @add_start_docstrings( @@ -1006,3 +1071,10 @@ def serving_output(self, output): loss=output.loss, logits=output.logits, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "model", None) is not None: + with tf.name_scope(self.model.name): + self.model.build(None) diff --git a/src/transformers/models/pegasus/modeling_tf_pegasus.py b/src/transformers/models/pegasus/modeling_tf_pegasus.py index 52171b884ca825..fc2cd0ebad07b4 100644 --- a/src/transformers/models/pegasus/modeling_tf_pegasus.py +++ b/src/transformers/models/pegasus/modeling_tf_pegasus.py @@ -329,6 +329,22 @@ def call( attn_weights: tf.Tensor = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) 
return attn_output, attn_weights, past_key_value + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "k_proj", None) is not None: + with tf.name_scope(self.k_proj.name): + self.k_proj.build(self.embed_dim) + if getattr(self, "q_proj", None) is not None: + with tf.name_scope(self.q_proj.name): + self.q_proj.build(self.embed_dim) + if getattr(self, "v_proj", None) is not None: + with tf.name_scope(self.v_proj.name): + self.v_proj.build(self.embed_dim) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build(self.embed_dim) # Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartEncoderLayer with MBart->Pegasus @@ -346,6 +362,7 @@ def __init__(self, config: PegasusConfig, **kwargs): self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.config = config def call( self, @@ -386,6 +403,25 @@ def call( hidden_states = residual + hidden_states return hidden_states, self_attn_weights + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "self_attn_layer_norm", None) is not None: + with tf.name_scope(self.self_attn_layer_norm.name): + self.self_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build(self.embed_dim) + if getattr(self, "fc2", None) is not None: + with tf.name_scope(self.fc2.name): + self.fc2.build(self.config.encoder_ffn_dim) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.embed_dim]) # Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartDecoderLayer with MBart->Pegasus @@ -416,6 +452,7 @@ def __init__(self, config: PegasusConfig, **kwargs): self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.config = config def call( self, @@ -496,6 +533,31 @@ def call( cross_attn_weights, present_key_value, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "self_attn_layer_norm", None) is not None: + with tf.name_scope(self.self_attn_layer_norm.name): + self.self_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "encoder_attn", None) is not None: + with tf.name_scope(self.encoder_attn.name): + self.encoder_attn.build(None) + if getattr(self, "encoder_attn_layer_norm", None) is not None: + with tf.name_scope(self.encoder_attn_layer_norm.name): + self.encoder_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build(self.embed_dim) + if getattr(self, "fc2", None) is not None: + with tf.name_scope(self.fc2.name): + self.fc2.build(self.config.decoder_ffn_dim) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + 
self.final_layer_norm.build([None, None, self.embed_dim]) class TFPegasusPreTrainedModel(TFPreTrainedModel): @@ -811,6 +873,20 @@ def call( return TFBaseModelOutput( last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embed_positions", None) is not None: + with tf.name_scope(self.embed_positions.name): + self.embed_positions.build(None) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.d_model]) + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) @keras_serializable @@ -1046,6 +1122,20 @@ def call( attentions=all_self_attns, cross_attentions=all_cross_attns, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embed_positions", None) is not None: + with tf.name_scope(self.embed_positions.name): + self.embed_positions.build(None) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.d_model]) + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) @keras_serializable @@ -1157,6 +1247,19 @@ def call( encoder_hidden_states=encoder_outputs.hidden_states, encoder_attentions=encoder_outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "decoder", None) is not None: + with tf.name_scope(self.decoder.name): + self.decoder.build(None) + if getattr(self, "shared", None) is not None: + with tf.name_scope(self.shared.name): + self.shared.build(None) # TODO Matt might be wrong @add_start_docstrings( @@ -1244,6 +1347,13 @@ def serving_output(self, output): encoder_hidden_states=enc_hs, encoder_attentions=enc_attns, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "model", None) is not None: + with tf.name_scope(self.model.name): + self.model.build(None) # Copied from transformers.models.bart.modeling_tf_bart.BiasLayer @@ -1452,3 +1562,13 @@ def prepare_inputs_for_generation( def prepare_decoder_input_ids_from_labels(self, labels: tf.Tensor): return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "model", None) is not None: + with tf.name_scope(self.model.name): + self.model.build(None) + if getattr(self, "bias_layer", None) is not None: + with tf.name_scope(self.bias_layer.name): + self.bias_layer.build(None) # TODO Matt might be wrong diff --git a/src/transformers/models/rag/modeling_tf_rag.py b/src/transformers/models/rag/modeling_tf_rag.py index d1151bcd5a64b3..08de257419d4f9 100644 --- a/src/transformers/models/rag/modeling_tf_rag.py +++ b/src/transformers/models/rag/modeling_tf_rag.py @@ -1291,6 +1291,13 @@ def hf_compute_loss(self, labels, y_pred, smooth_epsilon=0.0, from_logits=True, loss = (1.0 - smooth_epsilon) * nll_loss + eps_i * smooth_loss return loss + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "rag", None) is not None: + with 
tf.name_scope(self.rag.name): + self.rag.build(None) @add_start_docstrings_to_model_forward( @@ -1743,3 +1750,10 @@ def _cat_and_pad(tensors, pad_token_id): output = tf.convert_to_tensor(output) return tf.cast(output, tensors[0][0][0].dtype) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "rag", None) is not None: + with tf.name_scope(self.rag.name): + self.rag.build(None) diff --git a/src/transformers/models/regnet/modeling_tf_regnet.py b/src/transformers/models/regnet/modeling_tf_regnet.py index f847d00fa09a3b..727666c77207e8 100644 --- a/src/transformers/models/regnet/modeling_tf_regnet.py +++ b/src/transformers/models/regnet/modeling_tf_regnet.py @@ -83,6 +83,17 @@ def call(self, hidden_state): hidden_state = self.normalization(hidden_state) hidden_state = self.activation(hidden_state) return hidden_state + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "convolution", None) is not None: + with tf.name_scope(self.convolution.name): + self.convolution.build(self.in_channels) + if getattr(self, "normalization", None) is not None: + with tf.name_scope(self.normalization.name): + self.normalization.build(None) class TFRegNetEmbeddings(tf.keras.layers.Layer): @@ -115,6 +126,13 @@ def call(self, pixel_values): pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1)) hidden_state = self.embedder(pixel_values) return hidden_state + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embedder", None) is not None: + with tf.name_scope(self.embedder.name): + self.embedder.build(None) class TFRegNetShortCut(tf.keras.layers.Layer): @@ -133,6 +151,16 @@ def __init__(self, in_channels: int, out_channels: int, stride: int = 2, **kwarg def call(self, inputs: tf.Tensor, training: bool = False) -> tf.Tensor: return self.normalization(self.convolution(inputs), training=training) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "convolution", None) is not None: + with tf.name_scope(self.convolution.name): + self.convolution.build(self.in_channels) + if getattr(self, "normalization", None) is not None: + with tf.name_scope(self.normalization.name): + self.normalization.build(None) class TFRegNetSELayer(tf.keras.layers.Layer): @@ -155,6 +183,17 @@ def call(self, hidden_state): pooled = layer_module(pooled) hidden_state = hidden_state * pooled return hidden_state + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "pooler", None) is not None: + with tf.name_scope(self.pooler.name): + self.pooler.build(None) + if getattr(self, "attention", None) is not None: + for layer in self.attention: + with tf.name_scope(layer.name): + layer.build(None) class TFRegNetXLayer(tf.keras.layers.Layer): @@ -189,6 +228,17 @@ def call(self, hidden_state): hidden_state += residual hidden_state = self.activation(hidden_state) return hidden_state + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "shortcut", None) is not None: + with tf.name_scope(self.shortcut.name): + self.shortcut.build(None) + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) class TFRegNetYLayer(tf.keras.layers.Layer): @@ -223,6 +273,17 @@ def call(self, hidden_state): hidden_state += residual hidden_state = self.activation(hidden_state) return hidden_state + def build(self, 
input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "shortcut", None) is not None: + with tf.name_scope(self.shortcut.name): + self.shortcut.build(None) + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) class TFRegNetStage(tf.keras.layers.Layer): @@ -246,6 +307,14 @@ def call(self, hidden_state): for layer_module in self.layers: hidden_state = layer_module(hidden_state) return hidden_state + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) class TFRegNetEncoder(tf.keras.layers.Layer): @@ -336,6 +405,19 @@ def call( pooler_output=pooled_output, hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embedder", None) is not None: + with tf.name_scope(self.embedder.name): + self.embedder.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "pooler", None) is not None: + with tf.name_scope(self.pooler.name): + self.pooler.build(None) class TFRegNetPreTrainedModel(TFPreTrainedModel): @@ -421,6 +503,13 @@ def call( pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "regnet", None) is not None: + with tf.name_scope(self.regnet.name): + self.regnet.build(None) @add_start_docstrings( @@ -483,3 +572,14 @@ def call( return ((loss,) + output) if loss is not None else output return TFSequenceClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "regnet", None) is not None: + with tf.name_scope(self.regnet.name): + self.regnet.build(None) + if getattr(self, "classifier", None) is not None: + for layer in self.classifier: + with tf.name_scope(layer.name): + layer.build(None) diff --git a/src/transformers/models/rembert/modeling_tf_rembert.py b/src/transformers/models/rembert/modeling_tf_rembert.py index 1595fd8118debd..de5a565cfb76ca 100644 --- a/src/transformers/models/rembert/modeling_tf_rembert.py +++ b/src/transformers/models/rembert/modeling_tf_rembert.py @@ -80,7 +80,7 @@ def __init__(self, config: RemBertConfig, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): with tf.name_scope("word_embeddings"): self.weight = self.add_weight( name="weight", @@ -102,7 +102,14 @@ def build(self, input_shape: tf.TensorShape): initializer=get_initializer(self.initializer_range), ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.input_embedding_size]) + def call( self, @@ -172,6 +179,7 @@ def __init__(self, config: RemBertConfig, **kwargs): self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.is_decoder = config.is_decoder + self.config = config def 
transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] @@ -260,6 +268,19 @@ def call( if self.is_decoder: outputs = outputs + (past_key_value,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build(self.config.hidden_size) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build(self.config.hidden_size) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build(self.config.hidden_size) # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->RemBert @@ -272,6 +293,7 @@ def __init__(self, config: RemBertConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -279,6 +301,16 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->RemBert @@ -320,6 +352,16 @@ def call( outputs = (attention_output,) + self_outputs[1:] return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attention", None) is not None: + with tf.name_scope(self.self_attention.name): + self.self_attention.build(None) + if getattr(self, "dense_output", None) is not None: + with tf.name_scope(self.dense_output.name): + self.dense_output.build(None) # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->RemBert @@ -335,12 +377,20 @@ def __init__(self, config: RemBertConfig, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->RemBert @@ -353,6 +403,7 @@ def __init__(self, config: RemBertConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: 
hidden_states = self.dense(inputs=hidden_states) @@ -360,6 +411,16 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.intermediate_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->RemBert @@ -447,6 +508,19 @@ def call( outputs = outputs + (present_key_value,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "bert_output", None) is not None: + with tf.name_scope(self.bert_output.name): + self.bert_output.build(None) class TFRemBertEncoder(tf.keras.layers.Layer): @@ -523,6 +597,17 @@ def call( attentions=all_attentions, cross_attentions=all_cross_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embedding_hidden_mapping_in", None) is not None: + with tf.name_scope(self.embedding_hidden_mapping_in.name): + self.embedding_hidden_mapping_in.build(self.config.input_embedding_size) + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->RemBert @@ -536,6 +621,7 @@ def __init__(self, config: RemBertConfig, **kwargs): activation="tanh", name="dense", ) + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: # We "pool" the model by simply taking the hidden state corresponding @@ -544,6 +630,13 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: pooled_output = self.dense(inputs=first_token_tensor) return pooled_output + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) class TFRemBertLMPredictionHead(tf.keras.layers.Layer): @@ -562,7 +655,7 @@ def __init__(self, config: RemBertConfig, input_embeddings: tf.keras.layers.Laye self.activation = config.hidden_act self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): self.decoder = self.add_weight( name="decoder/weight", shape=[self.config.vocab_size, self.output_embedding_size], @@ -572,7 +665,17 @@ def build(self, input_shape: tf.TensorShape): shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias" ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.output_embedding_size]) + 
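Every hunk in this diff applies the same recipe: give each layer an explicit `build(self, input_shape=None)` that returns early once built, flags `self.built = True`, stores `self.config = config` in `__init__` so shapes are available, and then builds every sublayer inside `tf.name_scope(sublayer.name)` with a shape derived from the config. The point, as far as these hunks show, appears to be creating all weights eagerly with the same nested names a real forward pass would produce, so checkpoints can be matched without running dummy inputs. Below is a minimal, self-contained sketch of that recipe, not code from the diff: `ExampleLayer`, `hidden_size=8`, and the list-form shapes passed to `build()` are illustrative placeholders (the diff itself sometimes passes a bare `config.hidden_size`).

import tensorflow as tf


class ExampleLayer(tf.keras.layers.Layer):
    """Illustrative stand-in for the model-specific layers changed in this diff."""

    def __init__(self, hidden_size: int, **kwargs):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.dense = tf.keras.layers.Dense(hidden_size, name="dense")
        self.LayerNorm = tf.keras.layers.LayerNormalization(name="LayerNorm")

    def build(self, input_shape=None):
        # Idempotent: a second call is a no-op, mirroring the guard in every hunk.
        if self.built:
            return
        self.built = True
        # Build each sublayer under its own name scope so its weights are
        # created inside "<sublayer>/...", matching what __call__ would do.
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                self.dense.build([None, None, self.hidden_size])
        if getattr(self, "LayerNorm", None) is not None:
            with tf.name_scope(self.LayerNorm.name):
                self.LayerNorm.build([None, None, self.hidden_size])

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        return self.LayerNorm(self.dense(hidden_states))


layer = ExampleLayer(hidden_size=8, name="example")
with tf.name_scope(layer.name):  # what a parent layer's build() would do around this call
    layer.build(None)            # all weights now exist without a dummy forward pass
print([w.name for w in layer.weights])  # names carry the enclosing name scopes

The `getattr(self, "...", None) is not None` guards keep the generated build() methods safe for subclasses or configurations where a given sublayer was never created; the hunks that cannot infer a correct input shape pass `None` and are marked with the author's "# TODO Matt might be wrong" notes.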
def get_output_embeddings(self) -> tf.keras.layers.Layer: return self @@ -799,6 +902,19 @@ def call( attentions=encoder_outputs.attentions, cross_attentions=encoder_outputs.cross_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "pooler", None) is not None: + with tf.name_scope(self.pooler.name): + self.pooler.build(None) class TFRemBertPreTrainedModel(TFPreTrainedModel): @@ -981,6 +1097,13 @@ def call( ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "rembert", None) is not None: + with tf.name_scope(self.rembert.name): + self.rembert.build(None) @add_start_docstrings("""RemBERT Model with a `language modeling` head on top.""", REMBERT_START_DOCSTRING) @@ -1053,6 +1176,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "rembert", None) is not None: + with tf.name_scope(self.rembert.name): + self.rembert.build(None) + if getattr(self, "mlm", None) is not None: + with tf.name_scope(self.mlm.name): + self.mlm.build(None) # TODO Matt might be wrong @add_start_docstrings( @@ -1169,6 +1302,16 @@ def call( attentions=outputs.attentions, cross_attentions=outputs.cross_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "rembert", None) is not None: + with tf.name_scope(self.rembert.name): + self.rembert.build(None) + if getattr(self, "mlm", None) is not None: + with tf.name_scope(self.mlm.name): + self.mlm.build(None) # TODO Matt might be wrong @add_start_docstrings( @@ -1190,6 +1333,7 @@ def __init__(self, config: RemBertConfig, *inputs, **kwargs): kernel_initializer=get_initializer(config.initializer_range), name="classifier", ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(REMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1245,6 +1389,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "rembert", None) is not None: + with tf.name_scope(self.rembert.name): + self.rembert.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) @add_start_docstrings( @@ -1263,6 +1417,7 @@ def __init__(self, config: RemBertConfig, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(REMBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @@ -1341,6 +1496,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "rembert", None) is not None: + with tf.name_scope(self.rembert.name): + self.rembert.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + 
self.classifier.build(self.config.hidden_size) @add_start_docstrings( @@ -1361,6 +1526,7 @@ def __init__(self, config: RemBertConfig, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(REMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1414,6 +1580,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "rembert", None) is not None: + with tf.name_scope(self.rembert.name): + self.rembert.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) @add_start_docstrings( @@ -1433,6 +1609,7 @@ def __init__(self, config: RemBertConfig, *inputs, **kwargs): self.qa_outputs = tf.keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(REMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1501,3 +1678,13 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "rembert", None) is not None: + with tf.name_scope(self.rembert.name): + self.rembert.build(None) + if getattr(self, "qa_outputs", None) is not None: + with tf.name_scope(self.qa_outputs.name): + self.qa_outputs.build(self.config.hidden_size) diff --git a/src/transformers/models/resnet/modeling_tf_resnet.py b/src/transformers/models/resnet/modeling_tf_resnet.py index 54d0edb399452f..3f7d48fb6034e7 100644 --- a/src/transformers/models/resnet/modeling_tf_resnet.py +++ b/src/transformers/models/resnet/modeling_tf_resnet.py @@ -81,6 +81,17 @@ def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_state = self.normalization(hidden_state, training=training) hidden_state = self.activation(hidden_state) return hidden_state + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "conv", None) is not None: + with tf.name_scope(self.conv.name): + self.conv.build( + self.in_channels) + if getattr(self, "normalization", None) is not None: + with tf.name_scope(self.normalization.name): + self.normalization.build(None) class TFResNetEmbeddings(tf.keras.layers.Layer): @@ -112,6 +123,16 @@ def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_state = tf.pad(hidden_state, [[0, 0], [1, 1], [1, 1], [0, 0]]) hidden_state = self.pooler(hidden_state) return hidden_state + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embedder", None) is not None: + with tf.name_scope(self.embedder.name): + self.embedder.build(None) + if getattr(self, "pooler", None) is not None: + with tf.name_scope(self.pooler.name): + self.pooler.build(None) class TFResNetShortCut(tf.keras.layers.Layer): @@ -134,6 +155,16 @@ def call(self, x: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_state = self.convolution(hidden_state) hidden_state = self.normalization(hidden_state, training=training) return hidden_state + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, 
"convolution", None) is not None: + with tf.name_scope(self.convolution.name): + self.convolution.build(self.in_channels) + if getattr(self, "normalization", None) is not None: + with tf.name_scope(self.normalization.name): + self.normalization.build(None) class TFResNetBasicLayer(tf.keras.layers.Layer): @@ -163,6 +194,19 @@ def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_state += residual hidden_state = self.activation(hidden_state) return hidden_state + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "shortcut", None) is not None: + with tf.name_scope(self.shortcut.name): + self.shortcut.build(None) + if getattr(self, "conv1", None) is not None: + with tf.name_scope(self.conv1.name): + self.conv1.build(None) # TODO Matt might be wrong + if getattr(self, "conv2", None) is not None: + with tf.name_scope(self.conv2.name): + self.conv2.build(None) # TODO Matt might be wrong class TFResNetBottleNeckLayer(tf.keras.layers.Layer): @@ -204,6 +248,22 @@ def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_state += residual hidden_state = self.activation(hidden_state) return hidden_state + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "shortcut", None) is not None: + with tf.name_scope(self.shortcut.name): + self.shortcut.build(None) + if getattr(self, "conv0", None) is not None: + with tf.name_scope(self.conv0.name): + self.conv0.build(None) # TODO Matt might be wrong + if getattr(self, "conv1", None) is not None: + with tf.name_scope(self.conv1.name): + self.conv1.build(None) # TODO Matt might be wrong + if getattr(self, "conv2", None) is not None: + with tf.name_scope(self.conv2.name): + self.conv2.build(None) # TODO Matt might be wrong class TFResNetStage(tf.keras.layers.Layer): @@ -272,6 +332,14 @@ def call( return tuple(v for v in [hidden_state, hidden_states] if v is not None) return TFBaseModelOutputWithNoAttention(last_hidden_state=hidden_state, hidden_states=hidden_states) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "stages", None) is not None: + for layer in self.stages: + with tf.name_scope(layer.name): + layer.build(None) class TFResNetPreTrainedModel(TFPreTrainedModel): @@ -372,6 +440,16 @@ def call( pooler_output=pooled_output, hidden_states=hidden_states, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embedder", None) is not None: + with tf.name_scope(self.embedder.name): + self.embedder.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) @add_start_docstrings( @@ -411,6 +489,13 @@ def call( training=training, ) return resnet_outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "resnet", None) is not None: + with tf.name_scope(self.resnet.name): + self.resnet.build(None) @add_start_docstrings( @@ -475,3 +560,13 @@ def call( return (loss,) + output if loss is not None else output return TFImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "resnet", None) is not None: + with tf.name_scope(self.resnet.name): + self.resnet.build(None) + if getattr(self, "classifier_layer", None) is not None: + with 
tf.name_scope(self.classifier_layer.name): + self.classifier_layer.build(None) # TODO Matt might be wrong diff --git a/src/transformers/models/roberta/modeling_tf_roberta.py b/src/transformers/models/roberta/modeling_tf_roberta.py index 9b6c491d2761e6..876c0e924f0590 100644 --- a/src/transformers/models/roberta/modeling_tf_roberta.py +++ b/src/transformers/models/roberta/modeling_tf_roberta.py @@ -89,7 +89,7 @@ def __init__(self, config, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): with tf.name_scope("word_embeddings"): self.weight = self.add_weight( name="weight", @@ -111,7 +111,14 @@ def build(self, input_shape: tf.TensorShape): initializer=get_initializer(self.initializer_range), ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + def create_position_ids_from_input_ids(self, input_ids, past_key_values_length=0): """ @@ -184,6 +191,7 @@ def __init__(self, config: RobertaConfig, **kwargs): activation="tanh", name="dense", ) + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: # We "pool" the model by simply taking the hidden state corresponding @@ -192,6 +200,13 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: pooled_output = self.dense(inputs=first_token_tensor) return pooled_output + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Roberta @@ -222,6 +237,7 @@ def __init__(self, config: RobertaConfig, **kwargs): self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.is_decoder = config.is_decoder + self.config = config def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] @@ -310,6 +326,19 @@ def call( if self.is_decoder: outputs = outputs + (past_key_value,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build(self.config.hidden_size) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build(self.config.hidden_size) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build(self.config.hidden_size) # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Roberta @@ -322,6 +351,7 @@ def __init__(self, config: RobertaConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -329,6 +359,16 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, 
training: bool hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Roberta @@ -370,6 +410,16 @@ def call( outputs = (attention_output,) + self_outputs[1:] return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attention", None) is not None: + with tf.name_scope(self.self_attention.name): + self.self_attention.build(None) + if getattr(self, "dense_output", None) is not None: + with tf.name_scope(self.dense_output.name): + self.dense_output.build(None) # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Roberta @@ -385,12 +435,20 @@ def __init__(self, config: RobertaConfig, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Roberta @@ -403,6 +461,7 @@ def __init__(self, config: RobertaConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -410,6 +469,16 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.intermediate_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Roberta @@ -497,6 +566,19 @@ def call( outputs = outputs + (present_key_value,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "bert_output", None) is not None: + with tf.name_scope(self.bert_output.name): + self.bert_output.build(None) # Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Roberta @@ -567,6 +649,14 @@ def call( 
attentions=all_attentions, cross_attentions=all_cross_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) @keras_serializable @@ -764,6 +854,19 @@ def call( attentions=encoder_outputs.attentions, cross_attentions=encoder_outputs.cross_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "pooler", None) is not None: + with tf.name_scope(self.pooler.name): + self.pooler.build(None) + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) class TFRobertaPreTrainedModel(TFPreTrainedModel): @@ -945,6 +1048,13 @@ def call( ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta", None) is not None: + with tf.name_scope(self.roberta.name): + self.roberta.build(None) class TFRobertaLMHead(tf.keras.layers.Layer): @@ -965,10 +1075,20 @@ def __init__(self, config, input_embeddings, **kwargs): # an output-only bias for each token. self.decoder = input_embeddings - def build(self, input_shape): + def build(self, input_shape=None): self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.hidden_size]) + def get_output_embeddings(self): return self.decoder @@ -1075,6 +1195,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta", None) is not None: + with tf.name_scope(self.roberta.name): + self.roberta.build(None) + if getattr(self, "lm_head", None) is not None: + with tf.name_scope(self.lm_head.name): + self.lm_head.build(None) class TFRobertaForCausalLM(TFRobertaPreTrainedModel, TFCausalLanguageModelingLoss): @@ -1197,6 +1327,16 @@ def call( attentions=outputs.attentions, cross_attentions=outputs.cross_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta", None) is not None: + with tf.name_scope(self.roberta.name): + self.roberta.build(None) + if getattr(self, "lm_head", None) is not None: + with tf.name_scope(self.lm_head.name): + self.lm_head.build(None) class TFRobertaClassificationHead(tf.keras.layers.Layer): @@ -1217,6 +1357,7 @@ def __init__(self, config, **kwargs): self.out_proj = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" ) + self.config = config def call(self, features, training=False): x = features[:, 0, :] # take token (equiv. 
to [CLS]) @@ -1225,6 +1366,16 @@ def call(self, features, training=False): x = self.dropout(x, training=training) x = self.out_proj(x) return x + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build(self.config.hidden_size) @add_start_docstrings( @@ -1301,6 +1452,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta", None) is not None: + with tf.name_scope(self.roberta.name): + self.roberta.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(None) @add_start_docstrings( @@ -1323,6 +1484,7 @@ def __init__(self, config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @@ -1391,6 +1553,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta", None) is not None: + with tf.name_scope(self.roberta.name): + self.roberta.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) @add_start_docstrings( @@ -1417,6 +1589,7 @@ def __init__(self, config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1474,6 +1647,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta", None) is not None: + with tf.name_scope(self.roberta.name): + self.roberta.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) @add_start_docstrings( @@ -1495,6 +1678,7 @@ def __init__(self, config, *inputs, **kwargs): self.qa_outputs = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1566,3 +1750,13 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta", None) is not None: + with tf.name_scope(self.roberta.name): + self.roberta.build(None) + if getattr(self, "qa_outputs", None) is not None: + with tf.name_scope(self.qa_outputs.name): + self.qa_outputs.build(self.config.hidden_size) diff --git a/src/transformers/models/roberta_prelayernorm/modeling_tf_roberta_prelayernorm.py 
b/src/transformers/models/roberta_prelayernorm/modeling_tf_roberta_prelayernorm.py index 2f98a5f5d0cff4..610b514cf38fda 100644 --- a/src/transformers/models/roberta_prelayernorm/modeling_tf_roberta_prelayernorm.py +++ b/src/transformers/models/roberta_prelayernorm/modeling_tf_roberta_prelayernorm.py @@ -94,7 +94,7 @@ def __init__(self, config, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): with tf.name_scope("word_embeddings"): self.weight = self.add_weight( name="weight", @@ -116,7 +116,14 @@ def build(self, input_shape: tf.TensorShape): initializer=get_initializer(self.initializer_range), ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + def create_position_ids_from_input_ids(self, input_ids, past_key_values_length=0): """ @@ -189,6 +196,7 @@ def __init__(self, config: RobertaPreLayerNormConfig, **kwargs): activation="tanh", name="dense", ) + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: # We "pool" the model by simply taking the hidden state corresponding @@ -197,6 +205,13 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: pooled_output = self.dense(inputs=first_token_tensor) return pooled_output + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->RobertaPreLayerNorm @@ -227,6 +242,7 @@ def __init__(self, config: RobertaPreLayerNormConfig, **kwargs): self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.is_decoder = config.is_decoder + self.config = config def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] @@ -315,6 +331,19 @@ def call( if self.is_decoder: outputs = outputs + (past_key_value,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build(self.config.hidden_size) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build(self.config.hidden_size) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build(self.config.hidden_size) class TFRobertaPreLayerNormSelfOutput(tf.keras.layers.Layer): @@ -325,6 +354,7 @@ def __init__(self, config: RobertaPreLayerNormConfig, **kwargs): units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -332,6 +362,13 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool hidden_states = hidden_states + input_tensor return hidden_states + 
def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) class TFRobertaPreLayerNormAttention(tf.keras.layers.Layer): @@ -341,6 +378,7 @@ def __init__(self, config: RobertaPreLayerNormConfig, **kwargs): self.self_attention = TFRobertaPreLayerNormSelfAttention(config, name="self") self.dense_output = TFRobertaPreLayerNormSelfOutput(config, name="output") self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.config = config # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention.prune_heads def prune_heads(self, heads): @@ -375,6 +413,19 @@ def call( outputs = (attention_output,) + self_outputs[1:] return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attention", None) is not None: + with tf.name_scope(self.self_attention.name): + self.self_attention.build(None) + if getattr(self, "dense_output", None) is not None: + with tf.name_scope(self.dense_output.name): + self.dense_output.build(None) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) class TFRobertaPreLayerNormIntermediate(tf.keras.layers.Layer): @@ -390,6 +441,7 @@ def __init__(self, config: RobertaPreLayerNormConfig, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.LayerNorm(inputs=hidden_states) @@ -397,6 +449,16 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) class TFRobertaPreLayerNormOutput(tf.keras.layers.Layer): @@ -407,6 +469,7 @@ def __init__(self, config: RobertaPreLayerNormConfig, **kwargs): units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -414,6 +477,13 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool hidden_states = hidden_states + input_tensor return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.intermediate_size) # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->RobertaPreLayerNorm @@ -501,6 +571,19 @@ def call( outputs = outputs + (present_key_value,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if 
getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "bert_output", None) is not None: + with tf.name_scope(self.bert_output.name): + self.bert_output.build(None) # Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->RobertaPreLayerNorm @@ -571,6 +654,14 @@ def call( attentions=all_attentions, cross_attentions=all_cross_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) @keras_serializable @@ -764,6 +855,22 @@ def call( attentions=encoder_outputs.attentions, cross_attentions=encoder_outputs.cross_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + if getattr(self, "pooler", None) is not None: + with tf.name_scope(self.pooler.name): + self.pooler.build(None) + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaPreTrainedModel with Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm @@ -947,6 +1054,13 @@ def call( ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta_prelayernorm", None) is not None: + with tf.name_scope(self.roberta_prelayernorm.name): + self.roberta_prelayernorm.build(None) # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaLMHead with Roberta->RobertaPreLayerNorm @@ -968,10 +1082,20 @@ def __init__(self, config, input_embeddings, **kwargs): # an output-only bias for each token. 
self.decoder = input_embeddings - def build(self, input_shape): + def build(self, input_shape=None): self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.hidden_size]) + def get_output_embeddings(self): return self.decoder @@ -1084,6 +1208,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta_prelayernorm", None) is not None: + with tf.name_scope(self.roberta_prelayernorm.name): + self.roberta_prelayernorm.build(None) + if getattr(self, "lm_head", None) is not None: + with tf.name_scope(self.lm_head.name): + self.lm_head.build(None) # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForCausalLM with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm @@ -1213,6 +1347,16 @@ def call( attentions=outputs.attentions, cross_attentions=outputs.cross_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta_prelayernorm", None) is not None: + with tf.name_scope(self.roberta_prelayernorm.name): + self.roberta_prelayernorm.build(None) + if getattr(self, "lm_head", None) is not None: + with tf.name_scope(self.lm_head.name): + self.lm_head.build(None) # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaClassificationHead with Roberta->RobertaPreLayerNorm @@ -1234,6 +1378,7 @@ def __init__(self, config, **kwargs): self.out_proj = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" ) + self.config = config def call(self, features, training=False): x = features[:, 0, :] # take token (equiv. 
to [CLS]) @@ -1242,6 +1387,16 @@ def call(self, features, training=False): x = self.dropout(x, training=training) x = self.out_proj(x) return x + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build(self.config.hidden_size) @add_start_docstrings( @@ -1321,6 +1476,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta_prelayernorm", None) is not None: + with tf.name_scope(self.roberta_prelayernorm.name): + self.roberta_prelayernorm.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(None) @add_start_docstrings( @@ -1344,6 +1509,7 @@ def __init__(self, config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward( @@ -1414,6 +1580,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta_prelayernorm", None) is not None: + with tf.name_scope(self.roberta_prelayernorm.name): + self.roberta_prelayernorm.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) @add_start_docstrings( @@ -1442,6 +1618,7 @@ def __init__(self, config, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1498,6 +1675,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta_prelayernorm", None) is not None: + with tf.name_scope(self.roberta_prelayernorm.name): + self.roberta_prelayernorm.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) @add_start_docstrings( @@ -1521,6 +1708,7 @@ def __init__(self, config, *inputs, **kwargs): self.qa_outputs = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1591,3 +1779,13 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta_prelayernorm", None) is not None: + with tf.name_scope(self.roberta_prelayernorm.name): + self.roberta_prelayernorm.build(None) + if getattr(self, "qa_outputs", None) is not None: + with tf.name_scope(self.qa_outputs.name): + self.qa_outputs.build(self.config.hidden_size) diff --git 
a/src/transformers/models/roformer/modeling_tf_roformer.py b/src/transformers/models/roformer/modeling_tf_roformer.py index cea286c828b4df..44ef4432c161b2 100644 --- a/src/transformers/models/roformer/modeling_tf_roformer.py +++ b/src/transformers/models/roformer/modeling_tf_roformer.py @@ -142,7 +142,7 @@ def __init__(self, config: RoFormerConfig, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): with tf.name_scope("word_embeddings"): self.weight = self.add_weight( name="weight", @@ -157,7 +157,14 @@ def build(self, input_shape: tf.TensorShape): initializer=get_initializer(self.initializer_range), ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.embedding_size]) + def call( self, @@ -218,6 +225,7 @@ def __init__(self, config: RoFormerConfig, **kwargs): ) self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.rotary_value = config.rotary_value + self.config = config def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] @@ -306,6 +314,19 @@ def apply_rotary_position_embeddings(sinusoidal_pos, query_layer, key_layer, val value_layer = value_layer * cos_pos + rotate_half_value_layer * sin_pos return query_layer, key_layer, value_layer return query_layer, key_layer + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build(self.config.hidden_size) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build(self.config.hidden_size) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build(self.config.hidden_size) # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->RoFormer @@ -318,6 +339,7 @@ def __init__(self, config: RoFormerConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -325,6 +347,16 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) class TFRoFormerAttention(tf.keras.layers.Layer): @@ -360,6 +392,16 @@ def call( outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs + def build(self, input_shape=None): + if self.built: + return + self.built 
= True + if getattr(self, "self_attention", None) is not None: + with tf.name_scope(self.self_attention.name): + self.self_attention.build(None) + if getattr(self, "dense_output", None) is not None: + with tf.name_scope(self.dense_output.name): + self.dense_output.build(None) # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->RoFormer @@ -375,12 +417,20 @@ def __init__(self, config: RoFormerConfig, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->RoFormer @@ -393,6 +443,7 @@ def __init__(self, config: RoFormerConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -400,6 +451,16 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.intermediate_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) class TFRoFormerLayer(tf.keras.layers.Layer): @@ -435,6 +496,19 @@ def call( outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "roformer_output", None) is not None: + with tf.name_scope(self.roformer_output.name): + self.roformer_output.build(None) class TFRoFormerEncoder(tf.keras.layers.Layer): @@ -490,6 +564,17 @@ def call( return TFBaseModelOutput( last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embed_positions", None) is not None: + with tf.name_scope(self.embed_positions.name): + self.embed_positions.build(None) + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) class TFRoFormerPredictionHeadTransform(tf.keras.layers.Layer): @@ -508,6 +593,7 @@ def __init__(self, config: RoFormerConfig, **kwargs): self.transform_act_fn = config.hidden_act self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + 
self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -515,6 +601,16 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.LayerNorm(inputs=hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.embedding_size]) class TFRoFormerLMPredictionHead(tf.keras.layers.Layer): @@ -530,10 +626,17 @@ def __init__(self, config: RoFormerConfig, input_embeddings: tf.keras.layers.Lay # an output-only bias for each token. self.input_embeddings = input_embeddings - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "transform", None) is not None: + with tf.name_scope(self.transform.name): + self.transform.build(None) + def get_output_embeddings(self) -> tf.keras.layers.Layer: return self.input_embeddings @@ -686,6 +789,16 @@ def call( hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) class TFRoFormerPreTrainedModel(TFPreTrainedModel): @@ -833,6 +946,13 @@ def call( ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roformer", None) is not None: + with tf.name_scope(self.roformer.name): + self.roformer.build(None) @add_start_docstrings("""RoFormer Model with a `language modeling` head on top.""", ROFORMER_START_DOCSTRING) @@ -903,6 +1023,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roformer", None) is not None: + with tf.name_scope(self.roformer.name): + self.roformer.build(None) + if getattr(self, "mlm", None) is not None: + with tf.name_scope(self.mlm.name): + self.mlm.build(None) # TODO Matt might be wrong @add_start_docstrings( @@ -976,6 +1106,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roformer", None) is not None: + with tf.name_scope(self.roformer.name): + self.roformer.build(None) + if getattr(self, "mlm", None) is not None: + with tf.name_scope(self.mlm.name): + self.mlm.build(None) # TODO Matt might be wrong class TFRoFormerClassificationHead(tf.keras.layers.Layer): @@ -996,6 +1136,7 @@ def __init__(self, config: RoFormerConfig, *inputs, **kwargs): self.classifier_act_fn = get_tf_activation(config.hidden_act) else: self.classifier_act_fn = config.hidden_act + self.config = config def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = hidden_states[:, 0, :] # take token (equiv. 
to [CLS]) @@ -1006,6 +1147,16 @@ def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.out_proj(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build(self.config.hidden_size) @add_start_docstrings( @@ -1074,6 +1225,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roformer", None) is not None: + with tf.name_scope(self.roformer.name): + self.roformer.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(None) @add_start_docstrings( @@ -1092,6 +1253,7 @@ def __init__(self, config: RoFormerConfig, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward( @@ -1166,6 +1328,19 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roformer", None) is not None: + with tf.name_scope(self.roformer.name): + self.roformer.build(None) + if getattr(self, "sequence_summary", None) is not None: + with tf.name_scope(self.sequence_summary.name): + self.sequence_summary.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) @add_start_docstrings( @@ -1186,6 +1361,7 @@ def __init__(self, config: RoFormerConfig, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(ROFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1237,6 +1413,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roformer", None) is not None: + with tf.name_scope(self.roformer.name): + self.roformer.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) @add_start_docstrings( @@ -1256,6 +1442,7 @@ def __init__(self, config: RoFormerConfig, *inputs, **kwargs): self.qa_outputs = tf.keras.layers.Dense( units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(ROFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -1321,3 +1508,13 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roformer", None) is not None: + with tf.name_scope(self.roformer.name): + self.roformer.build(None) + if getattr(self, "qa_outputs", None) is not None: + with tf.name_scope(self.qa_outputs.name): + 
self.qa_outputs.build(self.config.hidden_size) diff --git a/src/transformers/models/sam/modeling_tf_sam.py b/src/transformers/models/sam/modeling_tf_sam.py index a0a48b5aa7cdc7..21e6391fd568ea 100644 --- a/src/transformers/models/sam/modeling_tf_sam.py +++ b/src/transformers/models/sam/modeling_tf_sam.py @@ -149,6 +149,13 @@ def call(self, pixel_values): ) embeddings = self.projection(tf.transpose(pixel_values, perm=[0, 2, 3, 1])) return embeddings + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "projection", None) is not None: + with tf.name_scope(self.projection.name): + self.projection.build(self.num_channels) class TFSamMLPBlock(tf.keras.layers.Layer): @@ -157,12 +164,23 @@ def __init__(self, config, **kwargs): self.lin1 = tf.keras.layers.Dense(config.mlp_dim, name="lin1") self.lin2 = tf.keras.layers.Dense(config.hidden_size, name="lin2") self.act = ACT2FN[config.hidden_act] + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.lin1(hidden_states) hidden_states = self.act(hidden_states) hidden_states = self.lin2(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "lin1", None) is not None: + with tf.name_scope(self.lin1.name): + self.lin1.build(self.config.hidden_size) + if getattr(self, "lin2", None) is not None: + with tf.name_scope(self.lin2.name): + self.lin2.build(self.config.mlp_dim) class TFSamLayerNorm(tf.keras.layers.Layer): @@ -256,6 +274,22 @@ def call(self, query: tf.Tensor, key: tf.Tensor, value: tf.Tensor) -> tf.Tensor: out = self.out_proj(out) return out + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "q_proj", None) is not None: + with tf.name_scope(self.q_proj.name): + self.q_proj.build(self.hidden_size) + if getattr(self, "k_proj", None) is not None: + with tf.name_scope(self.k_proj.name): + self.k_proj.build(self.hidden_size) + if getattr(self, "v_proj", None) is not None: + with tf.name_scope(self.v_proj.name): + self.v_proj.build(self.hidden_size) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build(self.internal_dim) class TFSamTwoWayAttentionBlock(tf.keras.layers.Layer): @@ -344,6 +378,34 @@ def call( outputs = outputs + (None,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "layer_norm1", None) is not None: + with tf.name_scope(self.layer_norm1.name): + self.layer_norm1.build([None, None, self.hidden_size]) + if getattr(self, "cross_attn_token_to_image", None) is not None: + with tf.name_scope(self.cross_attn_token_to_image.name): + self.cross_attn_token_to_image.build(None) + if getattr(self, "layer_norm2", None) is not None: + with tf.name_scope(self.layer_norm2.name): + self.layer_norm2.build([None, None, self.hidden_size]) + if getattr(self, "mlp", None) is not None: + with tf.name_scope(self.mlp.name): + self.mlp.build(None) + if getattr(self, "layer_norm3", None) is not None: + with tf.name_scope(self.layer_norm3.name): + self.layer_norm3.build([None, None, self.hidden_size]) + if getattr(self, "layer_norm4", None) is not None: + with tf.name_scope(self.layer_norm4.name): + self.layer_norm4.build([None, None, self.hidden_size]) + if getattr(self, 
"cross_attn_image_to_token", None) is not None: + with tf.name_scope(self.cross_attn_image_to_token.name): + self.cross_attn_image_to_token.build(None) class TFSamTwoWayTransformer(tf.keras.layers.Layer): @@ -411,6 +473,16 @@ def call( queries = queries + attn_out queries = self.layer_norm_final_attn(queries) return queries, keys, all_attentions + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "final_attn_token_to_image", None) is not None: + with tf.name_scope(self.final_attn_token_to_image.name): + self.final_attn_token_to_image.build(None) + if getattr(self, "layer_norm_final_attn", None) is not None: + with tf.name_scope(self.layer_norm_final_attn.name): + self.layer_norm_final_attn.build([None, None, self.config.hidden_size]) class TFSamFeedForward(tf.keras.layers.Layer): @@ -427,6 +499,8 @@ def __init__( for i in range(num_layers - 2) ] self.sigmoid_output = sigmoid_output + self.hidden_dim = hidden_dim + self.input_dim = input_dim def call(self, hidden_states): hidden_states = self.proj_in(hidden_states) @@ -438,6 +512,20 @@ def call(self, hidden_states): if self.sigmoid_output: hidden_states = tf.sigmoid(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "proj_in", None) is not None: + with tf.name_scope(self.proj_in.name): + self.proj_in.build(self.input_dim) + if getattr(self, "proj_out", None) is not None: + with tf.name_scope(self.proj_out.name): + self.proj_out.build(self.hidden_dim) + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) class TFSamMaskDecoder(tf.keras.layers.Layer): @@ -483,12 +571,31 @@ def __init__(self, config: SamMaskDecoderConfig, **kwargs): name="iou_prediction_head", ) - def build(self, input_shape): + def build(self, input_shape=None): self.iou_token = self.add_weight(shape=(1, self.hidden_size), name="iou_token.weight", trainable=True) self.mask_tokens = self.add_weight( shape=(self.num_mask_tokens, self.hidden_size), name="mask_tokens.weight", trainable=True ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + if getattr(self, "upscale_conv1", None) is not None: + with tf.name_scope(self.upscale_conv1.name): + self.upscale_conv1.build(self.hidden_size) + if getattr(self, "upscale_conv2", None) is not None: + with tf.name_scope(self.upscale_conv2.name): + self.upscale_conv2.build(self.hidden_size // 4) + if getattr(self, "upscale_layer_norm", None) is not None: + with tf.name_scope(self.upscale_layer_norm.name): + self.upscale_layer_norm.build(None) + if getattr(self, "iou_prediction_head", None) is not None: + with tf.name_scope(self.iou_prediction_head.name): + self.iou_prediction_head.build(None) + def call( self, @@ -615,6 +722,7 @@ def __init__(self, config: SamPromptEncoderConfig, **kwargs): self.conv3 = tf.keras.layers.Conv2D(config.hidden_size, kernel_size=1, name="conv3") self.layer_norm1 = TFSamLayerNorm(self.mask_input_channels, config.layer_norm_eps, name="layer_norm1") self.layer_norm2 = TFSamLayerNorm(self.mask_input_channels * 4, config.layer_norm_eps, name="layer_norm2") + self.config = config def call(self, masks): masks = tf.transpose(masks, perm=(0, 2, 3, 1)) # Convert to channels-last @@ -629,7 +737,7 @@ def call(self, masks): dense_embeddings = 
tf.transpose(dense_embeddings, perm=(0, 3, 1, 2)) # Convert back to channels-first return dense_embeddings - def build(self, input_shape): + def build(self, input_shape=None): # This class needs an explicit build method because it isn't called with the standard dummy inputs conv1_shape = [None, None, None, 1] conv2_shape = [None, None, None, self.mask_input_channels] @@ -646,7 +754,26 @@ def build(self, input_shape): self.layer_norm1.build(layer_norm1_shape) with tf.name_scope("layer_norm2"): self.layer_norm2.build(layer_norm2_shape) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "conv1", None) is not None: + with tf.name_scope(self.conv1.name): + self.conv1.build(1) + if getattr(self, "conv2", None) is not None: + with tf.name_scope(self.conv2.name): + self.conv2.build(self.mask_input_channels) + if getattr(self, "conv3", None) is not None: + with tf.name_scope(self.conv3.name): + self.conv3.build(self.config.mask_input_channels) + if getattr(self, "layer_norm1", None) is not None: + with tf.name_scope(self.layer_norm1.name): + self.layer_norm1.build(None) + if getattr(self, "layer_norm2", None) is not None: + with tf.name_scope(self.layer_norm2.name): + self.layer_norm2.build(None) + class TFSamPromptEncoder(tf.keras.layers.Layer): @@ -664,7 +791,7 @@ def __init__(self, config: SamPromptEncoderConfig, shared_patch_embedding, **kwa self.not_a_point_embed = None self.config = config - def build(self, input_shape): + def build(self, input_shape=None): self.no_mask_embed = self.add_weight( name="no_mask_embed.weight", shape=(1, self.hidden_size), @@ -691,7 +818,14 @@ def build(self, input_shape): self.mask_embed.build( (None, self.config.mask_input_channels, self.config.image_size, self.config.image_size) ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "mask_embed", None) is not None: + with tf.name_scope(self.mask_embed.name): + self.mask_embed.build(None) + def _embed_points(self, points: tf.Tensor, labels: tf.Tensor, pad: bool) -> tf.Tensor: """Embeds point prompts.""" @@ -812,7 +946,7 @@ def __init__(self, config, window_size, **kwargs): raise ValueError("Input size must be provided if using relative positional encoding.") self.config = config - def build(self, input_shape): + def build(self, input_shape=None): if self.input_size is not None: # initialize relative positional embeddings self.rel_pos_h = self.add_weight( @@ -821,7 +955,17 @@ def build(self, input_shape): self.rel_pos_w = self.add_weight( shape=(2 * self.input_size[1] - 1, self.head_dim), initializer="zeros", name="rel_pos_w" ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "qkv", None) is not None: + with tf.name_scope(self.qkv.name): + self.qkv.build(self.config.hidden_size) + if getattr(self, "proj", None) is not None: + with tf.name_scope(self.proj.name): + self.proj.build(self.config.hidden_size) + def get_rel_pos(self, q_size: int, k_size: int, rel_pos: tf.Tensor) -> tf.Tensor: """ @@ -949,6 +1093,7 @@ def __init__(self, config, window_size, **kwargs): self.layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2") self.mlp = TFSamMLPBlock(config, name="mlp") self.window_size = window_size + self.config = config def window_partition(self, hidden_states: tf.Tensor, window_size: int) -> Tuple[tf.Tensor, Tuple[int, int]]: batch_size, height, width, channel = shape_list(hidden_states) @@ -1015,6 +1160,22 @@ def call( outputs += 
(attn_weights,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer_norm1", None) is not None: + with tf.name_scope(self.layer_norm1.name): + self.layer_norm1.build([None, None, self.config.hidden_size]) + if getattr(self, "attn", None) is not None: + with tf.name_scope(self.attn.name): + self.attn.build(None) + if getattr(self, "layer_norm2", None) is not None: + with tf.name_scope(self.layer_norm2.name): + self.layer_norm2.build([None, None, self.config.hidden_size]) + if getattr(self, "mlp", None) is not None: + with tf.name_scope(self.mlp.name): + self.mlp.build(None) class TFSamVisionNeck(tf.keras.layers.Layer): @@ -1046,6 +1207,22 @@ def call(self, hidden_states): hidden_states = self.layer_norm2(hidden_states) hidden_states = tf.transpose(hidden_states, perm=[0, 3, 1, 2]) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "conv1", None) is not None: + with tf.name_scope(self.conv1.name): + self.conv1.build(self.config.hidden_size) + if getattr(self, "layer_norm1", None) is not None: + with tf.name_scope(self.layer_norm1.name): + self.layer_norm1.build(None) + if getattr(self, "conv2", None) is not None: + with tf.name_scope(self.conv2.name): + self.conv2.build(self.config.output_channels) + if getattr(self, "layer_norm2", None) is not None: + with tf.name_scope(self.layer_norm2.name): + self.layer_norm2.build(None) class TFSamVisionEncoder(tf.keras.layers.Layer): @@ -1069,7 +1246,7 @@ def __init__(self, config: SamVisionConfig, **kwargs): self.neck = TFSamVisionNeck(config, name="neck") - def build(self, input_shape): + def build(self, input_shape=None): if self.config.use_abs_pos: # Initialize absolute positional embedding with pretrain image size. 
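For the SAM vision encoder below, build() does two jobs: it registers the layer's own weights with add_weight and then builds its children. A rough, self-contained sketch of that shape of build(), with made-up names and sizes rather than SAM's real configuration:

import tensorflow as tf

class DemoVisionEncoder(tf.keras.layers.Layer):
    def __init__(self, hidden_size, image_size, patch_size, **kwargs):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.grid = image_size // patch_size
        self.patch_embed = tf.keras.layers.Conv2D(
            hidden_size, patch_size, strides=patch_size, name="patch_embed"
        )

    def call(self, pixel_values):  # expects channels-last RGB input
        return self.patch_embed(pixel_values) + self.pos_embed

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        # own weight: a learned absolute position embedding
        self.pos_embed = self.add_weight(
            shape=(1, self.grid, self.grid, self.hidden_size),
            initializer="zeros",
            trainable=True,
            name="pos_embed",
        )
        # child weight: a Conv2D only needs its input channel count (3 for RGB)
        with tf.name_scope(self.patch_embed.name):
            self.patch_embed.build([None, None, None, 3])

encoder = DemoVisionEncoder(hidden_size=256, image_size=224, patch_size=16, name="vision_encoder")
encoder.build(None)
print(len(encoder.weights))  # 3: pos_embed + conv kernel + conv bias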
self.pos_embed = self.add_weight( @@ -1083,7 +1260,17 @@ def build(self, input_shape): trainable=True, name="pos_embed", ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "patch_embed", None) is not None: + with tf.name_scope(self.patch_embed.name): + self.patch_embed.build(None) + if getattr(self, "neck", None) is not None: + with tf.name_scope(self.neck.name): + self.neck.build(None) + def get_input_embeddings(self): return self.patch_embed @@ -1463,3 +1650,19 @@ def serving_output(self, output: TFSamImageSegmentationOutput) -> TFSamImageSegm vision_attentions=attns if self.config.output_attentions else None, mask_decoder_attentions=output.mask_decoder_attentions if self.config.output_attentions else None, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "shared_image_embedding", None) is not None: + with tf.name_scope(self.shared_image_embedding.name): + self.shared_image_embedding.build(None) + if getattr(self, "vision_encoder", None) is not None: + with tf.name_scope(self.vision_encoder.name): + self.vision_encoder.build(None) + if getattr(self, "prompt_encoder", None) is not None: + with tf.name_scope(self.prompt_encoder.name): + self.prompt_encoder.build(None) + if getattr(self, "mask_decoder", None) is not None: + with tf.name_scope(self.mask_decoder.name): + self.mask_decoder.build(None) diff --git a/src/transformers/models/segformer/modeling_tf_segformer.py b/src/transformers/models/segformer/modeling_tf_segformer.py index 86eb3249382996..5615cc56564ed3 100644 --- a/src/transformers/models/segformer/modeling_tf_segformer.py +++ b/src/transformers/models/segformer/modeling_tf_segformer.py @@ -88,6 +88,7 @@ def __init__(self, patch_size, stride, num_channels, hidden_size, **kwargs): self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm") self.num_channels = num_channels + self.hidden_size = hidden_size def call(self, pixel_values: tf.Tensor) -> Tuple[tf.Tensor, int, int]: embeddings = self.proj(self.padding(pixel_values)) @@ -99,6 +100,17 @@ def call(self, pixel_values: tf.Tensor) -> Tuple[tf.Tensor, int, int]: embeddings = tf.reshape(embeddings, (-1, height * width, hidden_dim)) embeddings = self.layer_norm(embeddings) return embeddings, height, width + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "proj", None) is not None: + with tf.name_scope(self.proj.name): + self.proj.build( + self.num_channels) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.hidden_size]) class TFSegformerEfficientSelfAttention(tf.keras.layers.Layer): @@ -196,6 +208,19 @@ def call( outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build(self.hidden_size) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build(self.hidden_size) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build(self.hidden_size) class TFSegformerSelfOutput(tf.keras.layers.Layer): @@ -203,11 +228,19 @@ def __init__(self, config: SegformerConfig, hidden_size: int, **kwargs): super().__init__(**kwargs) self.dense = 
tf.keras.layers.Dense(hidden_size, name="dense") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.hidden_size = hidden_size def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states, training=training) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.hidden_size) class TFSegformerAttention(tf.keras.layers.Layer): @@ -237,6 +270,16 @@ def call( attention_output = self.dense_output(self_outputs[0]) outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self", None) is not None: + with tf.name_scope(self.self.name): + self.self.build(None) + if getattr(self, "dense_output", None) is not None: + with tf.name_scope(self.dense_output.name): + self.dense_output.build(None) class TFSegformerDWConv(tf.keras.layers.Layer): @@ -245,6 +288,7 @@ def __init__(self, dim: int = 768, **kwargs): self.depthwise_convolution = tf.keras.layers.Conv2D( filters=dim, kernel_size=3, strides=1, padding="same", groups=dim, name="dwconv" ) + self.dim = dim def call(self, hidden_states: tf.Tensor, height: int, width: int) -> tf.Tensor: batch_size = shape_list(hidden_states)[0] @@ -257,6 +301,13 @@ def call(self, hidden_states: tf.Tensor, height: int, width: int) -> tf.Tensor: num_channels = shape_list(hidden_states)[3] hidden_states = tf.reshape(hidden_states, (batch_size, new_height * new_width, num_channels)) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "depthwise_convolution", None) is not None: + with tf.name_scope(self.depthwise_convolution.name): + self.depthwise_convolution.build(self.dim) class TFSegformerMixFFN(tf.keras.layers.Layer): @@ -278,6 +329,8 @@ def __init__( self.intermediate_act_fn = config.hidden_act self.dense2 = tf.keras.layers.Dense(out_features, name="dense2") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.hidden_features = hidden_features + self.in_features = in_features def call(self, hidden_states: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor: hidden_states = self.dense1(hidden_states) @@ -287,6 +340,19 @@ def call(self, hidden_states: tf.Tensor, height: int, width: int, training: bool hidden_states = self.dense2(hidden_states) hidden_states = self.dropout(hidden_states, training=training) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense1", None) is not None: + with tf.name_scope(self.dense1.name): + self.dense1.build(self.in_features) + if getattr(self, "depthwise_convolution", None) is not None: + with tf.name_scope(self.depthwise_convolution.name): + self.depthwise_convolution.build(None) + if getattr(self, "dense2", None) is not None: + with tf.name_scope(self.dense2.name): + self.dense2.build(self.hidden_features) class TFSegformerLayer(tf.keras.layers.Layer): @@ -315,6 +381,7 @@ def __init__( self.layer_norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm_2") mlp_hidden_size = int(hidden_size * mlp_ratio) self.mlp = TFSegformerMixFFN(config, in_features=hidden_size, hidden_features=mlp_hidden_size, name="mlp") + 
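The Segformer hunks only ever stash scalar widths (hidden_size, in_features, dim) in __init__ because that is all a Keras Dense needs at build time: its kernel is derived from input_shape[-1], and build() accepts either a full shape or just that trailing dimension. A quick standalone check, plain Keras with no transformers code:

import tensorflow as tf

dense = tf.keras.layers.Dense(32, name="dense")
dense.build([None, None, 64])   # full (batch, seq, features) shape ...
print(dense.kernel.shape)       # (64, 32)

dense2 = tf.keras.layers.Dense(32, name="dense2")
dense2.build(64)                # ... or just the trailing feature width
print(dense2.kernel.shape)      # (64, 32)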
self.hidden_size = hidden_size def call( self, @@ -347,6 +414,22 @@ def call( outputs = (layer_output,) + outputs return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer_norm_1", None) is not None: + with tf.name_scope(self.layer_norm_1.name): + self.layer_norm_1.build([None, None, self.hidden_size]) + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "layer_norm_2", None) is not None: + with tf.name_scope(self.layer_norm_2.name): + self.layer_norm_2.build([None, None, self.hidden_size]) + if getattr(self, "mlp", None) is not None: + with tf.name_scope(self.mlp.name): + self.mlp.build(None) class TFSegformerEncoder(tf.keras.layers.Layer): @@ -450,6 +533,14 @@ def call( return TFBaseModelOutput( last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attentions ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer_norms", None) is not None: + for layer in self.layer_norms: + with tf.name_scope(layer.name): + layer.build(None) @keras_serializable @@ -510,6 +601,13 @@ def call( hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) class TFSegformerPreTrainedModel(TFPreTrainedModel): @@ -606,6 +704,13 @@ def call( training=training, ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "segformer", None) is not None: + with tf.name_scope(self.segformer.name): + self.segformer.build(None) @add_start_docstrings( @@ -624,6 +729,7 @@ def __init__(self, config: SegformerConfig, *inputs, **kwargs): # Classifier head self.classifier = tf.keras.layers.Dense(config.num_labels, name="classifier") + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @@ -669,6 +775,16 @@ def call( return TFSequenceClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "segformer", None) is not None: + with tf.name_scope(self.segformer.name): + self.segformer.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_sizes[-1]) class TFSegformerMLP(tf.keras.layers.Layer): @@ -688,6 +804,13 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = tf.reshape(hidden_states, (-1, height * width, hidden_dim)) hidden_states = self.proj(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "proj", None) is not None: + with tf.name_scope(self.proj.name): + self.proj.build(self.input_dim) class TFSegformerDecodeHead(TFSegformerPreTrainedModel): @@ -743,6 +866,19 @@ def call(self, encoder_hidden_states: tf.Tensor, training: bool = False) -> tf.T logits = self.classifier(hidden_states) return logits + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "linear_fuse", None) is 
not None: + with tf.name_scope(self.linear_fuse.name): + self.linear_fuse.build(self.config.decoder_hidden_size * self.config.num_encoder_blocks) + if getattr(self, "batch_norm", None) is not None: + with tf.name_scope(self.batch_norm.name): + self.batch_norm.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.decoder_hidden_size) @add_start_docstrings( @@ -854,3 +990,13 @@ def call( hidden_states=outputs.hidden_states if output_hidden_states else None, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "segformer", None) is not None: + with tf.name_scope(self.segformer.name): + self.segformer.build(None) + if getattr(self, "decode_head", None) is not None: + with tf.name_scope(self.decode_head.name): + self.decode_head.build(None) diff --git a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py index 026d2241b461ea..1b65c380be3c0d 100755 --- a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py @@ -165,6 +165,14 @@ def call(self, input_features: tf.Tensor) -> tf.Tensor: hidden_states = conv(hidden_states) hidden_states = glu(hidden_states, axis=2) # GLU over the Channel dimension return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "conv_layers", None) is not None: + for layer in self.conv_layers: + with tf.name_scope(layer.name): + layer.build(None) class TFSpeech2TextSinusoidalPositionalEmbedding(tf.keras.layers.Layer): @@ -378,6 +386,22 @@ def call( attn_weights: tf.Tensor = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) return attn_output, attn_weights, past_key_value + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "k_proj", None) is not None: + with tf.name_scope(self.k_proj.name): + self.k_proj.build(self.embed_dim) + if getattr(self, "q_proj", None) is not None: + with tf.name_scope(self.q_proj.name): + self.q_proj.build(self.embed_dim) + if getattr(self, "v_proj", None) is not None: + with tf.name_scope(self.v_proj.name): + self.v_proj.build(self.embed_dim) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build(self.embed_dim) class TFSpeech2TextEncoderLayer(tf.keras.layers.Layer): @@ -394,6 +418,7 @@ def __init__(self, config: Speech2TextConfig, **kwargs): self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.config = config def call( self, hidden_states: tf.Tensor, attention_mask: tf.Tensor, layer_head_mask: tf.Tensor, training: bool = False @@ -433,6 +458,25 @@ def call( hidden_states = residual + hidden_states return hidden_states, self_attn_weights + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "self_attn_layer_norm", None) is not None: + with tf.name_scope(self.self_attn_layer_norm.name): + self.self_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, 
"fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build(self.embed_dim) + if getattr(self, "fc2", None) is not None: + with tf.name_scope(self.fc2.name): + self.fc2.build(self.config.encoder_ffn_dim) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.embed_dim]) class TFSpeech2TextDecoderLayer(tf.keras.layers.Layer): @@ -463,6 +507,7 @@ def __init__(self, config: Speech2TextConfig, **kwargs): self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.config = config def call( self, @@ -545,6 +590,31 @@ def call( cross_attn_weights, present_key_value, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "self_attn_layer_norm", None) is not None: + with tf.name_scope(self.self_attn_layer_norm.name): + self.self_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "encoder_attn", None) is not None: + with tf.name_scope(self.encoder_attn.name): + self.encoder_attn.build(None) + if getattr(self, "encoder_attn_layer_norm", None) is not None: + with tf.name_scope(self.encoder_attn_layer_norm.name): + self.encoder_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build(self.embed_dim) + if getattr(self, "fc2", None) is not None: + with tf.name_scope(self.fc2.name): + self.fc2.build(self.config.decoder_ffn_dim) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.embed_dim]) class TFSpeech2TextPreTrainedModel(TFPreTrainedModel): @@ -869,6 +939,23 @@ def call( return TFBaseModelOutput( last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "conv", None) is not None: + with tf.name_scope(self.conv.name): + self.conv.build(None) + if getattr(self, "embed_positions", None) is not None: + with tf.name_scope(self.embed_positions.name): + self.embed_positions.build(None) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.d_model]) + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) @keras_serializable @@ -1091,6 +1178,23 @@ def call( attentions=all_self_attns, cross_attentions=all_cross_attns, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embed_tokens", None) is not None: + with tf.name_scope(self.embed_tokens.name): + self.embed_tokens.build(None) + if getattr(self, "embed_positions", None) is not None: + with tf.name_scope(self.embed_positions.name): + self.embed_positions.build(None) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.d_model]) + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + 
layer.build(None) @keras_serializable @@ -1196,6 +1300,16 @@ def call( encoder_hidden_states=encoder_outputs.hidden_states, encoder_attentions=encoder_outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "decoder", None) is not None: + with tf.name_scope(self.decoder.name): + self.decoder.build(None) @add_start_docstrings( @@ -1278,6 +1392,13 @@ def serving_output(self, output): encoder_hidden_states=enc_hs, encoder_attentions=enc_attns, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "model", None) is not None: + with tf.name_scope(self.model.name): + self.model.build(None) @add_start_docstrings( @@ -1291,6 +1412,7 @@ def __init__(self, config: Speech2TextConfig): self.lm_head = tf.keras.layers.Dense(self.config.vocab_size, use_bias=False, name="lm_head") # TODO (Joao): investigate why Speech2Text has numerical issues in XLA generate self.supports_xla_generation = False + self.config = config def get_encoder(self): return self.model.encoder @@ -1460,3 +1582,13 @@ def prepare_inputs_for_generation( "cross_attn_head_mask": cross_attn_head_mask, "use_cache": use_cache, # change this to avoid caching (presumably for debugging) } + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "model", None) is not None: + with tf.name_scope(self.model.name): + self.model.build(None) + if getattr(self, "lm_head", None) is not None: + with tf.name_scope(self.lm_head.name): + self.lm_head.build(self.config.d_model) diff --git a/src/transformers/models/swin/modeling_tf_swin.py b/src/transformers/models/swin/modeling_tf_swin.py index 5d53561442457f..e300ca6ce01baf 100644 --- a/src/transformers/models/swin/modeling_tf_swin.py +++ b/src/transformers/models/swin/modeling_tf_swin.py @@ -283,6 +283,7 @@ def __init__(self, config: SwinConfig, use_mask_token: bool = False, **kwargs) - self.norm = tf.keras.layers.LayerNormalization(name="norm", epsilon=1e-5) self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob, name="dropout") + self.config = config def build(self, input_shape: tf.TensorShape) -> None: if self.use_mask_token: @@ -296,7 +297,20 @@ def build(self, input_shape: tf.TensorShape) -> None: ) else: self.position_embeddings = None - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "patch_embeddings", None) is not None: + with tf.name_scope(self.patch_embeddings.name): + self.patch_embeddings.build(None) + if getattr(self, "norm", None) is not None: + with tf.name_scope(self.norm.name): + self.norm.build([None, None, self.config.embed_dim]) + if getattr(self, "dropout", None) is not None: + with tf.name_scope(self.dropout.name): + self.dropout.build(None) + def call( self, pixel_values: tf.Tensor, bool_masked_pos: bool = None, training: bool = False @@ -380,6 +394,13 @@ def call(self, pixel_values: tf.Tensor, training: bool = False) -> Tuple[tf.Tens embeddings = tf.reshape(embeddings, (batch_size, channels, -1)) embeddings = tf.transpose(embeddings, (0, 2, 1)) return embeddings, output_dimensions + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "projection", None) is not None: + with tf.name_scope(self.projection.name): + self.projection.build(self.num_channels) class TFSwinPatchMerging(tf.keras.layers.Layer): @@ 
-442,6 +463,13 @@ def call(self, input_feature: tf.Tensor, input_dimensions: Tuple[int, int], trai input_feature = self.reduction(input_feature, training=training) return input_feature + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "reduction", None) is not None: + with tf.name_scope(self.reduction.name): + self.reduction.build(4 * self.dim) class TFSwinDropPath(tf.keras.layers.Layer): @@ -521,7 +549,20 @@ def build(self, input_shape: tf.TensorShape) -> None: relative_coords = tf.stack([stack_0, stack_1], axis=2) self.relative_position_index.assign(tf.cast(tf.reduce_sum(relative_coords, axis=-1), tf.int32)) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build(self.all_head_size) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build(self.all_head_size) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build(self.all_head_size) + def transpose_for_scores(self, x: tf.Tensor) -> tf.Tensor: new_x_shape = shape_list(x)[:-1] + [self.num_attention_heads, self.attention_head_size] @@ -597,11 +638,22 @@ def __init__(self, config: SwinConfig, dim: int, **kwargs) -> None: super().__init__(**kwargs) self.dense = tf.keras.layers.Dense(dim, name="dense") self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob, name="dropout") + self.dim = dim def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states, training=training) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.dim) + if getattr(self, "dropout", None) is not None: + with tf.name_scope(self.dropout.name): + self.dropout.build(None) class TFSwinAttention(tf.keras.layers.Layer): @@ -630,6 +682,16 @@ def call( attention_output = self.self_output(self_outputs[0], hidden_states, training=training) outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self", None) is not None: + with tf.name_scope(self.self.name): + self.self.build(None) + if getattr(self, "self_output", None) is not None: + with tf.name_scope(self.self_output.name): + self.self_output.build(None) class TFSwinIntermediate(tf.keras.layers.Layer): @@ -640,11 +702,19 @@ def __init__(self, config: SwinConfig, dim: int, **kwargs) -> None: self.intermediate_act_fn = ACT2FN[config.hidden_act] else: self.intermediate_act_fn = config.hidden_act + self.dim = dim def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.dim) class TFSwinOutput(tf.keras.layers.Layer): @@ -652,11 +722,20 @@ def __init__(self, config: SwinConfig, dim: int, **kwargs) -> None: super().__init__(**kwargs) self.dense = tf.keras.layers.Dense(dim, name="dense") self.dropout = 
tf.keras.layers.Dropout(config.hidden_dropout_prob, "dropout") + self.config = config + self.dim = dim def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states, training=training) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(int(self.config.mlp_ratio * self.dim)) class TFSwinLayer(tf.keras.layers.Layer): @@ -684,6 +763,7 @@ def __init__( ) self.intermediate = TFSwinIntermediate(config, dim, name="intermediate") self.swin_output = TFSwinOutput(config, dim, name="output") + self.dim = dim def get_attn_mask(self, height: int, width: int, window_size: int, shift_size: int) -> tf.Tensor | None: img_mask = tf.zeros((height, width)) @@ -788,6 +868,28 @@ def call( layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,) return layer_outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layernorm_before", None) is not None: + with tf.name_scope(self.layernorm_before.name): + self.layernorm_before.build([None, None, self.dim]) + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "drop_path", None) is not None: + with tf.name_scope(self.drop_path.name): + self.drop_path.build(None) + if getattr(self, "layernorm_after", None) is not None: + with tf.name_scope(self.layernorm_after.name): + self.layernorm_after.build([None, None, self.dim]) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "swin_output", None) is not None: + with tf.name_scope(self.swin_output.name): + self.swin_output.build(None) class TFSwinStage(tf.keras.layers.Layer): @@ -860,6 +962,14 @@ def call( if output_attentions: stage_outputs += layer_outputs[1:] return stage_outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "blocks", None) is not None: + for layer in self.blocks: + with tf.name_scope(layer.name): + layer.build(None) class TFSwinEncoder(tf.keras.layers.Layer): @@ -940,6 +1050,14 @@ def call( attentions=all_self_attentions, reshaped_hidden_states=all_reshaped_hidden_states, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) class TFSwinPreTrainedModel(TFPreTrainedModel): @@ -1159,6 +1277,19 @@ def call( attentions=encoder_outputs.attentions, reshaped_hidden_states=encoder_outputs.reshaped_hidden_states, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "layernorm", None) is not None: + with tf.name_scope(self.layernorm.name): + self.layernorm.build([None, None, self.num_features]) @add_start_docstrings( @@ -1216,6 +1347,13 @@ def call( ) return swin_outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "swin", 
None) is not None: + with tf.name_scope(self.swin.name): + self.swin.build(None) class TFSwinPixelShuffle(tf.keras.layers.Layer): @@ -1371,6 +1509,16 @@ def call( attentions=outputs.attentions, reshaped_hidden_states=outputs.reshaped_hidden_states, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "swin", None) is not None: + with tf.name_scope(self.swin.name): + self.swin.build(None) + if getattr(self, "decoder", None) is not None: + with tf.name_scope(self.decoder.name): + self.decoder.build(None) @add_start_docstrings( @@ -1446,3 +1594,13 @@ def call( attentions=outputs.attentions, reshaped_hidden_states=outputs.reshaped_hidden_states, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "swin", None) is not None: + with tf.name_scope(self.swin.name): + self.swin.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(None) diff --git a/src/transformers/models/t5/modeling_tf_t5.py b/src/transformers/models/t5/modeling_tf_t5.py index 26eb7b9b6a2929..fbb56a34cc30ed 100644 --- a/src/transformers/models/t5/modeling_tf_t5.py +++ b/src/transformers/models/t5/modeling_tf_t5.py @@ -111,6 +111,7 @@ def __init__(self, config, **kwargs): ) # Update init weights as in flax self.dropout = tf.keras.layers.Dropout(config.dropout_rate) self.act = get_tf_activation(config.dense_act_fn) + self.config = config def call(self, hidden_states, training=False): hidden_states = self.wi(hidden_states) @@ -118,6 +119,16 @@ def call(self, hidden_states, training=False): hidden_states = self.dropout(hidden_states, training=training) hidden_states = self.wo(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "wi", None) is not None: + with tf.name_scope(self.wi.name): + self.wi.build(self.config.d_model) + if getattr(self, "wo", None) is not None: + with tf.name_scope(self.wo.name): + self.wo.build(self.config.d_ff) class TFT5DenseGatedActDense(tf.keras.layers.Layer): @@ -140,6 +151,7 @@ def __init__(self, config, **kwargs): ) # Update init weights as in flax self.dropout = tf.keras.layers.Dropout(config.dropout_rate) self.act = get_tf_activation(config.dense_act_fn) + self.config = config def call(self, hidden_states, training=False): hidden_gelu = self.act(self.wi_0(hidden_states)) @@ -148,6 +160,19 @@ def call(self, hidden_states, training=False): hidden_states = self.dropout(hidden_states, training=training) hidden_states = self.wo(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "wi_0", None) is not None: + with tf.name_scope(self.wi_0.name): + self.wi_0.build(self.config.d_model) + if getattr(self, "wi_1", None) is not None: + with tf.name_scope(self.wi_1.name): + self.wi_1.build(self.config.d_model) + if getattr(self, "wo", None) is not None: + with tf.name_scope(self.wo.name): + self.wo.build(self.config.d_ff) class TFT5LayerFF(tf.keras.layers.Layer): @@ -166,6 +191,13 @@ def call(self, hidden_states, training=False): dense_output = self.DenseReluDense(normed_hidden_states, training=training) hidden_states = hidden_states + self.dropout(dense_output, training=training) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer_norm", None) is not None: + with 
tf.name_scope(self.layer_norm.name): + self.layer_norm.build(None) class TFT5Attention(tf.keras.layers.Layer): @@ -219,7 +251,7 @@ def __init__(self, config, has_relative_attention_bias=False, **kwargs): self.pruned_heads = set() - def build(self, input_shape): + def build(self, input_shape=None): if self.has_relative_attention_bias: with tf.name_scope("relative_attention_bias"): self.relative_attention_bias = self.add_weight( @@ -228,7 +260,23 @@ def build(self, input_shape): initializer=self.relative_attention_bias_initializer, # Add initializer ) - return super().build(input_shape) + return + if self.built: + return + self.built = True + if getattr(self, "q", None) is not None: + with tf.name_scope(self.q.name): + self.q.build(self.d_model) + if getattr(self, "k", None) is not None: + with tf.name_scope(self.k.name): + self.k.build(self.d_model) + if getattr(self, "v", None) is not None: + with tf.name_scope(self.v.name): + self.v.build(self.d_model) + if getattr(self, "o", None) is not None: + with tf.name_scope(self.o.name): + self.o.build(self.inner_dim) + def prune_heads(self, heads): raise NotImplementedError @@ -468,6 +516,16 @@ def call( hidden_states = hidden_states + self.dropout(attention_output[0], training=training) outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "SelfAttention", None) is not None: + with tf.name_scope(self.SelfAttention.name): + self.SelfAttention.build(None) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build(None) class TFT5LayerCrossAttention(tf.keras.layers.Layer): @@ -510,6 +568,16 @@ def call( hidden_states = hidden_states + self.dropout(attention_output[0], training=training) outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "EncDecAttention", None) is not None: + with tf.name_scope(self.EncDecAttention.name): + self.EncDecAttention.build(None) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build(None) class TFT5Block(tf.keras.layers.Layer): @@ -848,6 +916,17 @@ def call( hidden_states=all_hidden_states, attentions=all_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "block", None) is not None: + for layer in self.block: + with tf.name_scope(layer.name): + layer.build(None) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build(None) # TODO Matt might be wrong #################################################### @@ -1223,6 +1302,19 @@ def call( encoder_hidden_states=encoder_outputs.hidden_states, encoder_attentions=encoder_outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "shared", None) is not None: + with tf.name_scope(self.shared.name): + self.shared.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "decoder", None) is not None: + with tf.name_scope(self.decoder.name): + self.decoder.build(None) @add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING) @@ 
-1473,6 +1565,19 @@ def prepare_inputs_for_generation( def prepare_decoder_input_ids_from_labels(self, labels: tf.Tensor): return self._shift_right(labels) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "shared", None) is not None: + with tf.name_scope(self.shared.name): + self.shared.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "decoder", None) is not None: + with tf.name_scope(self.decoder.name): + self.decoder.build(None) @add_start_docstrings( @@ -1552,3 +1657,13 @@ def call( hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "shared", None) is not None: + with tf.name_scope(self.shared.name): + self.shared.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) diff --git a/src/transformers/models/tapas/modeling_tf_tapas.py b/src/transformers/models/tapas/modeling_tf_tapas.py index a41b56e1a6caef..2a202f780a3d89 100644 --- a/src/transformers/models/tapas/modeling_tf_tapas.py +++ b/src/transformers/models/tapas/modeling_tf_tapas.py @@ -160,7 +160,7 @@ def __init__(self, config: TapasConfig, **kwargs): self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): with tf.name_scope("word_embeddings"): self.weight = self.add_weight( name="weight", @@ -186,7 +186,14 @@ def build(self, input_shape: tf.TensorShape): ), ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + def call( self, @@ -279,6 +286,7 @@ def __init__(self, config: TapasConfig, **kwargs): self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.is_decoder = config.is_decoder + self.config = config def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] @@ -367,6 +375,19 @@ def call( if self.is_decoder: outputs = outputs + (past_key_value,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build(self.config.hidden_size) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build(self.config.hidden_size) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build(self.config.hidden_size) # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Tapas @@ -379,6 +400,7 @@ def __init__(self, config: TapasConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) 
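Two calling conventions alternate throughout these build() methods: composite sublayers are built with None, since they already know their own widths, while leaf Keras layers get an explicit shape. A self-contained sketch of how the two fit together, loosely modelled on the T5 feed-forward hunks above but using illustrative DemoFFN/DemoBlock names:

import tensorflow as tf

class DemoFFN(tf.keras.layers.Layer):
    def __init__(self, d_model, d_ff, **kwargs):
        super().__init__(**kwargs)
        self.d_model, self.d_ff = d_model, d_ff
        self.wi = tf.keras.layers.Dense(d_ff, use_bias=False, name="wi")
        self.wo = tf.keras.layers.Dense(d_model, use_bias=False, name="wo")

    def call(self, hidden_states):
        return self.wo(tf.nn.relu(self.wi(hidden_states)))

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        with tf.name_scope(self.wi.name):
            self.wi.build(self.d_model)  # leaf layer: give it the input width
        with tf.name_scope(self.wo.name):
            self.wo.build(self.d_ff)

class DemoBlock(tf.keras.layers.Layer):
    def __init__(self, d_model, d_ff, **kwargs):
        super().__init__(**kwargs)
        self.d_model = d_model
        self.ffn = DemoFFN(d_model, d_ff, name="ffn")
        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layer_norm")

    def call(self, hidden_states):
        return hidden_states + self.ffn(self.layer_norm(hidden_states))

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        with tf.name_scope(self.ffn.name):
            self.ffn.build(None)  # composite child: it resolves its own shapes
        with tf.name_scope(self.layer_norm.name):
            self.layer_norm.build([None, None, self.d_model])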
@@ -386,6 +408,16 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Tapas @@ -427,6 +459,16 @@ def call( outputs = (attention_output,) + self_outputs[1:] return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attention", None) is not None: + with tf.name_scope(self.self_attention.name): + self.self_attention.build(None) + if getattr(self, "dense_output", None) is not None: + with tf.name_scope(self.dense_output.name): + self.dense_output.build(None) # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Tapas @@ -442,12 +484,20 @@ def __init__(self, config: TapasConfig, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Tapas @@ -460,6 +510,7 @@ def __init__(self, config: TapasConfig, **kwargs): ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -467,6 +518,16 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.intermediate_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Tapas @@ -554,6 +615,19 @@ def call( outputs = outputs + (present_key_value,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "bert_output", None) is not None: + with tf.name_scope(self.bert_output.name): + self.bert_output.build(None) # Copied from 
transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Tapas @@ -624,6 +698,14 @@ def call( attentions=all_attentions, cross_attentions=all_cross_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Tapas @@ -637,6 +719,7 @@ def __init__(self, config: TapasConfig, **kwargs): activation="tanh", name="dense", ) + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: # We "pool" the model by simply taking the hidden state corresponding @@ -645,6 +728,13 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: pooled_output = self.dense(inputs=first_token_tensor) return pooled_output + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) # Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->Tapas @@ -664,6 +754,7 @@ def __init__(self, config: TapasConfig, **kwargs): self.transform_act_fn = config.hidden_act self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -671,6 +762,16 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.LayerNorm(inputs=hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) # Copied from transformers.models.bert.modeling_tf_bert.TFBertLMPredictionHead with Bert->Tapas @@ -687,10 +788,17 @@ def __init__(self, config: TapasConfig, input_embeddings: tf.keras.layers.Layer, # an output-only bias for each token. 
self.input_embeddings = input_embeddings - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "transform", None) is not None: + with tf.name_scope(self.transform.name): + self.transform.build(None) + def get_output_embeddings(self) -> tf.keras.layers.Layer: return self.input_embeddings @@ -851,6 +959,19 @@ def call( hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "pooler", None) is not None: + with tf.name_scope(self.pooler.name): + self.pooler.build(None) class TFTapasPreTrainedModel(TFPreTrainedModel): @@ -1032,6 +1153,13 @@ def call( ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "tapas", None) is not None: + with tf.name_scope(self.tapas.name): + self.tapas.build(None) @add_start_docstrings("""Tapas Model with a `language modeling` head on top.""", TAPAS_START_DOCSTRING) @@ -1128,6 +1256,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "tapas", None) is not None: + with tf.name_scope(self.tapas.name): + self.tapas.build(None) + if getattr(self, "lm_head", None) is not None: + with tf.name_scope(self.lm_head.name): + self.lm_head.build(None) class TFTapasComputeTokenLogits(tf.keras.layers.Layer): @@ -1551,6 +1689,19 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "tapas", None) is not None: + with tf.name_scope(self.tapas.name): + self.tapas.build(None) + if getattr(self, "compute_token_logits", None) is not None: + with tf.name_scope(self.compute_token_logits.name): + self.compute_token_logits.build(None) # TODO Matt might be wrong + if getattr(self, "compute_column_logits", None) is not None: + with tf.name_scope(self.compute_column_logits.name): + self.compute_column_logits.build(None) # TODO Matt might be wrong @add_start_docstrings( @@ -1570,6 +1721,7 @@ def __init__(self, config: TapasConfig, *inputs, **kwargs): self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(TAPAS_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @@ -1653,6 +1805,19 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "tapas", None) is not None: + with tf.name_scope(self.tapas.name): + self.tapas.build(None) + if getattr(self, "dropout", None) is not None: + with tf.name_scope(self.dropout.name): + self.dropout.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + 
self.classifier.build(self.config.hidden_size) """ TAPAS utilities.""" diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py index 34349c8661757c..95b78f04949af9 100644 --- a/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py +++ b/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py @@ -220,12 +220,23 @@ def __init__( self.visual_projection = Dense(self.projection_dim, use_bias=False, name="visual_projection") self.text_projection = Dense(self.projection_dim, use_bias=False, name="text_projection") self.logit_scale = None + self.config = config def build(self, input_shape=None): # Build in the build() method to make sure the names are right initializer = tf.keras.initializers.Constant(self.config.logit_scale_init_value) self.logit_scale = self.add_weight(shape=(1,), initializer=initializer, name="logit_scale") - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "visual_projection", None) is not None: + with tf.name_scope(self.visual_projection.name): + self.visual_projection.build(self.vision_embed_dim) + if getattr(self, "text_projection", None) is not None: + with tf.name_scope(self.text_projection.name): + self.text_projection.build(self.text_embed_dim) + @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): diff --git a/src/transformers/models/vit/modeling_tf_vit.py b/src/transformers/models/vit/modeling_tf_vit.py index 727db8dfc6c081..3ff5263c9e1c20 100644 --- a/src/transformers/models/vit/modeling_tf_vit.py +++ b/src/transformers/models/vit/modeling_tf_vit.py @@ -66,7 +66,7 @@ def __init__(self, config: ViTConfig, **kwargs): self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.config = config - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): num_patches = self.patch_embeddings.num_patches self.cls_token = self.add_weight( shape=(1, 1, self.config.hidden_size), @@ -81,7 +81,14 @@ def build(self, input_shape: tf.TensorShape): name="position_embeddings", ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "patch_embeddings", None) is not None: + with tf.name_scope(self.patch_embeddings.name): + self.patch_embeddings.build(None) + def interpolate_pos_encoding(self, embeddings, height, width) -> tf.Tensor: """ @@ -204,6 +211,13 @@ def call( embeddings = tf.reshape(tensor=projection, shape=(batch_size, num_patches, -1)) return embeddings + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "projection", None) is not None: + with tf.name_scope(self.projection.name): + self.projection.build(self.num_channels) class TFViTSelfAttention(tf.keras.layers.Layer): @@ -231,6 +245,7 @@ def __init__(self, config: ViTConfig, **kwargs): units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" ) self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) + self.config = config def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] @@ -279,6 +294,19 @@ def call( outputs = (attention_output, attention_probs) if output_attentions else 
(attention_output,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build(self.config.hidden_size) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build(self.config.hidden_size) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build(self.config.hidden_size) class TFViTSelfOutput(tf.keras.layers.Layer): @@ -294,12 +322,20 @@ def __init__(self, config: ViTConfig, **kwargs): units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dropout(inputs=hidden_states, training=training) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) class TFViTAttention(tf.keras.layers.Layer): @@ -328,6 +364,16 @@ def call( outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attention", None) is not None: + with tf.name_scope(self.self_attention.name): + self.self_attention.build(None) + if getattr(self, "dense_output", None) is not None: + with tf.name_scope(self.dense_output.name): + self.dense_output.build(None) class TFViTIntermediate(tf.keras.layers.Layer): @@ -342,12 +388,20 @@ def __init__(self, config: ViTConfig, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) class TFViTOutput(tf.keras.layers.Layer): @@ -358,6 +412,7 @@ def __init__(self, config: ViTConfig, **kwargs): units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -365,6 +420,13 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool hidden_states = hidden_states + input_tensor return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.intermediate_size) class TFViTLayer(tf.keras.layers.Layer): @@ -383,6 +445,7 @@ def __init__(self, config: ViTConfig, **kwargs): self.layernorm_after = tf.keras.layers.LayerNormalization( epsilon=config.layer_norm_eps, name="layernorm_after" ) + self.config = config def 
call( self, @@ -415,6 +478,25 @@ def call( outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "vit_output", None) is not None: + with tf.name_scope(self.vit_output.name): + self.vit_output.build(None) + if getattr(self, "layernorm_before", None) is not None: + with tf.name_scope(self.layernorm_before.name): + self.layernorm_before.build([None, None, self.config.hidden_size]) + if getattr(self, "layernorm_after", None) is not None: + with tf.name_scope(self.layernorm_after.name): + self.layernorm_after.build([None, None, self.config.hidden_size]) class TFViTEncoder(tf.keras.layers.Layer): @@ -460,6 +542,14 @@ def call( return TFBaseModelOutput( last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) @keras_serializable @@ -538,6 +628,22 @@ def call( hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "layernorm", None) is not None: + with tf.name_scope(self.layernorm.name): + self.layernorm.build([None, None, self.config.hidden_size]) + if getattr(self, "pooler", None) is not None: + with tf.name_scope(self.pooler.name): + self.pooler.build(None) class TFViTPreTrainedModel(TFPreTrainedModel): @@ -664,6 +770,13 @@ def call( ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "vit", None) is not None: + with tf.name_scope(self.vit.name): + self.vit.build(None) class TFViTPooler(tf.keras.layers.Layer): @@ -676,6 +789,7 @@ def __init__(self, config: ViTConfig, **kwargs): activation="tanh", name="dense", ) + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: # We "pool" the model by simply taking the hidden state corresponding @@ -684,6 +798,13 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: pooled_output = self.dense(inputs=first_token_tensor) return pooled_output + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) @add_start_docstrings( @@ -714,6 +835,7 @@ def __init__(self, config: ViTConfig, *inputs, **kwargs): kernel_initializer=get_initializer(config.initializer_range), name="classifier", ) + self.config = config @unpack_inputs @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING) @@ -764,3 +886,13 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "vit", None) is not None: + 
with tf.name_scope(self.vit.name): + self.vit.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.hidden_size) diff --git a/src/transformers/models/vit_mae/modeling_tf_vit_mae.py b/src/transformers/models/vit_mae/modeling_tf_vit_mae.py index 21898bbe83bb2c..61cdb9b088fb95 100644 --- a/src/transformers/models/vit_mae/modeling_tf_vit_mae.py +++ b/src/transformers/models/vit_mae/modeling_tf_vit_mae.py @@ -213,7 +213,7 @@ def __init__(self, config: ViTMAEConfig, **kwargs): self.config = config - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): self.cls_token = self.add_weight( shape=(1, 1, self.config.hidden_size), initializer=tf.random_normal_initializer(stddev=self.config.initializer_range), @@ -233,7 +233,14 @@ def build(self, input_shape: tf.TensorShape): )[None, ...] self.position_embeddings.assign(pos_embed) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "patch_embeddings", None) is not None: + with tf.name_scope(self.patch_embeddings.name): + self.patch_embeddings.build(None) + def random_masking(self, sequence: tf.Tensor, noise: tf.Tensor | None = None): """ @@ -351,6 +358,13 @@ def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor: x = tf.reshape(tensor=projection, shape=(batch_size, num_patches, -1)) return x + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "projection", None) is not None: + with tf.name_scope(self.projection.name): + self.projection.build(self.num_channels) # Copied from transformers.models.vit.modeling_tf_vit.TFViTSelfAttention with ViT->ViTMAE @@ -379,6 +393,7 @@ def __init__(self, config: ViTMAEConfig, **kwargs): units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" ) self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) + self.config = config def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] @@ -427,6 +442,19 @@ def call( outputs = (attention_output, attention_probs) if output_attentions else (attention_output,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build(self.config.hidden_size) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build(self.config.hidden_size) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build(self.config.hidden_size) # Copied from transformers.models.vit.modeling_tf_vit.TFViTSelfOutput with ViT->ViTMAE @@ -443,12 +471,20 @@ def __init__(self, config: ViTMAEConfig, **kwargs): units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.dropout(inputs=hidden_states, training=training) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) 
is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) # Copied from transformers.models.vit.modeling_tf_vit.TFViTAttention with ViT->ViTMAE @@ -478,6 +514,16 @@ def call( outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attention", None) is not None: + with tf.name_scope(self.self_attention.name): + self.self_attention.build(None) + if getattr(self, "dense_output", None) is not None: + with tf.name_scope(self.dense_output.name): + self.dense_output.build(None) # Copied from transformers.models.vit.modeling_tf_vit.TFViTIntermediate with ViT->ViTMAE @@ -493,12 +539,20 @@ def __init__(self, config: ViTMAEConfig, **kwargs): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act + self.config = config def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.hidden_size) # Copied from transformers.models.vit.modeling_tf_vit.TFViTOutput with ViT->ViTMAE @@ -510,6 +564,7 @@ def __init__(self, config: ViTMAEConfig, **kwargs): units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.dense(inputs=hidden_states) @@ -517,6 +572,13 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool hidden_states = hidden_states + input_tensor return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build(self.config.intermediate_size) # Copied from transformers.models.vit.modeling_tf_vit.TFViTLayer with ViT->ViTMAE @@ -536,6 +598,7 @@ def __init__(self, config: ViTMAEConfig, **kwargs): self.layernorm_after = tf.keras.layers.LayerNormalization( epsilon=config.layer_norm_eps, name="layernorm_after" ) + self.config = config def call( self, @@ -568,6 +631,25 @@ def call( outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "vit_output", None) is not None: + with tf.name_scope(self.vit_output.name): + self.vit_output.build(None) + if getattr(self, "layernorm_before", None) is not None: + with tf.name_scope(self.layernorm_before.name): + self.layernorm_before.build([None, None, self.config.hidden_size]) + if getattr(self, "layernorm_after", None) is not None: + with tf.name_scope(self.layernorm_after.name): + self.layernorm_after.build([None, None, self.config.hidden_size]) # Copied from 
transformers.models.vit.modeling_tf_vit.TFViTEncoder with ViT->ViTMAE @@ -614,6 +696,14 @@ def call( return TFBaseModelOutput( last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) @keras_serializable @@ -686,6 +776,19 @@ def call( hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "layernorm", None) is not None: + with tf.name_scope(self.layernorm.name): + self.layernorm.build([None, None, self.config.hidden_size]) class TFViTMAEPreTrainedModel(TFPreTrainedModel): @@ -828,6 +931,13 @@ def call( ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "vit", None) is not None: + with tf.name_scope(self.vit.name): + self.vit.build(None) class TFViTMAEDecoder(tf.keras.layers.Layer): @@ -853,7 +963,7 @@ def __init__(self, config, num_patches, **kwargs): self.config = config self.num_patches = num_patches - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): self.mask_token = self.add_weight( shape=(1, 1, self.config.decoder_hidden_size), initializer=tf.random_normal_initializer(stddev=self.config.initializer_range), @@ -873,7 +983,25 @@ def build(self, input_shape: tf.TensorShape): )[None, ...] 
self.decoder_pos_embed.assign(decoder_pos_embed) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "decoder_embed", None) is not None: + with tf.name_scope(self.decoder_embed.name): + self.decoder_embed.build(self.config.hidden_size) + if getattr(self, "decoder_norm", None) is not None: + with tf.name_scope(self.decoder_norm.name): + self.decoder_norm.build([None, None, self.config.decoder_hidden_size]) + if getattr(self, "decoder_pred", None) is not None: + with tf.name_scope(self.decoder_pred.name): + self.decoder_pred.build( + self.config.decoder_hidden_size) + if getattr(self, "decoder_layers", None) is not None: + for layer in self.decoder_layers: + with tf.name_scope(layer.name): + layer.build(None) + def call( self, @@ -1128,3 +1256,13 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "vit", None) is not None: + with tf.name_scope(self.vit.name): + self.vit.build(None) + if getattr(self, "decoder", None) is not None: + with tf.name_scope(self.decoder.name): + self.decoder.build(None) diff --git a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py index 10563579fdd55f..e02ba8956616c8 100644 --- a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py @@ -501,6 +501,13 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.conv(hidden_states) hidden_states = self.activation(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "conv", None) is not None: + with tf.name_scope(self.conv.name): + self.conv.build(None) class TFWav2Vec2LayerNormConvLayer(tf.keras.layers.Layer): @@ -524,6 +531,16 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.layer_norm(hidden_states) hidden_states = self.activation(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "conv", None) is not None: + with tf.name_scope(self.conv.name): + self.conv.build(None) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.out_conv_dim]) class TFWav2Vec2GroupNormConvLayer(tf.keras.layers.Layer): @@ -549,6 +566,16 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.layer_norm(hidden_states) hidden_states = self.activation(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "conv", None) is not None: + with tf.name_scope(self.conv.name): + self.conv.build(None) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build(None) class TFWav2Vec2PositionalConvEmbedding(tf.keras.layers.Layer): @@ -569,6 +596,13 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: hidden_states = self.padding(hidden_states) hidden_states = self.activation(hidden_states) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "conv", None) is not None: + with tf.name_scope(self.conv.name): + self.conv.build(None) class TFWav2Vec2SamePadLayer(tf.keras.layers.Layer): @@ -632,12 +666,23 @@ def 
__init__(self, config: Wav2Vec2Config, **kwargs): name="projection", ) self.dropout = tf.keras.layers.Dropout(rate=config.feat_proj_dropout) + self.config = config def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: norm_hidden_states = self.layer_norm(hidden_states) hidden_states = self.projection(norm_hidden_states) hidden_states = self.dropout(hidden_states, training=training) return hidden_states, norm_hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.conv_dim[-1]]) + if getattr(self, "projection", None) is not None: + with tf.name_scope(self.projection.name): + self.projection.build(self.config.conv_dim[-1]) # Copied from transformers.models.bart.modeling_tf_bart.TFBartAttention with TFBart->TFWav2Vec2 @@ -792,6 +837,22 @@ def call( attn_weights: tf.Tensor = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) return attn_output, attn_weights, past_key_value + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "k_proj", None) is not None: + with tf.name_scope(self.k_proj.name): + self.k_proj.build(self.embed_dim) + if getattr(self, "q_proj", None) is not None: + with tf.name_scope(self.q_proj.name): + self.q_proj.build(self.embed_dim) + if getattr(self, "v_proj", None) is not None: + with tf.name_scope(self.v_proj.name): + self.v_proj.build(self.embed_dim) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build(self.embed_dim) class TFWav2Vec2FeedForward(tf.keras.layers.Layer): @@ -815,6 +876,7 @@ def __init__(self, config: Wav2Vec2Config, **kwargs): name="output_dense", ) self.output_dropout = tf.keras.layers.Dropout(config.hidden_dropout) + self.config = config def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.intermediate_dense(hidden_states) @@ -824,6 +886,16 @@ def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: hidden_states = self.output_dense(hidden_states) hidden_states = self.output_dropout(hidden_states, training=training) return hidden_states + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "intermediate_dense", None) is not None: + with tf.name_scope(self.intermediate_dense.name): + self.intermediate_dense.build(self.config.hidden_size) + if getattr(self, "output_dense", None) is not None: + with tf.name_scope(self.output_dense.name): + self.output_dense.build(self.config.intermediate_size) class TFWav2Vec2EncoderLayer(tf.keras.layers.Layer): @@ -842,6 +914,7 @@ def __init__(self, config: Wav2Vec2Config, **kwargs): self.final_layer_norm = tf.keras.layers.LayerNormalization( epsilon=config.layer_norm_eps, name="final_layer_norm" ) + self.config = config def call( self, @@ -867,6 +940,22 @@ def call( outputs += (attn_weights,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.hidden_size]) + if getattr(self, "feed_forward", None) is not None: + with tf.name_scope(self.feed_forward.name): + 
self.feed_forward.build(None) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.config.hidden_size]) class TFWav2Vec2EncoderLayerStableLayerNorm(tf.keras.layers.Layer): @@ -885,6 +974,7 @@ def __init__(self, config: Wav2Vec2Config, **kwargs): self.final_layer_norm = tf.keras.layers.LayerNormalization( epsilon=config.layer_norm_eps, name="final_layer_norm" ) + self.config = config def call( self, @@ -908,6 +998,22 @@ def call( outputs += (attn_weights,) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.hidden_size]) + if getattr(self, "feed_forward", None) is not None: + with tf.name_scope(self.feed_forward.name): + self.feed_forward.build(None) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.config.hidden_size]) class TFWav2Vec2Encoder(tf.keras.layers.Layer): @@ -973,6 +1079,20 @@ def call( hidden_states=all_hidden_states, attentions=all_self_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "pos_conv_embed", None) is not None: + with tf.name_scope(self.pos_conv_embed.name): + self.pos_conv_embed.build(None) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.hidden_size]) + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) class TFWav2Vec2EncoderStableLayerNorm(tf.keras.layers.Layer): @@ -1040,6 +1160,20 @@ def call( hidden_states=all_hidden_states, attentions=all_self_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "pos_conv_embed", None) is not None: + with tf.name_scope(self.pos_conv_embed.name): + self.pos_conv_embed.build(None) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.hidden_size]) + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) @keras_serializable @@ -1057,12 +1191,22 @@ def __init__(self, config: Wav2Vec2Config, **kwargs): else: self.encoder = TFWav2Vec2Encoder(config, name="encoder") - def build(self, input_shape: tf.TensorShape): + def build(self, input_shape=None): self.masked_spec_embed = self.add_weight( shape=(self.config.hidden_size,), initializer="uniform", trainable=True, name="masked_spec_embed" ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "feature_extractor", None) is not None: + with tf.name_scope(self.feature_extractor.name): + self.feature_extractor.build(None) + if getattr(self, "feature_projection", None) is not None: + with tf.name_scope(self.feature_projection.name): + self.feature_projection.build(None) + def _get_feat_extract_output_lengths(self, input_lengths: tf.Tensor): """ @@ -1418,6 +1562,13 @@ def call( ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if 
getattr(self, "wav2vec2", None) is not None: + with tf.name_scope(self.wav2vec2.name): + self.wav2vec2.build(None) @add_start_docstrings( @@ -1574,6 +1725,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "wav2vec2", None) is not None: + with tf.name_scope(self.wav2vec2.name): + self.wav2vec2.build(None) + if getattr(self, "lm_head", None) is not None: + with tf.name_scope(self.lm_head.name): + self.lm_head.build(self.output_hidden_size) class TFWav2Vec2ForSequenceClassification(TFWav2Vec2PreTrainedModel): @@ -1672,3 +1833,16 @@ def call( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "wav2vec2", None) is not None: + with tf.name_scope(self.wav2vec2.name): + self.wav2vec2.build(None) + if getattr(self, "projector", None) is not None: + with tf.name_scope(self.projector.name): + self.projector.build(self.config.hidden_size) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(self.config.classifier_proj_size) diff --git a/src/transformers/models/whisper/modeling_tf_whisper.py b/src/transformers/models/whisper/modeling_tf_whisper.py index 6789758076142b..ef3cce63df805f 100644 --- a/src/transformers/models/whisper/modeling_tf_whisper.py +++ b/src/transformers/models/whisper/modeling_tf_whisper.py @@ -312,6 +312,22 @@ def call( attn_weights: tf.Tensor = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) return attn_output, attn_weights, past_key_value + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "k_proj", None) is not None: + with tf.name_scope(self.k_proj.name): + self.k_proj.build(self.embed_dim) + if getattr(self, "v_proj", None) is not None: + with tf.name_scope(self.v_proj.name): + self.v_proj.build(self.embed_dim) + if getattr(self, "q_proj", None) is not None: + with tf.name_scope(self.q_proj.name): + self.q_proj.build(self.embed_dim) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build(self.embed_dim) # Copied from transformers.models.speech_to_text.modeling_tf_speech_to_text.TFSpeech2TextEncoderLayer with Speech2Text->Whisper @@ -329,6 +345,7 @@ def __init__(self, config: WhisperConfig, **kwargs): self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.config = config def call( self, hidden_states: tf.Tensor, attention_mask: tf.Tensor, layer_head_mask: tf.Tensor, training: bool = False @@ -368,6 +385,25 @@ def call( hidden_states = residual + hidden_states return hidden_states, self_attn_weights + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "self_attn_layer_norm", None) is not None: + with tf.name_scope(self.self_attn_layer_norm.name): + self.self_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build(self.embed_dim) + if getattr(self, "fc2", None) is not None: + with 
tf.name_scope(self.fc2.name): + self.fc2.build(self.config.encoder_ffn_dim) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.embed_dim]) # Copied from transformers.models.speech_to_text.modeling_tf_speech_to_text.TFSpeech2TextDecoderLayer with Speech2Text->Whisper @@ -399,6 +435,7 @@ def __init__(self, config: WhisperConfig, **kwargs): self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.config = config def call( self, @@ -481,6 +518,31 @@ def call( cross_attn_weights, present_key_value, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "self_attn_layer_norm", None) is not None: + with tf.name_scope(self.self_attn_layer_norm.name): + self.self_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "encoder_attn", None) is not None: + with tf.name_scope(self.encoder_attn.name): + self.encoder_attn.build(None) + if getattr(self, "encoder_attn_layer_norm", None) is not None: + with tf.name_scope(self.encoder_attn_layer_norm.name): + self.encoder_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build(self.embed_dim) + if getattr(self, "fc2", None) is not None: + with tf.name_scope(self.fc2.name): + self.fc2.build(self.config.decoder_ffn_dim) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.embed_dim]) class TFWhisperPreTrainedModel(TFPreTrainedModel): @@ -748,6 +810,26 @@ def call( return TFBaseModelOutput( last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "conv1", None) is not None: + with tf.name_scope(self.conv1.name): + self.conv1.build(None) + if getattr(self, "conv2", None) is not None: + with tf.name_scope(self.conv2.name): + self.conv2.build(None) + if getattr(self, "embed_positions", None) is not None: + with tf.name_scope(self.embed_positions.name): + self.embed_positions.build(None) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.d_model]) + if getattr(self, "encoder_layers", None) is not None: + for layer in self.encoder_layers: + with tf.name_scope(layer.name): + layer.build(None) @keras_serializable @@ -987,6 +1069,23 @@ def call( attentions=all_self_attns, cross_attentions=all_cross_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embed_tokens", None) is not None: + with tf.name_scope(self.embed_tokens.name): + self.embed_tokens.build(None) + if getattr(self, "embed_positions", None) is not None: + with tf.name_scope(self.embed_positions.name): + self.embed_positions.build(None) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.d_model]) + if getattr(self, "decoder_layers", None) is not None: + for layer in 
self.decoder_layers: + with tf.name_scope(layer.name): + layer.build(None) @add_start_docstrings( @@ -1110,6 +1209,16 @@ def call( encoder_hidden_states=encoder_outputs.hidden_states, encoder_attentions=encoder_outputs.attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "decoder", None) is not None: + with tf.name_scope(self.decoder.name): + self.decoder.build(None) @add_start_docstrings( @@ -1218,6 +1327,13 @@ def serving_output(self, output): encoder_hidden_states=enc_hs, encoder_attentions=enc_attns, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "model", None) is not None: + with tf.name_scope(self.model.name): + self.model.build(None) @add_start_docstrings( @@ -1625,3 +1741,10 @@ def prepare_inputs_for_generation( "decoder_attention_mask": decoder_attention_mask, "decoder_position_ids": decoder_position_ids, } + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "model", None) is not None: + with tf.name_scope(self.model.name): + self.model.build(None) diff --git a/src/transformers/models/xglm/modeling_tf_xglm.py b/src/transformers/models/xglm/modeling_tf_xglm.py index e2890edeb665af..4ae409948d09fd 100644 --- a/src/transformers/models/xglm/modeling_tf_xglm.py +++ b/src/transformers/models/xglm/modeling_tf_xglm.py @@ -300,6 +300,22 @@ def call( attn_weights: tf.Tensor = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) return attn_output, attn_weights, past_key_value + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "k_proj", None) is not None: + with tf.name_scope(self.k_proj.name): + self.k_proj.build(self.embed_dim) + if getattr(self, "q_proj", None) is not None: + with tf.name_scope(self.q_proj.name): + self.q_proj.build(self.embed_dim) + if getattr(self, "v_proj", None) is not None: + with tf.name_scope(self.v_proj.name): + self.v_proj.build(self.embed_dim) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build(self.embed_dim) class TFXGLMDecoderLayer(tf.keras.layers.Layer): @@ -333,6 +349,7 @@ def __init__(self, config: XGLMConfig, **kwargs: Any) -> None: self.fc1 = tf.keras.layers.Dense(config.ffn_dim, name="fc1") self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.config = config # Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartDecoderLayer.call def call( @@ -414,6 +431,25 @@ def call( cross_attn_weights, present_key_value, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "self_attn_layer_norm", None) is not None: + with tf.name_scope(self.self_attn_layer_norm.name): + self.self_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build(self.embed_dim) + if getattr(self, "fc2", None) is not None: + with tf.name_scope(self.fc2.name): + self.fc2.build(self.config.ffn_dim) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + 
self.final_layer_norm.build([None, None, self.embed_dim]) @keras_serializable @@ -608,6 +644,17 @@ def call( attentions=all_self_attns, cross_attentions=all_cross_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.d_model]) + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) class TFXGLMPreTrainedModel(TFPreTrainedModel): @@ -791,6 +838,13 @@ def call( ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "model", None) is not None: + with tf.name_scope(self.model.name): + self.model.build(None) @add_start_docstrings( @@ -822,6 +876,7 @@ def __init__( kernel_initializer=get_initializer(config.init_std), name="lm_head", ) + self.config = config def get_output_embeddings(self): return self.lm_head @@ -924,3 +979,13 @@ def call( attentions=outputs.attentions, cross_attentions=outputs.cross_attentions, ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "model", None) is not None: + with tf.name_scope(self.model.name): + self.model.build(None) + if getattr(self, "lm_head", None) is not None: + with tf.name_scope(self.lm_head.name): + self.lm_head.build(self.config.hidden_size) diff --git a/src/transformers/models/xlm/modeling_tf_xlm.py b/src/transformers/models/xlm/modeling_tf_xlm.py index 9343f6cb524be0..5ed6c68a11c240 100644 --- a/src/transformers/models/xlm/modeling_tf_xlm.py +++ b/src/transformers/models/xlm/modeling_tf_xlm.py @@ -316,7 +316,7 @@ def __init__(self, config, **kwargs): if self.attentions[int(layer)].n_heads == config.n_heads: self.prune_heads({int(layer): list(map(int, heads))}) - def build(self, input_shape): + def build(self, input_shape=None): with tf.name_scope("position_embeddings"): self.position_embeddings = self.add_weight( name="embeddings", @@ -332,7 +332,17 @@ def build(self, input_shape): initializer=get_initializer(self.embed_init_std), ) - super().build(input_shape) + + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "layer_norm_emb", None) is not None: + with tf.name_scope(self.layer_norm_emb.name): + self.layer_norm_emb.build([None, None, self.dim]) + def get_input_embeddings(self): return self.embeddings @@ -733,6 +743,13 @@ def call( ) return outputs + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) class TFXLMPredLayer(tf.keras.layers.Layer): @@ -870,6 +887,16 @@ def call( return TFXLMWithLMHeadModelOutput( logits=outputs, hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions ) + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + if getattr(self, "pred_layer", None) is not None: + with tf.name_scope(self.pred_layer.name): + self.pred_layer.build(None) # TODO Matt might be wrong @add_start_docstrings( @@ -948,6 +975,16 @@ def call( hidden_states=transformer_outputs.hidden_states, 
             attentions=transformer_outputs.attentions,
         )
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "transformer", None) is not None:
+            with tf.name_scope(self.transformer.name):
+                self.transformer.build(None)
+        if getattr(self, "sequence_summary", None) is not None:
+            with tf.name_scope(self.sequence_summary.name):
+                self.sequence_summary.build(None)


 @add_start_docstrings(
@@ -966,6 +1003,7 @@ def __init__(self, config, *inputs, **kwargs):
         self.logits_proj = tf.keras.layers.Dense(
             1, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj"
         )
+        self.config = config

     @property
     def dummy_inputs(self):
@@ -1067,6 +1105,19 @@ def call(
             hidden_states=transformer_outputs.hidden_states,
             attentions=transformer_outputs.attentions,
         )
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "transformer", None) is not None:
+            with tf.name_scope(self.transformer.name):
+                self.transformer.build(None)
+        if getattr(self, "sequence_summary", None) is not None:
+            with tf.name_scope(self.sequence_summary.name):
+                self.sequence_summary.build(None)
+        if getattr(self, "logits_proj", None) is not None:
+            with tf.name_scope(self.logits_proj.name):
+                self.logits_proj.build(self.config.num_labels)


 @add_start_docstrings(
@@ -1086,6 +1137,7 @@ def __init__(self, config, *inputs, **kwargs):
         self.classifier = tf.keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.init_std), name="classifier"
         )
+        self.config = config

     @unpack_inputs
     @add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1147,6 +1199,16 @@ def call(
             hidden_states=transformer_outputs.hidden_states,
             attentions=transformer_outputs.attentions,
         )
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "transformer", None) is not None:
+            with tf.name_scope(self.transformer.name):
+                self.transformer.build(None)
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build(self.config.hidden_size)


 @add_start_docstrings(
@@ -1163,6 +1225,7 @@ def __init__(self, config, *inputs, **kwargs):
         self.qa_outputs = tf.keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.init_std), name="qa_outputs"
         )
+        self.config = config

     @unpack_inputs
     @add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1238,3 +1301,13 @@ def call(
             hidden_states=transformer_outputs.hidden_states,
             attentions=transformer_outputs.attentions,
         )
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "transformer", None) is not None:
+            with tf.name_scope(self.transformer.name):
+                self.transformer.build(None)
+        if getattr(self, "qa_outputs", None) is not None:
+            with tf.name_scope(self.qa_outputs.name):
+                self.qa_outputs.build(self.config.hidden_size)
diff --git a/src/transformers/models/xlm_roberta/modeling_tf_xlm_roberta.py b/src/transformers/models/xlm_roberta/modeling_tf_xlm_roberta.py
index 65f3be9e2f277f..d75cb45e65b2dd 100644
--- a/src/transformers/models/xlm_roberta/modeling_tf_xlm_roberta.py
+++ b/src/transformers/models/xlm_roberta/modeling_tf_xlm_roberta.py
@@ -178,7 +178,7 @@ def __init__(self, config, **kwargs):
         self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
         self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)

-    def build(self, input_shape: tf.TensorShape):
+    def build(self, input_shape=None):
         with tf.name_scope("word_embeddings"):
             self.weight = self.add_weight(
                 name="weight",
@@ -200,7 +200,14 @@ def build(self, input_shape: tf.TensorShape):
                 initializer=get_initializer(self.initializer_range),
             )

-        super().build(input_shape)
+
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "LayerNorm", None) is not None:
+            with tf.name_scope(self.LayerNorm.name):
+                self.LayerNorm.build([None, None, self.config.hidden_size])
+
     def create_position_ids_from_input_ids(self, input_ids, past_key_values_length=0):
         """
@@ -273,6 +280,7 @@ def __init__(self, config: XLMRobertaConfig, **kwargs):
             activation="tanh",
             name="dense",
         )
+        self.config = config

     def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
         # We "pool" the model by simply taking the hidden state corresponding
@@ -281,6 +289,13 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
         pooled_output = self.dense(inputs=first_token_tensor)

         return pooled_output
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build(self.config.hidden_size)


 # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->XLMRoberta
@@ -311,6 +326,7 @@ def __init__(self, config: XLMRobertaConfig, **kwargs):
         self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)

         self.is_decoder = config.is_decoder
+        self.config = config

     def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
         # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
@@ -399,6 +415,19 @@ def call(
         if self.is_decoder:
             outputs = outputs + (past_key_value,)
         return outputs
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "query", None) is not None:
+            with tf.name_scope(self.query.name):
+                self.query.build(self.config.hidden_size)
+        if getattr(self, "key", None) is not None:
+            with tf.name_scope(self.key.name):
+                self.key.build(self.config.hidden_size)
+        if getattr(self, "value", None) is not None:
+            with tf.name_scope(self.value.name):
+                self.value.build(self.config.hidden_size)


 # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->XLMRoberta
@@ -411,6 +440,7 @@ def __init__(self, config: XLMRobertaConfig, **kwargs):
         )
         self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
         self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.config = config

     def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
         hidden_states = self.dense(inputs=hidden_states)
@@ -418,6 +448,16 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool
         hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)

         return hidden_states
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build(self.config.hidden_size)
+        if getattr(self, "LayerNorm", None) is not None:
+            with tf.name_scope(self.LayerNorm.name):
+                self.LayerNorm.build([None, None, self.config.hidden_size])


 # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->XLMRoberta
@@ -459,6 +499,16 @@ def call(
         outputs = (attention_output,) + self_outputs[1:]

         return outputs
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "self_attention", None) is not None:
+            with tf.name_scope(self.self_attention.name):
+                self.self_attention.build(None)
+        if getattr(self, "dense_output", None) is not None:
+            with tf.name_scope(self.dense_output.name):
+                self.dense_output.build(None)


 # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->XLMRoberta
@@ -474,12 +524,20 @@ def __init__(self, config: XLMRobertaConfig, **kwargs):
             self.intermediate_act_fn = get_tf_activation(config.hidden_act)
         else:
             self.intermediate_act_fn = config.hidden_act
+        self.config = config

     def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
         hidden_states = self.dense(inputs=hidden_states)
         hidden_states = self.intermediate_act_fn(hidden_states)

         return hidden_states
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build(self.config.hidden_size)


 # Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->XLMRoberta
@@ -492,6 +550,7 @@ def __init__(self, config: XLMRobertaConfig, **kwargs):
         )
         self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
         self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.config = config

     def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
         hidden_states = self.dense(inputs=hidden_states)
@@ -499,6 +558,16 @@ def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool
         hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)

         return hidden_states
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build(self.config.intermediate_size)
+        if getattr(self, "LayerNorm", None) is not None:
+            with tf.name_scope(self.LayerNorm.name):
+                self.LayerNorm.build([None, None, self.config.hidden_size])


 # Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->XLMRoberta
@@ -586,6 +655,19 @@ def call(
             outputs = outputs + (present_key_value,)

         return outputs
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "attention", None) is not None:
+            with tf.name_scope(self.attention.name):
+                self.attention.build(None)
+        if getattr(self, "intermediate", None) is not None:
+            with tf.name_scope(self.intermediate.name):
+                self.intermediate.build(None)
+        if getattr(self, "bert_output", None) is not None:
+            with tf.name_scope(self.bert_output.name):
+                self.bert_output.build(None)


 # Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->XLMRoberta
@@ -656,6 +738,14 @@ def call(
             attentions=all_attentions,
             cross_attentions=all_cross_attentions,
         )
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "layer", None) is not None:
+            for layer in self.layer:
+                with tf.name_scope(layer.name):
+                    layer.build(None)


 @keras_serializable
@@ -854,6 +944,19 @@ def call(
             attentions=encoder_outputs.attentions,
             cross_attentions=encoder_outputs.cross_attentions,
         )
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "encoder", None) is not None:
+            with tf.name_scope(self.encoder.name):
+                self.encoder.build(None)
+        if getattr(self, "pooler", None) is not None:
+            with tf.name_scope(self.pooler.name):
+                self.pooler.build(None)
+        if getattr(self, "embeddings", None) is not None:
+            with tf.name_scope(self.embeddings.name):
+                self.embeddings.build(None)


 # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaPreTrainedModel with Roberta->XLMRoberta
@@ -939,6 +1042,13 @@ def call(
         )

         return outputs
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "roberta", None) is not None:
+            with tf.name_scope(self.roberta.name):
+                self.roberta.build(None)


 # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaLMHead with Roberta->XLMRoberta
@@ -960,10 +1070,20 @@ def __init__(self, config, input_embeddings, **kwargs):
         # an output-only bias for each token.
         self.decoder = input_embeddings

-    def build(self, input_shape):
+    def build(self, input_shape=None):
         self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")

-        super().build(input_shape)
+
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build(self.config.hidden_size)
+        if getattr(self, "layer_norm", None) is not None:
+            with tf.name_scope(self.layer_norm.name):
+                self.layer_norm.build([None, None, self.config.hidden_size])
+
     def get_output_embeddings(self):
         return self.decoder
@@ -1071,6 +1191,16 @@ def call(
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
         )
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "roberta", None) is not None:
+            with tf.name_scope(self.roberta.name):
+                self.roberta.build(None)
+        if getattr(self, "lm_head", None) is not None:
+            with tf.name_scope(self.lm_head.name):
+                self.lm_head.build(None)


 @add_start_docstrings(
@@ -1198,6 +1328,16 @@ def call(
             attentions=outputs.attentions,
             cross_attentions=outputs.cross_attentions,
         )
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "roberta", None) is not None:
+            with tf.name_scope(self.roberta.name):
+                self.roberta.build(None)
+        if getattr(self, "lm_head", None) is not None:
+            with tf.name_scope(self.lm_head.name):
+                self.lm_head.build(None)


 # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaClassificationHead with Roberta->XLMRoberta
@@ -1219,6 +1359,7 @@ def __init__(self, config, **kwargs):
         self.out_proj = tf.keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
         )
+        self.config = config

     def call(self, features, training=False):
         x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
@@ -1227,6 +1368,16 @@ def call(self, features, training=False):
         x = self.dropout(x, training=training)
         x = self.out_proj(x)
         return x
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build(self.config.hidden_size)
+        if getattr(self, "out_proj", None) is not None:
+            with tf.name_scope(self.out_proj.name):
+                self.out_proj.build(self.config.hidden_size)


 @add_start_docstrings(
@@ -1304,6 +1455,16 @@ def call(
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
         )
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "roberta", None) is not None:
+            with tf.name_scope(self.roberta.name):
+                self.roberta.build(None)
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build(None)


 @add_start_docstrings(
@@ -1327,6 +1488,7 @@ def __init__(self, config, *inputs, **kwargs):
         self.classifier = tf.keras.layers.Dense(
             1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )
+        self.config = config

     @unpack_inputs
     @add_start_docstrings_to_model_forward(
@@ -1397,6 +1559,16 @@ def call(
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
         )
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "roberta", None) is not None:
+            with tf.name_scope(self.roberta.name):
+                self.roberta.build(None)
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build(self.config.hidden_size)


 @add_start_docstrings(
@@ -1424,6 +1596,7 @@ def __init__(self, config, *inputs, **kwargs):
         self.classifier = tf.keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )
+        self.config = config

     @unpack_inputs
     @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1481,6 +1654,16 @@ def call(
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
         )
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "roberta", None) is not None:
+            with tf.name_scope(self.roberta.name):
+                self.roberta.build(None)
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build(self.config.hidden_size)


 @add_start_docstrings(
@@ -1503,6 +1686,7 @@ def __init__(self, config, *inputs, **kwargs):
         self.qa_outputs = tf.keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
         )
+        self.config = config

     @unpack_inputs
     @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1574,3 +1758,13 @@ def call(
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
         )
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "roberta", None) is not None:
+            with tf.name_scope(self.roberta.name):
+                self.roberta.build(None)
+        if getattr(self, "qa_outputs", None) is not None:
+            with tf.name_scope(self.qa_outputs.name):
+                self.qa_outputs.build(self.config.hidden_size)
diff --git a/src/transformers/models/xlnet/modeling_tf_xlnet.py b/src/transformers/models/xlnet/modeling_tf_xlnet.py
index a0e6a8c2aa5072..90584c9034c27c 100644
--- a/src/transformers/models/xlnet/modeling_tf_xlnet.py
+++ b/src/transformers/models/xlnet/modeling_tf_xlnet.py
@@ -85,8 +85,9 @@ def __init__(self, config, **kwargs):

         self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
         self.dropout = tf.keras.layers.Dropout(config.dropout)
+        self.config = config

-    def build(self, input_shape):
+    def build(self, input_shape=None):
         initializer = get_initializer(self.initializer_range)
         self.q = self.add_weight(
             shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="q"
@@ -115,7 +116,14 @@ def build(self, input_shape):
         self.seg_embed = self.add_weight(
             shape=(2, self.n_head, self.d_head), initializer=initializer, trainable=True, name="seg_embed"
         )
-        super().build(input_shape)
+
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "layer_norm", None) is not None:
+            with tf.name_scope(self.layer_norm.name):
+                self.layer_norm.build([None, None, self.config.d_model])
+
     def prune_heads(self, heads):
         raise NotImplementedError
@@ -344,6 +352,7 @@ def __init__(self, config, **kwargs):
             self.activation_function = get_tf_activation(config.ff_activation)
         else:
             self.activation_function = config.ff_activation
+        self.config = config

     def call(self, inp, training=False):
         output = inp
@@ -354,6 +363,19 @@ def call(self, inp, training=False):
         output = self.dropout(output, training=training)
         output = self.layer_norm(output + inp)
         return output
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "layer_norm", None) is not None:
+            with tf.name_scope(self.layer_norm.name):
+                self.layer_norm.build([None, None, self.config.d_model])
+        if getattr(self, "layer_1", None) is not None:
+            with tf.name_scope(self.layer_1.name):
+                self.layer_1.build(self.config.d_model)
+        if getattr(self, "layer_2", None) is not None:
+            with tf.name_scope(self.layer_2.name):
+                self.layer_2.build(self.config.d_inner)


 class TFXLNetLayer(tf.keras.layers.Layer):
@@ -398,6 +420,16 @@ def call(
         outputs = (output_h, output_g) + outputs[2:]  # Add again attentions if there are there
         return outputs
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "rel_attn", None) is not None:
+            with tf.name_scope(self.rel_attn.name):
+                self.rel_attn.build(None)
+        if getattr(self, "ff", None) is not None:
+            with tf.name_scope(self.ff.name):
+                self.ff.build(None)


 class TFXLNetLMHead(tf.keras.layers.Layer):
@@ -471,12 +503,23 @@ def set_input_embeddings(self, value):
         self.word_embedding.weight = value
         self.word_embedding.vocab_size = shape_list(value)[0]

-    def build(self, input_shape):
+    def build(self, input_shape=None):
         initializer = get_initializer(self.initializer_range)
         self.mask_emb = self.add_weight(
             shape=(1, 1, self.d_model), initializer=initializer, trainable=True, name="mask_emb"
         )
-        super().build(input_shape)
+
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "word_embedding", None) is not None:
+            with tf.name_scope(self.word_embedding.name):
+                self.word_embedding.build(None)
+        if getattr(self, "layer", None) is not None:
+            for layer in self.layer:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+
     def _prune_heads(self, heads_to_prune):
         raise NotImplementedError
@@ -1176,6 +1219,13 @@ def call(
         )

         return outputs
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "transformer", None) is not None:
+            with tf.name_scope(self.transformer.name):
+                self.transformer.build(None)


 @add_start_docstrings(
@@ -1335,6 +1385,16 @@ def call(
             hidden_states=transformer_outputs.hidden_states,
             attentions=transformer_outputs.attentions,
         )
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "transformer", None) is not None:
+            with tf.name_scope(self.transformer.name):
+                self.transformer.build(None)
+        if getattr(self, "lm_loss", None) is not None:
+            with tf.name_scope(self.lm_loss.name):
+                self.lm_loss.build(None)


 @add_start_docstrings(
@@ -1356,6 +1416,7 @@ def __init__(self, config, *inputs, **kwargs):
         self.logits_proj = tf.keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj"
         )
+        self.config = config

     @unpack_inputs
     @add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1422,6 +1483,19 @@ def call(
             hidden_states=transformer_outputs.hidden_states,
             attentions=transformer_outputs.attentions,
         )
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "transformer", None) is not None:
+            with tf.name_scope(self.transformer.name):
+                self.transformer.build(None)
+        if getattr(self, "sequence_summary", None) is not None:
+            with tf.name_scope(self.sequence_summary.name):
+                self.sequence_summary.build(None)
+        if getattr(self, "logits_proj", None) is not None:
+            with tf.name_scope(self.logits_proj.name):
+                self.logits_proj.build(self.config.d_model)


 @add_start_docstrings(
@@ -1442,6 +1516,7 @@ def __init__(self, config, *inputs, **kwargs):
         self.logits_proj = tf.keras.layers.Dense(
             1, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj"
         )
+        self.config = config

     @unpack_inputs
     @add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@@ -1523,6 +1598,19 @@ def call(
             hidden_states=transformer_outputs.hidden_states,
             attentions=transformer_outputs.attentions,
         )
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "transformer", None) is not None:
+            with tf.name_scope(self.transformer.name):
+                self.transformer.build(None)
+        if getattr(self, "sequence_summary", None) is not None:
+            with tf.name_scope(self.sequence_summary.name):
+                self.sequence_summary.build(None)
+        if getattr(self, "logits_proj", None) is not None:
+            with tf.name_scope(self.logits_proj.name):
+                self.logits_proj.build(self.config.d_model)


 @add_start_docstrings(
@@ -1541,6 +1629,7 @@ def __init__(self, config, *inputs, **kwargs):
         self.classifier = tf.keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )
+        self.config = config

     @unpack_inputs
     @add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1603,6 +1692,16 @@ def call(
             hidden_states=transformer_outputs.hidden_states,
             attentions=transformer_outputs.attentions,
         )
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "transformer", None) is not None:
+            with tf.name_scope(self.transformer.name):
+                self.transformer.build(None)
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build(self.config.hidden_size)


 @add_start_docstrings(
@@ -1619,6 +1718,7 @@ def __init__(self, config, *inputs, **kwargs):
         self.qa_outputs = tf.keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
         )
+        self.config = config

     @unpack_inputs
     @add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@@ -1697,3 +1797,13 @@ def call(
             hidden_states=transformer_outputs.hidden_states,
             attentions=transformer_outputs.attentions,
         )
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "transformer", None) is not None:
+            with tf.name_scope(self.transformer.name):
+                self.transformer.build(None)
+        if getattr(self, "qa_outputs", None) is not None:
+            with tf.name_scope(self.qa_outputs.name):
+                self.qa_outputs.build(self.config.hidden_size)
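
The hunks above repeat one mechanical pattern: each layer gains a build(self, input_shape=None) that returns early once built and then builds every sublayer inside tf.name_scope(sublayer.name), feeding it an input shape taken from the config instead of relying on a dummy forward pass. The snippet below is a minimal, self-contained sketch of that pattern outside transformers; ToyBlock and HIDDEN_SIZE are hypothetical names, not part of this PR.

import tensorflow as tf

HIDDEN_SIZE = 64  # hypothetical stand-in for a config value such as config.hidden_size


class ToyBlock(tf.keras.layers.Layer):
    """Toy layer mirroring the build() pattern used in this diff (illustration only)."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.dense = tf.keras.layers.Dense(HIDDEN_SIZE, name="dense")
        self.LayerNorm = tf.keras.layers.LayerNormalization(name="LayerNorm")

    def call(self, hidden_states):
        return self.LayerNorm(self.dense(hidden_states))

    def build(self, input_shape=None):
        if self.built:  # repeated calls are no-ops
            return
        self.built = True
        # Build each sublayer with an explicit input shape inside its own name
        # scope, so its weights are created without running any data through it.
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                self.dense.build([None, None, HIDDEN_SIZE])
        if getattr(self, "LayerNorm", None) is not None:
            with tf.name_scope(self.LayerNorm.name):
                self.LayerNorm.build([None, None, HIDDEN_SIZE])


block = ToyBlock(name="block")
block.build(None)  # weights now exist; no dummy input was needed
print([w.name for w in block.weights])

Building sublayers under their own name scopes appears to be what keeps the resulting variable names in line with those a dummy forward pass would have produced, so existing TF checkpoints continue to load against the shape-aware build() methods added here.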