diff --git a/paddlevlp/__init__.py b/paddlevlp/__init__.py
index 058cfa738ac127..555bc54942dc51 100644
--- a/paddlevlp/__init__.py
+++ b/paddlevlp/__init__.py
@@ -17,3 +17,4 @@
 from .models import *
 from .optimization import *
 from .processors import *
+from .tests import *
\ No newline at end of file
diff --git a/paddlevlp/models/blip2/__init__.py b/paddlevlp/models/blip2/__init__.py
index 595add0aed9e11..10888d464703e6 100644
--- a/paddlevlp/models/blip2/__init__.py
+++ b/paddlevlp/models/blip2/__init__.py
@@ -11,3 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from .configuration import *
+from .modeling import *
+
+__all__ = configuration.__all__ + modeling.__all__
\ No newline at end of file
diff --git a/paddlevlp/models/blip2/modeling.py b/paddlevlp/models/blip2/modeling.py
index 19996e74ee9e83..27d5ec124c2d57 100644
--- a/paddlevlp/models/blip2/modeling.py
+++ b/paddlevlp/models/blip2/modeling.py
@@ -12,7 +12,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 """ Paddle BLIP2 model."""
 from paddlevlp.utils.log import logger
 import math
@@ -61,8 +60,7 @@ def Parameter(tensor):
     return paddle.create_parameter(
         tensor.shape,
         dtype=tensor.dtype,
-        default_initializer=nn.initializer.Assign(tensor),
-    )
+        default_initializer=nn.initializer.Assign(tensor), )
 
 
 @dataclass
@@ -90,11 +88,9 @@ class Blip2ForConditionalGenerationModelOutput(ModelOutput):
 
     def to_tuple(self) -> Tuple[Any]:
         return tuple(
-            self[k]
-            if k not in ["vision_outputs", "qformer_outputs", "language_model_outputs"]
-            else getattr(self, k).to_tuple()
-            for k in self.keys()
-        )
+            self[k] if k not in
+            ["vision_outputs", "qformer_outputs", "language_model_outputs"] else
+            getattr(self, k).to_tuple() for k in self.keys())
 
 
 # Copied from paddlenlp.transformers.blip.modeling.BlipVisionEmbeddings with Blip->Blip2
@@ -107,40 +103,34 @@ def __init__(self, config: Blip2VisionConfig):
         self.patch_size = config.patch_size
 
         self.class_embedding = Parameter(
-            paddle.randn([1, 1, self.embed_dim], dtype=paddle.get_default_dtype()),
-        )
+            paddle.randn(
+                [1, 1, self.embed_dim], dtype=paddle.get_default_dtype()), )
 
         self.patch_embedding = nn.Conv2D(
             in_channels=3,
             out_channels=self.embed_dim,
             kernel_size=self.patch_size,
-            stride=self.patch_size,
-        )
+            stride=self.patch_size, )
 
-        self.num_patches = (self.image_size // self.patch_size) ** 2
+        self.num_patches = (self.image_size // self.patch_size)**2
 
         self.num_positions = self.num_patches + 1
 
         self.position_embedding = Parameter(
             paddle.randn(
                 [1, self.num_positions, self.embed_dim],
-                dtype=paddle.get_default_dtype(),
-            )
-        )
+                dtype=paddle.get_default_dtype(), ))
 
     def forward(self, pixel_values: paddle.Tensor) -> paddle.Tensor:
         batch_size = pixel_values.shape[0]
         target_dtype = self.patch_embedding.weight.dtype
         patch_embeds = self.patch_embedding(
-            pixel_values
-        )  # shape = [*, width, grid, grid]
+            pixel_values)  # shape = [*, width, grid, grid]
         patch_embeds = patch_embeds.flatten(2).transpose([0, 2, 1])
 
-        class_embeds = self.class_embedding.expand([batch_size, 1, -1]).cast(
-            target_dtype
-        )
+        class_embeds = self.class_embedding.expand(
+            [batch_size, 1, -1]).cast(target_dtype)
         embeddings = paddle.concat([class_embeds, patch_embeds], axis=1)
-        embeddings = embeddings + self.position_embedding[
-            :, : embeddings.shape[1], :
- ].cast(target_dtype) + embeddings = embeddings + self.position_embedding[:, :embeddings.shape[ + 1], :].cast(target_dtype) return embeddings @@ -156,42 +146,44 @@ def __init__(self, config): if self.head_dim * self.num_heads != self.embed_dim: raise ValueError( f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" - f" {self.num_heads})." - ) + f" {self.num_heads}).") self.scale = self.head_dim**-0.5 self.dropout = nn.Dropout(config.attention_dropout) # small tweak here compared to CLIP, no bias here - self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias_attr=False) + self.qkv = nn.Linear( + self.embed_dim, 3 * self.embed_dim, bias_attr=False) if config.qkv_bias: q_bias = Parameter( - paddle.zeros([self.embed_dim], dtype=paddle.get_default_dtype()) - ) + paddle.zeros( + [self.embed_dim], dtype=paddle.get_default_dtype())) v_bias = Parameter( - paddle.zeros([self.embed_dim], dtype=paddle.get_default_dtype()) - ) + paddle.zeros( + [self.embed_dim], dtype=paddle.get_default_dtype())) else: q_bias = None v_bias = None if q_bias is not None: - qkv_bias = paddle.concat((q_bias, paddle.zeros_like(v_bias), v_bias)) + qkv_bias = paddle.concat( + (q_bias, paddle.zeros_like(v_bias), v_bias)) self.qkv.bias = Parameter(qkv_bias) self.projection = nn.Linear(self.embed_dim, self.embed_dim) def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int): - return tensor.reshape([bsz, seq_len, self.num_heads, self.head_dim]).transpose( - [0, 2, 1, 3] - ) + return tensor.reshape( + [bsz, seq_len, self.num_heads, self.head_dim]).transpose( + [0, 2, 1, 3]) def forward( - self, - hidden_states: paddle.Tensor, - head_mask: Optional[paddle.Tensor] = None, - output_attentions: Optional[bool] = False, - ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: + self, + hidden_states: paddle.Tensor, + head_mask: Optional[paddle.Tensor]=None, + output_attentions: Optional[bool]=False, ) -> Tuple[ + paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[ + paddle.Tensor]]]: """Input shape: Batch x Time x Channel""" bsz, tgt_len, embed_dim = hidden_states.shape @@ -199,16 +191,16 @@ def forward( mixed_qkv = self.qkv(hidden_states) mixed_qkv = mixed_qkv.reshape( - [bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads] - ).transpose([2, 0, 3, 1, 4]) + [bsz, tgt_len, 3, self.num_heads, + embed_dim // self.num_heads]).transpose([2, 0, 3, 1, 4]) query_states, key_states, value_states = ( mixed_qkv[0], mixed_qkv[1], - mixed_qkv[2], - ) + mixed_qkv[2], ) # Take the dot product between "query" and "key" to get the raw attention scores. 
- attention_scores = paddle.matmul(query_states, key_states, transpose_y=True) + attention_scores = paddle.matmul( + query_states, key_states, transpose_y=True) attention_scores = attention_scores * self.scale @@ -224,17 +216,15 @@ def forward( attention_probs = attention_probs * head_mask context_layer = paddle.matmul(attention_probs, value_states).transpose( - [0, 2, 1, 3] - ) + [0, 2, 1, 3]) - new_context_layer_shape = context_layer.shape[:-2] + [ - self.embed_dim, - ] + new_context_layer_shape = context_layer.shape[:-2] + [self.embed_dim, ] context_layer = context_layer.reshape(new_context_layer_shape) output = self.projection(context_layer) - outputs = (output, attention_probs) if output_attentions else (output, None) + outputs = (output, attention_probs) if output_attentions else (output, + None) return outputs @@ -261,16 +251,17 @@ def __init__(self, config: Blip2Config): super().__init__() self.embed_dim = config.hidden_size self.self_attn = Blip2Attention(config) - self.layer_norm1 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) + self.layer_norm1 = nn.LayerNorm( + self.embed_dim, epsilon=config.layer_norm_eps) self.mlp = Blip2MLP(config) - self.layer_norm2 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) + self.layer_norm2 = nn.LayerNorm( + self.embed_dim, epsilon=config.layer_norm_eps) def forward( - self, - hidden_states: paddle.Tensor, - attention_mask: paddle.Tensor, - output_attentions: Optional[bool] = False, - ) -> Tuple[paddle.Tensor]: + self, + hidden_states: paddle.Tensor, + attention_mask: paddle.Tensor, + output_attentions: Optional[bool]=False, ) -> Tuple[paddle.Tensor]: """ Args: hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` @@ -288,8 +279,7 @@ def forward( hidden_states, attn_weights = self.self_attn( hidden_states=hidden_states, head_mask=attention_mask, - output_attentions=output_attentions, - ) + output_attentions=output_attentions, ) hidden_states = hidden_states + residual residual = hidden_states @@ -298,10 +288,10 @@ def forward( hidden_states = hidden_states + residual - outputs = (hidden_states,) + outputs = (hidden_states, ) if output_attentions: - outputs += (attn_weights,) + outputs += (attn_weights, ) return outputs @@ -326,11 +316,8 @@ class Blip2PretrainedModel(PretrainedModel): def _init_weights(self, module): """Initialize the weights""" factor = self.config.initializer_range - if ( - isinstance(module, nn.Conv2D) - or isinstance(module, nn.Embedding) - or isinstance(module, nn.Linear) - ): + if (isinstance(module, nn.Conv2D) or isinstance(module, nn.Embedding) or + isinstance(module, nn.Linear)): normal_(module.weight, mean=0.0, std=factor) if hasattr(module, "bias") and module.bias is not None: zeros_(module.bias) @@ -340,9 +327,7 @@ def _init_weights(self, module): factor = self.config.vision_config.initializer_range trunc_normal_ = nn.initializer.TruncatedNormal(mean=0.0, std=factor) trunc_normal_(module.position_embedding) - trunc_normal_( - module.class_embedding, - ) + trunc_normal_(module.class_embedding, ) elif isinstance(module, nn.LayerNorm): zeros_(module.bias) @@ -367,19 +352,19 @@ class Blip2Encoder(nn.Layer): def __init__(self, config: Blip2Config): super().__init__() self.config = config - self.layers = nn.LayerList( - [Blip2EncoderLayer(config) for _ in range(config.num_hidden_layers)] - ) + self.layers = nn.LayerList([ + Blip2EncoderLayer(config) for _ in range(config.num_hidden_layers) + ]) self.gradient_checkpointing = False def forward( - self, - 
inputs_embeds, - attention_mask: Optional[paddle.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: + self, + inputs_embeds, + attention_mask: Optional[paddle.Tensor]=None, + output_attentions: Optional[bool]=None, + output_hidden_states: Optional[bool]=None, + return_dict: Optional[bool]=None, ) -> Union[Tuple, + BaseModelOutput]: r""" Args: inputs_embeds (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -400,27 +385,22 @@ def forward( return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) + output_attentions = (output_attentions if output_attentions is not None + else self.config.output_attentions) + output_hidden_states = (output_hidden_states + if output_hidden_states is not None else + self.config.output_hidden_states) + return_dict = (return_dict if return_dict is not None else + self.config.use_return_dict) encoder_states = () if output_hidden_states else None all_attentions = () if output_attentions else None - logger.info("gradient_checkpointing of qformer:{}".format(self.gradient_checkpointing)) + logger.info("gradient_checkpointing of qformer:{}".format( + self.gradient_checkpointing)) hidden_states = inputs_embeds for idx, encoder_layer in enumerate(self.layers): if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) + encoder_states = encoder_states + (hidden_states, ) if self.gradient_checkpointing and self.training: def create_custom_forward(module): @@ -432,34 +412,29 @@ def custom_forward(*inputs): layer_outputs = recompute( create_custom_forward(encoder_layer), hidden_states, - attention_mask, - ) + attention_mask, ) else: layer_outputs = encoder_layer( hidden_states, attention_mask, - output_attentions=output_attentions, - ) + output_attentions=output_attentions, ) hidden_states = layer_outputs[0] if output_attentions: - all_attentions = all_attentions + (layer_outputs[1],) + all_attentions = all_attentions + (layer_outputs[1], ) if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) + encoder_states = encoder_states + (hidden_states, ) if not return_dict: return tuple( - v - for v in [hidden_states, encoder_states, all_attentions] - if v is not None - ) + v for v in [hidden_states, encoder_states, all_attentions] + if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, hidden_states=encoder_states, - attentions=all_attentions, - ) + attentions=all_attentions, ) class Blip2VisionModel(Blip2PretrainedModel): @@ -473,31 +448,26 @@ def __init__(self, config: Blip2VisionConfig): self.embeddings = Blip2VisionEmbeddings(config) self.encoder = Blip2Encoder(config) - self.post_layernorm = nn.LayerNorm(embed_dim, epsilon=config.layer_norm_eps) + self.post_layernorm = nn.LayerNorm( + embed_dim, epsilon=config.layer_norm_eps) def forward( - self, - pixel_values: Optional[paddle.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, 
BaseModelOutputWithPooling]: + self, + pixel_values: Optional[paddle.Tensor]=None, + output_attentions: Optional[bool]=None, + output_hidden_states: Optional[bool]=None, + return_dict: Optional[bool]=None, ) -> Union[ + Tuple, BaseModelOutputWithPooling]: r""" Returns: """ - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) + output_attentions = (output_attentions if output_attentions is not None + else self.config.output_attentions) + output_hidden_states = (output_hidden_states + if output_hidden_states is not None else + self.config.output_hidden_states) + return_dict = (return_dict if return_dict is not None else + self.config.use_return_dict) if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -508,8 +478,7 @@ def forward( inputs_embeds=hidden_states, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) + return_dict=return_dict, ) last_hidden_state = encoder_outputs[0] last_hidden_state = self.post_layernorm(last_hidden_state) @@ -524,8 +493,7 @@ def forward( last_hidden_state=last_hidden_state, pooler_output=pooled_output, hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) + attentions=encoder_outputs.attentions, ) def get_input_embeddings(self): return self.embeddings @@ -536,37 +504,34 @@ def __init__(self, config, is_cross_attention=False): super().__init__() self.config = config if config.hidden_size % config.num_attention_heads != 0 and not hasattr( - config, "embedding_size" - ): + config, "embedding_size"): raise ValueError( "The hidden size (%d) is not a multiple of the number of attention heads (%d)" - % (config.hidden_size, config.num_attention_heads) - ) + % (config.hidden_size, config.num_attention_heads)) self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.attention_head_size = int(config.hidden_size / + config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size self.query = nn.Linear(config.hidden_size, self.all_head_size) if is_cross_attention: self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size) - self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size) + self.value = nn.Linear(config.encoder_hidden_size, + self.all_head_size) else: self.key = nn.Linear(config.hidden_size, self.all_head_size) self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) self.position_embedding_type = getattr( - config, "position_embedding_type", "absolute" - ) - if ( - self.position_embedding_type == "relative_key" - or self.position_embedding_type == "relative_key_query" - ): + config, "position_embedding_type", "absolute") + if (self.position_embedding_type == "relative_key" or + self.position_embedding_type == "relative_key_query"): self.max_position_embeddings = config.max_position_embeddings self.distance_embedding = nn.Embedding( - 2 * config.max_position_embeddings - 1, self.attention_head_size - ) + 2 * config.max_position_embeddings - 1, + self.attention_head_size) self.save_attention = False def save_attn_gradients(self, attn_gradients): 
@@ -590,29 +555,31 @@ def transpose_for_scores(self, x): return x.transpose([0, 2, 1, 3]) def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - ): + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, ): # If this is instantiated as a cross-attention module, the keys # and values come from an encoder; the attention mask needs to be # such that the encoder's padding tokens are not attended to. is_cross_attention = encoder_hidden_states is not None if is_cross_attention: - key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) - value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + key_layer = self.transpose_for_scores( + self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores( + self.value(encoder_hidden_states)) attention_mask = encoder_attention_mask elif past_key_value is not None: key_layer = self.transpose_for_scores(self.key(hidden_states)) value_layer = self.transpose_for_scores(self.value(hidden_states)) key_layer = paddle.concat([past_key_value[0], key_layer], axis=2) - value_layer = paddle.concat([past_key_value[1], value_layer], axis=2) + value_layer = paddle.concat( + [past_key_value[1], value_layer], axis=2) else: key_layer = self.transpose_for_scores(self.key(hidden_states)) value_layer = self.transpose_for_scores(self.value(hidden_states)) @@ -624,42 +591,37 @@ def forward( past_key_value = (key_layer, value_layer) # Take the dot product between "query" and "key" to get the raw attention scores. - attention_scores = paddle.matmul(query_layer, key_layer, transpose_y=True) + attention_scores = paddle.matmul( + query_layer, key_layer, transpose_y=True) - if ( - self.position_embedding_type == "relative_key" - or self.position_embedding_type == "relative_key_query" - ): + if (self.position_embedding_type == "relative_key" or + self.position_embedding_type == "relative_key_query"): seq_length = hidden_states.shape[1] - position_ids_l = paddle.arange(seq_length, dtype="int64").reshape([-1, 1]) - position_ids_r = paddle.arange(seq_length, dtype="int64").reshape([1, -1]) + position_ids_l = paddle.arange( + seq_length, dtype="int64").reshape([-1, 1]) + position_ids_r = paddle.arange( + seq_length, dtype="int64").reshape([1, -1]) distance = position_ids_l - position_ids_r positional_embedding = self.distance_embedding( - distance + self.max_position_embeddings - 1 - ) + distance + self.max_position_embeddings - 1) positional_embedding = positional_embedding.cast( - dtype=query_layer.dtype - ) # fp16 compatibility + dtype=query_layer.dtype) # fp16 compatibility if self.position_embedding_type == "relative_key": relative_position_scores = paddle.einsum( - "bhld,lrd->bhlr", query_layer, positional_embedding - ) + "bhld,lrd->bhlr", query_layer, positional_embedding) attention_scores = attention_scores + relative_position_scores elif self.position_embedding_type == "relative_key_query": relative_position_scores_query = paddle.einsum( - "bhld,lrd->bhlr", query_layer, positional_embedding - ) + "bhld,lrd->bhlr", query_layer, positional_embedding) relative_position_scores_key = paddle.einsum( - "bhrd,lrd->bhlr", key_layer, positional_embedding - ) + "bhrd,lrd->bhlr", key_layer, positional_embedding) attention_scores = ( - attention_scores - + relative_position_scores_query - + 
relative_position_scores_key - ) + attention_scores + relative_position_scores_query + + relative_position_scores_key) - attention_scores = attention_scores / math.sqrt(self.attention_head_size) + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in BertModel forward() function) @@ -688,11 +650,10 @@ def forward( ] context_layer = context_layer.reshape(new_context_layer_shape) - outputs = ( - (context_layer, attention_probs) if output_attentions else (context_layer,) - ) + outputs = ((context_layer, attention_probs) + if output_attentions else (context_layer, )) - outputs = outputs + (past_key_value,) + outputs = outputs + (past_key_value, ) return outputs @@ -701,12 +662,12 @@ class Blip2QFormerSelfOutput(nn.Layer): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.LayerNorm = nn.LayerNorm( + config.hidden_size, epsilon=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) - def forward( - self, hidden_states: paddle.Tensor, input_tensor: paddle.Tensor - ) -> paddle.Tensor: + def forward(self, hidden_states: paddle.Tensor, + input_tensor: paddle.Tensor) -> paddle.Tensor: hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = self.LayerNorm(hidden_states + input_tensor) @@ -716,7 +677,8 @@ def forward( class Blip2QFormerAttention(nn.Layer): def __init__(self, config, is_cross_attention=False): super().__init__() - self.attention = Blip2QFormerMultiHeadAttention(config, is_cross_attention) + self.attention = Blip2QFormerMultiHeadAttention(config, + is_cross_attention) self.output = Blip2QFormerSelfOutput(config) self.pruned_heads = set() @@ -727,8 +689,7 @@ def prune_heads(self, heads): heads, self.attention.num_attention_heads, self.attention.attention_head_size, - self.pruned_heads, - ) + self.pruned_heads, ) # Prune linear layers self.attention.query = prune_linear_layer(self.attention.query, index) @@ -738,23 +699,20 @@ def prune_heads(self, heads): # Update hyper params and store pruned heads self.attention.num_attention_heads = self.attention.num_attention_heads - len( - heads - ) - self.attention.all_head_size = ( - self.attention.attention_head_size * self.attention.num_attention_heads - ) + heads) + self.attention.all_head_size = (self.attention.attention_head_size * + self.attention.num_attention_heads) self.pruned_heads = self.pruned_heads.union(heads) def forward( - self, - hidden_states: paddle.Tensor, - attention_mask: Optional[paddle.Tensor] = None, - head_mask: Optional[paddle.Tensor] = None, - encoder_hidden_states: Optional[paddle.Tensor] = None, - encoder_attention_mask: Optional[paddle.Tensor] = None, - past_key_value: Optional[Tuple[Tuple[paddle.Tensor]]] = None, - output_attentions: Optional[bool] = False, - ) -> Tuple[paddle.Tensor]: + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor]=None, + head_mask: Optional[paddle.Tensor]=None, + encoder_hidden_states: Optional[paddle.Tensor]=None, + encoder_attention_mask: Optional[paddle.Tensor]=None, + past_key_value: Optional[Tuple[Tuple[paddle.Tensor]]]=None, + output_attentions: Optional[bool]=False, ) -> Tuple[paddle.Tensor]: self_outputs = self.attention( hidden_states, attention_mask, @@ -762,12 +720,10 @@ def forward( encoder_hidden_states, 
encoder_attention_mask, past_key_value, - output_attentions, - ) + output_attentions, ) attention_output = self.output(self_outputs[0], hidden_states) - outputs = (attention_output,) + self_outputs[ - 1: - ] # add attentions if we output them + outputs = (attention_output, + ) + self_outputs[1:] # add attentions if we output them return outputs @@ -792,12 +748,12 @@ class Blip2QFormerOutput(nn.Layer): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.LayerNorm = nn.LayerNorm( + config.hidden_size, epsilon=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) - def forward( - self, hidden_states: paddle.Tensor, input_tensor: paddle.Tensor - ) -> paddle.Tensor: + def forward(self, hidden_states: paddle.Tensor, + input_tensor: paddle.Tensor) -> paddle.Tensor: hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = self.LayerNorm(hidden_states + input_tensor) @@ -814,7 +770,8 @@ def __init__(self, config, layer_idx): self.layer_idx = layer_idx if layer_idx % config.cross_attention_frequency == 0: - self.crossattention = Blip2QFormerAttention(config, is_cross_attention=True) + self.crossattention = Blip2QFormerAttention( + config, is_cross_attention=True) self.has_cross_attention = True else: self.has_cross_attention = False @@ -823,27 +780,24 @@ def __init__(self, config, layer_idx): self.output_query = Blip2QFormerOutput(config) def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - query_length=0, - ): + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + query_length=0, ): # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 - self_attn_past_key_value = ( - past_key_value[:2] if past_key_value is not None else None - ) + self_attn_past_key_value = (past_key_value[:2] + if past_key_value is not None else None) self_attention_outputs = self.attention( hidden_states, attention_mask, head_mask, output_attentions=output_attentions, - past_key_value=self_attn_past_key_value, - ) + past_key_value=self_attn_past_key_value, ) attention_output = self_attention_outputs[0] outputs = self_attention_outputs[1:-1] @@ -863,8 +817,7 @@ def forward( head_mask, encoder_hidden_states, encoder_attention_mask, - output_attentions=output_attentions, - ) + output_attentions=output_attentions, ) query_attention_output = cross_attention_outputs[0] # add cross attentions if we output attention weights outputs = outputs + cross_attention_outputs[1:-1] @@ -873,27 +826,25 @@ def forward( self.feed_forward_chunk_query, self.chunk_size_feed_forward, self.seq_len_dim, - query_attention_output, - ) + query_attention_output, ) if attention_output.shape[1] > query_length: layer_output_text = apply_chunking_to_forward( self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, - attention_output[:, query_length:, :], - ) - layer_output = paddle.concat([layer_output, layer_output_text], axis=1) + attention_output[:, query_length:, :], ) + layer_output = paddle.concat( + [layer_output, layer_output_text], axis=1) else: layer_output = apply_chunking_to_forward( self.feed_forward_chunk, 
self.chunk_size_feed_forward, self.seq_len_dim, - attention_output, - ) - outputs = (layer_output,) + outputs + attention_output, ) + outputs = (layer_output, ) + outputs - outputs = outputs + (present_key_value,) + outputs = outputs + (present_key_value, ) return outputs @@ -912,28 +863,25 @@ class Blip2QFormerEncoder(nn.Layer): def __init__(self, config): super().__init__() self.config = config - self.layer = nn.LayerList( - [ - Blip2QFormerLayer(config, layer_idx) - for layer_idx in range(config.num_hidden_layers) - ] - ) + self.layer = nn.LayerList([ + Blip2QFormerLayer(config, layer_idx) + for layer_idx in range(config.num_hidden_layers) + ]) self.gradient_checkpointing = False def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=False, - output_hidden_states=False, - return_dict=True, - query_length=0, - ): + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + query_length=0, ): all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions else None @@ -943,12 +891,14 @@ def forward( for i in range(self.config.num_hidden_layers): layer_module = self.layer[i] if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) + all_hidden_states = all_hidden_states + (hidden_states, ) layer_head_mask = head_mask[i] if head_mask is not None else None - past_key_value = past_key_values[i] if past_key_values is not None else None + past_key_value = past_key_values[ + i] if past_key_values is not None else None - if getattr(self.config, "gradient_checkpointing", False) and self.training: + if getattr(self.config, "gradient_checkpointing", + False) and self.training: if use_cache: logger.warn( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
@@ -957,9 +907,8 @@ def forward( def create_custom_forward(module): def custom_forward(*inputs): - return module( - *inputs, past_key_value, output_attentions, query_length - ) + return module(*inputs, past_key_value, + output_attentions, query_length) return custom_forward @@ -969,8 +918,7 @@ def custom_forward(*inputs): attention_mask, layer_head_mask, encoder_hidden_states, - encoder_attention_mask, - ) + encoder_attention_mask, ) else: layer_outputs = layer_module( hidden_states, @@ -980,39 +928,35 @@ def custom_forward(*inputs): encoder_attention_mask, past_key_value, output_attentions, - query_length, - ) + query_length, ) hidden_states = layer_outputs[0] if use_cache: - next_decoder_cache += (layer_outputs[-1],) + next_decoder_cache += (layer_outputs[-1], ) if output_attentions: - all_self_attentions = all_self_attentions + (layer_outputs[1],) + all_self_attentions = all_self_attentions + (layer_outputs[1], ) if layer_module.has_cross_attention: - all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + all_cross_attentions = all_cross_attentions + ( + layer_outputs[2], ) if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) + all_hidden_states = all_hidden_states + (hidden_states, ) if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) + return tuple(v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] if v is not None) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, hidden_states=all_hidden_states, attentions=all_self_attentions, - cross_attentions=all_cross_attentions, - ) + cross_attentions=all_cross_attentions, ) class Blip2QFormerModel(Blip2PretrainedModel): @@ -1024,7 +968,8 @@ def __init__(self, config: Blip2QFormerConfig): super().__init__(config) self.config = config - self.layernorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.layernorm = nn.LayerNorm( + config.hidden_size, epsilon=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.encoder = Blip2QFormerEncoder(config) @@ -1044,11 +989,10 @@ class PreTrainedModel self.encoder.layer[layer].attention.prune_heads(heads) def get_extended_attention_mask( - self, - attention_mask: paddle.Tensor, - input_shape: Tuple[int], - has_query: bool = False, - ) -> paddle.Tensor: + self, + attention_mask: paddle.Tensor, + input_shape: Tuple[int], + has_query: bool=False, ) -> paddle.Tensor: """ Makes broadcastable attention and causal masks so that future and masked tokens are ignored. Arguments: @@ -1069,10 +1013,8 @@ def get_extended_attention_mask( extended_attention_mask = attention_mask[:, None, None, :] else: raise ValueError( - "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( - input_shape, attention_mask.shape - ) - ) + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})". + format(input_shape, attention_mask.shape)) # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for @@ -1080,14 +1022,12 @@ def get_extended_attention_mask( # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. 
extended_attention_mask = extended_attention_mask.cast( - dtype=self.config.dtype - ) # fp16 compatibility + dtype=self.config.dtype) # fp16 compatibility extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 return extended_attention_mask def invert_attention_mask( - self, encoder_attention_mask: paddle.Tensor - ) -> paddle.Tensor: + self, encoder_attention_mask: paddle.Tensor) -> paddle.Tensor: """ Invert an attention mask (e.g., switches 0. and 1.). Args: @@ -1096,27 +1036,28 @@ def invert_attention_mask( `paddle.Tensor`: The inverted attention mask. """ if encoder_attention_mask.ndim == 3: - encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] + encoder_extended_attention_mask = encoder_attention_mask[:, + None, :, :] if encoder_attention_mask.ndim == 2: - encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] + encoder_extended_attention_mask = encoder_attention_mask[:, None, + None, :] # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow # /transformer/transformer_layers.py#L270 # encoder_extended_attention_mask = (encoder_extended_attention_mask == # encoder_extended_attention_mask.transpose(-1, -2)) encoder_extended_attention_mask = encoder_extended_attention_mask.cast( - dtype=self.config.dtype - ) # fp16 compatibility - encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e4 + dtype=self.config.dtype) # fp16 compatibility + encoder_extended_attention_mask = ( + 1.0 - encoder_extended_attention_mask) * -1e4 return encoder_extended_attention_mask def get_head_mask( - self, - head_mask: Optional[paddle.Tensor], - num_hidden_layers: int, - is_attention_chunked: bool = False, - ) -> paddle.Tensor: + self, + head_mask: Optional[paddle.Tensor], + num_hidden_layers: int, + is_attention_chunked: bool=False, ) -> paddle.Tensor: """ Prepare the head mask if needed. Args: @@ -1131,7 +1072,8 @@ def get_head_mask( `[None]` for each layer. """ if head_mask is not None: - head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers) + head_mask = self._convert_head_mask_to_5d(head_mask, + num_hidden_layers) if is_attention_chunked is True: head_mask = head_mask.unsqueeze(-1) else: @@ -1142,31 +1084,30 @@ def get_head_mask( def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" if head_mask.ndim == 1: - head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze( + -1).unsqueeze(-1) head_mask = head_mask.expand([num_hidden_layers, -1, -1, -1, -1]) elif head_mask.ndim == 2: - head_mask = ( - head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) - ) # We can specify head_mask for each layer + head_mask = (head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) + ) # We can specify head_mask for each layer assert head_mask.ndim == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" head_mask = head_mask.cast( - dtype=self.config.dtype - ) # switch to float if need + fp16 compatibility + dtype=self.config. 
+ dtype) # switch to float if need + fp16 compatibility return head_mask def forward( - self, - query_embeds, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + self, + query_embeds, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, ): r""" encoder_hidden_states (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -1186,75 +1127,61 @@ def forward( If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see `past_key_values`). """ - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) + output_attentions = (output_attentions if output_attentions is not None + else self.config.output_attentions) + output_hidden_states = (output_hidden_states + if output_hidden_states is not None else + self.config.output_hidden_states) + return_dict = (return_dict if return_dict is not None else + self.config.use_return_dict) # past_key_values_length past_key_values_length = ( past_key_values[0][0].shape[2] - self.config.query_length - if past_key_values is not None - else 0 - ) + if past_key_values is not None else 0) query_length = query_embeds.shape[1] if query_embeds is not None else 0 embedding_output = self.layernorm( - query_embeds.cast(self.layernorm.weight.dtype) - ) + query_embeds.cast(self.layernorm.weight.dtype)) embedding_output = self.dropout(embedding_output) input_shape = embedding_output.shape[:-1] batch_size, seq_length = input_shape if attention_mask is None: - attention_mask = paddle.ones( - ((batch_size, seq_length + past_key_values_length)) - ) + attention_mask = paddle.ones(( + (batch_size, seq_length + past_key_values_length))) # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. 
extended_attention_mask = self.get_extended_attention_mask( - attention_mask, input_shape - ) + attention_mask, input_shape) # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] if encoder_hidden_states is not None: if type(encoder_hidden_states) == list: encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[ - 0 - ].shape + 0].shape else: ( encoder_batch_size, encoder_sequence_length, - _, - ) = encoder_hidden_states.shape + _, ) = encoder_hidden_states.shape encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) if type(encoder_attention_mask) == list: encoder_extended_attention_mask = [ - self.invert_attention_mask(mask) for mask in encoder_attention_mask + self.invert_attention_mask(mask) + for mask in encoder_attention_mask ] elif encoder_attention_mask is None: encoder_attention_mask = paddle.ones(encoder_hidden_shape) encoder_extended_attention_mask = self.invert_attention_mask( - encoder_attention_mask - ) + encoder_attention_mask) else: encoder_extended_attention_mask = self.invert_attention_mask( - encoder_attention_mask - ) + encoder_attention_mask) else: encoder_extended_attention_mask = None @@ -1276,8 +1203,7 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, - query_length=query_length, - ) + query_length=query_length, ) sequence_output = encoder_outputs[0] pooled_output = sequence_output[:, 0, :] @@ -1290,8 +1216,7 @@ def forward( past_key_values=encoder_outputs.past_key_values, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, - cross_attentions=encoder_outputs.cross_attentions, - ) + cross_attentions=encoder_outputs.cross_attentions, ) class Blip2Model(Blip2PretrainedModel): @@ -1303,19 +1228,18 @@ def __init__(self, config: Blip2Config): self.vision_model = Blip2VisionModel(config.vision_config) self.query_tokens = Parameter( - paddle.zeros( - [1, config.num_query_tokens, config.qformer_config.hidden_size] - ) - ) + paddle.zeros([ + 1, config.num_query_tokens, config.qformer_config.hidden_size + ])) self.qformer = Blip2QFormerModel(config.qformer_config) - self.language_projection = nn.Linear( - config.qformer_config.hidden_size, config.text_config.hidden_size - ) + self.language_projection = nn.Linear(config.qformer_config.hidden_size, + config.text_config.hidden_size) if config.use_decoder_only_language_model: if isinstance(config.text_config, OPTConfig): - language_model = OPTForCausalLM.from_pretrained(opt_model,low_cpu_mem_usage=True,load_state_as_np=True) + language_model = OPTForCausalLM.from_pretrained( + opt_model, low_cpu_mem_usage=True, load_state_as_np=True) else: raise NotImplementedError else: @@ -1329,16 +1253,15 @@ def get_input_embeddings(self) -> nn.Layer: return self.vision_model.embeddings.patch_embedding def get_text_features( - self, - input_ids: Optional[paddle.Tensor] = None, - attention_mask: Optional[paddle.Tensor] = None, - decoder_input_ids: Optional[paddle.Tensor] = None, - decoder_attention_mask: Optional[paddle.Tensor] = None, - labels: Optional[paddle.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): + self, + input_ids: Optional[paddle.Tensor]=None, + attention_mask: Optional[paddle.Tensor]=None, + decoder_input_ids: Optional[paddle.Tensor]=None, + decoder_attention_mask: Optional[paddle.Tensor]=None, + labels: 
Optional[paddle.Tensor]=None, + output_attentions: Optional[bool]=None, + output_hidden_states: Optional[bool]=None, + return_dict: Optional[bool]=None, ): r""" Returns: text_outputs (`CausalLMOutputWithPast`, or `tuple(paddle.Tensor)` if `return_dict=False`): @@ -1355,19 +1278,13 @@ def get_text_features( >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt").to(device) >>> text_features = model.get_text_features(**inputs) ```""" - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) + output_attentions = (output_attentions if output_attentions is not None + else self.config.output_attentions) + output_hidden_states = (output_hidden_states + if output_hidden_states is not None else + self.config.output_hidden_states) + return_dict = (return_dict if return_dict is not None else + self.config.use_return_dict) if self.config.use_decoder_only_language_model: text_outputs = self.language_model( @@ -1375,10 +1292,10 @@ def get_text_features( attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) + return_dict=return_dict, ) else: - inputs_embeds = self.language_model.get_input_embeddings()(input_ids) + inputs_embeds = self.language_model.get_input_embeddings()( + input_ids) text_outputs = self.language_model( inputs_embeds=inputs_embeds, @@ -1388,18 +1305,16 @@ def get_text_features( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, - labels=labels, - ) + labels=labels, ) return text_outputs def get_image_features( - self, - pixel_values: Optional[paddle.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): + self, + pixel_values: Optional[paddle.Tensor]=None, + output_attentions: Optional[bool]=None, + output_hidden_states: Optional[bool]=None, + return_dict: Optional[bool]=None, ): r""" Returns: vision_outputs (`BaseModelOutputWithPooling` or tuple of `paddle.Tensor`): @@ -1420,26 +1335,19 @@ def get_image_features( >>> inputs = processor(images=image, return_tensors="pd") >>> image_outputs = model.get_image_features(**inputs) ```""" - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) + output_attentions = (output_attentions if output_attentions is not None + else self.config.output_attentions) + output_hidden_states = (output_hidden_states + if output_hidden_states is not None else + self.config.output_hidden_states) + return_dict = (return_dict if return_dict is not None else + self.config.use_return_dict) vision_outputs = self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) + return_dict=return_dict, ) if not return_dict: last_hidden_state = vision_outputs[0] pooled_output = vision_outputs[1] @@ -1454,16 +1362,14 @@ def 
get_image_features( last_hidden_state=last_hidden_state, pooler_output=pooled_output, hidden_states=vision_outputs.hidden_states, - attentions=vision_outputs.attentions, - ) + attentions=vision_outputs.attentions, ) def get_qformer_features( - self, - pixel_values: Optional[paddle.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): + self, + pixel_values: Optional[paddle.Tensor]=None, + output_attentions: Optional[bool]=None, + output_hidden_states: Optional[bool]=None, + return_dict: Optional[bool]=None, ): r""" Returns: vision_outputs (`BaseModelOutputWithPooling` or tuple of `paddle.Tensor`): @@ -1484,31 +1390,25 @@ def get_qformer_features( >>> inputs = processor(images=image, return_tensors="pt") >>> qformer_outputs = model.get_qformer_features(**inputs) ```""" - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) + output_attentions = (output_attentions if output_attentions is not None + else self.config.output_attentions) + output_hidden_states = (output_hidden_states + if output_hidden_states is not None else + self.config.output_hidden_states) + return_dict = (return_dict if return_dict is not None else + self.config.use_return_dict) vision_outputs = self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) + return_dict=return_dict, ) image_embeds = vision_outputs[0] # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention - image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") + image_attention_mask = paddle.ones( + image_embeds.shape[:-1], dtype="int64") query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) query_outputs = self.qformer( @@ -1517,23 +1417,22 @@ def get_qformer_features( encoder_attention_mask=image_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) + return_dict=return_dict, ) return query_outputs def forward( - self, - pixel_values: paddle.Tensor, - input_ids: paddle.Tensor, - attention_mask: Optional[paddle.Tensor] = None, - decoder_input_ids: Optional[paddle.Tensor] = None, - decoder_attention_mask: Optional[paddle.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - labels: Optional[paddle.Tensor] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, Blip2ForConditionalGenerationModelOutput]: + self, + pixel_values: paddle.Tensor, + input_ids: paddle.Tensor, + attention_mask: Optional[paddle.Tensor]=None, + decoder_input_ids: Optional[paddle.Tensor]=None, + decoder_attention_mask: Optional[paddle.Tensor]=None, + output_attentions: Optional[bool]=None, + output_hidden_states: Optional[bool]=None, + labels: Optional[paddle.Tensor]=None, + return_dict: Optional[bool]=None, ) -> Union[ + Tuple, Blip2ForConditionalGenerationModelOutput]: r""" Returns: Examples: @@ -1551,9 +1450,8 @@ def forward( >>> inputs = processor(images=image, text=prompt, return_tensors="pd") >>> outputs = model(pixel_values=inputs["pixel_values"],input_ids=inputs["input_ids"]) 
```""" - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) + return_dict = (return_dict if return_dict is not None else + self.config.use_return_dict) # step 1: forward the images through the vision encoder, # to get image embeddings of shape (batch_size, seq_len, hidden_size) @@ -1561,12 +1459,12 @@ def forward( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) + return_dict=return_dict, ) image_embeds = vision_outputs[0] # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention - image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") + image_attention_mask = paddle.ones( + image_embeds.shape[:-1], dtype="int64") query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) query_outputs = self.qformer( @@ -1575,35 +1473,32 @@ def forward( encoder_attention_mask=image_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) + return_dict=return_dict, ) query_output = query_outputs[0] # step 3: use the language model, conditioned on the query outputs and the prompt language_model_inputs = self.language_projection(query_output) language_model_attention_mask = paddle.ones( - language_model_inputs.shape[:-1], dtype="int64" - ) + language_model_inputs.shape[:-1], dtype="int64") inputs_embeds = self.language_model.get_input_embeddings()(input_ids) - inputs_embeds = paddle.concat([language_model_inputs, inputs_embeds], axis=1) + inputs_embeds = paddle.concat( + [language_model_inputs, inputs_embeds], axis=1) if attention_mask is None: attention_mask = paddle.ones_like(input_ids) attention_mask = paddle.concat( - [language_model_attention_mask, attention_mask], axis=1 - ) + [language_model_attention_mask, attention_mask], axis=1) with paddle.amp.auto_cast(level='O2'): outputs = self.language_model( - inputs_embeds=inputs_embeds, - attention_mask=attention_mask, - return_dict=True, - labels=labels,) + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + return_dict=True, + labels=labels, ) loss = outputs.loss return {"loss": loss} - class Blip2ForConditionalGeneration(Blip2PretrainedModel): config_class = Blip2Config main_input_name = "pixel_values" @@ -1612,10 +1507,14 @@ def __init__(self, config: Blip2Config): super().__init__(config) config.vision_config.image_size self.visual_encoder, self.ln_vision = self.init_vision_encoder( - "eva_clip_g",config.vision_config.image_size,config.vision_config.dropout, - config.vision_config.mp_degree if hasattr(config.vision_config, "mp_degree") else 1, - gradient_checkpointing=config.vision_config.gradient_checkpointing if hasattr(config.vision_config, "gradient_checkpointing") else False - ) + "eva_clip_g", + config.vision_config.image_size, + config.vision_config.dropout, + config.vision_config.mp_degree + if hasattr(config.vision_config, "mp_degree") else 1, + gradient_checkpointing=config.vision_config.gradient_checkpointing + if hasattr(config.vision_config, "gradient_checkpointing") else + False) self.freeze_vit = config.freeze_vit if self.freeze_vit: # freeze vit except the post layer norm layer. 
@@ -1626,19 +1525,21 @@ def __init__(self, config: Blip2Config): logger.info("freeze vision encoder") # self.qformer = Blip2QFormerModel(config.qformer_config) self.Qformer, self.query_tokens = self.init_Qformer( - config.num_query_tokens, config.vision_config.hidden_size, - mp_degree=config.qformer_config.mp_degree if hasattr(config.qformer_config, "mp_degree") else 1, - gradient_checkpointing=config.qformer_config.gradient_checkpointing if hasattr(config.qformer_config, "gradient_checkpointing") else False - ) + config.num_query_tokens, + config.vision_config.hidden_size, + mp_degree=config.qformer_config.mp_degree + if hasattr(config.qformer_config, "mp_degree") else 1, + gradient_checkpointing=config.qformer_config.gradient_checkpointing + if hasattr(config.qformer_config, "gradient_checkpointing") else + False) self.Qformer.cls = None self.Qformer.bert.embeddings.word_embeddings = None self.Qformer.bert.embeddings.position_embeddings = None for layer in self.Qformer.bert.encoder.layer: layer.output = None layer.intermediate = None - self.language_projection = nn.Linear( - config.qformer_config.hidden_size, config.text_config.hidden_size - ) + self.language_projection = nn.Linear(config.qformer_config.hidden_size, + config.text_config.hidden_size) if config.use_decoder_only_language_model: if isinstance(config.text_config, OPTConfig): language_model = OPTForCausalLM(config.text_config) @@ -1653,25 +1554,36 @@ def __init__(self, config: Blip2Config): for name, param in self.language_model.named_parameters(): param.stop_gradient = True self.pad_token_id = config.text_config.pad_token_id + @classmethod - def init_vision_encoder( - cls, model_name, img_size, drop_path_rate,mp_degree,gradient_checkpointing=False - ): + def init_vision_encoder(cls, + model_name, + img_size, + drop_path_rate, + mp_degree, + gradient_checkpointing=False): - visual_encoder = create_eva_vit_g(img_size, drop_path_rate,mp_degree,gradient_checkpointing) + visual_encoder = create_eva_vit_g(img_size, drop_path_rate, mp_degree, + gradient_checkpointing) ln_vision = paddle.nn.LayerNorm(visual_encoder.num_features) return visual_encoder, ln_vision + @classmethod - def init_Qformer(cls, num_query_token, vision_width, cross_attention_freq=2,mp_degree=1,gradient_checkpointing=False): + def init_Qformer(cls, + num_query_token, + vision_width, + cross_attention_freq=2, + mp_degree=1, + gradient_checkpointing=False): encoder_config = BertConfig.from_pretrained("bert-base-uncased") encoder_config.encoder_width = vision_width # insert cross-attention layer every other block encoder_config.add_cross_attention = True encoder_config.cross_attention_freq = cross_attention_freq encoder_config.query_length = num_query_token - encoder_config.mp_degree=mp_degree - encoder_config.gradient_checkpointing=gradient_checkpointing + encoder_config.mp_degree = mp_degree + encoder_config.gradient_checkpointing = gradient_checkpointing # todo check dropout # encoder_config.attention_probs_dropout_prob = 0 # encoder_config.hidden_dropout_prob = 0 @@ -1679,25 +1591,25 @@ def init_Qformer(cls, num_query_token, vision_width, cross_attention_freq=2,mp_d query_tokens = paddle.create_parameter( shape=(1, num_query_token, encoder_config.hidden_size), dtype='float32', - default_initializer=paddle.nn.initializer.Normal(mean=0.0, std=encoder_config.initializer_range) - ) + default_initializer=paddle.nn.initializer.Normal( + mean=0.0, std=encoder_config.initializer_range)) return Qformer, query_tokens + def get_input_embeddings(self) -> nn.Layer: return 
self.vision_model.embeddings.patch_embedding def forward( - self, - pixel_values: paddle.Tensor, - input_ids: paddle.Tensor, - attention_mask: Optional[paddle.Tensor] = None, - decoder_input_ids: Optional[paddle.Tensor] = None, - decoder_attention_mask: Optional[paddle.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - labels: Optional[paddle.Tensor] = None, - return_dict: Optional[bool] = None, - **kwargs - ) -> Union[Tuple, Blip2ForConditionalGenerationModelOutput]: + self, + pixel_values: paddle.Tensor, + input_ids: paddle.Tensor, + attention_mask: Optional[paddle.Tensor]=None, + decoder_input_ids: Optional[paddle.Tensor]=None, + decoder_attention_mask: Optional[paddle.Tensor]=None, + output_attentions: Optional[bool]=None, + output_hidden_states: Optional[bool]=None, + labels: Optional[paddle.Tensor]=None, + return_dict: Optional[bool]=None, + **kwargs) -> Union[Tuple, Blip2ForConditionalGenerationModelOutput]: r""" Returns: Examples: @@ -1738,85 +1650,83 @@ def forward( >>> print(generated_text) two ```""" - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) + return_dict = (return_dict if return_dict is not None else + self.config.use_return_dict) with paddle.amp.auto_cast(level='O2'): image_embeds = self.ln_vision(self.visual_encoder(pixel_values)) image_embeds = image_embeds.astype("float32") # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention - image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") + image_attention_mask = paddle.ones( + image_embeds.shape[:-1], dtype="int64") query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) query_outputs = self.Qformer.bert( query_embeds=query_tokens, encoder_hidden_states=image_embeds, encoder_attention_mask=image_attention_mask, - return_dict=True, - ) + return_dict=True, ) query_output = query_outputs[0] # step 3: use the language model, conditioned on the query outputs and the prompt language_model_inputs = self.language_projection(query_output) language_model_attention_mask = paddle.ones( - language_model_inputs.shape[:-1], dtype="int64" - ) + language_model_inputs.shape[:-1], dtype="int64") inputs_embeds = self.language_model.get_input_embeddings()(input_ids) - inputs_embeds = paddle.concat([language_model_inputs, inputs_embeds], axis=1) + inputs_embeds = paddle.concat( + [language_model_inputs, inputs_embeds], axis=1) if attention_mask is None: attention_mask = paddle.ones_like(input_ids) attention_mask = paddle.concat( - [language_model_attention_mask, attention_mask], axis=1 - ) + [language_model_attention_mask, attention_mask], axis=1) - targets = input_ids * ( - 1 - (input_ids == self.pad_token_id).astype(input_ids.dtype) - ) + (input_ids == self.pad_token_id).astype(input_ids.dtype) * (-100) + targets = input_ids * (1 - ( + input_ids == self.pad_token_id).astype(input_ids.dtype)) + ( + input_ids == self.pad_token_id).astype(input_ids.dtype) * (-100) empty_targets = paddle.ones( - language_model_attention_mask.shape, dtype="int64" - ).fill_(-100) + language_model_attention_mask.shape, dtype="int64").fill_(-100) labels = paddle.concat([empty_targets, targets], axis=1) labels.stop_gradient = True with paddle.amp.auto_cast(level='O2'): outputs = self.language_model( - inputs_embeds=inputs_embeds, - attention_mask=attention_mask, - return_dict=True, - labels=labels,) + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + 
return_dict=True, + labels=labels, ) loss = outputs.loss # print(loss) return {"loss": loss} + @paddle.no_grad() def encode_image( - self, - pixel_values: paddle.Tensor, - **kwargs, - ): - image_embeds = self.ln_vision(self.visual_encoder(pixel_values.astype("float16"))) + self, + pixel_values: paddle.Tensor, + **kwargs, ): + image_embeds = self.ln_vision( + self.visual_encoder(pixel_values.astype("float16"))) image_embeds = image_embeds.astype("float32") - image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") + image_attention_mask = paddle.ones( + image_embeds.shape[:-1], dtype="int64") query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) query_outputs = self.Qformer.bert( query_embeds=query_tokens, encoder_hidden_states=image_embeds, encoder_attention_mask=image_attention_mask, - return_dict=True, - ) + return_dict=True, ) query_output = query_outputs[0] return query_output + @paddle.no_grad() def generate( - self, - pixel_values: paddle.Tensor, - input_ids: Optional[paddle.Tensor] = None, - attention_mask: Optional[paddle.Tensor] = None, - **generate_kwargs, - ) -> paddle.Tensor: + self, + pixel_values: paddle.Tensor, + input_ids: Optional[paddle.Tensor]=None, + attention_mask: Optional[paddle.Tensor]=None, + **generate_kwargs, ) -> paddle.Tensor: """ Overrides `generate` function to be able to use the model as a conditional generator. Args: @@ -1831,40 +1741,38 @@ def generate( """ batch_size = pixel_values.shape[0] image_embeds = self.ln_vision(self.visual_encoder(pixel_values)) - image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") + image_attention_mask = paddle.ones( + image_embeds.shape[:-1], dtype="int64") query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) query_outputs = self.Qformer.bert( query_embeds=query_tokens, encoder_hidden_states=image_embeds, encoder_attention_mask=image_attention_mask, - return_dict=True, - ) + return_dict=True, ) query_output = query_outputs.last_hidden_state language_model_inputs = self.language_projection(query_output) language_attention_mask = paddle.ones( - language_model_inputs.shape[:-1], dtype="int64" - ) + language_model_inputs.shape[:-1], dtype="int64") if input_ids is None: - input_ids = paddle.to_tensor([[self.config.text_config.bos_token_id]]).tile( - [batch_size, 1] - ) + input_ids = paddle.to_tensor( + [[self.config.text_config.bos_token_id]]).tile([batch_size, 1]) if attention_mask is None: attention_mask = paddle.ones_like(input_ids) attention_mask = paddle.concat( - [language_attention_mask, attention_mask], axis=1 - ) + [language_attention_mask, attention_mask], axis=1) # concatenate query embeddings with prompt embeddings inputs_embeds = self.language_model.get_input_embeddings()(input_ids) - inputs_embeds = paddle.concat([language_model_inputs, inputs_embeds], axis=1) + inputs_embeds = paddle.concat( + [language_model_inputs, inputs_embeds], axis=1) outputs = self.language_model.generate( inputs_embeds=inputs_embeds, attention_mask=attention_mask, do_sample=False, top_p=0.9, - decode_strategy="greedy_search", # align to torch + decode_strategy="greedy_search", # align to torch temperature=1, num_beams=5, max_length=30, @@ -1872,8 +1780,7 @@ def generate( eos_token_id=50118, repetition_penalty=1, length_penalty=1, - num_return_sequences=1, - ) + num_return_sequences=1, ) return outputs @@ -1893,11 +1800,11 @@ def autocast_smart_context_manager(self): custom_black_list=custom_black_list, custom_white_list=custom_white_list, level="O2", - 
dtype='float16', - ) + dtype='float16', ) return ctx_manager + class CrossEntropyLoss(nn.Layer): """ Softmax Cross entropy loss diff --git a/paddlevlp/tests/__init__.py b/paddlevlp/tests/__init__.py new file mode 100644 index 00000000000000..9be0a27387bee4 --- /dev/null +++ b/paddlevlp/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/paddlevlp/tests/models/__init__.py b/paddlevlp/tests/models/__init__.py new file mode 100644 index 00000000000000..9be0a27387bee4 --- /dev/null +++ b/paddlevlp/tests/models/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/paddlevlp/tests/models/test_blip2.py b/paddlevlp/tests/models/test_blip2.py new file mode 100644 index 00000000000000..a9fadc957dfcf2 --- /dev/null +++ b/paddlevlp/tests/models/test_blip2.py @@ -0,0 +1,576 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import copy +import inspect +import tempfile +import unittest + +import numpy as np +import paddle +import paddle.nn as nn +import requests +from PIL import Image + +from paddlevlp.models.blip2 import ( + Blip2VisionConfig, + Blip2QFormerConfig, + Blip2Config, + Blip2QFormerModel, + Blip2Model, + Blip2VisionModel, + Blip2ForConditionalGeneration, ) +from paddlevlp.models.blip2.modeling import BLIP_2_PRETRAINED_MODEL_ARCHIVE_LIST +from paddlevlp.tests.testing_utils import slow +from paddlevlp.tests.models.test_configuration_common import ConfigTester +from paddlevlp.tests.models.test_modeling_common import ( + ModelTesterMixin, + floats_tensor, + ids_tensor, + random_attention_mask, ) +from paddlenlp.transformers.opt.configuration import OPTConfig + + +def _config_zero_init(config): + configs_no_init = copy.deepcopy(config) + for key in configs_no_init.__dict__.keys(): + if "_range" in key or "_std" in key or "initializer_factor" in key or "layer_scale" in key: + setattr(configs_no_init, key, 1e-10) + return configs_no_init + + +class Blip2VisionModelTester: + def __init__( + self, + parent, + batch_size=12, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + hidden_size=1408, + projection_dim=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + initializer_range=1e-10, + scope=None, ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.hidden_size = hidden_size + self.projection_dim = projection_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.initializer_range = initializer_range + self.scope = scope + + # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) + num_patches = (image_size // patch_size)**2 + self.seq_length = num_patches + 1 + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([ + self.batch_size, self.num_channels, self.image_size, self.image_size + ]) + config = self.get_config() + + return config, pixel_values + + def get_config(self): + return Blip2VisionConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + projection_dim=self.projection_dim, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + initializer_range=self.initializer_range, ) + + def create_and_check_model(self, config, pixel_values): + model = Blip2VisionModel(config=config) + model.eval() + with paddle.no_grad(): + result = model(pixel_values) + # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) + image_size = (self.image_size, self.image_size) + patch_size = (self.patch_size, self.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // + patch_size[0]) + self.parent.assertEqual( + result.last_hidden_state.shape, + [self.batch_size, num_patches + 1, self.hidden_size]) + self.parent.assertEqual(result.pooler_output.shape, + [self.batch_size, self.hidden_size]) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, 
pixel_values = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +class Blip2VisionModelTest(ModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as BLIP-2's vision encoder does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = (Blip2VisionModel, ) + fx_compatible = False + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + use_test_model_name_list = False + + def setUp(self): + self.model_tester = Blip2VisionModelTester(self) + self.config_tester = ConfigTester( + self, + config_class=Blip2VisionConfig, + has_text_modality=False, + hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="BLIP-2's vision encoder does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + def test_model_common_attributes(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Layer)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in BLIP_2_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = Blip2VisionModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +class Blip2QFormerModelTester: + def __init__( + self, + parent, + batch_size=12, + seq_length=7, + is_training=True, + use_input_mask=True, + use_labels=True, + vocab_size=99, + hidden_size=768, + projection_dim=32, + num_hidden_layers=6, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + max_position_embeddings=512, + initializer_range=0.02, + bos_token_id=0, + scope=None, + num_patches=257, + encoder_hidden_size=1408, ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.projection_dim = projection_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.scope = scope + self.bos_token_id = bos_token_id + self.num_patches = num_patches + self.encoder_hidden_size = encoder_hidden_size + + def prepare_config_and_inputs(self): + query_embeds = floats_tensor( + [self.batch_size, self.seq_length, self.hidden_size]) + encoder_hidden_states = floats_tensor( + [self.batch_size, self.num_patches, self.encoder_hidden_size]) + 
encoder_attention_mask = random_attention_mask( + [self.batch_size, self.num_patches]) + config = self.get_config() + + return config, query_embeds, encoder_hidden_states, encoder_attention_mask + + def get_config(self): + return Blip2QFormerConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + projection_dim=self.projection_dim, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + bos_token_id=self.bos_token_id, + encoder_hidden_size=self.encoder_hidden_size, ) + + def create_and_check_model(self, config, query_embeds, + encoder_hidden_states, encoder_attention_mask): + model = Blip2QFormerModel(config=config) + model.eval() + result = model( + query_embeds=query_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask) + self.parent.assertEqual( + result.last_hidden_state.shape, + [self.batch_size, self.seq_length, self.hidden_size]) + + model = Blip2QFormerModel(config=config) + model.eval() + with paddle.no_grad(): + result = model( + query_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask) + + self.parent.assertEqual( + result.last_hidden_state.shape, + [self.batch_size, self.seq_length, self.hidden_size]) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, query_embeds, encoder_hidden_states, encoder_attention_mask = config_and_inputs + inputs_dict = { + "query_embeds": query_embeds, + "encoder_hidden_states": encoder_hidden_states, + "encoder_attention_mask": encoder_attention_mask + } + return config, inputs_dict + + +class Blip2QFormerModelTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = (Blip2QFormerModel, ) + fx_compatible = False + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + use_test_model_name_list = False + + def setUp(self): + self.model_tester = Blip2QFormerModelTester(self) + self.config_tester = ConfigTester( + self, + config_class=Blip2QFormerConfig, + has_text_modality=False, + hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["query_embeds"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_save_load(self): + pass + + +class Blip2TextModelTester: + def __init__( + self, + parent, + batch_size=12, + seq_length=7, + is_training=True, + use_labels=False, + vocab_size=99, + hidden_size=16, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=20, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + embed_dim=16, + num_labels=3, + word_embed_proj_dim=16, + type_sequence_label_size=2, ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_labels = use_labels 
+ self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + self.embed_dim = embed_dim + self.num_labels = num_labels + self.type_sequence_label_size = type_sequence_label_size + self.word_embed_proj_dim = word_embed_proj_dim + self.is_encoder_decoder = False + + def prepare_config_and_inputs(self): + config = self.get_config() + + input_ids = ids_tensor( + [self.batch_size, self.seq_length], self.vocab_size, + dtype="int64").clip(3, ) + input_ids[:, -1] = self.eos_token_id # Eos Token + + attention_mask = input_ids.not_equal( + paddle.to_tensor( + [self.pad_token_id], dtype="int64")).cast("int64") + + return config, input_ids, attention_mask + + def get_config(self): + return OPTConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + pad_token_id=self.pad_token_id, + embed_dim=self.embed_dim, + is_encoder_decoder=False, + word_embed_proj_dim=self.word_embed_proj_dim, ) + + +class Blip2ModelTester: + def __init__(self, + parent, + vision_kwargs=None, + qformer_kwargs=None, + text_kwargs=None, + is_training=True, + num_query_tokens=10): + if vision_kwargs is None: + vision_kwargs = {} + if qformer_kwargs is None: + qformer_kwargs = {} + if text_kwargs is None: + text_kwargs = {} + + self.parent = parent + self.vision_model_tester = Blip2VisionModelTester(parent, + **vision_kwargs) + self.qformer_model_tester = Blip2QFormerModelTester(parent, + **qformer_kwargs) + self.text_model_tester = Blip2TextModelTester(parent, **text_kwargs) + self.is_training = is_training + self.num_query_tokens = num_query_tokens + + def prepare_config_and_inputs(self): + _, pixel_values = self.vision_model_tester.prepare_config_and_inputs() + ( + _, + input_ids, + attention_mask, + ) = self.text_model_tester.prepare_config_and_inputs() + + config = self.get_config() + + return config, input_ids, attention_mask, pixel_values + + def get_config(self): + return Blip2Config.from_vision_qformer_text_configs( + vision_config=self.vision_model_tester.get_config(), + qformer_config=self.qformer_model_tester.get_config(), + text_config=self.text_model_tester.get_config(), + num_query_tokens=self.num_query_tokens, ) + + @unittest.skip(reason="BLIP-2's output needs to unified") + def create_and_check_for_conditional_generation( + self, config, input_ids, attention_mask, pixel_values): + model = Blip2ForConditionalGeneration(config) + model.eval() + with paddle.no_grad(): + result = model( + pixel_values, input_ids, attention_mask, return_dict=True) + + self.parent.assertEqual( + result.logits.shape, + [ + self.vision_model_tester.batch_size, + self.text_model_tester.seq_length + self.num_query_tokens, + self.text_model_tester.vocab_size, + ], ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = 
self.prepare_config_and_inputs() + ( + config, + input_ids, + attention_mask, + pixel_values, ) = config_and_inputs + inputs_dict = { + "pixel_values": pixel_values, + "input_ids": input_ids, + "attention_mask": attention_mask, + } + return config, inputs_dict + + +class Blip2ModelTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = (Blip2ForConditionalGeneration, ) + fx_compatible = False + test_head_masking = False + test_pruning = False + test_resize_embeddings = False + test_attention_outputs = False + use_test_model_name_list = False + use_test_inputs_embeds: bool = False + + def setUp(self): + self.model_tester = Blip2ModelTester(self) + + def test_for_conditional_generation(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_conditional_generation( + *config_and_inputs) + + @unittest.skip(reason="Hidden_states is tested in individual model tests") + def test_hidden_states_output(self): + pass + + def test_determinism(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( + ) + + def check_determinism(first, second): + out_1 = first.numpy() + out_2 = second.numpy() + out_1 = out_1[~np.isnan(out_1)] + out_2 = out_2[~np.isnan(out_2)] + max_diff = np.amax(np.abs(out_1 - out_2)) + self.assertLessEqual(max_diff, 1e-5) + + for model_class in self.all_model_classes: + model = self._make_model_instance(config, model_class) + model.eval() + with paddle.no_grad(): + input = self._prepare_for_class(inputs_dict, model_class) + first = model(**input)['loss'] + second = model(**input)['loss'] + + if isinstance(first, tuple) and isinstance(second, tuple): + for tensor1, tensor2 in zip(first, second): + check_determinism(tensor1, tensor2) + else: + check_determinism(first, second) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_load_vision_qformer_text_config(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( + ) + + # Save Blip2Config and check if we can load Blip2VisionConfig from it + with tempfile.TemporaryDirectory() as tmp_dir_name: + config.save_pretrained(tmp_dir_name) + vision_config = Blip2VisionConfig.from_pretrained(tmp_dir_name) + self.assertDictEqual(config.vision_config.to_dict(), + vision_config.to_dict()) + + # Save Blip2Config and check if we can load Blip2QFormerConfig from it + with tempfile.TemporaryDirectory() as tmp_dir_name: + config.save_pretrained(tmp_dir_name) + qformer_config = Blip2QFormerConfig.from_pretrained(tmp_dir_name) + self.assertDictEqual(config.qformer_config.to_dict(), + qformer_config.to_dict()) + + @slow + def test_model_from_pretrained(self): + for model_name in BLIP_2_PRETRAINED_MODEL_ARCHIVE_LIST: + model = Blip2ForConditionalGeneration.from_pretrained(model_name) + self.assertIsNotNone(model) + + def test_save_load(self): + pass diff --git a/paddlevlp/tests/models/test_configuration_common.py b/paddlevlp/tests/models/test_configuration_common.py new file mode 100644 index 00000000000000..19c25ba7f673a0 --- /dev/null +++ b/paddlevlp/tests/models/test_configuration_common.py @@ -0,0 +1,65 @@ +# 
Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import json +import os +import tempfile +import unittest.mock as mock + +from requests.exceptions import HTTPError + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + + +class ConfigTester(object): + def __init__(self, + parent, + config_class=None, + has_text_modality=True, + **kwargs): + self.parent = parent + self.config_class = config_class + self.has_text_modality = has_text_modality + self.inputs_dict = kwargs + + def create_and_test_config_to_json_string(self): + config = self.config_class(**self.inputs_dict) + obj = json.loads(config.to_json_string()) + for key, value in self.inputs_dict.items(): + self.parent.assertEqual(obj[key], value) + + def create_and_test_config_to_json_file(self): + config_first = self.config_class(**self.inputs_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + json_file_path = os.path.join(tmpdirname, "config.json") + config_first.to_json_file(json_file_path) + config_second = self.config_class.from_json_file(json_file_path) + + self.parent.assertEqual(config_second.to_dict(), config_first.to_dict()) + + def create_and_test_config_from_and_save_pretrained(self): + config_first = self.config_class(**self.inputs_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + config_first.save_pretrained(tmpdirname) + config_second = self.config_class.from_pretrained(tmpdirname) + + self.parent.assertEqual(config_second.to_dict(), config_first.to_dict()) + + def run_common_tests(self): + self.create_and_test_config_to_json_string() + self.create_and_test_config_to_json_file() + self.create_and_test_config_from_and_save_pretrained() diff --git a/paddlevlp/tests/models/test_modeling_common.py b/paddlevlp/tests/models/test_modeling_common.py new file mode 100644 index 00000000000000..b643089ae3344f --- /dev/null +++ b/paddlevlp/tests/models/test_modeling_common.py @@ -0,0 +1,806 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import copy +import inspect +import os +import random +import shutil +import subprocess +import tempfile +import time +import unittest +from typing import Optional, Tuple, Type + +import numpy as np +import paddle + +from paddlenlp.transformers.configuration_utils import PretrainedConfig +from paddlenlp.transformers.model_utils import PretrainedModel + +from paddlevlp.utils.env import CONFIG_NAME, LEGACY_CONFIG_NAME, MODEL_HOME +from paddlevlp.tests.testing_utils import slow + + +def _config_zero_init(config): + configs_no_init = copy.deepcopy(config) + for key in configs_no_init.__dict__.keys(): + if "_range" in key or "_std" in key or "initializer_factor" in key or "layer_scale" in key: + setattr(configs_no_init, key, 1e-10) + return configs_no_init + + +def get_gpus(selected_gpus): + selected_gpus = [x.strip() for x in selected_gpus.split(",")] + return selected_gpus + + +def ids_tensor(shape, vocab_size, dtype="int32"): + # Creates a random int32 tensor of the shape within the vocab size + return paddle.randint(low=1, high=vocab_size, dtype=dtype, shape=shape) + + +def random_attention_mask(shape, dtype="int32"): + attn_mask = ids_tensor(shape, vocab_size=2, dtype=dtype) + # make sure that at least one token is attended to for each batch + attn_mask[:, -1] = 1 + return attn_mask + + +def floats_tensor(shape, scale=1.0): + """Creates a random float32 tensor""" + return scale * paddle.randn(shape, dtype="float32") + + +def check_two_model_parameter(first_model: PretrainedModel, + second_model: PretrainedModel): + assert len( + set(first_model.state_dict().keys()) - set(second_model.state_dict() + .keys())) == 0 + + # random choice the keys to compare + key = random.choice(list(first_model.state_dict().keys())) + diff = first_model.state_dict()[key] - second_model.state_dict()[key] + assert diff.sum().item() == 0 + + +class ModelTesterMixin: + model_tester = None + base_model_class: Optional[Type[PretrainedModel]] = None + all_model_classes: Tuple[Type[PretrainedModel]] = () + all_generative_model_classes = () + test_resize_embeddings = True + test_resize_position_embeddings = False + test_mismatched_shapes = True + test_missing_keys = True + test_model_compatibility_keys = True + test_tie_weights = False + use_test_inputs_embeds = False + use_test_model_name_list = True + is_encoder_decoder = False + has_attentions = True + model_split_percents = [0.5, 0.7, 0.9] + + def _prepare_for_class(self, inputs_dict, model_class): + inputs_dict = copy.deepcopy(inputs_dict) + return inputs_dict + + def _make_model_instance(self, config, model_class): + if isinstance(config, PretrainedConfig): + return model_class(config) + if model_class == self.base_model_class: + return model_class(**config) + + return model_class(self.base_model_class(**config)) + + def test_save_load(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( + ) + + def check_save_load(out1, out2): + # make sure we don't have nans + out_2 = out2.numpy() + out_2[np.isnan(out_2)] = 0 + + out_1 = out1.numpy() + out_1[np.isnan(out_1)] = 0 + max_diff = np.amax(np.abs(out_1 - out_2)) + self.assertLessEqual(max_diff, 1e-5) + + for model_class in self.all_model_classes: + model = self._make_model_instance(config, model_class) + model.eval() + with paddle.no_grad(): + first = model(**self._prepare_for_class(inputs_dict, + model_class))[0] + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model = 
model_class.from_pretrained(tmpdirname) + model.eval() + with paddle.no_grad(): + second = model(**self._prepare_for_class(inputs_dict, + model_class))[0] + + # support tuple of tensor + if isinstance(first, tuple) and isinstance(second, tuple): + for tensor1, tensor2 in zip(first, second): + check_save_load(tensor1, tensor2) + else: + check_save_load(first, second) + + def test_determinism(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( + ) + + def check_determinism(first, second): + out_1 = first.numpy() + out_2 = second.numpy() + out_1 = out_1[~np.isnan(out_1)] + out_2 = out_2[~np.isnan(out_2)] + max_diff = np.amax(np.abs(out_1 - out_2)) + self.assertLessEqual(max_diff, 1e-5) + + for model_class in self.all_model_classes: + model = self._make_model_instance(config, model_class) + model.eval() + with paddle.no_grad(): + first = model(**self._prepare_for_class(inputs_dict, + model_class))[0] + second = model(**self._prepare_for_class(inputs_dict, + model_class))[0] + + if isinstance(first, tuple) and isinstance(second, tuple): + for tensor1, tensor2 in zip(first, second): + check_determinism(tensor1, tensor2) + else: + check_determinism(first, second) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = self._make_model_instance(config, model_class) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + expected_arg_names = ["input_ids"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + @unittest.skip("Not implemented yet") + def test_training(self): + pass + + @unittest.skip("Not implemented yet") + def test_training_gradient_checkpointing(self): + pass + + def test_attention_outputs(self): + if not self.has_attentions: + return + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( + ) + seq_len = getattr(self.model_tester, "seq_length", None) + decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", + seq_len) + encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", + seq_len) + decoder_key_length = getattr(self.model_tester, "decoder_key_length", + decoder_seq_length) + encoder_key_length = getattr(self.model_tester, "key_length", + encoder_seq_length) + chunk_length = getattr(self.model_tester, "chunk_length", None) + if chunk_length is not None and hasattr(self.model_tester, + "num_hashes"): + encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes + + for model_class in self.all_model_classes: + signature = inspect.signature(model_class.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + if not all( + name in arg_names + for name in + ["output_attentions", "output_hidden_states", "return_dict"]): + continue + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + inputs_dict["return_dict"] = True + model = self._make_model_instance(config, model_class) + model.eval() + with paddle.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, + model_class)) + attentions = outputs.encoder_attentions if self.is_encoder_decoder else outputs.attentions + self.assertEqual( + len(attentions), self.model_tester.num_hidden_layers) + + # TODO(guosheng): check that output_attentions also work using config + + if 
chunk_length is not None: + self.assertListEqual( + list(attentions[0].shape[-4:]), + [ + self.model_tester.num_attention_heads, + encoder_seq_length, chunk_length, encoder_key_length + ], ) + else: + self.assertListEqual( + list(attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + encoder_seq_length, encoder_key_length + ], ) + out_len = len(outputs) + + if self.is_encoder_decoder: + correct_outlen = 5 + + # loss is at first position + if "labels" in inputs_dict: + correct_outlen += 1 # loss is added to beginning + # Question Answering model returns start_logits and end_logits + if model_class.__name__.endswith("ForQuestionAnswering"): + correct_outlen += 1 # start_logits and end_logits instead of only 1 output + if "past_key_values" in outputs: + correct_outlen += 1 # past_key_values have been returned + + self.assertEqual(out_len, correct_outlen) + + # decoder attentions + decoder_attentions = outputs.decoder_attentions + self.assertIsInstance(decoder_attentions, (list, tuple)) + self.assertEqual( + len(decoder_attentions), + self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + decoder_seq_length, decoder_key_length + ], ) + + # cross attentions + cross_attentions = outputs.cross_attentions + self.assertIsInstance(cross_attentions, (list, tuple)) + self.assertEqual( + len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(cross_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + decoder_seq_length, + encoder_key_length, + ], ) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = self._make_model_instance(config, model_class) + model.eval() + with paddle.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, + model_class)) + + if hasattr(self.model_tester, "num_hidden_states_types"): + added_hidden_states = self.model_tester.num_hidden_states_types + elif self.is_encoder_decoder: + added_hidden_states = 2 + else: + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.encoder_attentions if self.is_encoder_decoder else outputs.attentions + + self.assertEqual( + len(self_attentions), self.model_tester.num_hidden_layers) + if chunk_length is not None: + self.assertListEqual( + list(self_attentions[0].shape[-4:]), + [ + self.model_tester.num_attention_heads, + encoder_seq_length, chunk_length, encoder_key_length + ], ) + else: + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + encoder_seq_length, encoder_key_length + ], ) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = self._make_model_instance(config, model_class) + model.eval() + + with paddle.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, + model_class)) + + hidden_states = outputs.encoder_hidden_states if self.is_encoder_decoder else outputs.hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", + self.model_tester.num_hidden_layers + 1) + self.assertEqual(len(hidden_states), expected_num_layers) + + if hasattr(self.model_tester, "encoder_seq_length"): + seq_length = self.model_tester.encoder_seq_length + if hasattr( + self.model_tester, + "chunk_length") and self.model_tester.chunk_length > 1: + 
seq_length = seq_length * self.model_tester.chunk_length + else: + seq_length = self.model_tester.seq_length + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [seq_length, self.model_tester.hidden_size], ) + + if self.is_encoder_decoder: + hidden_states = outputs.decoder_hidden_states + + self.assertIsInstance(hidden_states, (list, tuple)) + self.assertEqual(len(hidden_states), expected_num_layers) + seq_len = getattr(self.model_tester, "seq_length", None) + decoder_seq_length = getattr(self.model_tester, + "decoder_seq_length", seq_len) + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [decoder_seq_length, self.model_tester.hidden_size], ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( + ) + inputs_dict["return_dict"] = True + for model_class in self.all_model_classes: + signature = inspect.signature(model_class.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + if not all( + name in arg_names + for name in + ["output_attentions", "output_hidden_states", "return_dict"]): + continue + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + # TODO(guosheng): check that output_hidden_states also work using config + + @unittest.skip("Not implemented") + def test_retain_grad_hidden_states_attentions(self): + pass + + def test_resize_position_vector_embeddings(self): + if not self.test_resize_position_embeddings: + return + + ( + original_config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + config = copy.deepcopy(original_config) + model = self._make_model_instance(config, model_class) + + if self.model_tester.is_training is False: + model.eval() + + max_position_embeddings = config.max_position_embeddings + + # Retrieve the embeddings and clone theme + if self.is_encoder_decoder: + encoder_model_embed, decoder_model_embed = model.get_position_embeddings( + ) + encoder_cloned_embeddings = encoder_model_embed.weight.clone() + decoder_cloned_embeddings = decoder_model_embed.weight.clone() + else: + model_embed = model.get_position_embeddings() + cloned_embeddings = model_embed.weight.clone() + + # Check that resizing the position embeddings with a larger max_position_embeddings increases + # the model's postion embeddings size + model.resize_position_embeddings(max_position_embeddings + 10) + self.assertEqual(model.config.max_position_embeddings, + max_position_embeddings + 10) + + # Check that it actually resizes the embeddings matrix + if model.config.is_encoder_decoder: + encoder_model_embed, decoder_model_embed = model.get_position_embeddings( + ) + self.assertEqual(encoder_model_embed.weight.shape[0], + encoder_cloned_embeddings.shape[0] + 10) + self.assertEqual(decoder_model_embed.weight.shape[0], + decoder_cloned_embeddings.shape[0] + 10) + else: + model_embed = model.get_position_embeddings() + self.assertEqual(model_embed.weight.shape[0], + cloned_embeddings.shape[0] + 10) + + # Check that the model can still do a forward pass successfully (every parameter should be resized) + model(**self._prepare_for_class(inputs_dict, model_class)) + + # Check that resizing the position embeddings with a smaller max_position_embeddings decreases + # the model's max_position_embeddings + model.resize_position_embeddings(max_position_embeddings - 5) + self.assertEqual(model.base_model.config["max_position_embeddings"], + 
max_position_embeddings - 5) + + # Check that it actually resizes the embeddings matrix + if self.is_encoder_decoder: + encoder_model_embed, decoder_model_embed = model.get_position_embeddings( + ) + self.assertEqual(encoder_model_embed.weight.shape[0], + encoder_cloned_embeddings.shape[0] - 5) + self.assertEqual(decoder_model_embed.weight.shape[0], + decoder_cloned_embeddings.shape[0] - 5) + else: + model_embed = model.get_position_embeddings() + self.assertEqual(model_embed.weight.shape[0], + cloned_embeddings.shape[0] - 5) + + # Check that the model can still do a forward pass successfully (every parameter should be resized) + model(**self._prepare_for_class(inputs_dict, model_class)) + + # Check that adding and removing tokens has not modified the first part of the embedding matrix. + models_equal = True + + if model.config.is_encoder_decoder: + for p1, p2 in zip(encoder_cloned_embeddings, + encoder_model_embed.weight): + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + for p1, p2 in zip(decoder_cloned_embeddings, + decoder_model_embed.weight): + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + else: + for p1, p2 in zip(cloned_embeddings, model_embed.weight): + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + self.assertTrue(models_equal) + + def test_resize_tokens_embeddings(self): + ( + original_config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + if not self.test_resize_embeddings: + return + + for model_class in self.all_model_classes: + config = copy.deepcopy(original_config) + model = self._make_model_instance(config, model_class) + if self.model_tester.is_training is False: + model.eval() + + model_vocab_size = config.vocab_size + # Retrieve the embeddings and clone theme + model_embed = model.resize_token_embeddings(model_vocab_size) + cloned_embeddings = model_embed.weight.clone() + + # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size + model_embed = model.resize_token_embeddings(model_vocab_size + 10) + self.assertEqual(model.base_model.config.vocab_size, + model_vocab_size + 10) + # Check that it actually resizes the embeddings matrix + self.assertEqual(model_embed.weight.shape[0], + cloned_embeddings.shape[0] + 10) + # Check that the model can still do a forward pass successfully (every parameter should be resized) + model(**self._prepare_for_class(inputs_dict, model_class)) + + # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size + model_embed = model.resize_token_embeddings(model_vocab_size - 15) + self.assertEqual(model.base_model.config.vocab_size, + model_vocab_size - 15) + # Check that it actually resizes the embeddings matrix + self.assertEqual(model_embed.weight.shape[0], + cloned_embeddings.shape[0] - 15) + + # Check that the model can still do a forward pass successfully (every parameter should be resized) + # Input ids should be clamped to the maximum size of the vocabulary + inputs_dict["input_ids"] = paddle.clip( + inputs_dict["input_ids"], max=model_vocab_size - 15 - 1) + + # make sure that decoder_input_ids are resized as well + if "decoder_input_ids" in inputs_dict: + inputs_dict["decoder_input_ids"] = paddle.clip( + inputs_dict["decoder_input_ids"], + max=model_vocab_size - 15 - 1) + model(**self._prepare_for_class(inputs_dict, model_class)) + + # Check that adding and removing tokens has not modified the first part of the embedding matrix. 
+ models_equal = True + for p1, p2 in zip(cloned_embeddings, model_embed.weight): + if not paddle.equal_all(p1, p2).item(): + models_equal = False + break + + self.assertTrue(models_equal) + + def _compare_tensor(self, tensor1, tensor2, rtol=1e-04, atol=1e-04): + if tensor1.dtype != tensor2.dtype: + return False + + if tensor1.dtype in [paddle.float32, paddle.float64]: + return paddle.allclose(tensor1, tensor2, rtol=rtol, atol=atol) + else: + return paddle.equal_all(tensor1, tensor2) + + def test_inputs_embeds(self): + # pass the test if don't need to test inputs embeddings + if not self.use_test_inputs_embeds: + return + # get config for model and inputs_dict for model forward + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( + ) + # test all model classes + for model_class in self.all_model_classes: + model = self._make_model_instance(config, model_class) + model.eval() + + inputs = copy.deepcopy( + self._prepare_for_class(inputs_dict, model_class)) + + with paddle.no_grad(): + ids_output = model(**inputs) + + if not self.is_encoder_decoder: + input_ids = inputs["input_ids"] + del inputs["input_ids"] + else: + encoder_input_ids = inputs["input_ids"] + decoder_input_ids = inputs.get("decoder_input_ids", + encoder_input_ids) + del inputs["input_ids"] + inputs.pop("decoder_input_ids", None) + + wte = model.get_input_embeddings() + if not self.is_encoder_decoder: + inputs["inputs_embeds"] = wte(input_ids) + else: + inputs["inputs_embeds"] = wte(encoder_input_ids) + inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) + + with paddle.no_grad(): + embeds_output = model(**inputs) + + if isinstance(embeds_output, paddle.Tensor): + self.assertTrue(self._compare_tensor(ids_output, embeds_output)) + else: + for ids_item, embeds_item in zip(ids_output, embeds_output): + self.assertTrue(self._compare_tensor(ids_item, embeds_item)) + + def test_model_name_list(self): + if not self.use_test_model_name_list: + return + config = self.model_tester.get_config() + if isinstance(config, PretrainedConfig): + model = self.base_model_class(config) + else: + model = self.base_model_class(**config) + self.assertTrue(len(model.model_name_list) != 0) + + def test_pretrained_config_save_load(self): + if self.base_model_class is None or not self.base_model_class.constructed_from_pretrained_config( + ): + return + + config_class = self.base_model_class.config_class + with tempfile.TemporaryDirectory() as tempdir: + config = config_class() + + config.save_pretrained(tempdir) + + # check the file exist + self.assertFalse( + os.path.exists(os.path.join(tempdir, LEGACY_CONFIG_NAME))) + self.assertTrue(os.path.exists(os.path.join(tempdir, CONFIG_NAME))) + + # rename the CONFIG_NAME + shutil.move( + os.path.join(tempdir, CONFIG_NAME), + os.path.join(tempdir, LEGACY_CONFIG_NAME)) + + loaded_config = config.__class__.from_pretrained(tempdir) + for key in config.__dict__.keys(): + self.assertEqual( + getattr(config, key), getattr(loaded_config, key)) + + def random_choice_pretrained_config_field(self) -> Optional[str]: + if self.base_model_class is None or not self.base_model_class.constructed_from_pretrained_config( + ): + return None + + config = self.base_model_class.config_class() + fields = [key for key, value in config.to_dict() if value] + return random.choice(fields) + + def test_for_missed_attribute(self): + if not self.test_model_compatibility_keys: + self.skipTest( + f"Do not test model_compatibility_keys on {self.base_model_class}" + ) + return + + config, inputs_dict = 
self.model_tester.prepare_config_and_inputs_for_common( + ) + for model_class in self.all_model_classes: + if not model_class.constructed_from_pretrained_config(): + continue + + model = self._make_model_instance(config, model_class) + + all_maps: dict = copy.deepcopy( + model_class.config_class.attribute_map) + + for old_attribute, new_attribute in all_maps.items(): + old_value = getattr(model.config, old_attribute) + new_value = getattr(model.config, new_attribute) + + # eg: dropout can be an instance of nn.Dropout, so we should check it attribute + if type(new_value) != type(old_value): + continue + + self.assertEqual(old_value, new_value) + + def test_tie_weight(self): + # test whether id of input_embeding equal id of output_embeding ? + if not self.test_tie_weights: + return + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( + ) + for model_class in self.all_model_classes: + if "CausalLM" not in model_class.__name__ and "MaskedLM" not in model_class.__name__: + continue + + model = self._make_model_instance(config, model_class) + + if not model.config.tie_word_embeddings: + continue + + if hasattr(model, "get_input_embeddings") and hasattr( + model, "get_output_embeddings"): + try: + input_embeddings = model.get_input_embeddings() + except NotImplementedError: + continue + + try: + output_embeddings = model.get_output_embeddings() + except NotImplementedError: + continue + + if input_embeddings is not None and output_embeddings is not None: + if hasattr(output_embeddings, "weight"): + output_embeddings_weight = output_embeddings.weight + else: + output_embeddings_weight = output_embeddings + + if hasattr(input_embeddings, "weight"): + input_embeddings_weight = input_embeddings.weight + else: + input_embeddings_weight = input_embeddings + print( + input_embeddings_weight, + output_embeddings_weight, ) + print("model name :{},id is{},{}".format( + model_class, + id(output_embeddings_weight), + id(input_embeddings_weight))) + self.assertEqual( + id(output_embeddings_weight), + id(input_embeddings_weight)) + + +class ModelTesterPretrainedMixin: + base_model_class: PretrainedModel = None + hf_remote_test_model_path: str = None + paddlehub_remote_test_model_path: str = None + + # Download from HF doesn't work in CI yet + @slow + def test_model_from_pretrained_hf_hub(self): + if self.hf_remote_test_model_path is None or self.base_model_class is None: + return + model = self.base_model_class.from_pretrained( + self.hf_remote_test_model_path, from_hf_hub=True) + self.assertIsNotNone(model) + + def test_model_from_pretrained_paddle_hub(self): + if self.paddlehub_remote_test_model_path is None or self.base_model_class is None: + return + model = self.base_model_class.from_pretrained( + self.paddlehub_remote_test_model_path) + self.assertIsNotNone(model) + + def test_model_from_config_paddle_hub(self): + if self.paddlehub_remote_test_model_path is None or self.base_model_class is None: + return + config = self.base_model_class.config_class.from_pretrained( + self.paddlehub_remote_test_model_path) + model = self.base_model_class._from_config(config) + self.assertIsNotNone(model) + + @slow + def test_model_from_pretrained_with_cache_dir(self): + for model_name in list( + self.base_model_class.pretrained_init_configuration)[:1]: + with tempfile.TemporaryDirectory() as tempdir: + tempdir = str(tempdir) + + model = self.base_model_class.from_pretrained( + model_name, cache_dir=tempdir) + self.assertIsNotNone(model) + self.assertTrue( + os.path.isfile( + 
os.path.join(tempdir, model_name, self.base_model_class. + resource_files_names["model_state"]))) + self.assertTrue( + os.path.isfile( + os.path.join(tempdir, model_name, + self.base_model_class.model_config_file))) + + @slow + def test_pretrained_save_and_load(self): + """test the pretrained model save and load with two different ways: url-file-name & model_state name + + eg: `bert-base-uncased.pdparams` and `model_state.pdparams` + """ + for model_name in list( + self.base_model_class.pretrained_init_configuration)[:1]: + model = self.base_model_class.from_pretrained(model_name) + self.assertIsNotNone(model) + + # 1. save and load + with tempfile.TemporaryDirectory() as tempdir: + tempdirname = str(tempdir) + model.save_pretrained(tempdirname) + + loaded_model = self.base_model_class.from_pretrained( + tempdirname) + + check_two_model_parameter(model, loaded_model) + + # 2. convert the weight file name + with tempfile.TemporaryDirectory() as tempdir: + tempdirname = str(tempdir) + "_old" + + shutil.copytree( + os.path.join(MODEL_HOME, model_name), + tempdirname, ) + + saved_model_state_file = os.path.join( + tempdirname, + self.base_model_class.resource_files_names["model_state"]) + + self.assertTrue(os.path.isfile(saved_model_state_file)) + + # rename it to the old style: name of url, eg: model_state.pdparams -> bert-base-uncased.pdparams + url = self.base_model_class.pretrained_resource_files_map[ + "model_state"][model_name] + pretrained_resource_file_name = os.path.split(url)[-1] + target_file_path = os.path.join(tempdirname, + pretrained_resource_file_name) + + shutil.copyfile(saved_model_state_file, target_file_path) + os.remove(saved_model_state_file) + + new_model = self.base_model_class.from_pretrained(tempdirname) + + check_two_model_parameter(model, new_model) diff --git a/paddlevlp/tests/testing_utils.py b/paddlevlp/tests/testing_utils.py new file mode 100644 index 00000000000000..27448810955d75 --- /dev/null +++ b/paddlevlp/tests/testing_utils.py @@ -0,0 +1,67 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import copy +import gc +import inspect +import os +import sys +import unittest +from argparse import ArgumentTypeError + +import numpy as np +import paddle +import yaml + + +def strtobool(v): + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "f", "n", "0"): + return False + else: + raise ArgumentTypeError( + f"Truthy value expected: got {v} but expected one of yes/no, true/false, t/f, y/n, 1/0 (case insensitive)." + ) + + +def get_bool_from_env(key, default_value=False): + if key not in os.environ: + return default_value + value = os.getenv(key) + try: + value = strtobool(value) + except ValueError: + raise ValueError( + f"If set, {key} must be yes, no, true, false, 0 or 1 (case insensitive)." 
+ ) + return value + + +_run_slow_test = get_bool_from_env("RUN_SLOW_TEST") + + +def slow(test): + """ + Mark a test that takes too much time to run. + Slow tests are skipped by default. Execute the command `export RUN_SLOW_TEST=True` to run them. + """ + if not _run_slow_test: + return unittest.skip("test takes too much time")(test) + else: + return test
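Usage note (not part of the patch): a minimal sketch of how the slow-test gating added in testing_utils.py could be exercised, assuming the repository root is on the Python path so that paddlevlp is importable. Because _run_slow_test is evaluated at import time, the RUN_SLOW_TEST environment variable has to be set before the test module is imported; the module path below simply mirrors the files added in this diff.

import os
import unittest

# Must be set before the tests are imported: _run_slow_test in
# paddlevlp/tests/testing_utils.py is read once at import time.
os.environ["RUN_SLOW_TEST"] = "true"

from paddlevlp.tests.models import test_blip2  # noqa: E402

# Collect and run every TestCase defined in test_blip2.py, including the
# @slow-decorated cases such as test_model_from_pretrained (these download
# pretrained weights, so they are skipped unless RUN_SLOW_TEST is truthy).
suite = unittest.defaultTestLoader.loadTestsFromModule(test_blip2)
unittest.TextTestRunner(verbosity=2).run(suite)

Running the file through pytest (pytest paddlevlp/tests/models/test_blip2.py) with RUN_SLOW_TEST exported in the shell should behave the same way, since the decorator only wraps unittest.skip.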