🚨 Add Blip2ForImageTextRetrieval (huggingface#29261)
* add Blip2ForImageTextRetrieval

* use one line and remove unnecessary space in tests

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* use value from the config, rather than hardcoded

* change order of params in Blip2QFormerModel.forward

* update docstring

* fix style

* update test_inference_opt

* move embeddings out of Blip2QFormerModel

* remove from_vision_qformer_configs

* remove autocast float16 in Blip2QFormerModel

* rename fields to vision_projection, text_projection, use_image_text_matching_head

* use CLIPOutput for Blip2ImageTextMatchingModelOutput

* remove past_key_values_length from Blip2TextEmbeddings

* fix small typo in the CLIPOutput docstring

* add Blip2ForImageTextRetrieval to Zero Shot Image Classification mapping

* update docstring and add require_torch_fp16

* rollback test_inference_opt

* use use_image_text_matching_head=True in convert

* skip test_model_get_set_embeddings

* fix create_rename_keys error on new itm fields

* revert to doing the scale after the dot product between "query" and "key"

* fix ValueError on convert script for blip2-opt-2.7b

* update org of paths to Salesforce

* add is_pipeline_test_to_skip for VisualQuestionAnsweringPipelineTests

* [run_slow] blip_2

* removed Blip2ForImageTextRetrieval from IGNORE_NON_AUTO_CONFIGURED

* fix docstring of Blip2ImageTextMatchingModelOutput

* [run_slow] blip_2

* fix multi-gpu tests

* [run_slow] blip_2

* [run_slow] blip_2

---------

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
2 people authored and itazap committed Sep 20, 2024
1 parent 9f73b59 commit 09ba746
Showing 17 changed files with 1,573 additions and 106 deletions.
15 changes: 14 additions & 1 deletion docs/source/en/model_doc/blip-2.md
@@ -87,4 +87,17 @@ If you're interested in submitting a resource to be included here, please feel f

[[autodoc]] Blip2ForConditionalGeneration
- forward
- generate
- generate

## Blip2ForImageTextRetrieval

[[autodoc]] Blip2ForImageTextRetrieval
- forward

## Blip2TextModelWithProjection

[[autodoc]] Blip2TextModelWithProjection

## Blip2VisionModelWithProjection

[[autodoc]] Blip2VisionModelWithProjection
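A minimal usage sketch for the new retrieval head, assuming the processor/model API the rest of the BLIP-2 family follows. The checkpoint name `Salesforce/blip2-itm-vit-g` is an assumption; the commit only confirms that checkpoint paths use the Salesforce org and that the matching head is toggled via `use_image_text_matching_head`:

```python
import requests
import torch
from PIL import Image
from transformers import AutoProcessor, Blip2ForImageTextRetrieval

# Assumed checkpoint name; the commit only states that paths live under the Salesforce org.
checkpoint = "Salesforce/blip2-itm-vit-g"
processor = AutoProcessor.from_pretrained(checkpoint)
model = Blip2ForImageTextRetrieval.from_pretrained(checkpoint)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
text = "two cats laying on a pink blanket"
inputs = processor(images=image, text=text, return_tensors="pt")

with torch.no_grad():
    # Image-text matching (ITM) head: match / no-match logits for the image-text pair.
    itm_out = model(**inputs, use_image_text_matching_head=True)
    itm_probs = torch.softmax(itm_out.logits_per_image, dim=1)

    # Contrastive (ITC) path: similarity scores between the projected image and text embeddings.
    itc_out = model(**inputs, use_image_text_matching_head=False)
    similarity = itc_out.logits_per_image
```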
6 changes: 6 additions & 0 deletions src/transformers/__init__.py
@@ -1579,10 +1579,13 @@
_import_structure["models.blip_2"].extend(
[
"Blip2ForConditionalGeneration",
"Blip2ForImageTextRetrieval",
"Blip2Model",
"Blip2PreTrainedModel",
"Blip2QFormerModel",
"Blip2TextModelWithProjection",
"Blip2VisionModel",
"Blip2VisionModelWithProjection",
]
)
_import_structure["models.bloom"].extend(
@@ -6329,10 +6332,13 @@
)
from .models.blip_2 import (
Blip2ForConditionalGeneration,
Blip2ForImageTextRetrieval,
Blip2Model,
Blip2PreTrainedModel,
Blip2QFormerModel,
Blip2TextModelWithProjection,
Blip2VisionModel,
Blip2VisionModelWithProjection,
)
from .models.bloom import (
BloomForCausalLM,
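With these export entries in place, the new classes are importable from the top-level package. A quick sanity check, assuming an environment built from this commit:

```python
# Assumes a transformers build that includes this commit.
from transformers import (
    Blip2ForImageTextRetrieval,
    Blip2PreTrainedModel,
    Blip2TextModelWithProjection,
    Blip2VisionModelWithProjection,
)

# The new heads are expected to share the common BLIP-2 base class.
for cls in (Blip2ForImageTextRetrieval, Blip2TextModelWithProjection, Blip2VisionModelWithProjection):
    assert issubclass(cls, Blip2PreTrainedModel)
```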
12 changes: 6 additions & 6 deletions src/transformers/models/altclip/modeling_altclip.py
@@ -161,19 +161,19 @@ class AltCLIPOutput(ModelOutput):
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
Contrastive loss for image-text similarity.
logits_per_image:(`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
similarity scores.
logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
similarity scores.
text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
The text embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPTextModel`].
image_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
The image embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPVisionModel`].
text_model_output(`BaseModelOutputWithPooling`):
text_model_output (`BaseModelOutputWithPooling`):
The output of the [`AltCLIPTextModel`].
vision_model_output(`BaseModelOutputWithPooling`):
vision_model_output (`BaseModelOutputWithPooling`):
The output of the [`AltCLIPVisionModel`].
"""

1 change: 1 addition & 0 deletions src/transformers/models/auto/modeling_auto.py
@@ -1266,6 +1266,7 @@
("align", "AlignModel"),
("altclip", "AltCLIPModel"),
("blip", "BlipModel"),
("blip-2", "Blip2ForImageTextRetrieval"),
("chinese_clip", "ChineseCLIPModel"),
("clip", "CLIPModel"),
("clipseg", "CLIPSegModel"),
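Because `blip-2` is now registered in the zero-shot image classification mapping, the model should also be reachable through the corresponding pipeline. A hedged sketch; the checkpoint name is again an assumption:

```python
from transformers import pipeline

# Assumed checkpoint; any BLIP-2 ITM checkpoint under the Salesforce org should work here.
classifier = pipeline(
    task="zero-shot-image-classification",
    model="Salesforce/blip2-itm-vit-g",
)

predictions = classifier(
    "http://images.cocodataset.org/val2017/000000039769.jpg",
    candidate_labels=["two cats", "a dog", "an airplane"],
)
print(predictions)
```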
6 changes: 6 additions & 0 deletions src/transformers/models/blip_2/__init__.py
@@ -33,10 +33,13 @@
else:
_import_structure["modeling_blip_2"] = [
"Blip2Model",
"Blip2VisionModelWithProjection",
"Blip2QFormerModel",
"Blip2PreTrainedModel",
"Blip2ForConditionalGeneration",
"Blip2ForImageTextRetrieval",
"Blip2VisionModel",
"Blip2TextModelWithProjection",
]

if TYPE_CHECKING:
@@ -55,10 +58,13 @@
else:
from .modeling_blip_2 import (
Blip2ForConditionalGeneration,
Blip2ForImageTextRetrieval,
Blip2Model,
Blip2PreTrainedModel,
Blip2QFormerModel,
Blip2TextModelWithProjection,
Blip2VisionModel,
Blip2VisionModelWithProjection,
)

else:
22 changes: 19 additions & 3 deletions src/transformers/models/blip_2/configuration_blip_2.py
@@ -15,7 +15,7 @@
"""BLIP-2 model configuration"""

import os
from typing import Union
from typing import Optional, Union

from ...configuration_utils import PretrainedConfig
from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
@@ -172,6 +172,8 @@ class Blip2QFormerConfig(PretrainedConfig):
The frequency of adding cross-attention to the Transformer layers.
encoder_hidden_size (`int`, *optional*, defaults to 1408):
The hidden size of the hidden states for cross-attention.
use_qformer_text_input (`bool`, *optional*, defaults to `False`):
Whether to use BERT-style embeddings.
Examples:
@@ -206,6 +208,7 @@ def __init__(
position_embedding_type="absolute",
cross_attention_frequency=2,
encoder_hidden_size=1408,
use_qformer_text_input=False,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, **kwargs)
@@ -224,6 +227,7 @@ def __init__(
self.position_embedding_type = position_embedding_type
self.cross_attention_frequency = cross_attention_frequency
self.encoder_hidden_size = encoder_hidden_size
self.use_qformer_text_input = use_qformer_text_input

@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
@@ -263,6 +267,8 @@ class Blip2Config(PretrainedConfig):
Dictionary of configuration options used to initialize any [`PretrainedConfig`].
num_query_tokens (`int`, *optional*, defaults to 32):
The number of query tokens passed through the Transformer.
image_text_hidden_size (`int`, *optional*, defaults to 256):
Dimensionality of the hidden state of the image-text fusion layer.
image_token_index (`int`, *optional*):
Token index of special image token.
@@ -307,6 +313,7 @@ def __init__(
qformer_config=None,
text_config=None,
num_query_tokens=32,
image_text_hidden_size=256,
image_token_index=None,
**kwargs,
):
@@ -333,6 +340,7 @@ def __init__(
self.is_encoder_decoder = self.text_config.is_encoder_decoder

self.num_query_tokens = num_query_tokens
self.image_text_hidden_size = image_text_hidden_size
self.image_token_index = image_token_index
self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size
self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
@@ -344,20 +352,28 @@ def from_vision_qformer_text_configs(
cls,
vision_config: Blip2VisionConfig,
qformer_config: Blip2QFormerConfig,
text_config: PretrainedConfig,
text_config: Optional[PretrainedConfig] = None,
**kwargs,
):
r"""
Instantiate a [`Blip2Config`] (or a derived class) from a BLIP-2 vision model, Q-Former and language model
configurations.
Args:
vision_config (`dict`):
Dictionary of configuration options used to initialize [`Blip2VisionConfig`].
qformer_config (`dict`):
Dictionary of configuration options used to initialize [`Blip2QFormerConfig`].
text_config (`dict`, *optional*):
Dictionary of configuration options used to initialize any [`PretrainedConfig`].
Returns:
[`Blip2Config`]: An instance of a configuration object
"""

return cls(
vision_config=vision_config.to_dict(),
qformer_config=qformer_config.to_dict(),
text_config=text_config.to_dict(),
text_config=text_config.to_dict() if text_config is not None else None,
**kwargs,
)
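Taken together, the configuration changes can be exercised roughly as follows; this is a sketch of the new options (`use_qformer_text_input`, `image_text_hidden_size`, and the now-optional `text_config`), not an official example:

```python
from transformers import Blip2Config, Blip2QFormerConfig, Blip2VisionConfig

vision_config = Blip2VisionConfig()
# New flag: enable BERT-style text embeddings in the Q-Former.
qformer_config = Blip2QFormerConfig(use_qformer_text_input=True)

# text_config is now optional, so a retrieval-style config can be built from just
# the vision and Q-Former configs; extra kwargs are forwarded to Blip2Config.
config = Blip2Config.from_vision_qformer_text_configs(
    vision_config=vision_config,
    qformer_config=qformer_config,
    image_text_hidden_size=256,  # dimensionality of the image-text projection space
)

print(config.image_text_hidden_size)                 # 256
print(config.qformer_config.use_qformer_text_input)  # True
```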
(Diffs for the remaining changed files not shown.)
