Add auto model for image-text-to-text #32472

Merged
4 changes: 4 additions & 0 deletions docs/source/en/model_doc/auto.md
@@ -381,3 +381,7 @@ The following auto classes are available for the following multimodal tasks.
### FlaxAutoModelForVision2Seq

[[autodoc]] FlaxAutoModelForVision2Seq

### AutoModelForImageTextToText

[[autodoc]] AutoModelForImageTextToText
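
For context on the new docs entry, here is a minimal usage sketch (the checkpoint ID is an illustrative assumption; per the new mapping it resolves to `LlavaNextForConditionalGeneration`):

```python
# Minimal sketch of the new auto class; the checkpoint ID is an assumption for illustration.
import torch
from transformers import AutoModelForImageTextToText, AutoProcessor

model_id = "llava-hf/llava-v1.6-mistral-7b-hf"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForImageTextToText.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
print(type(model).__name__)  # expected: LlavaNextForConditionalGeneration
```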
12 changes: 6 additions & 6 deletions docs/source/en/model_doc/llava_next.md
@@ -166,10 +166,10 @@ LLaVa-Next can perform inference with multiple images as input, where images eit
import requests
from PIL import Image
import torch
from transformers import AutoProcessor, LlavaNextForConditionalGeneration
from transformers import AutoProcessor, AutoModelForImageTextToText

# Load the model in half-precision
model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16, device_map="auto")
model = AutoModelForImageTextToText.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16, device_map="auto")
processor = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")

# Get three different images
@@ -246,7 +246,7 @@ We value your feedback to help identify bugs before the full release! Check out
Simply change the snippet above as follows:

```python
from transformers import LlavaNextForConditionalGeneration, BitsAndBytesConfig
from transformers import AutoModelForImageTextToText, BitsAndBytesConfig

# specify how to quantize the model
quantization_config = BitsAndBytesConfig(
@@ -255,17 +255,17 @@ quantization_config = BitsAndBytesConfig(
bnb_4bit_compute_dtype=torch.float16,
)

model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", quantization_config=quantization_config, device_map="auto")
model = AutoModelForImageTextToText.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", quantization_config=quantization_config, device_map="auto")
```

### Use Flash-Attention 2 to further speed-up generation

First make sure to install flash-attn. Refer to the [original repository of Flash Attention](https://github.com/Dao-AILab/flash-attention) for installation instructions, then change the snippet above as follows:

```python
from transformers import LlavaNextForConditionalGeneration
from transformers import AutoModelForImageTextToText

model = LlavaNextForConditionalGeneration.from_pretrained(
model = AutoModelForImageTextToText.from_pretrained(
model_id,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
32 changes: 17 additions & 15 deletions docs/source/en/tasks/image_text_to_text.md
@@ -27,22 +27,22 @@ To begin with, there are multiple types of VLMs:
- chat fine-tuned models for conversation
- instruction fine-tuned models

This guide focuses on inference with an instruction-tuned model.

Let's begin by installing the dependencies.

```bash
pip install -q transformers accelerate flash_attn
```

Let's initialize the model and the processor.

```python
from transformers import AutoProcessor, Idefics2ForConditionalGeneration
from transformers import AutoProcessor, AutoModelForImageTextToText
import torch

device = torch.device("cuda")
model = Idefics2ForConditionalGeneration.from_pretrained(
model = AutoModelForImageTextToText.from_pretrained(
"HuggingFaceM4/idefics2-8b",
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2",
@@ -51,7 +51,7 @@ model = Idefics2ForConditionalGeneration.from_pretrained(
processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b")
```

This model has a [chat template](./chat_templating) that helps users parse chat outputs. Moreover, the model can also accept multiple images as input in a single conversation or message. We will now prepare the inputs.

The image inputs look like the following.

@@ -74,7 +74,7 @@ images = [Image.open(requests.get(img_urls[0], stream=True).raw),
Image.open(requests.get(img_urls[1], stream=True).raw)]
```

Below is an example of the chat template. We can feed conversation turns and the last message as an input by appending it at the end of the template.


```python
@@ -98,7 +98,7 @@ messages = [
{"type": "image"},
{"type": "text", "text": "And how about this image?"},
]
},
]
```

@@ -180,11 +180,11 @@ def model_inference(
if acc_text.endswith("<end_of_utterance>"):
acc_text = acc_text[:-18]
yield acc_text

thread.join()
```

Now let's call the `model_inference` function we created and stream the values.

```python
generator = model_inference(
@@ -204,7 +204,7 @@ for value in generator:

## Fit models on smaller hardware

VLMs are often large and need to be optimized to fit on smaller hardware. Transformers supports many model quantization libraries, and here we will only show int8 quantization with [Quanto](./quantization/quanto#quanto). int8 quantization offers memory improvements of up to 75 percent (if all weights are quantized). However, it is no free lunch: since 8-bit is not a CUDA-native precision, the weights are quantized back and forth on the fly, which adds latency.

First, install dependencies.

@@ -215,18 +215,20 @@ pip install -U quanto bitsandbytes
To quantize a model during loading, we need to first create [`QuantoConfig`]. Then load the model as usual, but pass `quantization_config` during model initialization.

```python
from transformers import Idefics2ForConditionalGeneration, AutoTokenizer, QuantoConfig
from transformers import AutoModelForImageTextToText, QuantoConfig

model_id = "HuggingFaceM4/idefics2-8b"
quantization_config = QuantoConfig(weights="int8")
quantized_model = Idefics2ForConditionalGeneration.from_pretrained(model_id, device_map="cuda", quantization_config=quantization_config)
quantized_model = AutoModelForImageTextToText.from_pretrained(
model_id, device_map="cuda", quantization_config=quantization_config
)
```

And that's it! We can use the model the same way with no changes.
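
To close the loop, here is a minimal end-to-end sketch, under the assumption that `quantized_model`, `processor`, `messages`, and `images` were created as in the snippets above (the generation parameters are illustrative):

```python
# Hedged sketch: run generation with the quantized model, reusing objects defined earlier in the guide.
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=images, return_tensors="pt").to(quantized_model.device)

generated_ids = quantized_model.generate(**inputs, max_new_tokens=100)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])
```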

## Further Reading

Here are some more resources for the image-text-to-text task.

- [Image-text-to-text task page](https://huggingface.co/tasks/image-text-to-text) covers model types, use cases, datasets, and more.
- [Vision Language Models Explained](https://huggingface.co/blog/vlms) is a blog post that covers everything about vision language models and supervised fine-tuning using [TRL](https://huggingface.co/docs/trl/en/index).
4 changes: 4 additions & 0 deletions docs/source/ja/model_doc/auto.md
@@ -368,3 +368,7 @@ AutoModel.register(NewModelConfig, NewModel)
### FlaxAutoModelForVision2Seq

[[autodoc]] FlaxAutoModelForVision2Seq

### AutoModelForImageTextToText

[[autodoc]] AutoModelForImageTextToText
4 changes: 4 additions & 0 deletions src/transformers/__init__.py
@@ -1405,6 +1405,7 @@
"MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
"MODEL_FOR_IMAGE_MAPPING",
"MODEL_FOR_IMAGE_SEGMENTATION_MAPPING",
"MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING",
"MODEL_FOR_IMAGE_TO_IMAGE_MAPPING",
"MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING",
"MODEL_FOR_KEYPOINT_DETECTION_MAPPING",
@@ -1446,6 +1447,7 @@
"AutoModelForDocumentQuestionAnswering",
"AutoModelForImageClassification",
"AutoModelForImageSegmentation",
"AutoModelForImageTextToText",
"AutoModelForImageToImage",
"AutoModelForInstanceSegmentation",
"AutoModelForKeypointDetection",
@@ -6251,6 +6253,7 @@
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
MODEL_FOR_IMAGE_MAPPING,
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING,
MODEL_FOR_IMAGE_TO_IMAGE_MAPPING,
MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
MODEL_FOR_KEYPOINT_DETECTION_MAPPING,
@@ -6292,6 +6295,7 @@
AutoModelForDocumentQuestionAnswering,
AutoModelForImageClassification,
AutoModelForImageSegmentation,
AutoModelForImageTextToText,
AutoModelForImageToImage,
AutoModelForInstanceSegmentation,
AutoModelForKeypointDetection,
4 changes: 4 additions & 0 deletions src/transformers/models/auto/__init__.py
@@ -74,6 +74,7 @@
"MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING",
"MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING",
"MODEL_FOR_VISION_2_SEQ_MAPPING",
"MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING",
"MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING",
"MODEL_MAPPING",
"MODEL_WITH_LM_HEAD_MAPPING",
@@ -119,6 +120,7 @@
"AutoModelWithLMHead",
"AutoModelForZeroShotImageClassification",
"AutoModelForZeroShotObjectDetection",
"AutoModelForImageTextToText",
]

try:
@@ -238,6 +240,7 @@
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
MODEL_FOR_IMAGE_MAPPING,
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING,
MODEL_FOR_IMAGE_TO_IMAGE_MAPPING,
MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
MODEL_FOR_KEYPOINT_DETECTION_MAPPING,
@@ -279,6 +282,7 @@
AutoModelForDocumentQuestionAnswering,
AutoModelForImageClassification,
AutoModelForImageSegmentation,
AutoModelForImageTextToText,
AutoModelForImageToImage,
AutoModelForInstanceSegmentation,
AutoModelForKeypointDetection,
36 changes: 36 additions & 0 deletions src/transformers/models/auto/modeling_auto.py
@@ -753,6 +753,32 @@
]
)

MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = OrderedDict(
Contributor: Should the existing classes be deleted from MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES, or do we keep them there?

Member (author): I just removed the ones that previously were in IGNORE_NON_AUTO_CONFIGURED. Won't deleting them from MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES be a problem for BC?

Member: Can we also add llava-next-video and video-llava? Those two can work with image+text inputs, as well as video+text.

[
("blip", "BlipForConditionalGeneration"),
("blip-2", "Blip2ForConditionalGeneration"),
("chameleon", "ChameleonForConditionalGeneration"),
("fuyu", "FuyuForCausalLM"),
("git", "GitForCausalLM"),
("idefics", "IdeficsForVisionText2Text"),
("idefics2", "Idefics2ForConditionalGeneration"),
("idefics3", "Idefics3ForConditionalGeneration"),
("instructblip", "InstructBlipForConditionalGeneration"),
("kosmos-2", "Kosmos2ForConditionalGeneration"),
("llava", "LlavaForConditionalGeneration"),
("llava_next", "LlavaNextForConditionalGeneration"),
("llava_onevision", "LlavaOnevisionForConditionalGeneration"),
("mllama", "MllamaForConditionalGeneration"),
("paligemma", "PaliGemmaForConditionalGeneration"),
("pix2struct", "Pix2StructForConditionalGeneration"),
("pixtral", "LlavaForConditionalGeneration"),
("qwen2_vl", "Qwen2VLForConditionalGeneration"),
("udop", "UdopForConditionalGeneration"),
("vipllava", "VipLlavaForConditionalGeneration"),
("vision-encoder-decoder", "VisionEncoderDecoderModel"),
]
)

MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict(
[
# Model for Masked LM mapping
@@ -1413,6 +1439,9 @@
CONFIG_MAPPING_NAMES, MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES
)
MODEL_FOR_VISION_2_SEQ_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES)
MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
)
MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES
)
@@ -1707,6 +1736,13 @@ class AutoModelForVision2Seq(_BaseAutoModelClass):
AutoModelForVision2Seq = auto_class_update(AutoModelForVision2Seq, head_doc="vision-to-text modeling")


class AutoModelForImageTextToText(_BaseAutoModelClass):
_model_mapping = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING


AutoModelForImageTextToText = auto_class_update(AutoModelForImageTextToText, head_doc="image-text-to-text modeling")


class AutoModelForAudioClassification(_BaseAutoModelClass):
_model_mapping = MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING

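Relating to the review thread above, here is a hedged sketch of how the new lazy mapping resolves a checkpoint's config to a concrete class, and of the backward-compatibility point that the entries are kept in MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES so both auto classes keep working (the kosmos-2 checkpoint is taken from the test changes below):

```python
# Hedged sketch, not part of the diff: the auto class picks the concrete model class
# from the checkpoint's config type via the lazy mapping added above.
from transformers import (
    MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING,
    AutoConfig,
    AutoModelForImageTextToText,
    AutoModelForVision2Seq,
)

config = AutoConfig.from_pretrained("microsoft/kosmos-2-patch14-224")
print(MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING[type(config)].__name__)  # Kosmos2ForConditionalGeneration

# Backward compatibility: kosmos-2 stays in the vision-to-text mapping as well,
# so both auto classes can still load the same checkpoint.
model_new = AutoModelForImageTextToText.from_pretrained("microsoft/kosmos-2-patch14-224")
model_old = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224")
```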
1 change: 1 addition & 0 deletions src/transformers/models/auto/processing_auto.py
@@ -99,6 +99,7 @@
("trocr", "TrOCRProcessor"),
("tvlt", "TvltProcessor"),
("tvp", "TvpProcessor"),
("udop", "UdopProcessor"),
("unispeech", "Wav2Vec2Processor"),
("unispeech-sat", "Wav2Vec2Processor"),
("video_llava", "VideoLlavaProcessor"),
10 changes: 10 additions & 0 deletions src/transformers/utils/dummy_pt_objects.py
@@ -707,6 +707,9 @@ def __init__(self, *args, **kwargs):
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING = None


MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING = None


MODEL_FOR_IMAGE_TO_IMAGE_MAPPING = None


@@ -874,6 +877,13 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])


class AutoModelForImageTextToText(metaclass=DummyObject):
_backends = ["torch"]

def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])


class AutoModelForImageToImage(metaclass=DummyObject):
_backends = ["torch"]

6 changes: 3 additions & 3 deletions tests/models/kosmos2/test_modeling_kosmos2.py
@@ -23,7 +23,7 @@
import numpy as np
import requests

from transformers import AutoModelForVision2Seq, AutoProcessor, Kosmos2Config
from transformers import AutoModelForImageTextToText, AutoProcessor, Kosmos2Config
from transformers.models.kosmos2.configuration_kosmos2 import Kosmos2TextConfig, Kosmos2VisionConfig
from transformers.testing_utils import IS_ROCM_SYSTEM, require_torch, require_vision, slow, torch_device
from transformers.utils import is_torch_available, is_vision_available
@@ -551,7 +551,7 @@ def test_snowman_image_captioning(self):
image.save("new_image.jpg")
image = Image.open("new_image.jpg")

model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device)
model = AutoModelForImageTextToText.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device)
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

prompt = "<grounding>An image of"
@@ -697,7 +697,7 @@ def test_snowman_image_captioning_batch(self):
image.save("new_image.jpg")
image = Image.open("new_image.jpg")

model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device)
model = AutoModelForImageTextToText.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device)

prompt = ["<grounding>Describe this image in detail:", "<grounding>An image of"]

4 changes: 0 additions & 4 deletions utils/check_repo.py
@@ -170,7 +170,6 @@
"ClapTextModelWithProjection",
"ClapAudioModel",
"ClapAudioModelWithProjection",
"Blip2ForConditionalGeneration",
"Blip2TextModelWithProjection",
"Blip2VisionModelWithProjection",
"Blip2QFormerModel",
@@ -181,7 +180,6 @@
"GitVisionModel",
"GraphormerModel",
"GraphormerForGraphClassification",
"BlipForConditionalGeneration",
"BlipForImageTextRetrieval",
"BlipForQuestionAnswering",
"BlipVisionModel",
@@ -245,7 +243,6 @@
"DetrForSegmentation",
"Pix2StructVisionModel",
"Pix2StructTextModel",
"Pix2StructForConditionalGeneration",
"ConditionalDetrForSegmentation",
"DPRReader",
"FlaubertForQuestionAnswering",
@@ -322,7 +319,6 @@
"SeamlessM4TCodeHifiGan",
"SeamlessM4TForSpeechToSpeech", # no auto class for speech-to-speech
"TvpForVideoGrounding",
"UdopForConditionalGeneration",
"SeamlessM4Tv2NARTextToUnitModel",
"SeamlessM4Tv2NARTextToUnitForConditionalGeneration",
"SeamlessM4Tv2CodeHifiGan",