add static shape support for vision_encoder_decoder generation if dec… #834

Merged · 2 commits · Apr 23, 2024
2 changes: 2 additions & 0 deletions optimum/habana/transformers/generation/utils.py
@@ -510,6 +510,8 @@ def generate(
         generation_config = copy.deepcopy(generation_config)
         if generation_config.static_shapes is None:
             generation_config.static_shapes = self.config.model_type in MODELS_OPTIMIZED_WITH_STATIC_SHAPES
+            if self.config.model_type == "vision-encoder-decoder":
+                generation_config.static_shapes = self.config.decoder.model_type in MODELS_OPTIMIZED_WITH_STATIC_SHAPES
         self.generation_config.static_shapes = generation_config.static_shapes
         if generation_config.ignore_eos is None:
             generation_config.ignore_eos = kwargs.get("ignore_eos", lazy_mode)
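Note on the added check: a top-level "vision-encoder-decoder" config is not itself listed in MODELS_OPTIMIZED_WITH_STATIC_SHAPES, so the new branch decides based on the decoder's model type instead. A minimal sketch of the relevant config, using the vit-gpt2 checkpoint exercised in the tests below; the inline comments are assumptions about that checkpoint, not part of this diff:

from transformers import AutoConfig

config = AutoConfig.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
print(config.model_type)          # "vision-encoder-decoder": not in the optimized list itself
print(config.decoder.model_type)  # "gpt2": optimized for static shapes, so the new
                                  # branch sets generation_config.static_shapes = True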
2 changes: 2 additions & 0 deletions optimum/habana/transformers/modeling_utils.py
@@ -131,6 +131,7 @@
     gaudi_T5ForConditionalGeneration_prepare_inputs_for_generation,
     gaudi_T5LayerSelfAttention_forward,
     gaudi_T5Stack_forward,
+    gaudi_VisionEncoderDecoderModel_prepare_inputs_for_generation,
     gaudi_vit_self_attention_forward,
     gaudi_wav2vec2_encoder_forward,
     gaudi_wav2vec2_forward,
@@ -363,3 +364,4 @@ def adapt_transformers_to_gaudi():
     transformers.models.speecht5.modeling_speecht5.SpeechT5SpeechDecoderPrenet.forward = (
         gaudi_SpeechT5SpeechDecoderPrenet_forward
     )
+    transformers.models.vision_encoder_decoder.modeling_vision_encoder_decoder.VisionEncoderDecoderModel.prepare_inputs_for_generation = gaudi_VisionEncoderDecoderModel_prepare_inputs_for_generation
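A minimal sketch of how the patch above takes effect at runtime; it assumes only that adapt_transformers_to_gaudi() runs before the model class is used (the checkpoint name is borrowed from the tests below):

from transformers import VisionEncoderDecoderModel

from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi

# Applies all Gaudi patches, including the override registered above.
adapt_transformers_to_gaudi()

model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
# The class method now resolves to the Gaudi implementation.
assert (
    model.prepare_inputs_for_generation.__func__.__name__
    == "gaudi_VisionEncoderDecoderModel_prepare_inputs_for_generation"
)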
3 changes: 3 additions & 0 deletions optimum/habana/transformers/models/__init__.py
@@ -138,6 +138,9 @@
     gaudi_T5LayerSelfAttention_forward,
     gaudi_T5Stack_forward,
 )
+from .vision_encoder_decoder import (
+    gaudi_VisionEncoderDecoderModel_prepare_inputs_for_generation,
+)
 from .vit import gaudi_vit_self_attention_forward
 from .wav2vec2 import (
     _gaudi_wav2vec2_compute_mask_indices,
3 changes: 3 additions & 0 deletions optimum/habana/transformers/models/vision_encoder_decoder/__init__.py
@@ -0,0 +1,3 @@
+from .modeling_vision_encoder_decoder import (
+    gaudi_VisionEncoderDecoderModel_prepare_inputs_for_generation,
+)
32 changes: 32 additions & 0 deletions optimum/habana/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py
@@ -0,0 +1,32 @@
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+def gaudi_VisionEncoderDecoderModel_prepare_inputs_for_generation(
+    self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs
+):
+    """
+    Copied from VisionEncoderDecoderModel.prepare_inputs_for_generation: https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py#L645
+    The only differences are:
+    - add token idx support
+    """
+    decoder_inputs = self.decoder.prepare_inputs_for_generation(
+        input_ids,
+        past_key_values=past_key_values,
+        attention_mask=kwargs.get("decoder_attention_mask", None),
+        token_idx=kwargs.get("token_idx", None),
+    )
+    decoder_attention_mask = decoder_inputs["attention_mask"] if "attention_mask" in decoder_inputs else None
+    input_dict = {
+        "attention_mask": attention_mask,
+        "decoder_attention_mask": decoder_attention_mask,
+        "decoder_input_ids": decoder_inputs["input_ids"],
+        "encoder_outputs": encoder_outputs,
+        "past_key_values": decoder_inputs["past_key_values"],
+        "use_cache": use_cache,
+        "decoder_position_ids": decoder_inputs.get("position_ids", None),
+        "decoder_token_idx": decoder_inputs.get("token_idx", None),
+    }
+    return input_dict
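A minimal sketch of what the new function returns when token_idx is threaded through. The stub decoder below is a hypothetical stand-in (not part of this PR) for a Gaudi-patched decoder such as GPT-2 whose own prepare_inputs_for_generation accepts token_idx:

import torch

from optimum.habana.transformers.models import (
    gaudi_VisionEncoderDecoderModel_prepare_inputs_for_generation,
)


class _StubDecoder:
    # Hypothetical stand-in for a decoder that already understands token_idx.
    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, token_idx=None):
        return {"input_ids": input_ids, "past_key_values": past_key_values, "token_idx": token_idx}


class _StubModel:
    decoder = _StubDecoder()
    prepare_inputs_for_generation = gaudi_VisionEncoderDecoderModel_prepare_inputs_for_generation


# With static shapes, decoder input ids are padded to full length up front and
# token_idx marks the position currently being written.
inputs = _StubModel().prepare_inputs_for_generation(
    torch.zeros((1, 128), dtype=torch.long),
    token_idx=torch.tensor(5),
)
assert int(inputs["decoder_token_idx"]) == 5  # surfaced for the decoder's forward pass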
Binary file added tests/resource/image-captioning-example.png
57 changes: 57 additions & 0 deletions tests/test_pipeline.py
@@ -0,0 +1,57 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from unittest import TestCase
+
+import torch
+from habana_frameworks.torch.hpu import wrap_in_hpu_graph
+from transformers import pipeline
+
+from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
+
+
+class GaudiPipelineTester(TestCase):
+    def _test_image_to_text(self, model, expected_result):
+        adapt_transformers_to_gaudi()
+        MODEL_DTYPE_LIST = [torch.bfloat16, torch.float32]
+        generate_kwargs = {
+            "lazy_mode": True,
+            "hpu_graphs": True,
+            "max_new_tokens": 128,
+            "ignore_eos": False,
+        }
+        image = "./tests/resource/image-captioning-example.png"
+        for model_dtype in MODEL_DTYPE_LIST:
+            generator = pipeline(
+                "image-to-text",
+                model=model,
+                torch_dtype=model_dtype,
+                device="hpu",
+            )
+            generator.model = wrap_in_hpu_graph(generator.model)
+            for i in range(3):
+                output = generator(image, generate_kwargs=generate_kwargs)
+                self.assertTrue(output[0]["generated_text"].startswith(expected_result))
+
+    def test_image_to_text_blip(self):
+        model = "Salesforce/blip-image-captioning-base"
+        expected_result = "a soccer player is playing a game on the app"
+        self._test_image_to_text(model, expected_result)
+
+    def test_image_to_text_vit(self):
+        model = "nlpconnect/vit-gpt2-image-captioning"
+        expected_result = "a soccer game with a player jumping to catch"
+        self._test_image_to_text(model, expected_result)