add static shape support for vision_encoder_decoder generation if dec… #834

Merged · 2 commits · Apr 23, 2024
2 changes: 2 additions & 0 deletions optimum/habana/transformers/generation/utils.py
@@ -510,6 +510,8 @@ def generate(
         generation_config = copy.deepcopy(generation_config)
         if generation_config.static_shapes is None:
             generation_config.static_shapes = self.config.model_type in MODELS_OPTIMIZED_WITH_STATIC_SHAPES
+            if self.config.model_type == "vision-encoder-decoder":
+                generation_config.static_shapes = self.config.decoder.model_type in MODELS_OPTIMIZED_WITH_STATIC_SHAPES
         self.generation_config.static_shapes = generation_config.static_shapes
         if generation_config.ignore_eos is None:
             generation_config.ignore_eos = kwargs.get("ignore_eos", lazy_mode)
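Note on the added check: a top-level "vision-encoder-decoder" config is not itself listed in MODELS_OPTIMIZED_WITH_STATIC_SHAPES, so the new branch decides based on the decoder's model type instead. A minimal sketch of the relevant config, using the vit-gpt2 checkpoint exercised in the tests below; the inline comments are assumptions about that checkpoint, not part of this diff:

from transformers import AutoConfig

config = AutoConfig.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
print(config.model_type)          # "vision-encoder-decoder": not in the optimized list itself
print(config.decoder.model_type)  # "gpt2": optimized for static shapes, so the new
                                  # branch sets generation_config.static_shapes = True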
2 changes: 2 additions & 0 deletions optimum/habana/transformers/modeling_utils.py
@@ -131,6 +131,7 @@
     gaudi_T5ForConditionalGeneration_prepare_inputs_for_generation,
     gaudi_T5LayerSelfAttention_forward,
     gaudi_T5Stack_forward,
+    gaudi_VisionEncoderDecoderModel_prepare_inputs_for_generation,
     gaudi_vit_self_attention_forward,
     gaudi_wav2vec2_encoder_forward,
     gaudi_wav2vec2_forward,
@@ -363,3 +364,4 @@ def adapt_transformers_to_gaudi():
     transformers.models.speecht5.modeling_speecht5.SpeechT5SpeechDecoderPrenet.forward = (
         gaudi_SpeechT5SpeechDecoderPrenet_forward
     )
+    transformers.models.vision_encoder_decoder.modeling_vision_encoder_decoder.VisionEncoderDecoderModel.prepare_inputs_for_generation = gaudi_VisionEncoderDecoderModel_prepare_inputs_for_generation
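A minimal sketch of how the patch above takes effect at runtime; it assumes only that adapt_transformers_to_gaudi() runs before the model class is used (the checkpoint name is borrowed from the tests below):

from transformers import VisionEncoderDecoderModel

from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi

# Applies all Gaudi patches, including the override registered above.
adapt_transformers_to_gaudi()

model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
# The class method now resolves to the Gaudi implementation.
assert (
    model.prepare_inputs_for_generation.__func__.__name__
    == "gaudi_VisionEncoderDecoderModel_prepare_inputs_for_generation"
)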
3 changes: 3 additions & 0 deletions optimum/habana/transformers/models/__init__.py
@@ -138,6 +138,9 @@
     gaudi_T5LayerSelfAttention_forward,
     gaudi_T5Stack_forward,
 )
+from .vision_encoder_decoder import (
+    gaudi_VisionEncoderDecoderModel_prepare_inputs_for_generation,
+)
 from .vit import gaudi_vit_self_attention_forward
 from .wav2vec2 import (
     _gaudi_wav2vec2_compute_mask_indices,
3 changes: 3 additions & 0 deletions optimum/habana/transformers/models/vision_encoder_decoder/__init__.py
@@ -0,0 +1,3 @@
+from .modeling_vision_encoder_decoder import (
+    gaudi_VisionEncoderDecoderModel_prepare_inputs_for_generation,
+)
32 changes: 32 additions & 0 deletions optimum/habana/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py
@@ -0,0 +1,32 @@
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+def gaudi_VisionEncoderDecoderModel_prepare_inputs_for_generation(
+    self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs
+):
+    """
+    Copied from VisionEncoderDecoderModel.prepare_inputs_for_generation: https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py#L645
+    The only differences are:
+    - add token idx support
+    """
+    decoder_inputs = self.decoder.prepare_inputs_for_generation(
+        input_ids,
+        past_key_values=past_key_values,
+        attention_mask=kwargs.get("decoder_attention_mask", None),
+        token_idx=kwargs.get("token_idx", None),
+    )
+    decoder_attention_mask = decoder_inputs["attention_mask"] if "attention_mask" in decoder_inputs else None
+    input_dict = {
+        "attention_mask": attention_mask,
+        "decoder_attention_mask": decoder_attention_mask,
+        "decoder_input_ids": decoder_inputs["input_ids"],
+        "encoder_outputs": encoder_outputs,
+        "past_key_values": decoder_inputs["past_key_values"],
+        "use_cache": use_cache,
+        "decoder_position_ids": decoder_inputs.get("position_ids", None),
+        "decoder_token_idx": decoder_inputs.get("token_idx", None),
+    }
+    return input_dict
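A minimal sketch of what the new function returns when token_idx is threaded through. The stub decoder below is a hypothetical stand-in (not part of this PR) for a Gaudi-patched decoder such as GPT-2 whose own prepare_inputs_for_generation accepts token_idx:

import torch

from optimum.habana.transformers.models import (
    gaudi_VisionEncoderDecoderModel_prepare_inputs_for_generation,
)


class _StubDecoder:
    # Hypothetical stand-in for a decoder that already understands token_idx.
    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, token_idx=None):
        return {"input_ids": input_ids, "past_key_values": past_key_values, "token_idx": token_idx}


class _StubModel:
    decoder = _StubDecoder()
    prepare_inputs_for_generation = gaudi_VisionEncoderDecoderModel_prepare_inputs_for_generation


# With static shapes, decoder input ids are padded to full length up front and
# token_idx marks the position currently being written.
inputs = _StubModel().prepare_inputs_for_generation(
    torch.zeros((1, 128), dtype=torch.long),
    token_idx=torch.tensor(5),
)
assert int(inputs["decoder_token_idx"]) == 5  # surfaced for the decoder's forward pass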
Binary file added tests/resource/image-captioning-example.png
57 changes: 57 additions & 0 deletions tests/test_pipeline.py
@@ -0,0 +1,57 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from unittest import TestCase
+
+import torch
+from habana_frameworks.torch.hpu import wrap_in_hpu_graph
+from transformers import pipeline
+
+from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
+
+
+class GaudiPipelineTester(TestCase):
+    def _test_image_to_text(self, model, expected_result):
+        adapt_transformers_to_gaudi()
+        MODEL_DTYPE_LIST = [torch.bfloat16, torch.float32]
+        generate_kwargs = {
+            "lazy_mode": True,
+            "hpu_graphs": True,
+            "max_new_tokens": 128,
+            "ignore_eos": False,
+        }
+        image = "./tests/resource/image-captioning-example.png"
+        for model_dtype in MODEL_DTYPE_LIST:
+            generator = pipeline(
+                "image-to-text",
+                model=model,
+                torch_dtype=model_dtype,
+                device="hpu",
+            )
+            generator.model = wrap_in_hpu_graph(generator.model)
+            for i in range(3):
+                output = generator(image, generate_kwargs=generate_kwargs)
+                self.assertTrue(output[0]["generated_text"].startswith(expected_result))
+
+    def test_image_to_text_blip(self):
+        model = "Salesforce/blip-image-captioning-base"
+        expected_result = "a soccer player is playing a game on the app"
+        self._test_image_to_text(model, expected_result)
+
+    def test_image_to_text_vit(self):
+        model = "nlpconnect/vit-gpt2-image-captioning"
+        expected_result = "a soccer game with a player jumping to catch"
+        self._test_image_to_text(model, expected_result)