Enable Sentence Transformer Inference with Intel Gaudi2 GPU Supported ('hpu') - Follow-up for #2557 (#2630)

Merged

Changes from 6 commits
43 changes: 40 additions & 3 deletions sentence_transformers/SentenceTransformer.py
@@ -20,6 +20,7 @@
 import math
 import queue
 import tempfile
+import copy

from . import __MODEL_HUB_ORGANIZATION__
from .evaluation import SentenceEvaluator
@@ -89,11 +90,13 @@ def __init__(
         token: Optional[Union[bool, str]] = None,
         use_auth_token: Optional[Union[bool, str]] = None,
         truncate_dim: Optional[int] = None,
+        padding: Union[str, bool] = True,
     ):
         # Note: self._load_sbert_model can also update `self.prompts` and `self.default_prompt_name`
         self.prompts = prompts or {}
         self.default_prompt_name = default_prompt_name
         self.truncate_dim = truncate_dim
+        self.padding = padding
         self._model_card_vars = {}
         self._model_card_text = None
         self._model_config = {}
@@ -315,6 +318,10 @@ def encode(
             ht.hpu.wrap_in_hpu_graph(self, disable_tensor_cache=True)
             self.is_hpu_graph_enabled = True

+            from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
+
+            adapt_transformers_to_gaudi()
+
         self.eval()
         if show_progress_bar is None:
             show_progress_bar = (
@@ -374,11 +381,42 @@ def encode(
         for start_index in trange(0, len(sentences), batch_size, desc="Batches", disable=not show_progress_bar):
             sentences_batch = sentences_sorted[start_index : start_index + batch_size]
             features = self.tokenize(sentences_batch)
+            if self.device.type == "hpu":
+                if "input_ids" in features:
+                    curr_tokenize_len = features["input_ids"].shape
+                    additional_pad_len = 2 ** math.ceil(math.log2(curr_tokenize_len[1])) - curr_tokenize_len[1]
+                    features["input_ids"] = torch.cat(
+                        (
+                            features["input_ids"],
+                            torch.ones((curr_tokenize_len[0], additional_pad_len), dtype=torch.int8),
+                        ),
+                        -1,
+                    )
+                    features["attention_mask"] = torch.cat(
+                        (
+                            features["attention_mask"],
+                            torch.zeros((curr_tokenize_len[0], additional_pad_len), dtype=torch.int8),
+                        ),
+                        -1,
+                    )
+                    if "token_type_ids" in features:
+                        features["token_type_ids"] = torch.cat(
+                            (
+                                features["token_type_ids"],
+                                torch.zeros((curr_tokenize_len[0], additional_pad_len), dtype=torch.int8),
+                            ),
+                            -1,
+                        )
+
             features = batch_to_device(features, device)
             features.update(extra_features)

             with torch.no_grad():
-                out_features = self.forward(features)
+                if self.device.type == "hpu":
+                    hpu_graph_out = self.forward(features)
+                    out_features = copy.deepcopy(hpu_graph_out)
+                else:
+                    out_features = self.forward(features)
Collaborator
Suggested change:

-                if self.device.type == "hpu":
-                    hpu_graph_out = self.forward(features)
-                    out_features = copy.deepcopy(hpu_graph_out)
-                else:
-                    out_features = self.forward(features)
+                out_features = self.forward(features)
+                if self.device.type == "hpu":
+                    out_features = copy.deepcopy(out_features)

I would prefer this; could you verify that it works? Admittedly, I'm not sure why you have to copy these.

Contributor Author
This code is needed to support test_encode_truncate() (under tests/test_sentence_transformer) from #2573. In the original implementation, `out_features = self.forward(features)` returns the very same dict as `features`: both names refer to a single object in memory, and forward() merely adds the "sentence_embedding" key in place. When the embeddings are later truncated via truncate_dim, HPU graph mode raises an error, because the captured graph keeps a reference to the original output.

I think what you changed here is good for simplicity.
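
To make the aliasing concrete, here is a minimal sketch of the behavior the author describes; the forward() stub and tensor shapes are illustrative, not code from this repository:

```python
import copy

import torch


def forward(features: dict) -> dict:
    # Mimics the original behavior: the output *is* the input dict,
    # mutated in place with one extra key.
    features["sentence_embedding"] = features["token_embeddings"].mean(dim=1)
    return features


features = {"token_embeddings": torch.randn(2, 4, 8)}
out_features = forward(features)
assert out_features is features  # one object, shared storage

# deepcopy gives out_features its own storage, so later mutation (e.g.
# truncating the embedding dimension) cannot clash with buffers that a
# captured HPU graph still references.
safe_out = copy.deepcopy(out_features)
assert safe_out is not out_features
```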

                 out_features["sentence_embedding"] = truncate_embeddings(
                     out_features["sentence_embedding"], self.truncate_dim
                 )
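
The encode() hunk above rounds every batch up to the next power-of-two sequence length, so the HPU sees only a small set of distinct tensor shapes and avoids recompiling its graph for every batch. A self-contained sketch of the same bucketing arithmetic (dtypes are matched to the inputs here for a portable run, whereas the PR pads with int8 tensors):

```python
import math

import torch


def pad_batch_to_power_of_two(features: dict) -> dict:
    """Right-pad (batch, seq_len) tensors so seq_len becomes the next power of two."""
    batch, seq_len = features["input_ids"].shape
    pad = 2 ** math.ceil(math.log2(seq_len)) - seq_len  # e.g. seq_len=60 -> pad=4
    if pad == 0:
        return features
    # Padded token positions get attention_mask=0, so they are ignored.
    features["input_ids"] = torch.cat(
        (features["input_ids"], torch.ones((batch, pad), dtype=features["input_ids"].dtype)), dim=-1
    )
    features["attention_mask"] = torch.cat(
        (features["attention_mask"], torch.zeros((batch, pad), dtype=features["attention_mask"].dtype)), dim=-1
    )
    return features


features = {
    "input_ids": torch.ones((2, 60), dtype=torch.long),
    "attention_mask": torch.ones((2, 60), dtype=torch.long),
}
print(pad_batch_to_power_of_two(features)["input_ids"].shape)  # torch.Size([2, 64])
```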
@@ -595,8 +633,7 @@ def tokenize(self, texts: Union[List[str], List[Dict], List[Tuple[str, str]]]):
"""
kwargs = {}
# HPU models reach optimal performance if the padding is not dynamic
if self.device.type == "hpu":
kwargs["padding"] = "max_length"
kwargs["padding"] = self.padding

try:
return self._first_module().tokenize(texts, **kwargs)
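Taken together, the SentenceTransformer.py changes make the padding strategy a constructor argument rather than a hard-coded device check. A hedged usage sketch — the model name is only an example, and actually running this requires Gaudi hardware with optimum-habana installed:

```python
from sentence_transformers import SentenceTransformer

# Static "max_length" padding keeps tensor shapes fixed across batches,
# which HPU graph mode prefers; the default padding=True remains the
# better choice on CPU/GPU.
model = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2",
    device="hpu",
    padding="max_length",
)

# On the first encode() call on HPU, encode() wraps the model in an HPU
# graph and applies adapt_transformers_to_gaudi() before inference.
embeddings = model.encode(["An HPU inference example", "A second sentence"])
print(embeddings.shape)
```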
7 changes: 6 additions & 1 deletion sentence_transformers/models/CLIPModel.py
@@ -3,6 +3,7 @@
 import transformers
 import torch
 from PIL import Image
+from sentence_transformers.util import get_device_name


class CLIPModel(nn.Module):
@@ -72,7 +73,11 @@ def tokenize(self, texts, padding: Union[str, bool] = True):
         encoding["pixel_values"] = image_features.pixel_values

         encoding["image_text_info"] = image_text_info
-        return encoding
+        device = get_device_name()
+        if device == "hpu":
+            return dict(encoding)
+        else:
+            return encoding

     def save(self, output_path: str):
         self.model.save_pretrained(output_path)
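The tokenize() change above converts the tokenizer output to a plain dict on HPU: BatchEncoding is a dict subclass that carries extra state, and dict(...) keeps just the tensor mapping. A minimal sketch of the same conversion outside the class (the CLIP checkpoint is illustrative):

```python
from transformers import AutoTokenizer

from sentence_transformers.util import get_device_name

tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
encoding = tokenizer(["a photo of a cat"], padding=True, return_tensors="pt")

# On HPU, downstream handling expects a plain mapping of tensors rather
# than the BatchEncoding wrapper object.
if get_device_name() == "hpu":
    encoding = dict(encoding)
```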
10 changes: 2 additions & 8 deletions tests/test_compute_embeddings.py
@@ -5,7 +5,6 @@
 import numpy as np

 from sentence_transformers import SentenceTransformer
-from sentence_transformers.util import get_device_name


def test_encode_token_embeddings(paraphrase_distilroberta_base_v1_model: SentenceTransformer) -> None:
@@ -24,13 +23,8 @@ def test_encode_token_embeddings(paraphrase_distilroberta_base_v1_model: SentenceTransformer) -> None:
     emb = model.encode(sent, output_value="token_embeddings", batch_size=2)
     assert len(emb) == len(sent)

-    device = get_device_name()
-    if device == "hpu":
-        for s, e in zip(sent, emb):
-            assert len(model.tokenize([s])["input_ids"][0]) == model.get_max_seq_length()
-    else:
-        for s, e in zip(sent, emb):
-            assert len(model.tokenize([s])["input_ids"][0]) == e.shape[0]
+    for s, e in zip(sent, emb):
+        assert len(model.tokenize([s])["input_ids"][0]) == e.shape[0]


def test_encode_single_sentences(paraphrase_distilroberta_base_v1_model: SentenceTransformer) -> None: