Add transformers v4.45 support #902

Merged: 25 commits merged on Sep 26, 2024. Diff below shows changes from 12 commits.

3 changes: 1 addition & 2 deletions .github/workflows/test_ipex.yml
@@ -41,8 +41,7 @@ jobs:
pip install torch==${{ matrix.ipex-version }} --extra-index-url https://download.pytorch.org/whl/cpu
pip install intel_extension_for_pytorch==${{ matrix.ipex-version }}
pip install Pillow parameterized
pip install transformers[testing]==${{ matrix.transformers-version }}
pip install .[ipex]
pip install .[ipex,tests]
- name: Test with Pytest
run: |
pytest tests/ipex/
2 changes: 0 additions & 2 deletions .github/workflows/test_openvino.yml
@@ -21,7 +21,6 @@ jobs:
fail-fast: false
matrix:
python-version: ["3.8", "3.12"]
transformers-version: ["4.36.0", "4.44.*"]
os: [ubuntu-latest]

runs-on: ${{ matrix.os }}
@@ -38,7 +37,6 @@ jobs:
# install PyTorch CPU version to avoid installing CUDA packages on GitHub runner without GPU
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
pip install .[openvino,openvino-tokenizers,tests,diffusers] onnxruntime
pip install transformers==${{ matrix.transformers-version }}

- name: Test with Pytest
env:
1 change: 0 additions & 1 deletion .github/workflows/test_openvino_basic.yml
@@ -47,7 +47,6 @@ jobs:
# Install openvino manually to prevent dependency conflicts when .[openvino] pins
# optimum or transformers to a specific version
pip install .[tests] openvino
pip install transformers==${{ matrix.transformers-version }}

- name: Pip freeze
run: pip freeze
16 changes: 15 additions & 1 deletion optimum/exporters/openvino/convert.py
@@ -20,6 +20,7 @@
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union

import onnx
from transformers.generation import GenerationMixin
from transformers.utils import is_tf_available, is_torch_available

from openvino.runtime import Model, save_model
@@ -40,6 +41,7 @@
_torch_version,
_transformers_version,
compare_versions,
is_transformers_version,
)
from optimum.utils import DEFAULT_DUMMY_SHAPES, is_diffusers_available
from optimum.utils.save_utils import maybe_save_preprocessors
@@ -379,7 +381,7 @@ def ts_patched_forward(*args, **kwargs):
if stateful:
# cannot raise because stateful is enabled by default and it would break backward compatibility for models that couldn't convert to OV directly
# TODO: Implement stateful for ONNX path as well, not doing it right now because of lack of validation
logger.warn(
logger.warning(
"[ WARNING ] Making stateful models is not supported when exporting to ONNX as an intermediate step. "
"A stateless model will be exported instead. It may result in sub-optimal inference performance."
"Provide a model that can be converted to OpenVINO without fallback to ONNX conversion path."
@@ -622,6 +624,18 @@ def export_from_model(

files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_export_configs.keys()]
elif library_name != "diffusers":
if is_transformers_version(">=", "4.44.99"):
misplaced_generation_parameters = model.config._get_non_default_generation_parameters()
if isinstance(model, GenerationMixin) and len(misplaced_generation_parameters) > 0:
logger.warning(
"Moving the following attributes in the config to the generation config: "
f"{misplaced_generation_parameters}. You are seeing this warning because you've set "
"generation parameters in the model config, as opposed to in the generation config.",
)
for param_name, param_value in misplaced_generation_parameters.items():
setattr(model.generation_config, param_name, param_value)
setattr(model.config, param_name, None)

# Saving the model config and preprocessor as this is needed sometimes.
model.config.save_pretrained(output)
generation_config = getattr(model, "generation_config", None)
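
Note on the export_from_model change above: the added block can be read as a standalone helper. Below is a minimal sketch of the same migration, assuming transformers >= 4.45 (where PretrainedConfig._get_non_default_generation_parameters is available) and a generation-capable model; the checkpoint name in the usage line is only an illustrative placeholder.

from transformers import AutoModelForCausalLM
from transformers.generation import GenerationMixin


def move_misplaced_generation_parameters(model):
    # Generation parameters stored on the model config (e.g. max_length, do_sample) are
    # copied onto model.generation_config and cleared from model.config, so that only the
    # generation config carries them after export.
    misplaced = model.config._get_non_default_generation_parameters()
    if isinstance(model, GenerationMixin) and misplaced:
        for name, value in misplaced.items():
            setattr(model.generation_config, name, value)
            setattr(model.config, name, None)
    return model


# Illustrative usage; "gpt2" is a placeholder checkpoint.
model = move_misplaced_generation_parameters(AutoModelForCausalLM.from_pretrained("gpt2"))
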
78 changes: 51 additions & 27 deletions optimum/exporters/openvino/model_patcher.py
@@ -57,7 +57,7 @@ def patch_model_with_bettertransformer(model):
return model

if is_transformers_version("<", "4.36") or is_torch_version("<", "2.1.1"):
log.warn(
log.warning(
COLOR_RED
+ "[WARNING] For good performance with stateful models, transformers>=4.36.2 and PyTorch>=2.1.1 are required. "
f"This Python environment has Transformers {_transformers_version} and PyTorch {_torch_version}. "
@@ -75,7 +75,7 @@ def patch_model_with_bettertransformer(model):
display_version = (
_openvino_version.split("-")[0] if is_openvino_version("<=", "2024.0.0-14509") else _openvino_version
)
log.warn(
log.warning(
COLOR_RED
+ f"[WARNING] Stateful models are not supported for Llama, Gemma and GPTBigCode with Transformers "
f"{_transformers_version} and OpenVINO {display_version}. For good performance, consider using a nightly OpenVINO build: "
@@ -93,7 +93,7 @@ def patch_model_with_bettertransformer(model):
try:
model = model.to_bettertransformer()
except Exception as e:
log.warn(
log.warning(
f"Cannot apply model.to_bettertransformer because of the exception:\n{e}."
" Usage model with stateful=True may be non-effective if model does not contain torch.functional.scaled_dot_product_attention"
)
@@ -168,7 +168,8 @@ def __enter__(self):
layer.block_sparse_moe.forward = types.MethodType(
_mixtral_sparse_moe_block_forward, layer.block_sparse_moe
)
_reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb)
if is_transformers_version("<", "4.44.99"):
_reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb)

def __exit__(self, exc_type, exc_value, traceback):
super().__exit__(exc_type, exc_value, traceback)
@@ -2145,6 +2146,7 @@ def _persimmon_self_attn_sdpa_forward(
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
from transformers.models.persimmon.modeling_persimmon import apply_rotary_pos_emb

@@ -2170,25 +2172,42 @@ def _persimmon_self_attn_sdpa_forward(
value_states = value_states.transpose(1, 2)
key_states = key_states.transpose(1, 2)

kv_seq_len = key_states.shape[-2]
if past_key_value is not None:
if self.layer_idx is None:
raise ValueError(
f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
"for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
"with a layer index."
if is_transformers_version("<", "4.44.99"):
kv_seq_len = key_states.shape[-2]
if past_key_value is not None:
if self.layer_idx is None:
raise ValueError(
f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
"for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
"with a layer index."
)
kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
else:
if position_embeddings is None:
log.warning(
"The attention layers in this model are transitioning from computing the RoPE embeddings internally "
"through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
"`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
"removed and `position_embeddings` will be mandatory."
)
kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
cos, sin = self.rotary_emb(value_states, position_ids)
else:
cos, sin = position_embeddings

if is_transformers_version("<", "4.44.99"):
rotary_ndims = self.rotary_emb.dim
else:
rotary_ndims = self.rotary_ndims

# Partial rotary embedding
query_rot, query_pass = (
query_states[..., : self.rotary_emb.dim],
query_states[..., self.rotary_emb.dim :],
query_states[..., :rotary_ndims],
query_states[..., rotary_ndims:],
)
key_rot, key_pass = (
key_states[..., : self.rotary_emb.dim],
key_states[..., self.rotary_emb.dim :],
key_states[..., :rotary_ndims],
key_states[..., rotary_ndims:],
)
# [batch_size, seq_length, num_heads, head_dim // config.partial_rotary_factor]
query_rot, key_rot = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, position_ids)
@@ -2202,7 +2221,7 @@ def _persimmon_self_attn_sdpa_forward(
cache_kwargs = {
"sin": sin,
"cos": cos,
"partial_rotation_size": self.rotary_emb.dim,
"partial_rotation_size": rotary_ndims,
"cache_position": cache_position,
}
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
@@ -2238,7 +2257,8 @@ def __enter__(self):
orig_self_attn_fwd = layer.self_attn.forward
layer.self_attn.forward = types.MethodType(_persimmon_self_attn_sdpa_forward, layer.self_attn)
layer.self_attn._orig_forward = orig_self_attn_fwd
_reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb)
if is_transformers_version("<", "4.44.99"):
_reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb)

def __exit__(self, exc_type, exc_value, traceback):
super().__exit__(exc_type, exc_value, traceback)
@@ -2387,29 +2407,33 @@ def __exit__(self, exc_type, exc_value, traceback):
class RotaryEmbPatcher(DecoderModelPatcher):
def __enter__(self):
super().__enter__()
for layer in self._model.model.layers:
_reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb)
if is_transformers_version("<", "4.44.99"):
for layer in self._model.model.layers:
_reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb)


class FalconModelPatcher(DecoderModelPatcher):
def __enter__(self):
super().__enter__()
for layer in self._model.transformer.h:
_reinitialize_cos_sin_cached_fp32(layer.self_attention.rotary_emb)
if is_transformers_version("<", "4.44.99"):
for layer in self._model.transformer.h:
_reinitialize_cos_sin_cached_fp32(layer.self_attention.rotary_emb)


class GptNeoxModelPatcher(DecoderModelPatcher):
def __enter__(self):
super().__enter__()
for layer in self._model.gpt_neox.layers:
_reinitialize_cos_sin_cached_fp32(layer.attention.rotary_emb)
if is_transformers_version("<", "4.44.99"):
for layer in self._model.gpt_neox.layers:
_reinitialize_cos_sin_cached_fp32(layer.attention.rotary_emb)


class GptNeoxJapaneseModelPatcher(DecoderModelPatcher):
def __enter__(self):
super().__enter__()
for layer in self._model.gpt_neox_japanese.layers:
_reinitialize_cos_sin_cached_fp32(layer.attention.rotary_emb)
if is_transformers_version("<", "4.44.99"):
for layer in self._model.gpt_neox_japanese.layers:
_reinitialize_cos_sin_cached_fp32(layer.attention.rotary_emb)


class Gemma2ModelPatcher(LlamaModelPatcher):
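
Note on the Persimmon patch above: the new position_embeddings argument follows the transformers 4.45 RoPE transition described in the added warning, where cos/sin are computed once per model forward instead of inside each attention layer. A condensed sketch of how the patched forward resolves cos/sin across versions (names mirror the diff; the import path for is_transformers_version is assumed, and this is illustrative rather than a drop-in replacement for the patcher):

from optimum.intel.utils.import_utils import is_transformers_version  # import path assumed


def resolve_rotary_cos_sin(attn, key_states, value_states, position_ids, past_key_value, position_embeddings=None):
    if is_transformers_version("<", "4.44.99"):
        # Pre-4.45 path: the attention layer owns rotary_emb and derives the usable
        # sequence length from the cache before computing cos/sin itself.
        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, attn.layer_idx)
        return attn.rotary_emb(value_states, seq_len=kv_seq_len)
    if position_embeddings is None:
        # Transitional 4.45 path: cos/sin can still be derived from position_ids,
        # but passing precomputed position_embeddings is preferred.
        return attn.rotary_emb(value_states, position_ids)
    return position_embeddings  # (cos, sin) computed once in the model forward

The partial-rotary width is resolved the same way: rotary_emb.dim before 4.45 and self.rotary_ndims from 4.45 on, which is why the cache kwargs now pass the resolved rotary_ndims.
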
4 changes: 2 additions & 2 deletions optimum/exporters/openvino/stateful.py
@@ -151,7 +151,7 @@ def make_stateful(
shape[0] = num_beams_and_batch
input.get_node().set_partial_shape(shape)
else:
log.warn(f"Rank of {input.get_any_name()} input of the model is not 2, batch size is not set")
log.warning(f"Rank of {input.get_any_name()} input of the model is not 2, batch size is not set")

for kv_name_pair in zip(key_value_input_names, key_value_output_names):
input_output_map[kv_name_pair[0]] = kv_name_pair[1]
@@ -176,7 +176,7 @@ def ensure_stateful_is_available(warn=True):
"""
if is_openvino_version("<", "2023.3"):
if warn:
log.warn(
log.warning(
f"Could not create or use stateful model when using old version of openvino=={_openvino_version}. It may result in sub-optimal inference performance."
"Install openvino>=2023.3.0."
)
13 changes: 12 additions & 1 deletion optimum/intel/neural_compressor/trainer.py
@@ -271,7 +271,18 @@ def _inner_training_loop(
if not delay_optimizer_creation:
self.create_optimizer_and_scheduler(num_training_steps=max_steps)

self.state = TrainerState()
if is_transformers_version(">=", "4.44.99"):
from transformers.trainer_callback import ExportableState

self.state = TrainerState(
stateful_callbacks=[
cb for cb in self.callback_handler.callbacks + [self.control] if isinstance(cb, ExportableState)
]
)

else:
self.state = TrainerState()

self.state.is_hyper_param_search = trial is not None
self.state.train_batch_size = self._train_batch_size

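
Note on the trainer change above: from transformers 4.45 (gated with the 4.44.99 sentinel, presumably so that 4.45 pre-releases also match), TrainerState accepts stateful_callbacks, letting callbacks that implement ExportableState have their state saved and restored with checkpoints. A minimal sketch of the same construction outside the trainer, where callbacks and control stand in for self.callback_handler.callbacks and self.control:

from transformers.trainer_callback import TrainerState

from optimum.intel.utils.import_utils import is_transformers_version  # import path assumed


def build_trainer_state(callbacks, control):
    if is_transformers_version(">=", "4.44.99"):
        # ExportableState only exists in recent transformers, hence the lazy import.
        from transformers.trainer_callback import ExportableState

        return TrainerState(
            stateful_callbacks=[cb for cb in callbacks + [control] if isinstance(cb, ExportableState)]
        )
    return TrainerState()
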
38 changes: 18 additions & 20 deletions optimum/intel/openvino/modeling_base.py
@@ -34,7 +34,7 @@
from optimum.modeling_base import FROM_PRETRAINED_START_DOCSTRING, OptimizedModel

from ...exporters.openvino import export, main_export
from ..utils.import_utils import is_nncf_available
from ..utils.import_utils import is_nncf_available, is_transformers_version
from ..utils.modeling_utils import _find_files_matching_pattern
from .configuration import OVConfig, OVDynamicQuantizationConfig, OVWeightQuantizationConfig
from .utils import (
@@ -127,11 +127,25 @@ def __init__(

self.output_names = output_names
self.output_dtypes = output_dtypes

self.model = model
self.request = None if not self._compile_only else self.model

generation_config = kwargs.get("generation_config", None)
if self.can_generate():
self.generation_config = kwargs.get("generation_config", GenerationConfig.from_model_config(config))
self.generation_config = generation_config or GenerationConfig.from_model_config(config)

if is_transformers_version(">=", "4.44.99"):
misplaced_generation_parameters = self.config._get_non_default_generation_parameters()
if len(misplaced_generation_parameters) > 0:
logger.warning(
"Moving the following attributes in the config to the generation config: "
f"{misplaced_generation_parameters}. You are seeing this warning because you've set "
"generation parameters in the model config, as opposed to in the generation config.",
)
for param_name, param_value in misplaced_generation_parameters.items():
setattr(self.generation_config, param_name, param_value)
setattr(self.config, param_name, None)

else:
self.generation_config = None

@@ -352,19 +366,6 @@ def _from_pretrained(
model_save_dir=model_cache_path.parent,
)

try:
generation_config = GenerationConfig.from_pretrained(
model_id,
token=token,
revision=revision,
subfolder=subfolder,
force_download=force_download,
cache_dir=cache_dir,
)
kwargs["generation_config"] = generation_config
except Exception:
pass

return cls(
model,
config=config,
@@ -583,7 +584,6 @@ def _from_transformers(
library_name=cls._library_name,
)

config.save_pretrained(save_dir_path)
return cls._from_pretrained(
model_id=save_dir_path,
config=config,
@@ -712,9 +712,7 @@ def can_generate(self) -> bool:
"""
Returns whether this model can generate sequences with `.generate()`.
"""
if isinstance(self, GenerationMixin):
return True
return False
return isinstance(self, GenerationMixin)

def _inference(self, inputs):
try:
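
Note on the OVBaseModel changes above: can_generate now reduces to an isinstance check on GenerationMixin, and a generation config passed through kwargs takes precedence over one derived from the model config (with the same transformers >= 4.45 parameter migration as in convert.py). A short sketch of that precedence logic, illustrative only:

from transformers import GenerationConfig
from transformers.generation import GenerationMixin


def can_generate(model) -> bool:
    # Mirrors the simplified OVBaseModel.can_generate above.
    return isinstance(model, GenerationMixin)


def pick_generation_config(model, config, generation_config=None):
    # An explicitly supplied generation config wins; otherwise one is derived from the
    # model config. Non-generative models keep generation_config set to None.
    if not can_generate(model):
        return None
    return generation_config or GenerationConfig.from_model_config(config)
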