From a042720fdff19fe64cd479538cef023b54a26105 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Mon, 16 Sep 2024 18:33:40 +0200 Subject: [PATCH 01/23] Add transformers v4.45 support --- optimum/exporters/openvino/convert.py | 13 +++++ optimum/intel/openvino/modeling_base.py | 29 +++++----- .../intel/openvino/modeling_base_seq2seq.py | 55 +++++++++++++------ optimum/intel/openvino/modeling_decoder.py | 38 +++++++------ setup.py | 6 +- 5 files changed, 91 insertions(+), 50 deletions(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index a0541e8bc..98a8b680e 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -39,6 +39,7 @@ _torch_version, _transformers_version, compare_versions, + is_transformers_version, ) from optimum.utils import DEFAULT_DUMMY_SHAPES, is_diffusers_available from optimum.utils.save_utils import maybe_save_preprocessors @@ -615,6 +616,18 @@ def export_from_model( logging.disable(logging.NOTSET) if library_name != "diffusers": + if is_transformers_version(">=", "4.44.99"): + misplaced_generation_parameters = model.config._get_non_default_generation_parameters() + if model.can_generate() and len(misplaced_generation_parameters) > 0: + logger.warning( + "Moving the following attributes in the config to the generation config: " + f"{misplaced_generation_parameters}. You are seeing this warning because you've set " + "generation parameters in the model config, as opposed to in the generation config.", + ) + for param_name, param_value in misplaced_generation_parameters.items(): + setattr(model.generation_config, param_name, param_value) + setattr(model.config, param_name, None) + # Saving the model config and preprocessor as this is needed sometimes. model.config.save_pretrained(output) generation_config = getattr(model, "generation_config", None) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index bdf183bf6..6c2d734bf 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -34,7 +34,7 @@ from optimum.modeling_base import FROM_PRETRAINED_START_DOCSTRING, OptimizedModel from ...exporters.openvino import export, main_export -from ..utils.import_utils import is_nncf_available +from ..utils.import_utils import is_nncf_available, is_transformers_version from ..utils.modeling_utils import _find_files_matching_pattern from .configuration import OVConfig, OVDynamicQuantizationConfig, OVWeightQuantizationConfig from .utils import ( @@ -125,11 +125,23 @@ def __init__( self.output_names = output_names self.output_dtypes = output_dtypes - self.model = model self.request = None if not self._compile_only else self.model if self.can_generate(): self.generation_config = kwargs.get("generation_config", GenerationConfig.from_model_config(config)) + + if is_transformers_version(">=", "4.44.99"): + misplaced_generation_parameters = self.config._get_non_default_generation_parameters() + if len(misplaced_generation_parameters) > 0: + logger.warning( + "Moving the following attributes in the config to the generation config: " + f"{misplaced_generation_parameters}. 
You are seeing this warning because you've set " + "generation parameters in the model config, as opposed to in the generation config.", + ) + for param_name, param_value in misplaced_generation_parameters.items(): + setattr(self.generation_config, param_name, param_value) + setattr(self.config, param_name, None) + else: self.generation_config = None @@ -351,19 +363,6 @@ def _from_pretrained( model_save_dir=model_cache_path.parent, ) - try: - generation_config = GenerationConfig.from_pretrained( - model_id, - token=token, - revision=revision, - subfolder=subfolder, - force_download=force_download, - cache_dir=cache_dir, - ) - kwargs["generation_config"] = generation_config - except Exception: - pass - return cls( model, config=config, diff --git a/optimum/intel/openvino/modeling_base_seq2seq.py b/optimum/intel/openvino/modeling_base_seq2seq.py index 7de9d5cf5..30506cc36 100644 --- a/optimum/intel/openvino/modeling_base_seq2seq.py +++ b/optimum/intel/openvino/modeling_base_seq2seq.py @@ -26,6 +26,7 @@ from transformers.file_utils import add_start_docstrings from ...exporters.openvino import main_export +from ..utils.import_utils import is_transformers_version from .configuration import OVConfig, OVWeightQuantizationConfig from .modeling_base import OVBaseModel from .utils import ( @@ -78,10 +79,21 @@ def __init__( self.encoder_model = encoder self.decoder_model = decoder self.decoder_with_past_model = decoder_with_past - if self.can_generate(): - self.generation_config = kwargs.get("generation_config", GenerationConfig.from_model_config(config)) - else: - self.generation_config = None + + self.generation_config = kwargs.get("generation_config", GenerationConfig.from_model_config(config)) + + if is_transformers_version(">=", "4.44.99"): + misplaced_generation_parameters = self.config._get_non_default_generation_parameters() + if len(misplaced_generation_parameters) > 0: + logger.warning( + "Moving the following attributes in the config to the generation config: " + f"{misplaced_generation_parameters}. You are seeing this warning because you've set " + "generation parameters in the model config, as opposed to in the generation config.", + ) + for param_name, param_value in misplaced_generation_parameters.items(): + setattr(self.generation_config, param_name, param_value) + setattr(self.config, param_name, None) + self._openvino_config = None if quantization_config: self._openvino_config = OVConfig(quantization_config=quantization_config) @@ -166,6 +178,9 @@ def _from_pretrained( local_files_only(`bool`, *optional*, defaults to `False`): Whether or not to only look at local files (i.e., do not try to download the model). 
""" + generation_config = kwargs.pop("generation_config", None) + subfolder = kwargs.pop("subfolder", "") + default_encoder_file_name = ONNX_ENCODER_NAME if from_onnx else OV_ENCODER_NAME default_decoder_file_name = ONNX_DECODER_NAME if from_onnx else OV_DECODER_NAME default_decoder_with_past_file_name = ONNX_DECODER_WITH_PAST_NAME if from_onnx else OV_DECODER_WITH_PAST_NAME @@ -229,6 +244,7 @@ def _from_pretrained( cache_dir=cache_dir, force_download=force_download, local_files_only=local_files_only, + subfolder=subfolder, ) file_names[name] = model_cache_path @@ -252,18 +268,24 @@ def _from_pretrained( kwargs.get("ov_config"), model_save_dir, ) - try: - generation_config = GenerationConfig.from_pretrained( - model_id, - token=token, - revision=revision, - cache_dir=cache_dir, - force_download=force_download, - local_files_only=local_files_only, - ) - kwargs["generation_config"] = generation_config - except Exception: - pass + + if generation_config is None: + try: + generation_config = GenerationConfig.from_pretrained( + model_id, + cache_dir=cache_dir, + force_download=force_download, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + ) + if getattr(generation_config, "cache_implementation", None) is not None: + generation_config.cache_implementation = None + except OSError: + logger.info( + "Generation config file not found, using a generation config created from the model config." + ) return cls( encoder=encoder, @@ -272,6 +294,7 @@ def _from_pretrained( config=config, model_save_dir=model_save_dir, quantization_config=quantization_config, + generation_config=generation_config, **kwargs, ) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 0c6b558fe..3c923ac73 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -143,9 +143,8 @@ def __init__( self.num_pkv = 2 self.key_value_input_names = [key for key in self.input_names if "key_values" in key] self.key_value_output_names = [key for key in self.output_names if "present" in key] - self._original_model = ( - self.model.clone() if not compile_only else None - ) # keep original model for serialization + # Keeping the original model for serialization + self._original_model = self.model.clone() if not compile_only else None self._pkv_precision = Type.f32 self.next_beam_idx = None self._past_length = 0 @@ -787,6 +786,7 @@ def _from_pretrained( quantization_config: Optional[Union[OVWeightQuantizationConfig, Dict]] = None, **kwargs, ): + generation_config = kwargs.pop("generation_config", None) model_path = Path(model_id) default_file_name = ONNX_WEIGHTS_NAME if from_onnx else OV_XML_FILE_NAME file_name = file_name or default_file_name @@ -827,20 +827,23 @@ def _from_pretrained( enable_compilation = kwargs.pop("compile", True) and not quantization_config - try: - generation_config = GenerationConfig.from_pretrained( - model_id, - token=token, - revision=revision, - cache_dir=cache_dir, - force_download=force_download, - local_files_only=local_files_only, - ) - if getattr(generation_config, "cache_implementation", None) is not None: - generation_config.cache_implementation = None - kwargs["generation_config"] = generation_config - except Exception: - pass + if generation_config is None: + try: + generation_config = GenerationConfig.from_pretrained( + model_id, + cache_dir=cache_dir, + force_download=force_download, + local_files_only=local_files_only, + token=token, + revision=revision, + 
subfolder=subfolder, + ) + if getattr(generation_config, "cache_implementation", None) is not None: + generation_config.cache_implementation = None + except OSError: + logger.info( + "Generation config file not found, using a generation config created from the model config." + ) causal_model = init_cls( model=model, @@ -849,6 +852,7 @@ def _from_pretrained( compile=enable_compilation, compile_only=compile_only, quantization_config=quantization_config, + generation_config=generation_config, **kwargs, ) diff --git a/setup.py b/setup.py index f379bd782..d2695ba97 100644 --- a/setup.py +++ b/setup.py @@ -28,8 +28,10 @@ INSTALL_REQUIRE = [ "torch>=1.11", - "transformers>=4.36,<4.45", - "optimum~=1.22", + "transformers @ git+https://github.com/huggingface/transformers.git", + "optimum @ git+https://github.com/huggingface/optimum.git@trfs-4.45", + # "transformers>=4.36,<4.46", + # "optimum~=1.22", "datasets>=1.4.0", "sentencepiece", "setuptools", From 19748bd842a981bbf34b33c0d6283a7d0099080a Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Mon, 16 Sep 2024 18:53:26 +0200 Subject: [PATCH 02/23] update setup --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index d2695ba97..75b7875bc 100644 --- a/setup.py +++ b/setup.py @@ -61,10 +61,10 @@ QUALITY_REQUIRE = ["black~=23.1", "ruff==0.4.4"] EXTRAS_REQUIRE = { - "neural-compressor": ["neural-compressor[pt]>3.0", "accelerate", "transformers<=4.43.2"], + "neural-compressor": ["neural-compressor[pt]>3.0", "accelerate", "transformers<=4.46"], "openvino": ["openvino>=2023.3", "nncf>=2.11.0", "openvino-tokenizers[transformers]"], "nncf": ["nncf>=2.11.0"], - "ipex": ["intel-extension-for-pytorch", "transformers>=4.39,<4.45"], + "ipex": ["intel-extension-for-pytorch", "transformers>=4.39,<4.46"], "diffusers": ["diffusers"], "quality": QUALITY_REQUIRE, "tests": TESTS_REQUIRE, From e8bd1b656499b555bdbd8a346c691ebeea90038c Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Mon, 16 Sep 2024 18:54:59 +0200 Subject: [PATCH 03/23] for test --- .github/workflows/test_ipex.yml | 1 - .github/workflows/test_openvino.yml | 2 -- .github/workflows/test_openvino_basic.yml | 1 - 3 files changed, 4 deletions(-) diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index 8cdfe30b5..7b37498cd 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -41,7 +41,6 @@ jobs: pip install torch==${{ matrix.ipex-version }} --extra-index-url https://download.pytorch.org/whl/cpu pip install intel_extension_for_pytorch==${{ matrix.ipex-version }} pip install Pillow parameterized - pip install transformers[testing]==${{ matrix.transformers-version }} pip install .[ipex] - name: Test with Pytest run: | diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 335acf669..e0fb95215 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -21,7 +21,6 @@ jobs: fail-fast: false matrix: python-version: ["3.8", "3.12"] - transformers-version: ["4.36.0", "4.44.*"] os: [ubuntu-latest] runs-on: ${{ matrix.os }} @@ -38,7 +37,6 @@ jobs: # install PyTorch CPU version to avoid installing CUDA packages on GitHub runner without GPU pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu pip install .[openvino,openvino-tokenizers,tests,diffusers] onnxruntime - pip install transformers==${{ matrix.transformers-version }} - name: Test with Pytest env: diff --git 
a/.github/workflows/test_openvino_basic.yml b/.github/workflows/test_openvino_basic.yml index ced98dd9a..782e05f03 100644 --- a/.github/workflows/test_openvino_basic.yml +++ b/.github/workflows/test_openvino_basic.yml @@ -47,7 +47,6 @@ jobs: # Install openvino manually to prevent dependency conflicts when .[openvino] pins # optimum or transformers to a specific version pip install .[tests] openvino - pip install transformers==${{ matrix.transformers-version }} - name: Pip freeze run: pip freeze From 2917afb586b0c41ec81c84478a12f14764ba6f4b Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 17 Sep 2024 14:19:09 +0200 Subject: [PATCH 04/23] fix --- .github/workflows/test_ipex.yml | 2 +- optimum/exporters/openvino/convert.py | 3 ++- optimum/intel/openvino/modeling_base.py | 4 +--- optimum/intel/openvino/modeling_decoder.py | 4 ---- 4 files changed, 4 insertions(+), 9 deletions(-) diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index 7b37498cd..f7b04486b 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -41,7 +41,7 @@ jobs: pip install torch==${{ matrix.ipex-version }} --extra-index-url https://download.pytorch.org/whl/cpu pip install intel_extension_for_pytorch==${{ matrix.ipex-version }} pip install Pillow parameterized - pip install .[ipex] + pip install .[ipex,tests] - name: Test with Pytest run: | pytest tests/ipex/ diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 98a8b680e..dd2a80323 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -20,6 +20,7 @@ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union import onnx +from transformers.generation import GenerationMixin from transformers.utils import is_tf_available, is_torch_available from openvino.runtime import Model, save_model @@ -618,7 +619,7 @@ def export_from_model( if library_name != "diffusers": if is_transformers_version(">=", "4.44.99"): misplaced_generation_parameters = model.config._get_non_default_generation_parameters() - if model.can_generate() and len(misplaced_generation_parameters) > 0: + if isinstance(model, GenerationMixin) and len(misplaced_generation_parameters) > 0: logger.warning( "Moving the following attributes in the config to the generation config: " f"{misplaced_generation_parameters}. You are seeing this warning because you've set " diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 6c2d734bf..817ecc5d7 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -710,9 +710,7 @@ def can_generate(self) -> bool: """ Returns whether this model can generate sequences with `.generate()`. 
""" - if isinstance(self, GenerationMixin): - return True - return False + return isinstance(self, GenerationMixin) def _inference(self, inputs): try: diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 3c923ac73..0f36192e4 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -764,10 +764,6 @@ def _reorder_cache( ) return tuple(np.take(past_state, beam_idx, 0) for past_state in past_key_values) - def can_generate(self): - """Returns True to validate the check that the model using `GenerationMixin.generate()` can indeed generate.""" - return True - @classmethod def _from_pretrained( cls, From 96ef48df77159c42f2170cf684a2f93f1989d8f0 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 17 Sep 2024 16:34:33 +0200 Subject: [PATCH 05/23] fix --- optimum/intel/openvino/modeling_base.py | 1 - optimum/intel/openvino/modeling_base_seq2seq.py | 1 - optimum/intel/openvino/modeling_decoder.py | 3 --- 3 files changed, 5 deletions(-) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 817ecc5d7..0e041ebf9 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -581,7 +581,6 @@ def _from_transformers( library_name=cls._library_name, ) - config.save_pretrained(save_dir_path) return cls._from_pretrained( model_id=save_dir_path, config=config, diff --git a/optimum/intel/openvino/modeling_base_seq2seq.py b/optimum/intel/openvino/modeling_base_seq2seq.py index 30506cc36..ad16d10fb 100644 --- a/optimum/intel/openvino/modeling_base_seq2seq.py +++ b/optimum/intel/openvino/modeling_base_seq2seq.py @@ -375,7 +375,6 @@ def _from_transformers( ov_config=ov_config, ) - config.save_pretrained(save_dir_path) return cls._from_pretrained( model_id=save_dir_path, config=config, diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 0f36192e4..fdb6cb026 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -328,9 +328,6 @@ def _from_transformers( library_name=cls._library_name, ) - config.is_decoder = True - config.is_encoder_decoder = False - config.save_pretrained(save_dir_path) return cls._from_pretrained( model_id=save_dir_path, config=config, From 3ec2fbed129ca653716017f31a4ceaf03a2dfc7e Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 17 Sep 2024 16:55:24 +0200 Subject: [PATCH 06/23] fix generation config --- optimum/intel/openvino/modeling_base.py | 4 +++- optimum/intel/openvino/modeling_base_seq2seq.py | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 0e041ebf9..186a2b6b2 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -127,8 +127,10 @@ def __init__( self.output_dtypes = output_dtypes self.model = model self.request = None if not self._compile_only else self.model + + generation_config = kwargs.get("generation_config", None) if self.can_generate(): - self.generation_config = kwargs.get("generation_config", GenerationConfig.from_model_config(config)) + self.generation_config = generation_config or GenerationConfig.from_model_config(config) if is_transformers_version(">=", "4.44.99"): misplaced_generation_parameters = self.config._get_non_default_generation_parameters() diff --git a/optimum/intel/openvino/modeling_base_seq2seq.py 
b/optimum/intel/openvino/modeling_base_seq2seq.py index ad16d10fb..763dd2b50 100644 --- a/optimum/intel/openvino/modeling_base_seq2seq.py +++ b/optimum/intel/openvino/modeling_base_seq2seq.py @@ -80,7 +80,8 @@ def __init__( self.decoder_model = decoder self.decoder_with_past_model = decoder_with_past - self.generation_config = kwargs.get("generation_config", GenerationConfig.from_model_config(config)) + generation_config = kwargs.get("generation_config", None) + self.generation_config = generation_config or GenerationConfig.from_model_config(config) if is_transformers_version(">=", "4.44.99"): misplaced_generation_parameters = self.config._get_non_default_generation_parameters() From b595fd0a437d181f6ed61dd42bc6009b42af0385 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 17 Sep 2024 18:32:11 +0200 Subject: [PATCH 07/23] fixes --- optimum/exporters/openvino/model_patcher.py | 34 ++++++++++++--------- optimum/exporters/openvino/stateful.py | 4 +-- optimum/intel/openvino/modeling_decoder.py | 2 +- setup.py | 2 +- tests/openvino/test_quantization.py | 2 +- 5 files changed, 25 insertions(+), 19 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 59d4bedb5..f84bce362 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -45,7 +45,7 @@ from transformers.modeling_tf_utils import TFPreTrainedModel -BETTERTRANSFORMER_IGNORE = ("codegen",) +BETTERTRANSFORMER_IGNORE = ("codegen", "gpt_neo") def patch_model_with_bettertransformer(model): @@ -57,7 +57,7 @@ def patch_model_with_bettertransformer(model): return model if is_transformers_version("<", "4.36") or is_torch_version("<", "2.1.1"): - log.warn( + log.warning( COLOR_RED + "[WARNING] For good performance with stateful models, transformers>=4.36.2 and PyTorch>=2.1.1 are required. " f"This Python environment has Transformers {_transformers_version} and PyTorch {_torch_version}. " @@ -75,7 +75,7 @@ def patch_model_with_bettertransformer(model): display_version = ( _openvino_version.split("-")[0] if is_openvino_version("<=", "2024.0.0-14509") else _openvino_version ) - log.warn( + log.warning( COLOR_RED + f"[WARNING] Stateful models are not supported for Llama, Gemma and GPTBigCode with Transformers " f"{_transformers_version} and OpenVINO {display_version}. For good performance, consider using a nightly OpenVINO build: " @@ -93,7 +93,7 @@ def patch_model_with_bettertransformer(model): try: model = model.to_bettertransformer() except Exception as e: - log.warn( + log.warning( f"Cannot apply model.to_bettertransformer because of the exception:\n{e}." 
" Usage model with stateful=True may be non-effective if model does not contain torch.functional.scaled_dot_product_attention" ) @@ -168,7 +168,8 @@ def __enter__(self): layer.block_sparse_moe.forward = types.MethodType( _mixtral_sparse_moe_block_forward, layer.block_sparse_moe ) - _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb) + if is_transformers_version("<", "4.44.99"): + _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb) def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) @@ -2238,7 +2239,8 @@ def __enter__(self): orig_self_attn_fwd = layer.self_attn.forward layer.self_attn.forward = types.MethodType(_persimmon_self_attn_sdpa_forward, layer.self_attn) layer.self_attn._orig_forward = orig_self_attn_fwd - _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb) + if is_transformers_version("<", "4.44.99"): + _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb) def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) @@ -2387,29 +2389,33 @@ def __exit__(self, exc_type, exc_value, traceback): class RotaryEmbPatcher(DecoderModelPatcher): def __enter__(self): super().__enter__() - for layer in self._model.model.layers: - _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb) + if is_transformers_version("<", "4.44.99"): + for layer in self._model.model.layers: + _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb) class FalconModelPatcher(DecoderModelPatcher): def __enter__(self): super().__enter__() - for layer in self._model.transformer.h: - _reinitialize_cos_sin_cached_fp32(layer.self_attention.rotary_emb) + if is_transformers_version("<", "4.44.99"): + for layer in self._model.transformer.h: + _reinitialize_cos_sin_cached_fp32(layer.self_attention.rotary_emb) class GptNeoxModelPatcher(DecoderModelPatcher): def __enter__(self): super().__enter__() - for layer in self._model.gpt_neox.layers: - _reinitialize_cos_sin_cached_fp32(layer.attention.rotary_emb) + if is_transformers_version("<", "4.44.99"): + for layer in self._model.gpt_neox.layers: + _reinitialize_cos_sin_cached_fp32(layer.attention.rotary_emb) class GptNeoxJapaneseModelPatcher(DecoderModelPatcher): def __enter__(self): super().__enter__() - for layer in self._model.gpt_neox_japanese.layers: - _reinitialize_cos_sin_cached_fp32(layer.attention.rotary_emb) + if is_transformers_version("<", "4.44.99"): + for layer in self._model.gpt_neox_japanese.layers: + _reinitialize_cos_sin_cached_fp32(layer.attention.rotary_emb) class Gemma2ModelPatcher(LlamaModelPatcher): diff --git a/optimum/exporters/openvino/stateful.py b/optimum/exporters/openvino/stateful.py index f05cc23cd..fa5a2a898 100644 --- a/optimum/exporters/openvino/stateful.py +++ b/optimum/exporters/openvino/stateful.py @@ -151,7 +151,7 @@ def make_stateful( shape[0] = num_beams_and_batch input.get_node().set_partial_shape(shape) else: - log.warn(f"Rank of {input.get_any_name()} input of the model is not 2, batch size is not set") + log.warning(f"Rank of {input.get_any_name()} input of the model is not 2, batch size is not set") for kv_name_pair in zip(key_value_input_names, key_value_output_names): input_output_map[kv_name_pair[0]] = kv_name_pair[1] @@ -176,7 +176,7 @@ def ensure_stateful_is_available(warn=True): """ if is_openvino_version("<", "2023.3"): if warn: - log.warn( + log.warning( f"Could not create or use stateful model when using old version of openvino=={_openvino_version}. 
It may result in sub-optimal inference performance." "Install openvino>=2023.3.0." ) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index fdb6cb026..fcc814d26 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -122,7 +122,7 @@ def __init__( "`compile_only` mode does not support disabling compilation." "Please provide `compile=True` if you want to use `compile_only=True` or set `compile_only=False`" ) - + config.is_encoder_decoder = False super().__init__( model, config, diff --git a/setup.py b/setup.py index 75b7875bc..829028f0c 100644 --- a/setup.py +++ b/setup.py @@ -62,7 +62,7 @@ EXTRAS_REQUIRE = { "neural-compressor": ["neural-compressor[pt]>3.0", "accelerate", "transformers<=4.46"], - "openvino": ["openvino>=2023.3", "nncf>=2.11.0", "openvino-tokenizers[transformers]"], + "openvino": ["openvino>=2023.3,<2024.4", "nncf>=2.11.0", "openvino-tokenizers[transformers]"], "nncf": ["nncf>=2.11.0"], "ipex": ["intel-extension-for-pytorch", "transformers>=4.39,<4.46"], "diffusers": ["diffusers"], diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index c263000f1..8f8fe2545 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -737,7 +737,7 @@ class OVTrainerTest(unittest.TestCase): @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS) def test_aware_training_quantization(self, model_name, expected_fake_quantize, expected_int8): model_id = MODEL_NAMES[model_name] - model = AutoModelForSequenceClassification.from_pretrained(model_id) + model = AutoModelForSequenceClassification.from_pretrained(model_id, attn_implementation="eager") tokenizer = AutoTokenizer.from_pretrained(model_id) ov_config = OVConfig() dataset = load_dataset("glue", "sst2") From 03a974a626a0a6c6799addfd9ede398e2a711b34 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Fri, 20 Sep 2024 11:17:54 +0200 Subject: [PATCH 08/23] fix persimmon --- optimum/exporters/openvino/model_patcher.py | 47 +++++++++++++++------ 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index f84bce362..2a3847df4 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -2136,6 +2136,7 @@ def __exit__(self, exc_type, exc_value, traceback): block.norm_attn_norm.attn.rotary_emb.forward = block.norm_attn_norm.attn.rotary_emb._orig_forward + # Adapted from https://github.com/huggingface/transformers/blob/v4.41.0/src/transformers/models/persimmon/modeling_persimmon.py#L264 def _persimmon_self_attn_sdpa_forward( self, @@ -2146,6 +2147,7 @@ def _persimmon_self_attn_sdpa_forward( output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: from transformers.models.persimmon.modeling_persimmon import apply_rotary_pos_emb @@ -2171,25 +2173,42 @@ def _persimmon_self_attn_sdpa_forward( value_states = value_states.transpose(1, 2) key_states = key_states.transpose(1, 2) - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. 
If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." + if is_transformers_version("<", "4.44.99"): + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + else: + if position_embeddings is None: + log.warning( + "The attention layers in this model are transitioning from computing the RoPE embeddings internally " + "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " + "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be " + "removed and `position_embeddings` will be mandatory." ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + cos, sin = self.rotary_emb(value_states, position_ids) + else: + cos, sin = position_embeddings + + if is_transformers_version("<", "4.44.99"): + rotary_ndims = self.rotary_emb.dim + else: + rotary_ndims = self.rotary_ndims # Partial rotary embedding query_rot, query_pass = ( - query_states[..., : self.rotary_emb.dim], - query_states[..., self.rotary_emb.dim :], + query_states[..., :rotary_ndims], + query_states[..., rotary_ndims:], ) key_rot, key_pass = ( - key_states[..., : self.rotary_emb.dim], - key_states[..., self.rotary_emb.dim :], + key_states[..., :rotary_ndims], + key_states[..., rotary_ndims:], ) # [batch_size, seq_length, num_heads, head_dim // config.partial_rotary_factor] query_rot, key_rot = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, position_ids) @@ -2203,7 +2222,7 @@ def _persimmon_self_attn_sdpa_forward( cache_kwargs = { "sin": sin, "cos": cos, - "partial_rotation_size": self.rotary_emb.dim, + "partial_rotation_size": rotary_ndims, "cache_position": cache_position, } key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) From 07cb7b4c9ef922eb4fb20f3a04d959922bc141fc Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Fri, 20 Sep 2024 15:08:33 +0200 Subject: [PATCH 09/23] fix --- optimum/exporters/openvino/convert.py | 2 +- optimum/exporters/openvino/model_patcher.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index dd2a80323..e394d19c4 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -386,7 +386,7 @@ def ts_patched_forward(*args, **kwargs): if stateful: # cannot raise because stateful is enabled by default and it would break backward compatibility for models that couldn't convert to OV directly # TODO: Implement stateful for ONNX path as well, not doing it right now because of lack of validation - logger.warn( + logger.warning( "[ WARNING ] Making stateful models is not supported when exporting to ONNX as an intermediate step. " "A stateless model will be exported instead. It may result in sub-optimal inference performance." "Provide a model that can be converted to OpenVINO without fallback to ONNX conversion path." 
diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 2a3847df4..3f36b1a7f 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -45,7 +45,7 @@ from transformers.modeling_tf_utils import TFPreTrainedModel -BETTERTRANSFORMER_IGNORE = ("codegen", "gpt_neo") +BETTERTRANSFORMER_IGNORE = ("codegen",) def patch_model_with_bettertransformer(model): From f97f04964922b62f81a37b3b2f5f3915f94b3d4c Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Mon, 23 Sep 2024 14:11:34 +0200 Subject: [PATCH 10/23] style --- optimum/exporters/openvino/convert.py | 1 - optimum/exporters/openvino/model_patcher.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index c956f3c30..858910527 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -624,7 +624,6 @@ def export_from_model( files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_export_configs.keys()] elif library_name != "diffusers": - if is_transformers_version(">=", "4.44.99"): misplaced_generation_parameters = model.config._get_non_default_generation_parameters() if isinstance(model, GenerationMixin) and len(misplaced_generation_parameters) > 0: diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 51d7656d7..859ed7d0f 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -2136,7 +2136,6 @@ def __exit__(self, exc_type, exc_value, traceback): block.norm_attn_norm.attn.rotary_emb.forward = block.norm_attn_norm.attn.rotary_emb._orig_forward - # Adapted from https://github.com/huggingface/transformers/blob/v4.41.0/src/transformers/models/persimmon/modeling_persimmon.py#L264 def _persimmon_self_attn_sdpa_forward( self, @@ -2199,7 +2198,7 @@ def _persimmon_self_attn_sdpa_forward( if is_transformers_version("<", "4.44.99"): rotary_ndims = self.rotary_emb.dim else: - rotary_ndims = self.rotary_ndims + rotary_ndims = self.rotary_ndims # Partial rotary embedding query_rot, query_pass = ( From 4f3b698cd0dcf00ff2d9682359ee37b0dc0bcb50 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Mon, 23 Sep 2024 18:55:53 +0200 Subject: [PATCH 11/23] fix trainer --- optimum/intel/neural_compressor/trainer.py | 13 ++++++++++++- optimum/intel/openvino/trainer.py | 13 ++++++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/optimum/intel/neural_compressor/trainer.py b/optimum/intel/neural_compressor/trainer.py index 5e64ca862..7bfcb8c37 100644 --- a/optimum/intel/neural_compressor/trainer.py +++ b/optimum/intel/neural_compressor/trainer.py @@ -271,7 +271,18 @@ def _inner_training_loop( if not delay_optimizer_creation: self.create_optimizer_and_scheduler(num_training_steps=max_steps) - self.state = TrainerState() + if is_transformers_version(">=", "4.44.99"): + from transformers.trainer_callback import ExportableState + + self.state = TrainerState( + stateful_callbacks=[ + cb for cb in self.callback_handler.callbacks + [self.control] if isinstance(cb, ExportableState) + ] + ) + + else: + self.state = TrainerState() + self.state.is_hyper_param_search = trial is not None self.state.train_batch_size = self._train_batch_size diff --git a/optimum/intel/openvino/trainer.py b/optimum/intel/openvino/trainer.py index b050286a1..1a308eecf 100644 --- a/optimum/intel/openvino/trainer.py +++ 
b/optimum/intel/openvino/trainer.py @@ -380,7 +380,18 @@ def _inner_training_loop( if not delay_optimizer_creation: self.create_optimizer_and_scheduler(num_training_steps=max_steps) - self.state = TrainerState() + if is_transformers_version(">=", "4.44.99"): + from transformers.trainer_callback import ExportableState + + self.state = TrainerState( + stateful_callbacks=[ + cb for cb in self.callback_handler.callbacks + [self.control] if isinstance(cb, ExportableState) + ] + ) + + else: + self.state = TrainerState() + self.state.is_hyper_param_search = trial is not None self.state.train_batch_size = self._train_batch_size From b90298582b04d017bd08f717ad35da555bbb1337 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 24 Sep 2024 16:15:26 +0200 Subject: [PATCH 12/23] fix generation config for inc models --- .../intel/neural_compressor/modeling_base.py | 69 +++++++++++++++++-- 1 file changed, 64 insertions(+), 5 deletions(-) diff --git a/optimum/intel/neural_compressor/modeling_base.py b/optimum/intel/neural_compressor/modeling_base.py index 392d84b47..edbad5693 100644 --- a/optimum/intel/neural_compressor/modeling_base.py +++ b/optimum/intel/neural_compressor/modeling_base.py @@ -49,7 +49,7 @@ from optimum.intel.generation import BaseModelForCausalLM from ...modeling_base import OptimizedModel -from ..utils.import_utils import _torch_version, is_torch_version +from ..utils.import_utils import _torch_version, is_torch_version, is_transformers_version from .configuration import INCConfig from .quantization import _weight_only_quantization from .utils import QUANTIZATION_CONFIG_NAME @@ -92,7 +92,25 @@ def __init__( self._device = getattr(self.model, "device", None) or torch.device( "cuda:0" if torch.cuda.is_available() else "cpu" ) - self.generation_config = GenerationConfig.from_model_config(config) + + generation_config = kwargs.get("generation_config", None) + if self.can_generate(): + self.generation_config = generation_config or GenerationConfig.from_model_config(config) + + if is_transformers_version(">=", "4.44.99"): + misplaced_generation_parameters = self.config._get_non_default_generation_parameters() + if len(misplaced_generation_parameters) > 0: + logger.warning( + "Moving the following attributes in the config to the generation config: " + f"{misplaced_generation_parameters}. 
You are seeing this warning because you've set " + "generation parameters in the model config, as opposed to in the generation config.", + ) + for param_name, param_value in misplaced_generation_parameters.items(): + setattr(self.generation_config, param_name, param_value) + setattr(self.config, param_name, None) + + else: + self.generation_config = None # Registers the INCModelForXXX classes into the transformers AutoModel classes to avoid warnings when creating # a pipeline https://github.com/huggingface/transformers/blob/cad61b68396a1a387287a8e2e2fef78a25b79383/src/transformers/pipelines/base.py#L863 @@ -126,9 +144,29 @@ def _from_pretrained( token = use_auth_token quantization_config = kwargs.pop("quantization_config", None) + generation_config = kwargs.pop("generation_config", None) + model_path = Path(model_id) is_local = model_path.is_dir() + if generation_config is None and cls.can_generate(): + try: + generation_config = GenerationConfig.from_pretrained( + model_id, + cache_dir=cache_dir, + force_download=force_download, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + ) + if getattr(generation_config, "cache_implementation", None) is not None: + generation_config.cache_implementation = None + except OSError: + logger.info( + "Generation config file not found, using a generation config created from the model config." + ) + # ITREX compatibility quantization_config_path = None if is_local: @@ -202,7 +240,7 @@ def _from_pretrained( **kwargs, ) - return cls(model, config=config, model_save_dir=None, **kwargs).model + return cls(model, config=config, model_save_dir=None, generation_config=generation_config, **kwargs).model model_cache_path = None inc_config = None @@ -261,7 +299,14 @@ def _from_pretrained( ) model = torch.jit.load(model_cache_path) model = torch.jit.freeze(model.eval()) - return cls(model, config=config, model_save_dir=model_save_dir, inc_config=inc_config, **kwargs) + return cls( + model, + config=config, + model_save_dir=model_save_dir, + inc_config=inc_config, + generation_config=generation_config, + **kwargs, + ) model_class = _get_model_class(config, cls.auto_model_class._model_mapping) # Load the state dictionary of the model to verify whether the model to get the quantization config @@ -283,7 +328,13 @@ def _from_pretrained( raise return cls( - model, config=config, model_save_dir=model_save_dir, q_config=q_config, inc_config=inc_config, **kwargs + model, + config=config, + model_save_dir=model_save_dir, + q_config=q_config, + inc_config=inc_config, + generation_config=generation_config, + **kwargs, ) def _save_pretrained(self, save_directory: Union[str, Path]): @@ -304,6 +355,14 @@ def _save_pretrained(self, save_directory: Union[str, Path]): if self.inc_config: self.inc_config.save_pretrained(save_directory) + if self.generation_config is not None: + try: + self.generation_config.save_pretrained(save_directory) + except Exception as exception: + logger.warning( + f"The generation config will not be saved, saving failed with following error:\n{exception}" + ) + def forward(self, *args, **kwargs): return self.model(*args, **kwargs) From 78c10b61b38211eb8c3a7574bd1e6a3e6c8977da Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 24 Sep 2024 16:40:09 +0200 Subject: [PATCH 13/23] fix --- .../intel/neural_compressor/modeling_base.py | 33 +++---------------- tests/neural_compressor/test_modeling.py | 21 ------------ 2 files changed, 4 insertions(+), 50 deletions(-) diff --git 
a/optimum/intel/neural_compressor/modeling_base.py b/optimum/intel/neural_compressor/modeling_base.py index edbad5693..6f6c700d8 100644 --- a/optimum/intel/neural_compressor/modeling_base.py +++ b/optimum/intel/neural_compressor/modeling_base.py @@ -46,8 +46,6 @@ from transformers.utils import SAFE_WEIGHTS_NAME, WEIGHTS_NAME from transformers.utils.generic import ContextManagers -from optimum.intel.generation import BaseModelForCausalLM - from ...modeling_base import OptimizedModel from ..utils.import_utils import _torch_version, is_torch_version, is_transformers_version from .configuration import INCConfig @@ -85,6 +83,8 @@ def __init__( inc_config: Dict = None, **kwargs, ): + generation_config = kwargs.pop("generation_config", None) + super().__init__(model=model, config=config, **kwargs) self.inc_config = inc_config self._q_config = q_config @@ -92,8 +92,6 @@ def __init__( self._device = getattr(self.model, "device", None) or torch.device( "cuda:0" if torch.cuda.is_available() else "cpu" ) - - generation_config = kwargs.get("generation_config", None) if self.can_generate(): self.generation_config = generation_config or GenerationConfig.from_model_config(config) @@ -149,7 +147,7 @@ def _from_pretrained( model_path = Path(model_id) is_local = model_path.is_dir() - if generation_config is None and cls.can_generate(): + if generation_config is None and "text-generation" in cls.export_feature: try: generation_config = GenerationConfig.from_pretrained( model_id, @@ -425,29 +423,6 @@ class INCModelForVision2Seq(INCModel): export_feature = "image-to-text" -class INCModelForCausalLM(INCModel, BaseModelForCausalLM): +class INCModelForCausalLM(INCModel): auto_model_class = AutoModelForCausalLM export_feature = "text-generation" - forward = BaseModelForCausalLM.forward - generate = BaseModelForCausalLM.generate - can_generate = BaseModelForCausalLM.can_generate - - def __init__( - self, - model, - config: PretrainedConfig = None, - model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, - q_config: Dict = None, - inc_config: Dict = None, - use_cache: bool = True, - **kwargs, - ): - super(INCModelForCausalLM, self).__init__( - model=model, - config=config, - model_save_dir=model_save_dir, - q_config=q_config, - inc_config=inc_config, - use_cache=use_cache, - **kwargs, - ) diff --git a/tests/neural_compressor/test_modeling.py b/tests/neural_compressor/test_modeling.py index 81e6d03dc..e9e117518 100644 --- a/tests/neural_compressor/test_modeling.py +++ b/tests/neural_compressor/test_modeling.py @@ -125,27 +125,6 @@ def test_pipeline(self, model_id, task): pipe(*inputs) - def test_compare_with_and_without_past_key_values(self): - model_id = "echarlaix/tiny-random-gpt2-torchscript" - tokenizer = AutoTokenizer.from_pretrained(model_id) - tokens = tokenizer("This is a sample input", return_tensors="pt") - - model_with_pkv = INCModelForCausalLM.from_pretrained(model_id, use_cache=True, subfolder="model_with_pkv") - - outputs_with_pkv = model_with_pkv.generate( - **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 - ) - model_without_pkv = INCModelForCausalLM.from_pretrained( - model_id, use_cache=False, subfolder="model_without_pkv" - ) - - outputs_without_pkv = model_without_pkv.generate( - **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 - ) - self.assertEqual(outputs_with_pkv.shape[1], self.GENERATION_LENGTH) - self.assertEqual(outputs_without_pkv.shape[1], self.GENERATION_LENGTH) - 
self.assertTrue(torch.equal(outputs_with_pkv, outputs_without_pkv)) - def test_saving_loading_inc_woq_model(self): model_name = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ" model = INCModelForCausalLM.from_pretrained(model_name, revision="main") From 263151405cdadc1be0d5bfee8ab6829459fd067c Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 24 Sep 2024 17:43:45 +0200 Subject: [PATCH 14/23] fix trainer --- optimum/intel/neural_compressor/trainer.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/optimum/intel/neural_compressor/trainer.py b/optimum/intel/neural_compressor/trainer.py index 7bfcb8c37..c018cf3e8 100644 --- a/optimum/intel/neural_compressor/trainer.py +++ b/optimum/intel/neural_compressor/trainer.py @@ -703,6 +703,21 @@ def _save(self, output_dir=None, state_dict=None): output_model_file = os.path.join(output_dir, WEIGHTS_NAME) # Save the config + if self.model.can_generate(): + if is_transformers_version(">=", "4.44.99"): + misplaced_generation_parameters = self.model.config._get_non_default_generation_parameters() + if len(misplaced_generation_parameters) > 0: + logger.warning( + "Moving the following attributes in the config to the generation config: " + f"{misplaced_generation_parameters}. You are seeing this warning because you've set " + "generation parameters in the model config, as opposed to in the generation config.", + ) + for param_name, param_value in misplaced_generation_parameters.items(): + setattr(self.model.generation_config, param_name, param_value) + setattr(self.model.config, param_name, None) + + self.model.generation_config.save_pretrained(output_dir) + if self.model.config is not None: self.model.config.save_pretrained(output_dir) From c05c0ad68751f419d293a11508e33dc4bc2cc8ca Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 24 Sep 2024 17:44:22 +0200 Subject: [PATCH 15/23] format --- optimum/intel/neural_compressor/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/intel/neural_compressor/trainer.py b/optimum/intel/neural_compressor/trainer.py index c018cf3e8..c0fe0cf6d 100644 --- a/optimum/intel/neural_compressor/trainer.py +++ b/optimum/intel/neural_compressor/trainer.py @@ -715,7 +715,7 @@ def _save(self, output_dir=None, state_dict=None): for param_name, param_value in misplaced_generation_parameters.items(): setattr(self.model.generation_config, param_name, param_value) setattr(self.model.config, param_name, None) - + self.model.generation_config.save_pretrained(output_dir) if self.model.config is not None: From 2f8a51c39ae196ed1c561159af134c0e75f693af Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Wed, 25 Sep 2024 11:38:46 +0200 Subject: [PATCH 16/23] fix seq2seq trainer --- .../neural_compressor/trainer_seq2seq.py | 51 ++++++++++++------- 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/optimum/intel/neural_compressor/trainer_seq2seq.py b/optimum/intel/neural_compressor/trainer_seq2seq.py index 27540cfb1..58beb5139 100644 --- a/optimum/intel/neural_compressor/trainer_seq2seq.py +++ b/optimum/intel/neural_compressor/trainer_seq2seq.py @@ -124,6 +124,7 @@ def prediction_step( inputs: Dict[str, Union[torch.Tensor, Any]], prediction_loss_only: bool, ignore_keys: Optional[List[str]] = None, + **gen_kwargs, ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: """ Perform an evaluation step on `model` using `inputs`. 
@@ -155,17 +156,17 @@ def prediction_step( has_labels = "labels" in inputs inputs = self._prepare_inputs(inputs) - # XXX: adapt synced_gpus for fairscale as well - gen_kwargs = { - "max_length": self._max_length if self._max_length is not None else self.model.config.max_length, - "num_beams": self._num_beams if self._num_beams is not None else self.model.config.num_beams, - "synced_gpus": True if is_deepspeed_zero3_enabled() else False, - } + # Priority (handled in generate): + # non-`None` gen_kwargs > model.generation_config > default GenerationConfig() + if len(gen_kwargs) == 0 and hasattr(self, "_gen_kwargs"): + gen_kwargs = self._gen_kwargs.copy() + if "num_beams" in gen_kwargs and gen_kwargs["num_beams"] is None: + gen_kwargs.pop("num_beams") + if "max_length" in gen_kwargs and gen_kwargs["max_length"] is None: + gen_kwargs.pop("max_length") - if "attention_mask" in inputs: - gen_kwargs["attention_mask"] = inputs.get("attention_mask", None) - if "global_attention_mask" in inputs: - gen_kwargs["global_attention_mask"] = inputs.get("global_attention_mask", None) + if "synced_gpus" not in gen_kwargs: + gen_kwargs["synced_gpus"] = is_deepspeed_zero3_enabled() # prepare generation inputs # some encoder-decoder models can have varying encoder's and thus @@ -176,14 +177,25 @@ def prediction_step( generation_inputs = inputs[self.model.main_input_name] generated_tokens = self.model.generate(generation_inputs, **gen_kwargs) + + # Temporary hack to ensure the generation config is not initialized for each iteration of the evaluation loop + # TODO: remove this hack when the legacy code that initializes generation_config from a model config is + # removed in https://github.com/huggingface/transformers/blob/98d88b23f54e5a23e741833f1e973fdf600cc2c5/src/transformers/generation/utils.py#L1183 + if self.model.generation_config._from_model_config: + self.model.generation_config._from_model_config = False + + # Retrieves GenerationConfig from model.generation_config + gen_config = self.model.generation_config # in case the batch is shorter than max length, the output should be padded - if generated_tokens.shape[-1] < gen_kwargs["max_length"]: - generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"]) + if generated_tokens.shape[-1] < gen_config.max_length: + generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_config.max_length) + elif gen_config.max_new_tokens is not None and generated_tokens.shape[-1] < gen_config.max_new_tokens + 1: + generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_config.max_new_tokens + 1) with torch.no_grad(): - with self.autocast_smart_context_manager(): - outputs = model(**inputs) if has_labels: + with self.compute_loss_context_manager(): + outputs = model(**inputs) if self.label_smoother is not None: loss = self.label_smoother(outputs, inputs["labels"]).mean().detach() else: @@ -192,16 +204,19 @@ def prediction_step( loss = None if self.args.prediction_loss_only: - return (loss, None, None) + return loss, None, None if has_labels: labels = inputs["labels"] - if labels.shape[-1] < gen_kwargs["max_length"]: - labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"]) + if labels.shape[-1] < gen_config.max_length: + labels = self._pad_tensors_to_max_len(labels, gen_config.max_length) + elif gen_config.max_new_tokens is not None and labels.shape[-1] < gen_config.max_new_tokens + 1: + labels = self._pad_tensors_to_max_len(labels, gen_config.max_new_tokens + 1) else: labels = None - return (loss, 
generated_tokens, labels) + return loss, generated_tokens, labels + def _pad_tensors_to_max_len(self, tensor, max_length): if self.tokenizer is not None and hasattr(self.tokenizer, "pad_token_id"): From ca7b9deab01ad9ec1232c6339c4f2348fc04c6e0 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Wed, 25 Sep 2024 11:39:16 +0200 Subject: [PATCH 17/23] style --- optimum/intel/neural_compressor/trainer_seq2seq.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/optimum/intel/neural_compressor/trainer_seq2seq.py b/optimum/intel/neural_compressor/trainer_seq2seq.py index 58beb5139..ee8f21da5 100644 --- a/optimum/intel/neural_compressor/trainer_seq2seq.py +++ b/optimum/intel/neural_compressor/trainer_seq2seq.py @@ -166,7 +166,7 @@ def prediction_step( gen_kwargs.pop("max_length") if "synced_gpus" not in gen_kwargs: - gen_kwargs["synced_gpus"] = is_deepspeed_zero3_enabled() + gen_kwargs["synced_gpus"] = is_deepspeed_zero3_enabled() # prepare generation inputs # some encoder-decoder models can have varying encoder's and thus @@ -217,7 +217,6 @@ def prediction_step( return loss, generated_tokens, labels - def _pad_tensors_to_max_len(self, tensor, max_length): if self.tokenizer is not None and hasattr(self.tokenizer, "pad_token_id"): # If PAD token is not defined at least EOS token has to be defined From ba8de366ffc18616e74ed86afb6a1da06b26e549 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Wed, 25 Sep 2024 11:42:32 +0200 Subject: [PATCH 18/23] fix trfs version for ipex --- .github/workflows/test_ipex.yml | 3 ++- setup.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index f7b04486b..8cdfe30b5 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -41,7 +41,8 @@ jobs: pip install torch==${{ matrix.ipex-version }} --extra-index-url https://download.pytorch.org/whl/cpu pip install intel_extension_for_pytorch==${{ matrix.ipex-version }} pip install Pillow parameterized - pip install .[ipex,tests] + pip install transformers[testing]==${{ matrix.transformers-version }} + pip install .[ipex] - name: Test with Pytest run: | pytest tests/ipex/ diff --git a/setup.py b/setup.py index fb6facff9..eee43291a 100644 --- a/setup.py +++ b/setup.py @@ -62,10 +62,10 @@ QUALITY_REQUIRE = ["black~=23.1", "ruff==0.4.4"] EXTRAS_REQUIRE = { - "neural-compressor": ["neural-compressor[pt]>3.0", "accelerate", "transformers<=4.46"], + "neural-compressor": ["neural-compressor[pt]>3.0", "accelerate"], "openvino": ["openvino>=2023.3,<2024.4", "nncf>=2.11.0", "openvino-tokenizers[transformers]<2024.4"], "nncf": ["nncf>=2.11.0"], - "ipex": ["intel-extension-for-pytorch", "transformers>=4.39,<4.46"], + "ipex": ["intel-extension-for-pytorch", "transformers>=4.39,<4.45"], "diffusers": ["diffusers"], "quality": QUALITY_REQUIRE, "tests": TESTS_REQUIRE, From bc25916536829658b279e478282696ce0a1dd863 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Thu, 26 Sep 2024 10:53:53 +0200 Subject: [PATCH 19/23] update setup --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index eee43291a..ce2a55e32 100644 --- a/setup.py +++ b/setup.py @@ -28,9 +28,9 @@ INSTALL_REQUIRE = [ "torch>=1.11", + "transformers>=4.36,<4.46", "transformers @ git+https://github.com/huggingface/transformers.git", "optimum @ git+https://github.com/huggingface/optimum.git@trfs-4.45", - # "transformers>=4.36,<4.46", # "optimum~=1.22", "datasets>=1.4.0", "sentencepiece", From 
53e4b628d258bb00e8ce29394b1bf614e5bb8f30 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Thu, 26 Sep 2024 11:46:35 +0200 Subject: [PATCH 20/23] fix --- optimum/exporters/openvino/model_patcher.py | 2 +- setup.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 859ed7d0f..a7664d304 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -45,7 +45,7 @@ from transformers.modeling_tf_utils import TFPreTrainedModel -BETTERTRANSFORMER_IGNORE = ("codegen",) +BETTERTRANSFORMER_IGNORE = ("codegen", "gpt_neo") def patch_model_with_bettertransformer(model): diff --git a/setup.py b/setup.py index ce2a55e32..ba55b6ae6 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,6 @@ INSTALL_REQUIRE = [ "torch>=1.11", "transformers>=4.36,<4.46", - "transformers @ git+https://github.com/huggingface/transformers.git", "optimum @ git+https://github.com/huggingface/optimum.git@trfs-4.45", # "optimum~=1.22", "datasets>=1.4.0", From 498f4c217f7f9c41b563efa2a05bd4f6e65f926a Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Thu, 26 Sep 2024 14:54:29 +0200 Subject: [PATCH 21/23] udpate transformers version in workflows --- .github/workflows/test_openvino.yml | 2 ++ .github/workflows/test_openvino_basic.yml | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index e0fb95215..23ec08eae 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -21,6 +21,7 @@ jobs: fail-fast: false matrix: python-version: ["3.8", "3.12"] + transformers-version: ["4.36.0", "4.45.*"] os: [ubuntu-latest] runs-on: ${{ matrix.os }} @@ -37,6 +38,7 @@ jobs: # install PyTorch CPU version to avoid installing CUDA packages on GitHub runner without GPU pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu pip install .[openvino,openvino-tokenizers,tests,diffusers] onnxruntime + pip install transformers==${{ matrix.transformers-version }} - name: Test with Pytest env: diff --git a/.github/workflows/test_openvino_basic.yml b/.github/workflows/test_openvino_basic.yml index 782e05f03..2b372f1c0 100644 --- a/.github/workflows/test_openvino_basic.yml +++ b/.github/workflows/test_openvino_basic.yml @@ -24,7 +24,7 @@ jobs: # This also ensures that the test fails if dependencies break for Python 3.7 python-version: ["3.8", "3.12"] os: ["ubuntu-22.04", "windows-latest"] - transformers-version: ["4.44.*"] + transformers-version: ["4.45.*"] include: - python-version: "3.12" os: "ubuntu-22.04" @@ -47,6 +47,7 @@ jobs: # Install openvino manually to prevent dependency conflicts when .[openvino] pins # optimum or transformers to a specific version pip install .[tests] openvino + pip install transformers==${{ matrix.transformers-version }} - name: Pip freeze run: pip freeze From fc6703ab39bb591da3c49714794c3b2d0c00e138 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Thu, 26 Sep 2024 15:09:45 +0200 Subject: [PATCH 22/23] add warning for transformers --- optimum/intel/openvino/trainer.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/optimum/intel/openvino/trainer.py b/optimum/intel/openvino/trainer.py index 1a308eecf..a2f08b647 100644 --- a/optimum/intel/openvino/trainer.py +++ b/optimum/intel/openvino/trainer.py @@ -87,7 +87,7 @@ from optimum.exporters.onnx import OnnxConfig from ..utils.constant import 
_TASK_ALIASES -from ..utils.import_utils import is_transformers_version +from ..utils.import_utils import _transformers_version, is_transformers_version from .configuration import OVConfig from .quantization import OVDataLoader from .training_args import OVTrainingArguments @@ -215,6 +215,11 @@ def __init__( ): logger.warning("OVTrainer is deprecated and will be removed in optimum-intel v1.22.0.") + if is_transformers_version(">=", "4.45.0"): + logger.warning( + f"The transformers version found is {_transformers_version} which is not officially supported by the OVTrainer, use at your own risk" + ) + self.neftune_noise_alpha = None super().__init__( From fe952589b17acdd041b163281d4b03eeed565cc7 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Thu, 26 Sep 2024 16:47:01 +0200 Subject: [PATCH 23/23] update setup --- .github/workflows/test_openvino.yml | 2 +- .github/workflows/test_openvino_basic.yml | 2 +- setup.py | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 23ec08eae..335acf669 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -21,7 +21,7 @@ jobs: fail-fast: false matrix: python-version: ["3.8", "3.12"] - transformers-version: ["4.36.0", "4.45.*"] + transformers-version: ["4.36.0", "4.44.*"] os: [ubuntu-latest] runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_openvino_basic.yml b/.github/workflows/test_openvino_basic.yml index 2b372f1c0..ced98dd9a 100644 --- a/.github/workflows/test_openvino_basic.yml +++ b/.github/workflows/test_openvino_basic.yml @@ -24,7 +24,7 @@ jobs: # This also ensures that the test fails if dependencies break for Python 3.7 python-version: ["3.8", "3.12"] os: ["ubuntu-22.04", "windows-latest"] - transformers-version: ["4.45.*"] + transformers-version: ["4.44.*"] include: - python-version: "3.12" os: "ubuntu-22.04" diff --git a/setup.py b/setup.py index ba55b6ae6..d0d8e5215 100644 --- a/setup.py +++ b/setup.py @@ -29,8 +29,7 @@ INSTALL_REQUIRE = [ "torch>=1.11", "transformers>=4.36,<4.46", - "optimum @ git+https://github.com/huggingface/optimum.git@trfs-4.45", - # "optimum~=1.22", + "optimum~=1.22", "datasets>=1.4.0", "sentencepiece", "setuptools",
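
Editor's note (not part of the patches above): the change this series applies repeatedly — in `optimum/exporters/openvino/convert.py`, `optimum/intel/openvino/modeling_base.py`, `modeling_base_seq2seq.py`, and the neural-compressor modeling and trainer code — is the same transformers v4.45 migration: generation parameters that were set directly on the model config are moved onto the generation config, with the old config attribute cleared. The standalone sketch below isolates that pattern for reference. It assumes a transformers `PreTrainedModel`-like object exposing `config`, `generation_config`, `can_generate()`, and the private helper `config._get_non_default_generation_parameters()` (present in transformers >= 4.45); the wrapper function name itself is hypothetical and does not appear in the diffs.

    import logging

    logger = logging.getLogger(__name__)

    def move_misplaced_generation_parameters(model):
        # Parameters such as `max_length` or `num_beams` set directly on the
        # model config are considered "misplaced" as of transformers v4.45 and
        # belong on the generation config instead.
        misplaced = model.config._get_non_default_generation_parameters()
        if model.can_generate() and len(misplaced) > 0:
            logger.warning(
                "Moving the following attributes in the config to the generation config: "
                f"{misplaced}. You are seeing this warning because you've set "
                "generation parameters in the model config, as opposed to in the generation config.",
            )
            for param_name, param_value in misplaced.items():
                # Mirror the patches above: copy each value over, then clear it
                # on the model config so it is defined in only one place.
                setattr(model.generation_config, param_name, param_value)
                setattr(model.config, param_name, None)
        return model

The version guard used throughout the series, `is_transformers_version(">=", "4.44.99")`, is how optimum-intel matches transformers v4.45 pre-releases as well as the final release; the sketch above omits that check and should only be applied under it.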