set device as the same as origin model #1031

Merged 2 commits on Nov 26, 2024

Changes from all commits
2 changes: 1 addition & 1 deletion .github/workflows/test_ipex.yml
@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        transformers-version: ["4.46.*"]
+        transformers-version: ["4.46.0", "4.46.3"]
         torch-version: ["2.4.0", "2.5.0"]
 
     runs-on: ubuntu-22.04
35 changes: 10 additions & 25 deletions optimum/intel/ipex/modeling_base.py
@@ -38,7 +38,6 @@
     GenerationConfig,
     GenerationMixin,
     PretrainedConfig,
-    is_torch_xpu_available,
 )
 from transformers.dynamic_module_utils import get_class_from_dynamic_module
 from transformers.generation.candidate_generator import _crop_past_key_values
@@ -127,18 +126,9 @@ def __init__(
         warmup: bool = True,
         **kwargs,
     ):
-        if is_torch_xpu_available(check_device=True):
-            self._device = torch.device("xpu:0")
-        elif torch.cuda.is_available():
-            self._device = torch.device("cuda:0")
-        else:
-            self._device = torch.device("cpu")
 
         config = config or model.config
 
         OptimizedModel.__init__(self, model=model, config=config)
 
-        self.model.to(self._device)
         self._dtype = self.model.dtype if self.model.dtype is not None else torch.float32
         self.use_cache = kwargs.get("use_cache", False)
         self.model_save_dir = model_save_dir
@@ -174,7 +164,6 @@ def _from_pretrained(
         local_files_only: bool = False,
         torch_dtype: Optional[Union[str, "torch.dtype"]] = None,
         trust_remote_code: bool = False,
-        file_name: Optional[str] = WEIGHTS_NAME,
         **kwargs,
     ):
         """
@@ -207,9 +196,6 @@
                 float16 or bfloat16 or float32: load in a specified dtype, ignoring the model config.torch_dtype if one exists. If not specified, the model will get loaded in float32.
             trust_remote_code (`bool`, *optional*)
                 Allows to use custom code for the modeling hosted in the model repository. This option should only be set for repositories you trust and in which you have read the code, as it will execute on your local machine arbitrary code present in the model repository.
-            file_name (`str`, *optional*):
-                The file name of the model to load. Overwrites the default file name and allows one to load the model
-                with a different name.
         """
         if use_auth_token is not None:
             warnings.warn(
@@ -287,7 +273,7 @@ def eval(self):
 
     @property
     def device(self) -> torch.device:
-        return self._device
+        return self.model.device
 
     @property
     def dtype(self) -> torch.dtype:
@@ -305,8 +291,7 @@ def add_patch(self) -> bool:
         return self._add_patch
 
     def to(self, device: Union[torch.device, str]):
-        self._device = device if isinstance(device, torch.device) else torch.device(device)
-        self.model.to(self._device)
+        self.model.to(self.device)
         return self
 
     def can_generate(self):
@@ -323,8 +308,8 @@ def _init_warmup(self):
         if not self._add_patch:
             # use_cache = "past_key_values" in self.input_names
             dummy_inputs = _prepare_inputs_for_ipex_model(self, self.export_feature, self.use_cache)
-            if self._device.type != "cpu":
-                dummy_inputs = recursive_to_device(value=dummy_inputs, device=self._device)
+            if self.device.type != "cpu":
+                dummy_inputs = recursive_to_device(value=dummy_inputs, device=self.device)
             for _ in range(2):
                 self(**dummy_inputs)
 
@@ -526,15 +511,15 @@ def generate(self, *args, **kwargs):
             raise ValueError(
                 f"Assisted decoding is not supported for patched models if ipex < 2.4, support methods are {_IPEX_EXPORTED_GENERATION_METHODS}"
             )
-        # Patch functions to support paged cache
+        # Patch functions to support ipex_paged cache
         if self._add_patch:
-            transformers.generation.utils.NEED_SETUP_CACHE_CLASSES_MAPPING["paged"] = IPEXPagedCache
-            self.generation_config.cache_implementation = "paged"
+            transformers.generation.utils.NEED_SETUP_CACHE_CLASSES_MAPPING["ipex_paged"] = IPEXPagedCache
+            self.generation_config.cache_implementation = "ipex_paged"
             if is_transformers_version(">=", "4.45.0"):
-                if "paged" not in transformers.generation.configuration_utils.ALL_CACHE_IMPLEMENTATIONS:
-                    transformers.generation.configuration_utils.ALL_CACHE_IMPLEMENTATIONS.append("paged")
+                if "ipex_paged" not in transformers.generation.configuration_utils.ALL_CACHE_IMPLEMENTATIONS:
+                    transformers.generation.configuration_utils.ALL_CACHE_IMPLEMENTATIONS.append("ipex_paged")
             if new_kwargs.get("generation_config", None):
-                new_kwargs["generation_config"].cache_implementation = "paged"
+                new_kwargs["generation_config"].cache_implementation = "ipex_paged"
 
         if self._add_patch and new_kwargs.get("assistant_model", None):
             transformers.generation.utils._crop_past_key_values = _ipex_crop_past_key_values
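For context, a minimal sketch of the behavior this file's changes introduce, assuming an environment with this patch applied; the model id "gpt2" and the exact from_pretrained arguments are illustrative only and are not taken from the PR:

import torch
from optimum.intel import IPEXModelForCausalLM

# The wrapper no longer probes for XPU/CUDA itself at construction time.
model = IPEXModelForCausalLM.from_pretrained("gpt2")

# `device` is now forwarded from the wrapped transformers model, so it reports
# wherever the origin model already lives.
print(model.device)  # e.g. device(type='cpu') when the origin model is on CPU

# Moving the underlying model is reflected by the wrapper as well
# (assumption: an XPU or CUDA device is actually available on the machine):
# model.model.to("xpu")
# print(model.device)  # device(type='xpu', index=0)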
2 changes: 1 addition & 1 deletion setup.py
@@ -64,7 +64,7 @@
     "nncf": ["nncf>=2.11.0"],
     "openvino": ["nncf>=2.11.0", "openvino==2024.5.0", "openvino-tokenizers==2024.5.0"],
     "neural-compressor": ["neural-compressor[pt]>3.0", "accelerate", "transformers<4.46"],
-    "ipex": ["intel-extension-for-pytorch", "transformers>=4.39,<4.47"],
+    "ipex": ["intel-extension-for-pytorch>=2.4", "transformers>4.45,<4.47"],
     "diffusers": ["diffusers"],
     "quality": QUALITY_REQUIRE,
     "tests": TESTS_REQUIRE,
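The "ipex" extra is tightened here to require intel-extension-for-pytorch 2.4 or newer and a transformers release strictly between 4.45 and 4.47. A small sketch (not part of the PR) of how an installed environment could be checked against those bounds, assuming the packaging library is available:

from importlib.metadata import version
from packaging.version import Version

# Illustrative only: verify the installed stack satisfies the new "ipex" extra.
ipex_ok = Version(version("intel-extension-for-pytorch")) >= Version("2.4")
transformers_ok = Version("4.45") < Version(version("transformers")) < Version("4.47")
print(f"ipex>=2.4: {ipex_ok}, 4.45<transformers<4.47: {transformers_ok}")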
3 changes: 1 addition & 2 deletions tests/neural_compressor/test_ipex.py
@@ -17,7 +17,6 @@
 
 import os
 import tempfile
-import unittest
 
 from neural_compressor.config import PostTrainingQuantConfig
 
@@ -53,7 +52,7 @@ class IPEXQuantizationTest(INCTestMixin):
     SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = (("text-classification", "bert", 21),)
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS)
-    def test_ipex_static_quantization_with_smoothquant(self, task, model_arch, expected_quantized_matmuls):
+    def test_static_quantization_with_smoothquant(self, task, model_arch, expected_quantized_matmuls):
         recipes = {"smooth_quant": True, "smooth_quant_args": {"alpha": 0.5}}
         num_samples = 10
         model_name = MODEL_NAMES[model_arch]
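The renamed test still covers IPEX static quantization with SmoothQuant via neural-compressor. As a rough sketch of the configuration it exercises, assuming a recent neural-compressor release (the exact PostTrainingQuantConfig keywords may differ between versions):

from neural_compressor.config import PostTrainingQuantConfig

# Illustrative reconstruction of the quantization setup used by the test:
# static post-training quantization on the IPEX backend with SmoothQuant enabled.
recipes = {"smooth_quant": True, "smooth_quant_args": {"alpha": 0.5}}
quantization_config = PostTrainingQuantConfig(approach="static", backend="ipex", recipes=recipes)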