From e6fb3a25c4c8698ea03e9d2ef0124061a6e237e6 Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Fri, 12 Jul 2024 17:31:50 +0800
Subject: [PATCH 1/5] support pt2e save/load

Signed-off-by: Kaihui-intel
---
 .../torch/algorithms/pt2e_quant/__init__.py   |  1 +
 .../torch/quantization/algorithm_entry.py     |  6 ++++++
 .../torch/quantization/load_entry.py          |  5 +++++
 test/3x/torch/quantization/test_pt2e_quant.py | 15 +++++++++++----
 4 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/neural_compressor/torch/algorithms/pt2e_quant/__init__.py b/neural_compressor/torch/algorithms/pt2e_quant/__init__.py
index b6187ba214a..ef0535637da 100644
--- a/neural_compressor/torch/algorithms/pt2e_quant/__init__.py
+++ b/neural_compressor/torch/algorithms/pt2e_quant/__init__.py
@@ -14,3 +14,4 @@
 
 from neural_compressor.torch.algorithms.pt2e_quant.core import W8A8PT2EQuantizer
+from .save_load import save, load
\ No newline at end of file
diff --git a/neural_compressor/torch/quantization/algorithm_entry.py b/neural_compressor/torch/quantization/algorithm_entry.py
index 856961af532..a4da3842f9f 100644
--- a/neural_compressor/torch/quantization/algorithm_entry.py
+++ b/neural_compressor/torch/quantization/algorithm_entry.py
@@ -210,6 +210,7 @@ def static_quant_entry(
 def pt2e_dynamic_quant_entry(model: torch.nn.Module, configs_mapping, mode: Mode, *args, **kwargs) -> torch.nn.Module:
     logger.info("Quantize model with the PT2E static quant algorithm.")
     from neural_compressor.torch.algorithms.pt2e_quant.core import W8A8PT2EQuantizer
+    from neural_compressor.torch.algorithms.pt2e_quant.save_load import save
 
     run_fn = kwargs.get("run_fn", None)
     example_inputs = kwargs.get("example_inputs", None)
@@ -221,6 +222,8 @@ def pt2e_dynamic_quant_entry(model: torch.nn.Module, configs_mapping, mode: Mode
     model = w8a8_quantizer.execute(
         model, mode=mode, run_fn=run_fn, example_inputs=example_inputs, inplace=inplace
     )
+    model.qconfig = configs_mapping
+    model.save = MethodType(save, model)
     return model
 
 
@@ -230,6 +233,7 @@ def pt2e_dynamic_quant_entry(model: torch.nn.Module, configs_mapping, mode: Mode
 def pt2e_static_quant_entry(model: torch.nn.Module, configs_mapping, mode: Mode, *args, **kwargs) -> torch.nn.Module:
     logger.info("Quantize model with the PT2E static quant algorithm.")
     from neural_compressor.torch.algorithms.pt2e_quant.core import W8A8PT2EQuantizer
+    from neural_compressor.torch.algorithms.pt2e_quant.save_load import save
 
     run_fn = kwargs.get("run_fn", None)
     example_inputs = kwargs.get("example_inputs", None)
@@ -240,6 +244,8 @@ def pt2e_static_quant_entry(model: torch.nn.Module, configs_mapping, mode: Mode,
     model = w8a8_quantizer.execute(
         model, mode=mode, run_fn=run_fn, example_inputs=example_inputs, inplace=inplace
     )
+    model.qconfig = configs_mapping
+    model.save = MethodType(save, model)
     return model
 
 
diff --git a/neural_compressor/torch/quantization/load_entry.py b/neural_compressor/torch/quantization/load_entry.py
index d20f828659d..9e43546f877 100644
--- a/neural_compressor/torch/quantization/load_entry.py
+++ b/neural_compressor/torch/quantization/load_entry.py
@@ -84,6 +84,10 @@ def load(model_name_or_path, original_model=None, format="default", device="cpu"
         from neural_compressor.torch.algorithms import static_quant
 
         return static_quant.load(model_name_or_path)
+    elif "static_quant" in per_op_qconfig.keys() or "pt2e_dynamic_quant" in per_op_qconfig.keys(): # PT2E
+        from neural_compressor.torch.algorithms import pt2e_quant
+
+        return pt2e_quant.load(model_name_or_path)
     else:
         config_mapping = load_config_mapping(qconfig_file_path, ConfigRegistry.get_all_configs()["torch"])
         # select load function
@@ -99,6 +103,7 @@ def load(model_name_or_path, original_model=None, format="default", device="cpu"
         from neural_compressor.torch.algorithms import habana_fp8
 
         return habana_fp8.load(model_name_or_path, original_model)
+
     elif format == LoadFormat.HUGGINGFACE.value:
         # now only support load huggingface WOQ causal language model
         from neural_compressor.torch.algorithms import weight_only
diff --git a/test/3x/torch/quantization/test_pt2e_quant.py b/test/3x/torch/quantization/test_pt2e_quant.py
index e2c643f07c6..6c610653bcd 100644
--- a/test/3x/torch/quantization/test_pt2e_quant.py
+++ b/test/3x/torch/quantization/test_pt2e_quant.py
@@ -1,8 +1,5 @@
-import os
-import unittest
-from unittest.mock import patch
-
 import pytest
+import shutil
 import torch
 import torch.testing._internal.common_quantization as torch_test_quant_common
 
@@ -33,6 +30,8 @@ def _is_ipex_imported():
 
 class TestPT2EQuantization:
+    def teardown_class(self):
+        shutil.rmtree("saved_results", ignore_errors=True)
 
     @staticmethod
     def get_toy_model():
@@ -114,6 +113,14 @@ def calib_fn(model):
         config.freezing = True
         q_model_out = q_model(*example_inputs)
         assert torch.allclose(float_model_output, q_model_out, atol=1e-2), "Quantization failed!"
+
+        # test save and load
+        q_model.save(example_inputs=example_inputs, output_dir="./saved_results",)
+        from neural_compressor.torch.quantization import load
+        loaded_quantized_model = load("./saved_results")
+        loaded_q_model_out = loaded_quantized_model(*example_inputs)
+        assert torch.allclose(loaded_q_model_out, q_model_out)
+
         opt_model = torch.compile(q_model)
         out = opt_model(*example_inputs)
         logger.warning("out shape is %s", out.shape)

From ff463a7f329bc23391d2d2ade6cab6d5cd726d3c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 12 Jul 2024 09:36:37 +0000
Subject: [PATCH 2/5] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../torch/algorithms/pt2e_quant/__init__.py        |  2 +-
 neural_compressor/torch/quantization/load_entry.py |  2 +-
 test/3x/torch/quantization/test_pt2e_quant.py      | 13 +++++++++----
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/neural_compressor/torch/algorithms/pt2e_quant/__init__.py b/neural_compressor/torch/algorithms/pt2e_quant/__init__.py
index ef0535637da..b3c530ce2fd 100644
--- a/neural_compressor/torch/algorithms/pt2e_quant/__init__.py
+++ b/neural_compressor/torch/algorithms/pt2e_quant/__init__.py
@@ -14,4 +14,4 @@
 
 from neural_compressor.torch.algorithms.pt2e_quant.core import W8A8PT2EQuantizer
-from .save_load import save, load
\ No newline at end of file
+from .save_load import save, load
diff --git a/neural_compressor/torch/quantization/load_entry.py b/neural_compressor/torch/quantization/load_entry.py
index 9e43546f877..641ec9e421f 100644
--- a/neural_compressor/torch/quantization/load_entry.py
+++ b/neural_compressor/torch/quantization/load_entry.py
@@ -84,7 +84,7 @@ def load(model_name_or_path, original_model=None, format="default", device="cpu"
         from neural_compressor.torch.algorithms import static_quant
 
         return static_quant.load(model_name_or_path)
-    elif "static_quant" in per_op_qconfig.keys() or "pt2e_dynamic_quant" in per_op_qconfig.keys(): # PT2E
+    elif "static_quant" in per_op_qconfig.keys() or "pt2e_dynamic_quant" in per_op_qconfig.keys():  # PT2E
         from neural_compressor.torch.algorithms import pt2e_quant
 
         return pt2e_quant.load(model_name_or_path)
diff --git a/test/3x/torch/quantization/test_pt2e_quant.py b/test/3x/torch/quantization/test_pt2e_quant.py
index 6c610653bcd..53cc45418e7 100644
--- a/test/3x/torch/quantization/test_pt2e_quant.py
+++ b/test/3x/torch/quantization/test_pt2e_quant.py
@@ -1,5 +1,6 @@
-import pytest
 import shutil
+
+import pytest
 import torch
 import torch.testing._internal.common_quantization as torch_test_quant_common
 
@@ -113,14 +114,18 @@ def calib_fn(model):
         config.freezing = True
         q_model_out = q_model(*example_inputs)
         assert torch.allclose(float_model_output, q_model_out, atol=1e-2), "Quantization failed!"
-
+
         # test save and load
-        q_model.save(example_inputs=example_inputs, output_dir="./saved_results",)
+        q_model.save(
+            example_inputs=example_inputs,
+            output_dir="./saved_results",
+        )
         from neural_compressor.torch.quantization import load
+
         loaded_quantized_model = load("./saved_results")
         loaded_q_model_out = loaded_quantized_model(*example_inputs)
         assert torch.allclose(loaded_q_model_out, q_model_out)
-
+
         opt_model = torch.compile(q_model)
         out = opt_model(*example_inputs)
         logger.warning("out shape is %s", out.shape)

From 434b4f211b1a73d93bf626529a9d1d77a9fd0e29 Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Mon, 15 Jul 2024 10:01:59 +0800
Subject: [PATCH 3/5] add save_load.py

Signed-off-by: Kaihui-intel
---
 .../torch/algorithms/pt2e_quant/save_load.py | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 neural_compressor/torch/algorithms/pt2e_quant/save_load.py

diff --git a/neural_compressor/torch/algorithms/pt2e_quant/save_load.py b/neural_compressor/torch/algorithms/pt2e_quant/save_load.py
new file mode 100644
index 00000000000..8b42d317bff
--- /dev/null
+++ b/neural_compressor/torch/algorithms/pt2e_quant/save_load.py
@@ -0,0 +1,24 @@
+import torch
+import os
+import json
+from neural_compressor.torch.utils import QCONFIG_NAME, WEIGHT_NAME, logger
+from neural_compressor.common.utils import load_config_mapping, save_config_mapping
+
+def save(model, example_inputs, output_dir="./saved_results"):
+    os.makedirs(output_dir, exist_ok=True)
+    qmodel_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), WEIGHT_NAME)
+    qconfig_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), QCONFIG_NAME)
+    quantized_ep = torch.export.export(model, example_inputs)
+    torch.export.save(quantized_ep, qmodel_file_path)
+    for key, op_config in model.qconfig.items():
+        model.qconfig[key] = op_config.to_dict()
+    with open(qconfig_file_path, "w") as f:
+        json.dump(model.qconfig, f, indent=4)
+
+    logger.info("Save quantized model to {}.".format(qmodel_file_path))
+    logger.info("Save configuration of quantized model to {}.".format(qconfig_file_path))
+
+def load(output_dir="./saved_results"):
+    qmodel_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), WEIGHT_NAME)
+    loaded_quantized_ep = torch.export.load(qmodel_file_path)
+    return loaded_quantized_ep.module()
\ No newline at end of file

From 7cb62f15ac769448cee5c8ba2b5357738f627673 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 15 Jul 2024 02:05:44 +0000
Subject: [PATCH 4/5] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../torch/algorithms/pt2e_quant/save_load.py | 28 +++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/neural_compressor/torch/algorithms/pt2e_quant/save_load.py b/neural_compressor/torch/algorithms/pt2e_quant/save_load.py
index 8b42d317bff..606c31f41c2 100644
--- a/neural_compressor/torch/algorithms/pt2e_quant/save_load.py
+++ b/neural_compressor/torch/algorithms/pt2e_quant/save_load.py
@@ -1,8 +1,25 @@
-import torch
-import os
+# Copyright (c) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import json
-from neural_compressor.torch.utils import QCONFIG_NAME, WEIGHT_NAME, logger
+import os
+
+import torch
+
 from neural_compressor.common.utils import load_config_mapping, save_config_mapping
+from neural_compressor.torch.utils import QCONFIG_NAME, WEIGHT_NAME, logger
+
 
 def save(model, example_inputs, output_dir="./saved_results"):
     os.makedirs(output_dir, exist_ok=True)
@@ -14,11 +31,12 @@ def save(model, example_inputs, output_dir="./saved_results"):
         model.qconfig[key] = op_config.to_dict()
     with open(qconfig_file_path, "w") as f:
         json.dump(model.qconfig, f, indent=4)
-
+
     logger.info("Save quantized model to {}.".format(qmodel_file_path))
     logger.info("Save configuration of quantized model to {}.".format(qconfig_file_path))
 
+
 def load(output_dir="./saved_results"):
     qmodel_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), WEIGHT_NAME)
     loaded_quantized_ep = torch.export.load(qmodel_file_path)
-    return loaded_quantized_ep.module()
\ No newline at end of file
+    return loaded_quantized_ep.module()

From 2708ed258f6d5212a5aedce97b79533914aea2fe Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Mon, 15 Jul 2024 13:40:32 +0800
Subject: [PATCH 5/5] enhance ut

Signed-off-by: Kaihui-intel
---
 test/3x/torch/quantization/test_pt2e_quant.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/3x/torch/quantization/test_pt2e_quant.py b/test/3x/torch/quantization/test_pt2e_quant.py
index 53cc45418e7..d55e9004a3a 100644
--- a/test/3x/torch/quantization/test_pt2e_quant.py
+++ b/test/3x/torch/quantization/test_pt2e_quant.py
@@ -124,7 +124,7 @@ def calib_fn(model):
 
         loaded_quantized_model = load("./saved_results")
         loaded_q_model_out = loaded_quantized_model(*example_inputs)
-        assert torch.allclose(loaded_q_model_out, q_model_out)
+        assert torch.equal(loaded_q_model_out, q_model_out)
 
         opt_model = torch.compile(q_model)
         out = opt_model(*example_inputs)
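
For reference, a minimal sketch of the save/load round trip this series wires up, built only on the public torch.export APIs that save_load.py calls. The ToyModel class, tensor shapes, and the qmodel.pt2 file name are illustrative assumptions, not code from the patches; the real save() additionally writes the qconfig mapping to JSON next to the serialized model.

import torch


# Hypothetical stand-in for the quantized GraphModule that save() receives.
class ToyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(8, 4)

    def forward(self, x):
        return self.fc(x)


model = ToyModel().eval()
example_inputs = (torch.randn(2, 8),)

# save() path: trace the module into an ExportedProgram, then serialize it,
# mirroring the torch.export.export + torch.export.save calls in save_load.py.
ep = torch.export.export(model, example_inputs)
torch.export.save(ep, "qmodel.pt2")  # assumed file name; save() uses WEIGHT_NAME

# load() path: deserialize the ExportedProgram and recover a callable module,
# mirroring the torch.export.load + .module() calls in save_load.py.
restored = torch.export.load("qmodel.pt2").module()

# Re-running the restored graph on the same inputs is deterministic, which is
# why patch 5 can tighten the unit test from torch.allclose to torch.equal.
assert torch.equal(restored(*example_inputs), model(*example_inputs))

Binding save onto the returned model with MethodType (patch 1) keeps the quantization entry points' signatures unchanged while still giving callers a one-line q_model.save(example_inputs=..., output_dir=...).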