From e6fb3a25c4c8698ea03e9d2ef0124061a6e237e6 Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Fri, 12 Jul 2024 17:31:50 +0800
Subject: [PATCH 1/5] support pt2e save/load

Signed-off-by: Kaihui-intel
---
 .../torch/algorithms/pt2e_quant/__init__.py   |  1 +
 .../torch/quantization/algorithm_entry.py     |  6 ++++++
 .../torch/quantization/load_entry.py          |  5 +++++
 test/3x/torch/quantization/test_pt2e_quant.py | 15 +++++++++++----
 4 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/neural_compressor/torch/algorithms/pt2e_quant/__init__.py b/neural_compressor/torch/algorithms/pt2e_quant/__init__.py
index b6187ba214a..ef0535637da 100644
--- a/neural_compressor/torch/algorithms/pt2e_quant/__init__.py
+++ b/neural_compressor/torch/algorithms/pt2e_quant/__init__.py
@@ -14,3 +14,4 @@
 
 from neural_compressor.torch.algorithms.pt2e_quant.core import W8A8PT2EQuantizer
+from .save_load import save, load
\ No newline at end of file
diff --git a/neural_compressor/torch/quantization/algorithm_entry.py b/neural_compressor/torch/quantization/algorithm_entry.py
index 856961af532..a4da3842f9f 100644
--- a/neural_compressor/torch/quantization/algorithm_entry.py
+++ b/neural_compressor/torch/quantization/algorithm_entry.py
@@ -210,6 +210,7 @@ def static_quant_entry(
 def pt2e_dynamic_quant_entry(model: torch.nn.Module, configs_mapping, mode: Mode, *args, **kwargs) -> torch.nn.Module:
     logger.info("Quantize model with the PT2E static quant algorithm.")
     from neural_compressor.torch.algorithms.pt2e_quant.core import W8A8PT2EQuantizer
+    from neural_compressor.torch.algorithms.pt2e_quant.save_load import save
 
     run_fn = kwargs.get("run_fn", None)
     example_inputs = kwargs.get("example_inputs", None)
@@ -221,6 +222,8 @@ def pt2e_dynamic_quant_entry(model: torch.nn.Module, configs_mapping, mode: Mode
     model = w8a8_quantizer.execute(
         model, mode=mode, run_fn=run_fn, example_inputs=example_inputs, inplace=inplace
     )
+    model.qconfig = configs_mapping
+    model.save = MethodType(save, model)
     return model
 
 
@@ -230,6 +233,7 @@ def pt2e_dynamic_quant_entry(model: torch.nn.Module, configs_mapping, mode: Mode
 def pt2e_static_quant_entry(model: torch.nn.Module, configs_mapping, mode: Mode, *args, **kwargs) -> torch.nn.Module:
     logger.info("Quantize model with the PT2E static quant algorithm.")
     from neural_compressor.torch.algorithms.pt2e_quant.core import W8A8PT2EQuantizer
+    from neural_compressor.torch.algorithms.pt2e_quant.save_load import save
 
     run_fn = kwargs.get("run_fn", None)
     example_inputs = kwargs.get("example_inputs", None)
@@ -240,6 +244,8 @@ def pt2e_static_quant_entry(model: torch.nn.Module, configs_mapping, mode: Mode,
     model = w8a8_quantizer.execute(
         model, mode=mode, run_fn=run_fn, example_inputs=example_inputs, inplace=inplace
     )
+    model.qconfig = configs_mapping
+    model.save = MethodType(save, model)
     return model
 
 
diff --git a/neural_compressor/torch/quantization/load_entry.py b/neural_compressor/torch/quantization/load_entry.py
index d20f828659d..9e43546f877 100644
--- a/neural_compressor/torch/quantization/load_entry.py
+++ b/neural_compressor/torch/quantization/load_entry.py
@@ -84,6 +84,10 @@ def load(model_name_or_path, original_model=None, format="default", device="cpu"
         from neural_compressor.torch.algorithms import static_quant
 
         return static_quant.load(model_name_or_path)
+    elif "static_quant" in per_op_qconfig.keys() or "pt2e_dynamic_quant" in per_op_qconfig.keys(): # PT2E
+        from neural_compressor.torch.algorithms import pt2e_quant
+
+        return pt2e_quant.load(model_name_or_path)
     else:
         config_mapping = load_config_mapping(qconfig_file_path, ConfigRegistry.get_all_configs()["torch"])
         # select load function
@@ -99,6 +103,7 @@ def load(model_name_or_path, original_model=None, format="default", device="cpu"
         from neural_compressor.torch.algorithms import habana_fp8
 
         return habana_fp8.load(model_name_or_path, original_model)
+
     elif format == LoadFormat.HUGGINGFACE.value:
         # now only support load huggingface WOQ causal language model
         from neural_compressor.torch.algorithms import weight_only
diff --git a/test/3x/torch/quantization/test_pt2e_quant.py b/test/3x/torch/quantization/test_pt2e_quant.py
index e2c643f07c6..6c610653bcd 100644
--- a/test/3x/torch/quantization/test_pt2e_quant.py
+++ b/test/3x/torch/quantization/test_pt2e_quant.py
@@ -1,8 +1,5 @@
-import os
-import unittest
-from unittest.mock import patch
-
 import pytest
+import shutil
 import torch
 import torch.testing._internal.common_quantization as torch_test_quant_common
 
@@ -33,6 +30,8 @@ def _is_ipex_imported():
 
 class TestPT2EQuantization:
+    def teardown_class(self):
+        shutil.rmtree("saved_results", ignore_errors=True)
 
     @staticmethod
     def get_toy_model():
@@ -114,6 +113,14 @@ def calib_fn(model):
         config.freezing = True
         q_model_out = q_model(*example_inputs)
         assert torch.allclose(float_model_output, q_model_out, atol=1e-2), "Quantization failed!"
+
+        # test save and load
+        q_model.save(example_inputs=example_inputs, output_dir="./saved_results",)
+        from neural_compressor.torch.quantization import load
+        loaded_quantized_model = load("./saved_results")
+        loaded_q_model_out = loaded_quantized_model(*example_inputs)
+        assert torch.allclose(loaded_q_model_out, q_model_out)
+
         opt_model = torch.compile(q_model)
         out = opt_model(*example_inputs)
         logger.warning("out shape is %s", out.shape)

From ff463a7f329bc23391d2d2ade6cab6d5cd726d3c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 12 Jul 2024 09:36:37 +0000
Subject: [PATCH 2/5] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../torch/algorithms/pt2e_quant/__init__.py        |  2 +-
 neural_compressor/torch/quantization/load_entry.py |  2 +-
 test/3x/torch/quantization/test_pt2e_quant.py      | 13 +++++++++----
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/neural_compressor/torch/algorithms/pt2e_quant/__init__.py b/neural_compressor/torch/algorithms/pt2e_quant/__init__.py
index ef0535637da..b3c530ce2fd 100644
--- a/neural_compressor/torch/algorithms/pt2e_quant/__init__.py
+++ b/neural_compressor/torch/algorithms/pt2e_quant/__init__.py
@@ -14,4 +14,4 @@
 
 from neural_compressor.torch.algorithms.pt2e_quant.core import W8A8PT2EQuantizer
-from .save_load import save, load
\ No newline at end of file
+from .save_load import save, load
diff --git a/neural_compressor/torch/quantization/load_entry.py b/neural_compressor/torch/quantization/load_entry.py
index 9e43546f877..641ec9e421f 100644
--- a/neural_compressor/torch/quantization/load_entry.py
+++ b/neural_compressor/torch/quantization/load_entry.py
@@ -84,7 +84,7 @@ def load(model_name_or_path, original_model=None, format="default", device="cpu"
         from neural_compressor.torch.algorithms import static_quant
 
         return static_quant.load(model_name_or_path)
-    elif "static_quant" in per_op_qconfig.keys() or "pt2e_dynamic_quant" in per_op_qconfig.keys(): # PT2E
+    elif "static_quant" in per_op_qconfig.keys() or "pt2e_dynamic_quant" in per_op_qconfig.keys():  # PT2E
         from neural_compressor.torch.algorithms import pt2e_quant
 
         return pt2e_quant.load(model_name_or_path)
diff --git a/test/3x/torch/quantization/test_pt2e_quant.py b/test/3x/torch/quantization/test_pt2e_quant.py
index 6c610653bcd..53cc45418e7 100644
--- a/test/3x/torch/quantization/test_pt2e_quant.py
+++ b/test/3x/torch/quantization/test_pt2e_quant.py
@@ -1,5 +1,6 @@
-import pytest
 import shutil
+
+import pytest
 import torch
 import torch.testing._internal.common_quantization as torch_test_quant_common
 
@@ -113,14 +114,18 @@ def calib_fn(model):
         config.freezing = True
         q_model_out = q_model(*example_inputs)
         assert torch.allclose(float_model_output, q_model_out, atol=1e-2), "Quantization failed!"
-
+
         # test save and load
-        q_model.save(example_inputs=example_inputs, output_dir="./saved_results",)
+        q_model.save(
+            example_inputs=example_inputs,
+            output_dir="./saved_results",
+        )
         from neural_compressor.torch.quantization import load
+
         loaded_quantized_model = load("./saved_results")
         loaded_q_model_out = loaded_quantized_model(*example_inputs)
         assert torch.allclose(loaded_q_model_out, q_model_out)
-
+
         opt_model = torch.compile(q_model)
         out = opt_model(*example_inputs)
         logger.warning("out shape is %s", out.shape)

From 434b4f211b1a73d93bf626529a9d1d77a9fd0e29 Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Mon, 15 Jul 2024 10:01:59 +0800
Subject: [PATCH 3/5] add save_load.py

Signed-off-by: Kaihui-intel
---
 .../torch/algorithms/pt2e_quant/save_load.py | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 neural_compressor/torch/algorithms/pt2e_quant/save_load.py

diff --git a/neural_compressor/torch/algorithms/pt2e_quant/save_load.py b/neural_compressor/torch/algorithms/pt2e_quant/save_load.py
new file mode 100644
index 00000000000..8b42d317bff
--- /dev/null
+++ b/neural_compressor/torch/algorithms/pt2e_quant/save_load.py
@@ -0,0 +1,24 @@
+import torch
+import os
+import json
+from neural_compressor.torch.utils import QCONFIG_NAME, WEIGHT_NAME, logger
+from neural_compressor.common.utils import load_config_mapping, save_config_mapping
+
+def save(model, example_inputs, output_dir="./saved_results"):
+    os.makedirs(output_dir, exist_ok=True)
+    qmodel_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), WEIGHT_NAME)
+    qconfig_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), QCONFIG_NAME)
+    quantized_ep = torch.export.export(model, example_inputs)
+    torch.export.save(quantized_ep, qmodel_file_path)
+    for key, op_config in model.qconfig.items():
+        model.qconfig[key] = op_config.to_dict()
+    with open(qconfig_file_path, "w") as f:
+        json.dump(model.qconfig, f, indent=4)
+
+    logger.info("Save quantized model to {}.".format(qmodel_file_path))
+    logger.info("Save configuration of quantized model to {}.".format(qconfig_file_path))
+
+def load(output_dir="./saved_results"):
+    qmodel_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), WEIGHT_NAME)
+    loaded_quantized_ep = torch.export.load(qmodel_file_path)
+    return loaded_quantized_ep.module()
\ No newline at end of file

From 7cb62f15ac769448cee5c8ba2b5357738f627673 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 15 Jul 2024 02:05:44 +0000
Subject: [PATCH 4/5] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../torch/algorithms/pt2e_quant/save_load.py | 28 +++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/neural_compressor/torch/algorithms/pt2e_quant/save_load.py b/neural_compressor/torch/algorithms/pt2e_quant/save_load.py
index 8b42d317bff..606c31f41c2 100644
--- a/neural_compressor/torch/algorithms/pt2e_quant/save_load.py
+++ b/neural_compressor/torch/algorithms/pt2e_quant/save_load.py
@@ -1,8 +1,25 @@
-import torch
-import os
+# Copyright (c) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import json
-from neural_compressor.torch.utils import QCONFIG_NAME, WEIGHT_NAME, logger
+import os
+
+import torch
+
 from neural_compressor.common.utils import load_config_mapping, save_config_mapping
+from neural_compressor.torch.utils import QCONFIG_NAME, WEIGHT_NAME, logger
+
 
 def save(model, example_inputs, output_dir="./saved_results"):
     os.makedirs(output_dir, exist_ok=True)
@@ -14,11 +31,12 @@ def save(model, example_inputs, output_dir="./saved_results"):
         model.qconfig[key] = op_config.to_dict()
     with open(qconfig_file_path, "w") as f:
         json.dump(model.qconfig, f, indent=4)
-
+
     logger.info("Save quantized model to {}.".format(qmodel_file_path))
     logger.info("Save configuration of quantized model to {}.".format(qconfig_file_path))
 
+
 def load(output_dir="./saved_results"):
     qmodel_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), WEIGHT_NAME)
     loaded_quantized_ep = torch.export.load(qmodel_file_path)
-    return loaded_quantized_ep.module()
\ No newline at end of file
+    return loaded_quantized_ep.module()

From 2708ed258f6d5212a5aedce97b79533914aea2fe Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Mon, 15 Jul 2024 13:40:32 +0800
Subject: [PATCH 5/5] enhance ut

Signed-off-by: Kaihui-intel
---
 test/3x/torch/quantization/test_pt2e_quant.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/3x/torch/quantization/test_pt2e_quant.py b/test/3x/torch/quantization/test_pt2e_quant.py
index 53cc45418e7..d55e9004a3a 100644
--- a/test/3x/torch/quantization/test_pt2e_quant.py
+++ b/test/3x/torch/quantization/test_pt2e_quant.py
@@ -124,7 +124,7 @@ def calib_fn(model):
 
         loaded_quantized_model = load("./saved_results")
         loaded_q_model_out = loaded_quantized_model(*example_inputs)
-        assert torch.allclose(loaded_q_model_out, q_model_out)
+        assert torch.equal(loaded_q_model_out, q_model_out)
 
         opt_model = torch.compile(q_model)
         out = opt_model(*example_inputs)
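
For reference, a minimal sketch of the save/load round trip this series wires up, built only on the public torch.export APIs that save_load.py calls. The ToyModel class, tensor shapes, and the qmodel.pt2 file name are illustrative assumptions, not code from the patches; the real save() additionally writes the qconfig mapping to JSON next to the serialized model.

import torch


# Hypothetical stand-in for the quantized GraphModule that save() receives.
class ToyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(8, 4)

    def forward(self, x):
        return self.fc(x)


model = ToyModel().eval()
example_inputs = (torch.randn(2, 8),)

# save() path: trace the module into an ExportedProgram, then serialize it,
# mirroring the torch.export.export + torch.export.save calls in save_load.py.
ep = torch.export.export(model, example_inputs)
torch.export.save(ep, "qmodel.pt2")  # assumed file name; save() uses WEIGHT_NAME

# load() path: deserialize the ExportedProgram and recover a callable module,
# mirroring the torch.export.load + .module() calls in save_load.py.
restored = torch.export.load("qmodel.pt2").module()

# Re-running the restored graph on the same inputs is deterministic, which is
# why patch 5 can tighten the unit test from torch.allclose to torch.equal.
assert torch.equal(restored(*example_inputs), model(*example_inputs))

Binding save onto the returned model with MethodType (patch 1) keeps the quantization entry points' signatures unchanged while still giving callers a one-line q_model.save(example_inputs=..., output_dir=...).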