From 363b9ac718bf17bb212fb1a7daf246b06c488712 Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Fri, 28 Jun 2024 03:44:53 -0400
Subject: [PATCH 1/3] refine hqq test

Signed-off-by: yiliu30
---
 .../weight_only/hqq/test_hqq_cuda.py      | 26 ++++++----------
 .../{hqq/test_hqq_cpu.py => tes_hqq.py}   | 30 ++++++++++++++-----
 2 files changed, 32 insertions(+), 24 deletions(-)
 rename test/3x/torch/quantization/weight_only/{hqq/test_hqq_cpu.py => tes_hqq.py} (88%)

diff --git a/test/3x/torch/quantization/weight_only/hqq/test_hqq_cuda.py b/test/3x/torch/quantization/weight_only/hqq/test_hqq_cuda.py
index 777daf0e60b..c47d7e93310 100644
--- a/test/3x/torch/quantization/weight_only/hqq/test_hqq_cuda.py
+++ b/test/3x/torch/quantization/weight_only/hqq/test_hqq_cuda.py
@@ -10,7 +10,9 @@
 from neural_compressor.torch.utils.auto_accelerator import auto_detect_accelerator
 
 
-def _common_cuda_test(nbits=4, group_size=64, quant_zero=True, quant_scale=False, scale_quant_group_size=128):
+def _common_hqq_test(
+    nbits=4, group_size=64, quant_zero=True, quant_scale=False, scale_quant_group_size=128, device=None
+):
     # Parse config
     weight_qconfig = QTensorConfig(
         nbits=nbits, channel_wise=True, group_size=group_size, optimize=True, round_zero=True if nbits == 4 else False
@@ -22,22 +24,17 @@ def _common_cuda_test(nbits=4, group_size=64, quant_zero=True, quant_scale=False
     if quant_scale:
         scale_qconfig = QTensorConfig(nbits=8, channel_wise=True, group_size=scale_quant_group_size, optimize=False)
     hqq_quant_config = HQQModuleConfig(weight=weight_qconfig, scale=scale_qconfig, zero=zero_qconfig)
-    device = torch.cuda.current_device()
 
     # Create HQQ Linear
     bs = 4
     in_features = 64
     out_features = 128
-    see_cuda_memory_usage(message="Before create float linear")
     float_linear = torch.nn.Linear(in_features=in_features, out_features=out_features)
     if hqq_global_option.use_half:
         float_linear = float_linear.half()
-    see_cuda_memory_usage(message="After create float linear")
     float_linear.to(device)
     float_linear_copy = deepcopy(float_linear)
-    see_cuda_memory_usage(message="After copy the float linear")
     hqq_linear = HQQLinear.from_float(float_linear_copy, quant_config=hqq_quant_config)
-    see_cuda_memory_usage(message="After create hqq linear")
 
     # Forward
     input = torch.randn(bs, in_features, device=device)
@@ -52,7 +49,6 @@ def _common_cuda_test(nbits=4, group_size=64, quant_zero=True, quant_scale=False
     torch.allclose(hqq_output, hqq_output_2)
     del float_linear, hqq_linear
     del float_output, hqq_output, hqq_output_2
-    see_cuda_memory_usage("At the end of test")
 
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires a GPU")
@@ -87,6 +83,7 @@ def test_hqq_quant(self):
             q_label_1.eq(q_label_2)
         ), "The results of calling `convert` + `prepare` and calling `quantize` should be equal."
 
+    @pytest.mark.parametrize("device_name", ["cuda", "cpu"])
     @pytest.mark.parametrize(
         "nbits, group_size, quant_zero, quant_scale, scale_quant_group_size",
         [
@@ -106,25 +103,20 @@ def test_hqq_quant(self):
     )
     def test_hqq_module_cuda(
         self,
+        device_name,
         nbits,
         group_size,
         quant_zero,
         quant_scale,
         scale_quant_group_size,
     ):
-        _common_cuda_test(
+        if device_name == "cuda" and not torch.cuda.is_available():
+            pytest.skip("Skipping CUDA test because cuda is not available")
+        _common_hqq_test(
             nbits=nbits,
             group_size=group_size,
             quant_zero=quant_zero,
             quant_scale=quant_scale,
             scale_quant_group_size=scale_quant_group_size,
+            device=torch.device(device_name),
         )
-
-
-# _common_cuda_test(
-#     nbits=4,
-#     group_size=64,
-#     quant_zero=False,
-#     quant_scale=False,
-#     scale_quant_group_size=128
-# )
diff --git a/test/3x/torch/quantization/weight_only/hqq/test_hqq_cpu.py b/test/3x/torch/quantization/weight_only/tes_hqq.py
similarity index 88%
rename from test/3x/torch/quantization/weight_only/hqq/test_hqq_cpu.py
rename to test/3x/torch/quantization/weight_only/tes_hqq.py
index 9a0290ffe29..1d68a553859 100644
--- a/test/3x/torch/quantization/weight_only/hqq/test_hqq_cpu.py
+++ b/test/3x/torch/quantization/weight_only/tes_hqq.py
@@ -6,6 +6,7 @@
 import transformers
 from transformers import AutoModelForCausalLM
 
+from neural_compressor.common.utils import logger
 from neural_compressor.torch.algorithms.weight_only.hqq.config import HQQModuleConfig, QTensorConfig, hqq_global_option
 from neural_compressor.torch.algorithms.weight_only.hqq.core import HQQLinear
 from neural_compressor.torch.quantization import HQQConfig, convert, get_default_hqq_config, prepare, quantize
@@ -14,7 +15,9 @@
 device = accelerator.current_device_name()
 
 
-def _common_cpu_test(nbits=4, group_size=64, quant_zero=True, quant_scale=False, scale_quant_group_size=128):
+def _common_hqq_test(
+    nbits=4, group_size=64, quant_zero=True, quant_scale=False, scale_quant_group_size=128, device=None
+):
     # Parse config
     weight_qconfig = QTensorConfig(
         nbits=nbits, channel_wise=True, group_size=group_size, optimize=True, round_zero=True if nbits == 4 else False
@@ -26,7 +29,6 @@ def _common_cpu_test(nbits=4, group_size=64, quant_zero=True, quant_scale=False,
     if quant_scale:
         scale_qconfig = QTensorConfig(nbits=8, channel_wise=True, group_size=scale_quant_group_size, optimize=False)
     hqq_quant_config = HQQModuleConfig(weight=weight_qconfig, scale=scale_qconfig, zero=zero_qconfig)
-    device = "cpu"
 
     # Create HQQ Linear
     bs = 4
@@ -34,7 +36,7 @@ def _common_cpu_test(nbits=4, group_size=64, quant_zero=True, quant_scale=False,
     out_features = 128
     float_linear = torch.nn.Linear(in_features=in_features, out_features=out_features)
     if hqq_global_option.use_half:
-        print(f"hqq_global_option use half: {hqq_global_option.use_half}")
+        logger.info(f"hqq_global_option use half: {hqq_global_option.use_half}")
         float_linear = float_linear.half()
     float_linear.to(device)
     float_linear_copy = deepcopy(float_linear)
@@ -54,7 +56,7 @@ def _common_cpu_test(nbits=4, group_size=64, quant_zero=True, quant_scale=False,
     del float_output, hqq_output, hqq_output_2
 
 
-class TestHQQCPU:
+class TestHQQ:
 
     @classmethod
     def setup_class(cls):
@@ -137,6 +139,7 @@ def test_quant_lm_head(self, force_use_cpu, force_not_half):
             id(model.model.decoder.embed_tokens.weight) == lm_head_id
         ), "The tied lm_head weight is not deep copied, please check!"
 
+    @pytest.mark.parametrize("device_name", ["cuda", "cpu"])
     @pytest.mark.parametrize(
         "nbits, group_size, quant_zero, quant_scale, scale_quant_group_size",
         [
@@ -155,13 +158,26 @@ def test_quant_lm_head(self, force_use_cpu, force_not_half):
             (4, -1, False, True, 64),
         ],
     )
-    def test_hqq_module_cpu(
-        self, force_use_cpu, force_not_half, nbits, group_size, quant_zero, quant_scale, scale_quant_group_size
+    def test_hqq_module(
+        self,
+        nbits,
+        group_size,
+        quant_zero,
+        quant_scale,
+        scale_quant_group_size,
+        device_name,
     ):
-        _common_cpu_test(
+        if device_name == "cuda" and not torch.cuda.is_available():
+            pytest.skip("Skipping CUDA test because cuda is not available")
+        if device_name == "cpu":
+            os.environ["FORCE_DEVICE"] = "cpu"
+            hqq_global_option.use_half = False
+
+        _common_hqq_test(
             nbits=nbits,
             group_size=group_size,
             quant_zero=quant_zero,
             quant_scale=quant_scale,
             scale_quant_group_size=scale_quant_group_size,
+            device=torch.device(device_name),
         )

From 9b6a51ad3471785b4e6c2fbbddb6d4c18ef9e9c3 Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Fri, 28 Jun 2024 15:47:33 +0800
Subject: [PATCH 2/3] remove cuda

Signed-off-by: yiliu30
---
 .../weight_only/hqq/test_hqq_cuda.py      | 122 ------------------
 1 file changed, 122 deletions(-)
 delete mode 100644 test/3x/torch/quantization/weight_only/hqq/test_hqq_cuda.py

diff --git a/test/3x/torch/quantization/weight_only/hqq/test_hqq_cuda.py b/test/3x/torch/quantization/weight_only/hqq/test_hqq_cuda.py
deleted file mode 100644
index c47d7e93310..00000000000
--- a/test/3x/torch/quantization/weight_only/hqq/test_hqq_cuda.py
+++ /dev/null
@@ -1,122 +0,0 @@
-from copy import deepcopy
-
-import pytest
-import torch
-from transformers import AutoModelForCausalLM
-
-from neural_compressor.torch.algorithms.weight_only.hqq.config import HQQModuleConfig, QTensorConfig, hqq_global_option
-from neural_compressor.torch.algorithms.weight_only.hqq.core import HQQLinear
-from neural_compressor.torch.algorithms.weight_only.hqq.utility import see_cuda_memory_usage
-from neural_compressor.torch.utils.auto_accelerator import auto_detect_accelerator
-
-
-def _common_hqq_test(
-    nbits=4, group_size=64, quant_zero=True, quant_scale=False, scale_quant_group_size=128, device=None
-):
-    # Parse config
-    weight_qconfig = QTensorConfig(
-        nbits=nbits, channel_wise=True, group_size=group_size, optimize=True, round_zero=True if nbits == 4 else False
-    )
-    zero_qconfig = None
-    if quant_zero:
-        zero_qconfig = QTensorConfig(nbits=8, channel_wise=False, group_size=None, optimize=False)
-    scale_qconfig = None
-    if quant_scale:
-        scale_qconfig = QTensorConfig(nbits=8, channel_wise=True, group_size=scale_quant_group_size, optimize=False)
-    hqq_quant_config = HQQModuleConfig(weight=weight_qconfig, scale=scale_qconfig, zero=zero_qconfig)
-
-    # Create HQQ Linear
-    bs = 4
-    in_features = 64
-    out_features = 128
-    float_linear = torch.nn.Linear(in_features=in_features, out_features=out_features)
-    if hqq_global_option.use_half:
-        float_linear = float_linear.half()
-    float_linear.to(device)
-    float_linear_copy = deepcopy(float_linear)
-    hqq_linear = HQQLinear.from_float(float_linear_copy, quant_config=hqq_quant_config)
-
-    # Forward
-    input = torch.randn(bs, in_features, device=device)
-    if hqq_global_option.use_half:
-        input = input.half()
-    float_output = float_linear(input)
-    input_for_hqq = deepcopy(input)
-    hqq_output = hqq_linear(input_for_hqq)
-    hqq_output_2 = hqq_linear(input_for_hqq)
-    float_qdq_diff = 0.1  # hard code it first
-    torch.allclose(float_output, hqq_output, atol=float_qdq_diff)
-    torch.allclose(hqq_output, hqq_output_2)
-    del float_linear, hqq_linear
-    del float_output, hqq_output, hqq_output_2
-
-
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires a GPU")
-class TestHQQCUDA:
-    @classmethod
-    def setup_class(cls):
-        torch.manual_seed(0)
-        torch.cuda.manual_seed(0)
-        hqq_global_option.use_half = True
-
-    def test_hqq_quant(self):
-        from neural_compressor.torch.quantization import convert, get_default_hqq_config, prepare, quantize
-
-        fp32_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
-        example_inputs = torch.tensor(
-            [[10, 20, 30, 40, 50, 60]], dtype=torch.long, device=auto_detect_accelerator().current_device()
-        )
-        # test_default_config
-        quant_config = get_default_hqq_config()
-
-        # prepare + convert API
-        model = prepare(deepcopy(fp32_model), quant_config)
-        model = convert(model)
-        q_label_1 = model(example_inputs)[0]
-
-        # quantize API
-        model = quantize(deepcopy(fp32_model), quant_config)
-        q_label_2 = model(example_inputs)[0]
-
-        # compare the results of calling `convert` + `prepare` and calling `quantize`
-        assert torch.all(
-            q_label_1.eq(q_label_2)
-        ), "The results of calling `convert` + `prepare` and calling `quantize` should be equal."
-
-    @pytest.mark.parametrize("device_name", ["cuda", "cpu"])
-    @pytest.mark.parametrize(
-        "nbits, group_size, quant_zero, quant_scale, scale_quant_group_size",
-        [
-            (4, 64, True, False, 128),
-            (4, 64, False, False, 128),
-            (4, 64, True, True, 128),
-            (4, 64, False, True, 128),
-            (8, 64, True, False, 128),
-            (8, 64, False, False, 128),
-            (8, 64, True, True, 128),
-            (8, 64, False, True, 128),
-            (4, 64, True, False, 64),
-            (4, 64, False, False, 64),
-            (4, 64, True, True, 64),
-            (4, 64, False, True, 64),
-        ],
-    )
-    def test_hqq_module_cuda(
-        self,
-        device_name,
-        nbits,
-        group_size,
-        quant_zero,
-        quant_scale,
-        scale_quant_group_size,
-    ):
-        if device_name == "cuda" and not torch.cuda.is_available():
-            pytest.skip("Skipping CUDA test because cuda is not available")
-        _common_hqq_test(
-            nbits=nbits,
-            group_size=group_size,
-            quant_zero=quant_zero,
-            quant_scale=quant_scale,
-            scale_quant_group_size=scale_quant_group_size,
-            device=torch.device(device_name),
-        )

From c9db050eedc5ac37e80b0faf725a9b354e1f373c Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Fri, 28 Jun 2024 18:00:02 +0800
Subject: [PATCH 3/3] correct name

Signed-off-by: yiliu30
---
 .../3x/torch/quantization/weight_only/{tes_hqq.py => test_hqq.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename test/3x/torch/quantization/weight_only/{tes_hqq.py => test_hqq.py} (100%)

diff --git a/test/3x/torch/quantization/weight_only/tes_hqq.py b/test/3x/torch/quantization/weight_only/test_hqq.py
similarity index 100%
rename from test/3x/torch/quantization/weight_only/tes_hqq.py
rename to test/3x/torch/quantization/weight_only/test_hqq.py
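
For reference, a minimal runnable sketch of the device-parametrization pattern this series converges on: one shared helper taking a device argument, plus a "device_name" parametrize that skips the CUDA case when no GPU is present. The helper and test names below are illustrative placeholders, not code from the patch.

import pytest
import torch


def _run_linear_on_device(device=None):
    # Build a small float linear layer and run one forward pass on `device`.
    linear = torch.nn.Linear(in_features=64, out_features=128).to(device)
    x = torch.randn(4, 64, device=device)
    assert linear(x).shape == (4, 128)


@pytest.mark.parametrize("device_name", ["cuda", "cpu"])
def test_linear_on_device(device_name):
    # The skip mirrors the guard added in the patch so CPU-only runners pass.
    if device_name == "cuda" and not torch.cuda.is_available():
        pytest.skip("Skipping CUDA test because cuda is not available")
    _run_linear_on_device(device=torch.device(device_name))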