From 363b9ac718bf17bb212fb1a7daf246b06c488712 Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Fri, 28 Jun 2024 03:44:53 -0400
Subject: [PATCH 1/3] refine hqq test

Signed-off-by: yiliu30
---
 .../weight_only/hqq/test_hqq_cuda.py      | 26 ++++++----------
 .../{hqq/test_hqq_cpu.py => tes_hqq.py}   | 30 ++++++++++++++-----
 2 files changed, 32 insertions(+), 24 deletions(-)
 rename test/3x/torch/quantization/weight_only/{hqq/test_hqq_cpu.py => tes_hqq.py} (88%)

diff --git a/test/3x/torch/quantization/weight_only/hqq/test_hqq_cuda.py b/test/3x/torch/quantization/weight_only/hqq/test_hqq_cuda.py
index 777daf0e60b..c47d7e93310 100644
--- a/test/3x/torch/quantization/weight_only/hqq/test_hqq_cuda.py
+++ b/test/3x/torch/quantization/weight_only/hqq/test_hqq_cuda.py
@@ -10,7 +10,9 @@
 from neural_compressor.torch.utils.auto_accelerator import auto_detect_accelerator
 
 
-def _common_cuda_test(nbits=4, group_size=64, quant_zero=True, quant_scale=False, scale_quant_group_size=128):
+def _common_hqq_test(
+    nbits=4, group_size=64, quant_zero=True, quant_scale=False, scale_quant_group_size=128, device=None
+):
     # Parse config
     weight_qconfig = QTensorConfig(
         nbits=nbits, channel_wise=True, group_size=group_size, optimize=True, round_zero=True if nbits == 4 else False
@@ -22,22 +24,17 @@ def _common_cuda_test(nbits=4, group_size=64, quant_zero=True, quant_scale=False
     if quant_scale:
         scale_qconfig = QTensorConfig(nbits=8, channel_wise=True, group_size=scale_quant_group_size, optimize=False)
     hqq_quant_config = HQQModuleConfig(weight=weight_qconfig, scale=scale_qconfig, zero=zero_qconfig)
-    device = torch.cuda.current_device()
 
     # Create HQQ Linear
     bs = 4
     in_features = 64
     out_features = 128
-    see_cuda_memory_usage(message="Before create float linear")
     float_linear = torch.nn.Linear(in_features=in_features, out_features=out_features)
     if hqq_global_option.use_half:
         float_linear = float_linear.half()
-    see_cuda_memory_usage(message="After create float linear")
     float_linear.to(device)
     float_linear_copy = deepcopy(float_linear)
-    see_cuda_memory_usage(message="After copy the float linear")
     hqq_linear = HQQLinear.from_float(float_linear_copy, quant_config=hqq_quant_config)
-    see_cuda_memory_usage(message="After create hqq linear")
 
     # Forward
     input = torch.randn(bs, in_features, device=device)
@@ -52,7 +49,6 @@ def _common_cuda_test(nbits=4, group_size=64, quant_zero=True, quant_scale=False
     torch.allclose(hqq_output, hqq_output_2)
     del float_linear, hqq_linear
     del float_output, hqq_output, hqq_output_2
-    see_cuda_memory_usage("At the end of test")
 
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires a GPU")
@@ -87,6 +83,7 @@ def test_hqq_quant(self):
             q_label_1.eq(q_label_2)
         ), "The results of calling `convert` + `prepare` and calling `quantize` should be equal."
 
+    @pytest.mark.parametrize("device_name", ["cuda", "cpu"])
     @pytest.mark.parametrize(
         "nbits, group_size, quant_zero, quant_scale, scale_quant_group_size",
         [
@@ -106,25 +103,20 @@ def test_hqq_quant(self):
     )
     def test_hqq_module_cuda(
         self,
+        device_name,
         nbits,
         group_size,
         quant_zero,
         quant_scale,
         scale_quant_group_size,
     ):
-        _common_cuda_test(
+        if device_name == "cuda" and not torch.cuda.is_available():
+            pytest.skip("Skipping CUDA test because cuda is not available")
+        _common_hqq_test(
             nbits=nbits,
             group_size=group_size,
             quant_zero=quant_zero,
             quant_scale=quant_scale,
             scale_quant_group_size=scale_quant_group_size,
+            device=torch.device(device_name),
         )
-
-
-# _common_cuda_test(
-#     nbits=4,
-#     group_size=64,
-#     quant_zero=False,
-#     quant_scale=False,
-#     scale_quant_group_size=128
-# )
diff --git a/test/3x/torch/quantization/weight_only/hqq/test_hqq_cpu.py b/test/3x/torch/quantization/weight_only/tes_hqq.py
similarity index 88%
rename from test/3x/torch/quantization/weight_only/hqq/test_hqq_cpu.py
rename to test/3x/torch/quantization/weight_only/tes_hqq.py
index 9a0290ffe29..1d68a553859 100644
--- a/test/3x/torch/quantization/weight_only/hqq/test_hqq_cpu.py
+++ b/test/3x/torch/quantization/weight_only/tes_hqq.py
@@ -6,6 +6,7 @@
 import transformers
 from transformers import AutoModelForCausalLM
 
+from neural_compressor.common.utils import logger
 from neural_compressor.torch.algorithms.weight_only.hqq.config import HQQModuleConfig, QTensorConfig, hqq_global_option
 from neural_compressor.torch.algorithms.weight_only.hqq.core import HQQLinear
 from neural_compressor.torch.quantization import HQQConfig, convert, get_default_hqq_config, prepare, quantize
@@ -14,7 +15,9 @@
 device = accelerator.current_device_name()
 
 
-def _common_cpu_test(nbits=4, group_size=64, quant_zero=True, quant_scale=False, scale_quant_group_size=128):
+def _common_hqq_test(
+    nbits=4, group_size=64, quant_zero=True, quant_scale=False, scale_quant_group_size=128, device=None
+):
     # Parse config
     weight_qconfig = QTensorConfig(
         nbits=nbits, channel_wise=True, group_size=group_size, optimize=True, round_zero=True if nbits == 4 else False
@@ -26,7 +29,6 @@ def _common_cpu_test(nbits=4, group_size=64, quant_zero=True, quant_scale=False,
     if quant_scale:
         scale_qconfig = QTensorConfig(nbits=8, channel_wise=True, group_size=scale_quant_group_size, optimize=False)
     hqq_quant_config = HQQModuleConfig(weight=weight_qconfig, scale=scale_qconfig, zero=zero_qconfig)
-    device = "cpu"
 
     # Create HQQ Linear
     bs = 4
@@ -34,7 +36,7 @@ def _common_cpu_test(nbits=4, group_size=64, quant_zero=True, quant_scale=False,
     out_features = 128
     float_linear = torch.nn.Linear(in_features=in_features, out_features=out_features)
     if hqq_global_option.use_half:
-        print(f"hqq_global_option use half: {hqq_global_option.use_half}")
+        logger.info(f"hqq_global_option use half: {hqq_global_option.use_half}")
         float_linear = float_linear.half()
     float_linear.to(device)
     float_linear_copy = deepcopy(float_linear)
@@ -54,7 +56,7 @@ def _common_cpu_test(nbits=4, group_size=64, quant_zero=True, quant_scale=False,
     del float_output, hqq_output, hqq_output_2
 
 
-class TestHQQCPU:
+class TestHQQ:
 
     @classmethod
     def setup_class(cls):
@@ -137,6 +139,7 @@ def test_quant_lm_head(self, force_use_cpu, force_not_half):
             id(model.model.decoder.embed_tokens.weight) == lm_head_id
         ), "The tied lm_head weight is not deep copied, please check!"
 
+    @pytest.mark.parametrize("device_name", ["cuda", "cpu"])
     @pytest.mark.parametrize(
         "nbits, group_size, quant_zero, quant_scale, scale_quant_group_size",
         [
@@ -155,13 +158,26 @@ def test_quant_lm_head(self, force_use_cpu, force_not_half):
             (4, -1, False, True, 64),
         ],
     )
-    def test_hqq_module_cpu(
-        self, force_use_cpu, force_not_half, nbits, group_size, quant_zero, quant_scale, scale_quant_group_size
+    def test_hqq_module(
+        self,
+        nbits,
+        group_size,
+        quant_zero,
+        quant_scale,
+        scale_quant_group_size,
+        device_name,
     ):
-        _common_cpu_test(
+        if device_name == "cuda" and not torch.cuda.is_available():
+            pytest.skip("Skipping CUDA test because cuda is not available")
+        if device_name == "cpu":
+            os.environ["FORCE_DEVICE"] = "cpu"
+            hqq_global_option.use_half = False
+
+        _common_hqq_test(
             nbits=nbits,
             group_size=group_size,
             quant_zero=quant_zero,
             quant_scale=quant_scale,
             scale_quant_group_size=scale_quant_group_size,
+            device=torch.device(device_name),
         )

From 9b6a51ad3471785b4e6c2fbbddb6d4c18ef9e9c3 Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Fri, 28 Jun 2024 15:47:33 +0800
Subject: [PATCH 2/3] remove cuda

Signed-off-by: yiliu30
---
 .../weight_only/hqq/test_hqq_cuda.py      | 122 ------------------
 1 file changed, 122 deletions(-)
 delete mode 100644 test/3x/torch/quantization/weight_only/hqq/test_hqq_cuda.py

diff --git a/test/3x/torch/quantization/weight_only/hqq/test_hqq_cuda.py b/test/3x/torch/quantization/weight_only/hqq/test_hqq_cuda.py
deleted file mode 100644
index c47d7e93310..00000000000
--- a/test/3x/torch/quantization/weight_only/hqq/test_hqq_cuda.py
+++ /dev/null
@@ -1,122 +0,0 @@
-from copy import deepcopy
-
-import pytest
-import torch
-from transformers import AutoModelForCausalLM
-
-from neural_compressor.torch.algorithms.weight_only.hqq.config import HQQModuleConfig, QTensorConfig, hqq_global_option
-from neural_compressor.torch.algorithms.weight_only.hqq.core import HQQLinear
-from neural_compressor.torch.algorithms.weight_only.hqq.utility import see_cuda_memory_usage
-from neural_compressor.torch.utils.auto_accelerator import auto_detect_accelerator
-
-
-def _common_hqq_test(
-    nbits=4, group_size=64, quant_zero=True, quant_scale=False, scale_quant_group_size=128, device=None
-):
-    # Parse config
-    weight_qconfig = QTensorConfig(
-        nbits=nbits, channel_wise=True, group_size=group_size, optimize=True, round_zero=True if nbits == 4 else False
-    )
-    zero_qconfig = None
-    if quant_zero:
-        zero_qconfig = QTensorConfig(nbits=8, channel_wise=False, group_size=None, optimize=False)
-    scale_qconfig = None
-    if quant_scale:
-        scale_qconfig = QTensorConfig(nbits=8, channel_wise=True, group_size=scale_quant_group_size, optimize=False)
-    hqq_quant_config = HQQModuleConfig(weight=weight_qconfig, scale=scale_qconfig, zero=zero_qconfig)
-
-    # Create HQQ Linear
-    bs = 4
-    in_features = 64
-    out_features = 128
-    float_linear = torch.nn.Linear(in_features=in_features, out_features=out_features)
-    if hqq_global_option.use_half:
-        float_linear = float_linear.half()
-    float_linear.to(device)
-    float_linear_copy = deepcopy(float_linear)
-    hqq_linear = HQQLinear.from_float(float_linear_copy, quant_config=hqq_quant_config)
-
-    # Forward
-    input = torch.randn(bs, in_features, device=device)
-    if hqq_global_option.use_half:
-        input = input.half()
-    float_output = float_linear(input)
-    input_for_hqq = deepcopy(input)
-    hqq_output = hqq_linear(input_for_hqq)
-    hqq_output_2 = hqq_linear(input_for_hqq)
-    float_qdq_diff = 0.1  # hard code it first
-    torch.allclose(float_output, hqq_output, atol=float_qdq_diff)
-    torch.allclose(hqq_output, hqq_output_2)
-    del float_linear, hqq_linear
-    del float_output, hqq_output, hqq_output_2
-
-
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires a GPU")
-class TestHQQCUDA:
-    @classmethod
-    def setup_class(cls):
-        torch.manual_seed(0)
-        torch.cuda.manual_seed(0)
-        hqq_global_option.use_half = True
-
-    def test_hqq_quant(self):
-        from neural_compressor.torch.quantization import convert, get_default_hqq_config, prepare, quantize
-
-        fp32_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
-        example_inputs = torch.tensor(
-            [[10, 20, 30, 40, 50, 60]], dtype=torch.long, device=auto_detect_accelerator().current_device()
-        )
-        # test_default_config
-        quant_config = get_default_hqq_config()
-
-        # prepare + convert API
-        model = prepare(deepcopy(fp32_model), quant_config)
-        model = convert(model)
-        q_label_1 = model(example_inputs)[0]
-
-        # quantize API
-        model = quantize(deepcopy(fp32_model), quant_config)
-        q_label_2 = model(example_inputs)[0]
-
-        # compare the results of calling `convert` + `prepare` and calling `quantize`
-        assert torch.all(
-            q_label_1.eq(q_label_2)
-        ), "The results of calling `convert` + `prepare` and calling `quantize` should be equal."
-
-    @pytest.mark.parametrize("device_name", ["cuda", "cpu"])
-    @pytest.mark.parametrize(
-        "nbits, group_size, quant_zero, quant_scale, scale_quant_group_size",
-        [
-            (4, 64, True, False, 128),
-            (4, 64, False, False, 128),
-            (4, 64, True, True, 128),
-            (4, 64, False, True, 128),
-            (8, 64, True, False, 128),
-            (8, 64, False, False, 128),
-            (8, 64, True, True, 128),
-            (8, 64, False, True, 128),
-            (4, 64, True, False, 64),
-            (4, 64, False, False, 64),
-            (4, 64, True, True, 64),
-            (4, 64, False, True, 64),
-        ],
-    )
-    def test_hqq_module_cuda(
-        self,
-        device_name,
-        nbits,
-        group_size,
-        quant_zero,
-        quant_scale,
-        scale_quant_group_size,
-    ):
-        if device_name == "cuda" and not torch.cuda.is_available():
-            pytest.skip("Skipping CUDA test because cuda is not available")
-        _common_hqq_test(
-            nbits=nbits,
-            group_size=group_size,
-            quant_zero=quant_zero,
-            quant_scale=quant_scale,
-            scale_quant_group_size=scale_quant_group_size,
-            device=torch.device(device_name),
-        )

From c9db050eedc5ac37e80b0faf725a9b354e1f373c Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Fri, 28 Jun 2024 18:00:02 +0800
Subject: [PATCH 3/3] correct name

Signed-off-by: yiliu30
---
 .../3x/torch/quantization/weight_only/{tes_hqq.py => test_hqq.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename test/3x/torch/quantization/weight_only/{tes_hqq.py => test_hqq.py} (100%)

diff --git a/test/3x/torch/quantization/weight_only/tes_hqq.py b/test/3x/torch/quantization/weight_only/test_hqq.py
similarity index 100%
rename from test/3x/torch/quantization/weight_only/tes_hqq.py
rename to test/3x/torch/quantization/weight_only/test_hqq.py
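
For reference, a minimal runnable sketch of the device-parametrization pattern this series converges on: one shared helper taking a device argument, plus a "device_name" parametrize that skips the CUDA case when no GPU is present. The helper and test names below are illustrative placeholders, not code from the patch.

import pytest
import torch


def _run_linear_on_device(device=None):
    # Build a small float linear layer and run one forward pass on `device`.
    linear = torch.nn.Linear(in_features=64, out_features=128).to(device)
    x = torch.randn(4, 64, device=device)
    assert linear(x).shape == (4, 128)


@pytest.mark.parametrize("device_name", ["cuda", "cpu"])
def test_linear_on_device(device_name):
    # The skip mirrors the guard added in the patch so CPU-only runners pass.
    if device_name == "cuda" and not torch.cuda.is_available():
        pytest.skip("Skipping CUDA test because cuda is not available")
    _run_linear_on_device(device=torch.device(device_name))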