diff --git a/test/3x/torch/quantization/weight_only/hqq/test_hqq_cuda.py b/test/3x/torch/quantization/weight_only/hqq/test_hqq_cuda.py
deleted file mode 100644
index 777daf0e60b..00000000000
--- a/test/3x/torch/quantization/weight_only/hqq/test_hqq_cuda.py
+++ /dev/null
@@ -1,130 +0,0 @@
-from copy import deepcopy
-
-import pytest
-import torch
-from transformers import AutoModelForCausalLM
-
-from neural_compressor.torch.algorithms.weight_only.hqq.config import HQQModuleConfig, QTensorConfig, hqq_global_option
-from neural_compressor.torch.algorithms.weight_only.hqq.core import HQQLinear
-from neural_compressor.torch.algorithms.weight_only.hqq.utility import see_cuda_memory_usage
-from neural_compressor.torch.utils.auto_accelerator import auto_detect_accelerator
-
-
-def _common_cuda_test(nbits=4, group_size=64, quant_zero=True, quant_scale=False, scale_quant_group_size=128):
-    # Parse config
-    weight_qconfig = QTensorConfig(
-        nbits=nbits, channel_wise=True, group_size=group_size, optimize=True, round_zero=True if nbits == 4 else False
-    )
-    zero_qconfig = None
-    if quant_zero:
-        zero_qconfig = QTensorConfig(nbits=8, channel_wise=False, group_size=None, optimize=False)
-    scale_qconfig = None
-    if quant_scale:
-        scale_qconfig = QTensorConfig(nbits=8, channel_wise=True, group_size=scale_quant_group_size, optimize=False)
-    hqq_quant_config = HQQModuleConfig(weight=weight_qconfig, scale=scale_qconfig, zero=zero_qconfig)
-    device = torch.cuda.current_device()
-
-    # Create HQQ Linear
-    bs = 4
-    in_features = 64
-    out_features = 128
-    see_cuda_memory_usage(message="Before create float linear")
-    float_linear = torch.nn.Linear(in_features=in_features, out_features=out_features)
-    if hqq_global_option.use_half:
-        float_linear = float_linear.half()
-    see_cuda_memory_usage(message="After create float linear")
-    float_linear.to(device)
-    float_linear_copy = deepcopy(float_linear)
-    see_cuda_memory_usage(message="After copy the float linear")
-    hqq_linear = HQQLinear.from_float(float_linear_copy, quant_config=hqq_quant_config)
-    see_cuda_memory_usage(message="After create hqq linear")
-
-    # Forward
-    input = torch.randn(bs, in_features, device=device)
-    if hqq_global_option.use_half:
-        input = input.half()
-    float_output = float_linear(input)
-    input_for_hqq = deepcopy(input)
-    hqq_output = hqq_linear(input_for_hqq)
-    hqq_output_2 = hqq_linear(input_for_hqq)
-    float_qdq_diff = 0.1  # hard code it first
-    torch.allclose(float_output, hqq_output, atol=float_qdq_diff)
-    torch.allclose(hqq_output, hqq_output_2)
-    del float_linear, hqq_linear
-    del float_output, hqq_output, hqq_output_2
-    see_cuda_memory_usage("At the end of test")
-
-
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires a GPU")
-class TestHQQCUDA:
-    @classmethod
-    def setup_class(cls):
-        torch.manual_seed(0)
-        torch.cuda.manual_seed(0)
-        hqq_global_option.use_half = True
-
-    def test_hqq_quant(self):
-        from neural_compressor.torch.quantization import convert, get_default_hqq_config, prepare, quantize
-
-        fp32_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
-        example_inputs = torch.tensor(
-            [[10, 20, 30, 40, 50, 60]], dtype=torch.long, device=auto_detect_accelerator().current_device()
-        )
-        # test_default_config
-        quant_config = get_default_hqq_config()
-
-        # prepare + convert API
-        model = prepare(deepcopy(fp32_model), quant_config)
-        model = convert(model)
-        q_label_1 = model(example_inputs)[0]
-
-        # quantize API
-        model = quantize(deepcopy(fp32_model), quant_config)
-        q_label_2 = model(example_inputs)[0]
-
-        # compare the results of calling `convert` + `prepare` and calling `quantize`
-        assert torch.all(
-            q_label_1.eq(q_label_2)
-        ), "The results of calling `convert` + `prepare` and calling `quantize` should be equal."
-
-    @pytest.mark.parametrize(
-        "nbits, group_size, quant_zero, quant_scale, scale_quant_group_size",
-        [
-            (4, 64, True, False, 128),
-            (4, 64, False, False, 128),
-            (4, 64, True, True, 128),
-            (4, 64, False, True, 128),
-            (8, 64, True, False, 128),
-            (8, 64, False, False, 128),
-            (8, 64, True, True, 128),
-            (8, 64, False, True, 128),
-            (4, 64, True, False, 64),
-            (4, 64, False, False, 64),
-            (4, 64, True, True, 64),
-            (4, 64, False, True, 64),
-        ],
-    )
-    def test_hqq_module_cuda(
-        self,
-        nbits,
-        group_size,
-        quant_zero,
-        quant_scale,
-        scale_quant_group_size,
-    ):
-        _common_cuda_test(
-            nbits=nbits,
-            group_size=group_size,
-            quant_zero=quant_zero,
-            quant_scale=quant_scale,
-            scale_quant_group_size=scale_quant_group_size,
-        )
-
-
-# _common_cuda_test(
-#     nbits=4,
-#     group_size=64,
-#     quant_zero=False,
-#     quant_scale=False,
-#     scale_quant_group_size=128
-# )
diff --git a/test/3x/torch/quantization/weight_only/hqq/test_hqq_cpu.py b/test/3x/torch/quantization/weight_only/test_hqq.py
similarity index 88%
rename from test/3x/torch/quantization/weight_only/hqq/test_hqq_cpu.py
rename to test/3x/torch/quantization/weight_only/test_hqq.py
index 9a0290ffe29..1d68a553859 100644
--- a/test/3x/torch/quantization/weight_only/hqq/test_hqq_cpu.py
+++ b/test/3x/torch/quantization/weight_only/test_hqq.py
@@ -6,6 +6,7 @@
 import transformers
 from transformers import AutoModelForCausalLM
 
+from neural_compressor.common.utils import logger
 from neural_compressor.torch.algorithms.weight_only.hqq.config import HQQModuleConfig, QTensorConfig, hqq_global_option
 from neural_compressor.torch.algorithms.weight_only.hqq.core import HQQLinear
 from neural_compressor.torch.quantization import HQQConfig, convert, get_default_hqq_config, prepare, quantize
@@ -14,7 +15,9 @@
 device = accelerator.current_device_name()
 
 
-def _common_cpu_test(nbits=4, group_size=64, quant_zero=True, quant_scale=False, scale_quant_group_size=128):
+def _common_hqq_test(
+    nbits=4, group_size=64, quant_zero=True, quant_scale=False, scale_quant_group_size=128, device=None
+):
     # Parse config
     weight_qconfig = QTensorConfig(
         nbits=nbits, channel_wise=True, group_size=group_size, optimize=True, round_zero=True if nbits == 4 else False
@@ -26,7 +29,6 @@ def _common_cpu_test(nbits=4, group_size=64, quant_zero=True, quant_scale=False,
     if quant_scale:
         scale_qconfig = QTensorConfig(nbits=8, channel_wise=True, group_size=scale_quant_group_size, optimize=False)
     hqq_quant_config = HQQModuleConfig(weight=weight_qconfig, scale=scale_qconfig, zero=zero_qconfig)
-    device = "cpu"
 
     # Create HQQ Linear
     bs = 4
@@ -34,7 +36,7 @@ def _common_cpu_test(nbits=4, group_size=64, quant_zero=True, quant_scale=False,
     out_features = 128
     float_linear = torch.nn.Linear(in_features=in_features, out_features=out_features)
     if hqq_global_option.use_half:
-        print(f"hqq_global_option use half: {hqq_global_option.use_half}")
+        logger.info(f"hqq_global_option use half: {hqq_global_option.use_half}")
         float_linear = float_linear.half()
     float_linear.to(device)
     float_linear_copy = deepcopy(float_linear)
@@ -54,7 +56,7 @@ def _common_cpu_test(nbits=4, group_size=64, quant_zero=True, quant_scale=False,
     del float_output, hqq_output, hqq_output_2
 
 
-class TestHQQCPU:
+class TestHQQ:
 
     @classmethod
     def setup_class(cls):
@@ -137,6 +139,7 @@ def test_quant_lm_head(self, force_use_cpu, force_not_half):
             id(model.model.decoder.embed_tokens.weight) == lm_head_id
         ), "The tied lm_head weight is not deep copied, please check!"
 
+    @pytest.mark.parametrize("device_name", ["cuda", "cpu"])
     @pytest.mark.parametrize(
         "nbits, group_size, quant_zero, quant_scale, scale_quant_group_size",
         [
@@ -155,13 +158,26 @@ def test_quant_lm_head(self, force_use_cpu, force_not_half):
             (4, -1, False, True, 64),
         ],
     )
-    def test_hqq_module_cpu(
-        self, force_use_cpu, force_not_half, nbits, group_size, quant_zero, quant_scale, scale_quant_group_size
+    def test_hqq_module(
+        self,
+        nbits,
+        group_size,
+        quant_zero,
+        quant_scale,
+        scale_quant_group_size,
+        device_name,
     ):
-        _common_cpu_test(
+        if device_name == "cuda" and not torch.cuda.is_available():
+            pytest.skip("Skipping CUDA test because cuda is not available")
+        if device_name == "cpu":
+            os.environ["FORCE_DEVICE"] = "cpu"
+            hqq_global_option.use_half = False
+
+        _common_hqq_test(
             nbits=nbits,
             group_size=group_size,
             quant_zero=quant_zero,
             quant_scale=quant_scale,
             scale_quant_group_size=scale_quant_group_size,
+            device=torch.device(device_name),
         )