From 209dbf750adc26f14021c07a9c41df6b0f242dd8 Mon Sep 17 00:00:00 2001
From: Marc Sun
Date: Fri, 16 Feb 2024 23:29:23 +0100
Subject: [PATCH 1/3] overwrite to for QTensor and QBitsTensor

---
 quanto/tensor/core.py       |  8 ++++++++
 test/nn/test_qlinear.py     | 23 ++++++++++++++++++++++-
 test/tensor/test_qtensor.py |  7 +++++++
 3 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/quanto/tensor/core.py b/quanto/tensor/core.py
index 1a12d27e..23d6187e 100644
--- a/quanto/tensor/core.py
+++ b/quanto/tensor/core.py
@@ -313,6 +313,10 @@ def __torch_dispatch__(cls, op, types, args, kwargs=None):
     def numpy(self):
         return self.dequantize().cpu().numpy()
 
+    def to(self, *args, **kwargs):
+        self._data = self._data.to(*args, **kwargs)
+        self._scale = self._scale.to(*args, **kwargs)
+        return self
 
 class AffineQuantizer(Function):
     """A standard affine quantizer."""
@@ -424,3 +428,7 @@ def __torch_dispatch__(cls, op, types, args, kwargs=None):
                 return QBitsTensor(data, scale, zeropoint)
         args, kwargs = pytree.tree_map_only(QBitsTensor, lambda x: x.qtensor(), (args, kwargs or {}))
         return op(*args, **kwargs)
+
+    def to(self, *args, **kwargs):
+        self._zeropoint = self._zeropoint.to(*args, **kwargs)
+        return super().to(*args, **kwargs)
diff --git a/test/nn/test_qlinear.py b/test/nn/test_qlinear.py
index de96d994..1cab18d3 100644
--- a/test/nn/test_qlinear.py
+++ b/test/nn/test_qlinear.py
@@ -2,7 +2,7 @@
 import torch
 
 from helpers import assert_similar, random_qtensor
-from quanto import Calibration, QTensor, int4
+from quanto import Calibration, QBitsTensor, QTensor, int4
 from quanto.nn import QLinear
 
 
@@ -26,6 +26,27 @@ def _test_quantize_linear(batch_size, tokens, embeddings, use_bias, weights, act
     atol = {None: dtype_atol, torch.int8: dtype_atol, torch.float8_e5m2: 5e-3, torch.float8_e4m3fn: 5e-3}[activations]
     assert_similar(out, qout, atol=atol)
 
+@pytest.mark.parametrize("use_bias", [True, False], ids=["bias", "no-bias"])
+@pytest.mark.parametrize("weights", [int4, torch.int8], ids=["w-int4", "w-int8"])
+@pytest.mark.parametrize(
+    "activations",
+    [None, torch.float8_e5m2, torch.float8_e4m3fn],
+    ids=["None","a-float8-e5m2", "a-float8-e4m3"],
+)
+def test_move_qlinear(use_bias ,weights, activations, device):
+    linear = torch.nn.Linear(32, 32, bias=use_bias)
+    qlinear = QLinear.from_module(linear, weights=weights, activations=activations)
+    # QAT optional for weight only quantization
+    qinputs = random_qtensor((1,32, 32))
+    with torch.no_grad(), Calibration():
+        qlinear(qinputs)
+    qlinear.freeze()
+    qlinear.to(device)
+    if isinstance(qlinear.weight,QTensor):
+        assert qlinear.weight._data.device.type == device.type
+        assert qlinear.weight._scale.device.type == device.type
+    if isinstance(qlinear.weight,QBitsTensor):
+        assert qlinear.weight._zeropoint.device.type == device.type
 
 @pytest.mark.parametrize("batch_size", [1, 10])
 @pytest.mark.parametrize("tokens, embeddings", [(32, 32), (10, 32)])
diff --git a/test/tensor/test_qtensor.py b/test/tensor/test_qtensor.py
index e40fbcda..800caa1f 100644
--- a/test/tensor/test_qtensor.py
+++ b/test/tensor/test_qtensor.py
@@ -8,6 +8,13 @@
 from quanto import QTensor, absmax_scale
 
 
+def test_qtensor_move(device):
+    input_shape = (2, 4, 8)
+    qa = random_qtensor(input_shape, dtype=torch.float32)
+    qa = qa.to(device)
+    assert qa._data.device.type == device.type
+    assert qa._scale.device.type == device.type
+
 @pytest.mark.parametrize("input_shape", [(10,), (1, 10), (10, 32, 32)])
 @pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=["fp16", "fp32"])
 @pytest.mark.parametrize("itype", [torch.int8], ids=["int8"])
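A note on the semantics introduced by this first patch: unlike the standard torch.Tensor.to(), which returns a new tensor, the override above mutates the QTensor's inner _data and _scale in place and returns self. A minimal sketch of the observable behavior, assuming a CUDA device is available (random_qtensor is the helper from this repository's test suite):

    import torch

    from helpers import random_qtensor  # test helper used in this PR's tests

    qa = random_qtensor((2, 4, 8), dtype=torch.float32)
    qb = qa.to("cuda")  # assumes a CUDA device is available

    # to() returns self, so both names refer to the same QTensor,
    # and the inner tensors of the original object have moved as well.
    assert qb is qa
    assert qa._data.device.type == "cuda"
    assert qa._scale.device.type == "cuda"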
From 33d47986cb3d3161ecb19bb8e31d66d62dba3157 Mon Sep 17 00:00:00 2001
From: Marc Sun
Date: Fri, 16 Feb 2024 23:47:52 +0100
Subject: [PATCH 2/3] fix conflicts

---
 test/nn/test_qlinear.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/nn/test_qlinear.py b/test/nn/test_qlinear.py
index e62916a0..dc60cc86 100644
--- a/test/nn/test_qlinear.py
+++ b/test/nn/test_qlinear.py
@@ -27,10 +27,10 @@ def _test_quantize_linear(batch_size, tokens, embeddings, use_bias, weights, act
     assert_similar(out, qout, atol=atol)
 
 @pytest.mark.parametrize("use_bias", [True, False], ids=["bias", "no-bias"])
-@pytest.mark.parametrize("weights", [qint4, torch.int8], ids=["w-int4", "w-int8"])
+@pytest.mark.parametrize("weights", [qint4, qint8], ids=["w-int4", "w-int8"])
 @pytest.mark.parametrize(
     "activations",
-    [None, torch.float8_e5m2, torch.float8_e4m3fn],
+    [None, qfloat8_e5m2, qfloat8_e4m3fn],
     ids=["None","a-float8-e5m2", "a-float8-e4m3"],
 )
 def test_move_qlinear(use_bias ,weights, activations, device):
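For context on this conflict fix: the parametrization now uses quanto's qtype descriptors (qint4, qint8, qfloat8_e5m2, qfloat8_e4m3fn) instead of raw torch dtypes, matching a rename that landed on the base branch in the meantime. The module-level import is presumed to have been updated in the same rebase; the hunk does not show it, but it would read along these lines:

    # Presumed import after the rebase; the actual import line is outside this hunk.
    from quanto import Calibration, QBitsTensor, QTensor, qfloat8_e4m3fn, qfloat8_e5m2, qint4, qint8
    from quanto.nn import QLinear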
From 971d97c3e0cf2d42d81c2961d5d55e532fde0a4c Mon Sep 17 00:00:00 2001
From: Marc Sun
Date: Fri, 16 Feb 2024 23:57:10 +0100
Subject: [PATCH 3/3] style

---
 quanto/tensor/core.py       |  1 +
 test/nn/test_qlinear.py     | 12 +++++++-----
 test/tensor/test_qtensor.py |  1 +
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/quanto/tensor/core.py b/quanto/tensor/core.py
index 6bb95714..da99afc6 100644
--- a/quanto/tensor/core.py
+++ b/quanto/tensor/core.py
@@ -305,6 +305,7 @@ def to(self, *args, **kwargs):
         self._scale = self._scale.to(*args, **kwargs)
         return self
 
+
 class AffineQuantizer(Function):
     """A standard affine quantizer."""
 
diff --git a/test/nn/test_qlinear.py b/test/nn/test_qlinear.py
index dc60cc86..99ddd0ef 100644
--- a/test/nn/test_qlinear.py
+++ b/test/nn/test_qlinear.py
@@ -26,28 +26,30 @@ def _test_quantize_linear(batch_size, tokens, embeddings, use_bias, weights, act
     atol = {None: dtype_atol, qint8: dtype_atol, qfloat8_e5m2: 5e-3, qfloat8_e4m3fn: 5e-3}[activations]
     assert_similar(out, qout, atol=atol)
 
+
 @pytest.mark.parametrize("use_bias", [True, False], ids=["bias", "no-bias"])
 @pytest.mark.parametrize("weights", [qint4, qint8], ids=["w-int4", "w-int8"])
 @pytest.mark.parametrize(
     "activations",
     [None, qfloat8_e5m2, qfloat8_e4m3fn],
-    ids=["None","a-float8-e5m2", "a-float8-e4m3"],
+    ids=["None", "a-float8-e5m2", "a-float8-e4m3"],
 )
-def test_move_qlinear(use_bias ,weights, activations, device):
+def test_move_qlinear(use_bias, weights, activations, device):
     linear = torch.nn.Linear(32, 32, bias=use_bias)
     qlinear = QLinear.from_module(linear, weights=weights, activations=activations)
     # QAT optional for weight only quantization
-    qinputs = random_qtensor((1,32, 32))
+    qinputs = random_qtensor((1, 32, 32))
     with torch.no_grad(), Calibration():
         qlinear(qinputs)
     qlinear.freeze()
     qlinear.to(device)
-    if isinstance(qlinear.weight,QTensor):
+    if isinstance(qlinear.weight, QTensor):
         assert qlinear.weight._data.device.type == device.type
         assert qlinear.weight._scale.device.type == device.type
-    if isinstance(qlinear.weight,QBitsTensor):
+    if isinstance(qlinear.weight, QBitsTensor):
         assert qlinear.weight._zeropoint.device.type == device.type
+
 
 @pytest.mark.parametrize("batch_size", [1, 10])
 @pytest.mark.parametrize("tokens, embeddings", [(32, 32), (10, 32)])
diff --git a/test/tensor/test_qtensor.py b/test/tensor/test_qtensor.py
index f44a47e1..84e5fe2f 100644
--- a/test/tensor/test_qtensor.py
+++ b/test/tensor/test_qtensor.py
@@ -15,6 +15,7 @@ def test_qtensor_move(device):
     assert qa._data.device.type == device.type
     assert qa._scale.device.type == device.type
 
+
 @pytest.mark.parametrize("input_shape", [(10,), (1, 10), (10, 32, 32)])
 @pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=["fp16", "fp32"])
 @pytest.mark.parametrize("qtype", [qint8], ids=["qint8"])
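Taken together, the two overrides compose through the class hierarchy: QBitsTensor.to() moves _zeropoint and then delegates to QTensor.to() via super(), which moves _data and _scale. A sketch of the end-to-end behavior for int4 weights, mirroring test_move_qlinear and assuming a CUDA device is available:

    import torch

    from helpers import random_qtensor  # test helper from this repository
    from quanto import Calibration, qint4
    from quanto.nn import QLinear

    qlinear = QLinear.from_module(torch.nn.Linear(32, 32), weights=qint4)
    with torch.no_grad(), Calibration():
        qlinear(random_qtensor((1, 32, 32)))
    qlinear.freeze()  # int4 weights freeze to a QBitsTensor

    # QBitsTensor.to() moves _zeropoint, then super().to() moves _data and _scale.
    qlinear.to("cuda")
    for inner in ("_data", "_scale", "_zeropoint"):
        assert getattr(qlinear.weight, inner).device.type == "cuda"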