From 02abc9b8c9e16fe1cd251babc9925aee2bf494e5 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Tue, 2 Jul 2024 20:11:05 +0000 Subject: [PATCH 01/15] actorder --- .../quantization/gptq/utils/gptq_wrapper.py | 45 ++++++++++++++++++- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py index db2afc64a..460d4457a 100644 --- a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py +++ b/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py @@ -109,6 +109,9 @@ def compress( self.H = torch.linalg.cholesky(self.H, upper=True) Hinv = self.H + actorder = False + invperm = False + # See section 3.4 of https://arxiv.org/abs/2203.07259 for i1 in range(0, self.columns, blocksize): i2 = min(i1 + blocksize, self.columns) @@ -140,14 +143,38 @@ def compress( q = torch.dequantize(q) elif hasattr(self.layer, "quantization_scheme"): quant_scheme = self.layer.quantization_scheme + actorder = quant_scheme.weights.actorder + if quant_scheme.weights is not None: - scale = self.layer.weight_scale - zero_point = self.layer.weight_zero_point from compressed_tensors.quantization import QuantizationStrategy from compressed_tensors.quantization.lifecycle.forward import ( fake_quantize, ) + scale = self.layer.weight_scale + zero_point = self.layer.weight_zero_point + + if actorder: + perm = torch.argsort(torch.diag(self.H), descending=True) + W = W[:, perm] + self.H = self.H[perm][:, perm] + invperm = torch.argsort(perm) + + group_size = quant_scheme.weights.group_size + if group_size is None or group_size == -1: + group_size = self.layer.weight.shape[1] + + if actorder: + indices = torch.arange(self.columns, device=invperm.device) + g_idx = (perm[indices] // group_size).to(dtype=torch.int32) + g_idx = g_idx[invperm] + self.layer.weight_g_idx.data = g_idx + else: + indices = torch.arange( + self.columns, device=W.device, dtype=torch.int32 + ) + g_idx = indices // group_size + strategy = quant_scheme.weights.strategy if strategy == QuantizationStrategy.TENSOR: @@ -176,6 +203,17 @@ def compress( # ends up being a channelwise application altered_qargs = copy(quant_scheme.weights) altered_qargs.strategy = QuantizationStrategy.CHANNEL + + # apply g_idx + if g_idx is not None: + # scale and zp already transformed by group_size + # extract first index of group_idze + indices_to_extract = torch.arange( + 0, g_idx.shape[0], group_size + ) + scale = scale[:, g_idx[indices_to_extract]] + zero_point = zero_point[:, g_idx[indices_to_extract]] + q = fake_quantize( q, scale[:, input_dim_group], @@ -206,6 +244,9 @@ def compress( logger.info("time %.2f" % (time.time() - tick)) logger.info("error %.2f" % torch.sum(Losses).item()) + if actorder: + W = W[:, invperm] + if isinstance(self.layer, transformers.Conv1D): W = W.t() W = W.reshape(final_shape).to(final_dtype) From 3e7b87539bb7672b848252d202fa6033814ec22d Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Wed, 10 Jul 2024 17:27:10 +0000 Subject: [PATCH 02/15] g_idx fix --- .../quantization/gptq/utils/gptq_wrapper.py | 44 +++++++++---------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py index 460d4457a..447849946 100644 --- a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py +++ 
b/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py @@ -109,8 +109,23 @@ def compress( self.H = torch.linalg.cholesky(self.H, upper=True) Hinv = self.H - actorder = False - invperm = False + g_idx = None + if hasattr(self.layer, "quantization_scheme"): + quant_scheme = self.layer.quantization_scheme + actorder = quant_scheme.weights.actorder + group_size = quant_scheme.weights.group_size + + if actorder: + perm = torch.argsort(torch.diag(self.H), descending=True) + W = W[:, perm] + self.H = self.H[perm][:, perm] + invperm = torch.argsort(perm) + + g_idx = torch.Tensor([i // group_size for i in range(self.columns)]).to( + device=invperm.device + ) + g_idx = g_idx[invperm] + self.layer.weight_g_idx.data = g_idx # See section 3.4 of https://arxiv.org/abs/2203.07259 for i1 in range(0, self.columns, blocksize): @@ -143,7 +158,6 @@ def compress( q = torch.dequantize(q) elif hasattr(self.layer, "quantization_scheme"): quant_scheme = self.layer.quantization_scheme - actorder = quant_scheme.weights.actorder if quant_scheme.weights is not None: from compressed_tensors.quantization import QuantizationStrategy @@ -154,27 +168,10 @@ def compress( scale = self.layer.weight_scale zero_point = self.layer.weight_zero_point - if actorder: - perm = torch.argsort(torch.diag(self.H), descending=True) - W = W[:, perm] - self.H = self.H[perm][:, perm] - invperm = torch.argsort(perm) - group_size = quant_scheme.weights.group_size if group_size is None or group_size == -1: group_size = self.layer.weight.shape[1] - if actorder: - indices = torch.arange(self.columns, device=invperm.device) - g_idx = (perm[indices] // group_size).to(dtype=torch.int32) - g_idx = g_idx[invperm] - self.layer.weight_g_idx.data = g_idx - else: - indices = torch.arange( - self.columns, device=W.device, dtype=torch.int32 - ) - g_idx = indices // group_size - strategy = quant_scheme.weights.strategy if strategy == QuantizationStrategy.TENSOR: @@ -207,12 +204,13 @@ def compress( # apply g_idx if g_idx is not None: # scale and zp already transformed by group_size - # extract first index of group_idze + # extract first index of group_size indices_to_extract = torch.arange( 0, g_idx.shape[0], group_size ) - scale = scale[:, g_idx[indices_to_extract]] - zero_point = zero_point[:, g_idx[indices_to_extract]] + grouped_indicies = g_idx[indices_to_extract].int() + scale = scale[:, grouped_indicies] + zero_point = zero_point[:, grouped_indicies] q = fake_quantize( q, From 778b5b51eafce34770a7f52546979e93ccce73ae Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Wed, 10 Jul 2024 20:29:57 +0000 Subject: [PATCH 03/15] fix --- .../quantization/gptq/utils/gptq_wrapper.py | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py index 447849946..65f13be8d 100644 --- a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py +++ b/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py @@ -98,24 +98,14 @@ def compress( dead = torch.diag(self.H) == 0 self.H[dead, dead] = 1 W[:, dead] = 0 - - Losses = torch.zeros(self.rows, device=self.dev) - - damp = percdamp * torch.mean(torch.diag(self.H)) - diag = torch.arange(self.columns, device=self.dev) - self.H[diag, diag] += damp - self.H = torch.linalg.cholesky(self.H) - self.H = torch.cholesky_inverse(self.H) - self.H = torch.linalg.cholesky(self.H, upper=True) - Hinv = self.H - + g_idx = None if 
hasattr(self.layer, "quantization_scheme"): quant_scheme = self.layer.quantization_scheme actorder = quant_scheme.weights.actorder - group_size = quant_scheme.weights.group_size - + if actorder: + group_size = quant_scheme.weights.group_size perm = torch.argsort(torch.diag(self.H), descending=True) W = W[:, perm] self.H = self.H[perm][:, perm] @@ -126,6 +116,16 @@ def compress( ) g_idx = g_idx[invperm] self.layer.weight_g_idx.data = g_idx + + Losses = torch.zeros(self.rows, device=self.dev) + + damp = percdamp * torch.mean(torch.diag(self.H)) + diag = torch.arange(self.columns, device=self.dev) + self.H[diag, diag] += damp + self.H = torch.linalg.cholesky(self.H) + self.H = torch.cholesky_inverse(self.H) + self.H = torch.linalg.cholesky(self.H, upper=True) + Hinv = self.H # See section 3.4 of https://arxiv.org/abs/2203.07259 for i1 in range(0, self.columns, blocksize): @@ -209,6 +209,7 @@ def compress( 0, g_idx.shape[0], group_size ) grouped_indicies = g_idx[indices_to_extract].int() + scale = scale[:, grouped_indicies] zero_point = zero_point[:, grouped_indicies] From bc08e8d1d74d479bb6e545797e01b2db5890dfbd Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Wed, 10 Jul 2024 21:02:25 +0000 Subject: [PATCH 04/15] lint --- .../modifiers/quantization/gptq/utils/gptq_wrapper.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py index 65f13be8d..f7bea26d8 100644 --- a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py +++ b/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py @@ -98,12 +98,12 @@ def compress( dead = torch.diag(self.H) == 0 self.H[dead, dead] = 1 W[:, dead] = 0 - + g_idx = None if hasattr(self.layer, "quantization_scheme"): quant_scheme = self.layer.quantization_scheme actorder = quant_scheme.weights.actorder - + if actorder: group_size = quant_scheme.weights.group_size perm = torch.argsort(torch.diag(self.H), descending=True) @@ -116,7 +116,7 @@ def compress( ) g_idx = g_idx[invperm] self.layer.weight_g_idx.data = g_idx - + Losses = torch.zeros(self.rows, device=self.dev) damp = percdamp * torch.mean(torch.diag(self.H)) @@ -124,7 +124,7 @@ def compress( self.H[diag, diag] += damp self.H = torch.linalg.cholesky(self.H) self.H = torch.cholesky_inverse(self.H) - self.H = torch.linalg.cholesky(self.H, upper=True) + self.H = torch.linalg.cholesky(self.H, upper=True) Hinv = self.H # See section 3.4 of https://arxiv.org/abs/2203.07259 From f203537e4dd7d1d5a507a1489b4c3ea06cf416fc Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Thu, 11 Jul 2024 20:33:20 +0000 Subject: [PATCH 05/15] propagagte g_idx with perm --- .../modifiers/quantization/gptq/utils/gptq_wrapper.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py index f7bea26d8..c47969893 100644 --- a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py +++ b/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py @@ -111,10 +111,9 @@ def compress( self.H = self.H[perm][:, perm] invperm = torch.argsort(perm) - g_idx = torch.Tensor([i // group_size for i in range(self.columns)]).to( - device=invperm.device - ) - g_idx = g_idx[invperm] + g_idx = torch.Tensor( + [perm[i] // group_size for i in range(self.columns)] + ).to(device=invperm.device) 
self.layer.weight_g_idx.data = g_idx Losses = torch.zeros(self.rows, device=self.dev) From 9967a4a6256beb2236398cbde727f252a3ef4f8a Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Fri, 12 Jul 2024 18:20:21 +0000 Subject: [PATCH 06/15] scratch --- .../quantization/gptq/utils/gptq_wrapper.py | 54 +++++++++++++------ 1 file changed, 37 insertions(+), 17 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py index c47969893..ebd4f2c97 100644 --- a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py +++ b/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py @@ -100,6 +100,24 @@ def compress( W[:, dead] = 0 g_idx = None + # if hasattr(self.layer, "quantization_scheme"): + # quant_scheme = self.layer.quantization_scheme + # actorder = quant_scheme.weights.actorder + + # if actorder: + # group_size = quant_scheme.weights.group_size + # perm = torch.argsort(torch.diag(self.H), descending=True) + # # W = W[:, perm] + # # self.H = self.H[perm][:, perm] + # invperm = torch.argsort(perm) + + # # g_idx = torch.Tensor( + # # [perm[i] // group_size for i in range(self.columns)] + # # ).to(device=invperm.device) + # g_idx = torch.Tensor( + # [i // group_size for i in range(self.columns)] + # ).to(device=invperm.device) + # self.layer.weight_g_idx.data = g_idx if hasattr(self.layer, "quantization_scheme"): quant_scheme = self.layer.quantization_scheme actorder = quant_scheme.weights.actorder @@ -114,8 +132,11 @@ def compress( g_idx = torch.Tensor( [perm[i] // group_size for i in range(self.columns)] ).to(device=invperm.device) + # g_idx = torch.Tensor( + # [i // group_size for i in range(self.columns)] + # ).to(device=invperm.device) self.layer.weight_g_idx.data = g_idx - + Losses = torch.zeros(self.rows, device=self.dev) damp = percdamp * torch.mean(torch.diag(self.H)) @@ -200,24 +221,22 @@ def compress( altered_qargs = copy(quant_scheme.weights) altered_qargs.strategy = QuantizationStrategy.CHANNEL - # apply g_idx if g_idx is not None: - # scale and zp already transformed by group_size - # extract first index of group_size - indices_to_extract = torch.arange( - 0, g_idx.shape[0], group_size + q = fake_quantize( + q, + scale[:, int(g_idx[column_idx])], + zero_point[:, int(g_idx[column_idx])], + altered_qargs, + ) + + else: + + q = fake_quantize( + q, + scale[:, input_dim_group], + zero_point[:, input_dim_group], + altered_qargs, ) - grouped_indicies = g_idx[indices_to_extract].int() - - scale = scale[:, grouped_indicies] - zero_point = zero_point[:, grouped_indicies] - - q = fake_quantize( - q, - scale[:, input_dim_group], - zero_point[:, input_dim_group], - altered_qargs, - ) Q1[:, i] = q Losses1[:, i] = (w - q) ** 2 / d**2 @@ -244,6 +263,7 @@ def compress( if actorder: W = W[:, invperm] + self.H = self.H[perm][:, perm] if isinstance(self.layer, transformers.Conv1D): W = W.t() From 52ef6be284b59482e38718d518eacb62ad6926e6 Mon Sep 17 00:00:00 2001 From: Benjamin Date: Thu, 18 Jul 2024 12:07:01 -0400 Subject: [PATCH 07/15] GPTQ - move calibration of quantiztion params to after hessian calibration --- .../modifiers/quantization/gptq/base.py | 20 +++++++++++++++++-- .../quantization/gptq/utils/gptq_wrapper.py | 8 ++++++++ .../quantization/quantization/base.py | 7 +++++-- 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/gptq/base.py b/src/llmcompressor/modifiers/quantization/gptq/base.py index 2ccfb114a..d9d7959db 
100644 --- a/src/llmcompressor/modifiers/quantization/gptq/base.py +++ b/src/llmcompressor/modifiers/quantization/gptq/base.py @@ -1,7 +1,12 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union import torch -from compressed_tensors.quantization import QuantizationScheme +from compressed_tensors.quantization import ( + QuantizationScheme, + disable_quantization, + enable_quantization, + freeze_module_quantization, +) from loguru import logger from pydantic import Field from torch.nn import Module @@ -163,7 +168,9 @@ def on_initialize(self, state: "State", **kwargs) -> bool: if not self.initialized_structure_: self.on_initialize_structure(state, **kwargs) if self.quantization_modifier_: - self.quantization_modifier_.initialize(state, **kwargs) + self.quantization_modifier_.initialize( + state, freeze_quantization=False, **kwargs + ) if not self.quantize: raise ValueError("To use the GPTQModifier, quantization must be enabled.") @@ -178,6 +185,7 @@ def on_initialize(self, state: "State", **kwargs) -> bool: self.initialize_compression(modifiable_model, calibration_dataloader) self.apply_compression(calibration_dataloader) + state.model.apply(freeze_module_quantization) return True @@ -250,6 +258,11 @@ def apply_compression( logger.info( f"Running {class_name} calibration with " f"{len(dataloader)} samples..." ) + + # quantization scales and zp are already initialized but we do not + # want to calibrate wrt to these + self.model.apply(disable_quantization) + if not self.sequential_update: # in non-sequential mode we run one forward batch for all modules run_calibration_forward(self.model, dataloader, mask_padding=True) @@ -271,6 +284,9 @@ def apply_compression( layer_compressor.revert_layer_wrappers() torch.cuda.empty_cache() + # re-enable quantization + self.model.apply(enable_quantization) + def _build_quant_modifier(self): """ Build a quantization modifier based on the specified config_groups, diff --git a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py index db2afc64a..5bc3f14f3 100644 --- a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py +++ b/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py @@ -141,6 +141,14 @@ def compress( elif hasattr(self.layer, "quantization_scheme"): quant_scheme = self.layer.quantization_scheme if quant_scheme.weights is not None: + # fetch latest correct scale and ZP relevant for any changes + # such as activation reordering + from compressed_tensors.quantization import ( + update_layer_weight_quant_params, + ) + + update_layer_weight_quant_params(self.layer) + scale = self.layer.weight_scale zero_point = self.layer.weight_zero_point from compressed_tensors.quantization import QuantizationStrategy diff --git a/src/llmcompressor/modifiers/quantization/quantization/base.py b/src/llmcompressor/modifiers/quantization/quantization/base.py index 434f6f2d8..b90ec250f 100644 --- a/src/llmcompressor/modifiers/quantization/quantization/base.py +++ b/src/llmcompressor/modifiers/quantization/quantization/base.py @@ -61,7 +61,9 @@ def on_initialize_structure(self, state: State, **kwargs): self._apply_modifier_to_model(module) module.apply(freeze_module_quantization) - def on_initialize(self, state: State, **kwargs) -> bool: + def on_initialize( + self, state: State, freeze_quantization: bool = True, **kwargs + ) -> bool: if self.end and self.end != -1: raise ValueError( "end_epoch is disabled for QuantizationModifier and can only 
be set to" @@ -80,7 +82,8 @@ def on_initialize(self, state: State, **kwargs) -> bool: self._check_token_distribution( module, threshold=kwargs.get("min_tokens_per_module") ) - module.apply(freeze_module_quantization) + if freeze_quantization: + module.apply(freeze_module_quantization) return True From ad7803ea2633adf2865f1a7ab9f1bc28656f2d4d Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Mon, 22 Jul 2024 15:33:31 +0000 Subject: [PATCH 08/15] no recompute --- .../quantization/gptq/utils/gptq_wrapper.py | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py index 66c59adbc..cc0bec980 100644 --- a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py +++ b/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py @@ -136,7 +136,7 @@ def compress( # [i // group_size for i in range(self.columns)] # ).to(device=invperm.device) self.layer.weight_g_idx.data = g_idx - + Losses = torch.zeros(self.rows, device=self.dev) damp = percdamp * torch.mean(torch.diag(self.H)) @@ -161,6 +161,16 @@ def compress( if preserve_zeros: W1_nz_mask = W_nz_mask[:, i1:i2] + if hasattr(self.layer, "quantization_scheme"): + quant_scheme = self.layer.quantization_scheme + if quant_scheme.weights is not None: + # such as activation reordering + from compressed_tensors.quantization import ( + update_layer_weight_quant_params, + ) + + update_layer_weight_quant_params(self.layer, g_idx) + for i in range(count): w = W1[:, i] d = Hinv1[i, i] @@ -181,11 +191,6 @@ def compress( if quant_scheme.weights is not None: # fetch latest correct scale and ZP relevant for any changes - # such as activation reordering - from compressed_tensors.quantization import ( - update_layer_weight_quant_params, - ) - update_layer_weight_quant_params(self.layer, g_idx) scale = self.layer.weight_scale zero_point = self.layer.weight_zero_point @@ -237,9 +242,8 @@ def compress( zero_point[:, int(g_idx[column_idx])], altered_qargs, ) - - else: + else: q = fake_quantize( q, scale[:, input_dim_group], @@ -266,7 +270,7 @@ def compress( W[:, i2:] -= w_err * W_nz_mask[:, i2:] else: W[:, i2:] -= w_err - + print("time %.2f" % (time.time() - tick)) logger.info("time %.2f" % (time.time() - tick)) logger.info("error %.2f" % torch.sum(Losses).item()) From 84e74cec3d59c7ff7d309fc4324dd77a5d26ab3d Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Mon, 22 Jul 2024 16:27:48 +0000 Subject: [PATCH 09/15] clean up --- .../quantization/gptq/utils/gptq_wrapper.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py index cc0bec980..b147cc87a 100644 --- a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py +++ b/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py @@ -161,15 +161,9 @@ def compress( if preserve_zeros: W1_nz_mask = W_nz_mask[:, i1:i2] + is_layer_updated_actorder = False if hasattr(self.layer, "quantization_scheme"): quant_scheme = self.layer.quantization_scheme - if quant_scheme.weights is not None: - # such as activation reordering - from compressed_tensors.quantization import ( - update_layer_weight_quant_params, - ) - - update_layer_weight_quant_params(self.layer, g_idx) for i in range(count): w = W1[:, i] @@ -191,6 +185,17 @@ def compress( if quant_scheme.weights 
is not None: # fetch latest correct scale and ZP relevant for any changes + if ( + quant_scheme.weights is not None + and not is_layer_updated_actorder + ): + # such as activation reordering + from compressed_tensors.quantization import ( + update_layer_weight_quant_params, + ) + + update_layer_weight_quant_params(self.layer, g_idx) + is_layer_updated_actorder = True scale = self.layer.weight_scale zero_point = self.layer.weight_zero_point From fed380f014cea8ee99ac3cea78d76869bc2cb629 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Mon, 22 Jul 2024 16:33:47 +0000 Subject: [PATCH 10/15] remvoe unwanted code --- .../quantization/gptq/utils/gptq_wrapper.py | 22 ------------------- 1 file changed, 22 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py index 4e54d0f67..75b9936d5 100644 --- a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py +++ b/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py @@ -100,24 +100,6 @@ def compress( W[:, dead] = 0 g_idx = None - # if hasattr(self.layer, "quantization_scheme"): - # quant_scheme = self.layer.quantization_scheme - # actorder = quant_scheme.weights.actorder - - # if actorder: - # group_size = quant_scheme.weights.group_size - # perm = torch.argsort(torch.diag(self.H), descending=True) - # # W = W[:, perm] - # # self.H = self.H[perm][:, perm] - # invperm = torch.argsort(perm) - - # # g_idx = torch.Tensor( - # # [perm[i] // group_size for i in range(self.columns)] - # # ).to(device=invperm.device) - # g_idx = torch.Tensor( - # [i // group_size for i in range(self.columns)] - # ).to(device=invperm.device) - # self.layer.weight_g_idx.data = g_idx if hasattr(self.layer, "quantization_scheme"): quant_scheme = self.layer.quantization_scheme actorder = quant_scheme.weights.actorder @@ -132,9 +114,6 @@ def compress( g_idx = torch.Tensor( [perm[i] // group_size for i in range(self.columns)] ).to(device=invperm.device) - # g_idx = torch.Tensor( - # [i // group_size for i in range(self.columns)] - # ).to(device=invperm.device) self.layer.weight_g_idx.data = g_idx Losses = torch.zeros(self.rows, device=self.dev) @@ -273,7 +252,6 @@ def compress( W[:, i2:] -= w_err * W_nz_mask[:, i2:] else: W[:, i2:] -= w_err - print("time %.2f" % (time.time() - tick)) logger.info("time %.2f" % (time.time() - tick)) logger.info("error %.2f" % torch.sum(Losses).item()) From 51adb464bb61bf34a1937162bee343fc0f0a1461 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Fri, 26 Jul 2024 16:18:26 +0000 Subject: [PATCH 11/15] rese observer params before inference --- .../modifiers/quantization/gptq/utils/gptq_wrapper.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py index 75b9936d5..ad47151a7 100644 --- a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py +++ b/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py @@ -171,6 +171,8 @@ def compress( update_layer_weight_quant_params, ) + observer = getattr(self.layer, "weight_observer", None) + observer.reset() update_layer_weight_quant_params(self.layer, g_idx) is_layer_updated_actorder = True From 543cfdd5f382cf83ee5d977370e13c39f21c44c4 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Sat, 27 Jul 2024 00:37:39 +0000 Subject: [PATCH 12/15] draft --- .../quantization/gptq/utils/gptq_wrapper.py | 70 +++++++++++++++---- 1 
file changed, 58 insertions(+), 12 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py index ad47151a7..cf4d1eed0 100644 --- a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py +++ b/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py @@ -99,7 +99,8 @@ def compress( self.H[dead, dead] = 1 W[:, dead] = 0 - g_idx = None + g_idx, g_idx_for_perm_weights = None, None + actorder = False if hasattr(self.layer, "quantization_scheme"): quant_scheme = self.layer.quantization_scheme actorder = quant_scheme.weights.actorder @@ -111,9 +112,27 @@ def compress( self.H = self.H[perm][:, perm] invperm = torch.argsort(perm) + # # g_idx for the shuffled weights + # g_idx = torch.Tensor( + # [i // group_size for i in range(self.columns)] + # ).to(device=invperm.device) + + # g_idx for the original weights g_idx = torch.Tensor( [perm[i] // group_size for i in range(self.columns)] ).to(device=invperm.device) + + # g_idx for the permutated weights + g_idx_for_perm_weights = torch.Tensor( + [i // group_size for i in range(self.columns)] + ).to(device=invperm.device) + + + # # g_idx for the original weights + # g_idx_original = g_idx.clone() + # for i in range(g_idx.shape[0]): + # g_idx_original[perm[i]] = torch.Tensor([i // group_size]) + self.layer.weight_g_idx.data = g_idx Losses = torch.zeros(self.rows, device=self.dev) @@ -173,6 +192,8 @@ def compress( observer = getattr(self.layer, "weight_observer", None) observer.reset() + + # update self.layer params with respect to g_idx update_layer_weight_quant_params(self.layer, g_idx) is_layer_updated_actorder = True @@ -183,9 +204,6 @@ def compress( fake_quantize, ) - scale = self.layer.weight_scale - zero_point = self.layer.weight_zero_point - group_size = quant_scheme.weights.group_size if group_size is None or group_size == -1: group_size = self.layer.weight.shape[1] @@ -210,24 +228,25 @@ def compress( else: # strategy == QuantizationStrategy.GROUP # get the group index for the current column column_idx = i1 + i - input_dim_group = ( - column_idx // quant_scheme.weights.group_size - ) - + # Since we're only applying quantization to a slice, this # ends up being a channelwise application altered_qargs = copy(quant_scheme.weights) altered_qargs.strategy = QuantizationStrategy.CHANNEL - if g_idx is not None: + if actorder: q = fake_quantize( q, - scale[:, int(g_idx[column_idx])], - zero_point[:, int(g_idx[column_idx])], + scale[:, int(g_idx_for_perm_weights[column_idx])], + zero_point[:, int(g_idx_for_perm_weights[column_idx])], altered_qargs, ) else: + input_dim_group = ( + column_idx // quant_scheme.weights.group_size + ) + q = fake_quantize( q, scale[:, input_dim_group], @@ -259,7 +278,7 @@ def compress( if actorder: W = W[:, invperm] - self.H = self.H[perm][:, perm] + # self.H = self.H[perm][:, perm] if isinstance(self.layer, transformers.Conv1D): W = W.t() @@ -276,3 +295,30 @@ def free(self): """ delattr(self, "H") super().free() + + +""" +# w = torch.randn(5,5) +t = torch.Tensor([0,3,4,2,1]) +group_size = 2 + +perm = torch.argsort(t) # tensor([0, 4, 3, 1, 2]) # put zero to 0, put 4th index to the second idx, here and so on + +g_idx = torch.Tensor([perm[i] // group_size for i in range(5)]).to(device=invperm.device) +# tensor([0., 2., 1., 0., 1.], device='cuda:0') + +g_idx[perm[i]] = i // group_idx for i in column_size + + + +Make g_idx + +g_idx = torch.Tensor( + [perm[i] // group_size for i in range(self.columns)] + 
).to(device=invperm.device) + +in this format, make sure it runs in compressed tensors and llm-compressor +""" + + + From 757adfefef5e1d8097a9d045ba70531cb6a77c87 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Wed, 31 Jul 2024 20:36:20 +0000 Subject: [PATCH 13/15] draft --- .../quantization/gptq/utils/gptq_wrapper.py | 79 ++++++++++++++----- 1 file changed, 58 insertions(+), 21 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py index cf4d1eed0..bffd82133 100644 --- a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py +++ b/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py @@ -108,33 +108,42 @@ def compress( if actorder: group_size = quant_scheme.weights.group_size perm = torch.argsort(torch.diag(self.H), descending=True) + + # index of perm is the min of self.H + + # # to mimick group size + # perm = torch.arange(0, self.columns, dtype=torch.int) + + W = W[:, perm] self.H = self.H[perm][:, perm] invperm = torch.argsort(perm) - # # g_idx for the shuffled weights - # g_idx = torch.Tensor( - # [i // group_size for i in range(self.columns)] - # ).to(device=invperm.device) - + # breakpoint() # g_idx for the original weights - g_idx = torch.Tensor( - [perm[i] // group_size for i in range(self.columns)] + g_idx = torch.tensor( + # [perm[i] // group_size for i in range(self.columns)] + [i // group_size for i in range(self.columns)], #B + dtype=torch.int, ).to(device=invperm.device) - - # g_idx for the permutated weights + + # g_idx for the permutated weights # A g_idx_for_perm_weights = torch.Tensor( [i // group_size for i in range(self.columns)] ).to(device=invperm.device) - - # # g_idx for the original weights - # g_idx_original = g_idx.clone() + # # # g_idx for the original weights + # g_idx = g_idx_for_perm_weights.clone() # for i in range(g_idx.shape[0]): - # g_idx_original[perm[i]] = torch.Tensor([i // group_size]) + # g_idx[perm[i]] = torch.Tensor([i // group_size]) - self.layer.weight_g_idx.data = g_idx + g_idx = g_idx[invperm] # A, B + # print(g_idx) + # breakpoint() + # breakpoint() + self.layer.weight_g_idx.data = g_idx + Losses = torch.zeros(self.rows, device=self.dev) damp = percdamp * torch.mean(torch.diag(self.H)) @@ -194,11 +203,19 @@ def compress( observer.reset() # update self.layer params with respect to g_idx - update_layer_weight_quant_params(self.layer, g_idx) + # update_layer_weight_quant_params(self.layer, g_idx) + update_layer_weight_quant_params(self.layer,perm=perm, + # g_idx=g_idx, + ) + is_layer_updated_actorder = True scale = self.layer.weight_scale zero_point = self.layer.weight_zero_point + # breakpoint() + scale = scale[:, g_idx] + zero_point=zero_point[:, g_idx] + from compressed_tensors.quantization import QuantizationStrategy from compressed_tensors.quantization.lifecycle.forward import ( fake_quantize, @@ -235,10 +252,22 @@ def compress( altered_qargs.strategy = QuantizationStrategy.CHANNEL if actorder: + input_dim_group = ( + column_idx // quant_scheme.weights.group_size + ) + # print(column_idx) q = fake_quantize( q, scale[:, int(g_idx_for_perm_weights[column_idx])], zero_point[:, int(g_idx_for_perm_weights[column_idx])], + # scale[:, int(g_idx[column_idx])], + # zero_point[:, int(g_idx[column_idx])], + # scale[:, int(g_idx[column_idx])], + # zero_point[:, int(g_idx[column_idx])], + # scale[:, input_dim_group], + # zero_point[:, input_dim_group], + # scale[:, int(invperm[column_idx] // group_size)], #B + # 
zero_point[:, int(invperm[column_idx] // group_size)], #B altered_qargs, ) @@ -298,16 +327,24 @@ def free(self): """ -# w = torch.randn(5,5) -t = torch.Tensor([0,3,4,2,1]) +import torch + group_size = 2 +n = 6 +# Hessian +# H = torch.randperm(n) +H = torch.Tensor([5, 0, 1, 3, 2, 4]) # tensor([5, 0, 1, 3, 2, 4]) +perm = torch.argsort(H, descending=True) # tensor([0, 5, 3, 4, 2, 1]) +invperm = torch.argsort(perm) # tensor([0, 5, 4, 2, 3, 1]) + +# w = torch.randperm(n) +w = torch.Tensor([0, 4, 1, 3, 5, 2]) # tensor([0, 4, 1, 3, 5, 2]) +W = w.clone()[perm] # tensor([0, 2, 3, 5, 1, 4]) -perm = torch.argsort(t) # tensor([0, 4, 3, 1, 2]) # put zero to 0, put 4th index to the second idx, here and so on +g_idx = torch.tensor([int(i // group_size) for i in range(n)], dtype=torch.int) +g_idx = g_idx[invperm] -g_idx = torch.Tensor([perm[i] // group_size for i in range(5)]).to(device=invperm.device) -# tensor([0., 2., 1., 0., 1.], device='cuda:0') -g_idx[perm[i]] = i // group_idx for i in column_size From 814a97e84caba6bdc12eaea11ece0d99f73b6100 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Thu, 1 Aug 2024 00:58:38 +0000 Subject: [PATCH 14/15] draft --- .../quantization/gptq/utils/gptq_wrapper.py | 22 ++++++------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py index bffd82133..a3c55824e 100644 --- a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py +++ b/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py @@ -122,8 +122,8 @@ def compress( # breakpoint() # g_idx for the original weights g_idx = torch.tensor( - # [perm[i] // group_size for i in range(self.columns)] - [i // group_size for i in range(self.columns)], #B + [perm[i] // group_size for i in range(self.columns)] , + # [i // group_size for i in range(self.columns)], #B dtype=torch.int, ).to(device=invperm.device) @@ -137,7 +137,7 @@ def compress( # for i in range(g_idx.shape[0]): # g_idx[perm[i]] = torch.Tensor([i // group_size]) - g_idx = g_idx[invperm] # A, B + # g_idx = g_idx[invperm] # A, B # print(g_idx) # breakpoint() # breakpoint() @@ -204,8 +204,9 @@ def compress( # update self.layer params with respect to g_idx # update_layer_weight_quant_params(self.layer, g_idx) - update_layer_weight_quant_params(self.layer,perm=perm, - # g_idx=g_idx, + update_layer_weight_quant_params(self.layer, + # perm=perm, + g_idx=g_idx, ) is_layer_updated_actorder = True @@ -262,6 +263,7 @@ def compress( zero_point[:, int(g_idx_for_perm_weights[column_idx])], # scale[:, int(g_idx[column_idx])], # zero_point[:, int(g_idx[column_idx])], + # scale[:, int(g_idx[column_idx])], # zero_point[:, int(g_idx[column_idx])], # scale[:, input_dim_group], @@ -345,16 +347,6 @@ def free(self): g_idx = g_idx[invperm] - - - -Make g_idx - -g_idx = torch.Tensor( - [perm[i] // group_size for i in range(self.columns)] - ).to(device=invperm.device) - -in this format, make sure it runs in compressed tensors and llm-compressor """ From c83f977c41392b66d2dd8391f481957d86b49793 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Tue, 6 Aug 2024 18:34:28 +0000 Subject: [PATCH 15/15] mimic gptq --- .../quantization/gptq/utils/gptq_wrapper.py | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py index a3c55824e..bc86e2cea 100644 
--- a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py +++ b/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_wrapper.py @@ -122,8 +122,8 @@ def compress( # breakpoint() # g_idx for the original weights g_idx = torch.tensor( - [perm[i] // group_size for i in range(self.columns)] , - # [i // group_size for i in range(self.columns)], #B + # [perm[i] // group_size for i in range(self.columns)] , + [i // group_size for i in range(self.columns)], #B dtype=torch.int, ).to(device=invperm.device) @@ -142,8 +142,11 @@ def compress( # breakpoint() # breakpoint() + # self.layer.weight_g_idx.data = g_idx + g_idx = g_idx[invperm] self.layer.weight_g_idx.data = g_idx + Losses = torch.zeros(self.rows, device=self.dev) damp = percdamp * torch.mean(torch.diag(self.H)) @@ -205,8 +208,8 @@ def compress( # update self.layer params with respect to g_idx # update_layer_weight_quant_params(self.layer, g_idx) update_layer_weight_quant_params(self.layer, - # perm=perm, - g_idx=g_idx, + perm=perm, + # g_idx=g_idx, ) is_layer_updated_actorder = True @@ -214,8 +217,8 @@ def compress( scale = self.layer.weight_scale zero_point = self.layer.weight_zero_point # breakpoint() - scale = scale[:, g_idx] - zero_point=zero_point[:, g_idx] + # scale = scale[:, g_idx] + # zero_point=zero_point[:, g_idx] from compressed_tensors.quantization import QuantizationStrategy from compressed_tensors.quantization.lifecycle.forward import ( @@ -261,8 +264,10 @@ def compress( q, scale[:, int(g_idx_for_perm_weights[column_idx])], zero_point[:, int(g_idx_for_perm_weights[column_idx])], - # scale[:, int(g_idx[column_idx])], - # zero_point[:, int(g_idx[column_idx])], + # scale[:, int(perm[column_idx] // group_size)], + # zero_point[:, int(perm[column_idx] // group_size)], + # scale[:, int(perm[column_idx] // group_size)], + # zero_point[:, int(perm[column_idx] // group_size)], # scale[:, int(g_idx[column_idx])], # zero_point[:, int(g_idx[column_idx])], @@ -344,7 +349,7 @@ def free(self): W = w.clone()[perm] # tensor([0, 2, 3, 5, 1, 4]) g_idx = torch.tensor([int(i // group_size) for i in range(n)], dtype=torch.int) -g_idx = g_idx[invperm] +g_idx = g_idx[invperm] # tensor([0, 2, 2, 1, 1, 0] """
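
For reference, a small standalone sketch of the g_idx bookkeeping the series converges on: columns are visited in descending order of the Hessian diagonal, groups are contiguous in that permuted order, and the group index is mapped back to the original column order before being stored on the layer. Names (columns, group_size, perm, invperm, g_idx) mirror gptq_wrapper.py; the weight and Hessian stand-ins are illustrative only and this snippet is not part of any patch above.

import torch

torch.manual_seed(0)
columns, group_size = 8, 4
W = torch.randn(3, columns)      # stand-in weight matrix (rows x columns)
H_diag = torch.rand(columns)     # stand-in for torch.diag(self.H)

# Activation ordering: visit the most salient columns (largest Hessian
# diagonal entries) first.
perm = torch.argsort(H_diag, descending=True)
invperm = torch.argsort(perm)

W_perm = W[:, perm]              # columns are quantized in this order

# In the permuted order, groups are contiguous blocks of group_size columns.
base_g_idx = torch.arange(columns, dtype=torch.int32) // group_size

# Store g_idx in the *original* column order: original column j sits at
# permuted position invperm[j], so its group is invperm[j] // group_size.
g_idx = base_g_idx[invperm]
assert torch.equal(g_idx, (invperm // group_size).to(torch.int32))

# After quantizing W_perm, undo the permutation before writing weights back.
assert torch.equal(W_perm[:, invperm], W)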