From 4a1735a38292389b75e06bda4529f722be6e9b64 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 21 Nov 2023 16:11:44 +0800 Subject: [PATCH 01/25] rename rtn_quantize to weight_only_algos Signed-off-by: yiliu30 --- .../torch/algorithms/rtn_quantize.py | 77 ------------------- 1 file changed, 77 deletions(-) delete mode 100644 neural_compressor/torch/algorithms/rtn_quantize.py diff --git a/neural_compressor/torch/algorithms/rtn_quantize.py b/neural_compressor/torch/algorithms/rtn_quantize.py deleted file mode 100644 index 55e9fd31f4d..00000000000 --- a/neural_compressor/torch/algorithms/rtn_quantize.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from typing import Dict - -import torch - -from neural_compressor.common.base_config import BaseConfig -from neural_compressor.common.logger import Logger -from neural_compressor.common.utility import RTN_WEIGHT_ONLY_QUANT -from neural_compressor.torch.algorithms.rtn import rtn_quantize as torch_rtn_quantize -from neural_compressor.torch.quantization.config import RTNWeightQuantConfig -from neural_compressor.torch.utils import fetch_module, register_algo, set_module - -logger = Logger().get_logger() - - -def _apply_rtn_on_single_module(module: torch.nn.Module, quant_config: RTNWeightQuantConfig) -> torch.nn.Module: - enable_full_range = quant_config.enable_full_range - enable_mse_search = quant_config.enable_mse_search - group_dim = quant_config.group_dim - dtype = quant_config.weight_dtype - num_bits = quant_config.weight_bits - scheme = "sym" if quant_config.weight_sym else "asym" - group_size = quant_config.weight_group_size - return_int = quant_config.return_int - return torch_rtn_quantize( - module, - num_bits, - group_size, - scheme, - return_int=return_int, - data_type=dtype, - enable_full_range=enable_full_range, - enable_mse_search=enable_mse_search, - group_dim=group_dim, - ) - - -def _convert_quant_config_into_quant_config_mapping( - fp32_model: torch.nn.Module, quant_config: BaseConfig -) -> Dict[str, BaseConfig]: - # TODO(Yi) enhance it, currently we only assign the global config to module - # model_info: List[Tuple[str, Callable]] = [] - linear_lst = [] - for name, module in fp32_model.named_modules(): - if isinstance(module, torch.nn.Linear): - linear_lst.append(name) - _quant_config = quant_config if quant_config.global_config is None else quant_config.global_config - quant_config_mapping: Dict[str, BaseConfig] = {name: _quant_config for name in linear_lst} - return quant_config_mapping - - -@register_algo(name=RTN_WEIGHT_ONLY_QUANT) -def rtn_quantize_entry(model: torch.nn.Module, quant_config: RTNWeightQuantConfig) -> torch.nn.Module: - quant_config_mapping: Dict[str, RTNWeightQuantConfig] = _convert_quant_config_into_quant_config_mapping( - model, quant_config - ) - """The main entry to apply rtn quantization.""" - for op_name, quant_config in quant_config_mapping.items(): - original_module = fetch_module(model, op_name) - logger.info(f"Apply RTN on module: 
{op_name}, {original_module}") - rtn_module = _apply_rtn_on_single_module(original_module, quant_config) - set_module(model, op_name, rtn_module) - return model From 4cbcdde672fe8580dcf94b0b63f19bfa5aef94cc Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 21 Nov 2023 16:13:32 +0800 Subject: [PATCH 02/25] copied gpt into 3.x Signed-off-by: yiliu30 --- neural_compressor/torch/algorithms/gptq.py | 976 +++++++++++++++++++++ 1 file changed, 976 insertions(+) create mode 100644 neural_compressor/torch/algorithms/gptq.py diff --git a/neural_compressor/torch/algorithms/gptq.py b/neural_compressor/torch/algorithms/gptq.py new file mode 100644 index 00000000000..5c128417531 --- /dev/null +++ b/neural_compressor/torch/algorithms/gptq.py @@ -0,0 +1,976 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copied from neural_compressor/adaptor/torch_utils/gptq.py + +import gc +import math +import random +import re +import time +from collections import UserDict, defaultdict +from functools import partial + +import torch +import torch.nn as nn +import transformers +from tqdm import tqdm + +from neural_compressor.common.logger import Logger + +logger = Logger().get_logger() + + +DEBUG = False + + +# ================ device related =================== +def move_input_to_device(input, device=torch.device("cpu")): + if isinstance(input, dict) or isinstance(input, UserDict): + for inp in input.keys(): + input[inp] = input[inp].to(device) if isinstance(input[inp], torch.Tensor) else input[inp] + elif isinstance(input, list) or isinstance(input, tuple): + input_res, prev_size = [], None + for inp in input: + if prev_size: + if isinstance(inp, torch.Tensor): + if inp.size() == prev_size: + input_res.append(inp.to(device)) + else: + if torch.tensor(inp).size == prev_size: + input_res.append(inp) + else: + input_res.append(inp.to(device) if isinstance(inp, torch.Tensor) else inp) + prev_size = torch.tensor(inp).size() + input = input_res + else: + input = input.to(device) # pylint: disable=no-member + return input + + +# ==============model structure related============== +def is_leaf(module): + """Judge whether a module has no child-modules. + + Args: + module: torch.nn.Module + + Returns: + a bool: whether a module has no child-modules. + """ + children_cnt = 0 + for n in module.children(): + children_cnt += 1 + return True if children_cnt == 0 else False + + +def trace_gptq_target_blocks(module, module_types=[torch.nn.ModuleList, torch.nn.Sequential]): + """Search transformer stacked structures, which is critical in LLMs and GPTQ execution. + + Args: + module: torch.nn.Module + module_types: List of torch.nn.Module. + + Returns: + gptq_related_blocks = { + "embeddings": {}, # Dict embedding layers before transformer stack module, + "transformers_pre": {}, # TODO + "transformers_name": string. LLMs' transformer stack module name , + "transformers": torch.nn.ModuleList. 
LLMs' transformer stack module, + "transformers": {}, Dict# TODO + } + """ + if type(module).__name__ == "MixFormerSequentialForCausalLM": # pragma: no cover + gptq_related_blocks = { + "embeddings": {}, + "transformers_pre": {}, # todo + "transformers_name": "", # None + "transformers": [], # None + "transformers_post": {}, # todo + } + for n, m in module.named_modules(): + if type(m) in module_types: + gptq_related_blocks["transformers_name"] = n + gptq_related_blocks["transformers"] = m + break + else: + continue + for n, m in gptq_related_blocks["transformers"][0].named_modules(): + if is_leaf(m): + gptq_related_blocks["embeddings"][n] = m + gptq_related_blocks["transformers"] = gptq_related_blocks["transformers"][1:-1] + else: + gptq_related_blocks = { + "embeddings": {}, + "transformers_pre": {}, # todo + "transformers_name": "", # None + "transformers": [], # None + "transformers_post": {}, # todo + } + for n, m in module.named_modules(): + if type(m) in module_types: + gptq_related_blocks["transformers_name"] = n + gptq_related_blocks["transformers"] = m + return gptq_related_blocks + else: + if is_leaf(m): + gptq_related_blocks["embeddings"][n] = m + return gptq_related_blocks + + +def find_layers(module, layers=[nn.Conv2d, nn.Conv1d, nn.Linear, transformers.Conv1D], name=""): + """Get all layers with target types.""" + if type(module) in layers: + return {name: module} + else: + # use string type to find name: + if type(module).__name__ in ["Linear"]: + return {name: module} + else: + pass + res = {} + for name1, child in module.named_children(): + res.update(find_layers(child, layers=layers, name=name + "." + name1 if name != "" else name1)) + return res + + +def find_layers_name(module, layers=[nn.Conv2d, nn.Conv1d, nn.Linear, transformers.Conv1D], name=""): + """Get all layers with target types.""" + if type(module) in layers: + return [name] + res = [] + for name1, child in module.named_children(): + res += find_layers_name(child, layers=layers, name=name + "." + name1 if name != "" else name1) + return res + + +def log_quantizable_layers_per_transformer( + transformer_blocks, layers=[nn.Conv2d, nn.Conv1d, nn.Linear, transformers.Conv1D] +): + """Print all layers which will be quantized in GPTQ algorithm.""" + logger.info("* * Layer to be quantized * *") + + for block_id in range(len(transformer_blocks["transformers"])): + transformer_block = transformer_blocks["transformers"][block_id] + layers_for_this_tblock = find_layers_name(transformer_block) + layer_names = [ + (transformer_blocks["transformers_name"] + "." + str(block_id) + "." + layer_name) + for layer_name in layers_for_this_tblock + ] + for name in layer_names: + logger.info(name) + + +# ===============quantization related============================ +def quantize(x, scale, zero, maxq): + """Do quantization.""" + if maxq < 0: + return (x > scale / 2).float() * scale + (x < zero / 2).float() * zero + q = torch.clamp(torch.round(x / scale) + zero, 0, maxq) + return scale * (q - zero) + + +class GPTQuantizer(object): + """Main API for GPTQ algorithm. + + Please refer to: + GPTQ: Accurate Post-training Compression for Generative Pretrained Transformers + url: https://arxiv.org/abs/2210.17323 + """ + + def __init__( + self, + model, + weight_config={}, + dataloader=None, + nsamples=128, + use_max_length=True, + pad_max_length=2048, + device=None, + layer_wise=False, + ): + """ + Args: + model: the fp32 model to quantize + weight_config (dict, optional): contains all info required by GPTQ. Defaults to {}. 
For example, + weight_config={ + 'layer1': + { + 'bits': 4, + 'group_size': 32, + 'sym': False, + 'percdamp': .01, + 'act_order': False + } + ... + } + dataloader: an iterable containing calibration datasets, contains (inputs, targets) + device: cpu or cuda + """ + # model + self.model = model + # self.use_cache = self.model.config.use_cache + self.gptq_related_blocks = trace_gptq_target_blocks(self.model) # get the transformer block list above + self.dtype = next(iter(self.model.parameters())).dtype + log_quantizable_layers_per_transformer(self.gptq_related_blocks) + + # weight config + self.weight_config = weight_config + # default settings, check configs + self.wbits_default = 4 + self.group_size_default = 128 + self.block_size_default = 128 + self.percdamp_default = 0.01 + self.sym_default = False + self.act_order_default = False + self.perchannel_default = True + self.mse_default = False + self.check_layer_config() + + # device + self.device = device + if str(self.model.device).startswith("cuda"): + self.device = self.model.device + self.is_ready = False + + self.layer_wise = layer_wise + + # dataloader + self.use_max_length = use_max_length + self.pad_max_length = pad_max_length + self.dataloader_original = dataloader + self.dataloader = [] + self.nsamples = nsamples + self.prepare_dataloader() + + def prepare_dataloader(self): + if self.use_max_length: + # (Recommend) only take sequence whose length exceeds self.pad_max_length, + # which preserves calibration's tokens are all valid + # This is GPTQ official dataloader implementation + self.obtain_first_n_samples_fulllength() + else: + # general selection, no padding, not GPTQ original implementation. + self.obtain_first_n_samples() + try: + self.cache_key_arguments = { + "i": 0 + } # a dict of list, keyword arguments ("attention_masks", "position_ids", etc.) + # Note that the first elements in cache_positional_arguments is main input: hidden_states + self.cache_positional_arguments = [] # a list of list, positional arguments ("rotary_pos_emb" in chatglm) + self.is_ready = True + except: + logger.warning("GPTQ Quantizer initialization failed!") + pass + + def obtain_first_n_samples(self, seed=0): + """Get first nsample data as the real calibration dataset.""" + self.dataloader.clear() + random.seed(seed) + for batch in self.dataloader_original: + # process data, depends on its data type. 
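+            # Whatever the batch type (list/tuple, dict, or plain tensor), sequences longer than
+            # self.pad_max_length are cropped to a random window of that length; shorter sequences
+            # are kept unchanged, since this general-selection path applies no padding.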
+ if len(self.dataloader) == self.nsamples: + logger.info(f"Successfully collect {self.nsamples} calibration samples.") + break + # list, tuple + if isinstance(batch, list) or isinstance(batch, tuple): + if batch[0].shape[-1] > self.pad_max_length: + i = random.randint(0, batch[0].shape[-1] - self.pad_max_length - 1) + j = i + self.pad_max_length + batch_final = [] + for item in batch: + if isinstance(item, torch.Tensor) and item.shape.__len__() == 2: + batch_final.append(item[:, i:j]) + else: + batch_final.append(item) + else: + batch_final = batch[:] + # dict + elif isinstance(batch, dict): + try: + length = batch["input_ids"].shape[-1] + except: + logger.warning("Please make sure your dict'like data contains key of 'input_ids'.") + continue + batch_final = {} + if length > self.pad_max_length: + i = random.randint(0, length - self.pad_max_length - 1) + j = i + self.pad_max_length + # may have to slice every sequence related data + for key in batch.keys(): + if isinstance(batch[key], torch.Tensor): + batch_final[key] = batch[key][:, i:j] # slice on sequence length dim + else: + batch_final[key] = batch[key] + else: + batch_final = batch + # tensor + else: + if batch.shape[-1] > self.pad_max_length: + i = random.randint(0, batch.shape[-1] - self.pad_max_length - 1) + j = i + self.pad_max_length + batch_final = batch[:, i:j] + else: + batch_final = batch + self.dataloader.append(batch_final) + + if len(self.dataloader) < self.nsamples: + logger.warning(f"Try to use {self.nsamples} data, but entire dataset size is {len(self.dataloader)}.") + + def obtain_first_n_samples_fulllength(self, seed=0): + self.dataloader.clear() + random.seed(seed) + unified_length = self.pad_max_length + for batch in self.dataloader_original: + if len(self.dataloader) == self.nsamples: + logger.info(f"Successfully collect {self.nsamples} calibration samples.") + break + # list & tuple, gpt-j-6b mlperf, etc. 
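+            # Full-length mode keeps only sequences with at least `unified_length` tokens:
+            # longer sequences are cropped to a random window of exactly that length,
+            # while shorter ones are skipped and excluded from the calibration set.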
+ if isinstance(batch, list) or isinstance(batch, tuple): + if batch[0].shape[-1] == unified_length: + batch_final = batch[:] + elif batch[0].shape[-1] > unified_length: + i = random.randint(0, batch[0].shape[-1] - unified_length - 1) + j = i + unified_length + batch_final = [] + for item in batch: + if isinstance(item, torch.Tensor) and item.shape.__len__() == 2: + batch_final.append(item[:, i:j]) + else: + batch_final.append(item) + else: + # not match max length, not include in target dataset + continue + # dict + elif isinstance(batch, dict): + try: + length = batch["input_ids"].shape[-1] + except: + logger.warning("Please make sure your dict'like data contains key of 'input_ids'.") + continue + batch_final = {} + if length == self.pad_max_length: + batch_final = batch + elif length > self.pad_max_length: + i = random.randint(0, length - self.pad_max_length - 1) + j = i + self.pad_max_length + # may have to slice every sequence related data + for key in batch.keys(): + if isinstance(batch[key], torch.Tensor): + batch_final[key] = batch[key][:, i:j] # slice on sequence length dim with same position + else: + batch_final[key] = batch[key] + else: + # not match max length, not include in target dataset + continue + # tensor + else: + if batch.shape[-1] == unified_length: + batch_final = batch + elif batch.shape[-1] > unified_length: + i = random.randint(0, batch.shape[-1] - unified_length - 1) + j = i + unified_length + batch_final = batch[:, i:j] + else: + # not match max length, not include in target dataset + continue + self.dataloader.append(batch_final) + if len(self.dataloader) < self.nsamples: # pragma: no cover + logger.warning( + f"Trying to allocate {self.nsamples} data with fixed length {unified_length}, \ + but only {len(self.dataloader)} samples are found. Please use smaller 'self.pad_max_length' value." 
+ ) + + def get_full_layer_name(self, sub_layer_name, block_idx): + transformer_name = self.gptq_related_blocks["transformers_name"] + return ".".join([transformer_name, str(block_idx), sub_layer_name]) + + def check_layer_config(self): + """Copy arguments from weight_config to built-in attributes.""" + if "wbits" in self.weight_config: + tmp_weight_config = {} + for name, module in self.model.named_modules(): + tmp_weight_config[name] = {} + tmp_weight_config[name]["wbits"] = self.weight_config.get("wbits", self.wbits_default) + tmp_weight_config[name]["group_size"] = self.weight_config.get("group_size", self.group_size_default) + tmp_weight_config[name]["block_size"] = self.weight_config.get("block_size", self.group_size_default) + tmp_weight_config[name]["percdamp"] = self.weight_config.get("pecdamp", self.percdamp_default) + tmp_weight_config[name]["sym"] = self.weight_config.get("sym", self.sym_default) + tmp_weight_config[name]["act_order"] = self.weight_config.get("act_order", self.act_order_default) + tmp_weight_config[name]["perchannel"] = self.weight_config.get("perchannel", self.perchannel_default) + tmp_weight_config[name]["mse"] = self.weight_config.get("mse", self.mse_default) + self.weight_config = tmp_weight_config + else: + for layer_name, config in self.weight_config.items(): + self.weight_config[layer_name]["wbits"] = config.get("wbits", self.wbits_default) + self.weight_config[layer_name]["group_size"] = config.get("group_size", self.group_size_default) + self.weight_config[layer_name]["block_size"] = config.get("block_size", self.group_size_default) + self.weight_config[layer_name]["percdamp"] = config.get("pecdamp", self.percdamp_default) + self.weight_config[layer_name]["sym"] = config.get("sym", self.sym_default) + self.weight_config[layer_name]["act_order"] = config.get("act_order", self.act_order_default) + self.weight_config[layer_name]["perchannel"] = config.get("perchannel", self.perchannel_default) + self.weight_config[layer_name]["mse"] = config.get("mse", self.mse_default) + + def get_layer_config(self, layer_name): + """Obtain config for one layer, since GPTQ supports layer-wise config.""" + # First try the exact name matching, if cannot find, use re to search. For example, can support ".*" in op_name + config = None + config = self.weight_config.get(layer_name, None) + if config is not None: + return config + else: + for k, v in self.weight_config.items(): + regex = re.compile(k) + if len(regex.findall(layer_name)) is not None: + config = v + return config + else: + pass + return config + + def track_hidden_states(self, data): + if isinstance(data, torch.Tensor): + return data + elif isinstance(data, tuple) or isinstance(data, list): + return data[0] + + @torch.no_grad() + def pre_quantization(self): + """Prepare input calibration data and other attributes which are critical for gptq execution.""" + + # critical: hooker function which collects inputs + def forward(layer, *args, **kwargs): + # inputs[inputs_info['idx']] = input_ids # TODO solve the problem of batchsize!=1 + self.cache_key_arguments["i"] += 1 + for arg in kwargs: + # TODO: investigate include parameters + # each outputs can be different shape, hence also use list to store + if isinstance(kwargs[arg], torch.Tensor) or arg == "alibi": + if self.cache_key_arguments.get(arg, None) is None: + self.cache_key_arguments[arg] = [] + self.cache_key_arguments[arg].append(kwargs[arg]) + continue + # copy positional arguments, positional arguments are sensitive for their order, be cautious! 
+ # Most models in HF has avoid this, but some models still use positional arguments other than + # hidden_states, chatglm2-6b etc. + for idx, item in enumerate(args): + if (idx + 1) > len(self.cache_positional_arguments): + # initialize + self.cache_positional_arguments.append([]) + self.cache_positional_arguments[idx].append(item) + raise ValueError + + # Step1: fetch the embeddings and other layers before the transformer stack. + if not self.layer_wise: + for embedding_name, embedding_layer in self.gptq_related_blocks["embeddings"].items(): + embedding_layer = embedding_layer.to(self.device) + + # Step2: modify the first transformer block's forward function to obtain inputs for calibration + if not self.layer_wise: + self.gptq_related_blocks["transformers"][0] = self.gptq_related_blocks["transformers"][0].to(self.device) + forward_cache = self.gptq_related_blocks["transformers"][0].forward + self.gptq_related_blocks["transformers"][0].forward = partial( + forward, self.gptq_related_blocks["transformers"][0] + ) + + # Step3: run forward to obtain calibration datasets + logger.info("Collecting calibration inputs...") + for batch in tqdm(self.dataloader): + if not self.layer_wise: + batch = move_input_to_device(batch, self.device) + try: + if isinstance(batch, tuple) or isinstance(batch, list): + self.model(batch[0]) + elif isinstance(batch, dict): + self.model(**batch) + else: + self.model(batch) + except ValueError: + pass + # output inp data shape + logger.info("All calibration data's shape =>") + # check all hidden_states shape + try: + for hidden_states in self.cache_positional_arguments[0]: + logger.info(hidden_states.shape) + except: + pass + logger.info("Done.") + + # Step 4: restore original forward function, relocate layers back to cpu. + self.gptq_related_blocks["transformers"][0].forward = forward_cache + if not self.layer_wise: + self.gptq_related_blocks["transformers"][0] = self.gptq_related_blocks["transformers"][0].cpu() + for embedding_name, embedding_layer in self.gptq_related_blocks["embeddings"].items(): + embedding_layer.to(self.device) + torch.cuda.empty_cache() + # end + logger.info("GPTQ quantization prepared.") + + def gather_single_batch_from_dict(self, data_dict, idx): + # obtain a set of keyword input from cache + single_batch = {} + for k, v in data_dict.items(): + single_batch[k] = data_dict[k][idx] + return single_batch + + def gather_single_batch_from_list(self, data_list, idx): + # obtain a set of keyword input from cache + single_batch = [] + for data_item in data_list: + single_batch.append(data_item[idx]) + return single_batch + + def update_blockwise_hidden_states(self, outs): + if "hidden_states" in self.cache_key_arguments: + self.cache_key_arguments["hidden_states"] = outs[:] + else: + self.cache_positional_arguments[0] = outs[:] + + @torch.no_grad() + def execute_quantization(self, means=None, stds=None, model_path=None): + """Run quantization.""" + # Step1: prepare quantization (calibration datasets) + + logger.info("Begin ====>") + self.pre_quantization() + + # Step2: run gptq quantization in a transformer block-wise manner. 
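+        # For each transformer block: move it to the target device (unless layer-wise mode is on),
+        # collect its quantizable sub-layers, hook their inputs during a calibration forward pass,
+        # run fasterquant on every hooked layer, then reuse the block's outputs as the next block's inputs.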
+ gptq_config = {} + tblock_length = len(self.gptq_related_blocks["transformers"]) + for block_idx in range(tblock_length): + logger.info(f"Quantizing layer {block_idx + 1} / {tblock_length}..") + if not self.layer_wise: + # if we do not apply layer-wise feature, we still place the entire block on the GPU + transformer_block = self.gptq_related_blocks["transformers"][block_idx].to(self.device) + else: + transformer_block = self.gptq_related_blocks["transformers"][block_idx] # .to(self.device) + # Step2.1: obtain all layers (Linear, Conv2d, etc) in the block which can be quantized. + sub_layers = find_layers(transformer_block) + sub_layers_to_quant = {} + for layer_name, layer_obj in sub_layers.items(): + # filter sub_layers with included layer_names in self.weight_config + full_layer_name = self.get_full_layer_name(layer_name, block_idx) + # if self.weight_config.get(full_layer_name, None) == None: + if self.get_layer_config(full_layer_name) is None: + logger.warning(f"{full_layer_name} can be quantized " + "but excluded from quantization configs.") + else: + sub_layers_to_quant[layer_name] = layer_obj + del sub_layers + sub_layers = sub_layers_to_quant + # Step 2.2: Initialize GPTQ quantizers for collected layers. + gptq_for_this_block = {} + # initialize gptq quantizer for every layer in a transformer block + for layer_name in sub_layers: + # weight_config_this_layer = self.weight_config.get( + # self.get_full_layer_name(layer_name, block_idx), None + # ) + full_layer_name = self.get_full_layer_name(layer_name, block_idx) + weight_config_this_layer = self.get_layer_config(full_layer_name) + if self.layer_wise: + from ..torch_utils.layer_wise_quant.utils import load_value + + W = load_value(self.model, full_layer_name + ".weight", model_path) + else: + W = sub_layers[layer_name].weight.data.clone() + + gptq_for_this_block[layer_name] = GPTQ(sub_layers[layer_name], W, self.device) + # gptq_for_this_block[layer_name].quantizer = Quantizer() + gptq_for_this_block[layer_name].quantizer.configure( + weight_config_this_layer["wbits"], + weight_config_this_layer["perchannel"], + weight_config_this_layer["sym"], + weight_config_this_layer["mse"], + ) + + # Step 2.3: modify forward functions to hook inputs data (used in gptq execution) + def add_batch(_name): + def tmp(_, inp, out): + gptq_for_this_block[_name].add_batch(inp[0].data, out.data) # noqa: F821 + + return tmp + + handles = [] # register handles which add inputs and outputs to gptq object + for layer_name in sub_layers: + handles.append(sub_layers[layer_name].register_forward_hook(add_batch(layer_name))) + idx = self.cache_key_arguments.pop("i") + for j in range(len(self.dataloader)): + cache_keyword_batch = self.gather_single_batch_from_dict(self.cache_key_arguments, j) + cache_positional_batch = self.gather_single_batch_from_list(self.cache_positional_arguments, j) + out = transformer_block(*cache_positional_batch, **cache_keyword_batch) + out = self.track_hidden_states(out) + self.cache_key_arguments["i"] = idx + for h in handles: + h.remove() + # Step 2.4: everything is prepared, so start quantization! 
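+            # fasterquant returns the per-group scales, zero points and the quantized weight Q;
+            # Q overwrites the layer's weight, while the scale (plus zero point and act_order
+            # permutation, when applicable) is recorded in gptq_config for later restoration.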
+ for layer_name in sub_layers: + # weight_config_this_layer = self.weight_config.get( + # self.get_full_layer_name(layer_name, block_idx), None + # ) + weight_config_this_layer = self.get_layer_config(self.get_full_layer_name(layer_name, block_idx)) + logger.info(f"Quantizing layer {layer_name}") + if self.layer_wise: + from ..torch_utils.layer_wise_quant.utils import load_value + + full_layer_name = self.get_full_layer_name(layer_name, block_idx) + W = load_value(self.model, full_layer_name + ".weight", model_path) + else: + W = sub_layers[layer_name].weight.data.clone() + scale, zp, Q = gptq_for_this_block[layer_name].fasterquant( + W, + blocksize=weight_config_this_layer["block_size"], + percdamp=weight_config_this_layer["percdamp"], + groupsize=weight_config_this_layer["group_size"], + act_order=weight_config_this_layer["act_order"], + ) + if self.layer_wise: + from ..torch_utils.layer_wise_quant.utils import ( + LWQ_WORKSPACE, + clean_module_weight, + load_value, + set_module_tensor_to_device, + ) + + sub_layer = sub_layers[layer_name] + full_layer_name = self.get_full_layer_name(layer_name, block_idx) + for n, p in sub_layer.named_parameters(): + param_name = full_layer_name + "." + n + if n == "weight": + set_module_tensor_to_device(self.model, param_name, self.device, Q) + else: + value = load_value(self.model, param_name, model_path) + set_module_tensor_to_device(self.model, param_name, self.device, value) + # sub_layer.weight.data = Q + torch.save(sub_layer.state_dict(), LWQ_WORKSPACE + f"/{full_layer_name}.pt") + clean_module_weight(sub_layer) + del Q + gc.collect() + else: + sub_layers[layer_name].weight.data = Q + gptq_config[self.get_full_layer_name(layer_name, block_idx)] = {"scale": scale} + if not weight_config_this_layer["sym"]: + gptq_config[self.get_full_layer_name(layer_name, block_idx)]["zero"] = zp + if weight_config_this_layer["act_order"]: # save perm for restoring the weights + gptq_config[self.get_full_layer_name(layer_name, block_idx)]["perm"] = gptq_for_this_block[ + layer_name + ].perm + gptq_for_this_block[layer_name].free() + + # Step 2.5: replace output data with quantized weights + outs = [] + idx = self.cache_key_arguments.pop("i") + for j in range(len(self.dataloader)): + cache_keyword_batch = self.gather_single_batch_from_dict(self.cache_key_arguments, j) + cache_positional_batch = self.gather_single_batch_from_list(self.cache_positional_arguments, j) + out = transformer_block(*cache_positional_batch, **cache_keyword_batch) + out = self.track_hidden_states(out) + outs.append(out) + self.cache_key_arguments["i"] = idx + if self.layer_wise: + self.gptq_related_blocks["transformers"][block_idx] = transformer_block + else: + self.gptq_related_blocks["transformers"][block_idx] = transformer_block.cpu() + del gptq_for_this_block + torch.cuda.empty_cache() + # iteratively replace the input with output, thus layerwise quantization can continue. 
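+            # The quantized block's outputs become the cached inputs of the next block, so each
+            # block is calibrated against activations produced by the already-quantized blocks.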
+ self.update_blockwise_hidden_states(outs) + logger.info("------------------------------") + + logger.info("Quantization done") + # self.model.config.use_cache = self.use_cache + + # obtain model (all weight only quantization API function should return) + for k, v in gptq_config.items(): + for m, n in v.items(): + gptq_config[k][m] = n.tolist() + return self.model, gptq_config + + +class GPTQ: + """ + Please refer to: + GPTQ: Accurate Post-training Compression for Generative Pretrained Transformers (https://arxiv.org/abs/2210.17323) + """ + + def __init__(self, layer, W, device="cpu"): + self.layer = layer + self.device = device + # W = layer.weight.data.clone() + if isinstance(self.layer, nn.Conv2d) or isinstance(self.layer, nn.Conv1d): + W = W.flatten(1) + if isinstance(self.layer, transformers.Conv1D): + W = W.t() + self.rows = W.shape[0] # output channels + self.columns = W.shape[1] # input channels + self.H = torch.zeros((self.columns, self.columns), device=self.device) + self.nsamples = 0 + self.quantizer = Quantizer() + self.perm = None # act_order choice + + def add_batch(self, inp, out): + # if DEBUG: + # self.inp1 = inp + # self.out1 = out + if len(inp.shape) == 2: + inp = inp.unsqueeze(0) + tmp = inp.shape[0] + if isinstance(self.layer, nn.Linear) or isinstance(self.layer, transformers.Conv1D): + if len(inp.shape) == 3: + inp = inp.reshape((-1, inp.shape[-1])) + inp = inp.t() + # TODO: llm's transformer sequential with nn.conv2d is currently not under test + # if isinstance(self.layer, nn.Conv2d): + # unfold = nn.Unfold( + # self.layer.kernel_size, + # dilation=self.layer.dilation, + # padding=self.layer.padding, + # stride=self.layer.stride + # ) + # inp = unfold(inp) + # inp = inp.permute([1, 0, 2]) + # inp = inp.flatten(1) + self.H *= self.nsamples / (self.nsamples + tmp) + self.nsamples += tmp + # inp = inp.float() + inp = math.sqrt(2 / self.nsamples) * inp.float() + # self.H += 2 / self.nsamples * inp.matmul(inp.t()) + self.H += inp.matmul(inp.t()) # H = X*X, which should be a sysm matrix + + def fasterquant(self, W, blocksize=128, percdamp=0.01, groupsize=-1, act_order=False): + # W = self.layer.weight.data.clone() + weight_shape, weight_dtype = W.shape, W.data.dtype + if isinstance(self.layer, nn.Conv2d): + W = W.flatten(1) + if isinstance(self.layer, transformers.Conv1D): + W = W.t() + W = W.float() + + tick = time.time() + + if not self.quantizer.ready(): + self.quantizer.find_params(W, weight=True) + + H = self.H + del self.H + dead = torch.diag(H) == 0 + H[dead, dead] = 1 + W[:, dead] = 0 # such channel makes no contribution to quantization computation + + # rearrange considering the diag's value + if act_order: + perm = torch.argsort(torch.diag(H), descending=True) + W = W[:, perm] + H = H[perm][:, perm] + self.perm = perm.clone() + + Losses = torch.zeros_like(W) + Q = torch.zeros_like(W) + + damp = percdamp * torch.mean(torch.diag(H)) + diag = torch.arange(self.columns, device=self.device) + H[diag, diag] += damp # add a average value of + H = torch.linalg.cholesky(H) + H = torch.cholesky_inverse(H) + H = torch.linalg.cholesky(H, upper=True) + Hinv = H + + scale = [] + zero = [] + + for i1 in range(0, self.columns, blocksize): + i2 = min(i1 + blocksize, self.columns) + count = i2 - i1 + + W1 = W[:, i1:i2].clone() + Q1 = torch.zeros_like(W1) + Err1 = torch.zeros_like(W1) + Losses1 = torch.zeros_like(W1) + Hinv1 = Hinv[i1:i2, i1:i2] + + for i in range(count): # within a block, channel wise + w = W1[:, i] + d = Hinv1[i, i] + + if groupsize != -1: + if (i1 + i) % 
groupsize == 0: + self.quantizer.find_params(W[:, (i1 + i) : (i1 + i + groupsize)], weight=True) + scale.append(self.quantizer.scale) + zero.append(self.quantizer.zero) + + q = quantize(w.unsqueeze(1), self.quantizer.scale, self.quantizer.zero, self.quantizer.maxq).flatten() + Q1[:, i] = q + Losses1[:, i] = (w - q) ** 2 / d**2 + + err1 = (w - q) / d + W1[:, i:] -= err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0)) + Err1[:, i] = err1 + + Q[:, i1:i2] = Q1 + Losses[:, i1:i2] = Losses1 / 2 + + W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:]) + + # if DEBUG: + # self.layer.weight.data[:, :i2] = Q[:, :i2] + # self.layer.weight.data[:, i2:] = W[:, i2:] + # logger.info(f"{torch.sum((self.layer(self.inp1) - self.out1) ** 2)}") + # logger.info(f"{torch.sum(Losses)}") + + if str(self.device).startswith("cuda"): + torch.cuda.synchronize() + logger.info(f"time {(time.time() - tick)}") + logger.info(f"error {torch.sum(Losses).item()}") + + if act_order: + invperm = torch.argsort(perm) + Q = Q[:, invperm] + + if isinstance(self.layer, transformers.Conv1D): + Q = Q.t() + # self.layer.weight.data = Q.reshape(self.layer.weight.shape).to(self.layer.weight.data.dtype) + Q = Q.reshape(weight_shape).to(weight_dtype) + if DEBUG: + logger.info(f"{torch.sum((self.layer(self.inp1) - self.out1) ** 2)}") + + if scale == []: + scale.append(self.quantizer.scale) + zero.append(self.quantizer.zero) + scale = torch.cat(scale, dim=1) + zero = torch.cat(zero, dim=1) + return scale, zero, Q + + def free(self): + if DEBUG: + self.inp1 = None + self.out1 = None + self.H = None + self.Losses = None + self.Trace = None + torch.cuda.empty_cache() + + +class Quantizer(nn.Module): + def __init__(self, shape=1): + super(Quantizer, self).__init__() + self.register_buffer("maxq", torch.tensor(0)) + self.register_buffer("scale", torch.zeros(shape)) + self.register_buffer("zero", torch.zeros(shape)) + + def configure(self, bits, perchannel=False, sym=True, mse=False, norm=2.4, grid=100, maxshrink=0.8, trits=False): + self.maxq = torch.tensor(2**bits - 1) + self.perchannel = perchannel + self.sym = sym + self.mse = mse + self.norm = norm + self.grid = grid + self.maxshrink = maxshrink + if trits: + self.maxq = torch.tensor(-1) + + def find_params(self, x, weight=False): + dev = x.device + self.maxq = self.maxq.to(dev) + + shape = x.shape + if self.perchannel: + if weight: + x = x.flatten(1) + else: + if len(shape) == 4: + x = x.permute([1, 0, 2, 3]) + x = x.flatten(1) + if len(shape) == 3: + x = x.reshape((-1, shape[-1])).t() + if len(shape) == 2: + x = x.t() + else: + x = x.flatten().unsqueeze(0) + + tmp = torch.zeros(x.shape[0], device=dev) + xmin = torch.minimum(x.min(1)[0], tmp) + xmax = torch.maximum(x.max(1)[0], tmp) + + if self.sym: + xmax = torch.maximum(torch.abs(xmin), xmax) + tmp = xmin < 0 + if torch.any(tmp): + xmin[tmp] = -xmax[tmp] + tmp = (xmin == 0) & (xmax == 0) + xmin[tmp] = -1 + xmax[tmp] = +1 + + if self.maxq < 0: + self.scale = xmax + self.zero = xmin + else: + self.scale = (xmax - xmin) / self.maxq + if self.sym: + self.zero = torch.full_like(self.scale, (self.maxq + 1) / 2) + else: + self.zero = torch.round(-xmin / self.scale) + + if self.mse: + best = torch.full([x.shape[0]], float("inf"), device=dev) + for i in range(int(self.maxshrink * self.grid)): + p = 1 - i / self.grid + xmin1 = p * xmin + xmax1 = p * xmax + scale1 = (xmax1 - xmin1) / self.maxq + zero1 = torch.round(-xmin1 / scale1) if not self.sym else self.zero + q = quantize(x, scale1.unsqueeze(1), zero1.unsqueeze(1), self.maxq) + q -= x + q.abs_() + 
q.pow_(self.norm) + err = torch.sum(q, 1) + tmp = err < best + if torch.any(tmp): + best[tmp] = err[tmp] + self.scale[tmp] = scale1[tmp] + self.zero[tmp] = zero1[tmp] + if not self.perchannel: + if weight: + tmp = shape[0] + else: + tmp = shape[1] if len(shape) != 3 else shape[2] + self.scale = self.scale.repeat(tmp) + self.zero = self.zero.repeat(tmp) + + if weight: + shape = [-1] + [1] * (len(shape) - 1) + self.scale = self.scale.reshape(shape) + self.zero = self.zero.reshape(shape) + return + if len(shape) == 4: + self.scale = self.scale.reshape((1, -1, 1, 1)) + self.zero = self.zero.reshape((1, -1, 1, 1)) + if len(shape) == 3: + self.scale = self.scale.reshape((1, 1, -1)) + self.zero = self.zero.reshape((1, 1, -1)) + if len(shape) == 2: + self.scale = self.scale.unsqueeze(0) + self.zero = self.zero.unsqueeze(0) + + # def quantize(self, x): + # if self.ready(): + # return quantize(x, self.scale, self.zero, self.maxq) + # return x + + # def enabled(self): + # return self.maxq > 0 + + def ready(self): + return torch.all(self.scale != 0) From 814a3f4550b30cb9ce11ae302d294c784703f0b4 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 21 Nov 2023 16:22:03 +0800 Subject: [PATCH 03/25] add gptq entry (WIP) Signed-off-by: yiliu30 --- neural_compressor/common/utility.py | 1 + .../torch/algorithms/weight_only_algos.py | 105 ++++++++++++++++++ 2 files changed, 106 insertions(+) create mode 100644 neural_compressor/torch/algorithms/weight_only_algos.py diff --git a/neural_compressor/common/utility.py b/neural_compressor/common/utility.py index 51b37092033..32d3adbd4e7 100644 --- a/neural_compressor/common/utility.py +++ b/neural_compressor/common/utility.py @@ -26,4 +26,5 @@ BASE_CONFIG = "base_config" COMPOSABLE_CONFIG = "composable_config" RTN_WEIGHT_ONLY_QUANT = "rtn_weight_only_quant" +GTPQ = "gptq" DUMMY_CONFIG = "dummy_config" diff --git a/neural_compressor/torch/algorithms/weight_only_algos.py b/neural_compressor/torch/algorithms/weight_only_algos.py new file mode 100644 index 00000000000..be5acada1e1 --- /dev/null +++ b/neural_compressor/torch/algorithms/weight_only_algos.py @@ -0,0 +1,105 @@ +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
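+# Algorithm entries for weight-only quantization (RTN and GPTQ). Each entry is registered
+# through `register_algo` so the torch quantization API can dispatch to it by algorithm name.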
+ + +from typing import Dict + +import torch + +from neural_compressor.common.base_config import BaseConfig +from neural_compressor.common.logger import Logger +from neural_compressor.common.utility import GTPQ, RTN_WEIGHT_ONLY_QUANT +from neural_compressor.torch.algorithms.rtn import rtn_quantize as torch_rtn_quantize +from neural_compressor.torch.quantization.config import RTNWeightQuantConfig +from neural_compressor.torch.utils import fetch_module, register_algo, set_module + +logger = Logger().get_logger() + + +def _apply_rtn_on_single_module(module: torch.nn.Module, quant_config: RTNWeightQuantConfig) -> torch.nn.Module: + enable_full_range = quant_config.enable_full_range + enable_mse_search = quant_config.enable_mse_search + group_dim = quant_config.group_dim + dtype = quant_config.weight_dtype + num_bits = quant_config.weight_bits + scheme = "sym" if quant_config.weight_sym else "asym" + group_size = quant_config.weight_group_size + return_int = quant_config.return_int + return torch_rtn_quantize( + module, + num_bits, + group_size, + scheme, + return_int=return_int, + data_type=dtype, + enable_full_range=enable_full_range, + enable_mse_search=enable_mse_search, + group_dim=group_dim, + ) + + +def _convert_quant_config_into_quant_config_mapping( + fp32_model: torch.nn.Module, quant_config: BaseConfig +) -> Dict[str, BaseConfig]: + # TODO(Yi) enhance it, currently we only assign the global config to module + # model_info: List[Tuple[str, Callable]] = [] + linear_lst = [] + for name, module in fp32_model.named_modules(): + if isinstance(module, torch.nn.Linear): + linear_lst.append(name) + _quant_config = quant_config if quant_config.global_config is None else quant_config.global_config + quant_config_mapping: Dict[str, BaseConfig] = {name: _quant_config for name in linear_lst} + return quant_config_mapping + + +@register_algo(name=RTN_WEIGHT_ONLY_QUANT) +def rtn_quantize_entry(model: torch.nn.Module, quant_config: RTNWeightQuantConfig) -> torch.nn.Module: + quant_config_mapping: Dict[str, RTNWeightQuantConfig] = _convert_quant_config_into_quant_config_mapping( + model, quant_config + ) + """The main entry to apply rtn quantization.""" + for op_name, quant_config in quant_config_mapping.items(): + original_module = fetch_module(model, op_name) + logger.info(f"Apply RTN on module: {op_name}, {original_module}") + rtn_module = _apply_rtn_on_single_module(original_module, quant_config) + set_module(model, op_name, rtn_module) + return model + + +@register_algo(name=GTPQ) +def gptq_quantize_entry( + model, + weight_config={}, + dataloader=None, + nsamples=128, + use_max_length=True, + pad_max_length=2048, + device=None, + layer_wise=False, + model_path=None, +): + """Run weight-only quantization with.""" + # TODO(Yi) aligned with rtn_quantize_entry + # TODO: unify weight_config keys, add docstring, and support default config + assert isinstance(model, torch.nn.Module), "only support torch module" + if layer_wise: + assert model_path is not None, "model_path should not be None when use layer_wise mode" + from neural_compressor.torch.algorithms.gptq import GPTQuantizer + + gptq_quantizer = GPTQuantizer( + model, weight_config, dataloader, nsamples, use_max_length, pad_max_length, device, layer_wise=layer_wise + ) + fp32_modified_model, gptq_config = gptq_quantizer.execute_quantization(model_path=model_path) + logger.info("GPTQ quantizing done.") + return fp32_modified_model, gptq_config From fd3d27f77516e3e831b7471c61cce2aeb62923bc Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 22 Nov 
2023 12:11:13 +0800 Subject: [PATCH 04/25] add gptq config Signed-off-by: yiliu30 --- neural_compressor/common/utility.py | 2 +- .../torch/algorithms/weight_only_algos.py | 4 +- .../torch/quantization/config.py | 101 +++++++++++++++++- 3 files changed, 103 insertions(+), 4 deletions(-) diff --git a/neural_compressor/common/utility.py b/neural_compressor/common/utility.py index 32d3adbd4e7..d4287f09632 100644 --- a/neural_compressor/common/utility.py +++ b/neural_compressor/common/utility.py @@ -26,5 +26,5 @@ BASE_CONFIG = "base_config" COMPOSABLE_CONFIG = "composable_config" RTN_WEIGHT_ONLY_QUANT = "rtn_weight_only_quant" -GTPQ = "gptq" +GPTQ = "gptq" DUMMY_CONFIG = "dummy_config" diff --git a/neural_compressor/torch/algorithms/weight_only_algos.py b/neural_compressor/torch/algorithms/weight_only_algos.py index be5acada1e1..ed877e04981 100644 --- a/neural_compressor/torch/algorithms/weight_only_algos.py +++ b/neural_compressor/torch/algorithms/weight_only_algos.py @@ -19,7 +19,7 @@ from neural_compressor.common.base_config import BaseConfig from neural_compressor.common.logger import Logger -from neural_compressor.common.utility import GTPQ, RTN_WEIGHT_ONLY_QUANT +from neural_compressor.common.utility import GPTQ, RTN_WEIGHT_ONLY_QUANT from neural_compressor.torch.algorithms.rtn import rtn_quantize as torch_rtn_quantize from neural_compressor.torch.quantization.config import RTNWeightQuantConfig from neural_compressor.torch.utils import fetch_module, register_algo, set_module @@ -77,7 +77,7 @@ def rtn_quantize_entry(model: torch.nn.Module, quant_config: RTNWeightQuantConfi return model -@register_algo(name=GTPQ) +@register_algo(name=GPTQ) def gptq_quantize_entry( model, weight_config={}, diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index 16de62fab36..0cd48de1139 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -23,7 +23,7 @@ import torch from neural_compressor.common.base_config import BaseConfig, register_config, registered_configs -from neural_compressor.common.utility import DUMMY_CONFIG, RTN_WEIGHT_ONLY_QUANT +from neural_compressor.common.utility import DUMMY_CONFIG, GPTQ, RTN_WEIGHT_ONLY_QUANT FRAMEWORK_NAME = "torch" @@ -47,6 +47,9 @@ class OperatorConfig(NamedTuple): str2operator = {"Linear": torch.nn.Linear, "linear": torch.nn.functional.linear, "Conv2d": torch.nn.Conv2d} +######################## RNT Config ############################### + + @register_config(framework_name=FRAMEWORK_NAME, algo_name=RTN_WEIGHT_ONLY_QUANT) class RTNWeightQuantConfig(BaseConfig): """Config class for round-to-nearest weight-only quantization.""" @@ -139,6 +142,102 @@ def get_default_rtn_config() -> RTNWeightQuantConfig: return RTNWeightQuantConfig() +######################## GPTQ Config ############################### +@register_config(framework_name=FRAMEWORK_NAME, algo_name=GPTQ) +class GPTQConfig(BaseConfig): + """Config class for GPTQ. + + GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers. 
+ https://arxiv.org/abs/2210.17323 + """ + + supported_configs: List[OperatorConfig] = [] + params_list = [ + "weight_bits", + "weight_group_size", + "weight_sym", + "act_dtype", + "group_dim", + "nsamples", + "percdamp", + "act_order", + "use_max_length", + "pad_max_length", + "enable_mse_search", + ] + name = GPTQ + + def __init__( + self, + weight_bits: int = 4, + weight_group_size: int = 32, + weight_sym: bool = True, + act_dtype: str = "fp32", + group_dim: int = 1, + nsamples: int = 128, + percdamp: float = 0.01, + act_order: bool = False, + use_max_length: bool = True, + pad_max_length: int = 2048, + enable_mse_search: bool = False, + device=None, + layer_wise: bool = False, + return_int: bool = False, + ): + """Init GPTQ config. + + Args: + """ + super().__init__() + self.weight_bits = weight_bits + self.weight_group_size = weight_group_size + self.weight_sym = weight_sym + self.act_dtype = act_dtype + self.enable_mse_search = enable_mse_search + self.group_dim = group_dim + self.nsamples = (nsamples,) + self.percdamp = (percdamp,) + self.act_order = (act_order,) + self.use_max_length = (use_max_length,) + self.pad_max_length = (pad_max_length,) + self.layer_wise = layer_wise + self.device = device + self.return_int = return_int + + def to_dict(self): + return super().to_dict(params_list=self.params_list, operator2str=operator2str) + + @classmethod + def from_dict(cls, config_dict): + return super(GPTQConfig, cls).from_dict(config_dict=config_dict, str2operator=str2operator) + + @classmethod + def register_supported_configs(cls) -> List[OperatorConfig]: + supported_configs = [] + # TODO(Yi) + linear_gptq_config = GPTQConfig() + operators = [torch.nn.Linear, torch.nn.functional.linear] + supported_configs.append( + OperatorConfig(config=linear_gptq_config, operators=operators, backend=Backend.DEFAULT) + ) + cls.supported_configs = supported_configs + + +# TODO(Yi) run `register_supported_configs` for all registered config. +GPTQConfig.register_supported_configs() + + +def get_default_gptq_config() -> GPTQConfig: + """Generate the default gptq config. + + Returns: + the default gptq config. 
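+
+    Example:
+        quant_config = get_default_gptq_config()  # 4-bit weights, group_size=32, weight_sym=True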
+ """ + return GPTQConfig() + + +######################## Dummy Config ############################### +# TODO (Yi) remove it after finishing the GPTQ config @register_config(framework_name=FRAMEWORK_NAME, algo_name=DUMMY_CONFIG) class DummyConfig(BaseConfig): """Config class for round-to-nearest weight-only quantization.""" From 5009c52fe6eaadb59d74f86282b4731115d9815d Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 22 Nov 2023 14:03:02 +0800 Subject: [PATCH 05/25] port gptq Signed-off-by: yiliu30 --- neural_compressor/common/base_config.py | 6 +- neural_compressor/torch/__init__.py | 4 +- .../torch/algorithms/__init__.py | 3 +- neural_compressor/torch/algorithms/gptq.py | 27 ++++ .../torch/algorithms/weight_only_algos.py | 128 ++++++++++++++---- .../torch/quantization/__init__.py | 2 + .../torch/quantization/config.py | 14 +- .../torch/quantization/quantize.py | 29 ++-- neural_compressor/torch/utils.py | 1 + test/3x/torch/test_config.py | 4 +- test/3x/torch/test_gptq.py | 84 ++++++++++++ 11 files changed, 251 insertions(+), 51 deletions(-) create mode 100644 test/3x/torch/test_gptq.py diff --git a/neural_compressor/common/base_config.py b/neural_compressor/common/base_config.py index 51a3f70c1dc..effcf9d606e 100644 --- a/neural_compressor/common/base_config.py +++ b/neural_compressor/common/base_config.py @@ -201,11 +201,11 @@ def to_config_mapping( global_config = config.global_config op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() for op_name, op_type in model_info: - config_mapping.setdefault(op_type, OrderedDict())[op_name] = global_config + config_mapping[(op_type, op_name)] = global_config if op_type in op_type_config_dict: - config_mapping[op_type][op_name] = op_name_config_dict[op_type] + config_mapping[(op_type, op_name)] = op_name_config_dict[op_type] if op_name in op_name_config_dict: - config_mapping[op_type][op_name] = op_name_config_dict[op_name] + config_mapping[(op_type, op_name)] = op_name_config_dict[op_name] return config_mapping @staticmethod diff --git a/neural_compressor/torch/__init__.py b/neural_compressor/torch/__init__.py index b8606e0b7f8..a0b414e2994 100644 --- a/neural_compressor/torch/__init__.py +++ b/neural_compressor/torch/__init__.py @@ -13,7 +13,7 @@ # limitations under the License. from neural_compressor.torch.utils import register_algo -from neural_compressor.torch.algorithms import rtn_quantize_entry +from neural_compressor.torch.algorithms import rtn_quantize_entry, gptq_quantize_entry from neural_compressor.torch.quantization import ( quantize, @@ -21,4 +21,6 @@ get_default_rtn_config, DummyConfig, get_default_dummy_config, + GPTQConfig, + get_default_gptq_config, ) diff --git a/neural_compressor/torch/algorithms/__init__.py b/neural_compressor/torch/algorithms/__init__.py index 94a7739ef89..ebb6e56ae35 100644 --- a/neural_compressor/torch/algorithms/__init__.py +++ b/neural_compressor/torch/algorithms/__init__.py @@ -13,4 +13,5 @@ # limitations under the License. 
-from neural_compressor.torch.algorithms.rtn_quantize import rtn_quantize_entry +from neural_compressor.torch.algorithms.weight_only_algos import rtn_quantize_entry +from neural_compressor.torch.algorithms.weight_only_algos import gptq_quantize_entry diff --git a/neural_compressor/torch/algorithms/gptq.py b/neural_compressor/torch/algorithms/gptq.py index 5c128417531..70ddf8bc256 100644 --- a/neural_compressor/torch/algorithms/gptq.py +++ b/neural_compressor/torch/algorithms/gptq.py @@ -974,3 +974,30 @@ def find_params(self, x, weight=False): def ready(self): return torch.all(self.scale != 0) + + +def apply_gptq_quantize( + model, + weight_config={}, + dataloader=None, + nsamples=128, + use_max_length=True, + pad_max_length=2048, + device=None, + layer_wise=False, + model_path=None, +): + from neural_compressor.torch.algorithms.gptq import GPTQuantizer + + """Run gptq.""" + # TODO: unify weight_config keys, add docstring, and support default config + assert isinstance(model, torch.nn.Module), "only support torch module" + if layer_wise: + assert model_path is not None, "model_path should not be None when use layer_wise mode" + + gptq_quantizer = GPTQuantizer( + model, weight_config, dataloader, nsamples, use_max_length, pad_max_length, device, layer_wise=layer_wise + ) + fp32_modified_model, gptq_config = gptq_quantizer.execute_quantization(model_path=model_path) + logger.info("GPTQ quantizing done.") + return fp32_modified_model, gptq_config diff --git a/neural_compressor/torch/algorithms/weight_only_algos.py b/neural_compressor/torch/algorithms/weight_only_algos.py index ed877e04981..1a8e5781da0 100644 --- a/neural_compressor/torch/algorithms/weight_only_algos.py +++ b/neural_compressor/torch/algorithms/weight_only_algos.py @@ -13,7 +13,8 @@ # limitations under the License. 
-from typing import Dict +import os +from typing import Dict, Tuple import torch @@ -21,13 +22,15 @@ from neural_compressor.common.logger import Logger from neural_compressor.common.utility import GPTQ, RTN_WEIGHT_ONLY_QUANT from neural_compressor.torch.algorithms.rtn import rtn_quantize as torch_rtn_quantize -from neural_compressor.torch.quantization.config import RTNWeightQuantConfig +from neural_compressor.torch.quantization.config import GPTQConfig, RTNWeightQuantConfig from neural_compressor.torch.utils import fetch_module, register_algo, set_module logger = Logger().get_logger() +###################### RTN Algo Entry ################################## def _apply_rtn_on_single_module(module: torch.nn.Module, quant_config: RTNWeightQuantConfig) -> torch.nn.Module: + # TODO (Yi) remove it enable_full_range = quant_config.enable_full_range enable_mse_search = quant_config.enable_mse_search group_dim = quant_config.group_dim @@ -64,12 +67,11 @@ def _convert_quant_config_into_quant_config_mapping( @register_algo(name=RTN_WEIGHT_ONLY_QUANT) -def rtn_quantize_entry(model: torch.nn.Module, quant_config: RTNWeightQuantConfig) -> torch.nn.Module: - quant_config_mapping: Dict[str, RTNWeightQuantConfig] = _convert_quant_config_into_quant_config_mapping( - model, quant_config - ) +def rtn_quantize_entry( + model: torch.nn.Module, configs_mapping: Dict[Tuple[str, callable], RTNWeightQuantConfig], *args, **kwargs +) -> torch.nn.Module: """The main entry to apply rtn quantization.""" - for op_name, quant_config in quant_config_mapping.items(): + for (op_type, op_name), quant_config in configs_mapping.items(): original_module = fetch_module(model, op_name) logger.info(f"Apply RTN on module: {op_name}, {original_module}") rtn_module = _apply_rtn_on_single_module(original_module, quant_config) @@ -77,29 +79,95 @@ def rtn_quantize_entry(model: torch.nn.Module, quant_config: RTNWeightQuantConfi return model +###################### GPTQ Algo Entry ################################## + + +def gptq_config_mapping(configs_mapping: Dict[Tuple[str, callable], GPTQConfig]): + # convert GPTQ_CONFIG to gptq_quantize's weight config + # convert tune_cfg to gptq_quantize's weight config + """please refer to weight_config which can be analyzed by user-define API function weight_only.gptq_quantize + keys of weight_config can not only be specific name, but can also be a re formula + weight_config = { + "layer_name_1": { + 'wbits': 4, + 'group_size': 128, + 'sym': False, + 'percdamp': 0.01, + 'actorder': True + }, + "layer_name_2": { + 'wbits': 4, + 'group_size': 128, + 'sym': False, + 'percdamp': 0.01, + 'actorder': True + } + ... + } + """ + # for layer_wise quant mode + model_path = None + layer_wise = False + # TODO (Yi) uncomment it when port layer-wise + # if recipe_cfgs.get("layer_wise_quant", False): + # layer_wise = True + # from .torch_utils.layer_wise_quant.utils import LWQ_WORKSPACE, _get_path, register_weight_hooks + + # os.makedirs(LWQ_WORKSPACE, exist_ok=True) + # # model_path = recipe_cfgs["layer_wise_quant_args"].get("model_path", None) + # model_path = model.path + # assert model_path, "model_path should not be None." 
+ # model_path = _get_path(model_path) + # lwq_handles = register_weight_hooks( + # model, model_path, device=self.device, clean_weight=True, saved_path=LWQ_WORKSPACE + # ) + + weight_config = {} + for (op_type, op_name), op_config in configs_mapping.items(): + if op_config.weight_dtype == "fp32": + continue + else: + weight_config[op_name] = { + "wbits": op_config.weight_bits, + "group_size": op_config.weight_group_size, + "sym": op_config.weight_sym, + "percdamp": op_config.percdamp, + "act_order": op_config.act_order, + "block_size": op_config.block_size, + } + nsamples = op_config.nsamples + use_max_length = op_config.use_max_length + pad_max_length = op_config.pad_max_length + device = op_config.device + + if use_max_length and op_config.pad_max_length == 2048: + logger.warning( + "You choose to use unified sequence length for calibration, \ + but you have not set length value. Default sequence length is 2048 and this might cause inference error!" + ) + + return weight_config, nsamples, use_max_length, pad_max_length, device + + @register_algo(name=GPTQ) def gptq_quantize_entry( - model, - weight_config={}, - dataloader=None, - nsamples=128, - use_max_length=True, - pad_max_length=2048, - device=None, - layer_wise=False, - model_path=None, -): - """Run weight-only quantization with.""" - # TODO(Yi) aligned with rtn_quantize_entry - # TODO: unify weight_config keys, add docstring, and support default config - assert isinstance(model, torch.nn.Module), "only support torch module" - if layer_wise: - assert model_path is not None, "model_path should not be None when use layer_wise mode" - from neural_compressor.torch.algorithms.gptq import GPTQuantizer - - gptq_quantizer = GPTQuantizer( - model, weight_config, dataloader, nsamples, use_max_length, pad_max_length, device, layer_wise=layer_wise + model: torch.nn.Module, configs_mapping: Dict[Tuple[str, callable], GPTQConfig], dataloader, *args, **kwargs +) -> torch.nn.Module: + logger.info("quantizing with the GPTQ algorithm") + weight_config, nsamples, use_max_length, pad_max_length, device = gptq_config_mapping(configs_mapping) + from neural_compressor.torch.algorithms.gptq import apply_gptq_quantize + + model, quantization_perm = apply_gptq_quantize( + model=model, + weight_config=weight_config, + dataloader=dataloader, + nsamples=nsamples, + use_max_length=use_max_length, + pad_max_length=pad_max_length, + device=device, + layer_wise=False, + model_path=None, ) - fp32_modified_model, gptq_config = gptq_quantizer.execute_quantization(model_path=model_path) - logger.info("GPTQ quantizing done.") - return fp32_modified_model, gptq_config + # Assign the gptq config as an attribute of model + model._gptq_quantization_perm = quantization_perm + return model diff --git a/neural_compressor/torch/quantization/__init__.py b/neural_compressor/torch/quantization/__init__.py index e159bf99bad..24235271dae 100644 --- a/neural_compressor/torch/quantization/__init__.py +++ b/neural_compressor/torch/quantization/__init__.py @@ -18,4 +18,6 @@ get_default_rtn_config, DummyConfig, get_default_dummy_config, + GPTQConfig, + get_default_gptq_config, ) diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index 0cd48de1139..d4949d0b0ea 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -169,9 +169,11 @@ class GPTQConfig(BaseConfig): def __init__( self, + weight_dtype: str = "int", weight_bits: int = 4, weight_group_size: int = 32, weight_sym: 
bool = True, + block_size: int = 128, act_dtype: str = "fp32", group_dim: int = 1, nsamples: int = 128, @@ -189,17 +191,19 @@ def __init__( Args: """ super().__init__() + self.weight_dtype = weight_dtype self.weight_bits = weight_bits self.weight_group_size = weight_group_size self.weight_sym = weight_sym self.act_dtype = act_dtype + self.block_size = block_size self.enable_mse_search = enable_mse_search self.group_dim = group_dim - self.nsamples = (nsamples,) - self.percdamp = (percdamp,) - self.act_order = (act_order,) - self.use_max_length = (use_max_length,) - self.pad_max_length = (pad_max_length,) + self.nsamples = nsamples + self.percdamp = percdamp + self.act_order = act_order + self.use_max_length = use_max_length + self.pad_max_length = pad_max_length self.layer_wise = layer_wise self.device = device self.return_int = return_int diff --git a/neural_compressor/torch/quantization/quantize.py b/neural_compressor/torch/quantization/quantize.py index e53023ac363..4d59cf7c8ce 100644 --- a/neural_compressor/torch/quantization/quantize.py +++ b/neural_compressor/torch/quantization/quantize.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Callable +from typing import Any, Callable, Dict, Tuple import torch @@ -20,13 +20,21 @@ from neural_compressor.common.logger import Logger from neural_compressor.common.utility import RTN_WEIGHT_ONLY_QUANT from neural_compressor.torch.quantization.config import parse_config_from_dict -from neural_compressor.torch.utils import algos_mapping +from neural_compressor.torch.utils import algos_mapping, get_model_info logger = Logger().get_logger() +def need_apply(configs_mapping: Dict[Tuple[str, callable], BaseConfig], algo_name): + return any(config.name == algo_name for config in configs_mapping.values()) + + def quantize( - model: torch.nn.Module, quant_config: BaseConfig, calib_func: Callable = None, calib_func_arg: Any = None + model: torch.nn.Module, + quant_config: BaseConfig, + calib_dataloader=None, + calib_func: Callable = None, + calib_func_arg: Any = None, ) -> torch.nn.Module: """The main entry to quantize model. 
@@ -49,9 +57,12 @@ def quantize( logger.info(f"Quantize model with config: \n {quant_config.to_json_string()} \n") # select quantization algo according to config # TODO (Yi) support combine more than one algo - if quant_config.name == RTN_WEIGHT_ONLY_QUANT: - quant_fn = algos_mapping[quant_config.name] - else: - raise NotImplementedError("Currently, only the rtn algorithm is being ported.") - qmodel = quant_fn(model, quant_config) - return qmodel + + model_info = get_model_info(model=model, white_module_list=[torch.nn.Linear]) + configs_mapping = quant_config.to_config_mapping(model_info=model_info) + logger.debug(configs_mapping) + for algo_name, algo_func in algos_mapping.items(): + if need_apply(configs_mapping, algo_name): + logger.info(f"Start to apply {algo_name} on the model.") + model = algo_func(model, configs_mapping, calib_dataloader, calib_func, calib_func_arg) + return model diff --git a/neural_compressor/torch/utils.py b/neural_compressor/torch/utils.py index 134bb14797c..6556982b8a4 100644 --- a/neural_compressor/torch/utils.py +++ b/neural_compressor/torch/utils.py @@ -102,4 +102,5 @@ def get_model_info(model: torch.nn.Module, white_module_list: List[Callable]) -> if pair not in filter_result_set: filter_result_set.add(pair) filter_result.append(pair) + logger.debug(f"Get model info: {filter_result}") return filter_result diff --git a/test/3x/torch/test_config.py b/test/3x/torch/test_config.py index e366873eaea..ba7ea51cba9 100644 --- a/test/3x/torch/test_config.py +++ b/test/3x/torch/test_config.py @@ -219,8 +219,8 @@ def test_config_mapping(self): logger.info(quant_config) configs_mapping = quant_config.to_config_mapping(model_info=model_info) logger.info(configs_mapping) - self.assertTrue(configs_mapping[torch.nn.Linear]["fc1"].weight_bits == 6) - self.assertTrue(configs_mapping[torch.nn.Linear]["fc2"].weight_bits == 4) + self.assertTrue(configs_mapping[(torch.nn.Linear, "fc1")].weight_bits == 6) + self.assertTrue(configs_mapping[(torch.nn.Linear, "fc2")].weight_bits == 4) if __name__ == "__main__": diff --git a/test/3x/torch/test_gptq.py b/test/3x/torch/test_gptq.py new file mode 100644 index 00000000000..5e20664ed99 --- /dev/null +++ b/test/3x/torch/test_gptq.py @@ -0,0 +1,84 @@ +import unittest + +import torch + +from neural_compressor.common.logger import Logger + +logger = Logger().get_logger() + + +def get_gpt_j(): + import transformers + + tiny_gptj = transformers.AutoModelForCausalLM.from_pretrained( + "hf-internal-testing/tiny-random-GPTJForCausalLM", + torchscript=True, + ) + return tiny_gptj + + +class GPTQLLMDataLoader: + def __init__(self): + self.batch_size = 1 + + def __iter__(self): + for i in range(10): + yield torch.ones([1, 512], dtype=torch.long) + + +class GPTQLLMDataLoaderList: + def __init__(self): + self.batch_size = 1 + + def __iter__(self): + for i in range(10): + yield (torch.ones([1, 512], dtype=torch.long), torch.ones([1, 512], dtype=torch.long)) + + +class GPTQLLMDataLoaderDict: + def __init__(self): + self.batch_size = 1 + + def __iter__(self): + for i in range(10): + yield { + "input_ids": torch.ones([1, 512], dtype=torch.long), + "attention_mask": torch.ones([1, 512], dtype=torch.long), + } + + +class TestGPTQ(unittest.TestCase): + @classmethod + def setUpClass(self): + pass + + @classmethod + def tearDownClass(self): + pass + + def setUp(self): + # print the test name + logger.info(f"Running TestGPTQ test: {self.id()}") + + def test_default_gptq(self): + # Ported from test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py + # 
TestPytorchWeightOnlyAdaptor.test_GPTQ_fixed_length_quant + from neural_compressor.torch import GPTQConfig, quantize + + # "gptq_args": {"percdamp": 0.01, "act_order": False, "use_max_length": True, "pad_max_length": 512}, + quant_config = GPTQConfig(weight_group_size=8, pad_max_length=512) + quant_config.set_local("lm_head", GPTQConfig(weight_dtype="fp32")) + logger.info(f"Test GPTQ with config {quant_config}") + dataloader = GPTQLLMDataLoader() + + # case 1: tensor + model_1 = get_gpt_j() + input = torch.ones([1, 512], dtype=torch.long) + out0 = model_1(input) + q_model = quantize(model=model_1, quant_config=quant_config, calib_dataloader=dataloader) + out1 = q_model(input) + self.assertTrue(torch.allclose(out1[0], out0[0], atol=1e-02)) + + +if __name__ == "__main__": + unittest.main() From cb1f48b115149c45e556f853a9991fbd90007cf4 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 22 Nov 2023 14:05:46 +0800 Subject: [PATCH 06/25] add args palceholder for double quant Signed-off-by: yiliu30 --- neural_compressor/torch/quantization/config.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index d4949d0b0ea..3dffbfdda32 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -185,6 +185,8 @@ def __init__( device=None, layer_wise: bool = False, return_int: bool = False, + double_quant_bits: int = 4, + double_quant_group_size: int = 16, ): """Init GPTQ config. @@ -207,6 +209,9 @@ def __init__( self.layer_wise = layer_wise self.device = device self.return_int = return_int + # placeholder for double quant + self.double_quant_bits: int = double_quant_bits + self.double_quant_group_size: int = double_quant_group_size def to_dict(self): return super().to_dict(params_list=self.params_list, operator2str=operator2str) From 4233e9d5727572cae0b999b352a2126ca4c3b122 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 22 Nov 2023 14:13:17 +0800 Subject: [PATCH 07/25] clean entry Signed-off-by: yiliu30 --- neural_compressor/torch/algorithms/gptq.py | 53 ++++++++ neural_compressor/torch/algorithms/rtn.py | 26 ++++ .../torch/algorithms/weight_only_algos.py | 115 +----------------- 3 files changed, 84 insertions(+), 110 deletions(-) diff --git a/neural_compressor/torch/algorithms/gptq.py b/neural_compressor/torch/algorithms/gptq.py index 70ddf8bc256..ab034c45eda 100644 --- a/neural_compressor/torch/algorithms/gptq.py +++ b/neural_compressor/torch/algorithms/gptq.py @@ -1001,3 +1001,56 @@ def apply_gptq_quantize( fp32_modified_model, gptq_config = gptq_quantizer.execute_quantization(model_path=model_path) logger.info("GPTQ quantizing done.") return fp32_modified_model, gptq_config + + +# TODO (Yi) remove it after unifying the algo config parser +from typing import Callable, Dict, Tuple + +from neural_compressor.torch.quantization.config import GPTQConfig + + +def gptq_config_mapping(configs_mapping: Dict[Tuple[str, Callable], GPTQConfig]): + # convert GPTQ_CONFIG to gptq_quantize's weight config + # convert tune_cfg to gptq_quantize's weight config + # for layer_wise quant mode + model_path = None + layer_wise = False + # TODO (Yi) uncomment it when port layer-wise + # if recipe_cfgs.get("layer_wise_quant", False): + # layer_wise = True + # from .torch_utils.layer_wise_quant.utils import LWQ_WORKSPACE, _get_path, register_weight_hooks + + # os.makedirs(LWQ_WORKSPACE, exist_ok=True) + # # model_path = 
recipe_cfgs["layer_wise_quant_args"].get("model_path", None) + # model_path = model.path + # assert model_path, "model_path should not be None." + # model_path = _get_path(model_path) + # lwq_handles = register_weight_hooks( + # model, model_path, device=self.device, clean_weight=True, saved_path=LWQ_WORKSPACE + # ) + + weight_config = {} + for (op_type, op_name), op_config in configs_mapping.items(): + if op_config.weight_dtype == "fp32": + continue + else: + weight_config[op_name] = { + "wbits": op_config.weight_bits, + "group_size": op_config.weight_group_size, + "sym": op_config.weight_sym, + "percdamp": op_config.percdamp, + "act_order": op_config.act_order, + "block_size": op_config.block_size, + } + nsamples = op_config.nsamples + use_max_length = op_config.use_max_length + pad_max_length = op_config.pad_max_length + device = op_config.device + + if use_max_length and op_config.pad_max_length == 2048: + logger.warning( + "You choose to use unified sequence length for calibration, \ + but you have not set length value. Default sequence length is 2048 and this might cause inference error!" + ) + + return weight_config, nsamples, use_max_length, pad_max_length, device diff --git a/neural_compressor/torch/algorithms/rtn.py b/neural_compressor/torch/algorithms/rtn.py index 1c0071d99e9..a6eb9c779af 100644 --- a/neural_compressor/torch/algorithms/rtn.py +++ b/neural_compressor/torch/algorithms/rtn.py @@ -658,3 +658,29 @@ def rtn_quantize( q_weight = q_weight.T if group_dim == 0 else q_weight m.weight.data.copy_(q_weight) return model + + +from neural_compressor.torch.quantization.config import RTNWeightQuantConfig + + +def apply_rtn_on_single_module(module: torch.nn.Module, quant_config: RTNWeightQuantConfig) -> torch.nn.Module: + # TODO (Yi) remove it + enable_full_range = quant_config.enable_full_range + enable_mse_search = quant_config.enable_mse_search + group_dim = quant_config.group_dim + dtype = quant_config.weight_dtype + num_bits = quant_config.weight_bits + scheme = "sym" if quant_config.weight_sym else "asym" + group_size = quant_config.weight_group_size + return_int = quant_config.return_int + return rtn_quantize( + module, + num_bits, + group_size, + scheme, + return_int=return_int, + data_type=dtype, + enable_full_range=enable_full_range, + enable_mse_search=enable_mse_search, + group_dim=group_dim, + ) diff --git a/neural_compressor/torch/algorithms/weight_only_algos.py b/neural_compressor/torch/algorithms/weight_only_algos.py index 1a8e5781da0..f5938a671c2 100644 --- a/neural_compressor/torch/algorithms/weight_only_algos.py +++ b/neural_compressor/torch/algorithms/weight_only_algos.py @@ -13,15 +13,12 @@ # limitations under the License. 
-import os from typing import Dict, Tuple import torch -from neural_compressor.common.base_config import BaseConfig from neural_compressor.common.logger import Logger from neural_compressor.common.utility import GPTQ, RTN_WEIGHT_ONLY_QUANT -from neural_compressor.torch.algorithms.rtn import rtn_quantize as torch_rtn_quantize from neural_compressor.torch.quantization.config import GPTQConfig, RTNWeightQuantConfig from neural_compressor.torch.utils import fetch_module, register_algo, set_module @@ -29,52 +26,17 @@ ###################### RTN Algo Entry ################################## -def _apply_rtn_on_single_module(module: torch.nn.Module, quant_config: RTNWeightQuantConfig) -> torch.nn.Module: - # TODO (Yi) remove it - enable_full_range = quant_config.enable_full_range - enable_mse_search = quant_config.enable_mse_search - group_dim = quant_config.group_dim - dtype = quant_config.weight_dtype - num_bits = quant_config.weight_bits - scheme = "sym" if quant_config.weight_sym else "asym" - group_size = quant_config.weight_group_size - return_int = quant_config.return_int - return torch_rtn_quantize( - module, - num_bits, - group_size, - scheme, - return_int=return_int, - data_type=dtype, - enable_full_range=enable_full_range, - enable_mse_search=enable_mse_search, - group_dim=group_dim, - ) - - -def _convert_quant_config_into_quant_config_mapping( - fp32_model: torch.nn.Module, quant_config: BaseConfig -) -> Dict[str, BaseConfig]: - # TODO(Yi) enhance it, currently we only assign the global config to module - # model_info: List[Tuple[str, Callable]] = [] - linear_lst = [] - for name, module in fp32_model.named_modules(): - if isinstance(module, torch.nn.Linear): - linear_lst.append(name) - _quant_config = quant_config if quant_config.global_config is None else quant_config.global_config - quant_config_mapping: Dict[str, BaseConfig] = {name: _quant_config for name in linear_lst} - return quant_config_mapping - - @register_algo(name=RTN_WEIGHT_ONLY_QUANT) def rtn_quantize_entry( model: torch.nn.Module, configs_mapping: Dict[Tuple[str, callable], RTNWeightQuantConfig], *args, **kwargs ) -> torch.nn.Module: """The main entry to apply rtn quantization.""" + from neural_compressor.torch.algorithms.rtn import apply_rtn_on_single_module + for (op_type, op_name), quant_config in configs_mapping.items(): original_module = fetch_module(model, op_name) logger.info(f"Apply RTN on module: {op_name}, {original_module}") - rtn_module = _apply_rtn_on_single_module(original_module, quant_config) + rtn_module = apply_rtn_on_single_module(original_module, quant_config) set_module(model, op_name, rtn_module) return model @@ -82,81 +44,14 @@ def rtn_quantize_entry( ###################### GPTQ Algo Entry ################################## -def gptq_config_mapping(configs_mapping: Dict[Tuple[str, callable], GPTQConfig]): - # convert GPTQ_CONFIG to gptq_quantize's weight config - # convert tune_cfg to gptq_quantize's weight config - """please refer to weight_config which can be analyzed by user-define API function weight_only.gptq_quantize - keys of weight_config can not only be specific name, but can also be a re formula - weight_config = { - "layer_name_1": { - 'wbits': 4, - 'group_size': 128, - 'sym': False, - 'percdamp': 0.01, - 'actorder': True - }, - "layer_name_2": { - 'wbits': 4, - 'group_size': 128, - 'sym': False, - 'percdamp': 0.01, - 'actorder': True - } - ... 
- } - """ - # for layer_wise quant mode - model_path = None - layer_wise = False - # TODO (Yi) uncomment it when port layer-wise - # if recipe_cfgs.get("layer_wise_quant", False): - # layer_wise = True - # from .torch_utils.layer_wise_quant.utils import LWQ_WORKSPACE, _get_path, register_weight_hooks - - # os.makedirs(LWQ_WORKSPACE, exist_ok=True) - # # model_path = recipe_cfgs["layer_wise_quant_args"].get("model_path", None) - # model_path = model.path - # assert model_path, "model_path should not be None." - # model_path = _get_path(model_path) - # lwq_handles = register_weight_hooks( - # model, model_path, device=self.device, clean_weight=True, saved_path=LWQ_WORKSPACE - # ) - - weight_config = {} - for (op_type, op_name), op_config in configs_mapping.items(): - if op_config.weight_dtype == "fp32": - continue - else: - weight_config[op_name] = { - "wbits": op_config.weight_bits, - "group_size": op_config.weight_group_size, - "sym": op_config.weight_sym, - "percdamp": op_config.percdamp, - "act_order": op_config.act_order, - "block_size": op_config.block_size, - } - nsamples = op_config.nsamples - use_max_length = op_config.use_max_length - pad_max_length = op_config.pad_max_length - device = op_config.device - - if use_max_length and op_config.pad_max_length == 2048: - logger.warning( - "You choose to use unified sequence length for calibration, \ - but you have not set length value. Default sequence length is 2048 and this might cause inference error!" - ) - - return weight_config, nsamples, use_max_length, pad_max_length, device - - @register_algo(name=GPTQ) def gptq_quantize_entry( model: torch.nn.Module, configs_mapping: Dict[Tuple[str, callable], GPTQConfig], dataloader, *args, **kwargs ) -> torch.nn.Module: logger.info("quantizing with the GPTQ algorithm") - weight_config, nsamples, use_max_length, pad_max_length, device = gptq_config_mapping(configs_mapping) - from neural_compressor.torch.algorithms.gptq import apply_gptq_quantize + from neural_compressor.torch.algorithms.gptq import apply_gptq_quantize, gptq_config_mapping + weight_config, nsamples, use_max_length, pad_max_length, device = gptq_config_mapping(configs_mapping) model, quantization_perm = apply_gptq_quantize( model=model, weight_config=weight_config, From 102b950bc7a7f6f9258232009d6fe6764d509211 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 22 Nov 2023 14:43:02 +0800 Subject: [PATCH 08/25] fixed the import issue of lwq Signed-off-by: yiliu30 --- neural_compressor/torch/algorithms/gptq.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/neural_compressor/torch/algorithms/gptq.py b/neural_compressor/torch/algorithms/gptq.py index ab034c45eda..54903cf1430 100644 --- a/neural_compressor/torch/algorithms/gptq.py +++ b/neural_compressor/torch/algorithms/gptq.py @@ -582,7 +582,8 @@ def execute_quantization(self, means=None, stds=None, model_path=None): full_layer_name = self.get_full_layer_name(layer_name, block_idx) weight_config_this_layer = self.get_layer_config(full_layer_name) if self.layer_wise: - from ..torch_utils.layer_wise_quant.utils import load_value + # TODO (Yi) + from neural_compressor.adaptor.torch_utils.layer_wise_quant.utils import load_value W = load_value(self.model, full_layer_name + ".weight", model_path) else: @@ -624,7 +625,8 @@ def tmp(_, inp, out): weight_config_this_layer = self.get_layer_config(self.get_full_layer_name(layer_name, block_idx)) logger.info(f"Quantizing layer {layer_name}") if self.layer_wise: - from ..torch_utils.layer_wise_quant.utils import 
load_value + # TODO (Yi) + from neural_compressor.adaptor.torch_utils.layer_wise_quant.utils import load_value full_layer_name = self.get_full_layer_name(layer_name, block_idx) W = load_value(self.model, full_layer_name + ".weight", model_path) @@ -638,7 +640,8 @@ def tmp(_, inp, out): act_order=weight_config_this_layer["act_order"], ) if self.layer_wise: - from ..torch_utils.layer_wise_quant.utils import ( + # TODO (Yi) + from neural_compressor.adaptor.torch_utils.layer_wise_quant.utils import ( LWQ_WORKSPACE, clean_module_weight, load_value, From 49ef348fdce36b7d844ff85803bbf9368b3e63eb Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 22 Nov 2023 18:17:02 +0800 Subject: [PATCH 09/25] clean entry Signed-off-by: yiliu30 --- neural_compressor/torch/algorithms/gptq.py | 48 ++++++++----------- .../torch/algorithms/weight_only_algos.py | 19 ++------ 2 files changed, 24 insertions(+), 43 deletions(-) diff --git a/neural_compressor/torch/algorithms/gptq.py b/neural_compressor/torch/algorithms/gptq.py index 54903cf1430..f6621a2f790 100644 --- a/neural_compressor/torch/algorithms/gptq.py +++ b/neural_compressor/torch/algorithms/gptq.py @@ -979,33 +979,6 @@ def ready(self): return torch.all(self.scale != 0) -def apply_gptq_quantize( - model, - weight_config={}, - dataloader=None, - nsamples=128, - use_max_length=True, - pad_max_length=2048, - device=None, - layer_wise=False, - model_path=None, -): - from neural_compressor.torch.algorithms.gptq import GPTQuantizer - - """Run gptq.""" - # TODO: unify weight_config keys, add docstring, and support default config - assert isinstance(model, torch.nn.Module), "only support torch module" - if layer_wise: - assert model_path is not None, "model_path should not be None when use layer_wise mode" - - gptq_quantizer = GPTQuantizer( - model, weight_config, dataloader, nsamples, use_max_length, pad_max_length, device, layer_wise=layer_wise - ) - fp32_modified_model, gptq_config = gptq_quantizer.execute_quantization(model_path=model_path) - logger.info("GPTQ quantizing done.") - return fp32_modified_model, gptq_config - - # TODO (Yi) remove it after unifying the algo config parser from typing import Callable, Dict, Tuple @@ -1057,3 +1030,24 @@ def gptq_config_mapping(configs_mapping: Dict[Tuple[str, Callable], GPTQConfig]) ) return weight_config, nsamples, use_max_length, pad_max_length, device + + +def apply_gptq_quantize(model, configs_mapping, dataloader, *args, **kwargs): + """Apply gptq.""" + # TODO: unify weight_config keys, add docstring, and support default config + weight_config, nsamples, use_max_length, pad_max_length, device = gptq_config_mapping(configs_mapping) + assert isinstance(model, torch.nn.Module), "only support torch module" + # TODO (Yi) disable layer-wise and model_path first + layer_wise = False + model_path = None + + # Below is the same as the 2.x + if layer_wise: + assert model_path is not None, "model_path should not be None when use layer_wise mode" + + gptq_quantizer = GPTQuantizer( + model, weight_config, dataloader, nsamples, use_max_length, pad_max_length, device, layer_wise=layer_wise + ) + fp32_modified_model, gptq_config = gptq_quantizer.execute_quantization(model_path=model_path) + logger.info("GPTQ quantization done.") + return fp32_modified_model, gptq_config diff --git a/neural_compressor/torch/algorithms/weight_only_algos.py b/neural_compressor/torch/algorithms/weight_only_algos.py index f5938a671c2..83a9e763138 100644 --- a/neural_compressor/torch/algorithms/weight_only_algos.py +++ 
b/neural_compressor/torch/algorithms/weight_only_algos.py @@ -42,27 +42,14 @@ def rtn_quantize_entry( ###################### GPTQ Algo Entry ################################## - - @register_algo(name=GPTQ) def gptq_quantize_entry( model: torch.nn.Module, configs_mapping: Dict[Tuple[str, callable], GPTQConfig], dataloader, *args, **kwargs ) -> torch.nn.Module: - logger.info("quantizing with the GPTQ algorithm") - from neural_compressor.torch.algorithms.gptq import apply_gptq_quantize, gptq_config_mapping + logger.info("Quantize model with the GPTQ algorithm.") + from neural_compressor.torch.algorithms.gptq import apply_gptq_quantize - weight_config, nsamples, use_max_length, pad_max_length, device = gptq_config_mapping(configs_mapping) - model, quantization_perm = apply_gptq_quantize( - model=model, - weight_config=weight_config, - dataloader=dataloader, - nsamples=nsamples, - use_max_length=use_max_length, - pad_max_length=pad_max_length, - device=device, - layer_wise=False, - model_path=None, - ) + model, quantization_perm = apply_gptq_quantize(model=model, configs_mapping=configs_mapping, dataloader=dataloader) # Assign the gptq config as an attribute of model model._gptq_quantization_perm = quantization_perm return model From 219e02b16ffb33dec897a9c95f8c3476abbcba1e Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 23 Nov 2023 13:48:04 +0800 Subject: [PATCH 10/25] complete gptq config Signed-off-by: yiliu30 --- neural_compressor/torch/algorithms/gptq.py | 2 -- neural_compressor/torch/quantization/config.py | 9 ++++++++- neural_compressor/torch/quantization/quantize.py | 6 +++--- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/neural_compressor/torch/algorithms/gptq.py b/neural_compressor/torch/algorithms/gptq.py index f6621a2f790..da3d9d0c811 100644 --- a/neural_compressor/torch/algorithms/gptq.py +++ b/neural_compressor/torch/algorithms/gptq.py @@ -989,8 +989,6 @@ def gptq_config_mapping(configs_mapping: Dict[Tuple[str, Callable], GPTQConfig]) # convert GPTQ_CONFIG to gptq_quantize's weight config # convert tune_cfg to gptq_quantize's weight config # for layer_wise quant mode - model_path = None - layer_wise = False # TODO (Yi) uncomment it when port layer-wise # if recipe_cfgs.get("layer_wise_quant", False): # layer_wise = True diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index 3dffbfdda32..ca136daeb50 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -151,11 +151,14 @@ class GPTQConfig(BaseConfig): https://arxiv.org/abs/2210.17323 """ + name = GPTQ supported_configs: List[OperatorConfig] = [] params_list = [ + "weight_dtype", "weight_bits", "weight_group_size", "weight_sym", + "block_size", "act_dtype", "group_dim", "nsamples", @@ -164,8 +167,12 @@ class GPTQConfig(BaseConfig): "use_max_length", "pad_max_length", "enable_mse_search", + "device", + "layer_wise", + "return_int", + "double_quant_bits", + "double_quant_group_size", ] - name = GPTQ def __init__( self, diff --git a/neural_compressor/torch/quantization/quantize.py b/neural_compressor/torch/quantization/quantize.py index 4d59cf7c8ce..75121e3ba3d 100644 --- a/neural_compressor/torch/quantization/quantize.py +++ b/neural_compressor/torch/quantization/quantize.py @@ -18,7 +18,6 @@ from neural_compressor.common.base_config import BaseConfig from neural_compressor.common.logger import Logger -from neural_compressor.common.utility import RTN_WEIGHT_ONLY_QUANT from 
neural_compressor.torch.quantization.config import parse_config_from_dict from neural_compressor.torch.utils import algos_mapping, get_model_info @@ -41,22 +40,23 @@ def quantize( Args: model: a float model to be quantized. quant_config: a quantization configuration. + calib_dataloader: a calibration dataloader for calibrating the model. Defaults to None. calib_func: a calibration function for calibrating the model. Defaults to None. calib_func_arg: positional arguments for `calib_func`. Defaults to None. Returns: The quantized model. """ + # TODO (Yi) support combine more than one algo if isinstance(quant_config, dict): quant_config = parse_config_from_dict(quant_config) - logger.info("Parsed dict to construct the quantization config.") + logger.info(f"Parsed a config dict to construct the quantization config: {quant_config}.") else: assert isinstance( quant_config, BaseConfig ), "Please pass a dict or config instance as the quantization configuration." logger.info(f"Quantize model with config: \n {quant_config.to_json_string()} \n") # select quantization algo according to config - # TODO (Yi) support combine more than one algo model_info = get_model_info(model=model, white_module_list=[torch.nn.Linear]) configs_mapping = quant_config.to_config_mapping(model_info=model_info) From 8a7b571c382060f89af276a898e18167ed22fa94 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Fri, 24 Nov 2023 12:52:09 +0800 Subject: [PATCH 11/25] add UTs Signed-off-by: yiliu30 --- neural_compressor/common/base_config.py | 28 +++++++++++++------ .../torch/quantization/config.py | 9 ------ .../torch/quantization/quantize.py | 6 ++-- test/3x/torch/test_config.py | 12 +++++++- 4 files changed, 33 insertions(+), 22 deletions(-) diff --git a/neural_compressor/common/base_config.py b/neural_compressor/common/base_config.py index effcf9d606e..c94de80f215 100644 --- a/neural_compressor/common/base_config.py +++ b/neural_compressor/common/base_config.py @@ -118,12 +118,16 @@ def from_dict(cls, config_dict, str2operator=None): Returns: The constructed config. """ - config = cls(**config_dict.get(GLOBAL, {})) - operator_config = config_dict.get(LOCAL, {}) - if operator_config: - for op_name, op_config in operator_config.items(): - config.set_local(op_name, cls(**op_config)) - return config + if GLOBAL not in config_dict and LOCAL not in config_dict: + config = cls(**config_dict) + return config + else: + config = cls(**config_dict.get(GLOBAL, {})) + operator_config = config_dict.get(LOCAL, {}) + if operator_config: + for op_name, op_config in operator_config.items(): + config.set_local(op_name, cls(**op_config)) + return config @classmethod def to_diff_dict(cls, instance) -> Dict[str, Any]: @@ -234,9 +238,15 @@ def to_dict(self, params_list=[], operator2str=None): return result @classmethod - def from_dict(cls, config_dict, str2operator=None): - # TODO(Yi) - pass + def from_dict(cls, config_dict: OrderedDict[str, Dict], config_registry: Dict[str, BaseConfig]): + assert len(config_dict) >= 1, "The config dict must include at least one configuration." 
+ num_configs = len(config_dict) + name, value = next(iter(config_dict.items())) + config = config_registry[name].from_dict(value) + for _ in range(num_configs - 1): + name, value = next(iter(config_dict.items())) + config += config_registry[name].from_dict(value) + return config def to_json_string(self, use_diff: bool = False) -> str: return json.dumps(self.to_dict(), indent=2) + "\n" diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index ca136daeb50..bb426e62365 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -316,12 +316,3 @@ def get_default_dummy_config() -> DummyConfig: def get_all_registered_configs() -> Dict[str, BaseConfig]: return registered_configs.get(FRAMEWORK_NAME, {}) - - -def parse_config_from_dict(config_dict: Dict) -> BaseConfig: - torch_registered_configs = get_all_registered_configs() - for key, val in config_dict.items(): - if key in torch_registered_configs: - config = torch_registered_configs[key].from_dict(val) - return config - # TODO(Yi) parse multiple configs after support configs add diff --git a/neural_compressor/torch/quantization/quantize.py b/neural_compressor/torch/quantization/quantize.py index 75121e3ba3d..09dda320812 100644 --- a/neural_compressor/torch/quantization/quantize.py +++ b/neural_compressor/torch/quantization/quantize.py @@ -16,9 +16,9 @@ import torch -from neural_compressor.common.base_config import BaseConfig +from neural_compressor.common.base_config import BaseConfig, ComposableConfig, registered_configs from neural_compressor.common.logger import Logger -from neural_compressor.torch.quantization.config import parse_config_from_dict +from neural_compressor.torch.quantization.config import FRAMEWORK_NAME from neural_compressor.torch.utils import algos_mapping, get_model_info logger = Logger().get_logger() @@ -49,7 +49,7 @@ def quantize( """ # TODO (Yi) support combine more than one algo if isinstance(quant_config, dict): - quant_config = parse_config_from_dict(quant_config) + quant_config = ComposableConfig.from_dict(quant_config, config_registry=registered_configs[FRAMEWORK_NAME]) logger.info(f"Parsed a config dict to construct the quantization config: {quant_config}.") else: assert isinstance( diff --git a/test/3x/torch/test_config.py b/test/3x/torch/test_config.py index ba7ea51cba9..dd210f11bfa 100644 --- a/test/3x/torch/test_config.py +++ b/test/3x/torch/test_config.py @@ -119,7 +119,7 @@ def test_config_from_dict(self): }, } } - config = RTNWeightQuantConfig.from_dict(quant_config) + config = RTNWeightQuantConfig.from_dict(quant_config["rtn_weight_only_quant"]) self.assertIsNotNone(config.local_config) def test_config_to_dict(self): @@ -222,6 +222,16 @@ def test_config_mapping(self): self.assertTrue(configs_mapping[(torch.nn.Linear, "fc1")].weight_bits == 6) self.assertTrue(configs_mapping[(torch.nn.Linear, "fc2")].weight_bits == 4) + def test_gptq_config(self): + from neural_compressor.torch.quantization import GPTQConfig + + gptq_config1 = GPTQConfig(weight_bits=8, pad_max_length=512) + quant_config_dict = { + "gptq": {"weight_bits": 8, "pad_max_length": 512}, + } + gptq_config2 = GPTQConfig.from_dict(quant_config_dict["gptq"]) + self.assertEqual(gptq_config1.to_dict(), gptq_config2.to_dict()) + if __name__ == "__main__": unittest.main() From 88e709afd9aa3d79dc250157b01dbbacfca0b5ac Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Fri, 24 Nov 2023 13:21:06 +0800 Subject: [PATCH 12/25] add more UTs for GPTQ 
Signed-off-by: yiliu30 --- test/3x/torch/test_gptq.py | 57 ++++++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 14 deletions(-) diff --git a/test/3x/torch/test_gptq.py b/test/3x/torch/test_gptq.py index 5e20664ed99..8200a3a3d54 100644 --- a/test/3x/torch/test_gptq.py +++ b/test/3x/torch/test_gptq.py @@ -18,32 +18,27 @@ def get_gpt_j(): class GPTQLLMDataLoader: - def __init__(self): + def __init__(self, length=512): self.batch_size = 1 + self.length = length def __iter__(self): for i in range(10): - yield torch.ones([1, 512], dtype=torch.long) + yield torch.ones([1, self.length], dtype=torch.long) -class GPTQLLMDataLoaderList: - def __init__(self): - self.batch_size = 1 - +class GPTQLLMDataLoaderList(GPTQLLMDataLoader): def __iter__(self): for i in range(10): - yield (torch.ones([1, 512], dtype=torch.long), torch.ones([1, 512], dtype=torch.long)) + yield (torch.ones([1, self.length], dtype=torch.long), torch.ones([1, self.length], dtype=torch.long)) -class GPTQLLMDataLoaderDict: - def __init__(self): - self.batch_size = 1 - +class GPTQLLMDataLoaderDict(GPTQLLMDataLoader): def __iter__(self): for i in range(10): yield { - "input_ids": torch.ones([1, 512], dtype=torch.long), - "attention_mask": torch.ones([1, 512], dtype=torch.long), + "input_ids": torch.ones([1, self.length], dtype=torch.long), + "attention_mask": torch.ones([1, self.length], dtype=torch.long), } @@ -60,7 +55,7 @@ def setUp(self): # print the test name logger.info(f"Running TestGPTQ test: {self.id()}") - def test_default_gptq(self): + def test_gptq(self): # Ported from test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py # TestPytorchWeightOnlyAdaptor.test_GPTQ_fixed_length_quant from neural_compressor.torch import GPTQConfig, quantize @@ -79,6 +74,40 @@ def test_default_gptq(self): out1 = q_model(input) self.assertTrue(torch.allclose(out1[0], out0[0], atol=1e-02)) + def _apply_gptq(self, input, model, quant_config, dataloader): + logger.info(f"Test GPTQ with config {quant_config}") + from neural_compressor.torch import quantize + + out0 = model(input) + q_model = quantize(model=model, quant_config=quant_config, calib_dataloader=dataloader) + out1 = q_model(input) + self.assertTrue(torch.allclose(out1[0], out0[0], atol=1e-02)) + + def test_more_gptq(self): + import random + from itertools import product + + from neural_compressor.torch import GPTQConfig + + # some tests were skipped to accelerate the CI + input = torch.ones([1, 512], dtype=torch.long) + # dataloader + dataloader_collections = [GPTQLLMDataLoader, GPTQLLMDataLoaderList, GPTQLLMDataLoaderDict] + gptq_options = { + "weight_sym": [False, True], + "weight_group_size": [8], + "use_max_length": [False, True], + "pad_max_length": [512], + } + for dataloader in dataloader_collections: + for value in product(*gptq_options.values()): + d = dict(zip(gptq_options.keys(), value)) + quant_config = GPTQConfig(**d) + length = 512 if quant_config.use_max_length else random.randint(1, 1024) + self._apply_gptq( + model=get_gpt_j(), input=input, quant_config=quant_config, dataloader=dataloader(length) + ) + if __name__ == "__main__": unittest.main() From a5c6427a20b119cf968a445d2376ec07d641d08c Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Fri, 24 Nov 2023 13:36:55 +0800 Subject: [PATCH 13/25] remove attrs for double quant Signed-off-by: yiliu30 --- neural_compressor/torch/quantization/config.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index 
bb426e62365..7c1fd990658 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -170,8 +170,6 @@ class GPTQConfig(BaseConfig): "device", "layer_wise", "return_int", - "double_quant_bits", - "double_quant_group_size", ] def __init__( @@ -192,8 +190,6 @@ def __init__( device=None, layer_wise: bool = False, return_int: bool = False, - double_quant_bits: int = 4, - double_quant_group_size: int = 16, ): """Init GPTQ config. @@ -216,9 +212,6 @@ def __init__( self.layer_wise = layer_wise self.device = device self.return_int = return_int - # placeholder for double quant - self.double_quant_bits: int = double_quant_bits - self.double_quant_group_size: int = double_quant_group_size def to_dict(self): return super().to_dict(params_list=self.params_list, operator2str=operator2str) From af82d642ecdb11ce4f19e1b78b954dcea3dcb117 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Fri, 24 Nov 2023 13:55:18 +0800 Subject: [PATCH 14/25] clean code Signed-off-by: yiliu30 --- neural_compressor/torch/quantization/quantize.py | 5 ++--- neural_compressor/torch/utils.py | 3 +++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/neural_compressor/torch/quantization/quantize.py b/neural_compressor/torch/quantization/quantize.py index 09dda320812..7ca4f4a47dc 100644 --- a/neural_compressor/torch/quantization/quantize.py +++ b/neural_compressor/torch/quantization/quantize.py @@ -19,7 +19,7 @@ from neural_compressor.common.base_config import BaseConfig, ComposableConfig, registered_configs from neural_compressor.common.logger import Logger from neural_compressor.torch.quantization.config import FRAMEWORK_NAME -from neural_compressor.torch.utils import algos_mapping, get_model_info +from neural_compressor.torch.utils import WHITE_MODULE_LIST, algos_mapping, get_model_info logger = Logger().get_logger() @@ -47,7 +47,6 @@ def quantize( Returns: The quantized model. """ - # TODO (Yi) support combine more than one algo if isinstance(quant_config, dict): quant_config = ComposableConfig.from_dict(quant_config, config_registry=registered_configs[FRAMEWORK_NAME]) logger.info(f"Parsed a config dict to construct the quantization config: {quant_config}.") @@ -58,7 +57,7 @@ def quantize( logger.info(f"Quantize model with config: \n {quant_config.to_json_string()} \n") # select quantization algo according to config - model_info = get_model_info(model=model, white_module_list=[torch.nn.Linear]) + model_info = get_model_info(model=model, white_module_list=WHITE_MODULE_LIST) configs_mapping = quant_config.to_config_mapping(model_info=model_info) logger.debug(configs_mapping) for algo_name, algo_func in algos_mapping.items(): diff --git a/neural_compressor/torch/utils.py b/neural_compressor/torch/utils.py index 6556982b8a4..289a488ef86 100644 --- a/neural_compressor/torch/utils.py +++ b/neural_compressor/torch/utils.py @@ -24,6 +24,9 @@ import torch +# All constants for torch +WHITE_MODULE_LIST = [torch.nn.Linear, torch.nn.Conv1d, torch.nn.Conv2d, torch.nn.Conv3d] + def register_algo(name): """Decorator function to register algorithms in the algos_mapping dictionary. 
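For reference, the entry point wired up above (config -> model_info -> configs_mapping -> algorithm dispatch) can be exercised end to end roughly as below. This is a minimal sketch that mirrors test/3x/torch/test_gptq.py from this series; it assumes the transformers package is installed and uses a tiny random GPT-J checkpoint purely for illustration.

    import torch
    import transformers
    from neural_compressor.torch import GPTQConfig, quantize

    class ToyLLMDataLoader:
        """Calibration data: ten dummy 512-token samples (same shape as GPTQLLMDataLoader in the test)."""
        batch_size = 1

        def __iter__(self):
            for _ in range(10):
                yield torch.ones([1, 512], dtype=torch.long)

    model = transformers.AutoModelForCausalLM.from_pretrained(
        "hf-internal-testing/tiny-random-GPTJForCausalLM"
    )
    quant_config = GPTQConfig(weight_group_size=8, pad_max_length=512)
    quant_config.set_local("lm_head", GPTQConfig(weight_dtype="fp32"))  # keep lm_head in fp32
    q_model = quantize(model=model, quant_config=quant_config, calib_dataloader=ToyLLMDataLoader())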
From 9980d88f28011407594c70660c35ea7e39d25609 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Fri, 24 Nov 2023 14:59:44 +0800 Subject: [PATCH 15/25] add more UTs Signed-off-by: yiliu30 --- neural_compressor/adaptor/torch_utils/gptq.py | 77 ++++--------------- neural_compressor/torch/algorithms/gptq.py | 1 + test/3x/torch/test_gptq.py | 17 ++++ 3 files changed, 31 insertions(+), 64 deletions(-) diff --git a/neural_compressor/adaptor/torch_utils/gptq.py b/neural_compressor/adaptor/torch_utils/gptq.py index 1a33addb364..6c7f7bf68f8 100644 --- a/neural_compressor/adaptor/torch_utils/gptq.py +++ b/neural_compressor/adaptor/torch_utils/gptq.py @@ -396,29 +396,15 @@ def get_full_layer_name(self, sub_layer_name, block_idx): def check_layer_config(self): """Copy arguments from weight_config to built-in attributes.""" - if "wbits" in self.weight_config: - tmp_weight_config = {} - for name, module in self.model.named_modules(): - tmp_weight_config[name] = {} - tmp_weight_config[name]["wbits"] = self.weight_config.get("wbits", self.wbits_default) - tmp_weight_config[name]["group_size"] = self.weight_config.get("group_size", self.group_size_default) - tmp_weight_config[name]["block_size"] = self.weight_config.get("block_size", self.group_size_default) - tmp_weight_config[name]["percdamp"] = self.weight_config.get("pecdamp", self.percdamp_default) - tmp_weight_config[name]["sym"] = self.weight_config.get("sym", self.sym_default) - tmp_weight_config[name]["act_order"] = self.weight_config.get("act_order", self.act_order_default) - tmp_weight_config[name]["perchannel"] = self.weight_config.get("perchannel", self.perchannel_default) - tmp_weight_config[name]["mse"] = self.weight_config.get("mse", self.mse_default) - self.weight_config = tmp_weight_config - else: - for layer_name, config in self.weight_config.items(): - self.weight_config[layer_name]["wbits"] = config.get("wbits", self.wbits_default) - self.weight_config[layer_name]["group_size"] = config.get("group_size", self.group_size_default) - self.weight_config[layer_name]["block_size"] = config.get("block_size", self.group_size_default) - self.weight_config[layer_name]["percdamp"] = config.get("pecdamp", self.percdamp_default) - self.weight_config[layer_name]["sym"] = config.get("sym", self.sym_default) - self.weight_config[layer_name]["act_order"] = config.get("act_order", self.act_order_default) - self.weight_config[layer_name]["perchannel"] = config.get("perchannel", self.perchannel_default) - self.weight_config[layer_name]["mse"] = config.get("mse", self.mse_default) + for layer_name, config in self.weight_config.items(): + self.weight_config[layer_name]["wbits"] = config.get("wbits", self.wbits_default) + self.weight_config[layer_name]["group_size"] = config.get("group_size", self.group_size_default) + self.weight_config[layer_name]["block_size"] = config.get("block_size", self.group_size_default) + self.weight_config[layer_name]["percdamp"] = config.get("pecdamp", self.percdamp_default) + self.weight_config[layer_name]["sym"] = config.get("sym", self.sym_default) + self.weight_config[layer_name]["act_order"] = config.get("act_order", self.act_order_default) + self.weight_config[layer_name]["perchannel"] = config.get("perchannel", self.perchannel_default) + self.weight_config[layer_name]["mse"] = config.get("mse", self.mse_default) def get_layer_config(self, layer_name): """Obtain config for one layer, since GPTQ supports layer-wise config.""" @@ -576,12 +562,7 @@ def execute_quantization(self, means=None, stds=None, model_path=None): # ) 
full_layer_name = self.get_full_layer_name(layer_name, block_idx) weight_config_this_layer = self.get_layer_config(full_layer_name) - if self.layer_wise: - from ..torch_utils.layer_wise_quant.utils import load_value - - W = load_value(self.model, full_layer_name + ".weight", model_path) - else: - W = sub_layers[layer_name].weight.data.clone() + W = sub_layers[layer_name].weight.data.clone() gptq_for_this_block[layer_name] = GPTQ(sub_layers[layer_name], W, self.device) # gptq_for_this_block[layer_name].quantizer = Quantizer() @@ -618,13 +599,7 @@ def tmp(_, inp, out): # ) weight_config_this_layer = self.get_layer_config(self.get_full_layer_name(layer_name, block_idx)) logger.info(f"Quantizing layer {layer_name}") - if self.layer_wise: - from ..torch_utils.layer_wise_quant.utils import load_value - - full_layer_name = self.get_full_layer_name(layer_name, block_idx) - W = load_value(self.model, full_layer_name + ".weight", model_path) - else: - W = sub_layers[layer_name].weight.data.clone() + W = sub_layers[layer_name].weight.data.clone() scale, zp, Q = gptq_for_this_block[layer_name].fasterquant( W, blocksize=weight_config_this_layer["block_size"], @@ -632,30 +607,7 @@ def tmp(_, inp, out): groupsize=weight_config_this_layer["group_size"], act_order=weight_config_this_layer["act_order"], ) - if self.layer_wise: - from ..torch_utils.layer_wise_quant.utils import ( - LWQ_WORKSPACE, - clean_module_weight, - load_value, - set_module_tensor_to_device, - ) - - sub_layer = sub_layers[layer_name] - full_layer_name = self.get_full_layer_name(layer_name, block_idx) - for n, p in sub_layer.named_parameters(): - param_name = full_layer_name + "." + n - if n == "weight": - set_module_tensor_to_device(self.model, param_name, self.device, Q) - else: - value = load_value(self.model, param_name, model_path) - set_module_tensor_to_device(self.model, param_name, self.device, value) - # sub_layer.weight.data = Q - torch.save(sub_layer.state_dict(), LWQ_WORKSPACE + f"/{full_layer_name}.pt") - clean_module_weight(sub_layer) - del Q - gc.collect() - else: - sub_layers[layer_name].weight.data = Q + sub_layers[layer_name].weight.data = Q gptq_config[self.get_full_layer_name(layer_name, block_idx)] = {"scale": scale} if not weight_config_this_layer["sym"]: gptq_config[self.get_full_layer_name(layer_name, block_idx)]["zero"] = zp @@ -675,10 +627,7 @@ def tmp(_, inp, out): out = self.track_hidden_states(out) outs.append(out) self.cache_key_arguments["i"] = idx - if self.layer_wise: - self.gptq_related_blocks["transformers"][block_idx] = transformer_block - else: - self.gptq_related_blocks["transformers"][block_idx] = transformer_block.cpu() + self.gptq_related_blocks["transformers"][block_idx] = transformer_block.cpu() del gptq_for_this_block torch.cuda.empty_cache() # iteratively replace the input with output, thus layerwise quantization can continue. 
diff --git a/neural_compressor/torch/algorithms/gptq.py b/neural_compressor/torch/algorithms/gptq.py index da3d9d0c811..82bf0913791 100644 --- a/neural_compressor/torch/algorithms/gptq.py +++ b/neural_compressor/torch/algorithms/gptq.py @@ -1015,6 +1015,7 @@ def gptq_config_mapping(configs_mapping: Dict[Tuple[str, Callable], GPTQConfig]) "percdamp": op_config.percdamp, "act_order": op_config.act_order, "block_size": op_config.block_size, + "mse": op_config.enable_mse_search, } nsamples = op_config.nsamples use_max_length = op_config.use_max_length diff --git a/test/3x/torch/test_gptq.py b/test/3x/torch/test_gptq.py index 8200a3a3d54..8a1a5e76d13 100644 --- a/test/3x/torch/test_gptq.py +++ b/test/3x/torch/test_gptq.py @@ -108,6 +108,23 @@ def test_more_gptq(self): model=get_gpt_j(), input=input, quant_config=quant_config, dataloader=dataloader(length) ) + def test_gptq_advance(self): + # Ported from test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py + # TestPytorchWeightOnlyAdaptor.test_GPTQ_fixed_length_quant + from neural_compressor.torch import GPTQConfig, quantize + + # "gptq_args": {"percdamp": 0.01, "act_order": False, "use_max_length": True, "pad_max_length": 512}, + quant_config = GPTQConfig(weight_group_size=8, act_order=True, enable_mse_search=True, pad_max_length=512) + quant_config.set_local("lm_head", GPTQConfig(weight_dtype="fp32")) + logger.info(f"Test GPTQ with config {quant_config}") + dataloader = GPTQLLMDataLoader() + model_1 = get_gpt_j() + input = torch.ones([1, 512], dtype=torch.long) + out0 = model_1(input) + q_model = quantize(model=model_1, quant_config=quant_config, calib_dataloader=dataloader) + out1 = q_model(input) + self.assertTrue(torch.allclose(out1[0], out0[0], atol=1e-02)) + if __name__ == "__main__": unittest.main() From 5e0910ae1b5db86d1e9090bd395fdae198901bd3 Mon Sep 17 00:00:00 2001 From: chensuyue Date: Fri, 24 Nov 2023 15:30:31 +0800 Subject: [PATCH 16/25] print itrex commit Signed-off-by: chensuyue --- .azure-pipelines/scripts/ut/run_itrex.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.azure-pipelines/scripts/ut/run_itrex.sh b/.azure-pipelines/scripts/ut/run_itrex.sh index b3380bae308..6d9edbf3af3 100644 --- a/.azure-pipelines/scripts/ut/run_itrex.sh +++ b/.azure-pipelines/scripts/ut/run_itrex.sh @@ -6,6 +6,7 @@ echo "run itrex ut..." 
# prepare itrex git clone https://github.com/intel/intel-extension-for-transformers.git /intel-extension-for-transformers +cd /intel-extension-for-transformers && git rev-parse --short HEAD bash /intel-extension-for-transformers/.github/workflows/script/prepare_env.sh bash /intel-extension-for-transformers/.github/workflows/script/install_binary.sh From aa4e8d81d25adca4c9a8f1ddcfeb035440f46528 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Fri, 24 Nov 2023 15:25:57 +0800 Subject: [PATCH 17/25] revert change Signed-off-by: yiliu30 --- neural_compressor/adaptor/torch_utils/gptq.py | 77 +++++++++++++++---- 1 file changed, 64 insertions(+), 13 deletions(-) diff --git a/neural_compressor/adaptor/torch_utils/gptq.py b/neural_compressor/adaptor/torch_utils/gptq.py index 6c7f7bf68f8..1a33addb364 100644 --- a/neural_compressor/adaptor/torch_utils/gptq.py +++ b/neural_compressor/adaptor/torch_utils/gptq.py @@ -396,15 +396,29 @@ def get_full_layer_name(self, sub_layer_name, block_idx): def check_layer_config(self): """Copy arguments from weight_config to built-in attributes.""" - for layer_name, config in self.weight_config.items(): - self.weight_config[layer_name]["wbits"] = config.get("wbits", self.wbits_default) - self.weight_config[layer_name]["group_size"] = config.get("group_size", self.group_size_default) - self.weight_config[layer_name]["block_size"] = config.get("block_size", self.group_size_default) - self.weight_config[layer_name]["percdamp"] = config.get("pecdamp", self.percdamp_default) - self.weight_config[layer_name]["sym"] = config.get("sym", self.sym_default) - self.weight_config[layer_name]["act_order"] = config.get("act_order", self.act_order_default) - self.weight_config[layer_name]["perchannel"] = config.get("perchannel", self.perchannel_default) - self.weight_config[layer_name]["mse"] = config.get("mse", self.mse_default) + if "wbits" in self.weight_config: + tmp_weight_config = {} + for name, module in self.model.named_modules(): + tmp_weight_config[name] = {} + tmp_weight_config[name]["wbits"] = self.weight_config.get("wbits", self.wbits_default) + tmp_weight_config[name]["group_size"] = self.weight_config.get("group_size", self.group_size_default) + tmp_weight_config[name]["block_size"] = self.weight_config.get("block_size", self.group_size_default) + tmp_weight_config[name]["percdamp"] = self.weight_config.get("pecdamp", self.percdamp_default) + tmp_weight_config[name]["sym"] = self.weight_config.get("sym", self.sym_default) + tmp_weight_config[name]["act_order"] = self.weight_config.get("act_order", self.act_order_default) + tmp_weight_config[name]["perchannel"] = self.weight_config.get("perchannel", self.perchannel_default) + tmp_weight_config[name]["mse"] = self.weight_config.get("mse", self.mse_default) + self.weight_config = tmp_weight_config + else: + for layer_name, config in self.weight_config.items(): + self.weight_config[layer_name]["wbits"] = config.get("wbits", self.wbits_default) + self.weight_config[layer_name]["group_size"] = config.get("group_size", self.group_size_default) + self.weight_config[layer_name]["block_size"] = config.get("block_size", self.group_size_default) + self.weight_config[layer_name]["percdamp"] = config.get("pecdamp", self.percdamp_default) + self.weight_config[layer_name]["sym"] = config.get("sym", self.sym_default) + self.weight_config[layer_name]["act_order"] = config.get("act_order", self.act_order_default) + self.weight_config[layer_name]["perchannel"] = config.get("perchannel", self.perchannel_default) + 
self.weight_config[layer_name]["mse"] = config.get("mse", self.mse_default) def get_layer_config(self, layer_name): """Obtain config for one layer, since GPTQ supports layer-wise config.""" @@ -562,7 +576,12 @@ def execute_quantization(self, means=None, stds=None, model_path=None): # ) full_layer_name = self.get_full_layer_name(layer_name, block_idx) weight_config_this_layer = self.get_layer_config(full_layer_name) - W = sub_layers[layer_name].weight.data.clone() + if self.layer_wise: + from ..torch_utils.layer_wise_quant.utils import load_value + + W = load_value(self.model, full_layer_name + ".weight", model_path) + else: + W = sub_layers[layer_name].weight.data.clone() gptq_for_this_block[layer_name] = GPTQ(sub_layers[layer_name], W, self.device) # gptq_for_this_block[layer_name].quantizer = Quantizer() @@ -599,7 +618,13 @@ def tmp(_, inp, out): # ) weight_config_this_layer = self.get_layer_config(self.get_full_layer_name(layer_name, block_idx)) logger.info(f"Quantizing layer {layer_name}") - W = sub_layers[layer_name].weight.data.clone() + if self.layer_wise: + from ..torch_utils.layer_wise_quant.utils import load_value + + full_layer_name = self.get_full_layer_name(layer_name, block_idx) + W = load_value(self.model, full_layer_name + ".weight", model_path) + else: + W = sub_layers[layer_name].weight.data.clone() scale, zp, Q = gptq_for_this_block[layer_name].fasterquant( W, blocksize=weight_config_this_layer["block_size"], @@ -607,7 +632,30 @@ def tmp(_, inp, out): groupsize=weight_config_this_layer["group_size"], act_order=weight_config_this_layer["act_order"], ) - sub_layers[layer_name].weight.data = Q + if self.layer_wise: + from ..torch_utils.layer_wise_quant.utils import ( + LWQ_WORKSPACE, + clean_module_weight, + load_value, + set_module_tensor_to_device, + ) + + sub_layer = sub_layers[layer_name] + full_layer_name = self.get_full_layer_name(layer_name, block_idx) + for n, p in sub_layer.named_parameters(): + param_name = full_layer_name + "." + n + if n == "weight": + set_module_tensor_to_device(self.model, param_name, self.device, Q) + else: + value = load_value(self.model, param_name, model_path) + set_module_tensor_to_device(self.model, param_name, self.device, value) + # sub_layer.weight.data = Q + torch.save(sub_layer.state_dict(), LWQ_WORKSPACE + f"/{full_layer_name}.pt") + clean_module_weight(sub_layer) + del Q + gc.collect() + else: + sub_layers[layer_name].weight.data = Q gptq_config[self.get_full_layer_name(layer_name, block_idx)] = {"scale": scale} if not weight_config_this_layer["sym"]: gptq_config[self.get_full_layer_name(layer_name, block_idx)]["zero"] = zp @@ -627,7 +675,10 @@ def tmp(_, inp, out): out = self.track_hidden_states(out) outs.append(out) self.cache_key_arguments["i"] = idx - self.gptq_related_blocks["transformers"][block_idx] = transformer_block.cpu() + if self.layer_wise: + self.gptq_related_blocks["transformers"][block_idx] = transformer_block + else: + self.gptq_related_blocks["transformers"][block_idx] = transformer_block.cpu() del gptq_for_this_block torch.cuda.empty_cache() # iteratively replace the input with output, thus layerwise quantization can continue. 
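The revert above restores the 2.x adaptor's handling of both weight_config layouts that check_layer_config() distinguishes: a single flat config expanded to every module, or a per-layer mapping. A rough sketch of the two accepted shapes follows; the layer names are illustrative only, and unspecified fields fall back to the *_default attributes.

    # Flat layout: detected via the top-level "wbits" key and applied to every named module.
    weight_config_flat = {"wbits": 4, "group_size": 128, "sym": False, "act_order": True}

    # Per-layer layout: explicit settings keyed by layer name.
    weight_config_per_layer = {
        "transformer.h.0.attn.q_proj": {"wbits": 4, "group_size": 128, "sym": False},
        "transformer.h.0.mlp.fc_in": {"wbits": 4, "group_size": 64, "act_order": True},
    }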
From 9fbafa941ef9f9d2f3188d3aa34f7681f98b1fc3 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Fri, 24 Nov 2023 15:29:00 +0800 Subject: [PATCH 18/25] remove lwy Signed-off-by: yiliu30 --- neural_compressor/torch/algorithms/gptq.py | 81 ++++------------------ 1 file changed, 13 insertions(+), 68 deletions(-) diff --git a/neural_compressor/torch/algorithms/gptq.py b/neural_compressor/torch/algorithms/gptq.py index 82bf0913791..48d742b05e4 100644 --- a/neural_compressor/torch/algorithms/gptq.py +++ b/neural_compressor/torch/algorithms/gptq.py @@ -401,29 +401,15 @@ def get_full_layer_name(self, sub_layer_name, block_idx): def check_layer_config(self): """Copy arguments from weight_config to built-in attributes.""" - if "wbits" in self.weight_config: - tmp_weight_config = {} - for name, module in self.model.named_modules(): - tmp_weight_config[name] = {} - tmp_weight_config[name]["wbits"] = self.weight_config.get("wbits", self.wbits_default) - tmp_weight_config[name]["group_size"] = self.weight_config.get("group_size", self.group_size_default) - tmp_weight_config[name]["block_size"] = self.weight_config.get("block_size", self.group_size_default) - tmp_weight_config[name]["percdamp"] = self.weight_config.get("pecdamp", self.percdamp_default) - tmp_weight_config[name]["sym"] = self.weight_config.get("sym", self.sym_default) - tmp_weight_config[name]["act_order"] = self.weight_config.get("act_order", self.act_order_default) - tmp_weight_config[name]["perchannel"] = self.weight_config.get("perchannel", self.perchannel_default) - tmp_weight_config[name]["mse"] = self.weight_config.get("mse", self.mse_default) - self.weight_config = tmp_weight_config - else: - for layer_name, config in self.weight_config.items(): - self.weight_config[layer_name]["wbits"] = config.get("wbits", self.wbits_default) - self.weight_config[layer_name]["group_size"] = config.get("group_size", self.group_size_default) - self.weight_config[layer_name]["block_size"] = config.get("block_size", self.group_size_default) - self.weight_config[layer_name]["percdamp"] = config.get("pecdamp", self.percdamp_default) - self.weight_config[layer_name]["sym"] = config.get("sym", self.sym_default) - self.weight_config[layer_name]["act_order"] = config.get("act_order", self.act_order_default) - self.weight_config[layer_name]["perchannel"] = config.get("perchannel", self.perchannel_default) - self.weight_config[layer_name]["mse"] = config.get("mse", self.mse_default) + for layer_name, config in self.weight_config.items(): + self.weight_config[layer_name]["wbits"] = config.get("wbits", self.wbits_default) + self.weight_config[layer_name]["group_size"] = config.get("group_size", self.group_size_default) + self.weight_config[layer_name]["block_size"] = config.get("block_size", self.group_size_default) + self.weight_config[layer_name]["percdamp"] = config.get("pecdamp", self.percdamp_default) + self.weight_config[layer_name]["sym"] = config.get("sym", self.sym_default) + self.weight_config[layer_name]["act_order"] = config.get("act_order", self.act_order_default) + self.weight_config[layer_name]["perchannel"] = config.get("perchannel", self.perchannel_default) + self.weight_config[layer_name]["mse"] = config.get("mse", self.mse_default) def get_layer_config(self, layer_name): """Obtain config for one layer, since GPTQ supports layer-wise config.""" @@ -581,14 +567,7 @@ def execute_quantization(self, means=None, stds=None, model_path=None): # ) full_layer_name = self.get_full_layer_name(layer_name, block_idx) weight_config_this_layer = 
self.get_layer_config(full_layer_name) - if self.layer_wise: - # TODO (Yi) - from neural_compressor.adaptor.torch_utils.layer_wise_quant.utils import load_value - - W = load_value(self.model, full_layer_name + ".weight", model_path) - else: - W = sub_layers[layer_name].weight.data.clone() - + W = sub_layers[layer_name].weight.data.clone() gptq_for_this_block[layer_name] = GPTQ(sub_layers[layer_name], W, self.device) # gptq_for_this_block[layer_name].quantizer = Quantizer() gptq_for_this_block[layer_name].quantizer.configure( @@ -624,14 +603,7 @@ def tmp(_, inp, out): # ) weight_config_this_layer = self.get_layer_config(self.get_full_layer_name(layer_name, block_idx)) logger.info(f"Quantizing layer {layer_name}") - if self.layer_wise: - # TODO (Yi) - from neural_compressor.adaptor.torch_utils.layer_wise_quant.utils import load_value - - full_layer_name = self.get_full_layer_name(layer_name, block_idx) - W = load_value(self.model, full_layer_name + ".weight", model_path) - else: - W = sub_layers[layer_name].weight.data.clone() + W = sub_layers[layer_name].weight.data.clone() scale, zp, Q = gptq_for_this_block[layer_name].fasterquant( W, blocksize=weight_config_this_layer["block_size"], @@ -639,31 +611,7 @@ def tmp(_, inp, out): groupsize=weight_config_this_layer["group_size"], act_order=weight_config_this_layer["act_order"], ) - if self.layer_wise: - # TODO (Yi) - from neural_compressor.adaptor.torch_utils.layer_wise_quant.utils import ( - LWQ_WORKSPACE, - clean_module_weight, - load_value, - set_module_tensor_to_device, - ) - - sub_layer = sub_layers[layer_name] - full_layer_name = self.get_full_layer_name(layer_name, block_idx) - for n, p in sub_layer.named_parameters(): - param_name = full_layer_name + "." + n - if n == "weight": - set_module_tensor_to_device(self.model, param_name, self.device, Q) - else: - value = load_value(self.model, param_name, model_path) - set_module_tensor_to_device(self.model, param_name, self.device, value) - # sub_layer.weight.data = Q - torch.save(sub_layer.state_dict(), LWQ_WORKSPACE + f"/{full_layer_name}.pt") - clean_module_weight(sub_layer) - del Q - gc.collect() - else: - sub_layers[layer_name].weight.data = Q + sub_layers[layer_name].weight.data = Q gptq_config[self.get_full_layer_name(layer_name, block_idx)] = {"scale": scale} if not weight_config_this_layer["sym"]: gptq_config[self.get_full_layer_name(layer_name, block_idx)]["zero"] = zp @@ -683,10 +631,7 @@ def tmp(_, inp, out): out = self.track_hidden_states(out) outs.append(out) self.cache_key_arguments["i"] = idx - if self.layer_wise: - self.gptq_related_blocks["transformers"][block_idx] = transformer_block - else: - self.gptq_related_blocks["transformers"][block_idx] = transformer_block.cpu() + self.gptq_related_blocks["transformers"][block_idx] = transformer_block.cpu() del gptq_for_this_block torch.cuda.empty_cache() # iteratively replace the input with output, thus layerwise quantization can continue. 
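With the simplified check_layer_config above, GPTQuantizer expects weight_config to already be a per-layer mapping; any key a layer omits falls back to the corresponding *_default attribute. A minimal example of the structure it consumes is shown below; the layer names are illustrative and borrowed from the tiny GPT-J model used by the tests later in this series.

    # Per-layer GPTQ settings; omitted keys (block_size, perchannel, mse, ...)
    # are filled in from the quantizer's defaults by check_layer_config.
    weight_config = {
        "transformer.h.0.attn.k_proj": {"wbits": 4, "group_size": 128, "sym": True, "percdamp": 0.01},
        "transformer.h.1.attn.k_proj": {"wbits": 3, "group_size": -1, "sym": False, "act_order": True},
    }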
From 759613763b806f58edfd8ca6f4381ede69c0b505 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Fri, 24 Nov 2023 16:50:29 +0800 Subject: [PATCH 19/25] remove calib_dataloader Signed-off-by: yiliu30 --- neural_compressor/torch/algorithms/gptq.py | 16 +++++++++---- .../torch/algorithms/weight_only_algos.py | 4 ++-- .../torch/quantization/quantize.py | 8 +++---- test/3x/torch/test_gptq.py | 24 +++++++++++++------ 4 files changed, 34 insertions(+), 18 deletions(-) diff --git a/neural_compressor/torch/algorithms/gptq.py b/neural_compressor/torch/algorithms/gptq.py index 48d742b05e4..8adf3f3ca28 100644 --- a/neural_compressor/torch/algorithms/gptq.py +++ b/neural_compressor/torch/algorithms/gptq.py @@ -196,12 +196,13 @@ def __init__( self, model, weight_config={}, - dataloader=None, nsamples=128, use_max_length=True, pad_max_length=2048, device=None, layer_wise=False, + *args, + **kwargs, ): """ Args: @@ -252,12 +253,19 @@ def __init__( # dataloader self.use_max_length = use_max_length self.pad_max_length = pad_max_length - self.dataloader_original = dataloader + self.dataloader_original = None self.dataloader = [] self.nsamples = nsamples + self.args = args + self.kwargs = kwargs self.prepare_dataloader() def prepare_dataloader(self): + if self.dataloader_original is None: + run_fn = self.kwargs.get("calib_func", None) + fn_args = self.kwargs.get("calib_func_args", None) + assert run_fn, "Since the dataloader not is provided, please provide a run func as the " + self.dataloader_original = run_fn(fn_args) if self.use_max_length: # (Recommend) only take sequence whose length exceeds self.pad_max_length, # which preserves calibration's tokens are all valid @@ -976,7 +984,7 @@ def gptq_config_mapping(configs_mapping: Dict[Tuple[str, Callable], GPTQConfig]) return weight_config, nsamples, use_max_length, pad_max_length, device -def apply_gptq_quantize(model, configs_mapping, dataloader, *args, **kwargs): +def apply_gptq_quantize(model, configs_mapping, *args, **kwargs): """Apply gptq.""" # TODO: unify weight_config keys, add docstring, and support default config weight_config, nsamples, use_max_length, pad_max_length, device = gptq_config_mapping(configs_mapping) @@ -990,7 +998,7 @@ def apply_gptq_quantize(model, configs_mapping, dataloader, *args, **kwargs): assert model_path is not None, "model_path should not be None when use layer_wise mode" gptq_quantizer = GPTQuantizer( - model, weight_config, dataloader, nsamples, use_max_length, pad_max_length, device, layer_wise=layer_wise + model, weight_config, nsamples, use_max_length, pad_max_length, device, layer_wise=layer_wise, *args, **kwargs ) fp32_modified_model, gptq_config = gptq_quantizer.execute_quantization(model_path=model_path) logger.info("GPTQ quantization done.") diff --git a/neural_compressor/torch/algorithms/weight_only_algos.py b/neural_compressor/torch/algorithms/weight_only_algos.py index 83a9e763138..dd07c0d1494 100644 --- a/neural_compressor/torch/algorithms/weight_only_algos.py +++ b/neural_compressor/torch/algorithms/weight_only_algos.py @@ -44,12 +44,12 @@ def rtn_quantize_entry( ###################### GPTQ Algo Entry ################################## @register_algo(name=GPTQ) def gptq_quantize_entry( - model: torch.nn.Module, configs_mapping: Dict[Tuple[str, callable], GPTQConfig], dataloader, *args, **kwargs + model: torch.nn.Module, configs_mapping: Dict[Tuple[str, callable], GPTQConfig], *args, **kwargs ) -> torch.nn.Module: logger.info("Quantize model with the GPTQ algorithm.") from neural_compressor.torch.algorithms.gptq 
import apply_gptq_quantize - model, quantization_perm = apply_gptq_quantize(model=model, configs_mapping=configs_mapping, dataloader=dataloader) + model, quantization_perm = apply_gptq_quantize(model=model, configs_mapping=configs_mapping, *args, **kwargs) # Assign the gptq config as an attribute of model model._gptq_quantization_perm = quantization_perm return model diff --git a/neural_compressor/torch/quantization/quantize.py b/neural_compressor/torch/quantization/quantize.py index 7ca4f4a47dc..543def1a954 100644 --- a/neural_compressor/torch/quantization/quantize.py +++ b/neural_compressor/torch/quantization/quantize.py @@ -31,18 +31,16 @@ def need_apply(configs_mapping: Dict[Tuple[str, callable], BaseConfig], algo_nam def quantize( model: torch.nn.Module, quant_config: BaseConfig, - calib_dataloader=None, calib_func: Callable = None, - calib_func_arg: Any = None, + calib_func_args: Any = None, ) -> torch.nn.Module: """The main entry to quantize model. Args: model: a float model to be quantized. quant_config: a quantization configuration. - calib_dataloader: a calibration dataloader for calibrating the model. Defaults to None. calib_func: a calibration function for calibrating the model. Defaults to None. - calib_func_arg: positional arguments for `calib_func`. Defaults to None. + calib_func_args: positional arguments for `calib_func`. Defaults to None. Returns: The quantized model. @@ -63,5 +61,5 @@ def quantize( for algo_name, algo_func in algos_mapping.items(): if need_apply(configs_mapping, algo_name): logger.info(f"Start to apply {algo_name} on the model.") - model = algo_func(model, configs_mapping, calib_dataloader, calib_func, calib_func_arg) + model = algo_func(model, configs_mapping, calib_func=calib_func, calib_func_arg=calib_func_args) return model diff --git a/test/3x/torch/test_gptq.py b/test/3x/torch/test_gptq.py index 8a1a5e76d13..9c50aac0a7a 100644 --- a/test/3x/torch/test_gptq.py +++ b/test/3x/torch/test_gptq.py @@ -70,16 +70,20 @@ def test_gptq(self): model_1 = get_gpt_j() input = torch.ones([1, 512], dtype=torch.long) out0 = model_1(input) - q_model = quantize(model=model_1, quant_config=quant_config, calib_dataloader=dataloader) + + def calib_func(*args): + return dataloader + + q_model = quantize(model=model_1, quant_config=quant_config, calib_func=calib_func) out1 = q_model(input) self.assertTrue(torch.allclose(out1[0], out0[0], atol=1e-02)) - def _apply_gptq(self, input, model, quant_config, dataloader): + def _apply_gptq(self, input, model, quant_config, calib_func): logger.info(f"Test GPTQ with config {quant_config}") from neural_compressor.torch import quantize out0 = model(input) - q_model = quantize(model=model, quant_config=quant_config, calib_dataloader=dataloader) + q_model = quantize(model=model, quant_config=quant_config, calib_func=calib_func) out1 = q_model(input) self.assertTrue(torch.allclose(out1[0], out0[0], atol=1e-02)) @@ -104,9 +108,11 @@ def test_more_gptq(self): d = dict(zip(gptq_options.keys(), value)) quant_config = GPTQConfig(**d) length = 512 if quant_config.use_max_length else random.randint(1, 1024) - self._apply_gptq( - model=get_gpt_j(), input=input, quant_config=quant_config, dataloader=dataloader(length) - ) + + def calib_func(*args): + return dataloader(length) + + self._apply_gptq(model=get_gpt_j(), input=input, quant_config=quant_config, calib_func=calib_func) def test_gptq_advance(self): # Ported from test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py @@ -121,7 +127,11 @@ def test_gptq_advance(self): model_1 = get_gpt_j() 
input = torch.ones([1, 512], dtype=torch.long) out0 = model_1(input) - q_model = quantize(model=model_1, quant_config=quant_config, calib_dataloader=dataloader) + + def calib_func(*args): + return dataloader + + q_model = quantize(model=model_1, quant_config=quant_config, calib_func=calib_func) out1 = q_model(input) self.assertTrue(torch.allclose(out1[0], out0[0], atol=1e-02)) From 1f02346ec24f94abfe44c3bc2050c0a5bc1afbfb Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Fri, 24 Nov 2023 16:55:21 +0800 Subject: [PATCH 20/25] rename calib_func to run_fn, calib_func_args to run_args --- neural_compressor/torch/algorithms/gptq.py | 6 +++--- neural_compressor/torch/quantization/quantize.py | 10 +++++----- test/3x/torch/test_gptq.py | 16 ++++++++-------- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/neural_compressor/torch/algorithms/gptq.py b/neural_compressor/torch/algorithms/gptq.py index 8adf3f3ca28..fc79429e659 100644 --- a/neural_compressor/torch/algorithms/gptq.py +++ b/neural_compressor/torch/algorithms/gptq.py @@ -262,10 +262,10 @@ def __init__( def prepare_dataloader(self): if self.dataloader_original is None: - run_fn = self.kwargs.get("calib_func", None) - fn_args = self.kwargs.get("calib_func_args", None) + run_fn = self.kwargs.get("run_fn", None) + run_args = self.kwargs.get("run_args", None) assert run_fn, "Since the dataloader not is provided, please provide a run func as the " - self.dataloader_original = run_fn(fn_args) + self.dataloader_original = run_fn(run_args) if self.use_max_length: # (Recommend) only take sequence whose length exceeds self.pad_max_length, # which preserves calibration's tokens are all valid diff --git a/neural_compressor/torch/quantization/quantize.py b/neural_compressor/torch/quantization/quantize.py index 543def1a954..7fce5cef7fc 100644 --- a/neural_compressor/torch/quantization/quantize.py +++ b/neural_compressor/torch/quantization/quantize.py @@ -31,16 +31,16 @@ def need_apply(configs_mapping: Dict[Tuple[str, callable], BaseConfig], algo_nam def quantize( model: torch.nn.Module, quant_config: BaseConfig, - calib_func: Callable = None, - calib_func_args: Any = None, + run_fn: Callable = None, + run_args: Any = None, ) -> torch.nn.Module: """The main entry to quantize model. Args: model: a float model to be quantized. quant_config: a quantization configuration. - calib_func: a calibration function for calibrating the model. Defaults to None. - calib_func_args: positional arguments for `calib_func`. Defaults to None. + run_fn: a calibration function for calibrating the model. Defaults to None. + run_args: positional arguments for `run_fn`. Defaults to None. Returns: The quantized model. 
@@ -61,5 +61,5 @@ def quantize( for algo_name, algo_func in algos_mapping.items(): if need_apply(configs_mapping, algo_name): logger.info(f"Start to apply {algo_name} on the model.") - model = algo_func(model, configs_mapping, calib_func=calib_func, calib_func_arg=calib_func_args) + model = algo_func(model, configs_mapping, run_fn=run_fn, calib_func_arg=run_args) return model diff --git a/test/3x/torch/test_gptq.py b/test/3x/torch/test_gptq.py index 9c50aac0a7a..dd9ce720eef 100644 --- a/test/3x/torch/test_gptq.py +++ b/test/3x/torch/test_gptq.py @@ -71,19 +71,19 @@ def test_gptq(self): input = torch.ones([1, 512], dtype=torch.long) out0 = model_1(input) - def calib_func(*args): + def run_fn(*args): return dataloader - q_model = quantize(model=model_1, quant_config=quant_config, calib_func=calib_func) + q_model = quantize(model=model_1, quant_config=quant_config, run_fn=run_fn) out1 = q_model(input) self.assertTrue(torch.allclose(out1[0], out0[0], atol=1e-02)) - def _apply_gptq(self, input, model, quant_config, calib_func): + def _apply_gptq(self, input, model, quant_config, run_fn): logger.info(f"Test GPTQ with config {quant_config}") from neural_compressor.torch import quantize out0 = model(input) - q_model = quantize(model=model, quant_config=quant_config, calib_func=calib_func) + q_model = quantize(model=model, quant_config=quant_config, run_fn=run_fn) out1 = q_model(input) self.assertTrue(torch.allclose(out1[0], out0[0], atol=1e-02)) @@ -109,10 +109,10 @@ def test_more_gptq(self): quant_config = GPTQConfig(**d) length = 512 if quant_config.use_max_length else random.randint(1, 1024) - def calib_func(*args): + def run_fn(*args): return dataloader(length) - self._apply_gptq(model=get_gpt_j(), input=input, quant_config=quant_config, calib_func=calib_func) + self._apply_gptq(model=get_gpt_j(), input=input, quant_config=quant_config, run_fn=run_fn) def test_gptq_advance(self): # Ported from test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py @@ -128,10 +128,10 @@ def test_gptq_advance(self): input = torch.ones([1, 512], dtype=torch.long) out0 = model_1(input) - def calib_func(*args): + def run_fn(*args): return dataloader - q_model = quantize(model=model_1, quant_config=quant_config, calib_func=calib_func) + q_model = quantize(model=model_1, quant_config=quant_config, run_fn=run_fn) out1 = q_model(input) self.assertTrue(torch.allclose(out1[0], out0[0], atol=1e-02)) From 42a07d958fbd22efc8dea680f144b5a81d9f0c08 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sun, 26 Nov 2023 20:09:01 +0800 Subject: [PATCH 21/25] refactor gptq Signed-off-by: yiliu30 --- neural_compressor/torch/algorithms/gptq.py | 367 ++++++++++-------- .../torch/quantization/config.py | 4 + .../torch/quantization/quantize.py | 2 +- test/3x/torch/{test_gptq.py => _test_gptq.py} | 2 + 4 files changed, 213 insertions(+), 162 deletions(-) rename test/3x/torch/{test_gptq.py => _test_gptq.py} (99%) diff --git a/neural_compressor/torch/algorithms/gptq.py b/neural_compressor/torch/algorithms/gptq.py index fc79429e659..eef5843e6bb 100644 --- a/neural_compressor/torch/algorithms/gptq.py +++ b/neural_compressor/torch/algorithms/gptq.py @@ -197,6 +197,7 @@ def __init__( model, weight_config={}, nsamples=128, + dataloader_len=10, use_max_length=True, pad_max_length=2048, device=None, @@ -255,153 +256,24 @@ def __init__( self.pad_max_length = pad_max_length self.dataloader_original = None self.dataloader = [] + self.dataloader_len = dataloader_len self.nsamples = nsamples self.args = args self.kwargs = kwargs - self.prepare_dataloader() 
- - def prepare_dataloader(self): - if self.dataloader_original is None: - run_fn = self.kwargs.get("run_fn", None) - run_args = self.kwargs.get("run_args", None) - assert run_fn, "Since the dataloader not is provided, please provide a run func as the " - self.dataloader_original = run_fn(run_args) - if self.use_max_length: - # (Recommend) only take sequence whose length exceeds self.pad_max_length, - # which preserves calibration's tokens are all valid - # This is GPTQ official dataloader implementation - self.obtain_first_n_samples_fulllength() - else: - # general selection, no padding, not GPTQ original implementation. - self.obtain_first_n_samples() - try: - self.cache_key_arguments = { - "i": 0 - } # a dict of list, keyword arguments ("attention_masks", "position_ids", etc.) - # Note that the first elements in cache_positional_arguments is main input: hidden_states - self.cache_positional_arguments = [] # a list of list, positional arguments ("rotary_pos_emb" in chatglm) - self.is_ready = True - except: - logger.warning("GPTQ Quantizer initialization failed!") - pass - - def obtain_first_n_samples(self, seed=0): - """Get first nsample data as the real calibration dataset.""" - self.dataloader.clear() - random.seed(seed) - for batch in self.dataloader_original: - # process data, depends on its data type. - if len(self.dataloader) == self.nsamples: - logger.info(f"Successfully collect {self.nsamples} calibration samples.") - break - # list, tuple - if isinstance(batch, list) or isinstance(batch, tuple): - if batch[0].shape[-1] > self.pad_max_length: - i = random.randint(0, batch[0].shape[-1] - self.pad_max_length - 1) - j = i + self.pad_max_length - batch_final = [] - for item in batch: - if isinstance(item, torch.Tensor) and item.shape.__len__() == 2: - batch_final.append(item[:, i:j]) - else: - batch_final.append(item) - else: - batch_final = batch[:] - # dict - elif isinstance(batch, dict): - try: - length = batch["input_ids"].shape[-1] - except: - logger.warning("Please make sure your dict'like data contains key of 'input_ids'.") - continue - batch_final = {} - if length > self.pad_max_length: - i = random.randint(0, length - self.pad_max_length - 1) - j = i + self.pad_max_length - # may have to slice every sequence related data - for key in batch.keys(): - if isinstance(batch[key], torch.Tensor): - batch_final[key] = batch[key][:, i:j] # slice on sequence length dim - else: - batch_final[key] = batch[key] - else: - batch_final = batch - # tensor - else: - if batch.shape[-1] > self.pad_max_length: - i = random.randint(0, batch.shape[-1] - self.pad_max_length - 1) - j = i + self.pad_max_length - batch_final = batch[:, i:j] - else: - batch_final = batch - self.dataloader.append(batch_final) - - if len(self.dataloader) < self.nsamples: - logger.warning(f"Try to use {self.nsamples} data, but entire dataset size is {len(self.dataloader)}.") - - def obtain_first_n_samples_fulllength(self, seed=0): - self.dataloader.clear() - random.seed(seed) - unified_length = self.pad_max_length - for batch in self.dataloader_original: - if len(self.dataloader) == self.nsamples: - logger.info(f"Successfully collect {self.nsamples} calibration samples.") - break - # list & tuple, gpt-j-6b mlperf, etc. 
- if isinstance(batch, list) or isinstance(batch, tuple): - if batch[0].shape[-1] == unified_length: - batch_final = batch[:] - elif batch[0].shape[-1] > unified_length: - i = random.randint(0, batch[0].shape[-1] - unified_length - 1) - j = i + unified_length - batch_final = [] - for item in batch: - if isinstance(item, torch.Tensor) and item.shape.__len__() == 2: - batch_final.append(item[:, i:j]) - else: - batch_final.append(item) - else: - # not match max length, not include in target dataset - continue - # dict - elif isinstance(batch, dict): - try: - length = batch["input_ids"].shape[-1] - except: - logger.warning("Please make sure your dict'like data contains key of 'input_ids'.") - continue - batch_final = {} - if length == self.pad_max_length: - batch_final = batch - elif length > self.pad_max_length: - i = random.randint(0, length - self.pad_max_length - 1) - j = i + self.pad_max_length - # may have to slice every sequence related data - for key in batch.keys(): - if isinstance(batch[key], torch.Tensor): - batch_final[key] = batch[key][:, i:j] # slice on sequence length dim with same position - else: - batch_final[key] = batch[key] - else: - # not match max length, not include in target dataset - continue - # tensor - else: - if batch.shape[-1] == unified_length: - batch_final = batch - elif batch.shape[-1] > unified_length: - i = random.randint(0, batch.shape[-1] - unified_length - 1) - j = i + unified_length - batch_final = batch[:, i:j] - else: - # not match max length, not include in target dataset - continue - self.dataloader.append(batch_final) - if len(self.dataloader) < self.nsamples: # pragma: no cover - logger.warning( - f"Trying to allocate {self.nsamples} data with fixed length {unified_length}, \ - but only {len(self.dataloader)} samples are found. Please use smaller 'self.pad_max_length' value." - ) + self.run_fn = self.kwargs.get("run_fn", None) + self.run_args = self.kwargs.get("run_args", None) + self.dataloader_len = dataloader_len + # compare 2.x, use run_fn to calibration + # self.prepare_dataloader() + self._post_init() + + def _post_init(self): + self.cache_key_arguments = { + "i": 0 + } # a dict of list, keyword arguments ("attention_masks", "position_ids", etc.) 
+ # Note that the first elements in cache_positional_arguments is main input: hidden_states + self.cache_positional_arguments = [] # a list of list, positional arguments ("rotary_pos_emb" in chatglm) + self.is_ready = True def get_full_layer_name(self, sub_layer_name, block_idx): transformer_name = self.gptq_related_blocks["transformers_name"] @@ -483,18 +355,24 @@ def forward(layer, *args, **kwargs): # Step3: run forward to obtain calibration datasets logger.info("Collecting calibration inputs...") - for batch in tqdm(self.dataloader): - if not self.layer_wise: - batch = move_input_to_device(batch, self.device) - try: - if isinstance(batch, tuple) or isinstance(batch, list): - self.model(batch[0]) - elif isinstance(batch, dict): - self.model(**batch) - else: - self.model(batch) - except ValueError: - pass + logger.info("Collecting calibration inputs by running the run_fn provided by user.") + if self.run_args: + self.run_fn(self.model, self.run_args) + else: + self.run_fn(self.model) + + # for batch in tqdm(self.dataloader): + # if not self.layer_wise: + # batch = move_input_to_device(batch, self.device) + # try: + # if isinstance(batch, tuple) or isinstance(batch, list): + # self.model(batch[0]) + # elif isinstance(batch, dict): + # self.model(**batch) + # else: + # self.model(batch) + # except ValueError: + # pass # output inp data shape logger.info("All calibration data's shape =>") # check all hidden_states shape @@ -596,7 +474,8 @@ def tmp(_, inp, out): for layer_name in sub_layers: handles.append(sub_layers[layer_name].register_forward_hook(add_batch(layer_name))) idx = self.cache_key_arguments.pop("i") - for j in range(len(self.dataloader)): + # for j in range(len(self.dataloader)): + for j in range(self.dataloader_len): cache_keyword_batch = self.gather_single_batch_from_dict(self.cache_key_arguments, j) cache_positional_batch = self.gather_single_batch_from_list(self.cache_positional_arguments, j) out = transformer_block(*cache_positional_batch, **cache_keyword_batch) @@ -632,7 +511,8 @@ def tmp(_, inp, out): # Step 2.5: replace output data with quantized weights outs = [] idx = self.cache_key_arguments.pop("i") - for j in range(len(self.dataloader)): + # for j in range(len(self.dataloader)): + for j in range(self.dataloader_len): cache_keyword_batch = self.gather_single_batch_from_dict(self.cache_key_arguments, j) cache_positional_batch = self.gather_single_batch_from_list(self.cache_positional_arguments, j) out = transformer_block(*cache_positional_batch, **cache_keyword_batch) @@ -971,6 +851,7 @@ def gptq_config_mapping(configs_mapping: Dict[Tuple[str, Callable], GPTQConfig]) "mse": op_config.enable_mse_search, } nsamples = op_config.nsamples + dataloader_len = op_config.dataloader_len use_max_length = op_config.use_max_length pad_max_length = op_config.pad_max_length device = op_config.device @@ -981,13 +862,15 @@ def gptq_config_mapping(configs_mapping: Dict[Tuple[str, Callable], GPTQConfig]) but you have not set length value. Default sequence length is 2048 and this might cause inference error!" 
) - return weight_config, nsamples, use_max_length, pad_max_length, device + return weight_config, nsamples, use_max_length, pad_max_length, device, dataloader_len def apply_gptq_quantize(model, configs_mapping, *args, **kwargs): """Apply gptq.""" # TODO: unify weight_config keys, add docstring, and support default config - weight_config, nsamples, use_max_length, pad_max_length, device = gptq_config_mapping(configs_mapping) + weight_config, nsamples, use_max_length, pad_max_length, device, dataloader_len = gptq_config_mapping( + configs_mapping + ) assert isinstance(model, torch.nn.Module), "only support torch module" # TODO (Yi) disable layer-wise and model_path first layer_wise = False @@ -998,8 +881,170 @@ def apply_gptq_quantize(model, configs_mapping, *args, **kwargs): assert model_path is not None, "model_path should not be None when use layer_wise mode" gptq_quantizer = GPTQuantizer( - model, weight_config, nsamples, use_max_length, pad_max_length, device, layer_wise=layer_wise, *args, **kwargs + model, + weight_config, + nsamples, + dataloader_len, + use_max_length, + pad_max_length, + device, + layer_wise=layer_wise, + *args, + **kwargs, ) fp32_modified_model, gptq_config = gptq_quantizer.execute_quantization(model_path=model_path) logger.info("GPTQ quantization done.") return fp32_modified_model, gptq_config + + +class DataloaderPreprocessor: + def __init__(self, dataloader_original, use_max_length=False, pad_max_length=2048, nsamples=128) -> None: + self.dataloader_original = dataloader_original + self.use_max_length = use_max_length + self.pad_max_length = pad_max_length + self.nsamples = nsamples + self.dataloader = [] + self.is_ready = False + + def get_prepared_dataloader(self): + if not self.is_ready: + self.prepare_dataloader() + return self.dataloader + + def prepare_dataloader(self): + if self.use_max_length: + # (Recommend) only take sequence whose length exceeds self.pad_max_length, + # which preserves calibration's tokens are all valid + # This is GPTQ official dataloader implementation + self.obtain_first_n_samples_fulllength() + else: + # general selection, no padding, not GPTQ original implementation. + self.obtain_first_n_samples() + try: + self.cache_key_arguments = { + "i": 0 + } # a dict of list, keyword arguments ("attention_masks", "position_ids", etc.) + # Note that the first elements in cache_positional_arguments is main input: hidden_states + self.cache_positional_arguments = [] # a list of list, positional arguments ("rotary_pos_emb" in chatglm) + self.is_ready = True + except: + logger.warning("GPTQ Quantizer initialization failed!") + pass + + def obtain_first_n_samples(self, seed=0): + """Get first nsample data as the real calibration dataset.""" + self.dataloader.clear() + random.seed(seed) + for batch in self.dataloader_original: + # process data, depends on its data type. 
+ if len(self.dataloader) == self.nsamples: + logger.info(f"Successfully collect {self.nsamples} calibration samples.") + break + # list, tuple + if isinstance(batch, list) or isinstance(batch, tuple): + if batch[0].shape[-1] > self.pad_max_length: + i = random.randint(0, batch[0].shape[-1] - self.pad_max_length - 1) + j = i + self.pad_max_length + batch_final = [] + for item in batch: + if isinstance(item, torch.Tensor) and item.shape.__len__() == 2: + batch_final.append(item[:, i:j]) + else: + batch_final.append(item) + else: + batch_final = batch[:] + # dict + elif isinstance(batch, dict): + try: + length = batch["input_ids"].shape[-1] + except: + logger.warning("Please make sure your dict'like data contains key of 'input_ids'.") + continue + batch_final = {} + if length > self.pad_max_length: + i = random.randint(0, length - self.pad_max_length - 1) + j = i + self.pad_max_length + # may have to slice every sequence related data + for key in batch.keys(): + if isinstance(batch[key], torch.Tensor): + batch_final[key] = batch[key][:, i:j] # slice on sequence length dim + else: + batch_final[key] = batch[key] + else: + batch_final = batch + # tensor + else: + if batch.shape[-1] > self.pad_max_length: + i = random.randint(0, batch.shape[-1] - self.pad_max_length - 1) + j = i + self.pad_max_length + batch_final = batch[:, i:j] + else: + batch_final = batch + self.dataloader.append(batch_final) + + if len(self.dataloader) < self.nsamples: + logger.warning(f"Try to use {self.nsamples} data, but entire dataset size is {len(self.dataloader)}.") + + def obtain_first_n_samples_fulllength(self, seed=0): + self.dataloader.clear() + random.seed(seed) + unified_length = self.pad_max_length + for batch in self.dataloader_original: + if len(self.dataloader) == self.nsamples: + logger.info(f"Successfully collect {self.nsamples} calibration samples.") + break + # list & tuple, gpt-j-6b mlperf, etc. 
+ if isinstance(batch, list) or isinstance(batch, tuple): + if batch[0].shape[-1] == unified_length: + batch_final = batch[:] + elif batch[0].shape[-1] > unified_length: + i = random.randint(0, batch[0].shape[-1] - unified_length - 1) + j = i + unified_length + batch_final = [] + for item in batch: + if isinstance(item, torch.Tensor) and item.shape.__len__() == 2: + batch_final.append(item[:, i:j]) + else: + batch_final.append(item) + else: + # not match max length, not include in target dataset + continue + # dict + elif isinstance(batch, dict): + try: + length = batch["input_ids"].shape[-1] + except: + logger.warning("Please make sure your dict'like data contains key of 'input_ids'.") + continue + batch_final = {} + if length == self.pad_max_length: + batch_final = batch + elif length > self.pad_max_length: + i = random.randint(0, length - self.pad_max_length - 1) + j = i + self.pad_max_length + # may have to slice every sequence related data + for key in batch.keys(): + if isinstance(batch[key], torch.Tensor): + batch_final[key] = batch[key][:, i:j] # slice on sequence length dim with same position + else: + batch_final[key] = batch[key] + else: + # not match max length, not include in target dataset + continue + # tensor + else: + if batch.shape[-1] == unified_length: + batch_final = batch + elif batch.shape[-1] > unified_length: + i = random.randint(0, batch.shape[-1] - unified_length - 1) + j = i + unified_length + batch_final = batch[:, i:j] + else: + # not match max length, not include in target dataset + continue + self.dataloader.append(batch_final) + if len(self.dataloader) < self.nsamples: # pragma: no cover + logger.warning( + f"Trying to allocate {self.nsamples} data with fixed length {unified_length}, \ + but only {len(self.dataloader)} samples are found. Please use smaller 'self.pad_max_length' value." 
+ ) diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index 7c1fd990658..f492baa8517 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -162,6 +162,7 @@ class GPTQConfig(BaseConfig): "act_dtype", "group_dim", "nsamples", + "dataloader_len", "percdamp", "act_order", "use_max_length", @@ -182,6 +183,7 @@ def __init__( act_dtype: str = "fp32", group_dim: int = 1, nsamples: int = 128, + dataloader_len: int = 10, percdamp: float = 0.01, act_order: bool = False, use_max_length: bool = True, @@ -205,6 +207,8 @@ def __init__( self.enable_mse_search = enable_mse_search self.group_dim = group_dim self.nsamples = nsamples + # TODO(Yi) detect it auto + self.dataloader_len = dataloader_len self.percdamp = percdamp self.act_order = act_order self.use_max_length = use_max_length diff --git a/neural_compressor/torch/quantization/quantize.py b/neural_compressor/torch/quantization/quantize.py index 7fce5cef7fc..90744385b95 100644 --- a/neural_compressor/torch/quantization/quantize.py +++ b/neural_compressor/torch/quantization/quantize.py @@ -61,5 +61,5 @@ def quantize( for algo_name, algo_func in algos_mapping.items(): if need_apply(configs_mapping, algo_name): logger.info(f"Start to apply {algo_name} on the model.") - model = algo_func(model, configs_mapping, run_fn=run_fn, calib_func_arg=run_args) + model = algo_func(model, configs_mapping, run_fn=run_fn, run_args=run_args) return model diff --git a/test/3x/torch/test_gptq.py b/test/3x/torch/_test_gptq.py similarity index 99% rename from test/3x/torch/test_gptq.py rename to test/3x/torch/_test_gptq.py index dd9ce720eef..fbb21170c7e 100644 --- a/test/3x/torch/test_gptq.py +++ b/test/3x/torch/_test_gptq.py @@ -1,3 +1,5 @@ +# TODO (Yi) remove before merge + import unittest import torch From e913b045206e2f5cbc0b4202b8b4f947e3c1ba4a Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sun, 26 Nov 2023 20:12:20 +0800 Subject: [PATCH 22/25] add UTs Signed-off-by: yiliu30 --- test/3x/torch/test_gptq_algo.py | 190 ++++++++++++++++++++++++++++++++ 1 file changed, 190 insertions(+) create mode 100644 test/3x/torch/test_gptq_algo.py diff --git a/test/3x/torch/test_gptq_algo.py b/test/3x/torch/test_gptq_algo.py new file mode 100644 index 00000000000..4d93fe42969 --- /dev/null +++ b/test/3x/torch/test_gptq_algo.py @@ -0,0 +1,190 @@ +import unittest + +import torch + +from neural_compressor.common.logger import Logger + +logger = Logger().get_logger() + + +def get_gpt_j(): + import transformers + + tiny_gptj = transformers.AutoModelForCausalLM.from_pretrained( + "hf-internal-testing/tiny-random-GPTJForCausalLM", + torchscript=True, + ) + return tiny_gptj + + +class GPTQLLMDataLoader: + def __init__(self, length=512): + self.batch_size = 1 + self.length = length + + def __iter__(self): + for i in range(10): + yield torch.ones([1, self.length], dtype=torch.long) + + +class GPTQLLMDataLoaderList(GPTQLLMDataLoader): + def __iter__(self): + for i in range(10): + yield (torch.ones([1, self.length], dtype=torch.long), torch.ones([1, self.length], dtype=torch.long)) + + +class GPTQLLMDataLoaderDict(GPTQLLMDataLoader): + def __iter__(self): + for i in range(10): + yield { + "input_ids": torch.ones([1, self.length], dtype=torch.long), + "attention_mask": torch.ones([1, self.length], dtype=torch.long), + } + + +from tqdm import tqdm + +from neural_compressor.torch.algorithms.gptq import move_input_to_device + + +def run_fn_for_gptq(model, dataloader_for_calibration, 
*args): + logger.info("Collecting calibration inputs...") + for batch in tqdm(dataloader_for_calibration): + batch = move_input_to_device(batch, device=None) + try: + if isinstance(batch, tuple) or isinstance(batch, list): + model(batch[0]) + elif isinstance(batch, dict): + model(**batch) + else: + model(batch) + except ValueError: + pass + return + + +class TestGPTQ(unittest.TestCase): + @classmethod + def setUpClass(self): + pass + + @classmethod + def tearDownClass(self): + pass + + def setUp(self): + # print the test name + logger.info(f"Running TestGPTQ test: {self.id()}") + + def test_gptq(self): + # Ported from test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py + # TestPytorchWeightOnlyAdaptor.test_GPTQ_fixed_length_quant + from neural_compressor.torch import GPTQConfig, quantize + + dataloader = GPTQLLMDataLoader() + + # case 1: tensor + model_1 = get_gpt_j() + input = torch.ones([1, 512], dtype=torch.long) + out0 = model_1(input) + device = None + from neural_compressor.torch.algorithms.gptq import DataloaderPreprocessor + + dataloaderPreprocessor = DataloaderPreprocessor( + dataloader_original=dataloader, use_max_length=False, pad_max_length=512, nsamples=128 + ) + dataloader_for_calibration = dataloaderPreprocessor.get_prepared_dataloader() + + quant_config = GPTQConfig( + weight_group_size=8, dataloader_len=len(dataloader_for_calibration), pad_max_length=512 + ) + quant_config.set_local("lm_head", GPTQConfig(weight_dtype="fp32")) + logger.info(f"Test GPTQ with config {quant_config}") + q_model = quantize( + model=model_1, quant_config=quant_config, run_fn=run_fn_for_gptq, run_args=dataloader_for_calibration + ) + out1 = q_model(input) + self.assertTrue(torch.allclose(out1[0], out0[0], atol=1e-02)) + + def test_gptq_advance(self): + # Ported from test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py + # TestPytorchWeightOnlyAdaptor.test_GPTQ_fixed_length_quant + from neural_compressor.torch import GPTQConfig, quantize + + dataloader = GPTQLLMDataLoader() + model_1 = get_gpt_j() + input = torch.ones([1, 512], dtype=torch.long) + out0 = model_1(input) + + device = None + from neural_compressor.torch.algorithms.gptq import DataloaderPreprocessor + + dataloaderPreprocessor = DataloaderPreprocessor( + dataloader_original=dataloader, use_max_length=False, pad_max_length=512, nsamples=128 + ) + dataloader_for_calibration = dataloaderPreprocessor.get_prepared_dataloader() + + quant_config = GPTQConfig( + weight_group_size=8, + dataloader_len=len(dataloader_for_calibration), + act_order=True, + enable_mse_search=True, + pad_max_length=512, + ) + quant_config.set_local("lm_head", GPTQConfig(weight_dtype="fp32")) + logger.info(f"Test GPTQ with config {quant_config}") + q_model = quantize( + model=model_1, quant_config=quant_config, run_fn=run_fn_for_gptq, run_args=dataloader_for_calibration + ) + out1 = q_model(input) + self.assertTrue(torch.allclose(out1[0], out0[0], atol=1e-02)) + + def _apply_gptq(self, input, model, quant_config, run_fn, run_args): + logger.info(f"Test GPTQ with config {quant_config}") + from neural_compressor.torch import quantize + + out0 = model(input) + q_model = quantize(model=model, quant_config=quant_config, run_fn=run_fn, run_args=run_args) + out1 = q_model(input) + self.assertTrue(torch.allclose(out1[0], out0[0], atol=1e-02)) + + def test_more_gptq(self): + import random + from itertools import product + + from neural_compressor.torch import GPTQConfig + + # some tests were skipped to accelerate the CI + input = torch.ones([1, 512], dtype=torch.long) 
+ # dataloader + dataloader_collections = [GPTQLLMDataLoader, GPTQLLMDataLoaderList, GPTQLLMDataLoaderDict] + gptq_options = { + "weight_sym": [False, True], + "weight_group_size": [8], + "use_max_length": [False, True], + "pad_max_length": [512], + } + for dataloader_cls in dataloader_collections: + for value in product(*gptq_options.values()): + d = dict(zip(gptq_options.keys(), value)) + quant_config = GPTQConfig(**d) + length = 512 if quant_config.use_max_length else random.randint(1, 1024) + from neural_compressor.torch.algorithms.gptq import DataloaderPreprocessor + + dataloaderPreprocessor = DataloaderPreprocessor( + dataloader_original=dataloader_cls(length), use_max_length=False, pad_max_length=512, nsamples=128 + ) + dataloader_for_calibration = dataloaderPreprocessor.get_prepared_dataloader() + quant_config.dataloader_len = len(dataloader_for_calibration) + + self._apply_gptq( + model=get_gpt_j(), + input=input, + quant_config=quant_config, + run_fn=run_fn_for_gptq, + run_args=dataloader_for_calibration, + ) + + +if __name__ == "__main__": + unittest.main() From b6a57ce74655c64c1b8c352795966c91ee813c06 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sun, 26 Nov 2023 20:23:45 +0800 Subject: [PATCH 23/25] fix ut Signed-off-by: yiliu30 --- test/3x/torch/test_gptq_algo.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/3x/torch/test_gptq_algo.py b/test/3x/torch/test_gptq_algo.py index 4d93fe42969..a9e98aa9732 100644 --- a/test/3x/torch/test_gptq_algo.py +++ b/test/3x/torch/test_gptq_algo.py @@ -172,7 +172,10 @@ def test_more_gptq(self): from neural_compressor.torch.algorithms.gptq import DataloaderPreprocessor dataloaderPreprocessor = DataloaderPreprocessor( - dataloader_original=dataloader_cls(length), use_max_length=False, pad_max_length=512, nsamples=128 + dataloader_original=dataloader_cls(length), + use_max_length=d["use_max_length"], + pad_max_length=d["pad_max_length"], + nsamples=128, ) dataloader_for_calibration = dataloaderPreprocessor.get_prepared_dataloader() quant_config.dataloader_len = len(dataloader_for_calibration) From ac0aeeaabf44968388cd35ee24e3a45aa621080b Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 27 Nov 2023 08:57:43 +0800 Subject: [PATCH 24/25] add more UTs Signed-off-by: yiliu30 --- neural_compressor/torch/algorithms/gptq.py | 59 ++++++-------- test/3x/torch/test_gptq_algo.py | 92 ++++++++++++++++++++++ 2 files changed, 118 insertions(+), 33 deletions(-) diff --git a/neural_compressor/torch/algorithms/gptq.py b/neural_compressor/torch/algorithms/gptq.py index eef5843e6bb..a42908e7385 100644 --- a/neural_compressor/torch/algorithms/gptq.py +++ b/neural_compressor/torch/algorithms/gptq.py @@ -281,15 +281,29 @@ def get_full_layer_name(self, sub_layer_name, block_idx): def check_layer_config(self): """Copy arguments from weight_config to built-in attributes.""" - for layer_name, config in self.weight_config.items(): - self.weight_config[layer_name]["wbits"] = config.get("wbits", self.wbits_default) - self.weight_config[layer_name]["group_size"] = config.get("group_size", self.group_size_default) - self.weight_config[layer_name]["block_size"] = config.get("block_size", self.group_size_default) - self.weight_config[layer_name]["percdamp"] = config.get("pecdamp", self.percdamp_default) - self.weight_config[layer_name]["sym"] = config.get("sym", self.sym_default) - self.weight_config[layer_name]["act_order"] = config.get("act_order", self.act_order_default) - self.weight_config[layer_name]["perchannel"] = config.get("perchannel", 
self.perchannel_default) - self.weight_config[layer_name]["mse"] = config.get("mse", self.mse_default) + if "wbits" in self.weight_config: + tmp_weight_config = {} + for name, module in self.model.named_modules(): + tmp_weight_config[name] = {} + tmp_weight_config[name]["wbits"] = self.weight_config.get("wbits", self.wbits_default) + tmp_weight_config[name]["group_size"] = self.weight_config.get("group_size", self.group_size_default) + tmp_weight_config[name]["block_size"] = self.weight_config.get("block_size", self.group_size_default) + tmp_weight_config[name]["percdamp"] = self.weight_config.get("pecdamp", self.percdamp_default) + tmp_weight_config[name]["sym"] = self.weight_config.get("sym", self.sym_default) + tmp_weight_config[name]["act_order"] = self.weight_config.get("act_order", self.act_order_default) + tmp_weight_config[name]["perchannel"] = self.weight_config.get("perchannel", self.perchannel_default) + tmp_weight_config[name]["mse"] = self.weight_config.get("mse", self.mse_default) + self.weight_config = tmp_weight_config + else: + for layer_name, config in self.weight_config.items(): + self.weight_config[layer_name]["wbits"] = config.get("wbits", self.wbits_default) + self.weight_config[layer_name]["group_size"] = config.get("group_size", self.group_size_default) + self.weight_config[layer_name]["block_size"] = config.get("block_size", self.group_size_default) + self.weight_config[layer_name]["percdamp"] = config.get("pecdamp", self.percdamp_default) + self.weight_config[layer_name]["sym"] = config.get("sym", self.sym_default) + self.weight_config[layer_name]["act_order"] = config.get("act_order", self.act_order_default) + self.weight_config[layer_name]["perchannel"] = config.get("perchannel", self.perchannel_default) + self.weight_config[layer_name]["mse"] = config.get("mse", self.mse_default) def get_layer_config(self, layer_name): """Obtain config for one layer, since GPTQ supports layer-wise config.""" @@ -426,11 +440,8 @@ def execute_quantization(self, means=None, stds=None, model_path=None): tblock_length = len(self.gptq_related_blocks["transformers"]) for block_idx in range(tblock_length): logger.info(f"Quantizing layer {block_idx + 1} / {tblock_length}..") - if not self.layer_wise: - # if we do not apply layer-wise feature, we still place the entire block on the GPU - transformer_block = self.gptq_related_blocks["transformers"][block_idx].to(self.device) - else: - transformer_block = self.gptq_related_blocks["transformers"][block_idx] # .to(self.device) + # if we do not apply layer-wise feature, we still place the entire block on the GPU + transformer_block = self.gptq_related_blocks["transformers"][block_idx].to(self.device) # Step2.1: obtain all layers (Linear, Conv2d, etc) in the block which can be quantized. 
sub_layers = find_layers(transformer_block) sub_layers_to_quant = {} @@ -474,7 +485,6 @@ def tmp(_, inp, out): for layer_name in sub_layers: handles.append(sub_layers[layer_name].register_forward_hook(add_batch(layer_name))) idx = self.cache_key_arguments.pop("i") - # for j in range(len(self.dataloader)): for j in range(self.dataloader_len): cache_keyword_batch = self.gather_single_batch_from_dict(self.cache_key_arguments, j) cache_positional_batch = self.gather_single_batch_from_list(self.cache_positional_arguments, j) @@ -511,7 +521,6 @@ def tmp(_, inp, out): # Step 2.5: replace output data with quantized weights outs = [] idx = self.cache_key_arguments.pop("i") - # for j in range(len(self.dataloader)): for j in range(self.dataloader_len): cache_keyword_batch = self.gather_single_batch_from_dict(self.cache_key_arguments, j) cache_positional_batch = self.gather_single_batch_from_list(self.cache_positional_arguments, j) @@ -690,9 +699,6 @@ def fasterquant(self, W, blocksize=128, percdamp=0.01, groupsize=-1, act_order=F return scale, zero, Q def free(self): - if DEBUG: - self.inp1 = None - self.out1 = None self.H = None self.Losses = None self.Trace = None @@ -876,10 +882,6 @@ def apply_gptq_quantize(model, configs_mapping, *args, **kwargs): layer_wise = False model_path = None - # Below is the same as the 2.x - if layer_wise: - assert model_path is not None, "model_path should not be None when use layer_wise mode" - gptq_quantizer = GPTQuantizer( model, weight_config, @@ -920,16 +922,7 @@ def prepare_dataloader(self): else: # general selection, no padding, not GPTQ original implementation. self.obtain_first_n_samples() - try: - self.cache_key_arguments = { - "i": 0 - } # a dict of list, keyword arguments ("attention_masks", "position_ids", etc.) - # Note that the first elements in cache_positional_arguments is main input: hidden_states - self.cache_positional_arguments = [] # a list of list, positional arguments ("rotary_pos_emb" in chatglm) - self.is_ready = True - except: - logger.warning("GPTQ Quantizer initialization failed!") - pass + self.is_ready = True def obtain_first_n_samples(self, seed=0): """Get first nsample data as the real calibration dataset.""" diff --git a/test/3x/torch/test_gptq_algo.py b/test/3x/torch/test_gptq_algo.py index a9e98aa9732..edf35626b81 100644 --- a/test/3x/torch/test_gptq_algo.py +++ b/test/3x/torch/test_gptq_algo.py @@ -188,6 +188,98 @@ def test_more_gptq(self): run_args=dataloader_for_calibration, ) + def test_gptq_wbits(self): + import copy + import random + + class GPTQLLMDataLoader: + def __init__(self): + self.batch_size = 1 + + def __iter__(self): + for i in range(20): + length = random.randint(1, 1024) + yield torch.ones([1, length], dtype=torch.long) + + dataloader = GPTQLLMDataLoader() + model = copy.deepcopy(get_gpt_j()) + weight_config = { + "transformer.h.0.attn.k_proj": { + "wbits": 4, + "group_size": 128, + "sym": True, + "percdamp": 0.01, + "perchannel": False, + }, + "transformer.h.1.attn.k_proj": { + "wbits": 3, + "group_size": -1, + "sym": False, + "percdamp": 0.01, + "act_order": True, + }, + "transformer.h.2.attn.k_proj": { + "wbits": 3, + "group_size": 32, + "sym": False, + "percdamp": 0.01, + "mse": True, + "act_order": False, + }, + "transformer.h.3.attn.k_proj": { + "wbits": 3, + "group_size": 256, + "sym": False, + "percdamp": 0.01, + "mse": True, + "act_order": False, + }, + } + from neural_compressor.torch.algorithms.gptq import DataloaderPreprocessor + + dataloaderPreprocessor = DataloaderPreprocessor( + 
dataloader_original=dataloader, + use_max_length=True, + pad_max_length=512, + nsamples=128, + ) + preprocessed_dataloader = dataloaderPreprocessor.get_prepared_dataloader() + from neural_compressor.torch.algorithms.gptq import GPTQuantizer + + quantizer = GPTQuantizer( + model=model, + weight_config=weight_config, + dataloader_len=13, + use_max_length=True, + pad_max_length=512, + run_fn=run_fn_for_gptq, + run_args=preprocessed_dataloader, + ) + quantizer.execute_quantization() + self.assertTrue(isinstance(model, torch.nn.Module)) + self.gptj = get_gpt_j() + + model = copy.deepcopy(self.gptj) + weight_config = {"wbits": 4} + dataloaderPreprocessor = DataloaderPreprocessor( + dataloader_original=dataloader, + use_max_length=False, + pad_max_length=512, + nsamples=128, + ) + quantizer = GPTQuantizer( + model=model, + weight_config=weight_config, + dataloader_len=13, + use_max_length=False, + pad_max_length=512, + run_fn=run_fn_for_gptq, + run_args=preprocessed_dataloader, + ) + quantizer.execute_quantization() + preprocessed_dataloader = dataloaderPreprocessor.get_prepared_dataloader() + self.assertTrue(isinstance(model, torch.nn.Module)) + if __name__ == "__main__": unittest.main() From d78fd78391f22d32310e3272069be696b65bc232 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 27 Nov 2023 16:33:03 +0800 Subject: [PATCH 25/25] remove unused test Signed-off-by: yiliu30 --- test/3x/torch/_test_gptq.py | 142 ------------------------------------ 1 file changed, 142 deletions(-) delete mode 100644 test/3x/torch/_test_gptq.py diff --git a/test/3x/torch/_test_gptq.py b/test/3x/torch/_test_gptq.py deleted file mode 100644 index fbb21170c7e..00000000000 --- a/test/3x/torch/_test_gptq.py +++ /dev/null @@ -1,142 +0,0 @@ -# TODO (Yi) remove before merge - -import unittest - -import torch - -from neural_compressor.common.logger import Logger - -logger = Logger().get_logger() - - -def get_gpt_j(): - import transformers - - tiny_gptj = transformers.AutoModelForCausalLM.from_pretrained( - "hf-internal-testing/tiny-random-GPTJForCausalLM", - torchscript=True, - ) - return tiny_gptj - - -class GPTQLLMDataLoader: - def __init__(self, length=512): - self.batch_size = 1 - self.length = length - - def __iter__(self): - for i in range(10): - yield torch.ones([1, self.length], dtype=torch.long) - - -class GPTQLLMDataLoaderList(GPTQLLMDataLoader): - def __iter__(self): - for i in range(10): - yield (torch.ones([1, self.length], dtype=torch.long), torch.ones([1, self.length], dtype=torch.long)) - - -class GPTQLLMDataLoaderDict(GPTQLLMDataLoader): - def __iter__(self): - for i in range(10): - yield { - "input_ids": torch.ones([1, self.length], dtype=torch.long), - "attention_mask": torch.ones([1, self.length], dtype=torch.long), - } - - -class TestGPTQ(unittest.TestCase): - @classmethod - def setUpClass(self): - pass - - @classmethod - def tearDownClass(self): - pass - - def setUp(self): - # print the test name - logger.info(f"Running TestGPTQ test: {self.id()}") - - def test_gptq(self): - # Ported from test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py - # TestPytorchWeightOnlyAdaptor.test_GPTQ_fixed_length_quant - from neural_compressor.torch import GPTQConfig, quantize - - # "gptq_args": {"percdamp": 0.01, "act_order": False, "use_max_length": True, "pad_max_length": 512}, - quant_config = GPTQConfig(weight_group_size=8, pad_max_length=512) - quant_config.set_local("lm_head", GPTQConfig(weight_dtype="fp32")) - logger.info(f"Test GPTQ with config {quant_config}") - dataloader = GPTQLLMDataLoader() - - # case 
1: tensor - model_1 = get_gpt_j() - input = torch.ones([1, 512], dtype=torch.long) - out0 = model_1(input) - - def run_fn(*args): - return dataloader - - q_model = quantize(model=model_1, quant_config=quant_config, run_fn=run_fn) - out1 = q_model(input) - self.assertTrue(torch.allclose(out1[0], out0[0], atol=1e-02)) - - def _apply_gptq(self, input, model, quant_config, run_fn): - logger.info(f"Test GPTQ with config {quant_config}") - from neural_compressor.torch import quantize - - out0 = model(input) - q_model = quantize(model=model, quant_config=quant_config, run_fn=run_fn) - out1 = q_model(input) - self.assertTrue(torch.allclose(out1[0], out0[0], atol=1e-02)) - - def test_more_gptq(self): - import random - from itertools import product - - from neural_compressor.torch import GPTQConfig - - # some tests were skipped to accelerate the CI - input = torch.ones([1, 512], dtype=torch.long) - # dataloader - dataloader_collections = [GPTQLLMDataLoader, GPTQLLMDataLoaderList, GPTQLLMDataLoaderDict] - gptq_options = { - "weight_sym": [False, True], - "weight_group_size": [8], - "use_max_length": [False, True], - "pad_max_length": [512], - } - for dataloader in dataloader_collections: - for value in product(*gptq_options.values()): - d = dict(zip(gptq_options.keys(), value)) - quant_config = GPTQConfig(**d) - length = 512 if quant_config.use_max_length else random.randint(1, 1024) - - def run_fn(*args): - return dataloader(length) - - self._apply_gptq(model=get_gpt_j(), input=input, quant_config=quant_config, run_fn=run_fn) - - def test_gptq_advance(self): - # Ported from test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py - # TestPytorchWeightOnlyAdaptor.test_GPTQ_fixed_length_quant - from neural_compressor.torch import GPTQConfig, quantize - - # "gptq_args": {"percdamp": 0.01, "act_order": False, "use_max_length": True, "pad_max_length": 512}, - quant_config = GPTQConfig(weight_group_size=8, act_order=True, enable_mse_search=True, pad_max_length=512) - quant_config.set_local("lm_head", GPTQConfig(weight_dtype="fp32")) - logger.info(f"Test GPTQ with config {quant_config}") - dataloader = GPTQLLMDataLoader() - model_1 = get_gpt_j() - input = torch.ones([1, 512], dtype=torch.long) - out0 = model_1(input) - - def run_fn(*args): - return dataloader - - q_model = quantize(model=model_1, quant_config=quant_config, run_fn=run_fn) - out1 = q_model(input) - self.assertTrue(torch.allclose(out1[0], out0[0], atol=1e-02)) - - -if __name__ == "__main__": - unittest.main()
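Taken together, the end state of this series drops the calibration dataloader argument in favor of a run_fn/run_args pair plus an explicit dataloader_len, with DataloaderPreprocessor handling sample selection up front. The condensed sketch below is assembled from the unit test added in patch 22; the tiny GPT-J checkpoint and the toy calibration set are assumptions for illustration, not part of the patches.

    import torch
    import transformers
    from tqdm import tqdm

    from neural_compressor.torch import GPTQConfig, quantize
    from neural_compressor.torch.algorithms.gptq import DataloaderPreprocessor, move_input_to_device

    model = transformers.AutoModelForCausalLM.from_pretrained(
        "hf-internal-testing/tiny-random-GPTJForCausalLM", torchscript=True
    )
    # Toy calibration set: ten fixed-length token batches.
    calib_data = [torch.ones([1, 512], dtype=torch.long) for _ in range(10)]
    prep = DataloaderPreprocessor(
        dataloader_original=calib_data, use_max_length=False, pad_max_length=512, nsamples=128
    )
    dataloader_for_calibration = prep.get_prepared_dataloader()

    def run_fn_for_gptq(model, dataloader_for_calibration, *args):
        # Replays the calibration batches so GPTQuantizer's forward hooks can
        # capture the inputs of the first transformer block.
        for batch in tqdm(dataloader_for_calibration):
            batch = move_input_to_device(batch, device=None)
            try:
                model(batch)
            except ValueError:
                pass

    quant_config = GPTQConfig(
        weight_group_size=8, dataloader_len=len(dataloader_for_calibration), pad_max_length=512
    )
    quant_config.set_local("lm_head", GPTQConfig(weight_dtype="fp32"))
    q_model = quantize(
        model=model, quant_config=quant_config, run_fn=run_fn_for_gptq, run_args=dataloader_for_calibration
    )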