From c7770af9afc37ac8c8d368fb213a709ae9f37731 Mon Sep 17 00:00:00 2001
From: yintong-lu <yintong.lu@intel.com>
Date: Tue, 21 May 2024 03:10:54 -0400
Subject: [PATCH 01/40] add file

Signed-off-by: yintong-lu <yintong.lu@intel.com>
---
 auto_round/export/export_to_awq.py | 234 +++++++++++++++++++++++++++++
 1 file changed, 234 insertions(+)
 create mode 100644 auto_round/export/export_to_awq.py

diff --git a/auto_round/export/export_to_awq.py b/auto_round/export/export_to_awq.py
new file mode 100644
index 00000000..35374b01
--- /dev/null
+++ b/auto_round/export/export_to_awq.py
@@ -0,0 +1,234 @@
+# Copyright (c) 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import copy
+import json
+import os
+from os.path import isdir, isfile, join
+from typing import Dict, List, Optional, Union
+import torch.nn as nn
+
+# MIT License
+#
+# Copyright (c) 2023 潘其威(William)
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+import torch
+
+from auto_round.export.register import register_format
+from auto_round.utils import (
+    check_to_quantized, 
+    get_block_names, 
+    get_module, 
+    get_module_name,
+    get_named_linears,
+    set_op_by_name,
+    logger
+)
+
+
+
+@register_format("auto_awq")
+def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs):
+    """Export the model to autogptq format to easily leverage cuda kernel."""
+    model = kwargs["model"]
+    weight_config = kwargs["weight_config"]
+    sym = kwargs["sym"]
+    bits = kwargs["bits"]
+    group_size = kwargs["group_size"]
+    iters = kwargs["iters"]
+    lr = kwargs["lr"]
+    minmax_lr = kwargs["minmax_lr"]
+    enable_minmax_tuning = kwargs["enable_minmax_tuning"]
+    enable_quanted_input = kwargs["enable_quanted_input"]
+    scale_dtype = kwargs["scale_dtype"]
+    tokenizer = kwargs["tokenizer"]
+    supported_types = kwargs["supported_types"]
+
+    logger.info("Saving quantized model to autoawq format")
+    if tokenizer is not None:
+        tokenizer.save_pretrained(output_dir)
+    ##check module quantized in block, this may have bug for mixed precision quantization
+    block_name = get_block_names(model)[0]
+    first_block = get_module(model, block_name)
+    all_to_quantized = True
+    modules_in_block_to_quantize = []
+    for n, m in first_block.named_modules():
+        is_supported_type = False
+        for supported_type in supported_types:
+            if isinstance(m, supported_type):
+                is_supported_type = True
+                break
+        if not is_supported_type:
+            continue
+        if not check_to_quantized(m):
+            all_to_quantized = False
+        else:
+            modules_in_block_to_quantize.append(n)
+    modules_in_block_to_quantize = [modules_in_block_to_quantize]
+    if all_to_quantized:
+        modules_in_block_to_quantize = None
+
+    if inplace:
+        compressed_model = model.to("cpu")
+    else:
+        compressed_model = copy.deepcopy(model.to("cpu"))
+
+    from awq.modules.linear import WQLinear_GEMM
+    from awq.utils.utils import clear_memory
+    from awq import AutoAWQForCausalLM
+    import sys
+
+    q_linear_module = WQLinear_GEMM
+    awq_model = AutoAWQForCausalLM.from_pretrained(model_path)
+    logger.info(f"lyt_debug Approximate memory usage of compressed_model: {sizeof_fmt(sys.getsizeof(compressed_model))}, device: {compressed_model.device}")
+    logger.info(f"lyt_debug Approximate memory usage of model: {sizeof_fmt(sys.getsizeof(model))}, device: {model.device}")
+    try:
+        logger.info(f"lyt_debug Approximate memory usage of awq_model: {sizeof_fmt(sys.getsizeof(awq_model))}")
+    except:
+        logger.info("lyt_debug awq model unable to calc")
+    self_modules = awq_model.get_model_layers(compressed_model)
+    for i in range(len(self_modules)):
+        module = self_modules[i]
+        named_linears = get_named_linears(module)
+        for name, linear_layer in named_linears.items():
+            key = get_module_name(compressed_model, linear_layer)
+            info = weight_config[key]
+            if not check_to_quantized(info):
+                continue
+            info["zp"] = info["zp"].to(torch.float32)
+            scale, zp = info['scale'], info['zp']
+            scale = scale.t().contiguous()
+            zp = zp.t().contiguous()
+            q_linear = q_linear_module.from_linear(
+                linear=linear_layer,
+                w_bit=bits,
+                group_size=group_size,
+                init_only=False,
+                scales=scale,
+                zeros=zp,
+            )
+            linear_layer.cpu()
+            q_linear.to(next(module.parameters()).device)
+            set_op_by_name(module, name, q_linear)
+            clear_memory()
+            
+    quant_config = {
+            "quant_method": 'awq',
+            "zero_point": not sym,
+            "group_size": group_size,
+            "bits": bits,
+            "version": 'gemm',
+            "modules_to_not_convert": None,
+        }
+
+    save_quantized(compressed_model, save_dir=output_dir, quant_config=quant_config)
+
+
+from transformers.modeling_utils import shard_checkpoint
+from safetensors.torch import save_file
+def save_quantized(
+    model,
+    save_dir,
+    quant_config,
+    safetensors=True,
+    shard_size="5GB",
+):
+    save_dir = save_dir[:-1] if save_dir[-1] == "/" else save_dir
+
+    # Save model
+    class EmptyModule(nn.Module):
+        def __init__(self):
+            super(EmptyModule, self).__init__()
+
+        def forward(self, x):
+            return x
+
+    # Save model and config files with empty state dict
+    from awq.models._config import AwqConfig
+    
+    model.config.quantization_config = quant_config
+    model.generation_config.do_sample = True
+    model.save_pretrained(save_dir, state_dict=EmptyModule().state_dict())
+
+    # Remove empty state dict
+    default_paths = [
+        f"{save_dir}/model.safetensors",
+        f"{save_dir}/pytorch_model.bin",
+    ]
+    for path in default_paths:
+        if os.path.exists(path):
+            os.remove(path)
+
+    # model_name has no extension, add it when saving state_dict
+    model_name = "model.safetensors" if safetensors else "pytorch_model.bin"
+
+    # shard checkpoint into chunks (10GB default)
+    shards, index = shard_checkpoint(
+        model.state_dict(), max_shard_size=shard_size, weights_name=model_name
+    )
+
+    for shard_file, shard in shards.items():
+        if safetensors:
+            # safetensors must be in the same memory, so we duplicate and use contiguous memory
+            shard = {k: v.clone().contiguous() for k, v in shard.items()}
+            save_file(
+                shard, os.path.join(save_dir, shard_file), metadata={"format": "pt"}
+            )
+        else:
+            torch.save(shard, os.path.join(save_dir, shard_file))
+
+    # save shard index
+    if index is not None:
+        with open(f"{save_dir}/{model_name}.index.json", "w+") as file:
+            file.write(json.dumps(index, indent=4))
+
+    with open(join(save_dir, "quantize_config.json"), "w", encoding="utf-8") as f:
+        json.dump(quant_config, f, indent=2)
+    
+
+
+
+
+
+
+
+
+
+def sizeof_fmt(num, suffix='B'):
+    for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']:
+        if abs(num) < 1024.0:
+            return f"{num:3.1f}{unit}{suffix}"
+        num /= 1024.0
+    return f"{num:.1f}Yi{suffix}"
+
+
+

From 008899d5130552188d780d034497d79a5171d9dd Mon Sep 17 00:00:00 2001
From: yintong-lu <yintong.lu@intel.com>
Date: Tue, 21 May 2024 03:13:07 -0400
Subject: [PATCH 02/40] add file

Signed-off-by: yintong-lu <yintong.lu@intel.com>
---
 auto_round/export/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/auto_round/export/__init__.py b/auto_round/export/__init__.py
index 9b4384ff..268f8096 100644
--- a/auto_round/export/__init__.py
+++ b/auto_round/export/__init__.py
@@ -15,3 +15,4 @@
 from .register import EXPORT_FORMAT
 from .export_to_autogptq import save_quantized_as_autogptq
 from .export_to_itrex import save_quantized_as_itrex, QuantConfig
+from .export_to_awq import save_quantized_as_autoawq

From 49deb12ba3ff6582efdb75e1b2bb54a78484b445 Mon Sep 17 00:00:00 2001
From: yintong-lu <yintong.lu@intel.com>
Date: Tue, 21 May 2024 04:13:32 -0400
Subject: [PATCH 03/40] update

Signed-off-by: yintong-lu <yintong.lu@intel.com>
---
 auto_round/export/export_to_awq.py          | 16 ++++-----
 auto_round/utils.py                         | 39 +++++++++++++++++++++
 examples/language-modeling/main.py          | 11 ++++++
 examples/language-modeling/requirements.txt |  2 +-
 4 files changed, 58 insertions(+), 10 deletions(-)

diff --git a/auto_round/export/export_to_awq.py b/auto_round/export/export_to_awq.py
index 35374b01..ab74cc56 100644
--- a/auto_round/export/export_to_awq.py
+++ b/auto_round/export/export_to_awq.py
@@ -105,17 +105,11 @@ def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs):
     from awq.modules.linear import WQLinear_GEMM
     from awq.utils.utils import clear_memory
     from awq import AutoAWQForCausalLM
-    import sys
 
     q_linear_module = WQLinear_GEMM
     awq_model = AutoAWQForCausalLM.from_pretrained(model_path)
-    logger.info(f"lyt_debug Approximate memory usage of compressed_model: {sizeof_fmt(sys.getsizeof(compressed_model))}, device: {compressed_model.device}")
-    logger.info(f"lyt_debug Approximate memory usage of model: {sizeof_fmt(sys.getsizeof(model))}, device: {model.device}")
-    try:
-        logger.info(f"lyt_debug Approximate memory usage of awq_model: {sizeof_fmt(sys.getsizeof(awq_model))}")
-    except:
-        logger.info("lyt_debug awq model unable to calc")
     self_modules = awq_model.get_model_layers(compressed_model)
+    del awq_model # release memory
     for i in range(len(self_modules)):
         module = self_modules[i]
         named_linears = get_named_linears(module)
@@ -173,8 +167,6 @@ def forward(self, x):
             return x
 
     # Save model and config files with empty state dict
-    from awq.models._config import AwqConfig
-    
     model.config.quantization_config = quant_config
     model.generation_config.do_sample = True
     model.save_pretrained(save_dir, state_dict=EmptyModule().state_dict())
@@ -211,6 +203,7 @@ def forward(self, x):
         with open(f"{save_dir}/{model_name}.index.json", "w+") as file:
             file.write(json.dumps(index, indent=4))
 
+    # save quantize_config
     with open(join(save_dir, "quantize_config.json"), "w", encoding="utf-8") as f:
         json.dump(quant_config, f, indent=2)
     
@@ -232,3 +225,8 @@ def sizeof_fmt(num, suffix='B'):
 
 
 
+def get_size(model):
+    total = 0
+    for param in model.parameters():
+        total += param.nelement() * param.element_size()
+    return total
\ No newline at end of file
diff --git a/auto_round/utils.py b/auto_round/utils.py
index 0e2b832f..8312248e 100644
--- a/auto_round/utils.py
+++ b/auto_round/utils.py
@@ -743,3 +743,42 @@ def check_memory_availability(device, inputs, weight, org_seqlen, org_bs):
         bs = 1
 
     return False, seqlen, bs
+
+
+def get_named_linears(module):
+    """Get the name, linear_op pairs of a given module.
+
+    Args:
+    module: A module to be searched.
+    """
+    return {name: m for name, m in module.named_modules() if isinstance(m, torch.nn.Linear)}
+
+
+def set_op_by_name(layer, name, new_module):
+    levels = name.split(".")
+    if len(levels) > 1:
+        mod_ = layer
+        for l_idx in range(len(levels) - 1):
+            if levels[l_idx].isdigit():
+                mod_ = mod_[int(levels[l_idx])]
+            else:
+                mod_ = getattr(mod_, levels[l_idx])
+        setattr(mod_, levels[-1], new_module)
+    else:
+        setattr(layer, name, new_module)
+
+
+def get_module_name(model, module_to_find):
+    """Get the name of a given module in a model.
+
+    Args:
+    model: The model.
+    module_to_find: A module to be found.
+
+    Returns:
+    name: The corresponding name of the given module.
+    """
+    for name, module in model.named_modules():
+        if module is module_to_find:
+            return name
+    return None
diff --git a/examples/language-modeling/main.py b/examples/language-modeling/main.py
index 90a83d93..044c6bb7 100644
--- a/examples/language-modeling/main.py
+++ b/examples/language-modeling/main.py
@@ -2,6 +2,8 @@
 import sys
 
 sys.path.insert(0, '../..')
+sys.path.insert(0, '/home/lyt/ChineseLLM_quant/AR_debug/auto-round')
+print(f"lyt_dbeug sys.path： {sys.path}")
 parser = argparse.ArgumentParser()
 import torch
 import os
@@ -141,6 +143,14 @@
     import subprocess
 
 
+    import logging
+
+    logging.basicConfig(level=logging.INFO,
+                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+    logger = logging.getLogger(__name__)
+    logger.info("lyt_debug This is an info message")
+
+
     def get_library_version(library_name):
         try:
             version = subprocess.check_output(['pip', 'show', library_name]).decode().split('\n')[1].split(': ')[1]
@@ -309,6 +319,7 @@ def get_library_version(library_name):
         autoround.save_quantized(f'{export_dir}-xpu', format="itrex_xpu", use_triton=True, inplace=inplace,
                                  compression_dtype=torch.int8, compression_dim=0, use_optimum_format=False,
                                  device="xpu")
+    autoround.save_quantized(f'{export_dir}-awq', format="auto_awq", inplace=inplace, model_path=args.model_name)
     if "cpu" in deployment_device:
         autoround.save_quantized(output_dir=f'{export_dir}-cpu', format='itrex', inplace=inplace)
     if "fake" in deployment_device:
diff --git a/examples/language-modeling/requirements.txt b/examples/language-modeling/requirements.txt
index 9b0df5e0..b6ddd19c 100644
--- a/examples/language-modeling/requirements.txt
+++ b/examples/language-modeling/requirements.txt
@@ -15,4 +15,4 @@ auto-gptq
 openpyxl
 wandb
 py-cpuinfo
-
+autoawq

From b1b729dd3c36a2ef491824c0b8cca1ce3b465a76 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 23 May 2024 02:48:18 +0000
Subject: [PATCH 04/40] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 auto_round/export/export_to_awq.py | 60 ++++++++++++------------------
 1 file changed, 24 insertions(+), 36 deletions(-)

diff --git a/auto_round/export/export_to_awq.py b/auto_round/export/export_to_awq.py
index ab74cc56..895e8df6 100644
--- a/auto_round/export/export_to_awq.py
+++ b/auto_round/export/export_to_awq.py
@@ -18,7 +18,6 @@
 import os
 from os.path import isdir, isfile, join
 from typing import Dict, List, Optional, Union
-import torch.nn as nn
 
 # MIT License
 #
@@ -42,20 +41,20 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 import torch
+import torch.nn as nn
 
 from auto_round.export.register import register_format
 from auto_round.utils import (
-    check_to_quantized, 
-    get_block_names, 
-    get_module, 
+    check_to_quantized,
+    get_block_names,
+    get_module,
     get_module_name,
     get_named_linears,
+    logger,
     set_op_by_name,
-    logger
 )
 
 
-
 @register_format("auto_awq")
 def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs):
     """Export the model to autogptq format to easily leverage cuda kernel."""
@@ -102,14 +101,14 @@ def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs):
     else:
         compressed_model = copy.deepcopy(model.to("cpu"))
 
+    from awq import AutoAWQForCausalLM
     from awq.modules.linear import WQLinear_GEMM
     from awq.utils.utils import clear_memory
-    from awq import AutoAWQForCausalLM
 
     q_linear_module = WQLinear_GEMM
     awq_model = AutoAWQForCausalLM.from_pretrained(model_path)
     self_modules = awq_model.get_model_layers(compressed_model)
-    del awq_model # release memory
+    del awq_model  # release memory
     for i in range(len(self_modules)):
         module = self_modules[i]
         named_linears = get_named_linears(module)
@@ -119,7 +118,7 @@ def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs):
             if not check_to_quantized(info):
                 continue
             info["zp"] = info["zp"].to(torch.float32)
-            scale, zp = info['scale'], info['zp']
+            scale, zp = info["scale"], info["zp"]
             scale = scale.t().contiguous()
             zp = zp.t().contiguous()
             q_linear = q_linear_module.from_linear(
@@ -134,21 +133,23 @@ def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs):
             q_linear.to(next(module.parameters()).device)
             set_op_by_name(module, name, q_linear)
             clear_memory()
-            
+
     quant_config = {
-            "quant_method": 'awq',
-            "zero_point": not sym,
-            "group_size": group_size,
-            "bits": bits,
-            "version": 'gemm',
-            "modules_to_not_convert": None,
-        }
+        "quant_method": "awq",
+        "zero_point": not sym,
+        "group_size": group_size,
+        "bits": bits,
+        "version": "gemm",
+        "modules_to_not_convert": None,
+    }
 
     save_quantized(compressed_model, save_dir=output_dir, quant_config=quant_config)
 
 
-from transformers.modeling_utils import shard_checkpoint
 from safetensors.torch import save_file
+from transformers.modeling_utils import shard_checkpoint
+
+
 def save_quantized(
     model,
     save_dir,
@@ -184,17 +185,13 @@ def forward(self, x):
     model_name = "model.safetensors" if safetensors else "pytorch_model.bin"
 
     # shard checkpoint into chunks (10GB default)
-    shards, index = shard_checkpoint(
-        model.state_dict(), max_shard_size=shard_size, weights_name=model_name
-    )
+    shards, index = shard_checkpoint(model.state_dict(), max_shard_size=shard_size, weights_name=model_name)
 
     for shard_file, shard in shards.items():
         if safetensors:
             # safetensors must be in the same memory, so we duplicate and use contiguous memory
             shard = {k: v.clone().contiguous() for k, v in shard.items()}
-            save_file(
-                shard, os.path.join(save_dir, shard_file), metadata={"format": "pt"}
-            )
+            save_file(shard, os.path.join(save_dir, shard_file), metadata={"format": "pt"})
         else:
             torch.save(shard, os.path.join(save_dir, shard_file))
 
@@ -206,27 +203,18 @@ def forward(self, x):
     # save quantize_config
     with open(join(save_dir, "quantize_config.json"), "w", encoding="utf-8") as f:
         json.dump(quant_config, f, indent=2)
-    
-
-
-
-
 
 
-
-
-
-def sizeof_fmt(num, suffix='B'):
-    for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']:
+def sizeof_fmt(num, suffix="B"):
+    for unit in ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]:
         if abs(num) < 1024.0:
             return f"{num:3.1f}{unit}{suffix}"
         num /= 1024.0
     return f"{num:.1f}Yi{suffix}"
 
 
-
 def get_size(model):
     total = 0
     for param in model.parameters():
         total += param.nelement() * param.element_size()
-    return total
\ No newline at end of file
+    return total

From 9e6cf36e153c28bdee725d0c941910157a417461 Mon Sep 17 00:00:00 2001
From: yintong-lu <yintong.lu@intel.com>
Date: Wed, 22 May 2024 22:58:16 -0400
Subject: [PATCH 05/40] update

Signed-off-by: yintong-lu <yintong.lu@intel.com>
---
 auto_round/export/export_to_awq.py |  2 +-
 examples/language-modeling/main.py | 12 +-----------
 2 files changed, 2 insertions(+), 12 deletions(-)

diff --git a/auto_round/export/export_to_awq.py b/auto_round/export/export_to_awq.py
index 895e8df6..8ce5fdfa 100644
--- a/auto_round/export/export_to_awq.py
+++ b/auto_round/export/export_to_awq.py
@@ -155,7 +155,7 @@ def save_quantized(
     save_dir,
     quant_config,
     safetensors=True,
-    shard_size="5GB",
+    shard_size="10GB",
 ):
     save_dir = save_dir[:-1] if save_dir[-1] == "/" else save_dir
 
diff --git a/examples/language-modeling/main.py b/examples/language-modeling/main.py
index 044c6bb7..6a3c12bf 100644
--- a/examples/language-modeling/main.py
+++ b/examples/language-modeling/main.py
@@ -2,8 +2,6 @@
 import sys
 
 sys.path.insert(0, '../..')
-sys.path.insert(0, '/home/lyt/ChineseLLM_quant/AR_debug/auto-round')
-print(f"lyt_dbeug sys.path： {sys.path}")
 parser = argparse.ArgumentParser()
 import torch
 import os
@@ -143,14 +141,6 @@
     import subprocess
 
 
-    import logging
-
-    logging.basicConfig(level=logging.INFO,
-                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-    logger = logging.getLogger(__name__)
-    logger.info("lyt_debug This is an info message")
-
-
     def get_library_version(library_name):
         try:
             version = subprocess.check_output(['pip', 'show', library_name]).decode().split('\n')[1].split(': ')[1]
@@ -319,13 +309,13 @@ def get_library_version(library_name):
         autoround.save_quantized(f'{export_dir}-xpu', format="itrex_xpu", use_triton=True, inplace=inplace,
                                  compression_dtype=torch.int8, compression_dim=0, use_optimum_format=False,
                                  device="xpu")
-    autoround.save_quantized(f'{export_dir}-awq', format="auto_awq", inplace=inplace, model_path=args.model_name)
     if "cpu" in deployment_device:
         autoround.save_quantized(output_dir=f'{export_dir}-cpu', format='itrex', inplace=inplace)
     if "fake" in deployment_device:
         model = model.to("cpu")
         model.save_pretrained(output_dir)
         tokenizer.save_pretrained(output_dir)
+    autoround.save_quantized(f'{export_dir}-awq', format="auto_awq", inplace=inplace, model_path=args.model_name)
 
     if not args.disable_eval and "fake" in deployment_device:  ##support autogptq real eval later
         excel_name = f"{output_dir}_result.xlsx"

From 86b9b96a8efd67384cf0721986a7e261f3cafb0c Mon Sep 17 00:00:00 2001
From: yintong-lu <yintong.lu@intel.com>
Date: Thu, 23 May 2024 03:42:22 -0400
Subject: [PATCH 06/40] fix import error

Signed-off-by: yintong-lu <yintong.lu@intel.com>
---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index 09874574..cdc39334 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,4 @@ py-cpuinfo
 sentencepiece
 torch
 transformers
+autoawq
\ No newline at end of file

From 10a91e686b8ff5235ad8823dce45c1493a861f2c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 23 May 2024 07:43:20 +0000
Subject: [PATCH 07/40] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index cdc39334..eb65a0c9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,8 @@
 accelerate
 auto-gptq
+autoawq
 datasets
 py-cpuinfo
 sentencepiece
 torch
 transformers
-autoawq
\ No newline at end of file

From 4807994b9a64c3ffb82a048862da6a6343dbad2a Mon Sep 17 00:00:00 2001
From: yintong-lu <yintong.lu@intel.com>
Date: Thu, 23 May 2024 04:05:45 -0400
Subject: [PATCH 08/40] minor change

---
 auto_round/export/export_to_awq.py | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/auto_round/export/export_to_awq.py b/auto_round/export/export_to_awq.py
index 8ce5fdfa..e2c822a0 100644
--- a/auto_round/export/export_to_awq.py
+++ b/auto_round/export/export_to_awq.py
@@ -204,17 +204,3 @@ def forward(self, x):
     with open(join(save_dir, "quantize_config.json"), "w", encoding="utf-8") as f:
         json.dump(quant_config, f, indent=2)
 
-
-def sizeof_fmt(num, suffix="B"):
-    for unit in ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]:
-        if abs(num) < 1024.0:
-            return f"{num:3.1f}{unit}{suffix}"
-        num /= 1024.0
-    return f"{num:.1f}Yi{suffix}"
-
-
-def get_size(model):
-    total = 0
-    for param in model.parameters():
-        total += param.nelement() * param.element_size()
-    return total

From 5822543c09b7d31baca018d3eee2fe1b78694f1a Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 23 May 2024 08:07:32 +0000
Subject: [PATCH 09/40] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 auto_round/export/export_to_awq.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/auto_round/export/export_to_awq.py b/auto_round/export/export_to_awq.py
index e2c822a0..228ad663 100644
--- a/auto_round/export/export_to_awq.py
+++ b/auto_round/export/export_to_awq.py
@@ -203,4 +203,3 @@ def forward(self, x):
     # save quantize_config
     with open(join(save_dir, "quantize_config.json"), "w", encoding="utf-8") as f:
         json.dump(quant_config, f, indent=2)
-

From 991b5e9f949463a4998ee6d4a5d61ee14c90adac Mon Sep 17 00:00:00 2001
From: yintong-lu <yintong.lu@intel.com>
Date: Thu, 23 May 2024 05:06:50 -0400
Subject: [PATCH 10/40] update

Signed-off-by: yintong-lu <yintong.lu@intel.com>
---
 auto_round/export/export_to_awq.py | 91 ++++++++++++++++++++++++------
 auto_round/utils.py                | 38 +------------
 requirements.txt                   |  1 -
 3 files changed, 75 insertions(+), 55 deletions(-)

diff --git a/auto_round/export/export_to_awq.py b/auto_round/export/export_to_awq.py
index 228ad663..58bf0799 100644
--- a/auto_round/export/export_to_awq.py
+++ b/auto_round/export/export_to_awq.py
@@ -20,19 +20,19 @@
 from typing import Dict, List, Optional, Union
 
 # MIT License
-#
-# Copyright (c) 2023 潘其威(William)
-#
+
+# Copyright (c) 2023 MIT HAN Lab
+
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
-#
+
 # The above copyright notice and this permission notice shall be included in all
 # copies or substantial portions of the Software.
-#
+
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -48,10 +48,8 @@
     check_to_quantized,
     get_block_names,
     get_module,
-    get_module_name,
-    get_named_linears,
     logger,
-    set_op_by_name,
+    convert_dtype_torch2str_hf
 )
 
 
@@ -134,14 +132,20 @@ def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs):
             set_op_by_name(module, name, q_linear)
             clear_memory()
 
-    quant_config = {
-        "quant_method": "awq",
-        "zero_point": not sym,
-        "group_size": group_size,
-        "bits": bits,
-        "version": "gemm",
-        "modules_to_not_convert": None,
-    }
+    quant_config = {}
+    quant_config["quant_method"] = "awq"
+    quant_config["modules_to_not_convert"] = None
+    quant_config["version"] = "gemm"
+    quant_config["iters"] = iters
+    quant_config["lr"] = lr
+    quant_config["minmax_lr"] = minmax_lr
+    quant_config["enable_minmax_tuning"] = enable_minmax_tuning
+    quant_config["enable_quanted_input"] = enable_quanted_input
+    quant_config["scale_dtype"] = convert_dtype_torch2str_hf(scale_dtype)
+    quant_config["sym"] = sym
+    quant_config["bits"] = bits
+    quant_config["group_size"] = group_size
+    quant_config["zero_point"] = not sym
 
     save_quantized(compressed_model, save_dir=output_dir, quant_config=quant_config)
 
@@ -168,7 +172,16 @@ def forward(self, x):
             return x
 
     # Save model and config files with empty state dict
-    model.config.quantization_config = quant_config
+    awq_quant_config =  {
+        "quant_method": "awq",
+        "zero_point": quant_config["zero_point"],
+        "group_size": quant_config["group_size"],
+        "bits": quant_config["bits"],
+        "version": "gemm",
+        "modules_to_not_convert": None,
+    }
+
+    model.config.quantization_config = awq_quant_config
     model.generation_config.do_sample = True
     model.save_pretrained(save_dir, state_dict=EmptyModule().state_dict())
 
@@ -200,6 +213,50 @@ def forward(self, x):
         with open(f"{save_dir}/{model_name}.index.json", "w+") as file:
             file.write(json.dumps(index, indent=4))
 
+    q
     # save quantize_config
     with open(join(save_dir, "quantize_config.json"), "w", encoding="utf-8") as f:
         json.dump(quant_config, f, indent=2)
+
+
+
+def get_named_linears(module):
+    """Get the name, linear_op pairs of a given module.
+
+    Args:
+    module: A module to be searched.
+    """
+    return {name: m for name, m in module.named_modules() if isinstance(m, torch.nn.Linear)}
+
+
+def set_op_by_name(layer, name, new_module):
+    levels = name.split(".")
+    if len(levels) > 1:
+        mod_ = layer
+        for l_idx in range(len(levels) - 1):
+            if levels[l_idx].isdigit():
+                mod_ = mod_[int(levels[l_idx])]
+            else:
+                mod_ = getattr(mod_, levels[l_idx])
+        setattr(mod_, levels[-1], new_module)
+    else:
+        setattr(layer, name, new_module)
+
+
+def get_module_name(model, module_to_find):
+    """Get the name of a given module in a model.
+
+    Args:
+    model: The model.
+    module_to_find: A module to be found.
+
+    Returns:
+    name: The corresponding name of the given module.
+    """
+    for name, module in model.named_modules():
+        if module is module_to_find:
+            return name
+    return None
+
+
+
diff --git a/auto_round/utils.py b/auto_round/utils.py
index 8312248e..922be595 100644
--- a/auto_round/utils.py
+++ b/auto_round/utils.py
@@ -70,6 +70,7 @@ def __call__(self, *args, **kwargs):
 
 auto_gptq = LazyImport("auto_gptq")
 htcore = LazyImport("habana_frameworks.torch.core")
+awq = LazyImport("autoawq")
 
 
 def is_optimum_habana_available():
@@ -745,40 +746,3 @@ def check_memory_availability(device, inputs, weight, org_seqlen, org_bs):
     return False, seqlen, bs
 
 
-def get_named_linears(module):
-    """Get the name, linear_op pairs of a given module.
-
-    Args:
-    module: A module to be searched.
-    """
-    return {name: m for name, m in module.named_modules() if isinstance(m, torch.nn.Linear)}
-
-
-def set_op_by_name(layer, name, new_module):
-    levels = name.split(".")
-    if len(levels) > 1:
-        mod_ = layer
-        for l_idx in range(len(levels) - 1):
-            if levels[l_idx].isdigit():
-                mod_ = mod_[int(levels[l_idx])]
-            else:
-                mod_ = getattr(mod_, levels[l_idx])
-        setattr(mod_, levels[-1], new_module)
-    else:
-        setattr(layer, name, new_module)
-
-
-def get_module_name(model, module_to_find):
-    """Get the name of a given module in a model.
-
-    Args:
-    model: The model.
-    module_to_find: A module to be found.
-
-    Returns:
-    name: The corresponding name of the given module.
-    """
-    for name, module in model.named_modules():
-        if module is module_to_find:
-            return name
-    return None
diff --git a/requirements.txt b/requirements.txt
index eb65a0c9..09874574 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,5 @@
 accelerate
 auto-gptq
-autoawq
 datasets
 py-cpuinfo
 sentencepiece

From 4051f55c557b112741afbb0389a80f627ec80c4e Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 23 May 2024 09:07:49 +0000
Subject: [PATCH 11/40] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 auto_round/export/export_to_awq.py | 38 +++++++++++-------------------
 auto_round/utils.py                |  2 --
 2 files changed, 14 insertions(+), 26 deletions(-)

diff --git a/auto_round/export/export_to_awq.py b/auto_round/export/export_to_awq.py
index 58bf0799..b4bda892 100644
--- a/auto_round/export/export_to_awq.py
+++ b/auto_round/export/export_to_awq.py
@@ -19,6 +19,19 @@
 from os.path import isdir, isfile, join
 from typing import Dict, List, Optional, Union
 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+import torch
+import torch.nn as nn
+
+from auto_round.export.register import register_format
+from auto_round.utils import check_to_quantized, convert_dtype_torch2str_hf, get_block_names, get_module, logger
+
 # MIT License
 
 # Copyright (c) 2023 MIT HAN Lab
@@ -33,25 +46,6 @@
 # The above copyright notice and this permission notice shall be included in all
 # copies or substantial portions of the Software.
 
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-import torch
-import torch.nn as nn
-
-from auto_round.export.register import register_format
-from auto_round.utils import (
-    check_to_quantized,
-    get_block_names,
-    get_module,
-    logger,
-    convert_dtype_torch2str_hf
-)
-
 
 @register_format("auto_awq")
 def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs):
@@ -172,7 +166,7 @@ def forward(self, x):
             return x
 
     # Save model and config files with empty state dict
-    awq_quant_config =  {
+    awq_quant_config = {
         "quant_method": "awq",
         "zero_point": quant_config["zero_point"],
         "group_size": quant_config["group_size"],
@@ -219,7 +213,6 @@ def forward(self, x):
         json.dump(quant_config, f, indent=2)
 
 
-
 def get_named_linears(module):
     """Get the name, linear_op pairs of a given module.
 
@@ -257,6 +250,3 @@ def get_module_name(model, module_to_find):
         if module is module_to_find:
             return name
     return None
-
-
-
diff --git a/auto_round/utils.py b/auto_round/utils.py
index 922be595..f4602627 100644
--- a/auto_round/utils.py
+++ b/auto_round/utils.py
@@ -744,5 +744,3 @@ def check_memory_availability(device, inputs, weight, org_seqlen, org_bs):
         bs = 1
 
     return False, seqlen, bs
-
-

From f9668121a254e5eb21c0e9f1a27d8c5d12c51c11 Mon Sep 17 00:00:00 2001
From: yintong-lu <yintong.lu@intel.com>
Date: Thu, 23 May 2024 05:11:58 -0400
Subject: [PATCH 12/40] fix pylint error

Signed-off-by: yintong-lu <yintong.lu@intel.com>
---
 auto_round/export/export_to_awq.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/auto_round/export/export_to_awq.py b/auto_round/export/export_to_awq.py
index b4bda892..0217621d 100644
--- a/auto_round/export/export_to_awq.py
+++ b/auto_round/export/export_to_awq.py
@@ -93,9 +93,9 @@ def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs):
     else:
         compressed_model = copy.deepcopy(model.to("cpu"))
 
-    from awq import AutoAWQForCausalLM
-    from awq.modules.linear import WQLinear_GEMM
-    from awq.utils.utils import clear_memory
+    from awq import AutoAWQForCausalLM # pylint: disable=E0401
+    from awq.modules.linear import WQLinear_GEMM # pylint: disable=E0401
+    from awq.utils.utils import clear_memory # pylint: disable=E0401
 
     q_linear_module = WQLinear_GEMM
     awq_model = AutoAWQForCausalLM.from_pretrained(model_path)
@@ -207,7 +207,6 @@ def forward(self, x):
         with open(f"{save_dir}/{model_name}.index.json", "w+") as file:
             file.write(json.dumps(index, indent=4))
 
-    q
     # save quantize_config
     with open(join(save_dir, "quantize_config.json"), "w", encoding="utf-8") as f:
         json.dump(quant_config, f, indent=2)

From 5a1188c7ad9bf1028b07f365c1b41f3917840e08 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 23 May 2024 09:12:44 +0000
Subject: [PATCH 13/40] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 auto_round/export/export_to_awq.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/auto_round/export/export_to_awq.py b/auto_round/export/export_to_awq.py
index 0217621d..ac4c66ff 100644
--- a/auto_round/export/export_to_awq.py
+++ b/auto_round/export/export_to_awq.py
@@ -93,9 +93,9 @@ def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs):
     else:
         compressed_model = copy.deepcopy(model.to("cpu"))
 
-    from awq import AutoAWQForCausalLM # pylint: disable=E0401
-    from awq.modules.linear import WQLinear_GEMM # pylint: disable=E0401
-    from awq.utils.utils import clear_memory # pylint: disable=E0401
+    from awq import AutoAWQForCausalLM  # pylint: disable=E0401
+    from awq.modules.linear import WQLinear_GEMM  # pylint: disable=E0401
+    from awq.utils.utils import clear_memory  # pylint: disable=E0401
 
     q_linear_module = WQLinear_GEMM
     awq_model = AutoAWQForCausalLM.from_pretrained(model_path)

From 9c4f73b3a7182f88afede6e20ca81c6732bf190a Mon Sep 17 00:00:00 2001
From: yintong-lu <yintong.lu@intel.com>
Date: Thu, 23 May 2024 23:20:48 -0400
Subject: [PATCH 14/40] minor fix

Signed-off-by: yintong-lu <yintong.lu@intel.com>
---
 examples/language-modeling/main.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/language-modeling/main.py b/examples/language-modeling/main.py
index 43eb91d0..36a44f19 100644
--- a/examples/language-modeling/main.py
+++ b/examples/language-modeling/main.py
@@ -320,7 +320,8 @@ def get_library_version(library_name):
         model = model.to("cpu")
         model.save_pretrained(output_dir)
         tokenizer.save_pretrained(output_dir)
-    autoround.save_quantized(f'{export_dir}-awq', format="auto_awq", inplace=inplace, model_path=args.model_name)
+    if args.bits == 4:
+        autoround.save_quantized(f'{export_dir}-awq', format="auto_awq", inplace=inplace, model_path=args.model_name)
 
     if not args.disable_eval and "fake" in deployment_device:  ##support autogptq real eval later
         excel_name = f"{output_dir}_result.xlsx"

From 05d6e1db8350c0eed79cd21abff7240342fe0cd9 Mon Sep 17 00:00:00 2001
From: yintong-lu <yintong.lu@intel.com>
Date: Thu, 23 May 2024 23:51:01 -0400
Subject: [PATCH 15/40] fix pylint issue

Signed-off-by: yintong-lu <yintong.lu@intel.com>
---
 .../export/export_to_autoround/autoround_quantizer.py       | 6 +++---
 .../export/export_to_autoround/export_to_autoround.py       | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/auto_round/export/export_to_autoround/autoround_quantizer.py b/auto_round/export/export_to_autoround/autoround_quantizer.py
index 9a265104..dfb2b5af 100644
--- a/auto_round/export/export_to_autoround/autoround_quantizer.py
+++ b/auto_round/export/export_to_autoround/autoround_quantizer.py
@@ -361,14 +361,14 @@ def _replace_by_quant_layers(self, module: nn.Module, layer_configs, backend):
                 in_features = layer.weight.shape[0]
                 out_features = layer.weight.shape[1]
             bias = layer.bias is not None
-            new_layer = QuantLinear(
+            new_layer = QuantLinear( # pylint: disable=E1123
                 bits,
                 group_size,
                 in_features,
                 out_features,
                 bias,
-                weight_dtype=layer.weight.dtype,  # pylint: disable=E1123
-            )
+                weight_dtype=layer.weight.dtype,  
+            ) 
 
             new_layer.device = device
             set_module(module, layer_name, new_layer)
diff --git a/auto_round/export/export_to_autoround/export_to_autoround.py b/auto_round/export/export_to_autoround/export_to_autoround.py
index fab55a26..c20967d8 100644
--- a/auto_round/export/export_to_autoround/export_to_autoround.py
+++ b/auto_round/export/export_to_autoround/export_to_autoround.py
@@ -140,9 +140,9 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="gptq:exllamav
             out_features = layer.weight.shape[1]
         bias = layer.bias is not None and torch.any(layer.bias)
 
-        new_layer = QuantLinear(
-            bits, group_size, in_features, out_features, bias, weight_dtype=layer.weight.dtype  ##pylint: disable=E1123
-        )
+        new_layer = QuantLinear(  # pylint: disable=E1123
+            bits, group_size, in_features, out_features, bias, weight_dtype=layer.weight.dtype
+        ) 
 
         new_layer.device = device
         set_module(model, name, new_layer)

From 72b48b1d2eb23ac5dfcbfada55bd7f4a574c5d7c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 24 May 2024 03:51:44 +0000
Subject: [PATCH 16/40] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../export/export_to_autoround/autoround_quantizer.py       | 6 +++---
 .../export/export_to_autoround/export_to_autoround.py       | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/auto_round/export/export_to_autoround/autoround_quantizer.py b/auto_round/export/export_to_autoround/autoround_quantizer.py
index dfb2b5af..0b77fb6a 100644
--- a/auto_round/export/export_to_autoround/autoround_quantizer.py
+++ b/auto_round/export/export_to_autoround/autoround_quantizer.py
@@ -361,14 +361,14 @@ def _replace_by_quant_layers(self, module: nn.Module, layer_configs, backend):
                 in_features = layer.weight.shape[0]
                 out_features = layer.weight.shape[1]
             bias = layer.bias is not None
-            new_layer = QuantLinear( # pylint: disable=E1123
+            new_layer = QuantLinear(  # pylint: disable=E1123
                 bits,
                 group_size,
                 in_features,
                 out_features,
                 bias,
-                weight_dtype=layer.weight.dtype,  
-            ) 
+                weight_dtype=layer.weight.dtype,
+            )
 
             new_layer.device = device
             set_module(module, layer_name, new_layer)
diff --git a/auto_round/export/export_to_autoround/export_to_autoround.py b/auto_round/export/export_to_autoround/export_to_autoround.py
index c20967d8..1016d1eb 100644
--- a/auto_round/export/export_to_autoround/export_to_autoround.py
+++ b/auto_round/export/export_to_autoround/export_to_autoround.py
@@ -142,7 +142,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="gptq:exllamav
 
         new_layer = QuantLinear(  # pylint: disable=E1123
             bits, group_size, in_features, out_features, bias, weight_dtype=layer.weight.dtype
-        ) 
+        )
 
         new_layer.device = device
         set_module(model, name, new_layer)

From 40e8ce7e348e107251c57d18110a2fc0530f4def Mon Sep 17 00:00:00 2001
From: yintong-lu <yintong.lu@intel.com>
Date: Fri, 24 May 2024 02:08:09 -0400
Subject: [PATCH 17/40] update import of awq

Signed-off-by: yintong-lu <yintong.lu@intel.com>
---
 auto_round/autoround.py                     | 6 ++++++
 auto_round/utils.py                         | 1 -
 examples/language-modeling/main.py          | 4 ++--
 examples/language-modeling/requirements.txt | 3 +--
 4 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/auto_round/autoround.py b/auto_round/autoround.py
index dcb08bfb..9be4ded5 100644
--- a/auto_round/autoround.py
+++ b/auto_round/autoround.py
@@ -1350,6 +1350,12 @@ def save_quantized(self, output_dir=None, format="auto_gptq", inplace=True, **kw
         if format not in EXPORT_FORMAT:
             logger.error(f"export format only supports {EXPORT_FORMAT.keys()}")
             exit()
+        if format == "auto_awq":
+            try:
+                import awq
+            except:
+                logger.error("autoawq is required. Please install it to support auto_awq format.")
+                return 
         save_quantized_as_format = EXPORT_FORMAT.get(format)
         compressed_model = save_quantized_as_format(  ##TODO refine the code
             output_dir,
diff --git a/auto_round/utils.py b/auto_round/utils.py
index ccd8c7d0..c830b8d8 100644
--- a/auto_round/utils.py
+++ b/auto_round/utils.py
@@ -70,7 +70,6 @@ def __call__(self, *args, **kwargs):
 
 auto_gptq = LazyImport("auto_gptq")
 htcore = LazyImport("habana_frameworks.torch.core")
-awq = LazyImport("autoawq")
 
 
 def is_optimum_habana_available():
diff --git a/examples/language-modeling/main.py b/examples/language-modeling/main.py
index 36a44f19..a4116c77 100644
--- a/examples/language-modeling/main.py
+++ b/examples/language-modeling/main.py
@@ -310,6 +310,8 @@ def get_library_version(library_name):
             autoround.save_quantized(f'{export_dir}-gpu', format="autoround", use_triton=True, inplace=inplace)
         else:
             autoround.save_quantized(f'{export_dir}-gpu', format="auto_gptq", use_triton=True, inplace=inplace)
+        if args.bits == 4:
+            autoround.save_quantized(f'{export_dir}-awq', format="auto_awq", inplace=inplace, model_path=args.model_name)
     if 'xpu' in deployment_device:
         autoround.save_quantized(f'{export_dir}-xpu', format="itrex_xpu", use_triton=True, inplace=inplace,
                                  compression_dtype=torch.int8, compression_dim=0, use_optimum_format=False,
@@ -320,8 +322,6 @@ def get_library_version(library_name):
         model = model.to("cpu")
         model.save_pretrained(output_dir)
         tokenizer.save_pretrained(output_dir)
-    if args.bits == 4:
-        autoround.save_quantized(f'{export_dir}-awq', format="auto_awq", inplace=inplace, model_path=args.model_name)
 
     if not args.disable_eval and "fake" in deployment_device:  ##support autogptq real eval later
         excel_name = f"{output_dir}_result.xlsx"
diff --git a/examples/language-modeling/requirements.txt b/examples/language-modeling/requirements.txt
index b6ddd19c..04b21595 100644
--- a/examples/language-modeling/requirements.txt
+++ b/examples/language-modeling/requirements.txt
@@ -14,5 +14,4 @@ protobuf
 auto-gptq
 openpyxl
 wandb
-py-cpuinfo
-autoawq
+py-cpuinfo
\ No newline at end of file

From 55b05e661dd18f634908d15c020733d66a1050b1 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 24 May 2024 06:08:51 +0000
Subject: [PATCH 18/40] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 auto_round/autoround.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/auto_round/autoround.py b/auto_round/autoround.py
index 9be4ded5..bb1e060e 100644
--- a/auto_round/autoround.py
+++ b/auto_round/autoround.py
@@ -1355,7 +1355,7 @@ def save_quantized(self, output_dir=None, format="auto_gptq", inplace=True, **kw
                 import awq
             except:
                 logger.error("autoawq is required. Please install it to support auto_awq format.")
-                return 
+                return
         save_quantized_as_format = EXPORT_FORMAT.get(format)
         compressed_model = save_quantized_as_format(  ##TODO refine the code
             output_dir,

From 07c0f184fbd3ccdf8a7da9cc81281b9fbb26b44b Mon Sep 17 00:00:00 2001
From: yintong-lu <yintong.lu@intel.com>
Date: Fri, 24 May 2024 02:19:56 -0400
Subject: [PATCH 19/40] fix import error

Signed-off-by: yintong-lu <yintong.lu@intel.com>
---
 auto_round/autoround.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/auto_round/autoround.py b/auto_round/autoround.py
index bb1e060e..fa143bc4 100644
--- a/auto_round/autoround.py
+++ b/auto_round/autoround.py
@@ -1352,7 +1352,7 @@ def save_quantized(self, output_dir=None, format="auto_gptq", inplace=True, **kw
             exit()
         if format == "auto_awq":
             try:
-                import awq
+                import awq # pylint: disable=E0401
             except:
                 logger.error("autoawq is required. Please install it to support auto_awq format.")
                 return

From 8184d8479ff9caccc9036e221271b57c9164189a Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 24 May 2024 06:20:38 +0000
Subject: [PATCH 20/40] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 auto_round/autoround.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/auto_round/autoround.py b/auto_round/autoround.py
index fa143bc4..bff90ba1 100644
--- a/auto_round/autoround.py
+++ b/auto_round/autoround.py
@@ -1352,7 +1352,7 @@ def save_quantized(self, output_dir=None, format="auto_gptq", inplace=True, **kw
             exit()
         if format == "auto_awq":
             try:
-                import awq # pylint: disable=E0401
+                import awq  # pylint: disable=E0401
             except:
                 logger.error("autoawq is required. Please install it to support auto_awq format.")
                 return

From 02687acacec41acd0fc836f316f138cd2503e683 Mon Sep 17 00:00:00 2001
From: yintong-lu <yintong.lu@intel.com>
Date: Sun, 26 May 2024 22:01:54 -0400
Subject: [PATCH 21/40] modify code

Signed-off-by: yintong-lu <yintong.lu@intel.com>
---
 auto_round/autoround.py            |  6 ------
 auto_round/export/export_to_awq.py | 21 ++++++++++++---------
 examples/language-modeling/main.py |  4 ++--
 3 files changed, 14 insertions(+), 17 deletions(-)

diff --git a/auto_round/autoround.py b/auto_round/autoround.py
index bff90ba1..dcb08bfb 100644
--- a/auto_round/autoround.py
+++ b/auto_round/autoround.py
@@ -1350,12 +1350,6 @@ def save_quantized(self, output_dir=None, format="auto_gptq", inplace=True, **kw
         if format not in EXPORT_FORMAT:
             logger.error(f"export format only supports {EXPORT_FORMAT.keys()}")
             exit()
-        if format == "auto_awq":
-            try:
-                import awq  # pylint: disable=E0401
-            except:
-                logger.error("autoawq is required. Please install it to support auto_awq format.")
-                return
         save_quantized_as_format = EXPORT_FORMAT.get(format)
         compressed_model = save_quantized_as_format(  ##TODO refine the code
             output_dir,
diff --git a/auto_round/export/export_to_awq.py b/auto_round/export/export_to_awq.py
index ac4c66ff..0c3753ba 100644
--- a/auto_round/export/export_to_awq.py
+++ b/auto_round/export/export_to_awq.py
@@ -48,8 +48,18 @@
 
 
 @register_format("auto_awq")
-def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs):
+def save_quantized_as_autoawq(output_dir, model_path, **kwargs):
     """Export the model to autogptq format to easily leverage cuda kernel."""
+
+    try:
+        from awq import AutoAWQForCausalLM  # pylint: disable=E0401
+        from awq.modules.linear import WQLinear_GEMM  # pylint: disable=E0401
+        from awq.utils.utils import clear_memory  # pylint: disable=E0401
+    except:
+        logger.error("autoawq is required. Please install it to support auto_awq format.")
+        return
+
+
     model = kwargs["model"]
     weight_config = kwargs["weight_config"]
     sym = kwargs["sym"]
@@ -88,14 +98,7 @@ def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs):
     if all_to_quantized:
         modules_in_block_to_quantize = None
 
-    if inplace:
-        compressed_model = model.to("cpu")
-    else:
-        compressed_model = copy.deepcopy(model.to("cpu"))
-
-    from awq import AutoAWQForCausalLM  # pylint: disable=E0401
-    from awq.modules.linear import WQLinear_GEMM  # pylint: disable=E0401
-    from awq.utils.utils import clear_memory  # pylint: disable=E0401
+    compressed_model = copy.deepcopy(model.to("cpu"))
 
     q_linear_module = WQLinear_GEMM
     awq_model = AutoAWQForCausalLM.from_pretrained(model_path)
diff --git a/examples/language-modeling/main.py b/examples/language-modeling/main.py
index a4116c77..244eb593 100644
--- a/examples/language-modeling/main.py
+++ b/examples/language-modeling/main.py
@@ -310,8 +310,8 @@ def get_library_version(library_name):
             autoround.save_quantized(f'{export_dir}-gpu', format="autoround", use_triton=True, inplace=inplace)
         else:
             autoround.save_quantized(f'{export_dir}-gpu', format="auto_gptq", use_triton=True, inplace=inplace)
-        if args.bits == 4:
-            autoround.save_quantized(f'{export_dir}-awq', format="auto_awq", inplace=inplace, model_path=args.model_name)
+    if deployment_device == ['gpu'] and args.bits == 4:
+        autoround.save_quantized(f'{export_dir}-awq', format="auto_awq", model_path=args.model_name)
     if 'xpu' in deployment_device:
         autoround.save_quantized(f'{export_dir}-xpu', format="itrex_xpu", use_triton=True, inplace=inplace,
                                  compression_dtype=torch.int8, compression_dim=0, use_optimum_format=False,

From 6d6b8b14bf3cf0694a20f74466ddc4f617ea0393 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 27 May 2024 02:06:18 +0000
Subject: [PATCH 22/40] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 auto_round/export/export_to_awq.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/auto_round/export/export_to_awq.py b/auto_round/export/export_to_awq.py
index 0c3753ba..03e3a722 100644
--- a/auto_round/export/export_to_awq.py
+++ b/auto_round/export/export_to_awq.py
@@ -59,7 +59,6 @@ def save_quantized_as_autoawq(output_dir, model_path, **kwargs):
         logger.error("autoawq is required. Please install it to support auto_awq format.")
         return
 
-
     model = kwargs["model"]
     weight_config = kwargs["weight_config"]
     sym = kwargs["sym"]

From e91fb2057ab44b934b63030650d6073b4a2754e7 Mon Sep 17 00:00:00 2001
From: yintong-lu <yintong.lu@intel.com>
Date: Sun, 26 May 2024 22:20:09 -0400
Subject: [PATCH 23/40] fix doc typo

Signed-off-by: yintong-lu <yintong.lu@intel.com>
---
 docs/Meta-Llama-3-8B-Instruct-acc.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/Meta-Llama-3-8B-Instruct-acc.md b/docs/Meta-Llama-3-8B-Instruct-acc.md
index 24db9992..448c4e47 100644
--- a/docs/Meta-Llama-3-8B-Instruct-acc.md
+++ b/docs/Meta-Llama-3-8B-Instruct-acc.md
@@ -9,7 +9,7 @@ for evaluation with quantized lm-head
 ```bash
 git clone https://github.com/intel/auto-round
 cd auto-round/examples/language-modeling
-python3 eval_042/evluation.py --model_name "./" --eval_bs 16
+python3 eval_042/evaluation.py --model_name "./" --eval_bs 16
 ```
 
 | Metric           | **BF16** | w4g128 w/o lm-head | w4g128 with lm-head |

From a775b21c09281c9b4442ff2bdaa2927b568972dd Mon Sep 17 00:00:00 2001
From: yintong-lu <yintong.lu@intel.com>
Date: Mon, 27 May 2024 05:33:37 -0400
Subject: [PATCH 24/40] fix mixtral issue

Signed-off-by: yintong-lu <yintong.lu@intel.com>
---
 auto_round/export/export_to_awq.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/auto_round/export/export_to_awq.py b/auto_round/export/export_to_awq.py
index 03e3a722..bb095f25 100644
--- a/auto_round/export/export_to_awq.py
+++ b/auto_round/export/export_to_awq.py
@@ -111,7 +111,6 @@ def save_quantized_as_autoawq(output_dir, model_path, **kwargs):
             info = weight_config[key]
             if not check_to_quantized(info):
                 continue
-            info["zp"] = info["zp"].to(torch.float32)
             scale, zp = info["scale"], info["zp"]
             scale = scale.t().contiguous()
             zp = zp.t().contiguous()
@@ -131,6 +130,8 @@ def save_quantized_as_autoawq(output_dir, model_path, **kwargs):
     quant_config = {}
     quant_config["quant_method"] = "awq"
     quant_config["modules_to_not_convert"] = None
+    if compressed_model.config.model_type == 'mixtral':
+       quant_config["modules_to_not_convert"] = ["gate"] 
     quant_config["version"] = "gemm"
     quant_config["iters"] = iters
     quant_config["lr"] = lr
@@ -174,9 +175,8 @@ def forward(self, x):
         "group_size": quant_config["group_size"],
         "bits": quant_config["bits"],
         "version": "gemm",
-        "modules_to_not_convert": None,
+        "modules_to_not_convert": quant_config["modules_to_not_convert"],
     }
-
     model.config.quantization_config = awq_quant_config
     model.generation_config.do_sample = True
     model.save_pretrained(save_dir, state_dict=EmptyModule().state_dict())

From fc39b70fdaf353803372f689ac93a9877c885879 Mon Sep 17 00:00:00 2001
From: yintong-lu <yintong.lu@intel.com>
Date: Tue, 28 May 2024 01:36:30 -0400
Subject: [PATCH 25/40] mv awq to autoround format, fix mixtral issue and
 follow autoround format keys

Signed-off-by: yintong-lu <yintong.lu@intel.com>
---
 .../export_to_autoround.py                    | 190 +++++++++---------
 auto_round/export/export_to_awq.py            |  45 +----
 examples/language-modeling/main.py            |   2 -
 3 files changed, 108 insertions(+), 129 deletions(-)

diff --git a/auto_round/export/export_to_autoround/export_to_autoround.py b/auto_round/export/export_to_autoround/export_to_autoround.py
index 34d0ac13..66999a85 100644
--- a/auto_round/export/export_to_autoround/export_to_autoround.py
+++ b/auto_round/export/export_to_autoround/export_to_autoround.py
@@ -94,99 +94,107 @@ def get_autogptq_backend_config(backend, bits=4):
 
 
 @register_format("autoround")
-def save_quantized_as_autoround(output_dir, inplace=True, backend="gptq:exllamav2", **kwargs):
-    from auto_gptq.utils.import_utils import dynamically_import_QuantLinear
-
-    model = kwargs["model"]
-    if not inplace:
-        model = copy.deepcopy(model.to("cpu"))
-    layer_names_in_block = get_layer_names_in_block(model)
-
-    weight_config = kwargs["weight_config"]
-    for name in weight_config.keys():
-
-        config = kwargs["weight_config"][name]
-        if config["data_type"] != "int" and config["bits"] >= 16:
-            continue
-        logger.info(f"packing {name}")
-
-        bits = config["bits"]
-        group_size = config["group_size"]
-        use_triton, disable_exllamav1, disable_exllamav2, use_qigen, disable_marlin = get_autogptq_backend_config(
-            backend, bits
-        )
-
-        layer = get_module(model, name)
-        device = "cpu"
-        QuantLinear = dynamically_import_QuantLinear(
-            use_triton=use_triton,
-            desc_act=False,
-            group_size=group_size,
-            bits=bits,
-            disable_exllama=disable_exllamav1,
-            disable_exllamav2=disable_exllamav2,
-            use_qigen=use_qigen,
-            disable_marlin=disable_marlin,
-        )
-
-        if isinstance(layer, nn.Linear):
-            in_features = layer.in_features
-            out_features = layer.out_features
-        elif isinstance(layer, nn.Conv2d):
-            in_features = layer.in_channels
-            out_features = layer.out_channels
-        elif isinstance(layer, transformers.pytorch_utils.Conv1D):
-            in_features = layer.weight.shape[0]
-            out_features = layer.weight.shape[1]
-        bias = layer.bias is not None and torch.any(layer.bias)
-
-        new_layer = QuantLinear(  ##pylint: disable=E1123
-            bits, group_size, in_features, out_features, bias, weight_dtype=layer.weight.dtype
-        )
-
-        new_layer.device = device
-        set_module(model, name, new_layer)
-        qlayer = new_layer
-        scale = weight_config[name]["scale"]
-        zero = weight_config[name]["zp"]
-        # so far can only pack layer on CPU
-        qlayer.to("cpu")
-        layer, scale, zero = layer.to("cpu"), scale.to("cpu"), zero.to("cpu")
-        qlayer.pack(layer, scale, zero, None)
-        qlayer.to(device)
-    quantization_config = kwargs["serialization_dict"]
-    quantization_config["quant_method"] = "intel/auto-round"
-    quantization_config["backend"] = backend
-    extra_config = {}
-    for layer_name in weight_config:
-        if weight_config[layer_name]["data_type"] != "int" and weight_config[layer_name]["bits"] >= 16:
-            continue
-        if layer_name not in layer_names_in_block:
-            extra_config[layer_name] = {}
-            extra_config[layer_name]["bits"] = weight_config[layer_name]["bits"]
-            extra_config[layer_name]["data_type"] = weight_config[layer_name]["data_type"]
-            extra_config[layer_name]["group_size"] = weight_config[layer_name]["group_size"]
-            extra_config[layer_name]["sym"] = weight_config[layer_name]["sym"]
-        else:
-            neq_keys = check_neq_config(
-                weight_config[layer_name],
-                data_type=quantization_config["data_type"],
-                bits=quantization_config["bits"],
-                group_size=quantization_config["group_size"],
-                sym=quantization_config["sym"],
+def save_quantized_as_autoround(output_dir, inplace=True, backend="gptq:exllamav2", model_path="", **kwargs):
+    if "awq" not in backend:
+        from auto_gptq.utils.import_utils import dynamically_import_QuantLinear
+
+        model = kwargs["model"]
+        if not inplace:
+            model = copy.deepcopy(model.to("cpu"))
+        layer_names_in_block = get_layer_names_in_block(model)
+
+        weight_config = kwargs["weight_config"]
+        for name in weight_config.keys():
+
+            config = kwargs["weight_config"][name]
+            if config["data_type"] != "int" and config["bits"] >= 16:
+                continue
+            logger.info(f"packing {name}")
+
+            bits = config["bits"]
+            group_size = config["group_size"]
+            use_triton, disable_exllamav1, disable_exllamav2, use_qigen, disable_marlin = get_autogptq_backend_config(
+                backend, bits
             )
-            if len(neq_keys) > 0:
+
+            layer = get_module(model, name)
+            device = "cpu"
+            QuantLinear = dynamically_import_QuantLinear(
+                use_triton=use_triton,
+                desc_act=False,
+                group_size=group_size,
+                bits=bits,
+                disable_exllama=disable_exllamav1,
+                disable_exllamav2=disable_exllamav2,
+                use_qigen=use_qigen,
+                disable_marlin=disable_marlin,
+            )
+
+            if isinstance(layer, nn.Linear):
+                in_features = layer.in_features
+                out_features = layer.out_features
+            elif isinstance(layer, nn.Conv2d):
+                in_features = layer.in_channels
+                out_features = layer.out_channels
+            elif isinstance(layer, transformers.pytorch_utils.Conv1D):
+                in_features = layer.weight.shape[0]
+                out_features = layer.weight.shape[1]
+            bias = layer.bias is not None and torch.any(layer.bias)
+
+            new_layer = QuantLinear(  # pylint: disable=E1123
+                bits, group_size, in_features, out_features, bias, weight_dtype=layer.weight.dtype
+            )
+
+            new_layer.device = device
+            set_module(model, name, new_layer)
+            qlayer = new_layer
+            scale = weight_config[name]["scale"]
+            zero = weight_config[name]["zp"]
+            # so far can only pack layer on CPU
+            qlayer.to("cpu")
+            layer, scale, zero = layer.to("cpu"), scale.to("cpu"), zero.to("cpu")
+            qlayer.pack(layer, scale, zero, None)
+            qlayer.to(device)
+        quantization_config = kwargs["serialization_dict"]
+        quantization_config["quant_method"] = "intel/auto-round"
+        quantization_config["backend"] = backend
+        extra_config = {}
+        for layer_name in weight_config:
+            if weight_config[layer_name]["data_type"] != "int" and weight_config[layer_name]["bits"] >= 16:
+                continue
+            if layer_name not in layer_names_in_block:
                 extra_config[layer_name] = {}
-            for key in neq_keys:
-                extra_config[layer_name][key] = weight_config[layer_name][key]
-    if len(extra_config) > 0:
-        quantization_config["extra_config"] = extra_config
-    if hasattr(model, "config"):
-        model.config.quantization_config = quantization_config
-    tokenizer = kwargs["tokenizer"]
-    if tokenizer is not None:
-        tokenizer.save_pretrained(output_dir)
-    save(model, output_dir)
+                extra_config[layer_name]["bits"] = weight_config[layer_name]["bits"]
+                extra_config[layer_name]["data_type"] = weight_config[layer_name]["data_type"]
+                extra_config[layer_name]["group_size"] = weight_config[layer_name]["group_size"]
+                extra_config[layer_name]["sym"] = weight_config[layer_name]["sym"]
+            else:
+                neq_keys = check_neq_config(
+                    weight_config[layer_name],
+                    data_type=quantization_config["data_type"],
+                    bits=quantization_config["bits"],
+                    group_size=quantization_config["group_size"],
+                    sym=quantization_config["sym"],
+                )
+                if len(neq_keys) > 0:
+                    extra_config[layer_name] = {}
+                for key in neq_keys:
+                    extra_config[layer_name][key] = weight_config[layer_name][key]
+        if len(extra_config) > 0:
+            quantization_config["extra_config"] = extra_config
+        if hasattr(model, "config"):
+            model.config.quantization_config = quantization_config
+        tokenizer = kwargs["tokenizer"]
+        if tokenizer is not None:
+            tokenizer.save_pretrained(output_dir)
+        save(model, output_dir)
+    else:
+        if not model_path:
+            logger.error("Please provide model path for awq format.")
+            return
+        from ..export_to_awq import save_quantized_as_autoawq
+        save_quantized_as_autoawq(output_dir=output_dir, model_path=model_path, kwargs=kwargs)
+
 
 
 def save(model: nn.Module, save_dir: str, max_shard_size: str = "10GB", safe_serialization: bool = True):
diff --git a/auto_round/export/export_to_awq.py b/auto_round/export/export_to_awq.py
index bb095f25..aab878f1 100644
--- a/auto_round/export/export_to_awq.py
+++ b/auto_round/export/export_to_awq.py
@@ -48,7 +48,7 @@
 
 
 @register_format("auto_awq")
-def save_quantized_as_autoawq(output_dir, model_path, **kwargs):
+def save_quantized_as_autoawq(output_dir, model_path, kwargs):
     """Export the model to autogptq format to easily leverage cuda kernel."""
 
     try:
@@ -76,26 +76,6 @@ def save_quantized_as_autoawq(output_dir, model_path, **kwargs):
     logger.info("Saving quantized model to autoawq format")
     if tokenizer is not None:
         tokenizer.save_pretrained(output_dir)
-    ##check module quantized in block, this may have bug for mixed precision quantization
-    block_name = get_block_names(model)[0]
-    first_block = get_module(model, block_name)
-    all_to_quantized = True
-    modules_in_block_to_quantize = []
-    for n, m in first_block.named_modules():
-        is_supported_type = False
-        for supported_type in supported_types:
-            if isinstance(m, supported_type):
-                is_supported_type = True
-                break
-        if not is_supported_type:
-            continue
-        if not check_to_quantized(m):
-            all_to_quantized = False
-        else:
-            modules_in_block_to_quantize.append(n)
-    modules_in_block_to_quantize = [modules_in_block_to_quantize]
-    if all_to_quantized:
-        modules_in_block_to_quantize = None
 
     compressed_model = copy.deepcopy(model.to("cpu"))
 
@@ -103,6 +83,7 @@ def save_quantized_as_autoawq(output_dir, model_path, **kwargs):
     awq_model = AutoAWQForCausalLM.from_pretrained(model_path)
     self_modules = awq_model.get_model_layers(compressed_model)
     del awq_model  # release memory
+    modules_to_not_convert = []
     for i in range(len(self_modules)):
         module = self_modules[i]
         named_linears = get_named_linears(module)
@@ -110,7 +91,9 @@ def save_quantized_as_autoawq(output_dir, model_path, **kwargs):
             key = get_module_name(compressed_model, linear_layer)
             info = weight_config[key]
             if not check_to_quantized(info):
+                modules_to_not_convert.append(key)
                 continue
+            info["zp"] = info["zp"].to(torch.float32)
             scale, zp = info["scale"], info["zp"]
             scale = scale.t().contiguous()
             zp = zp.t().contiguous()
@@ -127,22 +110,12 @@ def save_quantized_as_autoawq(output_dir, model_path, **kwargs):
             set_op_by_name(module, name, q_linear)
             clear_memory()
 
-    quant_config = {}
-    quant_config["quant_method"] = "awq"
-    quant_config["modules_to_not_convert"] = None
-    if compressed_model.config.model_type == 'mixtral':
-       quant_config["modules_to_not_convert"] = ["gate"] 
-    quant_config["version"] = "gemm"
-    quant_config["iters"] = iters
-    quant_config["lr"] = lr
-    quant_config["minmax_lr"] = minmax_lr
-    quant_config["enable_minmax_tuning"] = enable_minmax_tuning
-    quant_config["enable_quanted_input"] = enable_quanted_input
-    quant_config["scale_dtype"] = convert_dtype_torch2str_hf(scale_dtype)
-    quant_config["sym"] = sym
-    quant_config["bits"] = bits
-    quant_config["group_size"] = group_size
+    quant_config = kwargs["serialization_dict"]
     quant_config["zero_point"] = not sym
+    quant_config["modules_to_not_convert"] = None if not modules_to_not_convert else modules_to_not_convert
+    quant_config["version"] = "gemm"
+    quant_config["quant_method"] = "intel/auto-round"
+    quant_config["backend"] = "awq"
 
     save_quantized(compressed_model, save_dir=output_dir, quant_config=quant_config)
 
diff --git a/examples/language-modeling/main.py b/examples/language-modeling/main.py
index ccf08eca..07832924 100644
--- a/examples/language-modeling/main.py
+++ b/examples/language-modeling/main.py
@@ -320,8 +320,6 @@ def get_library_version(library_name):
     output_dir = args.output_dir + "/" + model_name.split('/')[-1] + f"-autoround-w{args.bits}g{args.group_size}-qdq"
 
     inplace = True if len(deployment_device) < 2 else False
-    if deployment_device == ['gpu'] and args.bits == 4:
-        autoround.save_quantized(f'{export_dir}-awq', format="auto_awq", model_path=args.model_name)
     if 'gpu' in deployment_device:
         autoround.save_quantized(f'{export_dir}-gpu', format=gpu_format, use_triton=True, inplace=inplace)
     if 'xpu' in deployment_device:

From 3480b8d608f3c0c0774cdf505d00bd1d55991709 Mon Sep 17 00:00:00 2001
From: yintong-lu <yintong.lu@intel.com>
Date: Tue, 28 May 2024 03:28:23 -0400
Subject: [PATCH 26/40] minor fix

Signed-off-by: yintong-lu <yintong.lu@intel.com>
---
 auto_round/export/__init__.py      | 1 -
 auto_round/export/export_to_awq.py | 1 -
 2 files changed, 2 deletions(-)

diff --git a/auto_round/export/__init__.py b/auto_round/export/__init__.py
index 71e9ac34..5a48d41f 100644
--- a/auto_round/export/__init__.py
+++ b/auto_round/export/__init__.py
@@ -15,6 +15,5 @@
 from .register import EXPORT_FORMAT
 from .export_to_autogptq import save_quantized_as_autogptq
 from .export_to_itrex import save_quantized_as_itrex, QuantConfig
-from .export_to_awq import save_quantized_as_autoawq
 from .export_to_autoround.export_to_autoround import save_quantized_as_autoround
 from .export_to_autoround import AutoHfQuantizer
diff --git a/auto_round/export/export_to_awq.py b/auto_round/export/export_to_awq.py
index aab878f1..673eeda2 100644
--- a/auto_round/export/export_to_awq.py
+++ b/auto_round/export/export_to_awq.py
@@ -47,7 +47,6 @@
 # copies or substantial portions of the Software.
 
 
-@register_format("auto_awq")
 def save_quantized_as_autoawq(output_dir, model_path, kwargs):
     """Export the model to autogptq format to easily leverage cuda kernel."""
 

From 81576c5ffc968c1ba0fa4497cf70e6d830bad896 Mon Sep 17 00:00:00 2001
From: yintong-lu <yintong.lu@intel.com>
Date: Thu, 13 Jun 2024 20:44:13 -0400
Subject: [PATCH 27/40] mv to autoround format

Signed-off-by: yintong-lu <yintong.lu@intel.com>
---
 .../export/export_to_autoround/export.py      | 109 +++++++--
 auto_round/export/export_to_awq.py            | 225 ------------------
 2 files changed, 94 insertions(+), 240 deletions(-)
 delete mode 100644 auto_round/export/export_to_awq.py

diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py
index f89a03aa..b973bab1 100644
--- a/auto_round/export/export_to_autoround/export.py
+++ b/auto_round/export/export_to_autoround/export.py
@@ -88,6 +88,13 @@ def dynamic_QuantLienar_for_packing(backend, bits, group_size):
         )
         return QuantLinear
     ##export all use trition, inference use exllamav2
+    elif "awq" in backend:
+        try:
+            from awq.modules.linear import WQLinear_GEMM  # pylint: disable=E0401
+            return WQLinear_GEMM
+        except:
+            logger.error("autoawq is required. Please install it to support auto_awq format.")
+            return
     elif "autoround" in backend or "auto-round" in backend or "auto_round" in backend:
         from auto_round_extension.cuda.qliner_triton import QuantLinear
         return QuantLinear
@@ -103,11 +110,14 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="autoround:exl
         model = copy.deepcopy(model.to("cpu"))
     layer_names_in_block = get_layer_names_in_block(model)
 
+    modules_to_not_convert = []
     weight_config = kwargs["weight_config"]
     for name in weight_config.keys():
 
         config = kwargs["weight_config"][name]
         if config["data_type"] != "int" and config["bits"] >= 16:
+            if "awq" in backend:
+                modules_to_not_convert.append(name)
             continue
         logger.info(f"packing {name}")
 
@@ -130,23 +140,46 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="autoround:exl
             out_features = layer.weight.shape[1]
         bias = layer.bias is not None and torch.any(layer.bias)
 
-        new_layer = QuantLinear(  ##pylint: disable=E1123
-            bits, group_size, in_features, out_features, bias, weight_dtype=layer.weight.dtype
-        )
+        if "awq" not in backend:
+            new_layer = QuantLinear(  ##pylint: disable=E1123
+                bits, group_size, in_features, out_features, bias, weight_dtype=layer.weight.dtype
+            )
 
-        new_layer.device = device
-        set_module(model, name, new_layer)
-        qlayer = new_layer
-        scale = weight_config[name]["scale"]
-        zero = weight_config[name]["zp"]
-        # so far can only pack layer on CPU
-        qlayer.to("cpu")
-        ##force to float32 to be compatible with torch 2.0
-        layer, scale, zero = layer.to("cpu"), scale.to("cpu"), zero.to("cpu").to(torch.float32)
-        qlayer.pack(layer, scale, zero, None)
-        qlayer.to(device)
+            new_layer.device = device
+            set_module(model, name, new_layer)
+            qlayer = new_layer
+            scale = weight_config[name]["scale"]
+            zero = weight_config[name]["zp"]
+            # so far can only pack layer on CPU
+            qlayer.to("cpu")
+            ##force to float32 to be compatible with torch 2.0
+            layer, scale, zero = layer.to("cpu"), scale.to("cpu"), zero.to("cpu").to(torch.float32)
+            qlayer.pack(layer, scale, zero, None)
+            qlayer.to(device)
+        else:
+            logger.info("lyt_debug starting awq format packing")
+            from awq.utils.utils import clear_memory
+            scale, zp = weight_config[name]["scale"].to(torch.float32), weight_config[name]["zp"].to(torch.float32)
+            scale = scale.t().contiguous()
+            zp = zp.t().contiguous()
+            if bits != 4:
+                logger.error("AutoAWQ format only supports 4-bits quantization.")
+            qlayer = QuantLinear.from_linear(
+                linear=layer,
+                w_bit=bits,
+                group_size=group_size,
+                init_only=False,
+                scales=scale,
+                zeros=zp,
+            )
+            qlayer.to(device)
+            set_module(model, name, qlayer)
+            clear_memory()
     quantization_config = kwargs["serialization_dict"]
     quantization_config["quant_method"] = "intel/auto-round"
+    # if "awq" in backend:
+    #     quantization_config["quant_method"], quantization_config["version"] = "awq", "gemm"
+    #     quantization_config["modules_to_not_convert"] = None if not modules_to_not_convert else modules_to_not_convert
     quantization_config["backend"] = backend
     extra_config = {}
     for layer_name in weight_config:
@@ -174,10 +207,22 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="autoround:exl
         quantization_config["extra_config"] = extra_config
     if hasattr(model, "config"):
         model.config.quantization_config = quantization_config
+        if "awq" in backend:
+            awq_quant_config = {
+                "quant_method": "awq",
+                "zero_point": not quantization_config["sym"],
+                "group_size": quantization_config["group_size"],
+                "bits": quantization_config["bits"],
+                "version": "gemm",
+                "modules_to_not_convert": None if not modules_to_not_convert else modules_to_not_convert,
+            }
     tokenizer = kwargs["tokenizer"]
     if tokenizer is not None:
         tokenizer.save_pretrained(output_dir)
-    save(model, output_dir)
+    if "awq" not in backend:
+        save(model, output_dir)
+    else:
+        save_awq(model, output_dir, awq_quant_config=awq_quant_config)
 
 
 def save(model: nn.Module, save_dir: str, max_shard_size: str = "10GB", safe_serialization: bool = True):
@@ -206,3 +251,37 @@ def save(model: nn.Module, save_dir: str, max_shard_size: str = "10GB", safe_ser
     if hasattr(model, "config") and hasattr(model.config, "quantization_config"):
         with open(os.path.join(save_dir, config_file), "w", encoding="utf-8") as f:
             json.dump(model.config.quantization_config, f, indent=2)
+
+
+
+def save_awq(model: nn.Module, save_dir: str, max_shard_size: str = "10GB", safe_serialization: bool = True, awq_quant_config: dict = {}):
+    """Save model state dict and configs.
+
+    Args:
+        model (`nn.Module`):
+            Model to be saved. The model can be wrapped or unwrapped.
+        save_dir (`str`):
+            Directory to which to save. Will be created if it doesn't exist.
+        max_shard_size (`str`, defaults to `"10GB"`):
+            The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size
+            lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5MB"`).
+            <Tip warning={true}>
+
+            If a single weight of the model is bigger than `max_shard_size`, it will be in its own checkpoint shard
+            which will be bigger than `max_shard_size`.
+
+            </Tip>
+        safe_serialization (`bool`, defaults to `True`):
+            Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+    """
+    os.makedirs(save_dir, exist_ok=True)
+    if hasattr(model, "config") and hasattr(model.config, "quantization_config"):
+        quantization_config = model.config.quantization_config
+    else:
+        quantization_config = awq_quant_config
+    model.config.quantization_config = awq_quant_config
+    model.save_pretrained(save_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization)
+    config_file = "quantization_config.json"
+    if hasattr(model, "config") and hasattr(model.config, "quantization_config"):
+        with open(os.path.join(save_dir, config_file), "w", encoding="utf-8") as f:
+            json.dump(quantization_config, f, indent=2)
\ No newline at end of file
diff --git a/auto_round/export/export_to_awq.py b/auto_round/export/export_to_awq.py
deleted file mode 100644
index 673eeda2..00000000
--- a/auto_round/export/export_to_awq.py
+++ /dev/null
@@ -1,225 +0,0 @@
-# Copyright (c) 2023 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import copy
-import json
-import os
-from os.path import isdir, isfile, join
-from typing import Dict, List, Optional, Union
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-import torch
-import torch.nn as nn
-
-from auto_round.export.register import register_format
-from auto_round.utils import check_to_quantized, convert_dtype_torch2str_hf, get_block_names, get_module, logger
-
-# MIT License
-
-# Copyright (c) 2023 MIT HAN Lab
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-
-
-def save_quantized_as_autoawq(output_dir, model_path, kwargs):
-    """Export the model to autogptq format to easily leverage cuda kernel."""
-
-    try:
-        from awq import AutoAWQForCausalLM  # pylint: disable=E0401
-        from awq.modules.linear import WQLinear_GEMM  # pylint: disable=E0401
-        from awq.utils.utils import clear_memory  # pylint: disable=E0401
-    except:
-        logger.error("autoawq is required. Please install it to support auto_awq format.")
-        return
-
-    model = kwargs["model"]
-    weight_config = kwargs["weight_config"]
-    sym = kwargs["sym"]
-    bits = kwargs["bits"]
-    group_size = kwargs["group_size"]
-    iters = kwargs["iters"]
-    lr = kwargs["lr"]
-    minmax_lr = kwargs["minmax_lr"]
-    enable_minmax_tuning = kwargs["enable_minmax_tuning"]
-    enable_quanted_input = kwargs["enable_quanted_input"]
-    scale_dtype = kwargs["scale_dtype"]
-    tokenizer = kwargs["tokenizer"]
-    supported_types = kwargs["supported_types"]
-
-    logger.info("Saving quantized model to autoawq format")
-    if tokenizer is not None:
-        tokenizer.save_pretrained(output_dir)
-
-    compressed_model = copy.deepcopy(model.to("cpu"))
-
-    q_linear_module = WQLinear_GEMM
-    awq_model = AutoAWQForCausalLM.from_pretrained(model_path)
-    self_modules = awq_model.get_model_layers(compressed_model)
-    del awq_model  # release memory
-    modules_to_not_convert = []
-    for i in range(len(self_modules)):
-        module = self_modules[i]
-        named_linears = get_named_linears(module)
-        for name, linear_layer in named_linears.items():
-            key = get_module_name(compressed_model, linear_layer)
-            info = weight_config[key]
-            if not check_to_quantized(info):
-                modules_to_not_convert.append(key)
-                continue
-            info["zp"] = info["zp"].to(torch.float32)
-            scale, zp = info["scale"], info["zp"]
-            scale = scale.t().contiguous()
-            zp = zp.t().contiguous()
-            q_linear = q_linear_module.from_linear(
-                linear=linear_layer,
-                w_bit=bits,
-                group_size=group_size,
-                init_only=False,
-                scales=scale,
-                zeros=zp,
-            )
-            linear_layer.cpu()
-            q_linear.to(next(module.parameters()).device)
-            set_op_by_name(module, name, q_linear)
-            clear_memory()
-
-    quant_config = kwargs["serialization_dict"]
-    quant_config["zero_point"] = not sym
-    quant_config["modules_to_not_convert"] = None if not modules_to_not_convert else modules_to_not_convert
-    quant_config["version"] = "gemm"
-    quant_config["quant_method"] = "intel/auto-round"
-    quant_config["backend"] = "awq"
-
-    save_quantized(compressed_model, save_dir=output_dir, quant_config=quant_config)
-
-
-from safetensors.torch import save_file
-from transformers.modeling_utils import shard_checkpoint
-
-
-def save_quantized(
-    model,
-    save_dir,
-    quant_config,
-    safetensors=True,
-    shard_size="10GB",
-):
-    save_dir = save_dir[:-1] if save_dir[-1] == "/" else save_dir
-
-    # Save model
-    class EmptyModule(nn.Module):
-        def __init__(self):
-            super(EmptyModule, self).__init__()
-
-        def forward(self, x):
-            return x
-
-    # Save model and config files with empty state dict
-    awq_quant_config = {
-        "quant_method": "awq",
-        "zero_point": quant_config["zero_point"],
-        "group_size": quant_config["group_size"],
-        "bits": quant_config["bits"],
-        "version": "gemm",
-        "modules_to_not_convert": quant_config["modules_to_not_convert"],
-    }
-    model.config.quantization_config = awq_quant_config
-    model.generation_config.do_sample = True
-    model.save_pretrained(save_dir, state_dict=EmptyModule().state_dict())
-
-    # Remove empty state dict
-    default_paths = [
-        f"{save_dir}/model.safetensors",
-        f"{save_dir}/pytorch_model.bin",
-    ]
-    for path in default_paths:
-        if os.path.exists(path):
-            os.remove(path)
-
-    # model_name has no extension, add it when saving state_dict
-    model_name = "model.safetensors" if safetensors else "pytorch_model.bin"
-
-    # shard checkpoint into chunks (10GB default)
-    shards, index = shard_checkpoint(model.state_dict(), max_shard_size=shard_size, weights_name=model_name)
-
-    for shard_file, shard in shards.items():
-        if safetensors:
-            # safetensors must be in the same memory, so we duplicate and use contiguous memory
-            shard = {k: v.clone().contiguous() for k, v in shard.items()}
-            save_file(shard, os.path.join(save_dir, shard_file), metadata={"format": "pt"})
-        else:
-            torch.save(shard, os.path.join(save_dir, shard_file))
-
-    # save shard index
-    if index is not None:
-        with open(f"{save_dir}/{model_name}.index.json", "w+") as file:
-            file.write(json.dumps(index, indent=4))
-
-    # save quantize_config
-    with open(join(save_dir, "quantize_config.json"), "w", encoding="utf-8") as f:
-        json.dump(quant_config, f, indent=2)
-
-
-def get_named_linears(module):
-    """Get the name, linear_op pairs of a given module.
-
-    Args:
-    module: A module to be searched.
-    """
-    return {name: m for name, m in module.named_modules() if isinstance(m, torch.nn.Linear)}
-
-
-def set_op_by_name(layer, name, new_module):
-    levels = name.split(".")
-    if len(levels) > 1:
-        mod_ = layer
-        for l_idx in range(len(levels) - 1):
-            if levels[l_idx].isdigit():
-                mod_ = mod_[int(levels[l_idx])]
-            else:
-                mod_ = getattr(mod_, levels[l_idx])
-        setattr(mod_, levels[-1], new_module)
-    else:
-        setattr(layer, name, new_module)
-
-
-def get_module_name(model, module_to_find):
-    """Get the name of a given module in a model.
-
-    Args:
-    model: The model.
-    module_to_find: A module to be found.
-
-    Returns:
-    name: The corresponding name of the given module.
-    """
-    for name, module in model.named_modules():
-        if module is module_to_find:
-            return name
-    return None

From a5ceb0bb0d5c57ef6d42e71430e0e7bd93e9d8e5 Mon Sep 17 00:00:00 2001
From: yintong-lu <yintong.lu@intel.com>
Date: Tue, 25 Jun 2024 03:15:35 -0400
Subject: [PATCH 28/40] minor fix

Signed-off-by: yintong-lu <yintong.lu@intel.com>
---
 auto_round/export/export_to_autoround/export.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py
index e9c417cb..c499a8c7 100644
--- a/auto_round/export/export_to_autoround/export.py
+++ b/auto_round/export/export_to_autoround/export.py
@@ -158,7 +158,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="autoround:exl
             qlayer.to(device)
         else:
             logger.info("lyt_debug starting awq format packing")
-            from awq.utils.utils import clear_memory
+            from awq.utils.utils import clear_memory # pylint: disable=E0401
             scale, zp = weight_config[name]["scale"].to(torch.float32), weight_config[name]["zp"].to(torch.float32)
             scale = scale.t().contiguous()
             zp = zp.t().contiguous()
@@ -254,7 +254,13 @@ def save(model: nn.Module, save_dir: str, max_shard_size: str = "5GB", safe_seri
 
 
 
-def save_awq(model: nn.Module, save_dir: str, max_shard_size: str = "10GB", safe_serialization: bool = True, awq_quant_config: dict = {}):
+def save_awq(
+        model: nn.Module, 
+        save_dir: str, 
+        max_shard_size: str = "5GB", 
+        safe_serialization: bool = True, 
+        awq_quant_config: dict = {}
+):
     """Save model state dict and configs.
 
     Args:

From 7d505a0fad1e3509f305ce1f2b2bfa8df55bba42 Mon Sep 17 00:00:00 2001
From: yintong-lu <yintong.lu@intel.com>
Date: Tue, 25 Jun 2024 03:17:01 -0400
Subject: [PATCH 29/40] typo

Signed-off-by: yintong-lu <yintong.lu@intel.com>
---
 auto_round/export/export_to_autoround/export.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py
index c499a8c7..cdcf64fa 100644
--- a/auto_round/export/export_to_autoround/export.py
+++ b/auto_round/export/export_to_autoround/export.py
@@ -157,7 +157,6 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="autoround:exl
             qlayer.pack(layer, scale, zero, None)
             qlayer.to(device)
         else:
-            logger.info("lyt_debug starting awq format packing")
             from awq.utils.utils import clear_memory # pylint: disable=E0401
             scale, zp = weight_config[name]["scale"].to(torch.float32), weight_config[name]["zp"].to(torch.float32)
             scale = scale.t().contiguous()

From 56e60d7c1a28e5ef7d62737c9fee1ecebdbe3ddd Mon Sep 17 00:00:00 2001
From: yintong-lu <yintong.lu@intel.com>
Date: Tue, 25 Jun 2024 03:26:31 -0400
Subject: [PATCH 30/40] remove comments

Signed-off-by: yintong-lu <yintong.lu@intel.com>
---
 auto_round/export/export_to_autoround/export.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py
index cdcf64fa..b7d76382 100644
--- a/auto_round/export/export_to_autoround/export.py
+++ b/auto_round/export/export_to_autoround/export.py
@@ -176,9 +176,6 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="autoround:exl
             clear_memory()
     quantization_config = kwargs["serialization_dict"]
     quantization_config["quant_method"] = "intel/auto-round"
-    # if "awq" in backend:
-    #     quantization_config["quant_method"], quantization_config["version"] = "awq", "gemm"
-    #     quantization_config["modules_to_not_convert"] = None if not modules_to_not_convert else modules_to_not_convert
     quantization_config["backend"] = backend
     extra_config = {}
     for layer_name in weight_config:

From 3385eb94f97ff0806ba24bab8b800988bcfe7637 Mon Sep 17 00:00:00 2001
From: yintong-lu <yintong.lu@intel.com>
Date: Thu, 27 Jun 2024 21:47:07 -0400
Subject: [PATCH 31/40] update comments

Signed-off-by: yintong-lu <yintong.lu@intel.com>
---
 auto_round/export/export_to_autoround/export.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py
index b7d76382..ad5a9792 100644
--- a/auto_round/export/export_to_autoround/export.py
+++ b/auto_round/export/export_to_autoround/export.py
@@ -93,7 +93,7 @@ def dynamic_QuantLienar_for_packing(backend, bits, group_size):
             from awq.modules.linear import WQLinear_GEMM  # pylint: disable=E0401
             return WQLinear_GEMM
         except:
-            logger.error("autoawq is required. Please install it to support auto_awq format.")
+            logger.error("autoawq is required. Please install it by 'pip install autoawq' to support auto_awq format.")
             return
     elif "autoround" in backend or "auto-round" in backend or "auto_round" in backend:
         from auto_round_extension.cuda.qliner_triton import QuantLinear

From 5ec30d49bd8ff526bd90cbe74d595c48da68a04b Mon Sep 17 00:00:00 2001
From: yintong-lu <yintong.lu@intel.com>
Date: Tue, 2 Jul 2024 20:54:52 -0400
Subject: [PATCH 32/40] move awq to autoround format evaluation

Signed-off-by: yintong-lu <yintong.lu@intel.com>
---
 .../export/export_to_autoround/export.py      | 25 ++++++-------------
 auto_round/utils.py                           |  6 +++++
 2 files changed, 14 insertions(+), 17 deletions(-)

diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py
index ad5a9792..82a0b423 100644
--- a/auto_round/export/export_to_autoround/export.py
+++ b/auto_round/export/export_to_autoround/export.py
@@ -176,7 +176,8 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="autoround:exl
             clear_memory()
     quantization_config = kwargs["serialization_dict"]
     quantization_config["quant_method"] = "intel/auto-round"
-    quantization_config["backend"] = backend
+    if "awq" not in backend:
+        quantization_config["backend"] = backend
     extra_config = {}
     for layer_name in weight_config:
         if weight_config[layer_name]["bits"] >= 16:
@@ -203,22 +204,13 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="autoround:exl
         quantization_config["extra_config"] = extra_config
     if hasattr(model, "config"):
         model.config.quantization_config = quantization_config
-        if "awq" in backend:
-            awq_quant_config = {
-                "quant_method": "awq",
-                "zero_point": not quantization_config["sym"],
-                "group_size": quantization_config["group_size"],
-                "bits": quantization_config["bits"],
-                "version": "gemm",
-                "modules_to_not_convert": None if not modules_to_not_convert else modules_to_not_convert,
-            }
     tokenizer = kwargs["tokenizer"]
     if tokenizer is not None:
         tokenizer.save_pretrained(output_dir)
     if "awq" not in backend:
         save(model, output_dir)
     else:
-        save_awq(model, output_dir, awq_quant_config=awq_quant_config)
+        save_awq(model, output_dir, modules_to_not_convert=modules_to_not_convert)
 
 
 def save(model: nn.Module, save_dir: str, max_shard_size: str = "5GB", safe_serialization: bool = True):
@@ -255,7 +247,7 @@ def save_awq(
         save_dir: str, 
         max_shard_size: str = "5GB", 
         safe_serialization: bool = True, 
-        awq_quant_config: dict = {}
+        modules_to_not_convert: list = [], 
 ):
     """Save model state dict and configs.
 
@@ -277,11 +269,10 @@ def save_awq(
             Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
     """
     os.makedirs(save_dir, exist_ok=True)
-    if hasattr(model, "config") and hasattr(model.config, "quantization_config"):
-        quantization_config = model.config.quantization_config
-    else:
-        quantization_config = awq_quant_config
-    model.config.quantization_config = awq_quant_config
+    quantization_config = model.config.quantization_config
+    model.config.quantization_config["quant_method"] = "awq"
+    model.config.quantization_config["modules_to_not_convert"] = None if not modules_to_not_convert \
+        else modules_to_not_convert
     model.save_pretrained(save_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization)
     config_file = "quantization_config.json"
     if hasattr(model, "config") and hasattr(model.config, "quantization_config"):
diff --git a/auto_round/utils.py b/auto_round/utils.py
index 2b8d817b..5f90f57a 100644
--- a/auto_round/utils.py
+++ b/auto_round/utils.py
@@ -656,6 +656,12 @@ def dynamic_import_inference_linear(bits, group_size, backend):
             disable_marlin=disable_marlin,
         )
         return QuantLinear
+    if "awq" in backend:
+        try:
+            from awq.modules.linear import WQLinear_GEMM # pylint: disable=E0401
+        except:
+            raise ImportError("autoawq is required. Please install it by 'pip install autoawq' to support auto_awq format.")
+        return WQLinear_GEMM
     if bits == 4 and exllama2_available and "exllamav2" in backend:
         from auto_round_extension.cuda.qliner_exllamav2 import QuantLinear
     elif bits == 4 and "exllamav2" in backend:

From d5537f4509f9cab295058cf3791029704f31c0fc Mon Sep 17 00:00:00 2001
From: yintong-lu <yintong.lu@intel.com>
Date: Tue, 2 Jul 2024 21:02:11 -0400
Subject: [PATCH 33/40] pylint error fixing

Signed-off-by: yintong-lu <yintong.lu@intel.com>
---
 auto_round/utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/auto_round/utils.py b/auto_round/utils.py
index 5f90f57a..317bef15 100644
--- a/auto_round/utils.py
+++ b/auto_round/utils.py
@@ -660,7 +660,8 @@ def dynamic_import_inference_linear(bits, group_size, backend):
         try:
             from awq.modules.linear import WQLinear_GEMM # pylint: disable=E0401
         except:
-            raise ImportError("autoawq is required. Please install it by 'pip install autoawq' to support auto_awq format.")
+            raise ImportError("autoawq is required. Please install it by 'pip install autoawq' to \
+                support auto_awq format.")
         return WQLinear_GEMM
     if bits == 4 and exllama2_available and "exllamav2" in backend:
         from auto_round_extension.cuda.qliner_exllamav2 import QuantLinear

From 821a1b9f155c101b780195a6b62c593447b181c2 Mon Sep 17 00:00:00 2001
From: yintong-lu <yintong.lu@intel.com>
Date: Tue, 23 Jul 2024 22:11:40 -0400
Subject: [PATCH 34/40] fix conflicts, and add awq format

Signed-off-by: yintong-lu <yintong.lu@intel.com>
---
 auto_round/export/__init__.py                 |   2 +-
 .../export/export_to_autoround/export.py      | 115 +++++++--
 auto_round/export/export_to_awq/__init__.py   |  17 ++
 auto_round/export/export_to_awq/export.py     | 230 ++++++++++++++++++
 auto_round/utils.py                           |   7 +
 examples/language-modeling/main.py            |   5 +-
 6 files changed, 351 insertions(+), 25 deletions(-)
 create mode 100644 auto_round/export/export_to_awq/__init__.py
 create mode 100644 auto_round/export/export_to_awq/export.py

diff --git a/auto_round/export/__init__.py b/auto_round/export/__init__.py
index 6b0fd04b..ec304793 100644
--- a/auto_round/export/__init__.py
+++ b/auto_round/export/__init__.py
@@ -16,5 +16,5 @@
 from auto_round.export.export_to_autogptq.export import save_quantized_as_autogptq
 from .export_to_itrex import save_quantized_as_itrex, QuantConfig
 from .export_to_autoround.export import save_quantized_as_autoround
-
+from .export_to_awq.export import save_quantized_as_autoawq
 
diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py
index 13b15331..910a3303 100644
--- a/auto_round/export/export_to_autoround/export.py
+++ b/auto_round/export/export_to_autoround/export.py
@@ -119,16 +119,22 @@ def dynamic_import_quantLinear_for_packing(backend, bits, group_size, sym):
     Raises:
         AssertionError: If the backend is not supported.
     """
-    if "auto_round" in backend:
+    if "auto_round" in backend and "awq" not in backend:
         ##only support triton and exllamav2
         if not ("triton" in backend or "exllamav2" in backend):
             logger.warning_once(f"autoround format does not support {backend}, try to pack with autogptq")
             return get_autogptq_packing_qlinear(backend, bits, group_size, sym)
         from auto_round_extension.cuda.qliner_triton import QuantLinear
         return QuantLinear
+    elif "awq" in backend:
+        try:
+            from awq.modules.linear import WQLinear_GEMM  # pylint: disable=E0401
+            return WQLinear_GEMM
+        except:
+            logger.error("autoawq is required. Please install it by 'pip install autoawq' to support auto_awq format.")
+            return
     elif "gptq" in backend:
         return get_autogptq_packing_qlinear(backend, bits, group_size, sym)
-
     else:
         assert False, f"only support gptq and autoround backend"
 
@@ -160,7 +166,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex
         backend = "autoround:exllamav2"
     backend = backend.replace("autoround", "auto_round")
     backend = backend.replace("auto-round", "auto_round")
-    if not ("triton" in backend or "exllamav2" in backend):
+    if not ("triton" in backend or "exllamav2" in backend or "awq" in backend):
         logger.info(f"autoround format does not support {backend}, try to pack with autogptq")
         backend = backend.replace("auto_round", "auto_gptq")
 
@@ -174,7 +180,6 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex
     layer_config = kwargs["layer_config"]
     quantization_config = kwargs["serialization_dict"]
     quantization_config["quant_method"] = "intel/auto-round"
-    quantization_config["backend"] = backend
     extra_config = {}
     for layer_name in layer_config:
         if layer_name not in layer_names_in_block and layer_config[layer_name]["bits"] <= 8:  ##lm head
@@ -198,10 +203,13 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex
     if len(extra_config) > 0:
         quantization_config["extra_config"] = extra_config
     with tctl.threadpool_limits(limits=1):
+        modules_to_not_convert = []
         for name in layer_config.keys():
 
             config = kwargs["layer_config"][name]
             if config["bits"] > 8:
+                if "awq" in backend:
+                    modules_to_not_convert.append(name)
                 continue
             logger.info(f"packing {name}")
 
@@ -225,33 +233,55 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex
                 out_features = layer.weight.shape[1]
             bias = layer.bias is not None and torch.any(layer.bias)
 
-            new_layer = QuantLinear(  ##pylint: disable=E1123
+            if "awq" not in backend:
+                new_layer = QuantLinear(  ##pylint: disable=E1123
                 bits, group_size, in_features, out_features, bias, weight_dtype=layer.weight.dtype
-            )
-
-            new_layer.device = device
-            set_module(model, name, new_layer)
-            qlayer = new_layer
-            scale = layer_config[name]["scale"]
-            zero = layer_config[name]["zp"]
-            # so far can only pack layer on CPU
-            qlayer.to("cpu")
-            ##force to float32 to be compatible with torch 2.0
-            layer, scale, zero = layer.to("cpu"), scale.to("cpu"), zero.to("cpu").to(torch.float32)
-            sig = inspect.signature(qlayer.pack)
-            param_count = len(sig.parameters)
-            if param_count == 2:
-                qlayer.pack(layer, scale)
+                )
+                new_layer.device = device
+                set_module(model, name, new_layer)
+                qlayer = new_layer
+                scale = layer_config[name]["scale"]
+                zero = layer_config[name]["zp"]
+                # so far can only pack layer on CPU
+                qlayer.to("cpu")
+                ##force to float32 to be compatible with torch 2.0
+                layer, scale, zero = layer.to("cpu"), scale.to("cpu"), zero.to("cpu").to(torch.float32)
+                sig = inspect.signature(qlayer.pack)
+                param_count = len(sig.parameters)
+                if param_count == 2:
+                    qlayer.pack(layer, scale)
+                else:
+                    qlayer.pack(layer, scale, zero, None)
+                qlayer.to(device)
             else:
-                qlayer.pack(layer, scale, zero, None)
-            qlayer.to(device)
+                from awq.utils.utils import clear_memory # pylint: disable=E0401
+                scale, zp = layer_config[name]["scale"].to(torch.float32), layer_config[name]["zp"].to(torch.float32)
+                scale = scale.t().contiguous()
+                zp = zp.t().contiguous()
+                if bits != 4:
+                    logger.error("AutoAWQ format only supports 4-bits quantization.")
+                qlayer = QuantLinear.from_linear(
+                    linear=layer,
+                    w_bit=bits,
+                    group_size=group_size,
+                    init_only=False,
+                    scales=scale,
+                    zeros=zp,
+                )
+                qlayer.to(device)
+                set_module(model, name, qlayer)
+                clear_memory()
+
 
     if hasattr(model, "config"):
         model.config.quantization_config = quantization_config
     tokenizer = kwargs["tokenizer"]
     if tokenizer is not None:
         tokenizer.save_pretrained(output_dir)
-    save(model, output_dir)
+    if "awq" not in backend:
+        save(model, output_dir)
+    else:
+        save_awq(model, output_dir, modules_to_not_convert=modules_to_not_convert)
 
 
 def save(model: nn.Module, save_dir: str, max_shard_size: str = "5GB", safe_serialization: bool = True):
@@ -281,3 +311,42 @@ def save(model: nn.Module, save_dir: str, max_shard_size: str = "5GB", safe_seri
         with open(os.path.join(save_dir, config_file), "w", encoding="utf-8") as f:
             json.dump(model.config.quantization_config, f, indent=2)
 
+
+
+def save_awq(
+        model: nn.Module, 
+        save_dir: str, 
+        max_shard_size: str = "5GB", 
+        safe_serialization: bool = True, 
+        modules_to_not_convert: list = [], 
+):
+    """Save model state dict and configs.
+
+    Args:
+        model (`nn.Module`):
+            Model to be saved. The model can be wrapped or unwrapped.
+        save_dir (`str`):
+            Directory to which to save. Will be created if it doesn't exist.
+        max_shard_size (`str`, defaults to `"10GB"`):
+            The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size
+            lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5MB"`).
+            <Tip warning={true}>
+
+            If a single weight of the model is bigger than `max_shard_size`, it will be in its own checkpoint shard
+            which will be bigger than `max_shard_size`.
+
+            </Tip>
+        safe_serialization (`bool`, defaults to `True`):
+            Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+    """
+    os.makedirs(save_dir, exist_ok=True)
+    quantization_config = model.config.quantization_config
+    model.config.quantization_config["quant_method"] = "awq"
+    model.config.quantization_config["modules_to_not_convert"] = None if not modules_to_not_convert \
+        else modules_to_not_convert
+    model.save_pretrained(save_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization)
+    config_file = "quantization_config.json"
+    if hasattr(model, "config") and hasattr(model.config, "quantization_config"):
+        with open(os.path.join(save_dir, config_file), "w", encoding="utf-8") as f:
+            json.dump(quantization_config, f, indent=2)
+            
\ No newline at end of file
diff --git a/auto_round/export/export_to_awq/__init__.py b/auto_round/export/export_to_awq/__init__.py
new file mode 100644
index 00000000..0bdb4d35
--- /dev/null
+++ b/auto_round/export/export_to_awq/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .export import save_quantized_as_autoawq
+
+
diff --git a/auto_round/export/export_to_awq/export.py b/auto_round/export/export_to_awq/export.py
new file mode 100644
index 00000000..03888ba3
--- /dev/null
+++ b/auto_round/export/export_to_awq/export.py
@@ -0,0 +1,230 @@
+# Copyright (c) 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import copy
+import json
+import os
+from os.path import isdir, isfile, join
+from typing import Dict, List, Optional, Union
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+import torch
+import torch.nn as nn
+
+from auto_round.export.register import register_format
+from auto_round.utils import convert_dtype_torch2str_hf, logger
+
+# MIT License
+
+# Copyright (c) 2023 MIT HAN Lab
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+
+@register_format("auto_awq")
+def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs):
+    """Export the model to autogptq format to easily leverage cuda kernel."""
+    model = kwargs["model"]
+    layer_config = kwargs["layer_config"]
+    sym = kwargs["sym"]
+    bits = kwargs["bits"]
+    group_size = kwargs["group_size"]
+    iters = kwargs["iters"]
+    lr = kwargs["lr"]
+    minmax_lr = kwargs["minmax_lr"]
+    enable_minmax_tuning = kwargs["enable_minmax_tuning"]
+    enable_quanted_input = kwargs["enable_quanted_input"]
+    scale_dtype = kwargs["scale_dtype"]
+    tokenizer = kwargs["tokenizer"]
+
+    logger.info("Saving quantized model to auto_awq format")
+    if tokenizer is not None:
+        tokenizer.save_pretrained(output_dir)
+    ##check module quantized in block, this may have bug for mixed precision quantization
+    modules_to_not_convert = []
+    if inplace:
+        compressed_model = model.to("cpu")
+    else:
+        compressed_model = copy.deepcopy(model.to("cpu"))
+
+    from awq import AutoAWQForCausalLM  # pylint: disable=E0401
+    from awq.modules.linear import WQLinear_GEMM  # pylint: disable=E0401
+    from awq.utils.utils import clear_memory  # pylint: disable=E0401
+
+    q_linear_module = WQLinear_GEMM
+    awq_model = AutoAWQForCausalLM.from_pretrained(model_path)
+    self_modules = awq_model.get_model_layers(compressed_model)
+    del awq_model  # release memory
+    for i in range(len(self_modules)):
+        module = self_modules[i]
+        named_linears = get_named_linears(module)
+        for name, linear_layer in named_linears.items():
+            key = get_module_name(compressed_model, linear_layer)
+            logger.info(f"packing {name}")
+            config = layer_config[key]
+            if config["bits"] > 8:
+                modules_to_not_convert.append(name)
+                continue
+            config["zp"] = config["zp"].to(torch.float32)
+            scale, zp = config["scale"], config["zp"]
+            scale = scale.t().contiguous()
+            zp = zp.t().contiguous()
+            q_linear = q_linear_module.from_linear(
+                linear=linear_layer,
+                w_bit=bits,
+                group_size=group_size,
+                init_only=False,
+                scales=scale,
+                zeros=zp,
+            )
+            linear_layer.cpu()
+            q_linear.to(next(module.parameters()).device)
+            set_op_by_name(module, name, q_linear)
+            clear_memory()
+
+    quant_config = {}
+    quant_config["quant_method"] = "awq"
+    quant_config["modules_to_not_convert"] = None
+    quant_config["version"] = "gemm"
+    quant_config["iters"] = iters
+    quant_config["lr"] = lr
+    quant_config["minmax_lr"] = minmax_lr
+    quant_config["enable_minmax_tuning"] = enable_minmax_tuning
+    quant_config["enable_quanted_input"] = enable_quanted_input
+    quant_config["scale_dtype"] = convert_dtype_torch2str_hf(scale_dtype)
+    quant_config["sym"] = sym
+    quant_config["bits"] = bits
+    quant_config["group_size"] = group_size
+    quant_config["zero_point"] = not sym
+
+    save_quantized(compressed_model, save_dir=output_dir, quant_config=quant_config)
+
+
+from safetensors.torch import save_file
+from transformers.modeling_utils import shard_checkpoint
+
+
+def save_quantized(
+    model,
+    save_dir,
+    quant_config,
+    safetensors=True,
+    shard_size="5GB",
+):
+    save_dir = save_dir[:-1] if save_dir[-1] == "/" else save_dir
+
+    # Save model
+    class EmptyModule(nn.Module):
+        def __init__(self):
+            super(EmptyModule, self).__init__()
+
+        def forward(self, x):
+            return x
+
+    # Save model and config files with empty state dict
+    awq_quant_config = {
+        "quant_method": "awq",
+        "zero_point": quant_config["zero_point"],
+        "group_size": quant_config["group_size"],
+        "bits": quant_config["bits"],
+        "version": "gemm",
+        "modules_to_not_convert": quant_config["modules_to_not_convert"],
+    }
+
+    model.config.quantization_config = awq_quant_config
+    model.generation_config.do_sample = True
+    model.save_pretrained(save_dir, state_dict=EmptyModule().state_dict())
+
+    # Remove empty state dict
+    default_paths = [
+        f"{save_dir}/model.safetensors",
+        f"{save_dir}/pytorch_model.bin",
+    ]
+    for path in default_paths:
+        if os.path.exists(path):
+            os.remove(path)
+
+    # model_name has no extension, add it when saving state_dict
+    model_name = "model.safetensors" if safetensors else "pytorch_model.bin"
+
+    # shard checkpoint into chunks (10GB default)
+    shards, index = shard_checkpoint(model.state_dict(), max_shard_size=shard_size, weights_name=model_name)
+
+    for shard_file, shard in shards.items():
+        if safetensors:
+            # safetensors must be in the same memory, so we duplicate and use contiguous memory
+            shard = {k: v.clone().contiguous() for k, v in shard.items()}
+            save_file(shard, os.path.join(save_dir, shard_file), metadata={"format": "pt"})
+        else:
+            torch.save(shard, os.path.join(save_dir, shard_file))
+
+    # save shard index
+    if index is not None:
+        with open(f"{save_dir}/{model_name}.index.json", "w+") as file:
+            file.write(json.dumps(index, indent=4))
+
+    # save quantize_config
+    with open(join(save_dir, "quantization_config.json"), "w", encoding="utf-8") as f:
+        json.dump(quant_config, f, indent=2)
+
+
+def get_named_linears(module):
+    """Get the name, linear_op pairs of a given module.
+    Args:
+    module: A module to be searched.
+    """
+    return {name: m for name, m in module.named_modules() if isinstance(m, torch.nn.Linear)}
+
+
+def set_op_by_name(layer, name, new_module):
+    levels = name.split(".")
+    if len(levels) > 1:
+        mod_ = layer
+        for l_idx in range(len(levels) - 1):
+            if levels[l_idx].isdigit():
+                mod_ = mod_[int(levels[l_idx])]
+            else:
+                mod_ = getattr(mod_, levels[l_idx])
+        setattr(mod_, levels[-1], new_module)
+    else:
+        setattr(layer, name, new_module)
+
+
+def get_module_name(model, module_to_find):
+    """Get the name of a given module in a model.
+    Args:
+    model: The model.
+    module_to_find: A module to be found.
+    Returns:
+    name: The corresponding name of the given module.
+    """
+    for name, module in model.named_modules():
+        if module is module_to_find:
+            return name
+    return None
diff --git a/auto_round/utils.py b/auto_round/utils.py
index 195c2586..747c8d67 100644
--- a/auto_round/utils.py
+++ b/auto_round/utils.py
@@ -722,6 +722,13 @@ def dynamic_import_inference_linear(backend, bits, group_size, sym):
         else:
             from auto_round_extension.hpu.qlinear_hpu import QuantLinear
             return QuantLinear
+    if "awq" in backend:
+        try:
+            from awq.modules.linear import WQLinear_GEMM # pylint: disable=E0401
+        except:
+            raise ImportError("autoawq is required. Please install it by 'pip install autoawq' to \
+                support auto_awq format.")
+        return WQLinear_GEMM
     if bits == 4 and exllama2_available and "exllamav2" in backend:
         from auto_round_extension.cuda.qliner_exllamav2 import QuantLinear
     elif bits == 4 and "exllamav2" in backend:
diff --git a/examples/language-modeling/main.py b/examples/language-modeling/main.py
index 942a5106..9460e513 100644
--- a/examples/language-modeling/main.py
+++ b/examples/language-modeling/main.py
@@ -312,7 +312,7 @@
     deployment_device = args.deployment_device.split(',')
     gpu_formats = []
     for item in deployment_device:
-        if "gpu" in item or "auto_gptq" in item or "auto_round" in item:
+        if "gpu" in item or "auto_gptq" in item or "auto_round" in item or "auto_awq" in item:
             gpu_formats.append(item)
 
     if 'gpu' in deployment_device:
@@ -331,6 +331,9 @@
         elif "gptq" in gpu_format:
             eval_folder = f'{export_dir}-gpu'
             autoround.save_quantized(eval_folder, format=gpu_format, use_triton=False, inplace=inplace)
+        elif "auto_awq" in gpu_format:
+            eval_folder = f'{export_dir}-awq'
+            autoround.save_quantized(eval_folder, format=gpu_format, inplace=inplace, model_path=model_name)
 
     if 'xpu' in deployment_device:
         autoround.save_quantized(f'{export_dir}-xpu', format="itrex_xpu", use_triton=True, inplace=inplace,

From 1f87cad9d8c864b9f2745131edf0ba8e927209e8 Mon Sep 17 00:00:00 2001
From: yintong-lu <yintong.lu@intel.com>
Date: Wed, 24 Jul 2024 02:46:26 -0400
Subject: [PATCH 35/40] add ut

Signed-off-by: yintong-lu <yintong.lu@intel.com>
---
 test/test_export.py | 51 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/test/test_export.py b/test/test_export.py
index 7a747837..ec777136 100644
--- a/test/test_export.py
+++ b/test/test_export.py
@@ -152,3 +152,54 @@ def test_autoround_format(self):
     #     shutil.rmtree("./saved", ignore_errors=True)
     #
 
+
+    def test_autoawq_format(self):
+        bits, group_size, sym = 4, 128, False
+        autoround = AutoRound(
+            self.model,
+            self.tokenizer,
+            bits=bits,
+            group_size=group_size,
+            sym=sym,
+            iters=2,
+            seqlen=2,
+            dataset=self.llm_dataloader,
+        )
+        autoround.quantize()
+        quantized_model_path = "./saved"
+
+        autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_awq", model_path="facebook/opt-125m")
+
+        from auto_round.auto_quantizer import AutoHfQuantizer
+        model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto")
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
+        text = "There is a girl who likes adventure,"
+        inputs = tokenizer(text, return_tensors="pt").to(model.device)
+        print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]))
+        shutil.rmtree("./saved", ignore_errors=True)
+
+
+    def test_autoround_awq_format(self):
+        bits, group_size, sym = 4, 128, False
+        autoround = AutoRound(
+            self.model,
+            self.tokenizer,
+            bits=bits,
+            group_size=group_size,
+            sym=sym,
+            iters=2,
+            seqlen=2,
+            dataset=self.llm_dataloader,
+        )
+        autoround.quantize()
+        quantized_model_path = "./saved"
+
+        autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round:awq")
+
+        from auto_round.auto_quantizer import AutoHfQuantizer
+        model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto")
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
+        text = "There is a girl who likes adventure,"
+        inputs = tokenizer(text, return_tensors="pt").to(model.device)
+        print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]))
+        shutil.rmtree("./saved", ignore_errors=True)
\ No newline at end of file

From c888e125b6097fb81889f24b64375c40a4d11afd Mon Sep 17 00:00:00 2001
From: yintong-lu <yintong.lu@intel.com>
Date: Wed, 24 Jul 2024 02:57:56 -0400
Subject: [PATCH 36/40] add requirement

Signed-off-by: yintong-lu <yintong.lu@intel.com>
---
 requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 83ec1fe4..40cd1722 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,4 +6,5 @@ torch
 transformers
 triton
 numpy < 2.0
-threadpoolctl
\ No newline at end of file
+threadpoolctl
+autoawq
\ No newline at end of file

From 1bc88db3042d79e05245d703edad1083645a25af Mon Sep 17 00:00:00 2001
From: yintong-lu <yintong.lu@intel.com>
Date: Wed, 24 Jul 2024 03:18:28 -0400
Subject: [PATCH 37/40] add coverage test waiver

Signed-off-by: yintong-lu <yintong.lu@intel.com>
---
 .../export/export_to_autoround/export.py      | 14 ++---
 auto_round/export/export_to_awq/export.py     | 10 ++--
 examples/language-modeling/main.py            |  2 +-
 requirements.txt                              |  3 +-
 test/test_export.py                           | 54 +------------------
 5 files changed, 15 insertions(+), 68 deletions(-)

diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py
index c63ab0a1..c5a9929e 100644
--- a/auto_round/export/export_to_autoround/export.py
+++ b/auto_round/export/export_to_autoround/export.py
@@ -127,7 +127,7 @@ def dynamic_import_quantLinear_for_packing(backend, bits, group_size, sym):
             return get_autogptq_packing_qlinear(backend, bits, group_size, sym)
         from auto_round_extension.cuda.qliner_triton import QuantLinear
         return QuantLinear
-    elif "awq" in backend:
+    elif "awq" in backend: # pragma: no cover
         try:
             from awq.modules.linear import WQLinear_GEMM  # pylint: disable=E0401
             return WQLinear_GEMM
@@ -181,7 +181,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex
     layer_config = kwargs["layer_config"]
     quantization_config = kwargs["serialization_dict"]
     quantization_config["quant_method"] = "intel/auto-round"
-    if "awq" not in backend:
+    if "awq" not in backend: # pragma: no cover
         quantization_config["backend"] = backend
     extra_config = {}
     for layer_name in layer_config:
@@ -206,11 +206,11 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex
     if len(extra_config) > 0:
         quantization_config["extra_config"] = extra_config
     with tctl.threadpool_limits(limits=1):
-        modules_to_not_convert = []
+        modules_to_not_convert = [] # pragma: no cover
         for name in layer_config.keys():
             config = kwargs["layer_config"][name]
             if config["bits"] > 8:
-                if "awq" in backend:
+                if "awq" in backend: # pragma: no cover
                     modules_to_not_convert.append(name)
                 continue
             logger.info(f"packing {name}")
@@ -255,7 +255,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex
                 else:
                     qlayer.pack(layer, scale, zero, None)
                 qlayer.to(device)
-            else:
+            else: # pragma: no cover
                 from awq.utils.utils import clear_memory # pylint: disable=E0401
                 scale, zp = layer_config[name]["scale"].to(torch.float32), layer_config[name]["zp"].to(torch.float32)
                 scale = scale.t().contiguous()
@@ -282,7 +282,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex
         tokenizer.save_pretrained(output_dir)
     if "awq" not in backend:
         save(model, output_dir, safe_serialization=safe_serialization)
-    else:
+    else: # pragma: no cover
         save_awq(model, output_dir, modules_to_not_convert=modules_to_not_convert)
 
 
@@ -321,7 +321,7 @@ def save_awq(
         max_shard_size: str = "5GB", 
         safe_serialization: bool = True, 
         modules_to_not_convert: list = [], 
-):
+): # pragma: no cover
     """Save model state dict and configs.
 
     Args:
diff --git a/auto_round/export/export_to_awq/export.py b/auto_round/export/export_to_awq/export.py
index 03888ba3..451be56e 100644
--- a/auto_round/export/export_to_awq/export.py
+++ b/auto_round/export/export_to_awq/export.py
@@ -48,7 +48,7 @@
 
 
 @register_format("auto_awq")
-def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs):
+def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs): # pragma: no cover
     """Export the model to autogptq format to easily leverage cuda kernel."""
     model = kwargs["model"]
     layer_config = kwargs["layer_config"]
@@ -136,7 +136,7 @@ def save_quantized(
     quant_config,
     safetensors=True,
     shard_size="5GB",
-):
+): # pragma: no cover
     save_dir = save_dir[:-1] if save_dir[-1] == "/" else save_dir
 
     # Save model
@@ -194,7 +194,7 @@ def forward(self, x):
         json.dump(quant_config, f, indent=2)
 
 
-def get_named_linears(module):
+def get_named_linears(module): # pragma: no cover
     """Get the name, linear_op pairs of a given module.
     Args:
     module: A module to be searched.
@@ -202,7 +202,7 @@ def get_named_linears(module):
     return {name: m for name, m in module.named_modules() if isinstance(m, torch.nn.Linear)}
 
 
-def set_op_by_name(layer, name, new_module):
+def set_op_by_name(layer, name, new_module): # pragma: no cover
     levels = name.split(".")
     if len(levels) > 1:
         mod_ = layer
@@ -216,7 +216,7 @@ def set_op_by_name(layer, name, new_module):
         setattr(layer, name, new_module)
 
 
-def get_module_name(model, module_to_find):
+def get_module_name(model, module_to_find): # pragma: no cover
     """Get the name of a given module in a model.
     Args:
     model: The model.
diff --git a/examples/language-modeling/main.py b/examples/language-modeling/main.py
index 1e1d5430..5349880f 100644
--- a/examples/language-modeling/main.py
+++ b/examples/language-modeling/main.py
@@ -339,7 +339,7 @@
         elif "gptq" in gpu_format:
             eval_folder = f'{export_dir}-gpu'
             autoround.save_quantized(eval_folder, format=gpu_format, use_triton=False, inplace=inplace)
-        elif "auto_awq" in gpu_format:
+        elif "auto_awq" in gpu_format: # pragma: no cover
             eval_folder = f'{export_dir}-awq'
             autoround.save_quantized(eval_folder, format=gpu_format, inplace=inplace, model_path=model_name)
 
diff --git a/requirements.txt b/requirements.txt
index 40cd1722..83ec1fe4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,5 +6,4 @@ torch
 transformers
 triton
 numpy < 2.0
-threadpoolctl
-autoawq
\ No newline at end of file
+threadpoolctl
\ No newline at end of file
diff --git a/test/test_export.py b/test/test_export.py
index ec777136..6cab1f2a 100644
--- a/test/test_export.py
+++ b/test/test_export.py
@@ -150,56 +150,4 @@ def test_autoround_format(self):
     #     inputs = tokenizer(text, return_tensors="pt").to(model.device)
     #     print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]))
     #     shutil.rmtree("./saved", ignore_errors=True)
-    #
-
-
-    def test_autoawq_format(self):
-        bits, group_size, sym = 4, 128, False
-        autoround = AutoRound(
-            self.model,
-            self.tokenizer,
-            bits=bits,
-            group_size=group_size,
-            sym=sym,
-            iters=2,
-            seqlen=2,
-            dataset=self.llm_dataloader,
-        )
-        autoround.quantize()
-        quantized_model_path = "./saved"
-
-        autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_awq", model_path="facebook/opt-125m")
-
-        from auto_round.auto_quantizer import AutoHfQuantizer
-        model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto")
-        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
-        text = "There is a girl who likes adventure,"
-        inputs = tokenizer(text, return_tensors="pt").to(model.device)
-        print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]))
-        shutil.rmtree("./saved", ignore_errors=True)
-
-
-    def test_autoround_awq_format(self):
-        bits, group_size, sym = 4, 128, False
-        autoround = AutoRound(
-            self.model,
-            self.tokenizer,
-            bits=bits,
-            group_size=group_size,
-            sym=sym,
-            iters=2,
-            seqlen=2,
-            dataset=self.llm_dataloader,
-        )
-        autoround.quantize()
-        quantized_model_path = "./saved"
-
-        autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round:awq")
-
-        from auto_round.auto_quantizer import AutoHfQuantizer
-        model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto")
-        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
-        text = "There is a girl who likes adventure,"
-        inputs = tokenizer(text, return_tensors="pt").to(model.device)
-        print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]))
-        shutil.rmtree("./saved", ignore_errors=True)
\ No newline at end of file
+    #
\ No newline at end of file

From bee724aa69903f7f818c3abdb1a5cedb78ce683f Mon Sep 17 00:00:00 2001
From: yintong-lu <yintong.lu@intel.com>
Date: Wed, 24 Jul 2024 03:54:52 -0400
Subject: [PATCH 38/40] minor change

Signed-off-by: yintong-lu <yintong.lu@intel.com>
---
 auto_round/export/export_to_autoround/export.py | 2 +-
 auto_round/utils.py                             | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py
index c5a9929e..a038cc5a 100644
--- a/auto_round/export/export_to_autoround/export.py
+++ b/auto_round/export/export_to_autoround/export.py
@@ -167,7 +167,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex
         backend = "autoround:exllamav2"
     backend = backend.replace("autoround", "auto_round")
     backend = backend.replace("auto-round", "auto_round")
-    if not ("triton" in backend or "exllamav2" in backend or "awq" in backend):
+    if not ("triton" in backend or "exllamav2" in backend or "awq" in backend): # pragma: no cover
         logger.info(f"autoround format does not support {backend}, try to pack with autogptq")
         backend = backend.replace("auto_round", "auto_gptq")
 
diff --git a/auto_round/utils.py b/auto_round/utils.py
index 2d34eaac..a3f2a5e1 100644
--- a/auto_round/utils.py
+++ b/auto_round/utils.py
@@ -812,7 +812,7 @@ def dynamic_import_inference_linear(backend, bits, group_size, sym):
         else:
             from auto_round_extension.hpu.qlinear_hpu import QuantLinear
             return QuantLinear
-    if "awq" in backend:
+    if "awq" in backend: # pragma: no cover
         try:
             from awq.modules.linear import WQLinear_GEMM # pylint: disable=E0401
         except:

From 949c48a94e69cbc3050544ed09995684d7dc85f4 Mon Sep 17 00:00:00 2001
From: yintong-lu <yintong.lu@intel.com>
Date: Wed, 24 Jul 2024 21:11:04 -0400
Subject: [PATCH 39/40] refine code to decrease number of branches

Signed-off-by: yintong-lu <yintong.lu@intel.com>
---
 auto_round/export/export_to_autoround/export.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py
index a038cc5a..01422c01 100644
--- a/auto_round/export/export_to_autoround/export.py
+++ b/auto_round/export/export_to_autoround/export.py
@@ -41,16 +41,12 @@ def check_neq_config(config, data_type, bits, group_size, sym):
     Returns:
         list: A list of strings indicating which configuration parameters do not match.
     """
-    res = []
-    if data_type != config["data_type"]:
-        res.append("data_type")
-    if bits != config["bits"]:
-        res.append("bits")
-    if group_size != config["group_size"]:
-        res.append("group_size")
-    if sym != config["sym"]:
-        res.append("sym")
-    return res
+    expected_config = {"data_type": data_type, 
+        "bits": bits, 
+        "group_size": group_size,
+        "sym": sym
+    }
+    return [key for key, expected_value in expected_config.items() if config.get(key) != expected_value]
 
 
 def get_autogptq_packing_qlinear(backend, bits=4, group_size=128, sym=False):

From 77943867a886e4a84479c04577a23dbd9beb4ab1 Mon Sep 17 00:00:00 2001
From: yintong-lu <yintong.lu@intel.com>
Date: Tue, 30 Jul 2024 02:59:05 -0400
Subject: [PATCH 40/40] add ut, fix minor issues

Signed-off-by: yintong-lu <yintong.lu@intel.com>
---
 .../export/export_to_autoround/export.py      | 16 ++---
 auto_round/export/export_to_awq/export.py     | 47 +++++++-------
 test/test_export.py                           | 61 +++++++++++++++++++
 3 files changed, 91 insertions(+), 33 deletions(-)

diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py
index 01422c01..7a0061d1 100644
--- a/auto_round/export/export_to_autoround/export.py
+++ b/auto_round/export/export_to_autoround/export.py
@@ -123,7 +123,7 @@ def dynamic_import_quantLinear_for_packing(backend, bits, group_size, sym):
             return get_autogptq_packing_qlinear(backend, bits, group_size, sym)
         from auto_round_extension.cuda.qliner_triton import QuantLinear
         return QuantLinear
-    elif "awq" in backend: # pragma: no cover
+    elif "awq" in backend:
         try:
             from awq.modules.linear import WQLinear_GEMM  # pylint: disable=E0401
             return WQLinear_GEMM
@@ -163,7 +163,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex
         backend = "autoround:exllamav2"
     backend = backend.replace("autoround", "auto_round")
     backend = backend.replace("auto-round", "auto_round")
-    if not ("triton" in backend or "exllamav2" in backend or "awq" in backend): # pragma: no cover
+    if not ("triton" in backend or "exllamav2" in backend or "awq" in backend):
         logger.info(f"autoround format does not support {backend}, try to pack with autogptq")
         backend = backend.replace("auto_round", "auto_gptq")
 
@@ -177,7 +177,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex
     layer_config = kwargs["layer_config"]
     quantization_config = kwargs["serialization_dict"]
     quantization_config["quant_method"] = "intel/auto-round"
-    if "awq" not in backend: # pragma: no cover
+    if "awq" not in backend:
         quantization_config["backend"] = backend
     extra_config = {}
     for layer_name in layer_config:
@@ -202,11 +202,11 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex
     if len(extra_config) > 0:
         quantization_config["extra_config"] = extra_config
     with tctl.threadpool_limits(limits=1):
-        modules_to_not_convert = [] # pragma: no cover
+        modules_to_not_convert = []
         for name in layer_config.keys():
             config = kwargs["layer_config"][name]
             if config["bits"] > 8:
-                if "awq" in backend: # pragma: no cover
+                if "awq" in backend:
                     modules_to_not_convert.append(name)
                 continue
             logger.info(f"packing {name}")
@@ -251,7 +251,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex
                 else:
                     qlayer.pack(layer, scale, zero, None)
                 qlayer.to(device)
-            else: # pragma: no cover
+            else:
                 from awq.utils.utils import clear_memory # pylint: disable=E0401
                 scale, zp = layer_config[name]["scale"].to(torch.float32), layer_config[name]["zp"].to(torch.float32)
                 scale = scale.t().contiguous()
@@ -278,7 +278,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex
         tokenizer.save_pretrained(output_dir)
     if "awq" not in backend:
         save(model, output_dir, safe_serialization=safe_serialization)
-    else: # pragma: no cover
+    else:
         save_awq(model, output_dir, modules_to_not_convert=modules_to_not_convert)
 
 
@@ -317,7 +317,7 @@ def save_awq(
         max_shard_size: str = "5GB", 
         safe_serialization: bool = True, 
         modules_to_not_convert: list = [], 
-): # pragma: no cover
+):
     """Save model state dict and configs.
 
     Args:
diff --git a/auto_round/export/export_to_awq/export.py b/auto_round/export/export_to_awq/export.py
index 451be56e..79aab82e 100644
--- a/auto_round/export/export_to_awq/export.py
+++ b/auto_round/export/export_to_awq/export.py
@@ -11,14 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-
-import copy
-import json
-import os
-from os.path import isdir, isfile, join
-from typing import Dict, List, Optional, Union
-
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -26,29 +18,31 @@
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
-import torch
-import torch.nn as nn
-
-from auto_round.export.register import register_format
-from auto_round.utils import convert_dtype_torch2str_hf, logger
-
 # MIT License
-
 # Copyright (c) 2023 MIT HAN Lab
-
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
-
 # The above copyright notice and this permission notice shall be included in all
 # copies or substantial portions of the Software.
 
 
+import os
+from os.path import isdir, isfile, join
+import torch
+import torch.nn as nn
+from auto_round.export.register import register_format
+from auto_round.utils import convert_dtype_torch2str_hf, logger
+import copy
+import json
+from typing import Dict, List, Optional, Union
+
+
 @register_format("auto_awq")
-def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs): # pragma: no cover
+def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs):
     """Export the model to autogptq format to easily leverage cuda kernel."""
     model = kwargs["model"]
     layer_config = kwargs["layer_config"]
@@ -73,9 +67,12 @@ def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs): #
     else:
         compressed_model = copy.deepcopy(model.to("cpu"))
 
-    from awq import AutoAWQForCausalLM  # pylint: disable=E0401
-    from awq.modules.linear import WQLinear_GEMM  # pylint: disable=E0401
-    from awq.utils.utils import clear_memory  # pylint: disable=E0401
+    try:
+        from awq import AutoAWQForCausalLM  # pylint: disable=E0401
+        from awq.modules.linear import WQLinear_GEMM  # pylint: disable=E0401
+        from awq.utils.utils import clear_memory  # pylint: disable=E0401
+    except:
+        logger.error("autoawq is required. Please install it by 'pip install autoawq' to support auto_awq format.")
 
     q_linear_module = WQLinear_GEMM
     awq_model = AutoAWQForCausalLM.from_pretrained(model_path)
@@ -136,7 +133,7 @@ def save_quantized(
     quant_config,
     safetensors=True,
     shard_size="5GB",
-): # pragma: no cover
+):
     save_dir = save_dir[:-1] if save_dir[-1] == "/" else save_dir
 
     # Save model
@@ -194,7 +191,7 @@ def forward(self, x):
         json.dump(quant_config, f, indent=2)
 
 
-def get_named_linears(module): # pragma: no cover
+def get_named_linears(module):
     """Get the name, linear_op pairs of a given module.
     Args:
     module: A module to be searched.
@@ -202,7 +199,7 @@ def get_named_linears(module): # pragma: no cover
     return {name: m for name, m in module.named_modules() if isinstance(m, torch.nn.Linear)}
 
 
-def set_op_by_name(layer, name, new_module): # pragma: no cover
+def set_op_by_name(layer, name, new_module):
     levels = name.split(".")
     if len(levels) > 1:
         mod_ = layer
@@ -216,7 +213,7 @@ def set_op_by_name(layer, name, new_module): # pragma: no cover
         setattr(layer, name, new_module)
 
 
-def get_module_name(model, module_to_find): # pragma: no cover
+def get_module_name(model, module_to_find):
     """Get the name of a given module in a model.
     Args:
     model: The model.
diff --git a/test/test_export.py b/test/test_export.py
index 6cab1f2a..aa23ff58 100644
--- a/test/test_export.py
+++ b/test/test_export.py
@@ -120,6 +120,67 @@ def test_autoround_format(self):
         print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]))
         shutil.rmtree("./saved", ignore_errors=True)
 
+
+    def test_autoround_awq_format(self):
+        try:
+            import awq
+        except:
+            return
+        bits, group_size, sym = 4, 128, False
+        autoround = AutoRound(
+            self.model,
+            self.tokenizer,
+            bits=bits,
+            group_size=group_size,
+            sym=sym,
+            iters=2,
+            seqlen=2,
+            dataset=self.llm_dataloader,
+        )
+        autoround.quantize()
+        quantized_model_path = "./saved"
+
+        autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round:awq")
+
+        from auto_round.auto_quantizer import AutoHfQuantizer
+        model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto")
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
+        text = "There is a girl who likes adventure,"
+        inputs = tokenizer(text, return_tensors="pt").to(model.device)
+        print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]))
+        shutil.rmtree("./saved", ignore_errors=True)
+
+
+    def test_autoawq_format(self):
+        try:
+            import awq
+        except:
+            return
+        bits, group_size, sym = 4, 128, False
+        autoround = AutoRound(
+            self.model,
+            self.tokenizer,
+            bits=bits,
+            group_size=group_size,
+            sym=sym,
+            iters=2,
+            seqlen=2,
+            dataset=self.llm_dataloader,
+        )
+        autoround.quantize()
+        quantized_model_path = "./saved"
+
+        autoround.save_quantized(output_dir=quantized_model_path, inplace=False, \
+            format="auto_awq", model_path="facebook/opt-125m")
+
+        from auto_round.auto_quantizer import AutoHfQuantizer
+        model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto")
+        tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
+        text = "There is a girl who likes adventure,"
+        inputs = tokenizer(text, return_tensors="pt").to(model.device)
+        print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]))
+        shutil.rmtree("./saved", ignore_errors=True)
+
     # def test_autoround_marlin_format(self):
     #     if not torch.cuda.is_available():
     #         return