support autoawq format #115

Merged: 48 commits, Aug 5, 2024
Commits
c7770af
add file
yintong-lu May 21, 2024
008899d
add file
yintong-lu May 21, 2024
49deb12
update
yintong-lu May 21, 2024
b1b729d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 23, 2024
9e6cf36
update
yintong-lu May 23, 2024
86b9b96
fix import error
yintong-lu May 23, 2024
10a91e6
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 23, 2024
4807994
minor change
yintong-lu May 23, 2024
5822543
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 23, 2024
991b5e9
update
yintong-lu May 23, 2024
4051f55
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 23, 2024
f966812
fix pylint error
yintong-lu May 23, 2024
5a1188c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 23, 2024
2942bf9
Merge branch 'main' into lyt/autoawq
yintong-lu May 24, 2024
9c4f73b
minor fix
yintong-lu May 24, 2024
05d6e1d
fix pylint issue
yintong-lu May 24, 2024
72b48b1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 24, 2024
40e8ce7
update import of awq
yintong-lu May 24, 2024
55b05e6
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 24, 2024
07c0f18
fix import error
yintong-lu May 24, 2024
8184d84
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 24, 2024
02687ac
modify code
yintong-lu May 27, 2024
b4b6fa3
Merge branch 'main' into lyt/autoawq
yintong-lu May 27, 2024
6d6b8b1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 27, 2024
e91fb20
fix doc typo
yintong-lu May 27, 2024
a775b21
fix mixtral issue
yintong-lu May 27, 2024
6e76f05
Merge branch 'main' into lyt/autoawq
yintong-lu May 27, 2024
fc39b70
mv awq to autoround format, fix mixtral issue and follow autoround fo…
yintong-lu May 28, 2024
3480b8d
minor fix
yintong-lu May 28, 2024
406410c
fix conflicts
yintong-lu May 29, 2024
949a262
fix conflicts
yintong-lu Jun 11, 2024
81576c5
mv to autoround format
yintong-lu Jun 14, 2024
1267b5f
Merge branch 'main' into lyt/autoawq
yintong-lu Jun 25, 2024
a5ceb0b
minor fix
yintong-lu Jun 25, 2024
7d505a0
typo
yintong-lu Jun 25, 2024
56e60d7
remove comments
yintong-lu Jun 25, 2024
3385eb9
update comments
yintong-lu Jun 28, 2024
5ec30d4
move awq to autoround format evaluation
yintong-lu Jul 3, 2024
d5537f4
pylint error fixing
yintong-lu Jul 3, 2024
821a1b9
fix conflicts, and add awq format
yintong-lu Jul 24, 2024
7980a58
fix conflicts
yintong-lu Jul 24, 2024
5b271ef
Merge branch 'main' into lyt/autoawq
yintong-lu Jul 24, 2024
1f87cad
add ut
yintong-lu Jul 24, 2024
c888e12
add requirement
yintong-lu Jul 24, 2024
1bc88db
add coverage test waiver
yintong-lu Jul 24, 2024
bee724a
minor change
yintong-lu Jul 24, 2024
949c48a
refine code to decrease number of branches
yintong-lu Jul 25, 2024
7794386
add ut, fix minor issues
yintong-lu Jul 30, 2024
2 changes: 1 addition & 1 deletion auto_round/export/__init__.py
@@ -16,5 +16,5 @@
from auto_round.export.export_to_autogptq.export import save_quantized_as_autogptq
from .export_to_itrex import save_quantized_as_itrex, QuantConfig
from .export_to_autoround.export import save_quantized_as_autoround

from .export_to_awq.export import save_quantized_as_autoawq

133 changes: 100 additions & 33 deletions auto_round/export/export_to_autoround/export.py
@@ -41,16 +41,12 @@ def check_neq_config(config, data_type, bits, group_size, sym):
Returns:
list: A list of strings indicating which configuration parameters do not match.
"""
res = []
if data_type != config["data_type"]:
res.append("data_type")
if bits != config["bits"]:
res.append("bits")
if group_size != config["group_size"]:
res.append("group_size")
if sym != config["sym"]:
res.append("sym")
return res
expected_config = {"data_type": data_type,
"bits": bits,
"group_size": group_size,
"sym": sym
}
return [key for key, expected_value in expected_config.items() if config.get(key) != expected_value]

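As an aside, here is a self-contained sketch of the refactored comparison with a made-up layer config, showing what the returned list looks like (the config values are illustrative only):

# Illustrative sketch, not part of this PR's diff: a stand-alone copy of the
# refactored comparison logic, called with a made-up layer config.
def check_neq_config(config, data_type, bits, group_size, sym):
    expected_config = {"data_type": data_type, "bits": bits, "group_size": group_size, "sym": sym}
    # Return the keys whose stored value differs from the expected value.
    return [key for key, expected_value in expected_config.items() if config.get(key) != expected_value]

layer_cfg = {"data_type": "int", "bits": 8, "group_size": 128, "sym": False}
print(check_neq_config(layer_cfg, data_type="int", bits=4, group_size=128, sym=False))  # -> ['bits']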

def get_autogptq_packing_qlinear(backend, bits=4, group_size=128, sym=False):
@@ -89,6 +85,7 @@ def get_autogptq_packing_qlinear(backend, bits=4, group_size=128, sym=False):
disable_exllamav2 = True
disable_exllamav1 = True


from auto_gptq.utils.import_utils import dynamically_import_QuantLinear # pylint: disable=E0401
QuantLinear = dynamically_import_QuantLinear(
use_triton=use_triton,
@@ -119,16 +116,22 @@ def dynamic_import_quantLinear_for_packing(backend, bits, group_size, sym):
Raises:
AssertionError: If the backend is not supported.
"""
if "auto_round" in backend:
if "auto_round" in backend and "awq" not in backend:
##only support triton and exllamav2
if not ("triton" in backend or "exllamav2" in backend):
logger.warning_once(f"autoround format does not support {backend}, try to pack with autogptq")
return get_autogptq_packing_qlinear(backend, bits, group_size, sym)
from auto_round_extension.cuda.qliner_triton import QuantLinear
return QuantLinear
elif "awq" in backend:
try:
from awq.modules.linear import WQLinear_GEMM # pylint: disable=E0401
return WQLinear_GEMM
except:
logger.error("autoawq is required. Please install it by 'pip install autoawq' to support auto_awq format.")
return
elif "gptq" in backend:
return get_autogptq_packing_qlinear(backend, bits, group_size, sym)

else:
assert False, f"only support gptq and autoround backend"

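In short, the packing class is chosen from the backend string. A condensed sketch of that dispatch follows; it assumes autoawq and the auto-round Triton extension are installed, and it skips the AutoGPTQ fallback that the real function handles:

# Condensed sketch of the backend dispatch (not part of this PR's diff).
def pick_packing_qlinear(backend: str):
    if "awq" in backend:
        # Packing into AutoAWQ's GEMM layout requires `pip install autoawq`.
        from awq.modules.linear import WQLinear_GEMM  # pylint: disable=E0401
        return WQLinear_GEMM
    if "auto_round" in backend and ("triton" in backend or "exllamav2" in backend):
        from auto_round_extension.cuda.qliner_triton import QuantLinear
        return QuantLinear
    # The real implementation falls back to AutoGPTQ packing kernels here.
    raise NotImplementedError(f"sketch does not cover backend {backend!r}")

So any backend string containing "awq" routes packing to WQLinear_GEMM, while the existing auto_round Triton/ExLlamaV2 path is untouched.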
@@ -160,7 +163,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex
backend = "autoround:exllamav2"
backend = backend.replace("autoround", "auto_round")
backend = backend.replace("auto-round", "auto_round")
if not ("triton" in backend or "exllamav2" in backend):
if not ("triton" in backend or "exllamav2" in backend or "awq" in backend):
logger.info(f"autoround format does not support {backend}, try to pack with autogptq")
backend = backend.replace("auto_round", "auto_gptq")

@@ -174,7 +177,8 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex
layer_config = kwargs["layer_config"]
quantization_config = kwargs["serialization_dict"]
quantization_config["quant_method"] = "intel/auto-round"
quantization_config["backend"] = backend
if "awq" not in backend:
quantization_config["backend"] = backend
extra_config = {}
for layer_name in layer_config:
if layer_name not in layer_names_in_block and layer_config[layer_name]["bits"] <= 8: ##lm head
@@ -198,9 +202,12 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex
if len(extra_config) > 0:
quantization_config["extra_config"] = extra_config
with tctl.threadpool_limits(limits=1):
modules_to_not_convert = []
for name in layer_config.keys():
config = kwargs["layer_config"][name]
if config["bits"] > 8:
if "awq" in backend:
modules_to_not_convert.append(name)
continue
logger.info(f"packing {name}")

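To make the new bookkeeping concrete, here is a small stand-alone sketch (with a made-up layer_config; the layer names are hypothetical) of how layers kept above 8 bits end up in modules_to_not_convert on the AWQ path instead of being packed:

# Illustrative sketch, not part of this PR's diff.
layer_config = {
    "model.layers.0.self_attn.q_proj": {"bits": 4},
    "lm_head": {"bits": 16},  # hypothetical: lm_head kept in full precision
}
backend = "awq"
modules_to_not_convert = []
for name, cfg in layer_config.items():
    if cfg["bits"] > 8:
        if "awq" in backend:
            modules_to_not_convert.append(name)  # recorded so AutoAWQ will not convert it at load time
        continue  # either way, this layer is not packed
    # ... packing of quantized layers happens here ...
print(modules_to_not_convert)  # -> ['lm_head']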
@@ -224,33 +231,55 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex
out_features = layer.weight.shape[1]
bias = layer.bias is not None and torch.any(layer.bias)

new_layer = QuantLinear(  ##pylint: disable=E1123
    bits, group_size, in_features, out_features, bias, weight_dtype=layer.weight.dtype
)

new_layer.device = device
set_module(model, name, new_layer)
qlayer = new_layer
scale = layer_config[name]["scale"]
zero = layer_config[name]["zp"]
# so far can only pack layer on CPU
qlayer.to("cpu")
##force to float32 to be compatible with torch 2.0
layer, scale, zero = layer.to("cpu"), scale.to("cpu"), zero.to("cpu").to(torch.float32)
sig = inspect.signature(qlayer.pack)
param_count = len(sig.parameters)
if param_count == 2:
    qlayer.pack(layer, scale)
else:
    qlayer.pack(layer, scale, zero, None)
qlayer.to(device)
if "awq" not in backend:
    new_layer = QuantLinear(  ##pylint: disable=E1123
        bits, group_size, in_features, out_features, bias, weight_dtype=layer.weight.dtype
    )
    new_layer.device = device
    set_module(model, name, new_layer)
    qlayer = new_layer
    scale = layer_config[name]["scale"]
    zero = layer_config[name]["zp"]
    # so far can only pack layer on CPU
    qlayer.to("cpu")
    ##force to float32 to be compatible with torch 2.0
    layer, scale, zero = layer.to("cpu"), scale.to("cpu"), zero.to("cpu").to(torch.float32)
    sig = inspect.signature(qlayer.pack)
    param_count = len(sig.parameters)
    if param_count == 2:
        qlayer.pack(layer, scale)
    else:
        qlayer.pack(layer, scale, zero, None)
    qlayer.to(device)
else:
    from awq.utils.utils import clear_memory  # pylint: disable=E0401
    scale, zp = layer_config[name]["scale"].to(torch.float32), layer_config[name]["zp"].to(torch.float32)
    scale = scale.t().contiguous()
    zp = zp.t().contiguous()
    if bits != 4:
        logger.error("AutoAWQ format only supports 4-bits quantization.")
    qlayer = QuantLinear.from_linear(
        linear=layer,
        w_bit=bits,
        group_size=group_size,
        init_only=False,
        scales=scale,
        zeros=zp,
    )
    qlayer.to(device)
    set_module(model, name, qlayer)
    clear_memory()


if hasattr(model, "config"):
model.config.quantization_config = quantization_config
tokenizer = kwargs["tokenizer"]
if tokenizer is not None:
tokenizer.save_pretrained(output_dir)
save(model, output_dir, safe_serialization=safe_serialization)
if "awq" not in backend:
save(model, output_dir, safe_serialization=safe_serialization)
else:
save_awq(model, output_dir, modules_to_not_convert=modules_to_not_convert)


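Once save_awq has written the checkpoint, something along the following lines should be able to load it back; the output directory name here is hypothetical and loading requires `pip install autoawq`:

# Hypothetical usage sketch (not part of this PR's diff): load the exported
# AutoAWQ-format checkpoint with the autoawq runtime.
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

quant_path = "./tmp_autoround_awq"  # wherever save_quantized_as_autoround wrote the model
model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=False)
tokenizer = AutoTokenizer.from_pretrained(quant_path)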
def save(model: nn.Module, save_dir: str, max_shard_size: str = "5GB", safe_serialization: bool = True):
@@ -282,3 +311,41 @@ def save(model: nn.Module, save_dir: str, max_shard_size: str = "5GB", safe_seri



def save_awq(
model: nn.Module,
save_dir: str,
max_shard_size: str = "5GB",
safe_serialization: bool = True,
modules_to_not_convert: list = [],
):
"""Save model state dict and configs.

Args:
model (`nn.Module`):
Model to be saved. The model can be wrapped or unwrapped.
save_dir (`str`):
Directory to which to save. Will be created if it doesn't exist.
max_shard_size (`str`, defaults to `"10GB"`):
The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size
lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5MB"`).
<Tip warning={true}>

If a single weight of the model is bigger than `max_shard_size`, it will be in its own checkpoint shard
which will be bigger than `max_shard_size`.

</Tip>
safe_serialization (`bool`, defaults to `True`):
Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
"""
os.makedirs(save_dir, exist_ok=True)
quantization_config = model.config.quantization_config
model.config.quantization_config["quant_method"] = "awq"
model.config.quantization_config["modules_to_not_convert"] = None if not modules_to_not_convert \
else modules_to_not_convert
model.save_pretrained(save_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization)
config_file = "quantization_config.json"
if hasattr(model, "config") and hasattr(model.config, "quantization_config"):
with open(os.path.join(save_dir, config_file), "w", encoding="utf-8") as f:
json.dump(quantization_config, f, indent=2)


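For orientation, the quantization_config that save_awq writes out ends up looking roughly like the illustrative dict below; only quant_method and modules_to_not_convert are pinned here, everything else is whatever the caller already stored in model.config.quantization_config:

# Illustrative only (field values other than quant_method and
# modules_to_not_convert depend on the caller's serialization_dict).
quantization_config = {
    "quant_method": "awq",
    "modules_to_not_convert": ["lm_head"],  # hypothetical; None when every layer is packed
    # plus the bits / group_size / sym / data_type entries supplied by the caller
}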
17 changes: 17 additions & 0 deletions auto_round/export/export_to_awq/__init__.py
@@ -0,0 +1,17 @@
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .export import save_quantized_as_autoawq

