From c7770af9afc37ac8c8d368fb213a709ae9f37731 Mon Sep 17 00:00:00 2001 From: yintong-lu Date: Tue, 21 May 2024 03:10:54 -0400 Subject: [PATCH 01/40] add file Signed-off-by: yintong-lu --- auto_round/export/export_to_awq.py | 234 +++++++++++++++++++++++++++++ 1 file changed, 234 insertions(+) create mode 100644 auto_round/export/export_to_awq.py diff --git a/auto_round/export/export_to_awq.py b/auto_round/export/export_to_awq.py new file mode 100644 index 00000000..35374b01 --- /dev/null +++ b/auto_round/export/export_to_awq.py @@ -0,0 +1,234 @@ +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import copy +import json +import os +from os.path import isdir, isfile, join +from typing import Dict, List, Optional, Union +import torch.nn as nn + +# MIT License +# +# Copyright (c) 2023 潘其威(William) +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +import torch + +from auto_round.export.register import register_format +from auto_round.utils import ( + check_to_quantized, + get_block_names, + get_module, + get_module_name, + get_named_linears, + set_op_by_name, + logger +) + + + +@register_format("auto_awq") +def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs): + """Export the model to autogptq format to easily leverage cuda kernel.""" + model = kwargs["model"] + weight_config = kwargs["weight_config"] + sym = kwargs["sym"] + bits = kwargs["bits"] + group_size = kwargs["group_size"] + iters = kwargs["iters"] + lr = kwargs["lr"] + minmax_lr = kwargs["minmax_lr"] + enable_minmax_tuning = kwargs["enable_minmax_tuning"] + enable_quanted_input = kwargs["enable_quanted_input"] + scale_dtype = kwargs["scale_dtype"] + tokenizer = kwargs["tokenizer"] + supported_types = kwargs["supported_types"] + + logger.info("Saving quantized model to autoawq format") + if tokenizer is not None: + tokenizer.save_pretrained(output_dir) + ##check module quantized in block, this may have bug for mixed precision quantization + block_name = get_block_names(model)[0] + first_block = get_module(model, block_name) + all_to_quantized = True + modules_in_block_to_quantize = [] + for n, m in first_block.named_modules(): + is_supported_type = False + for supported_type in supported_types: + if isinstance(m, supported_type): + is_supported_type = True + break + if not is_supported_type: + continue + if not check_to_quantized(m): + all_to_quantized = False + else: + modules_in_block_to_quantize.append(n) + modules_in_block_to_quantize = [modules_in_block_to_quantize] + if all_to_quantized: + modules_in_block_to_quantize = None + + if inplace: + compressed_model = model.to("cpu") + else: + compressed_model = copy.deepcopy(model.to("cpu")) + + from awq.modules.linear import WQLinear_GEMM + from awq.utils.utils import clear_memory + from awq import AutoAWQForCausalLM + import sys + + q_linear_module = WQLinear_GEMM + awq_model = AutoAWQForCausalLM.from_pretrained(model_path) + logger.info(f"lyt_debug Approximate memory usage of compressed_model: {sizeof_fmt(sys.getsizeof(compressed_model))}, device: {compressed_model.device}") + logger.info(f"lyt_debug Approximate memory usage of model: {sizeof_fmt(sys.getsizeof(model))}, device: {model.device}") + try: + logger.info(f"lyt_debug Approximate memory usage of awq_model: {sizeof_fmt(sys.getsizeof(awq_model))}") + except: + logger.info("lyt_debug awq model unable to calc") + self_modules = awq_model.get_model_layers(compressed_model) + for i in range(len(self_modules)): + module = self_modules[i] + named_linears = get_named_linears(module) + for name, linear_layer in named_linears.items(): + key = get_module_name(compressed_model, linear_layer) + info = weight_config[key] + if not check_to_quantized(info): + continue + info["zp"] = info["zp"].to(torch.float32) + scale, zp = info['scale'], info['zp'] + scale = scale.t().contiguous() + zp = zp.t().contiguous() + q_linear = q_linear_module.from_linear( + linear=linear_layer, + w_bit=bits, + group_size=group_size, + init_only=False, + scales=scale, + zeros=zp, + ) + linear_layer.cpu() + q_linear.to(next(module.parameters()).device) + set_op_by_name(module, name, q_linear) + clear_memory() + + quant_config = { + "quant_method": 'awq', + "zero_point": not sym, + "group_size": group_size, + "bits": bits, + "version": 'gemm', + "modules_to_not_convert": None, + } + + save_quantized(compressed_model, save_dir=output_dir, quant_config=quant_config) + + +from transformers.modeling_utils import shard_checkpoint +from safetensors.torch import save_file +def save_quantized( + model, + save_dir, + quant_config, + safetensors=True, + shard_size="5GB", +): + save_dir = save_dir[:-1] if save_dir[-1] == "/" else save_dir + + # Save model + class EmptyModule(nn.Module): + def __init__(self): + super(EmptyModule, self).__init__() + + def forward(self, x): + return x + + # Save model and config files with empty state dict + from awq.models._config import AwqConfig + + model.config.quantization_config = quant_config + model.generation_config.do_sample = True + model.save_pretrained(save_dir, state_dict=EmptyModule().state_dict()) + + # Remove empty state dict + default_paths = [ + f"{save_dir}/model.safetensors", + f"{save_dir}/pytorch_model.bin", + ] + for path in default_paths: + if os.path.exists(path): + os.remove(path) + + # model_name has no extension, add it when saving state_dict + model_name = "model.safetensors" if safetensors else "pytorch_model.bin" + + # shard checkpoint into chunks (10GB default) + shards, index = shard_checkpoint( + model.state_dict(), max_shard_size=shard_size, weights_name=model_name + ) + + for shard_file, shard in shards.items(): + if safetensors: + # safetensors must be in the same memory, so we duplicate and use contiguous memory + shard = {k: v.clone().contiguous() for k, v in shard.items()} + save_file( + shard, os.path.join(save_dir, shard_file), metadata={"format": "pt"} + ) + else: + torch.save(shard, os.path.join(save_dir, shard_file)) + + # save shard index + if index is not None: + with open(f"{save_dir}/{model_name}.index.json", "w+") as file: + file.write(json.dumps(index, indent=4)) + + with open(join(save_dir, "quantize_config.json"), "w", encoding="utf-8") as f: + json.dump(quant_config, f, indent=2) + + + + + + + + + + +def sizeof_fmt(num, suffix='B'): + for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']: + if abs(num) < 1024.0: + return f"{num:3.1f}{unit}{suffix}" + num /= 1024.0 + return f"{num:.1f}Yi{suffix}" + + + From 008899d5130552188d780d034497d79a5171d9dd Mon Sep 17 00:00:00 2001 From: yintong-lu Date: Tue, 21 May 2024 03:13:07 -0400 Subject: [PATCH 02/40] add file Signed-off-by: yintong-lu --- auto_round/export/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/auto_round/export/__init__.py b/auto_round/export/__init__.py index 9b4384ff..268f8096 100644 --- a/auto_round/export/__init__.py +++ b/auto_round/export/__init__.py @@ -15,3 +15,4 @@ from .register import EXPORT_FORMAT from .export_to_autogptq import save_quantized_as_autogptq from .export_to_itrex import save_quantized_as_itrex, QuantConfig +from .export_to_awq import save_quantized_as_autoawq From 49deb12ba3ff6582efdb75e1b2bb54a78484b445 Mon Sep 17 00:00:00 2001 From: yintong-lu Date: Tue, 21 May 2024 04:13:32 -0400 Subject: [PATCH 03/40] update Signed-off-by: yintong-lu --- auto_round/export/export_to_awq.py | 16 ++++----- auto_round/utils.py | 39 +++++++++++++++++++++ examples/language-modeling/main.py | 11 ++++++ examples/language-modeling/requirements.txt | 2 +- 4 files changed, 58 insertions(+), 10 deletions(-) diff --git a/auto_round/export/export_to_awq.py b/auto_round/export/export_to_awq.py index 35374b01..ab74cc56 100644 --- a/auto_round/export/export_to_awq.py +++ b/auto_round/export/export_to_awq.py @@ -105,17 +105,11 @@ def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs): from awq.modules.linear import WQLinear_GEMM from awq.utils.utils import clear_memory from awq import AutoAWQForCausalLM - import sys q_linear_module = WQLinear_GEMM awq_model = AutoAWQForCausalLM.from_pretrained(model_path) - logger.info(f"lyt_debug Approximate memory usage of compressed_model: {sizeof_fmt(sys.getsizeof(compressed_model))}, device: {compressed_model.device}") - logger.info(f"lyt_debug Approximate memory usage of model: {sizeof_fmt(sys.getsizeof(model))}, device: {model.device}") - try: - logger.info(f"lyt_debug Approximate memory usage of awq_model: {sizeof_fmt(sys.getsizeof(awq_model))}") - except: - logger.info("lyt_debug awq model unable to calc") self_modules = awq_model.get_model_layers(compressed_model) + del awq_model # release memory for i in range(len(self_modules)): module = self_modules[i] named_linears = get_named_linears(module) @@ -173,8 +167,6 @@ def forward(self, x): return x # Save model and config files with empty state dict - from awq.models._config import AwqConfig - model.config.quantization_config = quant_config model.generation_config.do_sample = True model.save_pretrained(save_dir, state_dict=EmptyModule().state_dict()) @@ -211,6 +203,7 @@ def forward(self, x): with open(f"{save_dir}/{model_name}.index.json", "w+") as file: file.write(json.dumps(index, indent=4)) + # save quantize_config with open(join(save_dir, "quantize_config.json"), "w", encoding="utf-8") as f: json.dump(quant_config, f, indent=2) @@ -232,3 +225,8 @@ def sizeof_fmt(num, suffix='B'): +def get_size(model): + total = 0 + for param in model.parameters(): + total += param.nelement() * param.element_size() + return total \ No newline at end of file diff --git a/auto_round/utils.py b/auto_round/utils.py index 0e2b832f..8312248e 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -743,3 +743,42 @@ def check_memory_availability(device, inputs, weight, org_seqlen, org_bs): bs = 1 return False, seqlen, bs + + +def get_named_linears(module): + """Get the name, linear_op pairs of a given module. + + Args: + module: A module to be searched. + """ + return {name: m for name, m in module.named_modules() if isinstance(m, torch.nn.Linear)} + + +def set_op_by_name(layer, name, new_module): + levels = name.split(".") + if len(levels) > 1: + mod_ = layer + for l_idx in range(len(levels) - 1): + if levels[l_idx].isdigit(): + mod_ = mod_[int(levels[l_idx])] + else: + mod_ = getattr(mod_, levels[l_idx]) + setattr(mod_, levels[-1], new_module) + else: + setattr(layer, name, new_module) + + +def get_module_name(model, module_to_find): + """Get the name of a given module in a model. + + Args: + model: The model. + module_to_find: A module to be found. + + Returns: + name: The corresponding name of the given module. + """ + for name, module in model.named_modules(): + if module is module_to_find: + return name + return None diff --git a/examples/language-modeling/main.py b/examples/language-modeling/main.py index 90a83d93..044c6bb7 100644 --- a/examples/language-modeling/main.py +++ b/examples/language-modeling/main.py @@ -2,6 +2,8 @@ import sys sys.path.insert(0, '../..') +sys.path.insert(0, '/home/lyt/ChineseLLM_quant/AR_debug/auto-round') +print(f"lyt_dbeug sys.path: {sys.path}") parser = argparse.ArgumentParser() import torch import os @@ -141,6 +143,14 @@ import subprocess + import logging + + logging.basicConfig(level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') + logger = logging.getLogger(__name__) + logger.info("lyt_debug This is an info message") + + def get_library_version(library_name): try: version = subprocess.check_output(['pip', 'show', library_name]).decode().split('\n')[1].split(': ')[1] @@ -309,6 +319,7 @@ def get_library_version(library_name): autoround.save_quantized(f'{export_dir}-xpu', format="itrex_xpu", use_triton=True, inplace=inplace, compression_dtype=torch.int8, compression_dim=0, use_optimum_format=False, device="xpu") + autoround.save_quantized(f'{export_dir}-awq', format="auto_awq", inplace=inplace, model_path=args.model_name) if "cpu" in deployment_device: autoround.save_quantized(output_dir=f'{export_dir}-cpu', format='itrex', inplace=inplace) if "fake" in deployment_device: diff --git a/examples/language-modeling/requirements.txt b/examples/language-modeling/requirements.txt index 9b0df5e0..b6ddd19c 100644 --- a/examples/language-modeling/requirements.txt +++ b/examples/language-modeling/requirements.txt @@ -15,4 +15,4 @@ auto-gptq openpyxl wandb py-cpuinfo - +autoawq From b1b729dd3c36a2ef491824c0b8cca1ce3b465a76 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 23 May 2024 02:48:18 +0000 Subject: [PATCH 04/40] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/export/export_to_awq.py | 60 ++++++++++++------------------ 1 file changed, 24 insertions(+), 36 deletions(-) diff --git a/auto_round/export/export_to_awq.py b/auto_round/export/export_to_awq.py index ab74cc56..895e8df6 100644 --- a/auto_round/export/export_to_awq.py +++ b/auto_round/export/export_to_awq.py @@ -18,7 +18,6 @@ import os from os.path import isdir, isfile, join from typing import Dict, List, Optional, Union -import torch.nn as nn # MIT License # @@ -42,20 +41,20 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. import torch +import torch.nn as nn from auto_round.export.register import register_format from auto_round.utils import ( - check_to_quantized, - get_block_names, - get_module, + check_to_quantized, + get_block_names, + get_module, get_module_name, get_named_linears, + logger, set_op_by_name, - logger ) - @register_format("auto_awq") def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs): """Export the model to autogptq format to easily leverage cuda kernel.""" @@ -102,14 +101,14 @@ def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs): else: compressed_model = copy.deepcopy(model.to("cpu")) + from awq import AutoAWQForCausalLM from awq.modules.linear import WQLinear_GEMM from awq.utils.utils import clear_memory - from awq import AutoAWQForCausalLM q_linear_module = WQLinear_GEMM awq_model = AutoAWQForCausalLM.from_pretrained(model_path) self_modules = awq_model.get_model_layers(compressed_model) - del awq_model # release memory + del awq_model # release memory for i in range(len(self_modules)): module = self_modules[i] named_linears = get_named_linears(module) @@ -119,7 +118,7 @@ def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs): if not check_to_quantized(info): continue info["zp"] = info["zp"].to(torch.float32) - scale, zp = info['scale'], info['zp'] + scale, zp = info["scale"], info["zp"] scale = scale.t().contiguous() zp = zp.t().contiguous() q_linear = q_linear_module.from_linear( @@ -134,21 +133,23 @@ def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs): q_linear.to(next(module.parameters()).device) set_op_by_name(module, name, q_linear) clear_memory() - + quant_config = { - "quant_method": 'awq', - "zero_point": not sym, - "group_size": group_size, - "bits": bits, - "version": 'gemm', - "modules_to_not_convert": None, - } + "quant_method": "awq", + "zero_point": not sym, + "group_size": group_size, + "bits": bits, + "version": "gemm", + "modules_to_not_convert": None, + } save_quantized(compressed_model, save_dir=output_dir, quant_config=quant_config) -from transformers.modeling_utils import shard_checkpoint from safetensors.torch import save_file +from transformers.modeling_utils import shard_checkpoint + + def save_quantized( model, save_dir, @@ -184,17 +185,13 @@ def forward(self, x): model_name = "model.safetensors" if safetensors else "pytorch_model.bin" # shard checkpoint into chunks (10GB default) - shards, index = shard_checkpoint( - model.state_dict(), max_shard_size=shard_size, weights_name=model_name - ) + shards, index = shard_checkpoint(model.state_dict(), max_shard_size=shard_size, weights_name=model_name) for shard_file, shard in shards.items(): if safetensors: # safetensors must be in the same memory, so we duplicate and use contiguous memory shard = {k: v.clone().contiguous() for k, v in shard.items()} - save_file( - shard, os.path.join(save_dir, shard_file), metadata={"format": "pt"} - ) + save_file(shard, os.path.join(save_dir, shard_file), metadata={"format": "pt"}) else: torch.save(shard, os.path.join(save_dir, shard_file)) @@ -206,27 +203,18 @@ def forward(self, x): # save quantize_config with open(join(save_dir, "quantize_config.json"), "w", encoding="utf-8") as f: json.dump(quant_config, f, indent=2) - - - - - - - - -def sizeof_fmt(num, suffix='B'): - for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']: +def sizeof_fmt(num, suffix="B"): + for unit in ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]: if abs(num) < 1024.0: return f"{num:3.1f}{unit}{suffix}" num /= 1024.0 return f"{num:.1f}Yi{suffix}" - def get_size(model): total = 0 for param in model.parameters(): total += param.nelement() * param.element_size() - return total \ No newline at end of file + return total From 9e6cf36e153c28bdee725d0c941910157a417461 Mon Sep 17 00:00:00 2001 From: yintong-lu Date: Wed, 22 May 2024 22:58:16 -0400 Subject: [PATCH 05/40] update Signed-off-by: yintong-lu --- auto_round/export/export_to_awq.py | 2 +- examples/language-modeling/main.py | 12 +----------- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/auto_round/export/export_to_awq.py b/auto_round/export/export_to_awq.py index 895e8df6..8ce5fdfa 100644 --- a/auto_round/export/export_to_awq.py +++ b/auto_round/export/export_to_awq.py @@ -155,7 +155,7 @@ def save_quantized( save_dir, quant_config, safetensors=True, - shard_size="5GB", + shard_size="10GB", ): save_dir = save_dir[:-1] if save_dir[-1] == "/" else save_dir diff --git a/examples/language-modeling/main.py b/examples/language-modeling/main.py index 044c6bb7..6a3c12bf 100644 --- a/examples/language-modeling/main.py +++ b/examples/language-modeling/main.py @@ -2,8 +2,6 @@ import sys sys.path.insert(0, '../..') -sys.path.insert(0, '/home/lyt/ChineseLLM_quant/AR_debug/auto-round') -print(f"lyt_dbeug sys.path: {sys.path}") parser = argparse.ArgumentParser() import torch import os @@ -143,14 +141,6 @@ import subprocess - import logging - - logging.basicConfig(level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') - logger = logging.getLogger(__name__) - logger.info("lyt_debug This is an info message") - - def get_library_version(library_name): try: version = subprocess.check_output(['pip', 'show', library_name]).decode().split('\n')[1].split(': ')[1] @@ -319,13 +309,13 @@ def get_library_version(library_name): autoround.save_quantized(f'{export_dir}-xpu', format="itrex_xpu", use_triton=True, inplace=inplace, compression_dtype=torch.int8, compression_dim=0, use_optimum_format=False, device="xpu") - autoround.save_quantized(f'{export_dir}-awq', format="auto_awq", inplace=inplace, model_path=args.model_name) if "cpu" in deployment_device: autoround.save_quantized(output_dir=f'{export_dir}-cpu', format='itrex', inplace=inplace) if "fake" in deployment_device: model = model.to("cpu") model.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) + autoround.save_quantized(f'{export_dir}-awq', format="auto_awq", inplace=inplace, model_path=args.model_name) if not args.disable_eval and "fake" in deployment_device: ##support autogptq real eval later excel_name = f"{output_dir}_result.xlsx" From 86b9b96a8efd67384cf0721986a7e261f3cafb0c Mon Sep 17 00:00:00 2001 From: yintong-lu Date: Thu, 23 May 2024 03:42:22 -0400 Subject: [PATCH 06/40] fix import error Signed-off-by: yintong-lu --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 09874574..cdc39334 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,4 @@ py-cpuinfo sentencepiece torch transformers +autoawq \ No newline at end of file From 10a91e686b8ff5235ad8823dce45c1493a861f2c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 23 May 2024 07:43:20 +0000 Subject: [PATCH 07/40] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index cdc39334..eb65a0c9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ accelerate auto-gptq +autoawq datasets py-cpuinfo sentencepiece torch transformers -autoawq \ No newline at end of file From 4807994b9a64c3ffb82a048862da6a6343dbad2a Mon Sep 17 00:00:00 2001 From: yintong-lu Date: Thu, 23 May 2024 04:05:45 -0400 Subject: [PATCH 08/40] minor change --- auto_round/export/export_to_awq.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/auto_round/export/export_to_awq.py b/auto_round/export/export_to_awq.py index 8ce5fdfa..e2c822a0 100644 --- a/auto_round/export/export_to_awq.py +++ b/auto_round/export/export_to_awq.py @@ -204,17 +204,3 @@ def forward(self, x): with open(join(save_dir, "quantize_config.json"), "w", encoding="utf-8") as f: json.dump(quant_config, f, indent=2) - -def sizeof_fmt(num, suffix="B"): - for unit in ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]: - if abs(num) < 1024.0: - return f"{num:3.1f}{unit}{suffix}" - num /= 1024.0 - return f"{num:.1f}Yi{suffix}" - - -def get_size(model): - total = 0 - for param in model.parameters(): - total += param.nelement() * param.element_size() - return total From 5822543c09b7d31baca018d3eee2fe1b78694f1a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 23 May 2024 08:07:32 +0000 Subject: [PATCH 09/40] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/export/export_to_awq.py | 1 - 1 file changed, 1 deletion(-) diff --git a/auto_round/export/export_to_awq.py b/auto_round/export/export_to_awq.py index e2c822a0..228ad663 100644 --- a/auto_round/export/export_to_awq.py +++ b/auto_round/export/export_to_awq.py @@ -203,4 +203,3 @@ def forward(self, x): # save quantize_config with open(join(save_dir, "quantize_config.json"), "w", encoding="utf-8") as f: json.dump(quant_config, f, indent=2) - From 991b5e9f949463a4998ee6d4a5d61ee14c90adac Mon Sep 17 00:00:00 2001 From: yintong-lu Date: Thu, 23 May 2024 05:06:50 -0400 Subject: [PATCH 10/40] update Signed-off-by: yintong-lu --- auto_round/export/export_to_awq.py | 91 ++++++++++++++++++++++++------ auto_round/utils.py | 38 +------------ requirements.txt | 1 - 3 files changed, 75 insertions(+), 55 deletions(-) diff --git a/auto_round/export/export_to_awq.py b/auto_round/export/export_to_awq.py index 228ad663..58bf0799 100644 --- a/auto_round/export/export_to_awq.py +++ b/auto_round/export/export_to_awq.py @@ -20,19 +20,19 @@ from typing import Dict, List, Optional, Union # MIT License -# -# Copyright (c) 2023 潘其威(William) -# + +# Copyright (c) 2023 MIT HAN Lab + # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: -# + # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. -# + # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -48,10 +48,8 @@ check_to_quantized, get_block_names, get_module, - get_module_name, - get_named_linears, logger, - set_op_by_name, + convert_dtype_torch2str_hf ) @@ -134,14 +132,20 @@ def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs): set_op_by_name(module, name, q_linear) clear_memory() - quant_config = { - "quant_method": "awq", - "zero_point": not sym, - "group_size": group_size, - "bits": bits, - "version": "gemm", - "modules_to_not_convert": None, - } + quant_config = {} + quant_config["quant_method"] = "awq" + quant_config["modules_to_not_convert"] = None + quant_config["version"] = "gemm" + quant_config["iters"] = iters + quant_config["lr"] = lr + quant_config["minmax_lr"] = minmax_lr + quant_config["enable_minmax_tuning"] = enable_minmax_tuning + quant_config["enable_quanted_input"] = enable_quanted_input + quant_config["scale_dtype"] = convert_dtype_torch2str_hf(scale_dtype) + quant_config["sym"] = sym + quant_config["bits"] = bits + quant_config["group_size"] = group_size + quant_config["zero_point"] = not sym save_quantized(compressed_model, save_dir=output_dir, quant_config=quant_config) @@ -168,7 +172,16 @@ def forward(self, x): return x # Save model and config files with empty state dict - model.config.quantization_config = quant_config + awq_quant_config = { + "quant_method": "awq", + "zero_point": quant_config["zero_point"], + "group_size": quant_config["group_size"], + "bits": quant_config["bits"], + "version": "gemm", + "modules_to_not_convert": None, + } + + model.config.quantization_config = awq_quant_config model.generation_config.do_sample = True model.save_pretrained(save_dir, state_dict=EmptyModule().state_dict()) @@ -200,6 +213,50 @@ def forward(self, x): with open(f"{save_dir}/{model_name}.index.json", "w+") as file: file.write(json.dumps(index, indent=4)) + q # save quantize_config with open(join(save_dir, "quantize_config.json"), "w", encoding="utf-8") as f: json.dump(quant_config, f, indent=2) + + + +def get_named_linears(module): + """Get the name, linear_op pairs of a given module. + + Args: + module: A module to be searched. + """ + return {name: m for name, m in module.named_modules() if isinstance(m, torch.nn.Linear)} + + +def set_op_by_name(layer, name, new_module): + levels = name.split(".") + if len(levels) > 1: + mod_ = layer + for l_idx in range(len(levels) - 1): + if levels[l_idx].isdigit(): + mod_ = mod_[int(levels[l_idx])] + else: + mod_ = getattr(mod_, levels[l_idx]) + setattr(mod_, levels[-1], new_module) + else: + setattr(layer, name, new_module) + + +def get_module_name(model, module_to_find): + """Get the name of a given module in a model. + + Args: + model: The model. + module_to_find: A module to be found. + + Returns: + name: The corresponding name of the given module. + """ + for name, module in model.named_modules(): + if module is module_to_find: + return name + return None + + + diff --git a/auto_round/utils.py b/auto_round/utils.py index 8312248e..922be595 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -70,6 +70,7 @@ def __call__(self, *args, **kwargs): auto_gptq = LazyImport("auto_gptq") htcore = LazyImport("habana_frameworks.torch.core") +awq = LazyImport("autoawq") def is_optimum_habana_available(): @@ -745,40 +746,3 @@ def check_memory_availability(device, inputs, weight, org_seqlen, org_bs): return False, seqlen, bs -def get_named_linears(module): - """Get the name, linear_op pairs of a given module. - - Args: - module: A module to be searched. - """ - return {name: m for name, m in module.named_modules() if isinstance(m, torch.nn.Linear)} - - -def set_op_by_name(layer, name, new_module): - levels = name.split(".") - if len(levels) > 1: - mod_ = layer - for l_idx in range(len(levels) - 1): - if levels[l_idx].isdigit(): - mod_ = mod_[int(levels[l_idx])] - else: - mod_ = getattr(mod_, levels[l_idx]) - setattr(mod_, levels[-1], new_module) - else: - setattr(layer, name, new_module) - - -def get_module_name(model, module_to_find): - """Get the name of a given module in a model. - - Args: - model: The model. - module_to_find: A module to be found. - - Returns: - name: The corresponding name of the given module. - """ - for name, module in model.named_modules(): - if module is module_to_find: - return name - return None diff --git a/requirements.txt b/requirements.txt index eb65a0c9..09874574 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,5 @@ accelerate auto-gptq -autoawq datasets py-cpuinfo sentencepiece From 4051f55c557b112741afbb0389a80f627ec80c4e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 23 May 2024 09:07:49 +0000 Subject: [PATCH 11/40] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/export/export_to_awq.py | 38 +++++++++++------------------- auto_round/utils.py | 2 -- 2 files changed, 14 insertions(+), 26 deletions(-) diff --git a/auto_round/export/export_to_awq.py b/auto_round/export/export_to_awq.py index 58bf0799..b4bda892 100644 --- a/auto_round/export/export_to_awq.py +++ b/auto_round/export/export_to_awq.py @@ -19,6 +19,19 @@ from os.path import isdir, isfile, join from typing import Dict, List, Optional, Union +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +import torch +import torch.nn as nn + +from auto_round.export.register import register_format +from auto_round.utils import check_to_quantized, convert_dtype_torch2str_hf, get_block_names, get_module, logger + # MIT License # Copyright (c) 2023 MIT HAN Lab @@ -33,25 +46,6 @@ # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -import torch -import torch.nn as nn - -from auto_round.export.register import register_format -from auto_round.utils import ( - check_to_quantized, - get_block_names, - get_module, - logger, - convert_dtype_torch2str_hf -) - @register_format("auto_awq") def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs): @@ -172,7 +166,7 @@ def forward(self, x): return x # Save model and config files with empty state dict - awq_quant_config = { + awq_quant_config = { "quant_method": "awq", "zero_point": quant_config["zero_point"], "group_size": quant_config["group_size"], @@ -219,7 +213,6 @@ def forward(self, x): json.dump(quant_config, f, indent=2) - def get_named_linears(module): """Get the name, linear_op pairs of a given module. @@ -257,6 +250,3 @@ def get_module_name(model, module_to_find): if module is module_to_find: return name return None - - - diff --git a/auto_round/utils.py b/auto_round/utils.py index 922be595..f4602627 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -744,5 +744,3 @@ def check_memory_availability(device, inputs, weight, org_seqlen, org_bs): bs = 1 return False, seqlen, bs - - From f9668121a254e5eb21c0e9f1a27d8c5d12c51c11 Mon Sep 17 00:00:00 2001 From: yintong-lu Date: Thu, 23 May 2024 05:11:58 -0400 Subject: [PATCH 12/40] fix pylint error Signed-off-by: yintong-lu --- auto_round/export/export_to_awq.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/auto_round/export/export_to_awq.py b/auto_round/export/export_to_awq.py index b4bda892..0217621d 100644 --- a/auto_round/export/export_to_awq.py +++ b/auto_round/export/export_to_awq.py @@ -93,9 +93,9 @@ def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs): else: compressed_model = copy.deepcopy(model.to("cpu")) - from awq import AutoAWQForCausalLM - from awq.modules.linear import WQLinear_GEMM - from awq.utils.utils import clear_memory + from awq import AutoAWQForCausalLM # pylint: disable=E0401 + from awq.modules.linear import WQLinear_GEMM # pylint: disable=E0401 + from awq.utils.utils import clear_memory # pylint: disable=E0401 q_linear_module = WQLinear_GEMM awq_model = AutoAWQForCausalLM.from_pretrained(model_path) @@ -207,7 +207,6 @@ def forward(self, x): with open(f"{save_dir}/{model_name}.index.json", "w+") as file: file.write(json.dumps(index, indent=4)) - q # save quantize_config with open(join(save_dir, "quantize_config.json"), "w", encoding="utf-8") as f: json.dump(quant_config, f, indent=2) From 5a1188c7ad9bf1028b07f365c1b41f3917840e08 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 23 May 2024 09:12:44 +0000 Subject: [PATCH 13/40] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/export/export_to_awq.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/auto_round/export/export_to_awq.py b/auto_round/export/export_to_awq.py index 0217621d..ac4c66ff 100644 --- a/auto_round/export/export_to_awq.py +++ b/auto_round/export/export_to_awq.py @@ -93,9 +93,9 @@ def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs): else: compressed_model = copy.deepcopy(model.to("cpu")) - from awq import AutoAWQForCausalLM # pylint: disable=E0401 - from awq.modules.linear import WQLinear_GEMM # pylint: disable=E0401 - from awq.utils.utils import clear_memory # pylint: disable=E0401 + from awq import AutoAWQForCausalLM # pylint: disable=E0401 + from awq.modules.linear import WQLinear_GEMM # pylint: disable=E0401 + from awq.utils.utils import clear_memory # pylint: disable=E0401 q_linear_module = WQLinear_GEMM awq_model = AutoAWQForCausalLM.from_pretrained(model_path) From 9c4f73b3a7182f88afede6e20ca81c6732bf190a Mon Sep 17 00:00:00 2001 From: yintong-lu Date: Thu, 23 May 2024 23:20:48 -0400 Subject: [PATCH 14/40] minor fix Signed-off-by: yintong-lu --- examples/language-modeling/main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/language-modeling/main.py b/examples/language-modeling/main.py index 43eb91d0..36a44f19 100644 --- a/examples/language-modeling/main.py +++ b/examples/language-modeling/main.py @@ -320,7 +320,8 @@ def get_library_version(library_name): model = model.to("cpu") model.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) - autoround.save_quantized(f'{export_dir}-awq', format="auto_awq", inplace=inplace, model_path=args.model_name) + if args.bits == 4: + autoround.save_quantized(f'{export_dir}-awq', format="auto_awq", inplace=inplace, model_path=args.model_name) if not args.disable_eval and "fake" in deployment_device: ##support autogptq real eval later excel_name = f"{output_dir}_result.xlsx" From 05d6e1db8350c0eed79cd21abff7240342fe0cd9 Mon Sep 17 00:00:00 2001 From: yintong-lu Date: Thu, 23 May 2024 23:51:01 -0400 Subject: [PATCH 15/40] fix pylint issue Signed-off-by: yintong-lu --- .../export/export_to_autoround/autoround_quantizer.py | 6 +++--- .../export/export_to_autoround/export_to_autoround.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/auto_round/export/export_to_autoround/autoround_quantizer.py b/auto_round/export/export_to_autoround/autoround_quantizer.py index 9a265104..dfb2b5af 100644 --- a/auto_round/export/export_to_autoround/autoround_quantizer.py +++ b/auto_round/export/export_to_autoround/autoround_quantizer.py @@ -361,14 +361,14 @@ def _replace_by_quant_layers(self, module: nn.Module, layer_configs, backend): in_features = layer.weight.shape[0] out_features = layer.weight.shape[1] bias = layer.bias is not None - new_layer = QuantLinear( + new_layer = QuantLinear( # pylint: disable=E1123 bits, group_size, in_features, out_features, bias, - weight_dtype=layer.weight.dtype, # pylint: disable=E1123 - ) + weight_dtype=layer.weight.dtype, + ) new_layer.device = device set_module(module, layer_name, new_layer) diff --git a/auto_round/export/export_to_autoround/export_to_autoround.py b/auto_round/export/export_to_autoround/export_to_autoround.py index fab55a26..c20967d8 100644 --- a/auto_round/export/export_to_autoround/export_to_autoround.py +++ b/auto_round/export/export_to_autoround/export_to_autoround.py @@ -140,9 +140,9 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="gptq:exllamav out_features = layer.weight.shape[1] bias = layer.bias is not None and torch.any(layer.bias) - new_layer = QuantLinear( - bits, group_size, in_features, out_features, bias, weight_dtype=layer.weight.dtype ##pylint: disable=E1123 - ) + new_layer = QuantLinear( # pylint: disable=E1123 + bits, group_size, in_features, out_features, bias, weight_dtype=layer.weight.dtype + ) new_layer.device = device set_module(model, name, new_layer) From 72b48b1d2eb23ac5dfcbfada55bd7f4a574c5d7c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 24 May 2024 03:51:44 +0000 Subject: [PATCH 16/40] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../export/export_to_autoround/autoround_quantizer.py | 6 +++--- .../export/export_to_autoround/export_to_autoround.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/auto_round/export/export_to_autoround/autoround_quantizer.py b/auto_round/export/export_to_autoround/autoround_quantizer.py index dfb2b5af..0b77fb6a 100644 --- a/auto_round/export/export_to_autoround/autoround_quantizer.py +++ b/auto_round/export/export_to_autoround/autoround_quantizer.py @@ -361,14 +361,14 @@ def _replace_by_quant_layers(self, module: nn.Module, layer_configs, backend): in_features = layer.weight.shape[0] out_features = layer.weight.shape[1] bias = layer.bias is not None - new_layer = QuantLinear( # pylint: disable=E1123 + new_layer = QuantLinear( # pylint: disable=E1123 bits, group_size, in_features, out_features, bias, - weight_dtype=layer.weight.dtype, - ) + weight_dtype=layer.weight.dtype, + ) new_layer.device = device set_module(module, layer_name, new_layer) diff --git a/auto_round/export/export_to_autoround/export_to_autoround.py b/auto_round/export/export_to_autoround/export_to_autoround.py index c20967d8..1016d1eb 100644 --- a/auto_round/export/export_to_autoround/export_to_autoround.py +++ b/auto_round/export/export_to_autoround/export_to_autoround.py @@ -142,7 +142,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="gptq:exllamav new_layer = QuantLinear( # pylint: disable=E1123 bits, group_size, in_features, out_features, bias, weight_dtype=layer.weight.dtype - ) + ) new_layer.device = device set_module(model, name, new_layer) From 40e8ce7e348e107251c57d18110a2fc0530f4def Mon Sep 17 00:00:00 2001 From: yintong-lu Date: Fri, 24 May 2024 02:08:09 -0400 Subject: [PATCH 17/40] update import of awq Signed-off-by: yintong-lu --- auto_round/autoround.py | 6 ++++++ auto_round/utils.py | 1 - examples/language-modeling/main.py | 4 ++-- examples/language-modeling/requirements.txt | 3 +-- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/auto_round/autoround.py b/auto_round/autoround.py index dcb08bfb..9be4ded5 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -1350,6 +1350,12 @@ def save_quantized(self, output_dir=None, format="auto_gptq", inplace=True, **kw if format not in EXPORT_FORMAT: logger.error(f"export format only supports {EXPORT_FORMAT.keys()}") exit() + if format == "auto_awq": + try: + import awq + except: + logger.error("autoawq is required. Please install it to support auto_awq format.") + return save_quantized_as_format = EXPORT_FORMAT.get(format) compressed_model = save_quantized_as_format( ##TODO refine the code output_dir, diff --git a/auto_round/utils.py b/auto_round/utils.py index ccd8c7d0..c830b8d8 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -70,7 +70,6 @@ def __call__(self, *args, **kwargs): auto_gptq = LazyImport("auto_gptq") htcore = LazyImport("habana_frameworks.torch.core") -awq = LazyImport("autoawq") def is_optimum_habana_available(): diff --git a/examples/language-modeling/main.py b/examples/language-modeling/main.py index 36a44f19..a4116c77 100644 --- a/examples/language-modeling/main.py +++ b/examples/language-modeling/main.py @@ -310,6 +310,8 @@ def get_library_version(library_name): autoround.save_quantized(f'{export_dir}-gpu', format="autoround", use_triton=True, inplace=inplace) else: autoround.save_quantized(f'{export_dir}-gpu', format="auto_gptq", use_triton=True, inplace=inplace) + if args.bits == 4: + autoround.save_quantized(f'{export_dir}-awq', format="auto_awq", inplace=inplace, model_path=args.model_name) if 'xpu' in deployment_device: autoround.save_quantized(f'{export_dir}-xpu', format="itrex_xpu", use_triton=True, inplace=inplace, compression_dtype=torch.int8, compression_dim=0, use_optimum_format=False, @@ -320,8 +322,6 @@ def get_library_version(library_name): model = model.to("cpu") model.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) - if args.bits == 4: - autoround.save_quantized(f'{export_dir}-awq', format="auto_awq", inplace=inplace, model_path=args.model_name) if not args.disable_eval and "fake" in deployment_device: ##support autogptq real eval later excel_name = f"{output_dir}_result.xlsx" diff --git a/examples/language-modeling/requirements.txt b/examples/language-modeling/requirements.txt index b6ddd19c..04b21595 100644 --- a/examples/language-modeling/requirements.txt +++ b/examples/language-modeling/requirements.txt @@ -14,5 +14,4 @@ protobuf auto-gptq openpyxl wandb -py-cpuinfo -autoawq +py-cpuinfo \ No newline at end of file From 55b05e661dd18f634908d15c020733d66a1050b1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 24 May 2024 06:08:51 +0000 Subject: [PATCH 18/40] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/autoround.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/autoround.py b/auto_round/autoround.py index 9be4ded5..bb1e060e 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -1355,7 +1355,7 @@ def save_quantized(self, output_dir=None, format="auto_gptq", inplace=True, **kw import awq except: logger.error("autoawq is required. Please install it to support auto_awq format.") - return + return save_quantized_as_format = EXPORT_FORMAT.get(format) compressed_model = save_quantized_as_format( ##TODO refine the code output_dir, From 07c0f184fbd3ccdf8a7da9cc81281b9fbb26b44b Mon Sep 17 00:00:00 2001 From: yintong-lu Date: Fri, 24 May 2024 02:19:56 -0400 Subject: [PATCH 19/40] fix import error Signed-off-by: yintong-lu --- auto_round/autoround.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/autoround.py b/auto_round/autoround.py index bb1e060e..fa143bc4 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -1352,7 +1352,7 @@ def save_quantized(self, output_dir=None, format="auto_gptq", inplace=True, **kw exit() if format == "auto_awq": try: - import awq + import awq # pylint: disable=E0401 except: logger.error("autoawq is required. Please install it to support auto_awq format.") return From 8184d8479ff9caccc9036e221271b57c9164189a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 24 May 2024 06:20:38 +0000 Subject: [PATCH 20/40] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/autoround.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/autoround.py b/auto_round/autoround.py index fa143bc4..bff90ba1 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -1352,7 +1352,7 @@ def save_quantized(self, output_dir=None, format="auto_gptq", inplace=True, **kw exit() if format == "auto_awq": try: - import awq # pylint: disable=E0401 + import awq # pylint: disable=E0401 except: logger.error("autoawq is required. Please install it to support auto_awq format.") return From 02687acacec41acd0fc836f316f138cd2503e683 Mon Sep 17 00:00:00 2001 From: yintong-lu Date: Sun, 26 May 2024 22:01:54 -0400 Subject: [PATCH 21/40] modify code Signed-off-by: yintong-lu --- auto_round/autoround.py | 6 ------ auto_round/export/export_to_awq.py | 21 ++++++++++++--------- examples/language-modeling/main.py | 4 ++-- 3 files changed, 14 insertions(+), 17 deletions(-) diff --git a/auto_round/autoround.py b/auto_round/autoround.py index bff90ba1..dcb08bfb 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -1350,12 +1350,6 @@ def save_quantized(self, output_dir=None, format="auto_gptq", inplace=True, **kw if format not in EXPORT_FORMAT: logger.error(f"export format only supports {EXPORT_FORMAT.keys()}") exit() - if format == "auto_awq": - try: - import awq # pylint: disable=E0401 - except: - logger.error("autoawq is required. Please install it to support auto_awq format.") - return save_quantized_as_format = EXPORT_FORMAT.get(format) compressed_model = save_quantized_as_format( ##TODO refine the code output_dir, diff --git a/auto_round/export/export_to_awq.py b/auto_round/export/export_to_awq.py index ac4c66ff..0c3753ba 100644 --- a/auto_round/export/export_to_awq.py +++ b/auto_round/export/export_to_awq.py @@ -48,8 +48,18 @@ @register_format("auto_awq") -def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs): +def save_quantized_as_autoawq(output_dir, model_path, **kwargs): """Export the model to autogptq format to easily leverage cuda kernel.""" + + try: + from awq import AutoAWQForCausalLM # pylint: disable=E0401 + from awq.modules.linear import WQLinear_GEMM # pylint: disable=E0401 + from awq.utils.utils import clear_memory # pylint: disable=E0401 + except: + logger.error("autoawq is required. Please install it to support auto_awq format.") + return + + model = kwargs["model"] weight_config = kwargs["weight_config"] sym = kwargs["sym"] @@ -88,14 +98,7 @@ def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs): if all_to_quantized: modules_in_block_to_quantize = None - if inplace: - compressed_model = model.to("cpu") - else: - compressed_model = copy.deepcopy(model.to("cpu")) - - from awq import AutoAWQForCausalLM # pylint: disable=E0401 - from awq.modules.linear import WQLinear_GEMM # pylint: disable=E0401 - from awq.utils.utils import clear_memory # pylint: disable=E0401 + compressed_model = copy.deepcopy(model.to("cpu")) q_linear_module = WQLinear_GEMM awq_model = AutoAWQForCausalLM.from_pretrained(model_path) diff --git a/examples/language-modeling/main.py b/examples/language-modeling/main.py index a4116c77..244eb593 100644 --- a/examples/language-modeling/main.py +++ b/examples/language-modeling/main.py @@ -310,8 +310,8 @@ def get_library_version(library_name): autoround.save_quantized(f'{export_dir}-gpu', format="autoround", use_triton=True, inplace=inplace) else: autoround.save_quantized(f'{export_dir}-gpu', format="auto_gptq", use_triton=True, inplace=inplace) - if args.bits == 4: - autoround.save_quantized(f'{export_dir}-awq', format="auto_awq", inplace=inplace, model_path=args.model_name) + if deployment_device == ['gpu'] and args.bits == 4: + autoround.save_quantized(f'{export_dir}-awq', format="auto_awq", model_path=args.model_name) if 'xpu' in deployment_device: autoround.save_quantized(f'{export_dir}-xpu', format="itrex_xpu", use_triton=True, inplace=inplace, compression_dtype=torch.int8, compression_dim=0, use_optimum_format=False, From 6d6b8b14bf3cf0694a20f74466ddc4f617ea0393 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 27 May 2024 02:06:18 +0000 Subject: [PATCH 22/40] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/export/export_to_awq.py | 1 - 1 file changed, 1 deletion(-) diff --git a/auto_round/export/export_to_awq.py b/auto_round/export/export_to_awq.py index 0c3753ba..03e3a722 100644 --- a/auto_round/export/export_to_awq.py +++ b/auto_round/export/export_to_awq.py @@ -59,7 +59,6 @@ def save_quantized_as_autoawq(output_dir, model_path, **kwargs): logger.error("autoawq is required. Please install it to support auto_awq format.") return - model = kwargs["model"] weight_config = kwargs["weight_config"] sym = kwargs["sym"] From e91fb2057ab44b934b63030650d6073b4a2754e7 Mon Sep 17 00:00:00 2001 From: yintong-lu Date: Sun, 26 May 2024 22:20:09 -0400 Subject: [PATCH 23/40] fix doc typo Signed-off-by: yintong-lu --- docs/Meta-Llama-3-8B-Instruct-acc.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Meta-Llama-3-8B-Instruct-acc.md b/docs/Meta-Llama-3-8B-Instruct-acc.md index 24db9992..448c4e47 100644 --- a/docs/Meta-Llama-3-8B-Instruct-acc.md +++ b/docs/Meta-Llama-3-8B-Instruct-acc.md @@ -9,7 +9,7 @@ for evaluation with quantized lm-head ```bash git clone https://github.com/intel/auto-round cd auto-round/examples/language-modeling -python3 eval_042/evluation.py --model_name "./" --eval_bs 16 +python3 eval_042/evaluation.py --model_name "./" --eval_bs 16 ``` | Metric | **BF16** | w4g128 w/o lm-head | w4g128 with lm-head | From a775b21c09281c9b4442ff2bdaa2927b568972dd Mon Sep 17 00:00:00 2001 From: yintong-lu Date: Mon, 27 May 2024 05:33:37 -0400 Subject: [PATCH 24/40] fix mixtral issue Signed-off-by: yintong-lu --- auto_round/export/export_to_awq.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/auto_round/export/export_to_awq.py b/auto_round/export/export_to_awq.py index 03e3a722..bb095f25 100644 --- a/auto_round/export/export_to_awq.py +++ b/auto_round/export/export_to_awq.py @@ -111,7 +111,6 @@ def save_quantized_as_autoawq(output_dir, model_path, **kwargs): info = weight_config[key] if not check_to_quantized(info): continue - info["zp"] = info["zp"].to(torch.float32) scale, zp = info["scale"], info["zp"] scale = scale.t().contiguous() zp = zp.t().contiguous() @@ -131,6 +130,8 @@ def save_quantized_as_autoawq(output_dir, model_path, **kwargs): quant_config = {} quant_config["quant_method"] = "awq" quant_config["modules_to_not_convert"] = None + if compressed_model.config.model_type == 'mixtral': + quant_config["modules_to_not_convert"] = ["gate"] quant_config["version"] = "gemm" quant_config["iters"] = iters quant_config["lr"] = lr @@ -174,9 +175,8 @@ def forward(self, x): "group_size": quant_config["group_size"], "bits": quant_config["bits"], "version": "gemm", - "modules_to_not_convert": None, + "modules_to_not_convert": quant_config["modules_to_not_convert"], } - model.config.quantization_config = awq_quant_config model.generation_config.do_sample = True model.save_pretrained(save_dir, state_dict=EmptyModule().state_dict()) From fc39b70fdaf353803372f689ac93a9877c885879 Mon Sep 17 00:00:00 2001 From: yintong-lu Date: Tue, 28 May 2024 01:36:30 -0400 Subject: [PATCH 25/40] mv awq to autoround format, fix mixtral issue and follow autoround format keys Signed-off-by: yintong-lu --- .../export_to_autoround.py | 190 +++++++++--------- auto_round/export/export_to_awq.py | 45 +---- examples/language-modeling/main.py | 2 - 3 files changed, 108 insertions(+), 129 deletions(-) diff --git a/auto_round/export/export_to_autoround/export_to_autoround.py b/auto_round/export/export_to_autoround/export_to_autoround.py index 34d0ac13..66999a85 100644 --- a/auto_round/export/export_to_autoround/export_to_autoround.py +++ b/auto_round/export/export_to_autoround/export_to_autoround.py @@ -94,99 +94,107 @@ def get_autogptq_backend_config(backend, bits=4): @register_format("autoround") -def save_quantized_as_autoround(output_dir, inplace=True, backend="gptq:exllamav2", **kwargs): - from auto_gptq.utils.import_utils import dynamically_import_QuantLinear - - model = kwargs["model"] - if not inplace: - model = copy.deepcopy(model.to("cpu")) - layer_names_in_block = get_layer_names_in_block(model) - - weight_config = kwargs["weight_config"] - for name in weight_config.keys(): - - config = kwargs["weight_config"][name] - if config["data_type"] != "int" and config["bits"] >= 16: - continue - logger.info(f"packing {name}") - - bits = config["bits"] - group_size = config["group_size"] - use_triton, disable_exllamav1, disable_exllamav2, use_qigen, disable_marlin = get_autogptq_backend_config( - backend, bits - ) - - layer = get_module(model, name) - device = "cpu" - QuantLinear = dynamically_import_QuantLinear( - use_triton=use_triton, - desc_act=False, - group_size=group_size, - bits=bits, - disable_exllama=disable_exllamav1, - disable_exllamav2=disable_exllamav2, - use_qigen=use_qigen, - disable_marlin=disable_marlin, - ) - - if isinstance(layer, nn.Linear): - in_features = layer.in_features - out_features = layer.out_features - elif isinstance(layer, nn.Conv2d): - in_features = layer.in_channels - out_features = layer.out_channels - elif isinstance(layer, transformers.pytorch_utils.Conv1D): - in_features = layer.weight.shape[0] - out_features = layer.weight.shape[1] - bias = layer.bias is not None and torch.any(layer.bias) - - new_layer = QuantLinear( ##pylint: disable=E1123 - bits, group_size, in_features, out_features, bias, weight_dtype=layer.weight.dtype - ) - - new_layer.device = device - set_module(model, name, new_layer) - qlayer = new_layer - scale = weight_config[name]["scale"] - zero = weight_config[name]["zp"] - # so far can only pack layer on CPU - qlayer.to("cpu") - layer, scale, zero = layer.to("cpu"), scale.to("cpu"), zero.to("cpu") - qlayer.pack(layer, scale, zero, None) - qlayer.to(device) - quantization_config = kwargs["serialization_dict"] - quantization_config["quant_method"] = "intel/auto-round" - quantization_config["backend"] = backend - extra_config = {} - for layer_name in weight_config: - if weight_config[layer_name]["data_type"] != "int" and weight_config[layer_name]["bits"] >= 16: - continue - if layer_name not in layer_names_in_block: - extra_config[layer_name] = {} - extra_config[layer_name]["bits"] = weight_config[layer_name]["bits"] - extra_config[layer_name]["data_type"] = weight_config[layer_name]["data_type"] - extra_config[layer_name]["group_size"] = weight_config[layer_name]["group_size"] - extra_config[layer_name]["sym"] = weight_config[layer_name]["sym"] - else: - neq_keys = check_neq_config( - weight_config[layer_name], - data_type=quantization_config["data_type"], - bits=quantization_config["bits"], - group_size=quantization_config["group_size"], - sym=quantization_config["sym"], +def save_quantized_as_autoround(output_dir, inplace=True, backend="gptq:exllamav2", model_path="", **kwargs): + if "awq" not in backend: + from auto_gptq.utils.import_utils import dynamically_import_QuantLinear + + model = kwargs["model"] + if not inplace: + model = copy.deepcopy(model.to("cpu")) + layer_names_in_block = get_layer_names_in_block(model) + + weight_config = kwargs["weight_config"] + for name in weight_config.keys(): + + config = kwargs["weight_config"][name] + if config["data_type"] != "int" and config["bits"] >= 16: + continue + logger.info(f"packing {name}") + + bits = config["bits"] + group_size = config["group_size"] + use_triton, disable_exllamav1, disable_exllamav2, use_qigen, disable_marlin = get_autogptq_backend_config( + backend, bits ) - if len(neq_keys) > 0: + + layer = get_module(model, name) + device = "cpu" + QuantLinear = dynamically_import_QuantLinear( + use_triton=use_triton, + desc_act=False, + group_size=group_size, + bits=bits, + disable_exllama=disable_exllamav1, + disable_exllamav2=disable_exllamav2, + use_qigen=use_qigen, + disable_marlin=disable_marlin, + ) + + if isinstance(layer, nn.Linear): + in_features = layer.in_features + out_features = layer.out_features + elif isinstance(layer, nn.Conv2d): + in_features = layer.in_channels + out_features = layer.out_channels + elif isinstance(layer, transformers.pytorch_utils.Conv1D): + in_features = layer.weight.shape[0] + out_features = layer.weight.shape[1] + bias = layer.bias is not None and torch.any(layer.bias) + + new_layer = QuantLinear( # pylint: disable=E1123 + bits, group_size, in_features, out_features, bias, weight_dtype=layer.weight.dtype + ) + + new_layer.device = device + set_module(model, name, new_layer) + qlayer = new_layer + scale = weight_config[name]["scale"] + zero = weight_config[name]["zp"] + # so far can only pack layer on CPU + qlayer.to("cpu") + layer, scale, zero = layer.to("cpu"), scale.to("cpu"), zero.to("cpu") + qlayer.pack(layer, scale, zero, None) + qlayer.to(device) + quantization_config = kwargs["serialization_dict"] + quantization_config["quant_method"] = "intel/auto-round" + quantization_config["backend"] = backend + extra_config = {} + for layer_name in weight_config: + if weight_config[layer_name]["data_type"] != "int" and weight_config[layer_name]["bits"] >= 16: + continue + if layer_name not in layer_names_in_block: extra_config[layer_name] = {} - for key in neq_keys: - extra_config[layer_name][key] = weight_config[layer_name][key] - if len(extra_config) > 0: - quantization_config["extra_config"] = extra_config - if hasattr(model, "config"): - model.config.quantization_config = quantization_config - tokenizer = kwargs["tokenizer"] - if tokenizer is not None: - tokenizer.save_pretrained(output_dir) - save(model, output_dir) + extra_config[layer_name]["bits"] = weight_config[layer_name]["bits"] + extra_config[layer_name]["data_type"] = weight_config[layer_name]["data_type"] + extra_config[layer_name]["group_size"] = weight_config[layer_name]["group_size"] + extra_config[layer_name]["sym"] = weight_config[layer_name]["sym"] + else: + neq_keys = check_neq_config( + weight_config[layer_name], + data_type=quantization_config["data_type"], + bits=quantization_config["bits"], + group_size=quantization_config["group_size"], + sym=quantization_config["sym"], + ) + if len(neq_keys) > 0: + extra_config[layer_name] = {} + for key in neq_keys: + extra_config[layer_name][key] = weight_config[layer_name][key] + if len(extra_config) > 0: + quantization_config["extra_config"] = extra_config + if hasattr(model, "config"): + model.config.quantization_config = quantization_config + tokenizer = kwargs["tokenizer"] + if tokenizer is not None: + tokenizer.save_pretrained(output_dir) + save(model, output_dir) + else: + if not model_path: + logger.error("Please provide model path for awq format.") + return + from ..export_to_awq import save_quantized_as_autoawq + save_quantized_as_autoawq(output_dir=output_dir, model_path=model_path, kwargs=kwargs) + def save(model: nn.Module, save_dir: str, max_shard_size: str = "10GB", safe_serialization: bool = True): diff --git a/auto_round/export/export_to_awq.py b/auto_round/export/export_to_awq.py index bb095f25..aab878f1 100644 --- a/auto_round/export/export_to_awq.py +++ b/auto_round/export/export_to_awq.py @@ -48,7 +48,7 @@ @register_format("auto_awq") -def save_quantized_as_autoawq(output_dir, model_path, **kwargs): +def save_quantized_as_autoawq(output_dir, model_path, kwargs): """Export the model to autogptq format to easily leverage cuda kernel.""" try: @@ -76,26 +76,6 @@ def save_quantized_as_autoawq(output_dir, model_path, **kwargs): logger.info("Saving quantized model to autoawq format") if tokenizer is not None: tokenizer.save_pretrained(output_dir) - ##check module quantized in block, this may have bug for mixed precision quantization - block_name = get_block_names(model)[0] - first_block = get_module(model, block_name) - all_to_quantized = True - modules_in_block_to_quantize = [] - for n, m in first_block.named_modules(): - is_supported_type = False - for supported_type in supported_types: - if isinstance(m, supported_type): - is_supported_type = True - break - if not is_supported_type: - continue - if not check_to_quantized(m): - all_to_quantized = False - else: - modules_in_block_to_quantize.append(n) - modules_in_block_to_quantize = [modules_in_block_to_quantize] - if all_to_quantized: - modules_in_block_to_quantize = None compressed_model = copy.deepcopy(model.to("cpu")) @@ -103,6 +83,7 @@ def save_quantized_as_autoawq(output_dir, model_path, **kwargs): awq_model = AutoAWQForCausalLM.from_pretrained(model_path) self_modules = awq_model.get_model_layers(compressed_model) del awq_model # release memory + modules_to_not_convert = [] for i in range(len(self_modules)): module = self_modules[i] named_linears = get_named_linears(module) @@ -110,7 +91,9 @@ def save_quantized_as_autoawq(output_dir, model_path, **kwargs): key = get_module_name(compressed_model, linear_layer) info = weight_config[key] if not check_to_quantized(info): + modules_to_not_convert.append(key) continue + info["zp"] = info["zp"].to(torch.float32) scale, zp = info["scale"], info["zp"] scale = scale.t().contiguous() zp = zp.t().contiguous() @@ -127,22 +110,12 @@ def save_quantized_as_autoawq(output_dir, model_path, **kwargs): set_op_by_name(module, name, q_linear) clear_memory() - quant_config = {} - quant_config["quant_method"] = "awq" - quant_config["modules_to_not_convert"] = None - if compressed_model.config.model_type == 'mixtral': - quant_config["modules_to_not_convert"] = ["gate"] - quant_config["version"] = "gemm" - quant_config["iters"] = iters - quant_config["lr"] = lr - quant_config["minmax_lr"] = minmax_lr - quant_config["enable_minmax_tuning"] = enable_minmax_tuning - quant_config["enable_quanted_input"] = enable_quanted_input - quant_config["scale_dtype"] = convert_dtype_torch2str_hf(scale_dtype) - quant_config["sym"] = sym - quant_config["bits"] = bits - quant_config["group_size"] = group_size + quant_config = kwargs["serialization_dict"] quant_config["zero_point"] = not sym + quant_config["modules_to_not_convert"] = None if not modules_to_not_convert else modules_to_not_convert + quant_config["version"] = "gemm" + quant_config["quant_method"] = "intel/auto-round" + quant_config["backend"] = "awq" save_quantized(compressed_model, save_dir=output_dir, quant_config=quant_config) diff --git a/examples/language-modeling/main.py b/examples/language-modeling/main.py index ccf08eca..07832924 100644 --- a/examples/language-modeling/main.py +++ b/examples/language-modeling/main.py @@ -320,8 +320,6 @@ def get_library_version(library_name): output_dir = args.output_dir + "/" + model_name.split('/')[-1] + f"-autoround-w{args.bits}g{args.group_size}-qdq" inplace = True if len(deployment_device) < 2 else False - if deployment_device == ['gpu'] and args.bits == 4: - autoround.save_quantized(f'{export_dir}-awq', format="auto_awq", model_path=args.model_name) if 'gpu' in deployment_device: autoround.save_quantized(f'{export_dir}-gpu', format=gpu_format, use_triton=True, inplace=inplace) if 'xpu' in deployment_device: From 3480b8d608f3c0c0774cdf505d00bd1d55991709 Mon Sep 17 00:00:00 2001 From: yintong-lu Date: Tue, 28 May 2024 03:28:23 -0400 Subject: [PATCH 26/40] minor fix Signed-off-by: yintong-lu --- auto_round/export/__init__.py | 1 - auto_round/export/export_to_awq.py | 1 - 2 files changed, 2 deletions(-) diff --git a/auto_round/export/__init__.py b/auto_round/export/__init__.py index 71e9ac34..5a48d41f 100644 --- a/auto_round/export/__init__.py +++ b/auto_round/export/__init__.py @@ -15,6 +15,5 @@ from .register import EXPORT_FORMAT from .export_to_autogptq import save_quantized_as_autogptq from .export_to_itrex import save_quantized_as_itrex, QuantConfig -from .export_to_awq import save_quantized_as_autoawq from .export_to_autoround.export_to_autoround import save_quantized_as_autoround from .export_to_autoround import AutoHfQuantizer diff --git a/auto_round/export/export_to_awq.py b/auto_round/export/export_to_awq.py index aab878f1..673eeda2 100644 --- a/auto_round/export/export_to_awq.py +++ b/auto_round/export/export_to_awq.py @@ -47,7 +47,6 @@ # copies or substantial portions of the Software. -@register_format("auto_awq") def save_quantized_as_autoawq(output_dir, model_path, kwargs): """Export the model to autogptq format to easily leverage cuda kernel.""" From 81576c5ffc968c1ba0fa4497cf70e6d830bad896 Mon Sep 17 00:00:00 2001 From: yintong-lu Date: Thu, 13 Jun 2024 20:44:13 -0400 Subject: [PATCH 27/40] mv to autoround format Signed-off-by: yintong-lu --- .../export/export_to_autoround/export.py | 109 +++++++-- auto_round/export/export_to_awq.py | 225 ------------------ 2 files changed, 94 insertions(+), 240 deletions(-) delete mode 100644 auto_round/export/export_to_awq.py diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py index f89a03aa..b973bab1 100644 --- a/auto_round/export/export_to_autoround/export.py +++ b/auto_round/export/export_to_autoround/export.py @@ -88,6 +88,13 @@ def dynamic_QuantLienar_for_packing(backend, bits, group_size): ) return QuantLinear ##export all use trition, inference use exllamav2 + elif "awq" in backend: + try: + from awq.modules.linear import WQLinear_GEMM # pylint: disable=E0401 + return WQLinear_GEMM + except: + logger.error("autoawq is required. Please install it to support auto_awq format.") + return elif "autoround" in backend or "auto-round" in backend or "auto_round" in backend: from auto_round_extension.cuda.qliner_triton import QuantLinear return QuantLinear @@ -103,11 +110,14 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="autoround:exl model = copy.deepcopy(model.to("cpu")) layer_names_in_block = get_layer_names_in_block(model) + modules_to_not_convert = [] weight_config = kwargs["weight_config"] for name in weight_config.keys(): config = kwargs["weight_config"][name] if config["data_type"] != "int" and config["bits"] >= 16: + if "awq" in backend: + modules_to_not_convert.append(name) continue logger.info(f"packing {name}") @@ -130,23 +140,46 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="autoround:exl out_features = layer.weight.shape[1] bias = layer.bias is not None and torch.any(layer.bias) - new_layer = QuantLinear( ##pylint: disable=E1123 - bits, group_size, in_features, out_features, bias, weight_dtype=layer.weight.dtype - ) + if "awq" not in backend: + new_layer = QuantLinear( ##pylint: disable=E1123 + bits, group_size, in_features, out_features, bias, weight_dtype=layer.weight.dtype + ) - new_layer.device = device - set_module(model, name, new_layer) - qlayer = new_layer - scale = weight_config[name]["scale"] - zero = weight_config[name]["zp"] - # so far can only pack layer on CPU - qlayer.to("cpu") - ##force to float32 to be compatible with torch 2.0 - layer, scale, zero = layer.to("cpu"), scale.to("cpu"), zero.to("cpu").to(torch.float32) - qlayer.pack(layer, scale, zero, None) - qlayer.to(device) + new_layer.device = device + set_module(model, name, new_layer) + qlayer = new_layer + scale = weight_config[name]["scale"] + zero = weight_config[name]["zp"] + # so far can only pack layer on CPU + qlayer.to("cpu") + ##force to float32 to be compatible with torch 2.0 + layer, scale, zero = layer.to("cpu"), scale.to("cpu"), zero.to("cpu").to(torch.float32) + qlayer.pack(layer, scale, zero, None) + qlayer.to(device) + else: + logger.info("lyt_debug starting awq format packing") + from awq.utils.utils import clear_memory + scale, zp = weight_config[name]["scale"].to(torch.float32), weight_config[name]["zp"].to(torch.float32) + scale = scale.t().contiguous() + zp = zp.t().contiguous() + if bits != 4: + logger.error("AutoAWQ format only supports 4-bits quantization.") + qlayer = QuantLinear.from_linear( + linear=layer, + w_bit=bits, + group_size=group_size, + init_only=False, + scales=scale, + zeros=zp, + ) + qlayer.to(device) + set_module(model, name, qlayer) + clear_memory() quantization_config = kwargs["serialization_dict"] quantization_config["quant_method"] = "intel/auto-round" + # if "awq" in backend: + # quantization_config["quant_method"], quantization_config["version"] = "awq", "gemm" + # quantization_config["modules_to_not_convert"] = None if not modules_to_not_convert else modules_to_not_convert quantization_config["backend"] = backend extra_config = {} for layer_name in weight_config: @@ -174,10 +207,22 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="autoround:exl quantization_config["extra_config"] = extra_config if hasattr(model, "config"): model.config.quantization_config = quantization_config + if "awq" in backend: + awq_quant_config = { + "quant_method": "awq", + "zero_point": not quantization_config["sym"], + "group_size": quantization_config["group_size"], + "bits": quantization_config["bits"], + "version": "gemm", + "modules_to_not_convert": None if not modules_to_not_convert else modules_to_not_convert, + } tokenizer = kwargs["tokenizer"] if tokenizer is not None: tokenizer.save_pretrained(output_dir) - save(model, output_dir) + if "awq" not in backend: + save(model, output_dir) + else: + save_awq(model, output_dir, awq_quant_config=awq_quant_config) def save(model: nn.Module, save_dir: str, max_shard_size: str = "10GB", safe_serialization: bool = True): @@ -206,3 +251,37 @@ def save(model: nn.Module, save_dir: str, max_shard_size: str = "10GB", safe_ser if hasattr(model, "config") and hasattr(model.config, "quantization_config"): with open(os.path.join(save_dir, config_file), "w", encoding="utf-8") as f: json.dump(model.config.quantization_config, f, indent=2) + + + +def save_awq(model: nn.Module, save_dir: str, max_shard_size: str = "10GB", safe_serialization: bool = True, awq_quant_config: dict = {}): + """Save model state dict and configs. + + Args: + model (`nn.Module`): + Model to be saved. The model can be wrapped or unwrapped. + save_dir (`str`): + Directory to which to save. Will be created if it doesn't exist. + max_shard_size (`str`, defaults to `"10GB"`): + The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size + lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5MB"`). + + + If a single weight of the model is bigger than `max_shard_size`, it will be in its own checkpoint shard + which will be bigger than `max_shard_size`. + + + safe_serialization (`bool`, defaults to `True`): + Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + """ + os.makedirs(save_dir, exist_ok=True) + if hasattr(model, "config") and hasattr(model.config, "quantization_config"): + quantization_config = model.config.quantization_config + else: + quantization_config = awq_quant_config + model.config.quantization_config = awq_quant_config + model.save_pretrained(save_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization) + config_file = "quantization_config.json" + if hasattr(model, "config") and hasattr(model.config, "quantization_config"): + with open(os.path.join(save_dir, config_file), "w", encoding="utf-8") as f: + json.dump(quantization_config, f, indent=2) \ No newline at end of file diff --git a/auto_round/export/export_to_awq.py b/auto_round/export/export_to_awq.py deleted file mode 100644 index 673eeda2..00000000 --- a/auto_round/export/export_to_awq.py +++ /dev/null @@ -1,225 +0,0 @@ -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import copy -import json -import os -from os.path import isdir, isfile, join -from typing import Dict, List, Optional, Union - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -import torch -import torch.nn as nn - -from auto_round.export.register import register_format -from auto_round.utils import check_to_quantized, convert_dtype_torch2str_hf, get_block_names, get_module, logger - -# MIT License - -# Copyright (c) 2023 MIT HAN Lab - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - - -def save_quantized_as_autoawq(output_dir, model_path, kwargs): - """Export the model to autogptq format to easily leverage cuda kernel.""" - - try: - from awq import AutoAWQForCausalLM # pylint: disable=E0401 - from awq.modules.linear import WQLinear_GEMM # pylint: disable=E0401 - from awq.utils.utils import clear_memory # pylint: disable=E0401 - except: - logger.error("autoawq is required. Please install it to support auto_awq format.") - return - - model = kwargs["model"] - weight_config = kwargs["weight_config"] - sym = kwargs["sym"] - bits = kwargs["bits"] - group_size = kwargs["group_size"] - iters = kwargs["iters"] - lr = kwargs["lr"] - minmax_lr = kwargs["minmax_lr"] - enable_minmax_tuning = kwargs["enable_minmax_tuning"] - enable_quanted_input = kwargs["enable_quanted_input"] - scale_dtype = kwargs["scale_dtype"] - tokenizer = kwargs["tokenizer"] - supported_types = kwargs["supported_types"] - - logger.info("Saving quantized model to autoawq format") - if tokenizer is not None: - tokenizer.save_pretrained(output_dir) - - compressed_model = copy.deepcopy(model.to("cpu")) - - q_linear_module = WQLinear_GEMM - awq_model = AutoAWQForCausalLM.from_pretrained(model_path) - self_modules = awq_model.get_model_layers(compressed_model) - del awq_model # release memory - modules_to_not_convert = [] - for i in range(len(self_modules)): - module = self_modules[i] - named_linears = get_named_linears(module) - for name, linear_layer in named_linears.items(): - key = get_module_name(compressed_model, linear_layer) - info = weight_config[key] - if not check_to_quantized(info): - modules_to_not_convert.append(key) - continue - info["zp"] = info["zp"].to(torch.float32) - scale, zp = info["scale"], info["zp"] - scale = scale.t().contiguous() - zp = zp.t().contiguous() - q_linear = q_linear_module.from_linear( - linear=linear_layer, - w_bit=bits, - group_size=group_size, - init_only=False, - scales=scale, - zeros=zp, - ) - linear_layer.cpu() - q_linear.to(next(module.parameters()).device) - set_op_by_name(module, name, q_linear) - clear_memory() - - quant_config = kwargs["serialization_dict"] - quant_config["zero_point"] = not sym - quant_config["modules_to_not_convert"] = None if not modules_to_not_convert else modules_to_not_convert - quant_config["version"] = "gemm" - quant_config["quant_method"] = "intel/auto-round" - quant_config["backend"] = "awq" - - save_quantized(compressed_model, save_dir=output_dir, quant_config=quant_config) - - -from safetensors.torch import save_file -from transformers.modeling_utils import shard_checkpoint - - -def save_quantized( - model, - save_dir, - quant_config, - safetensors=True, - shard_size="10GB", -): - save_dir = save_dir[:-1] if save_dir[-1] == "/" else save_dir - - # Save model - class EmptyModule(nn.Module): - def __init__(self): - super(EmptyModule, self).__init__() - - def forward(self, x): - return x - - # Save model and config files with empty state dict - awq_quant_config = { - "quant_method": "awq", - "zero_point": quant_config["zero_point"], - "group_size": quant_config["group_size"], - "bits": quant_config["bits"], - "version": "gemm", - "modules_to_not_convert": quant_config["modules_to_not_convert"], - } - model.config.quantization_config = awq_quant_config - model.generation_config.do_sample = True - model.save_pretrained(save_dir, state_dict=EmptyModule().state_dict()) - - # Remove empty state dict - default_paths = [ - f"{save_dir}/model.safetensors", - f"{save_dir}/pytorch_model.bin", - ] - for path in default_paths: - if os.path.exists(path): - os.remove(path) - - # model_name has no extension, add it when saving state_dict - model_name = "model.safetensors" if safetensors else "pytorch_model.bin" - - # shard checkpoint into chunks (10GB default) - shards, index = shard_checkpoint(model.state_dict(), max_shard_size=shard_size, weights_name=model_name) - - for shard_file, shard in shards.items(): - if safetensors: - # safetensors must be in the same memory, so we duplicate and use contiguous memory - shard = {k: v.clone().contiguous() for k, v in shard.items()} - save_file(shard, os.path.join(save_dir, shard_file), metadata={"format": "pt"}) - else: - torch.save(shard, os.path.join(save_dir, shard_file)) - - # save shard index - if index is not None: - with open(f"{save_dir}/{model_name}.index.json", "w+") as file: - file.write(json.dumps(index, indent=4)) - - # save quantize_config - with open(join(save_dir, "quantize_config.json"), "w", encoding="utf-8") as f: - json.dump(quant_config, f, indent=2) - - -def get_named_linears(module): - """Get the name, linear_op pairs of a given module. - - Args: - module: A module to be searched. - """ - return {name: m for name, m in module.named_modules() if isinstance(m, torch.nn.Linear)} - - -def set_op_by_name(layer, name, new_module): - levels = name.split(".") - if len(levels) > 1: - mod_ = layer - for l_idx in range(len(levels) - 1): - if levels[l_idx].isdigit(): - mod_ = mod_[int(levels[l_idx])] - else: - mod_ = getattr(mod_, levels[l_idx]) - setattr(mod_, levels[-1], new_module) - else: - setattr(layer, name, new_module) - - -def get_module_name(model, module_to_find): - """Get the name of a given module in a model. - - Args: - model: The model. - module_to_find: A module to be found. - - Returns: - name: The corresponding name of the given module. - """ - for name, module in model.named_modules(): - if module is module_to_find: - return name - return None From a5ceb0bb0d5c57ef6d42e71430e0e7bd93e9d8e5 Mon Sep 17 00:00:00 2001 From: yintong-lu Date: Tue, 25 Jun 2024 03:15:35 -0400 Subject: [PATCH 28/40] minor fix Signed-off-by: yintong-lu --- auto_round/export/export_to_autoround/export.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py index e9c417cb..c499a8c7 100644 --- a/auto_round/export/export_to_autoround/export.py +++ b/auto_round/export/export_to_autoround/export.py @@ -158,7 +158,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="autoround:exl qlayer.to(device) else: logger.info("lyt_debug starting awq format packing") - from awq.utils.utils import clear_memory + from awq.utils.utils import clear_memory # pylint: disable=E0401 scale, zp = weight_config[name]["scale"].to(torch.float32), weight_config[name]["zp"].to(torch.float32) scale = scale.t().contiguous() zp = zp.t().contiguous() @@ -254,7 +254,13 @@ def save(model: nn.Module, save_dir: str, max_shard_size: str = "5GB", safe_seri -def save_awq(model: nn.Module, save_dir: str, max_shard_size: str = "10GB", safe_serialization: bool = True, awq_quant_config: dict = {}): +def save_awq( + model: nn.Module, + save_dir: str, + max_shard_size: str = "5GB", + safe_serialization: bool = True, + awq_quant_config: dict = {} +): """Save model state dict and configs. Args: From 7d505a0fad1e3509f305ce1f2b2bfa8df55bba42 Mon Sep 17 00:00:00 2001 From: yintong-lu Date: Tue, 25 Jun 2024 03:17:01 -0400 Subject: [PATCH 29/40] typo Signed-off-by: yintong-lu --- auto_round/export/export_to_autoround/export.py | 1 - 1 file changed, 1 deletion(-) diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py index c499a8c7..cdcf64fa 100644 --- a/auto_round/export/export_to_autoround/export.py +++ b/auto_round/export/export_to_autoround/export.py @@ -157,7 +157,6 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="autoround:exl qlayer.pack(layer, scale, zero, None) qlayer.to(device) else: - logger.info("lyt_debug starting awq format packing") from awq.utils.utils import clear_memory # pylint: disable=E0401 scale, zp = weight_config[name]["scale"].to(torch.float32), weight_config[name]["zp"].to(torch.float32) scale = scale.t().contiguous() From 56e60d7c1a28e5ef7d62737c9fee1ecebdbe3ddd Mon Sep 17 00:00:00 2001 From: yintong-lu Date: Tue, 25 Jun 2024 03:26:31 -0400 Subject: [PATCH 30/40] remove comments Signed-off-by: yintong-lu --- auto_round/export/export_to_autoround/export.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py index cdcf64fa..b7d76382 100644 --- a/auto_round/export/export_to_autoround/export.py +++ b/auto_round/export/export_to_autoround/export.py @@ -176,9 +176,6 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="autoround:exl clear_memory() quantization_config = kwargs["serialization_dict"] quantization_config["quant_method"] = "intel/auto-round" - # if "awq" in backend: - # quantization_config["quant_method"], quantization_config["version"] = "awq", "gemm" - # quantization_config["modules_to_not_convert"] = None if not modules_to_not_convert else modules_to_not_convert quantization_config["backend"] = backend extra_config = {} for layer_name in weight_config: From 3385eb94f97ff0806ba24bab8b800988bcfe7637 Mon Sep 17 00:00:00 2001 From: yintong-lu Date: Thu, 27 Jun 2024 21:47:07 -0400 Subject: [PATCH 31/40] update comments Signed-off-by: yintong-lu --- auto_round/export/export_to_autoround/export.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py index b7d76382..ad5a9792 100644 --- a/auto_round/export/export_to_autoround/export.py +++ b/auto_round/export/export_to_autoround/export.py @@ -93,7 +93,7 @@ def dynamic_QuantLienar_for_packing(backend, bits, group_size): from awq.modules.linear import WQLinear_GEMM # pylint: disable=E0401 return WQLinear_GEMM except: - logger.error("autoawq is required. Please install it to support auto_awq format.") + logger.error("autoawq is required. Please install it by 'pip install autoawq' to support auto_awq format.") return elif "autoround" in backend or "auto-round" in backend or "auto_round" in backend: from auto_round_extension.cuda.qliner_triton import QuantLinear From 5ec30d49bd8ff526bd90cbe74d595c48da68a04b Mon Sep 17 00:00:00 2001 From: yintong-lu Date: Tue, 2 Jul 2024 20:54:52 -0400 Subject: [PATCH 32/40] move awq to autoround format evaluation Signed-off-by: yintong-lu --- .../export/export_to_autoround/export.py | 25 ++++++------------- auto_round/utils.py | 6 +++++ 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py index ad5a9792..82a0b423 100644 --- a/auto_round/export/export_to_autoround/export.py +++ b/auto_round/export/export_to_autoround/export.py @@ -176,7 +176,8 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="autoround:exl clear_memory() quantization_config = kwargs["serialization_dict"] quantization_config["quant_method"] = "intel/auto-round" - quantization_config["backend"] = backend + if "awq" not in backend: + quantization_config["backend"] = backend extra_config = {} for layer_name in weight_config: if weight_config[layer_name]["bits"] >= 16: @@ -203,22 +204,13 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="autoround:exl quantization_config["extra_config"] = extra_config if hasattr(model, "config"): model.config.quantization_config = quantization_config - if "awq" in backend: - awq_quant_config = { - "quant_method": "awq", - "zero_point": not quantization_config["sym"], - "group_size": quantization_config["group_size"], - "bits": quantization_config["bits"], - "version": "gemm", - "modules_to_not_convert": None if not modules_to_not_convert else modules_to_not_convert, - } tokenizer = kwargs["tokenizer"] if tokenizer is not None: tokenizer.save_pretrained(output_dir) if "awq" not in backend: save(model, output_dir) else: - save_awq(model, output_dir, awq_quant_config=awq_quant_config) + save_awq(model, output_dir, modules_to_not_convert=modules_to_not_convert) def save(model: nn.Module, save_dir: str, max_shard_size: str = "5GB", safe_serialization: bool = True): @@ -255,7 +247,7 @@ def save_awq( save_dir: str, max_shard_size: str = "5GB", safe_serialization: bool = True, - awq_quant_config: dict = {} + modules_to_not_convert: list = [], ): """Save model state dict and configs. @@ -277,11 +269,10 @@ def save_awq( Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). """ os.makedirs(save_dir, exist_ok=True) - if hasattr(model, "config") and hasattr(model.config, "quantization_config"): - quantization_config = model.config.quantization_config - else: - quantization_config = awq_quant_config - model.config.quantization_config = awq_quant_config + quantization_config = model.config.quantization_config + model.config.quantization_config["quant_method"] = "awq" + model.config.quantization_config["modules_to_not_convert"] = None if not modules_to_not_convert \ + else modules_to_not_convert model.save_pretrained(save_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization) config_file = "quantization_config.json" if hasattr(model, "config") and hasattr(model.config, "quantization_config"): diff --git a/auto_round/utils.py b/auto_round/utils.py index 2b8d817b..5f90f57a 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -656,6 +656,12 @@ def dynamic_import_inference_linear(bits, group_size, backend): disable_marlin=disable_marlin, ) return QuantLinear + if "awq" in backend: + try: + from awq.modules.linear import WQLinear_GEMM # pylint: disable=E0401 + except: + raise ImportError("autoawq is required. Please install it by 'pip install autoawq' to support auto_awq format.") + return WQLinear_GEMM if bits == 4 and exllama2_available and "exllamav2" in backend: from auto_round_extension.cuda.qliner_exllamav2 import QuantLinear elif bits == 4 and "exllamav2" in backend: From d5537f4509f9cab295058cf3791029704f31c0fc Mon Sep 17 00:00:00 2001 From: yintong-lu Date: Tue, 2 Jul 2024 21:02:11 -0400 Subject: [PATCH 33/40] pylint error fixing Signed-off-by: yintong-lu --- auto_round/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/auto_round/utils.py b/auto_round/utils.py index 5f90f57a..317bef15 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -660,7 +660,8 @@ def dynamic_import_inference_linear(bits, group_size, backend): try: from awq.modules.linear import WQLinear_GEMM # pylint: disable=E0401 except: - raise ImportError("autoawq is required. Please install it by 'pip install autoawq' to support auto_awq format.") + raise ImportError("autoawq is required. Please install it by 'pip install autoawq' to \ + support auto_awq format.") return WQLinear_GEMM if bits == 4 and exllama2_available and "exllamav2" in backend: from auto_round_extension.cuda.qliner_exllamav2 import QuantLinear From 821a1b9f155c101b780195a6b62c593447b181c2 Mon Sep 17 00:00:00 2001 From: yintong-lu Date: Tue, 23 Jul 2024 22:11:40 -0400 Subject: [PATCH 34/40] fix conflicts, and add awq format Signed-off-by: yintong-lu --- auto_round/export/__init__.py | 2 +- .../export/export_to_autoround/export.py | 115 +++++++-- auto_round/export/export_to_awq/__init__.py | 17 ++ auto_round/export/export_to_awq/export.py | 230 ++++++++++++++++++ auto_round/utils.py | 7 + examples/language-modeling/main.py | 5 +- 6 files changed, 351 insertions(+), 25 deletions(-) create mode 100644 auto_round/export/export_to_awq/__init__.py create mode 100644 auto_round/export/export_to_awq/export.py diff --git a/auto_round/export/__init__.py b/auto_round/export/__init__.py index 6b0fd04b..ec304793 100644 --- a/auto_round/export/__init__.py +++ b/auto_round/export/__init__.py @@ -16,5 +16,5 @@ from auto_round.export.export_to_autogptq.export import save_quantized_as_autogptq from .export_to_itrex import save_quantized_as_itrex, QuantConfig from .export_to_autoround.export import save_quantized_as_autoround - +from .export_to_awq.export import save_quantized_as_autoawq diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py index 13b15331..910a3303 100644 --- a/auto_round/export/export_to_autoround/export.py +++ b/auto_round/export/export_to_autoround/export.py @@ -119,16 +119,22 @@ def dynamic_import_quantLinear_for_packing(backend, bits, group_size, sym): Raises: AssertionError: If the backend is not supported. """ - if "auto_round" in backend: + if "auto_round" in backend and "awq" not in backend: ##only support triton and exllamav2 if not ("triton" in backend or "exllamav2" in backend): logger.warning_once(f"autoround format does not support {backend}, try to pack with autogptq") return get_autogptq_packing_qlinear(backend, bits, group_size, sym) from auto_round_extension.cuda.qliner_triton import QuantLinear return QuantLinear + elif "awq" in backend: + try: + from awq.modules.linear import WQLinear_GEMM # pylint: disable=E0401 + return WQLinear_GEMM + except: + logger.error("autoawq is required. Please install it by 'pip install autoawq' to support auto_awq format.") + return elif "gptq" in backend: return get_autogptq_packing_qlinear(backend, bits, group_size, sym) - else: assert False, f"only support gptq and autoround backend" @@ -160,7 +166,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex backend = "autoround:exllamav2" backend = backend.replace("autoround", "auto_round") backend = backend.replace("auto-round", "auto_round") - if not ("triton" in backend or "exllamav2" in backend): + if not ("triton" in backend or "exllamav2" in backend or "awq" in backend): logger.info(f"autoround format does not support {backend}, try to pack with autogptq") backend = backend.replace("auto_round", "auto_gptq") @@ -174,7 +180,6 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex layer_config = kwargs["layer_config"] quantization_config = kwargs["serialization_dict"] quantization_config["quant_method"] = "intel/auto-round" - quantization_config["backend"] = backend extra_config = {} for layer_name in layer_config: if layer_name not in layer_names_in_block and layer_config[layer_name]["bits"] <= 8: ##lm head @@ -198,10 +203,13 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex if len(extra_config) > 0: quantization_config["extra_config"] = extra_config with tctl.threadpool_limits(limits=1): + modules_to_not_convert = [] for name in layer_config.keys(): config = kwargs["layer_config"][name] if config["bits"] > 8: + if "awq" in backend: + modules_to_not_convert.append(name) continue logger.info(f"packing {name}") @@ -225,33 +233,55 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex out_features = layer.weight.shape[1] bias = layer.bias is not None and torch.any(layer.bias) - new_layer = QuantLinear( ##pylint: disable=E1123 + if "awq" not in backend: + new_layer = QuantLinear( ##pylint: disable=E1123 bits, group_size, in_features, out_features, bias, weight_dtype=layer.weight.dtype - ) - - new_layer.device = device - set_module(model, name, new_layer) - qlayer = new_layer - scale = layer_config[name]["scale"] - zero = layer_config[name]["zp"] - # so far can only pack layer on CPU - qlayer.to("cpu") - ##force to float32 to be compatible with torch 2.0 - layer, scale, zero = layer.to("cpu"), scale.to("cpu"), zero.to("cpu").to(torch.float32) - sig = inspect.signature(qlayer.pack) - param_count = len(sig.parameters) - if param_count == 2: - qlayer.pack(layer, scale) + ) + new_layer.device = device + set_module(model, name, new_layer) + qlayer = new_layer + scale = layer_config[name]["scale"] + zero = layer_config[name]["zp"] + # so far can only pack layer on CPU + qlayer.to("cpu") + ##force to float32 to be compatible with torch 2.0 + layer, scale, zero = layer.to("cpu"), scale.to("cpu"), zero.to("cpu").to(torch.float32) + sig = inspect.signature(qlayer.pack) + param_count = len(sig.parameters) + if param_count == 2: + qlayer.pack(layer, scale) + else: + qlayer.pack(layer, scale, zero, None) + qlayer.to(device) else: - qlayer.pack(layer, scale, zero, None) - qlayer.to(device) + from awq.utils.utils import clear_memory # pylint: disable=E0401 + scale, zp = layer_config[name]["scale"].to(torch.float32), layer_config[name]["zp"].to(torch.float32) + scale = scale.t().contiguous() + zp = zp.t().contiguous() + if bits != 4: + logger.error("AutoAWQ format only supports 4-bits quantization.") + qlayer = QuantLinear.from_linear( + linear=layer, + w_bit=bits, + group_size=group_size, + init_only=False, + scales=scale, + zeros=zp, + ) + qlayer.to(device) + set_module(model, name, qlayer) + clear_memory() + if hasattr(model, "config"): model.config.quantization_config = quantization_config tokenizer = kwargs["tokenizer"] if tokenizer is not None: tokenizer.save_pretrained(output_dir) - save(model, output_dir) + if "awq" not in backend: + save(model, output_dir) + else: + save_awq(model, output_dir, modules_to_not_convert=modules_to_not_convert) def save(model: nn.Module, save_dir: str, max_shard_size: str = "5GB", safe_serialization: bool = True): @@ -281,3 +311,42 @@ def save(model: nn.Module, save_dir: str, max_shard_size: str = "5GB", safe_seri with open(os.path.join(save_dir, config_file), "w", encoding="utf-8") as f: json.dump(model.config.quantization_config, f, indent=2) + + +def save_awq( + model: nn.Module, + save_dir: str, + max_shard_size: str = "5GB", + safe_serialization: bool = True, + modules_to_not_convert: list = [], +): + """Save model state dict and configs. + + Args: + model (`nn.Module`): + Model to be saved. The model can be wrapped or unwrapped. + save_dir (`str`): + Directory to which to save. Will be created if it doesn't exist. + max_shard_size (`str`, defaults to `"10GB"`): + The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size + lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5MB"`). + + + If a single weight of the model is bigger than `max_shard_size`, it will be in its own checkpoint shard + which will be bigger than `max_shard_size`. + + + safe_serialization (`bool`, defaults to `True`): + Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + """ + os.makedirs(save_dir, exist_ok=True) + quantization_config = model.config.quantization_config + model.config.quantization_config["quant_method"] = "awq" + model.config.quantization_config["modules_to_not_convert"] = None if not modules_to_not_convert \ + else modules_to_not_convert + model.save_pretrained(save_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization) + config_file = "quantization_config.json" + if hasattr(model, "config") and hasattr(model.config, "quantization_config"): + with open(os.path.join(save_dir, config_file), "w", encoding="utf-8") as f: + json.dump(quantization_config, f, indent=2) + \ No newline at end of file diff --git a/auto_round/export/export_to_awq/__init__.py b/auto_round/export/export_to_awq/__init__.py new file mode 100644 index 00000000..0bdb4d35 --- /dev/null +++ b/auto_round/export/export_to_awq/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .export import save_quantized_as_autoawq + + diff --git a/auto_round/export/export_to_awq/export.py b/auto_round/export/export_to_awq/export.py new file mode 100644 index 00000000..03888ba3 --- /dev/null +++ b/auto_round/export/export_to_awq/export.py @@ -0,0 +1,230 @@ +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import copy +import json +import os +from os.path import isdir, isfile, join +from typing import Dict, List, Optional, Union + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +import torch +import torch.nn as nn + +from auto_round.export.register import register_format +from auto_round.utils import convert_dtype_torch2str_hf, logger + +# MIT License + +# Copyright (c) 2023 MIT HAN Lab + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + + +@register_format("auto_awq") +def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs): + """Export the model to autogptq format to easily leverage cuda kernel.""" + model = kwargs["model"] + layer_config = kwargs["layer_config"] + sym = kwargs["sym"] + bits = kwargs["bits"] + group_size = kwargs["group_size"] + iters = kwargs["iters"] + lr = kwargs["lr"] + minmax_lr = kwargs["minmax_lr"] + enable_minmax_tuning = kwargs["enable_minmax_tuning"] + enable_quanted_input = kwargs["enable_quanted_input"] + scale_dtype = kwargs["scale_dtype"] + tokenizer = kwargs["tokenizer"] + + logger.info("Saving quantized model to auto_awq format") + if tokenizer is not None: + tokenizer.save_pretrained(output_dir) + ##check module quantized in block, this may have bug for mixed precision quantization + modules_to_not_convert = [] + if inplace: + compressed_model = model.to("cpu") + else: + compressed_model = copy.deepcopy(model.to("cpu")) + + from awq import AutoAWQForCausalLM # pylint: disable=E0401 + from awq.modules.linear import WQLinear_GEMM # pylint: disable=E0401 + from awq.utils.utils import clear_memory # pylint: disable=E0401 + + q_linear_module = WQLinear_GEMM + awq_model = AutoAWQForCausalLM.from_pretrained(model_path) + self_modules = awq_model.get_model_layers(compressed_model) + del awq_model # release memory + for i in range(len(self_modules)): + module = self_modules[i] + named_linears = get_named_linears(module) + for name, linear_layer in named_linears.items(): + key = get_module_name(compressed_model, linear_layer) + logger.info(f"packing {name}") + config = layer_config[key] + if config["bits"] > 8: + modules_to_not_convert.append(name) + continue + config["zp"] = config["zp"].to(torch.float32) + scale, zp = config["scale"], config["zp"] + scale = scale.t().contiguous() + zp = zp.t().contiguous() + q_linear = q_linear_module.from_linear( + linear=linear_layer, + w_bit=bits, + group_size=group_size, + init_only=False, + scales=scale, + zeros=zp, + ) + linear_layer.cpu() + q_linear.to(next(module.parameters()).device) + set_op_by_name(module, name, q_linear) + clear_memory() + + quant_config = {} + quant_config["quant_method"] = "awq" + quant_config["modules_to_not_convert"] = None + quant_config["version"] = "gemm" + quant_config["iters"] = iters + quant_config["lr"] = lr + quant_config["minmax_lr"] = minmax_lr + quant_config["enable_minmax_tuning"] = enable_minmax_tuning + quant_config["enable_quanted_input"] = enable_quanted_input + quant_config["scale_dtype"] = convert_dtype_torch2str_hf(scale_dtype) + quant_config["sym"] = sym + quant_config["bits"] = bits + quant_config["group_size"] = group_size + quant_config["zero_point"] = not sym + + save_quantized(compressed_model, save_dir=output_dir, quant_config=quant_config) + + +from safetensors.torch import save_file +from transformers.modeling_utils import shard_checkpoint + + +def save_quantized( + model, + save_dir, + quant_config, + safetensors=True, + shard_size="5GB", +): + save_dir = save_dir[:-1] if save_dir[-1] == "/" else save_dir + + # Save model + class EmptyModule(nn.Module): + def __init__(self): + super(EmptyModule, self).__init__() + + def forward(self, x): + return x + + # Save model and config files with empty state dict + awq_quant_config = { + "quant_method": "awq", + "zero_point": quant_config["zero_point"], + "group_size": quant_config["group_size"], + "bits": quant_config["bits"], + "version": "gemm", + "modules_to_not_convert": quant_config["modules_to_not_convert"], + } + + model.config.quantization_config = awq_quant_config + model.generation_config.do_sample = True + model.save_pretrained(save_dir, state_dict=EmptyModule().state_dict()) + + # Remove empty state dict + default_paths = [ + f"{save_dir}/model.safetensors", + f"{save_dir}/pytorch_model.bin", + ] + for path in default_paths: + if os.path.exists(path): + os.remove(path) + + # model_name has no extension, add it when saving state_dict + model_name = "model.safetensors" if safetensors else "pytorch_model.bin" + + # shard checkpoint into chunks (10GB default) + shards, index = shard_checkpoint(model.state_dict(), max_shard_size=shard_size, weights_name=model_name) + + for shard_file, shard in shards.items(): + if safetensors: + # safetensors must be in the same memory, so we duplicate and use contiguous memory + shard = {k: v.clone().contiguous() for k, v in shard.items()} + save_file(shard, os.path.join(save_dir, shard_file), metadata={"format": "pt"}) + else: + torch.save(shard, os.path.join(save_dir, shard_file)) + + # save shard index + if index is not None: + with open(f"{save_dir}/{model_name}.index.json", "w+") as file: + file.write(json.dumps(index, indent=4)) + + # save quantize_config + with open(join(save_dir, "quantization_config.json"), "w", encoding="utf-8") as f: + json.dump(quant_config, f, indent=2) + + +def get_named_linears(module): + """Get the name, linear_op pairs of a given module. + Args: + module: A module to be searched. + """ + return {name: m for name, m in module.named_modules() if isinstance(m, torch.nn.Linear)} + + +def set_op_by_name(layer, name, new_module): + levels = name.split(".") + if len(levels) > 1: + mod_ = layer + for l_idx in range(len(levels) - 1): + if levels[l_idx].isdigit(): + mod_ = mod_[int(levels[l_idx])] + else: + mod_ = getattr(mod_, levels[l_idx]) + setattr(mod_, levels[-1], new_module) + else: + setattr(layer, name, new_module) + + +def get_module_name(model, module_to_find): + """Get the name of a given module in a model. + Args: + model: The model. + module_to_find: A module to be found. + Returns: + name: The corresponding name of the given module. + """ + for name, module in model.named_modules(): + if module is module_to_find: + return name + return None diff --git a/auto_round/utils.py b/auto_round/utils.py index 195c2586..747c8d67 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -722,6 +722,13 @@ def dynamic_import_inference_linear(backend, bits, group_size, sym): else: from auto_round_extension.hpu.qlinear_hpu import QuantLinear return QuantLinear + if "awq" in backend: + try: + from awq.modules.linear import WQLinear_GEMM # pylint: disable=E0401 + except: + raise ImportError("autoawq is required. Please install it by 'pip install autoawq' to \ + support auto_awq format.") + return WQLinear_GEMM if bits == 4 and exllama2_available and "exllamav2" in backend: from auto_round_extension.cuda.qliner_exllamav2 import QuantLinear elif bits == 4 and "exllamav2" in backend: diff --git a/examples/language-modeling/main.py b/examples/language-modeling/main.py index 942a5106..9460e513 100644 --- a/examples/language-modeling/main.py +++ b/examples/language-modeling/main.py @@ -312,7 +312,7 @@ deployment_device = args.deployment_device.split(',') gpu_formats = [] for item in deployment_device: - if "gpu" in item or "auto_gptq" in item or "auto_round" in item: + if "gpu" in item or "auto_gptq" in item or "auto_round" in item or "auto_awq" in item: gpu_formats.append(item) if 'gpu' in deployment_device: @@ -331,6 +331,9 @@ elif "gptq" in gpu_format: eval_folder = f'{export_dir}-gpu' autoround.save_quantized(eval_folder, format=gpu_format, use_triton=False, inplace=inplace) + elif "auto_awq" in gpu_format: + eval_folder = f'{export_dir}-awq' + autoround.save_quantized(eval_folder, format=gpu_format, inplace=inplace, model_path=model_name) if 'xpu' in deployment_device: autoround.save_quantized(f'{export_dir}-xpu', format="itrex_xpu", use_triton=True, inplace=inplace, From 1f87cad9d8c864b9f2745131edf0ba8e927209e8 Mon Sep 17 00:00:00 2001 From: yintong-lu Date: Wed, 24 Jul 2024 02:46:26 -0400 Subject: [PATCH 35/40] add ut Signed-off-by: yintong-lu --- test/test_export.py | 51 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/test/test_export.py b/test/test_export.py index 7a747837..ec777136 100644 --- a/test/test_export.py +++ b/test/test_export.py @@ -152,3 +152,54 @@ def test_autoround_format(self): # shutil.rmtree("./saved", ignore_errors=True) # + + def test_autoawq_format(self): + bits, group_size, sym = 4, 128, False + autoround = AutoRound( + self.model, + self.tokenizer, + bits=bits, + group_size=group_size, + sym=sym, + iters=2, + seqlen=2, + dataset=self.llm_dataloader, + ) + autoround.quantize() + quantized_model_path = "./saved" + + autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_awq", model_path="facebook/opt-125m") + + from auto_round.auto_quantizer import AutoHfQuantizer + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto") + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) + text = "There is a girl who likes adventure," + inputs = tokenizer(text, return_tensors="pt").to(model.device) + print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) + shutil.rmtree("./saved", ignore_errors=True) + + + def test_autoround_awq_format(self): + bits, group_size, sym = 4, 128, False + autoround = AutoRound( + self.model, + self.tokenizer, + bits=bits, + group_size=group_size, + sym=sym, + iters=2, + seqlen=2, + dataset=self.llm_dataloader, + ) + autoround.quantize() + quantized_model_path = "./saved" + + autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round:awq") + + from auto_round.auto_quantizer import AutoHfQuantizer + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto") + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) + text = "There is a girl who likes adventure," + inputs = tokenizer(text, return_tensors="pt").to(model.device) + print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) + shutil.rmtree("./saved", ignore_errors=True) \ No newline at end of file From c888e125b6097fb81889f24b64375c40a4d11afd Mon Sep 17 00:00:00 2001 From: yintong-lu Date: Wed, 24 Jul 2024 02:57:56 -0400 Subject: [PATCH 36/40] add requirement Signed-off-by: yintong-lu --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 83ec1fe4..40cd1722 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,5 @@ torch transformers triton numpy < 2.0 -threadpoolctl \ No newline at end of file +threadpoolctl +autoawq \ No newline at end of file From 1bc88db3042d79e05245d703edad1083645a25af Mon Sep 17 00:00:00 2001 From: yintong-lu Date: Wed, 24 Jul 2024 03:18:28 -0400 Subject: [PATCH 37/40] add coverage test waiver Signed-off-by: yintong-lu --- .../export/export_to_autoround/export.py | 14 ++--- auto_round/export/export_to_awq/export.py | 10 ++-- examples/language-modeling/main.py | 2 +- requirements.txt | 3 +- test/test_export.py | 54 +------------------ 5 files changed, 15 insertions(+), 68 deletions(-) diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py index c63ab0a1..c5a9929e 100644 --- a/auto_round/export/export_to_autoround/export.py +++ b/auto_round/export/export_to_autoround/export.py @@ -127,7 +127,7 @@ def dynamic_import_quantLinear_for_packing(backend, bits, group_size, sym): return get_autogptq_packing_qlinear(backend, bits, group_size, sym) from auto_round_extension.cuda.qliner_triton import QuantLinear return QuantLinear - elif "awq" in backend: + elif "awq" in backend: # pragma: no cover try: from awq.modules.linear import WQLinear_GEMM # pylint: disable=E0401 return WQLinear_GEMM @@ -181,7 +181,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex layer_config = kwargs["layer_config"] quantization_config = kwargs["serialization_dict"] quantization_config["quant_method"] = "intel/auto-round" - if "awq" not in backend: + if "awq" not in backend: # pragma: no cover quantization_config["backend"] = backend extra_config = {} for layer_name in layer_config: @@ -206,11 +206,11 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex if len(extra_config) > 0: quantization_config["extra_config"] = extra_config with tctl.threadpool_limits(limits=1): - modules_to_not_convert = [] + modules_to_not_convert = [] # pragma: no cover for name in layer_config.keys(): config = kwargs["layer_config"][name] if config["bits"] > 8: - if "awq" in backend: + if "awq" in backend: # pragma: no cover modules_to_not_convert.append(name) continue logger.info(f"packing {name}") @@ -255,7 +255,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex else: qlayer.pack(layer, scale, zero, None) qlayer.to(device) - else: + else: # pragma: no cover from awq.utils.utils import clear_memory # pylint: disable=E0401 scale, zp = layer_config[name]["scale"].to(torch.float32), layer_config[name]["zp"].to(torch.float32) scale = scale.t().contiguous() @@ -282,7 +282,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex tokenizer.save_pretrained(output_dir) if "awq" not in backend: save(model, output_dir, safe_serialization=safe_serialization) - else: + else: # pragma: no cover save_awq(model, output_dir, modules_to_not_convert=modules_to_not_convert) @@ -321,7 +321,7 @@ def save_awq( max_shard_size: str = "5GB", safe_serialization: bool = True, modules_to_not_convert: list = [], -): +): # pragma: no cover """Save model state dict and configs. Args: diff --git a/auto_round/export/export_to_awq/export.py b/auto_round/export/export_to_awq/export.py index 03888ba3..451be56e 100644 --- a/auto_round/export/export_to_awq/export.py +++ b/auto_round/export/export_to_awq/export.py @@ -48,7 +48,7 @@ @register_format("auto_awq") -def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs): +def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs): # pragma: no cover """Export the model to autogptq format to easily leverage cuda kernel.""" model = kwargs["model"] layer_config = kwargs["layer_config"] @@ -136,7 +136,7 @@ def save_quantized( quant_config, safetensors=True, shard_size="5GB", -): +): # pragma: no cover save_dir = save_dir[:-1] if save_dir[-1] == "/" else save_dir # Save model @@ -194,7 +194,7 @@ def forward(self, x): json.dump(quant_config, f, indent=2) -def get_named_linears(module): +def get_named_linears(module): # pragma: no cover """Get the name, linear_op pairs of a given module. Args: module: A module to be searched. @@ -202,7 +202,7 @@ def get_named_linears(module): return {name: m for name, m in module.named_modules() if isinstance(m, torch.nn.Linear)} -def set_op_by_name(layer, name, new_module): +def set_op_by_name(layer, name, new_module): # pragma: no cover levels = name.split(".") if len(levels) > 1: mod_ = layer @@ -216,7 +216,7 @@ def set_op_by_name(layer, name, new_module): setattr(layer, name, new_module) -def get_module_name(model, module_to_find): +def get_module_name(model, module_to_find): # pragma: no cover """Get the name of a given module in a model. Args: model: The model. diff --git a/examples/language-modeling/main.py b/examples/language-modeling/main.py index 1e1d5430..5349880f 100644 --- a/examples/language-modeling/main.py +++ b/examples/language-modeling/main.py @@ -339,7 +339,7 @@ elif "gptq" in gpu_format: eval_folder = f'{export_dir}-gpu' autoround.save_quantized(eval_folder, format=gpu_format, use_triton=False, inplace=inplace) - elif "auto_awq" in gpu_format: + elif "auto_awq" in gpu_format: # pragma: no cover eval_folder = f'{export_dir}-awq' autoround.save_quantized(eval_folder, format=gpu_format, inplace=inplace, model_path=model_name) diff --git a/requirements.txt b/requirements.txt index 40cd1722..83ec1fe4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,5 +6,4 @@ torch transformers triton numpy < 2.0 -threadpoolctl -autoawq \ No newline at end of file +threadpoolctl \ No newline at end of file diff --git a/test/test_export.py b/test/test_export.py index ec777136..6cab1f2a 100644 --- a/test/test_export.py +++ b/test/test_export.py @@ -150,56 +150,4 @@ def test_autoround_format(self): # inputs = tokenizer(text, return_tensors="pt").to(model.device) # print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) # shutil.rmtree("./saved", ignore_errors=True) - # - - - def test_autoawq_format(self): - bits, group_size, sym = 4, 128, False - autoround = AutoRound( - self.model, - self.tokenizer, - bits=bits, - group_size=group_size, - sym=sym, - iters=2, - seqlen=2, - dataset=self.llm_dataloader, - ) - autoround.quantize() - quantized_model_path = "./saved" - - autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_awq", model_path="facebook/opt-125m") - - from auto_round.auto_quantizer import AutoHfQuantizer - model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto") - tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) - text = "There is a girl who likes adventure," - inputs = tokenizer(text, return_tensors="pt").to(model.device) - print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) - shutil.rmtree("./saved", ignore_errors=True) - - - def test_autoround_awq_format(self): - bits, group_size, sym = 4, 128, False - autoround = AutoRound( - self.model, - self.tokenizer, - bits=bits, - group_size=group_size, - sym=sym, - iters=2, - seqlen=2, - dataset=self.llm_dataloader, - ) - autoround.quantize() - quantized_model_path = "./saved" - - autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round:awq") - - from auto_round.auto_quantizer import AutoHfQuantizer - model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto") - tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) - text = "There is a girl who likes adventure," - inputs = tokenizer(text, return_tensors="pt").to(model.device) - print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) - shutil.rmtree("./saved", ignore_errors=True) \ No newline at end of file + # \ No newline at end of file From bee724aa69903f7f818c3abdb1a5cedb78ce683f Mon Sep 17 00:00:00 2001 From: yintong-lu Date: Wed, 24 Jul 2024 03:54:52 -0400 Subject: [PATCH 38/40] minor change Signed-off-by: yintong-lu --- auto_round/export/export_to_autoround/export.py | 2 +- auto_round/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py index c5a9929e..a038cc5a 100644 --- a/auto_round/export/export_to_autoround/export.py +++ b/auto_round/export/export_to_autoround/export.py @@ -167,7 +167,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex backend = "autoround:exllamav2" backend = backend.replace("autoround", "auto_round") backend = backend.replace("auto-round", "auto_round") - if not ("triton" in backend or "exllamav2" in backend or "awq" in backend): + if not ("triton" in backend or "exllamav2" in backend or "awq" in backend): # pragma: no cover logger.info(f"autoround format does not support {backend}, try to pack with autogptq") backend = backend.replace("auto_round", "auto_gptq") diff --git a/auto_round/utils.py b/auto_round/utils.py index 2d34eaac..a3f2a5e1 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -812,7 +812,7 @@ def dynamic_import_inference_linear(backend, bits, group_size, sym): else: from auto_round_extension.hpu.qlinear_hpu import QuantLinear return QuantLinear - if "awq" in backend: + if "awq" in backend: # pragma: no cover try: from awq.modules.linear import WQLinear_GEMM # pylint: disable=E0401 except: From 949c48a94e69cbc3050544ed09995684d7dc85f4 Mon Sep 17 00:00:00 2001 From: yintong-lu Date: Wed, 24 Jul 2024 21:11:04 -0400 Subject: [PATCH 39/40] refine code to decrease number of branches Signed-off-by: yintong-lu --- auto_round/export/export_to_autoround/export.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py index a038cc5a..01422c01 100644 --- a/auto_round/export/export_to_autoround/export.py +++ b/auto_round/export/export_to_autoround/export.py @@ -41,16 +41,12 @@ def check_neq_config(config, data_type, bits, group_size, sym): Returns: list: A list of strings indicating which configuration parameters do not match. """ - res = [] - if data_type != config["data_type"]: - res.append("data_type") - if bits != config["bits"]: - res.append("bits") - if group_size != config["group_size"]: - res.append("group_size") - if sym != config["sym"]: - res.append("sym") - return res + expected_config = {"data_type": data_type, + "bits": bits, + "group_size": group_size, + "sym": sym + } + return [key for key, expected_value in expected_config.items() if config.get(key) != expected_value] def get_autogptq_packing_qlinear(backend, bits=4, group_size=128, sym=False): From 77943867a886e4a84479c04577a23dbd9beb4ab1 Mon Sep 17 00:00:00 2001 From: yintong-lu Date: Tue, 30 Jul 2024 02:59:05 -0400 Subject: [PATCH 40/40] add ut, fix minor issues Signed-off-by: yintong-lu --- .../export/export_to_autoround/export.py | 16 ++--- auto_round/export/export_to_awq/export.py | 47 +++++++------- test/test_export.py | 61 +++++++++++++++++++ 3 files changed, 91 insertions(+), 33 deletions(-) diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py index 01422c01..7a0061d1 100644 --- a/auto_round/export/export_to_autoround/export.py +++ b/auto_round/export/export_to_autoround/export.py @@ -123,7 +123,7 @@ def dynamic_import_quantLinear_for_packing(backend, bits, group_size, sym): return get_autogptq_packing_qlinear(backend, bits, group_size, sym) from auto_round_extension.cuda.qliner_triton import QuantLinear return QuantLinear - elif "awq" in backend: # pragma: no cover + elif "awq" in backend: try: from awq.modules.linear import WQLinear_GEMM # pylint: disable=E0401 return WQLinear_GEMM @@ -163,7 +163,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex backend = "autoround:exllamav2" backend = backend.replace("autoround", "auto_round") backend = backend.replace("auto-round", "auto_round") - if not ("triton" in backend or "exllamav2" in backend or "awq" in backend): # pragma: no cover + if not ("triton" in backend or "exllamav2" in backend or "awq" in backend): logger.info(f"autoround format does not support {backend}, try to pack with autogptq") backend = backend.replace("auto_round", "auto_gptq") @@ -177,7 +177,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex layer_config = kwargs["layer_config"] quantization_config = kwargs["serialization_dict"] quantization_config["quant_method"] = "intel/auto-round" - if "awq" not in backend: # pragma: no cover + if "awq" not in backend: quantization_config["backend"] = backend extra_config = {} for layer_name in layer_config: @@ -202,11 +202,11 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex if len(extra_config) > 0: quantization_config["extra_config"] = extra_config with tctl.threadpool_limits(limits=1): - modules_to_not_convert = [] # pragma: no cover + modules_to_not_convert = [] for name in layer_config.keys(): config = kwargs["layer_config"][name] if config["bits"] > 8: - if "awq" in backend: # pragma: no cover + if "awq" in backend: modules_to_not_convert.append(name) continue logger.info(f"packing {name}") @@ -251,7 +251,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex else: qlayer.pack(layer, scale, zero, None) qlayer.to(device) - else: # pragma: no cover + else: from awq.utils.utils import clear_memory # pylint: disable=E0401 scale, zp = layer_config[name]["scale"].to(torch.float32), layer_config[name]["zp"].to(torch.float32) scale = scale.t().contiguous() @@ -278,7 +278,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex tokenizer.save_pretrained(output_dir) if "awq" not in backend: save(model, output_dir, safe_serialization=safe_serialization) - else: # pragma: no cover + else: save_awq(model, output_dir, modules_to_not_convert=modules_to_not_convert) @@ -317,7 +317,7 @@ def save_awq( max_shard_size: str = "5GB", safe_serialization: bool = True, modules_to_not_convert: list = [], -): # pragma: no cover +): """Save model state dict and configs. Args: diff --git a/auto_round/export/export_to_awq/export.py b/auto_round/export/export_to_awq/export.py index 451be56e..79aab82e 100644 --- a/auto_round/export/export_to_awq/export.py +++ b/auto_round/export/export_to_awq/export.py @@ -11,14 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - - -import copy -import json -import os -from os.path import isdir, isfile, join -from typing import Dict, List, Optional, Union - # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -26,29 +18,31 @@ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -import torch -import torch.nn as nn - -from auto_round.export.register import register_format -from auto_round.utils import convert_dtype_torch2str_hf, logger - # MIT License - # Copyright (c) 2023 MIT HAN Lab - # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: - # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. +import os +from os.path import isdir, isfile, join +import torch +import torch.nn as nn +from auto_round.export.register import register_format +from auto_round.utils import convert_dtype_torch2str_hf, logger +import copy +import json +from typing import Dict, List, Optional, Union + + @register_format("auto_awq") -def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs): # pragma: no cover +def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs): """Export the model to autogptq format to easily leverage cuda kernel.""" model = kwargs["model"] layer_config = kwargs["layer_config"] @@ -73,9 +67,12 @@ def save_quantized_as_autoawq(output_dir, model_path, inplace=True, **kwargs): # else: compressed_model = copy.deepcopy(model.to("cpu")) - from awq import AutoAWQForCausalLM # pylint: disable=E0401 - from awq.modules.linear import WQLinear_GEMM # pylint: disable=E0401 - from awq.utils.utils import clear_memory # pylint: disable=E0401 + try: + from awq import AutoAWQForCausalLM # pylint: disable=E0401 + from awq.modules.linear import WQLinear_GEMM # pylint: disable=E0401 + from awq.utils.utils import clear_memory # pylint: disable=E0401 + except: + logger.error("autoawq is required. Please install it by 'pip install autoawq' to support auto_awq format.") q_linear_module = WQLinear_GEMM awq_model = AutoAWQForCausalLM.from_pretrained(model_path) @@ -136,7 +133,7 @@ def save_quantized( quant_config, safetensors=True, shard_size="5GB", -): # pragma: no cover +): save_dir = save_dir[:-1] if save_dir[-1] == "/" else save_dir # Save model @@ -194,7 +191,7 @@ def forward(self, x): json.dump(quant_config, f, indent=2) -def get_named_linears(module): # pragma: no cover +def get_named_linears(module): """Get the name, linear_op pairs of a given module. Args: module: A module to be searched. @@ -202,7 +199,7 @@ def get_named_linears(module): # pragma: no cover return {name: m for name, m in module.named_modules() if isinstance(m, torch.nn.Linear)} -def set_op_by_name(layer, name, new_module): # pragma: no cover +def set_op_by_name(layer, name, new_module): levels = name.split(".") if len(levels) > 1: mod_ = layer @@ -216,7 +213,7 @@ def set_op_by_name(layer, name, new_module): # pragma: no cover setattr(layer, name, new_module) -def get_module_name(model, module_to_find): # pragma: no cover +def get_module_name(model, module_to_find): """Get the name of a given module in a model. Args: model: The model. diff --git a/test/test_export.py b/test/test_export.py index 6cab1f2a..aa23ff58 100644 --- a/test/test_export.py +++ b/test/test_export.py @@ -120,6 +120,67 @@ def test_autoround_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree("./saved", ignore_errors=True) + + def test_autoround_awq_format(self): + try: + import awq + except: + return + bits, group_size, sym = 4, 128, False + autoround = AutoRound( + self.model, + self.tokenizer, + bits=bits, + group_size=group_size, + sym=sym, + iters=2, + seqlen=2, + dataset=self.llm_dataloader, + ) + autoround.quantize() + quantized_model_path = "./saved" + + autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round:awq") + + from auto_round.auto_quantizer import AutoHfQuantizer + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto") + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) + text = "There is a girl who likes adventure," + inputs = tokenizer(text, return_tensors="pt").to(model.device) + print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) + shutil.rmtree("./saved", ignore_errors=True) + + + def test_autoawq_format(self): + try: + import awq + except: + return + bits, group_size, sym = 4, 128, False + autoround = AutoRound( + self.model, + self.tokenizer, + bits=bits, + group_size=group_size, + sym=sym, + iters=2, + seqlen=2, + dataset=self.llm_dataloader, + ) + autoround.quantize() + quantized_model_path = "./saved" + + autoround.save_quantized(output_dir=quantized_model_path, inplace=False, \ + format="auto_awq", model_path="facebook/opt-125m") + + from auto_round.auto_quantizer import AutoHfQuantizer + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto") + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) + text = "There is a girl who likes adventure," + inputs = tokenizer(text, return_tensors="pt").to(model.device) + print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) + shutil.rmtree("./saved", ignore_errors=True) + # def test_autoround_marlin_format(self): # if not torch.cuda.is_available(): # return