From 29cd0c2afbb11af6dece5029f07701506f32ef40 Mon Sep 17 00:00:00 2001
From: "Zhang, Weiwei1"
Date: Thu, 8 Aug 2024 18:52:09 +0800
Subject: [PATCH 1/3] add qwen int4 model, refine example

Signed-off-by: Zhang, Weiwei1
---
 README.md                          |  3 +++
 examples/language-modeling/main.py | 12 ++++++++++++
 2 files changed, 15 insertions(+)

diff --git a/README.md b/README.md
index ab03a9ee..5bc67d3e 100644
--- a/README.md
+++ b/README.md
@@ -22,6 +22,7 @@ image presents an overview of AutoRound. Check out our updated paper on [arxiv]
 ## What's New
+* [2024/08] Enabled exporting quantized models to the AutoRound format and running inference with it on HPU devices; please refer to [Intel/Qwen2-7B-int4-inc](https://huggingface.co/Intel/Qwen2-7B-int4-inc) and [Intel/Qwen2-57B-A14B-Instruct-int4-inc](https://huggingface.co/Intel/Qwen2-57B-A14B-Instruct-int4-inc).
 * [2024/07] Important change: the default value of nsamples has been changed from 512 to 128 to reduce the memory usages, which may cause a slight accuracy drop in some scenarios
 * [2024/06] AutoRound format supports mixed bit-widths and group sizes for inference, resolving the significant performance drop issue with the asymmetric kernel
 * [2024/05] AutoRound supports lm-head quantization, saving 0.7G for LLaMA3-8B at W4G128.
@@ -168,6 +169,8 @@ print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]))
 | Model                                | Supported                                                                                                                                                                                                                                                                                                               |
 |--------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| Qwen/Qwen2-7B                        | [HF-int4-model](https://huggingface.co/Intel/Qwen2-7B-int4-inc)                                                                                                                                                                                                                                                        |
+| Qwen/Qwen2-57B-A14B-Instruct         | [HF-int4-model](https://huggingface.co/Intel/Qwen2-57B-A14B-Instruct-int4-inc)                                                                                                                                                                                                                                         |
 | Intel/neural-chat-7b-v3-3            | [HF-int4-model](https://huggingface.co/Intel/neural-chat-7b-v3-3-int4-inc), [accuracy](./docs/neural-chat-7b-v3-3-acc.md), [recipe](./examples/language-modeling/scripts/neural-chat-7b-v3-3.sh), [example](./examples/language-modeling/)                                                                              |
 | Intel/neural-chat-7b-v3-1            | [HF-int4-model](https://huggingface.co/Intel/neural-chat-7b-v3-1-int4-inc), [accuracy](./docs/neural-chat-7b-v3-1-acc.md), [recipe](./examples/language-modeling/scripts/neural-chat-7b-v3-1.sh), [example](./examples/language-modeling/)                                                                              |
 | mistralai/Mistral-7B-v0.1            | [HF-int4-model-lmhead](https://huggingface.co/Intel/Mistral-7B-v0.1-int4-inc-lmhead),[HF-int4-model](https://huggingface.co/Intel/Mistral-7B-v0.1-int4-inc), [accuracy](./docs/Mistral-7B-v0.1-acc.md), [recipe](./examples/language-modeling/scripts/Mistral-7B-v0.1.sh), [example](./examples/language-modeling/)    |
diff --git a/examples/language-modeling/main.py b/examples/language-modeling/main.py
index f71d5c43..46873b8b 100644
--- a/examples/language-modeling/main.py
+++ b/examples/language-modeling/main.py
@@ -131,6 +131,9 @@
     parser.add_argument("--act_bits", default=32, type=int,
                         help="activation bits")
+
+    parser.add_argument("--layer_blacklist", default="", type=str,
+                        help="black list of quantization.")
 
     args = parser.parse_args()
@@ -269,6 +272,15 @@
                 layer_config[n] = {"bits": 32}
                 print(
                     f"{n} will not be quantized due to its shape not being divisible by 32, resulting in an exporting issue to autogptq")
+    layer_blacklist = args.layer_blacklist.split(",")
+    if bool(layer_blacklist):
+        for n, m in model.named_modules():
+            if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.modeling_utils.Conv1D):
+                name = n.split('.')[-1]
+                if name in layer_blacklist:
+                    layer_config[n] = {"bits": 32}
+                    print(
+                        f"{n} will not be quantized.")
     lm_head_layer_name = "lm_head"
     for n, _ in model.named_modules():
         lm_head_layer_name = n
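
A quick way to exercise the newly listed checkpoints is to load one through transformers and reuse the generation line already shown in the README. The snippet below is only a sketch: it assumes the auto-round package is installed so that the AutoRound-format backend is available to transformers (the exact import that registers the backend has varied across releases, so check the Intel/Qwen2-7B-int4-inc model card for the line matching your version), and the prompt string is an arbitrary placeholder.

from transformers import AutoModelForCausalLM, AutoTokenizer

# Assumption: the auto-round runtime is installed and its quantization backend
# is registered with transformers; otherwise loading this AutoRound-format
# checkpoint will fail.
model_name = "Intel/Qwen2-7B-int4-inc"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto",
                                             torch_dtype="auto")

# Placeholder prompt; any text works here.
inputs = tokenizer("What is the capital of France?", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]))
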
From 73991eb240f88e3ca42d0e421f41287d055b9a52 Mon Sep 17 00:00:00 2001
From: "Zhang, Weiwei1"
Date: Fri, 9 Aug 2024 17:22:44 +0800
Subject: [PATCH 2/3] update args

Signed-off-by: Zhang, Weiwei1
---
 examples/language-modeling/main.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/language-modeling/main.py b/examples/language-modeling/main.py
index 46873b8b..96cea2f0 100644
--- a/examples/language-modeling/main.py
+++ b/examples/language-modeling/main.py
@@ -132,8 +132,8 @@
     parser.add_argument("--act_bits", default=32, type=int,
                         help="activation bits")
 
-    parser.add_argument("--layer_blacklist", default="", type=str,
-                        help="black list of quantization.")
+    parser.add_argument("--fp_layer_list", default="", type=str,
+                        help="List of Layers to maintain original data type")
 
     args = parser.parse_args()
@@ -272,12 +272,12 @@
                 layer_config[n] = {"bits": 32}
                 print(
                     f"{n} will not be quantized due to its shape not being divisible by 32, resulting in an exporting issue to autogptq")
-    layer_blacklist = args.layer_blacklist.split(",")
-    if bool(layer_blacklist):
+    fp_layer_list = args.fp_layer_list.split(",")
+    if bool(fp_layer_list):
         for n, m in model.named_modules():
             if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.modeling_utils.Conv1D):
                 name = n.split('.')[-1]
-                if name in layer_blacklist:
+                if n in fp_layer_list or name in fp_layer_list:
                     layer_config[n] = {"bits": 32}
                     print(
                         f"{n} will not be quantized.")

From e0ef293d01cd312b822c8f86728f7c2d46201f22 Mon Sep 17 00:00:00 2001
From: "Zhang, Weiwei1"
Date: Fri, 9 Aug 2024 17:30:43 +0800
Subject: [PATCH 3/3] fix typos

Signed-off-by: Zhang, Weiwei1
---
 examples/language-modeling/main.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/language-modeling/main.py b/examples/language-modeling/main.py
index 96cea2f0..8605151b 100644
--- a/examples/language-modeling/main.py
+++ b/examples/language-modeling/main.py
@@ -132,7 +132,7 @@
     parser.add_argument("--act_bits", default=32, type=int,
                         help="activation bits")
 
-    parser.add_argument("--fp_layer_list", default="", type=str,
+    parser.add_argument("--fp_layers_list", default="", type=str,
                         help="List of Layers to maintain original data type")
 
     args = parser.parse_args()
@@ -272,12 +272,12 @@
                 layer_config[n] = {"bits": 32}
                 print(
                     f"{n} will not be quantized due to its shape not being divisible by 32, resulting in an exporting issue to autogptq")
-    fp_layer_list = args.fp_layer_list.split(",")
-    if bool(fp_layer_list):
+    fp_layers_list = args.fp_layers_list.split(",")
+    if bool(fp_layers_list):
         for n, m in model.named_modules():
             if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.modeling_utils.Conv1D):
                 name = n.split('.')[-1]
-                if n in fp_layer_list or name in fp_layer_list:
+                if n in fp_layers_list or name in fp_layers_list:
                     layer_config[n] = {"bits": 32}
                     print(
                         f"{n} will not be quantized.")
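
For readability, the layer-exclusion logic this series converges on can also be written as a standalone helper, shown below as a sketch rather than a drop-in change to main.py. The function name and the explicit layer_config argument are hypothetical; the matching rule (accept either the full module name or just its last component) is the one the patch uses. The sketch also guards against one pitfall in the patched code: "".split(",") returns [""], so bool() of the split result is always True, and checking the raw argument string instead keeps the loop from running when --fp_layers_list is left at its default.

import torch
import transformers


def apply_fp_layers(model, fp_layers_arg, layer_config):
    """Keep the listed Linear/Conv1D layers in their original data type (bits=32).

    fp_layers_arg is the raw --fp_layers_list value, e.g. "lm_head,down_proj".
    An entry may be a full module name or just its last component, mirroring the
    matching rule in the patch. The helper name itself is hypothetical.
    """
    # "".split(",") yields [""], so guard on the raw string rather than on
    # bool() of the split result.
    if not fp_layers_arg:
        return layer_config
    fp_layers = [s.strip() for s in fp_layers_arg.split(",") if s.strip()]
    for n, m in model.named_modules():
        if isinstance(m, (torch.nn.Linear, transformers.modeling_utils.Conv1D)):
            if n in fp_layers or n.split(".")[-1] in fp_layers:
                layer_config[n] = {"bits": 32}
                print(f"{n} will not be quantized.")
    return layer_config

With the flag as it lands in the final patch, a run would pass something like --fp_layers_list "lm_head,down_proj" to main.py so those layers stay at their original precision; the layer names in that example are illustrative only.
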