From 4dbda878a25c971d7ba9bfcb7d249ca1f1d55eeb Mon Sep 17 00:00:00 2001 From: Xia Weiwen Date: Tue, 22 Oct 2024 09:01:57 +0800 Subject: [PATCH] WOQ: set default act_quant_mode to PER_BATCH_IC_BLOCK_SYM (#3321) --- .../run_accuracy_with_deepspeed.py | 29 +++++++++++++++++-- .../distributed/run_generation_tp.py | 21 ++++++++++++-- .../run_generation_with_deepspeed.py | 21 ++++++++++++-- examples/cpu/llm/inference/run.py | 2 +- .../single_instance/run_quantization.py | 2 +- .../quantization/_qconfig.py | 6 ++-- tests/cpu/test_quantization_default_recipe.py | 2 +- 7 files changed, 71 insertions(+), 12 deletions(-) diff --git a/examples/cpu/llm/inference/distributed/run_accuracy_with_deepspeed.py b/examples/cpu/llm/inference/distributed/run_accuracy_with_deepspeed.py index 13f4c3fab..ffada4f53 100644 --- a/examples/cpu/llm/inference/distributed/run_accuracy_with_deepspeed.py +++ b/examples/cpu/llm/inference/distributed/run_accuracy_with_deepspeed.py @@ -145,8 +145,17 @@ def decorator(func): ) parser.add_argument( "--act-quant-mode", - choices=["PER_TENSOR", "PER_IC_BLOCK", "PER_BATCH", "PER_BATCH_IC_BLOCK"], - default="PER_IC_BLOCK", + choices=[ + "PER_TENSOR", + "PER_IC_BLOCK", + "PER_BATCH", + "PER_BATCH_IC_BLOCK", + "PER_TENSOR_SYM", + "PER_IC_BLOCK_SYM", + "PER_BATCH_SYM", + "PER_BATCH_IC_BLOCK_SYM", + ], + default="PER_BATCH_IC_BLOCK_SYM", type=str, help="Quantization mode for activation with different granularity. " "For lowp-mode=INT8 only. For other cases, it has no effect. " @@ -155,6 +164,10 @@ def decorator(func): "PER_IC_BLOCK(1): quantize per group along IC with group size = IC_BLOCK; " "PER_BATCH(2): quantize per batch; " "PER_BATCH_IC_BLOCK(3): quantize per block of size 1 x IC_BLOCK. " + "PER_TENSOR_SYM(4): symmetrically quantize per tensor; " + "PER_IC_BLOCK_SYM(5): symmetrically quantize per group along IC with group size = IC_BLOCK; " + "PER_BATCH_SYM(6): symmetrically quantize per batch; " + "PER_BATCH_IC_BLOCK_SYM(7): symmetrically quantize per block of size 1 x IC_BLOCK. " "IC_BLOCK is determined by IC automatically.", ) parser.add_argument( @@ -417,6 +430,10 @@ def write_checkpoints_json(): "PER_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK, "PER_BATCH": ipex.quantization.WoqActQuantMode.PER_BATCH, "PER_BATCH_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK, + "PER_TENSOR_SYM": ipex.quantization.WoqActQuantMode.PER_TENSOR_SYM, + "PER_IC_BLOCK_SYM": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK_SYM, + "PER_BATCH_SYM": ipex.quantization.WoqActQuantMode.PER_BATCH_SYM, + "PER_BATCH_IC_BLOCK_SYM": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK_SYM, } weight_qscheme = ( WoqWeightQScheme.SYMMETRIC @@ -1196,6 +1213,10 @@ def write_checkpoints_json(): "PER_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK, "PER_BATCH": ipex.quantization.WoqActQuantMode.PER_BATCH, "PER_BATCH_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK, + "PER_TENSOR_SYM": ipex.quantization.WoqActQuantMode.PER_TENSOR_SYM, + "PER_IC_BLOCK_SYM": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK_SYM, + "PER_BATCH_SYM": ipex.quantization.WoqActQuantMode.PER_BATCH_SYM, + "PER_BATCH_IC_BLOCK_SYM": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK_SYM, } weight_qscheme = ( WoqWeightQScheme.SYMMETRIC @@ -1849,6 +1870,10 @@ def write_checkpoints_json(): "PER_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK, "PER_BATCH": ipex.quantization.WoqActQuantMode.PER_BATCH, "PER_BATCH_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK, + "PER_TENSOR_SYM": ipex.quantization.WoqActQuantMode.PER_TENSOR_SYM, + "PER_IC_BLOCK_SYM": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK_SYM, + "PER_BATCH_SYM": ipex.quantization.WoqActQuantMode.PER_BATCH_SYM, + "PER_BATCH_IC_BLOCK_SYM": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK_SYM, } weight_qscheme = ( WoqWeightQScheme.SYMMETRIC diff --git a/examples/cpu/llm/inference/distributed/run_generation_tp.py b/examples/cpu/llm/inference/distributed/run_generation_tp.py index 7142d971a..ae444d0fb 100644 --- a/examples/cpu/llm/inference/distributed/run_generation_tp.py +++ b/examples/cpu/llm/inference/distributed/run_generation_tp.py @@ -146,8 +146,17 @@ ) parser.add_argument( "--act-quant-mode", - choices=["PER_TENSOR", "PER_IC_BLOCK", "PER_BATCH", "PER_BATCH_IC_BLOCK"], - default="PER_IC_BLOCK", + choices=[ + "PER_TENSOR", + "PER_IC_BLOCK", + "PER_BATCH", + "PER_BATCH_IC_BLOCK", + "PER_TENSOR_SYM", + "PER_IC_BLOCK_SYM", + "PER_BATCH_SYM", + "PER_BATCH_IC_BLOCK_SYM", + ], + default="PER_BATCH_IC_BLOCK_SYM", type=str, help="Quantization mode for activation with different granularity. " "For lowp-mode=INT8 only. For other cases, it has no effect. " @@ -156,6 +165,10 @@ "PER_IC_BLOCK(1): quantize per group along IC with group size = IC_BLOCK; " "PER_BATCH(2): quantize per batch; " "PER_BATCH_IC_BLOCK(3): quantize per block of size 1 x IC_BLOCK. " + "PER_TENSOR_SYM(4): symmetrically quantize per tensor; " + "PER_IC_BLOCK_SYM(5): symmetrically quantize per group along IC with group size = IC_BLOCK; " + "PER_BATCH_SYM(6): symmetrically quantize per batch; " + "PER_BATCH_IC_BLOCK_SYM(7): symmetrically quantize per block of size 1 x IC_BLOCK. " "IC_BLOCK is determined by IC automatically.", ) parser.add_argument( @@ -339,6 +352,10 @@ def trace_handler(prof): "PER_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK, "PER_BATCH": ipex.quantization.WoqActQuantMode.PER_BATCH, "PER_BATCH_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK, + "PER_TENSOR_SYM": ipex.quantization.WoqActQuantMode.PER_TENSOR_SYM, + "PER_IC_BLOCK_SYM": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK_SYM, + "PER_BATCH_SYM": ipex.quantization.WoqActQuantMode.PER_BATCH_SYM, + "PER_BATCH_IC_BLOCK_SYM": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK_SYM, } weight_qscheme = ( WoqWeightQScheme.SYMMETRIC diff --git a/examples/cpu/llm/inference/distributed/run_generation_with_deepspeed.py b/examples/cpu/llm/inference/distributed/run_generation_with_deepspeed.py index 378a9a169..e2d466e0c 100644 --- a/examples/cpu/llm/inference/distributed/run_generation_with_deepspeed.py +++ b/examples/cpu/llm/inference/distributed/run_generation_with_deepspeed.py @@ -156,8 +156,17 @@ ) parser.add_argument( "--act-quant-mode", - choices=["PER_TENSOR", "PER_IC_BLOCK", "PER_BATCH", "PER_BATCH_IC_BLOCK"], - default="PER_IC_BLOCK", + choices=[ + "PER_TENSOR", + "PER_IC_BLOCK", + "PER_BATCH", + "PER_BATCH_IC_BLOCK", + "PER_TENSOR_SYM", + "PER_IC_BLOCK_SYM", + "PER_BATCH_SYM", + "PER_BATCH_IC_BLOCK_SYM", + ], + default="PER_BATCH_IC_BLOCK_SYM", type=str, help="Quantization mode for activation with different granularity. " "For lowp-mode=INT8 only. For other cases, it has no effect. " @@ -166,6 +175,10 @@ "PER_IC_BLOCK(1): quantize per group along IC with group size = IC_BLOCK; " "PER_BATCH(2): quantize per batch; " "PER_BATCH_IC_BLOCK(3): quantize per block of size 1 x IC_BLOCK. " + "PER_TENSOR_SYM(4): symmetrically quantize per tensor; " + "PER_IC_BLOCK_SYM(5): symmetrically quantize per group along IC with group size = IC_BLOCK; " + "PER_BATCH_SYM(6): symmetrically quantize per batch; " + "PER_BATCH_IC_BLOCK_SYM(7): symmetrically quantize per block of size 1 x IC_BLOCK. " "IC_BLOCK is determined by IC automatically.", ) parser.add_argument( @@ -489,6 +502,10 @@ def write_checkpoints_json(): "PER_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK, "PER_BATCH": ipex.quantization.WoqActQuantMode.PER_BATCH, "PER_BATCH_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK, + "PER_TENSOR_SYM": ipex.quantization.WoqActQuantMode.PER_TENSOR_SYM, + "PER_IC_BLOCK_SYM": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK_SYM, + "PER_BATCH_SYM": ipex.quantization.WoqActQuantMode.PER_BATCH_SYM, + "PER_BATCH_IC_BLOCK_SYM": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK_SYM, } weight_qscheme = ( WoqWeightQScheme.SYMMETRIC diff --git a/examples/cpu/llm/inference/run.py b/examples/cpu/llm/inference/run.py index bae7d5b47..57b2db6ce 100644 --- a/examples/cpu/llm/inference/run.py +++ b/examples/cpu/llm/inference/run.py @@ -195,7 +195,7 @@ def main(args_in: Optional[List[str]] = None) -> None: "PER_BATCH_SYM", "PER_BATCH_IC_BLOCK_SYM", ], - default="PER_IC_BLOCK", + default="PER_BATCH_IC_BLOCK_SYM", type=str, help="Quantization mode for activation with different granularity. " "For lowp-mode=INT8 only. For other cases, it has no effect. " diff --git a/examples/cpu/llm/inference/single_instance/run_quantization.py b/examples/cpu/llm/inference/single_instance/run_quantization.py index 7fc4c3ca9..fdd9e2655 100644 --- a/examples/cpu/llm/inference/single_instance/run_quantization.py +++ b/examples/cpu/llm/inference/single_instance/run_quantization.py @@ -230,7 +230,7 @@ "PER_BATCH_SYM", "PER_BATCH_IC_BLOCK_SYM", ], - default="PER_IC_BLOCK", + default="PER_BATCH_IC_BLOCK_SYM", type=str, help="Quantization mode for activation with different granularity. " "For lowp-mode=INT8 only. For other cases, it has no effect. " diff --git a/intel_extension_for_pytorch/quantization/_qconfig.py b/intel_extension_for_pytorch/quantization/_qconfig.py index e49dbbaec..24dab7b87 100644 --- a/intel_extension_for_pytorch/quantization/_qconfig.py +++ b/intel_extension_for_pytorch/quantization/_qconfig.py @@ -188,7 +188,7 @@ def get_weight_only_quant_qconfig_mapping( *, weight_dtype: int = WoqWeightDtype.INT8, lowp_mode: int = WoqLowpMode.NONE, - act_quant_mode: int = WoqActQuantMode.PER_IC_BLOCK, + act_quant_mode: int = WoqActQuantMode.PER_BATCH_IC_BLOCK_SYM, group_size: int = -1, weight_qscheme: int = WoqWeightQScheme.UNDEFINED, ): @@ -222,8 +222,8 @@ def get_weight_only_quant_qconfig_mapping( No grouping along IC for weight. For activation, IC_BLOCK is determined automatically by IC. If group_size > 0: - act_quant_mode can be any. If act_quant_mode is PER_IC_BLOCK - or PER_BATCH_IC_BLOCK, weight is grouped along IC by group_size. + act_quant_mode can be any. If act_quant_mode is PER_IC_BLOCK(_SYM) + or PER_BATCH_IC_BLOCK(_SYM), weight is grouped along IC by group_size. The IC_BLOCK for activation is determined by group_size automatically. Each group has its own quantization parameters. weight_qscheme: Specify how to quantize weight, asymmetrically or symmetrically. Generally, diff --git a/tests/cpu/test_quantization_default_recipe.py b/tests/cpu/test_quantization_default_recipe.py index 1e0fb866f..6b968bd27 100644 --- a/tests/cpu/test_quantization_default_recipe.py +++ b/tests/cpu/test_quantization_default_recipe.py @@ -2155,7 +2155,7 @@ def test(feature, has_bias, w_dtype, lowp_mode, enable_amp): None, group_size, lowp_mode, - WoqActQuantMode.PER_IC_BLOCK, + WoqActQuantMode.PER_BATCH_IC_BLOCK_SYM, compensation, ) torch.testing.assert_close(output, output_ref)