From 4dbda878a25c971d7ba9bfcb7d249ca1f1d55eeb Mon Sep 17 00:00:00 2001
From: Xia Weiwen <weiwen.xia@intel.com>
Date: Tue, 22 Oct 2024 09:01:57 +0800
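Subject: [PATCH] WOQ: set default act_quant_mode to PER_BATCH_IC_BLOCK_SYM
 (#3321)

Change the default activation quantization mode for weight-only
quantization (WOQ) from PER_IC_BLOCK to PER_BATCH_IC_BLOCK_SYM, both in
get_weight_only_quant_qconfig_mapping() and in the act-quant-mode
option of the LLM inference example scripts.

Also add the symmetric modes (PER_TENSOR_SYM, PER_IC_BLOCK_SYM,
PER_BATCH_SYM, PER_BATCH_IC_BLOCK_SYM) to the act-quant-mode choices,
help text and mode lookup tables of the distributed example scripts,
and update the qconfig docstring and the default-recipe test to match.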
---
 .../run_accuracy_with_deepspeed.py            | 29 +++++++++++++++++--
 .../distributed/run_generation_tp.py          | 21 ++++++++++++--
 .../run_generation_with_deepspeed.py          | 21 ++++++++++++--
 examples/cpu/llm/inference/run.py             |  2 +-
 .../single_instance/run_quantization.py       |  2 +-
 .../quantization/_qconfig.py                  |  6 ++--
 tests/cpu/test_quantization_default_recipe.py |  2 +-
 7 files changed, 71 insertions(+), 12 deletions(-)

diff --git a/examples/cpu/llm/inference/distributed/run_accuracy_with_deepspeed.py b/examples/cpu/llm/inference/distributed/run_accuracy_with_deepspeed.py
index 13f4c3fab..ffada4f53 100644
--- a/examples/cpu/llm/inference/distributed/run_accuracy_with_deepspeed.py
+++ b/examples/cpu/llm/inference/distributed/run_accuracy_with_deepspeed.py
@@ -145,8 +145,17 @@ def decorator(func):
 )
 parser.add_argument(
     "--act-quant-mode",
-    choices=["PER_TENSOR", "PER_IC_BLOCK", "PER_BATCH", "PER_BATCH_IC_BLOCK"],
-    default="PER_IC_BLOCK",
+    choices=[
+        "PER_TENSOR",
+        "PER_IC_BLOCK",
+        "PER_BATCH",
+        "PER_BATCH_IC_BLOCK",
+        "PER_TENSOR_SYM",
+        "PER_IC_BLOCK_SYM",
+        "PER_BATCH_SYM",
+        "PER_BATCH_IC_BLOCK_SYM",
+    ],
+    default="PER_BATCH_IC_BLOCK_SYM",
     type=str,
     help="Quantization mode for activation with different granularity. "
     "For lowp-mode=INT8 only. For other cases, it has no effect. "
@@ -155,6 +164,10 @@ def decorator(func):
     "PER_IC_BLOCK(1): quantize per group along IC with group size = IC_BLOCK; "
     "PER_BATCH(2): quantize per batch; "
     "PER_BATCH_IC_BLOCK(3): quantize per block of size 1 x IC_BLOCK. "
+    "PER_TENSOR_SYM(4): symmetrically quantize per tensor; "
+    "PER_IC_BLOCK_SYM(5): symmetrically quantize per group along IC with group size = IC_BLOCK; "
+    "PER_BATCH_SYM(6): symmetrically quantize per batch; "
+    "PER_BATCH_IC_BLOCK_SYM(7): symmetrically quantize per block of size 1 x IC_BLOCK. "
     "IC_BLOCK is determined by IC automatically.",
 )
 parser.add_argument(
@@ -417,6 +430,10 @@ def write_checkpoints_json():
                     "PER_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK,
                     "PER_BATCH": ipex.quantization.WoqActQuantMode.PER_BATCH,
                     "PER_BATCH_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK,
+                    "PER_TENSOR_SYM": ipex.quantization.WoqActQuantMode.PER_TENSOR_SYM,
+                    "PER_IC_BLOCK_SYM": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK_SYM,
+                    "PER_BATCH_SYM": ipex.quantization.WoqActQuantMode.PER_BATCH_SYM,
+                    "PER_BATCH_IC_BLOCK_SYM": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK_SYM,
                 }
                 weight_qscheme = (
                     WoqWeightQScheme.SYMMETRIC
@@ -1196,6 +1213,10 @@ def write_checkpoints_json():
                     "PER_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK,
                     "PER_BATCH": ipex.quantization.WoqActQuantMode.PER_BATCH,
                     "PER_BATCH_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK,
+                    "PER_TENSOR_SYM": ipex.quantization.WoqActQuantMode.PER_TENSOR_SYM,
+                    "PER_IC_BLOCK_SYM": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK_SYM,
+                    "PER_BATCH_SYM": ipex.quantization.WoqActQuantMode.PER_BATCH_SYM,
+                    "PER_BATCH_IC_BLOCK_SYM": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK_SYM,
                 }
                 weight_qscheme = (
                     WoqWeightQScheme.SYMMETRIC
@@ -1849,6 +1870,10 @@ def write_checkpoints_json():
                     "PER_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK,
                     "PER_BATCH": ipex.quantization.WoqActQuantMode.PER_BATCH,
                     "PER_BATCH_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK,
+                    "PER_TENSOR_SYM": ipex.quantization.WoqActQuantMode.PER_TENSOR_SYM,
+                    "PER_IC_BLOCK_SYM": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK_SYM,
+                    "PER_BATCH_SYM": ipex.quantization.WoqActQuantMode.PER_BATCH_SYM,
+                    "PER_BATCH_IC_BLOCK_SYM": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK_SYM,
                 }
                 weight_qscheme = (
                     WoqWeightQScheme.SYMMETRIC
diff --git a/examples/cpu/llm/inference/distributed/run_generation_tp.py b/examples/cpu/llm/inference/distributed/run_generation_tp.py
index 7142d971a..ae444d0fb 100644
--- a/examples/cpu/llm/inference/distributed/run_generation_tp.py
+++ b/examples/cpu/llm/inference/distributed/run_generation_tp.py
@@ -146,8 +146,17 @@
 )
 parser.add_argument(
     "--act-quant-mode",
-    choices=["PER_TENSOR", "PER_IC_BLOCK", "PER_BATCH", "PER_BATCH_IC_BLOCK"],
-    default="PER_IC_BLOCK",
+    choices=[
+        "PER_TENSOR",
+        "PER_IC_BLOCK",
+        "PER_BATCH",
+        "PER_BATCH_IC_BLOCK",
+        "PER_TENSOR_SYM",
+        "PER_IC_BLOCK_SYM",
+        "PER_BATCH_SYM",
+        "PER_BATCH_IC_BLOCK_SYM",
+    ],
+    default="PER_BATCH_IC_BLOCK_SYM",
     type=str,
     help="Quantization mode for activation with different granularity. "
     "For lowp-mode=INT8 only. For other cases, it has no effect. "
@@ -156,6 +165,10 @@
     "PER_IC_BLOCK(1): quantize per group along IC with group size = IC_BLOCK; "
     "PER_BATCH(2): quantize per batch; "
     "PER_BATCH_IC_BLOCK(3): quantize per block of size 1 x IC_BLOCK. "
+    "PER_TENSOR_SYM(4): symmetrically quantize per tensor; "
+    "PER_IC_BLOCK_SYM(5): symmetrically quantize per group along IC with group size = IC_BLOCK; "
+    "PER_BATCH_SYM(6): symmetrically quantize per batch; "
+    "PER_BATCH_IC_BLOCK_SYM(7): symmetrically quantize per block of size 1 x IC_BLOCK. "
     "IC_BLOCK is determined by IC automatically.",
 )
 parser.add_argument(
@@ -339,6 +352,10 @@ def trace_handler(prof):
         "PER_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK,
         "PER_BATCH": ipex.quantization.WoqActQuantMode.PER_BATCH,
         "PER_BATCH_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK,
+        "PER_TENSOR_SYM": ipex.quantization.WoqActQuantMode.PER_TENSOR_SYM,
+        "PER_IC_BLOCK_SYM": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK_SYM,
+        "PER_BATCH_SYM": ipex.quantization.WoqActQuantMode.PER_BATCH_SYM,
+        "PER_BATCH_IC_BLOCK_SYM": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK_SYM,
     }
     weight_qscheme = (
         WoqWeightQScheme.SYMMETRIC
diff --git a/examples/cpu/llm/inference/distributed/run_generation_with_deepspeed.py b/examples/cpu/llm/inference/distributed/run_generation_with_deepspeed.py
index 378a9a169..e2d466e0c 100644
--- a/examples/cpu/llm/inference/distributed/run_generation_with_deepspeed.py
+++ b/examples/cpu/llm/inference/distributed/run_generation_with_deepspeed.py
@@ -156,8 +156,17 @@
 )
 parser.add_argument(
     "--act-quant-mode",
-    choices=["PER_TENSOR", "PER_IC_BLOCK", "PER_BATCH", "PER_BATCH_IC_BLOCK"],
-    default="PER_IC_BLOCK",
+    choices=[
+        "PER_TENSOR",
+        "PER_IC_BLOCK",
+        "PER_BATCH",
+        "PER_BATCH_IC_BLOCK",
+        "PER_TENSOR_SYM",
+        "PER_IC_BLOCK_SYM",
+        "PER_BATCH_SYM",
+        "PER_BATCH_IC_BLOCK_SYM",
+    ],
+    default="PER_BATCH_IC_BLOCK_SYM",
     type=str,
     help="Quantization mode for activation with different granularity. "
     "For lowp-mode=INT8 only. For other cases, it has no effect. "
@@ -166,6 +175,10 @@
     "PER_IC_BLOCK(1): quantize per group along IC with group size = IC_BLOCK; "
     "PER_BATCH(2): quantize per batch; "
     "PER_BATCH_IC_BLOCK(3): quantize per block of size 1 x IC_BLOCK. "
+    "PER_TENSOR_SYM(4): symmetrically quantize per tensor; "
+    "PER_IC_BLOCK_SYM(5): symmetrically quantize per group along IC with group size = IC_BLOCK; "
+    "PER_BATCH_SYM(6): symmetrically quantize per batch; "
+    "PER_BATCH_IC_BLOCK_SYM(7): symmetrically quantize per block of size 1 x IC_BLOCK. "
     "IC_BLOCK is determined by IC automatically.",
 )
 parser.add_argument(
@@ -489,6 +502,10 @@ def write_checkpoints_json():
             "PER_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK,
             "PER_BATCH": ipex.quantization.WoqActQuantMode.PER_BATCH,
             "PER_BATCH_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK,
+            "PER_TENSOR_SYM": ipex.quantization.WoqActQuantMode.PER_TENSOR_SYM,
+            "PER_IC_BLOCK_SYM": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK_SYM,
+            "PER_BATCH_SYM": ipex.quantization.WoqActQuantMode.PER_BATCH_SYM,
+            "PER_BATCH_IC_BLOCK_SYM": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK_SYM,
         }
         weight_qscheme = (
             WoqWeightQScheme.SYMMETRIC
diff --git a/examples/cpu/llm/inference/run.py b/examples/cpu/llm/inference/run.py
index bae7d5b47..57b2db6ce 100644
--- a/examples/cpu/llm/inference/run.py
+++ b/examples/cpu/llm/inference/run.py
@@ -195,7 +195,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
             "PER_BATCH_SYM",
             "PER_BATCH_IC_BLOCK_SYM",
         ],
-        default="PER_IC_BLOCK",
+        default="PER_BATCH_IC_BLOCK_SYM",
         type=str,
         help="Quantization mode for activation with different granularity. "
         "For lowp-mode=INT8 only. For other cases, it has no effect. "
diff --git a/examples/cpu/llm/inference/single_instance/run_quantization.py b/examples/cpu/llm/inference/single_instance/run_quantization.py
index 7fc4c3ca9..fdd9e2655 100644
--- a/examples/cpu/llm/inference/single_instance/run_quantization.py
+++ b/examples/cpu/llm/inference/single_instance/run_quantization.py
@@ -230,7 +230,7 @@
         "PER_BATCH_SYM",
         "PER_BATCH_IC_BLOCK_SYM",
     ],
-    default="PER_IC_BLOCK",
+    default="PER_BATCH_IC_BLOCK_SYM",
     type=str,
     help="Quantization mode for activation with different granularity. "
     "For lowp-mode=INT8 only. For other cases, it has no effect. "
diff --git a/intel_extension_for_pytorch/quantization/_qconfig.py b/intel_extension_for_pytorch/quantization/_qconfig.py
index e49dbbaec..24dab7b87 100644
--- a/intel_extension_for_pytorch/quantization/_qconfig.py
+++ b/intel_extension_for_pytorch/quantization/_qconfig.py
@@ -188,7 +188,7 @@ def get_weight_only_quant_qconfig_mapping(
     *,
     weight_dtype: int = WoqWeightDtype.INT8,
     lowp_mode: int = WoqLowpMode.NONE,
-    act_quant_mode: int = WoqActQuantMode.PER_IC_BLOCK,
+    act_quant_mode: int = WoqActQuantMode.PER_BATCH_IC_BLOCK_SYM,
     group_size: int = -1,
     weight_qscheme: int = WoqWeightQScheme.UNDEFINED,
 ):
@@ -222,8 +222,8 @@ def get_weight_only_quant_qconfig_mapping(
                                 No grouping along IC for weight. For activation,
                                 IC_BLOCK is determined automatically by IC.
                         If group_size > 0:
-                            act_quant_mode can be any. If act_quant_mode is PER_IC_BLOCK
-                            or PER_BATCH_IC_BLOCK, weight is grouped along IC by group_size.
+                            act_quant_mode can be any. If act_quant_mode is PER_IC_BLOCK(_SYM)
+                            or PER_BATCH_IC_BLOCK(_SYM), weight is grouped along IC by group_size.
                             The IC_BLOCK for activation is determined by group_size automatically.
                             Each group has its own quantization parameters.
         weight_qscheme: Specify how to quantize weight, asymmetrically or symmetrically. Generally,
diff --git a/tests/cpu/test_quantization_default_recipe.py b/tests/cpu/test_quantization_default_recipe.py
index 1e0fb866f..6b968bd27 100644
--- a/tests/cpu/test_quantization_default_recipe.py
+++ b/tests/cpu/test_quantization_default_recipe.py
@@ -2155,7 +2155,7 @@ def test(feature, has_bias, w_dtype, lowp_mode, enable_amp):
                     None,
                     group_size,
                     lowp_mode,
-                    WoqActQuantMode.PER_IC_BLOCK,
+                    WoqActQuantMode.PER_BATCH_IC_BLOCK_SYM,
                     compensation,
                 )
                 torch.testing.assert_close(output, output_ref)