From e28fc3e623b2d77ea250df3f9342696812118a09 Mon Sep 17 00:00:00 2001
From: Kaihui-intel <kaihui.tang@intel.com>
Date: Tue, 16 Jul 2024 15:00:50 +0800
Subject: [PATCH 01/17] add save/load of quantized model to pt2e static_quant examples

Signed-off-by: Kaihui-intel <kaihui.tang@intel.com>
---
 .../3.x_api/pytorch/cv/static_quant/main.py   |  8 ++-
 .../pytorch/cv/static_quant/run_quant.sh      |  3 +-
 .../static_quant/pt2e/run_clm_no_trainer.py   |  8 ++-
 .../static_quant/pt2e/run_quant.sh            |  3 +-
 .../quantization/weight_only/run_benchmark.sh | 59 +++++--------------
 5 files changed, 34 insertions(+), 47 deletions(-)

diff --git a/examples/3.x_api/pytorch/cv/static_quant/main.py b/examples/3.x_api/pytorch/cv/static_quant/main.py
index 3ab2d6bd6ad..99cc6467091 100644
--- a/examples/3.x_api/pytorch/cv/static_quant/main.py
+++ b/examples/3.x_api/pytorch/cv/static_quant/main.py
@@ -81,6 +81,8 @@
                     help='quantize model')
 parser.add_argument("--calib_iters", default=2, type=int,
                     help="For calibration only.")
+parser.add_argument('-o', '--output_dir', default='', type=str, metavar='PATH',
+                    help='path to quantized result.')
 
 best_acc1 = 0
 
@@ -297,9 +299,13 @@ def main_worker(gpu, ngpus_per_node, args):
         config.freezing = True
         opt_model = torch.compile(q_model)
         model = opt_model
-
+        if args.output_dir:
+            model.save(example_inputs=example_inputs, output_dir = args.output_dir)
     
     if args.evaluate:
+        if args.output_dir:
+            from neural_compressor.torch.quantization import load
+            model = load(args.output_dir)
         validate(val_loader, model, criterion, args)
         return
 
diff --git a/examples/3.x_api/pytorch/cv/static_quant/run_quant.sh b/examples/3.x_api/pytorch/cv/static_quant/run_quant.sh
index ac4a5a2b668..2bd1f288ec7 100644
--- a/examples/3.x_api/pytorch/cv/static_quant/run_quant.sh
+++ b/examples/3.x_api/pytorch/cv/static_quant/run_quant.sh
@@ -38,8 +38,9 @@ function init_params {
 function run_tuning {
     if [ "${topology}" = "resnet18_pt2e_static" ]; then
         model_name_or_path="resnet18"
+        output_dir = "saved_results"
     fi
-    python main.py -a ${model_name_or_path} ${dataset_location} -q -e
+    python main.py -a ${model_name_or_path} ${dataset_location} -q -o ${output_dir}
 }
 
 main "$@"
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py
index 98d3f11a1dd..8c7cd66c5fe 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py
@@ -14,7 +14,7 @@
     "--revision", default=None,
     help="Transformers parameter: set the model hub commit number")
 parser.add_argument("--dataset", nargs="?", default="NeelNanda/pile-10k", const="NeelNanda/pile-10k")
-parser.add_argument("--output_dir", nargs="?", default="./saved_results")
+parser.add_argument("--output_dir", nargs="?", default="")
 parser.add_argument("--quantize", action="store_true")
 parser.add_argument("--approach", type=str, default='static',
                     help="Select from ['dynamic', 'static', 'weight-only']")
@@ -98,9 +98,15 @@ def get_example_inputs(tokenizer):
 
     opt_model.config = user_model.config # for lm eval
     user_model = opt_model
+    if args.output_dir:
+        user_model.save(example_inputs=example_inputs, output_dir = args.output_dir)
 
 
 if args.accuracy:
+    if args.output_dir:
+        from neural_compressor.torch.quantization import load
+        model = load(args.output_dir)
+        model.config = user_model.config
     from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
     eval_args = LMEvalParser(
         model="hf",
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_quant.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_quant.sh
index 6bd599483ff..8f7f499b13a 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_quant.sh
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_quant.sh
@@ -39,8 +39,9 @@ function run_tuning {
 
     if [ "${topology}" = "opt_125m_pt2e_static" ]; then
         model_name_or_path="facebook/opt-125m"
+        output_dir = "saved_results"
     fi
-    python run_clm_no_trainer.py --model ${model_name_or_path} --quantize --accuracy --tasks "lambada_openai"
+    python run_clm_no_trainer.py --model ${model_name_or_path} --quantize --output_dir ${output_dir} --tasks "lambada_openai"
 }
 
 main "$@"
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh
index 9e1d766128e..61c7a460094 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh
@@ -4,17 +4,12 @@ set -x
 function main {
 
   init_params "$@"
-  run_benchmark
+  run_tuning
 
 }
 
 # init params
 function init_params {
-  iters=100
-  batch_size=16
-  tuned_checkpoint=saved_results
-  task=lambada_openai
-  echo ${max_eval_samples}
   for var in "$@"
   do
     case $var in
@@ -27,21 +22,9 @@ function init_params {
       --input_model=*)
           input_model=$(echo $var |cut -f2 -d=)
       ;;
-      --mode=*)
-          mode=$(echo $var |cut -f2 -d=)
-      ;;
-      --batch_size=*)
-          batch_size=$(echo $var |cut -f2 -d=)
-      ;;
-      --iters=*)
-          iters=$(echo ${var} |cut -f2 -d=)
-      ;;
-      --int8=*)
-          int8=$(echo ${var} |cut -f2 -d=)
-      ;;
-      --config=*)
-          tuned_checkpoint=$(echo $var |cut -f2 -d=)
-      ;;
+       --output_model=*)
+           tuned_checkpoint=$(echo $var |cut -f2 -d=)
+       ;;
       *)
           echo "Error: No such parameter: ${var}"
           exit 1
@@ -51,26 +34,14 @@ function init_params {
 
 }
 
-
-# run_benchmark
-function run_benchmark {
+# run_tuning
+function run_tuning {
     extra_cmd=''
+    batch_size=8
+    DATASET_NAME="NeelNanda/pile-10k"
+    tuned_checkpoint="saved_results"
 
-    if [[ ${mode} == "accuracy" ]]; then
-        mode_cmd=" --accuracy "
-    elif [[ ${mode} == "performance" ]]; then
-        mode_cmd=" --performance --iters "${iters}
-    else
-        echo "Error: No such mode: ${mode}"
-        exit 1
-    fi
-
-    if [[ ${int8} == "true" ]]; then
-        extra_cmd=$extra_cmd" --int8"
-    fi
-    echo $extra_cmd
-
-        if [ "${topology}" = "opt_125m_woq_gptq_int4" ]; then
+    if [ "${topology}" = "opt_125m_woq_gptq_int4" ]; then
         model_name_or_path="facebook/opt-125m"
         extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
     elif [ "${topology}" = "opt_125m_woq_gptq_int4_dq_bnb" ]; then
@@ -96,11 +67,11 @@ function run_benchmark {
         model_name_or_path="EleutherAI/gpt-j-6b"
         extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
     elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_bnb" ]; then
-        model_name_or_path="EleutherAI/gpt-j-6b"\
+        model_name_or_path="EleutherAI/gpt-j-6b"
         extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
         extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
     elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_ggml" ]; then
-        model_name_or_path="EleutherAI/gpt-j-6b"\
+        model_name_or_path="EleutherAI/gpt-j-6b"
         extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
         extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
     elif [ "${topology}" = "gpt_j_woq_gptq_int4" ]; then
@@ -118,10 +89,12 @@ function run_benchmark {
 
     python -u run_clm_no_trainer.py \
         --model ${model_name_or_path} \
+        --dataset ${DATASET_NAME} \
+        --accuracy \
         --output_dir ${tuned_checkpoint} \
-        --task ${task} \
+        --tasks "lambada_openai" \
         --batch_size ${batch_size} \
-        ${extra_cmd} ${mode_cmd}
+        ${extra_cmd}
 }
 
 main "$@"

From 9a72393cde9dc74620d83fec9c241ed8ee95f7b6 Mon Sep 17 00:00:00 2001
From: Kaihui-intel <kaihui.tang@intel.com>
Date: Wed, 17 Jul 2024 09:37:11 +0800
Subject: [PATCH 02/17] add pt2e run_benchmark.sh and restore weight_only run_benchmark.sh

Signed-off-by: Kaihui-intel <kaihui.tang@intel.com>
---
 .../static_quant/pt2e/run_benchmark.sh        | 127 ++++++++++++++++++
 .../quantization/weight_only/run_benchmark.sh |  59 +++++---
 2 files changed, 170 insertions(+), 16 deletions(-)
 create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_benchmark.sh

diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_benchmark.sh
new file mode 100644
index 00000000000..9e1d766128e
--- /dev/null
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_benchmark.sh
@@ -0,0 +1,127 @@
+#!/bin/bash
+set -x
+
+function main {
+
+  init_params "$@"
+  run_benchmark
+
+}
+
+# init params
+function init_params {
+  iters=100
+  batch_size=16
+  tuned_checkpoint=saved_results
+  task=lambada_openai
+  echo ${max_eval_samples}
+  for var in "$@"
+  do
+    case $var in
+      --topology=*)
+          topology=$(echo $var |cut -f2 -d=)
+      ;;
+      --dataset_location=*)
+          dataset_location=$(echo $var |cut -f2 -d=)
+      ;;
+      --input_model=*)
+          input_model=$(echo $var |cut -f2 -d=)
+      ;;
+      --mode=*)
+          mode=$(echo $var |cut -f2 -d=)
+      ;;
+      --batch_size=*)
+          batch_size=$(echo $var |cut -f2 -d=)
+      ;;
+      --iters=*)
+          iters=$(echo ${var} |cut -f2 -d=)
+      ;;
+      --int8=*)
+          int8=$(echo ${var} |cut -f2 -d=)
+      ;;
+      --config=*)
+          tuned_checkpoint=$(echo $var |cut -f2 -d=)
+      ;;
+      *)
+          echo "Error: No such parameter: ${var}"
+          exit 1
+      ;;
+    esac
+  done
+
+}
+
+
+# run_benchmark
+function run_benchmark {
+    extra_cmd=''
+
+    if [[ ${mode} == "accuracy" ]]; then
+        mode_cmd=" --accuracy "
+    elif [[ ${mode} == "performance" ]]; then
+        mode_cmd=" --performance --iters "${iters}
+    else
+        echo "Error: No such mode: ${mode}"
+        exit 1
+    fi
+
+    if [[ ${int8} == "true" ]]; then
+        extra_cmd=$extra_cmd" --int8"
+    fi
+    echo $extra_cmd
+
+        if [ "${topology}" = "opt_125m_woq_gptq_int4" ]; then
+        model_name_or_path="facebook/opt-125m"
+        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
+    elif [ "${topology}" = "opt_125m_woq_gptq_int4_dq_bnb" ]; then
+        model_name_or_path="facebook/opt-125m"
+        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
+        extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
+    elif [ "${topology}" = "opt_125m_woq_gptq_int4_dq_ggml" ]; then
+        model_name_or_path="facebook/opt-125m"
+        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length --gptq_percdamp 0.1 --gptq_actorder"
+        extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
+    elif [ "${topology}" = "llama2_7b_gptq_int4" ]; then
+        model_name_or_path="meta-llama/Llama-2-7b-hf"
+        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
+    elif [ "${topology}" = "llama2_7b_gptq_int4_dq_bnb" ]; then
+        model_name_or_path="meta-llama/Llama-2-7b-hf"
+        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
+        extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
+    elif [ "${topology}" = "llama2_7b_gptq_int4_dq_ggml" ]; then
+        model_name_or_path="meta-llama/Llama-2-7b-hf"
+        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
+        extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
+    elif [ "${topology}" = "gpt_j_woq_rtn_int4" ]; then
+        model_name_or_path="EleutherAI/gpt-j-6b"
+        extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
+    elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_bnb" ]; then
+        model_name_or_path="EleutherAI/gpt-j-6b"\
+        extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
+        extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
+    elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_ggml" ]; then
+        model_name_or_path="EleutherAI/gpt-j-6b"\
+        extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
+        extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
+    elif [ "${topology}" = "gpt_j_woq_gptq_int4" ]; then
+        model_name_or_path="EleutherAI/gpt-j-6b"
+        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
+    elif [ "${topology}" = "gpt_j_woq_gptq_int4_dq_bnb" ]; then
+        model_name_or_path="EleutherAI/gpt-j-6b"
+        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
+        extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
+    elif [ "${topology}" = "gpt_j_woq_gptq_int4_dq_ggml" ]; then
+        model_name_or_path="EleutherAI/gpt-j-6b"
+        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
+        extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
+    fi
+
+    python -u run_clm_no_trainer.py \
+        --model ${model_name_or_path} \
+        --output_dir ${tuned_checkpoint} \
+        --task ${task} \
+        --batch_size ${batch_size} \
+        ${extra_cmd} ${mode_cmd}
+}
+
+main "$@"
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh
index 61c7a460094..9e1d766128e 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh
@@ -4,12 +4,17 @@ set -x
 function main {
 
   init_params "$@"
-  run_tuning
+  run_benchmark
 
 }
 
 # init params
 function init_params {
+  iters=100
+  batch_size=16
+  tuned_checkpoint=saved_results
+  task=lambada_openai
+  echo ${max_eval_samples}
   for var in "$@"
   do
     case $var in
@@ -22,9 +27,21 @@ function init_params {
       --input_model=*)
           input_model=$(echo $var |cut -f2 -d=)
       ;;
-       --output_model=*)
-           tuned_checkpoint=$(echo $var |cut -f2 -d=)
-       ;;
+      --mode=*)
+          mode=$(echo $var |cut -f2 -d=)
+      ;;
+      --batch_size=*)
+          batch_size=$(echo $var |cut -f2 -d=)
+      ;;
+      --iters=*)
+          iters=$(echo ${var} |cut -f2 -d=)
+      ;;
+      --int8=*)
+          int8=$(echo ${var} |cut -f2 -d=)
+      ;;
+      --config=*)
+          tuned_checkpoint=$(echo $var |cut -f2 -d=)
+      ;;
       *)
           echo "Error: No such parameter: ${var}"
           exit 1
@@ -34,14 +51,26 @@ function init_params {
 
 }
 
-# run_tuning
-function run_tuning {
+
+# run_benchmark
+function run_benchmark {
     extra_cmd=''
-    batch_size=8
-    DATASET_NAME="NeelNanda/pile-10k"
-    tuned_checkpoint="saved_results"
 
-    if [ "${topology}" = "opt_125m_woq_gptq_int4" ]; then
+    if [[ ${mode} == "accuracy" ]]; then
+        mode_cmd=" --accuracy "
+    elif [[ ${mode} == "performance" ]]; then
+        mode_cmd=" --performance --iters "${iters}
+    else
+        echo "Error: No such mode: ${mode}"
+        exit 1
+    fi
+
+    if [[ ${int8} == "true" ]]; then
+        extra_cmd=$extra_cmd" --int8"
+    fi
+    echo $extra_cmd
+
+        if [ "${topology}" = "opt_125m_woq_gptq_int4" ]; then
         model_name_or_path="facebook/opt-125m"
         extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
     elif [ "${topology}" = "opt_125m_woq_gptq_int4_dq_bnb" ]; then
@@ -67,11 +96,11 @@ function run_tuning {
         model_name_or_path="EleutherAI/gpt-j-6b"
         extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
     elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_bnb" ]; then
-        model_name_or_path="EleutherAI/gpt-j-6b"
+        model_name_or_path="EleutherAI/gpt-j-6b"\
         extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
         extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
     elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_ggml" ]; then
-        model_name_or_path="EleutherAI/gpt-j-6b"
+        model_name_or_path="EleutherAI/gpt-j-6b"\
         extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
         extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
     elif [ "${topology}" = "gpt_j_woq_gptq_int4" ]; then
@@ -89,12 +118,10 @@ function run_tuning {
 
     python -u run_clm_no_trainer.py \
         --model ${model_name_or_path} \
-        --dataset ${DATASET_NAME} \
-        --accuracy \
         --output_dir ${tuned_checkpoint} \
-        --tasks "lambada_openai" \
+        --task ${task} \
         --batch_size ${batch_size} \
-        ${extra_cmd}
+        ${extra_cmd} ${mode_cmd}
 }
 
 main "$@"

From c56a3b9b371a0fa6fd7b06eb06ae00dcb4b78e96 Mon Sep 17 00:00:00 2001
From: Kaihui-intel <kaihui.tang@intel.com>
Date: Wed, 17 Jul 2024 09:39:26 +0800
Subject: [PATCH 03/17] adapt pt2e run_benchmark.sh to the opt_125m_pt2e_static topology

Signed-off-by: Kaihui-intel <kaihui.tang@intel.com>
---
 .../static_quant/pt2e/run_benchmark.sh        | 96 ++-----------------
 1 file changed, 8 insertions(+), 88 deletions(-)

diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_benchmark.sh
index 9e1d766128e..e4e1181e792 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_benchmark.sh
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_benchmark.sh
@@ -4,17 +4,12 @@ set -x
 function main {
 
   init_params "$@"
-  run_benchmark
+  run_tuning
 
 }
 
 # init params
 function init_params {
-  iters=100
-  batch_size=16
-  tuned_checkpoint=saved_results
-  task=lambada_openai
-  echo ${max_eval_samples}
   for var in "$@"
   do
     case $var in
@@ -27,21 +22,9 @@ function init_params {
       --input_model=*)
           input_model=$(echo $var |cut -f2 -d=)
       ;;
-      --mode=*)
-          mode=$(echo $var |cut -f2 -d=)
-      ;;
-      --batch_size=*)
-          batch_size=$(echo $var |cut -f2 -d=)
-      ;;
-      --iters=*)
-          iters=$(echo ${var} |cut -f2 -d=)
-      ;;
-      --int8=*)
-          int8=$(echo ${var} |cut -f2 -d=)
-      ;;
-      --config=*)
-          tuned_checkpoint=$(echo $var |cut -f2 -d=)
-      ;;
+       --output_model=*)
+           tuned_checkpoint=$(echo $var |cut -f2 -d=)
+       ;;
       *)
           echo "Error: No such parameter: ${var}"
           exit 1
@@ -51,77 +34,14 @@ function init_params {
 
 }
 
-
-# run_benchmark
+# run_tuning
 function run_benchmark {
-    extra_cmd=''
 
-    if [[ ${mode} == "accuracy" ]]; then
-        mode_cmd=" --accuracy "
-    elif [[ ${mode} == "performance" ]]; then
-        mode_cmd=" --performance --iters "${iters}
-    else
-        echo "Error: No such mode: ${mode}"
-        exit 1
-    fi
-
-    if [[ ${int8} == "true" ]]; then
-        extra_cmd=$extra_cmd" --int8"
-    fi
-    echo $extra_cmd
-
-        if [ "${topology}" = "opt_125m_woq_gptq_int4" ]; then
+    if [ "${topology}" = "opt_125m_pt2e_static" ]; then
         model_name_or_path="facebook/opt-125m"
-        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
-    elif [ "${topology}" = "opt_125m_woq_gptq_int4_dq_bnb" ]; then
-        model_name_or_path="facebook/opt-125m"
-        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
-        extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
-    elif [ "${topology}" = "opt_125m_woq_gptq_int4_dq_ggml" ]; then
-        model_name_or_path="facebook/opt-125m"
-        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length --gptq_percdamp 0.1 --gptq_actorder"
-        extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
-    elif [ "${topology}" = "llama2_7b_gptq_int4" ]; then
-        model_name_or_path="meta-llama/Llama-2-7b-hf"
-        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
-    elif [ "${topology}" = "llama2_7b_gptq_int4_dq_bnb" ]; then
-        model_name_or_path="meta-llama/Llama-2-7b-hf"
-        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
-        extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
-    elif [ "${topology}" = "llama2_7b_gptq_int4_dq_ggml" ]; then
-        model_name_or_path="meta-llama/Llama-2-7b-hf"
-        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
-        extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
-    elif [ "${topology}" = "gpt_j_woq_rtn_int4" ]; then
-        model_name_or_path="EleutherAI/gpt-j-6b"
-        extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
-    elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_bnb" ]; then
-        model_name_or_path="EleutherAI/gpt-j-6b"\
-        extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
-        extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
-    elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_ggml" ]; then
-        model_name_or_path="EleutherAI/gpt-j-6b"\
-        extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
-        extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
-    elif [ "${topology}" = "gpt_j_woq_gptq_int4" ]; then
-        model_name_or_path="EleutherAI/gpt-j-6b"
-        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
-    elif [ "${topology}" = "gpt_j_woq_gptq_int4_dq_bnb" ]; then
-        model_name_or_path="EleutherAI/gpt-j-6b"
-        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
-        extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
-    elif [ "${topology}" = "gpt_j_woq_gptq_int4_dq_ggml" ]; then
-        model_name_or_path="EleutherAI/gpt-j-6b"
-        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
-        extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
+        output_dir = "saved_results"
     fi
-
-    python -u run_clm_no_trainer.py \
-        --model ${model_name_or_path} \
-        --output_dir ${tuned_checkpoint} \
-        --task ${task} \
-        --batch_size ${batch_size} \
-        ${extra_cmd} ${mode_cmd}
+    python run_clm_no_trainer.py --model ${model_name_or_path} --accuracy --output_dir ${output_dir} --tasks "lambada_openai"
 }
 
 main "$@"

From 08700f63d3ac1ec4e7366e064dbee84a8f0cc328 Mon Sep 17 00:00:00 2001
From: Kaihui-intel <kaihui.tang@intel.com>
Date: Wed, 17 Jul 2024 09:43:03 +0800
Subject: [PATCH 04/17] call run_benchmark instead of run_tuning in pt2e run_benchmark.sh

Signed-off-by: Kaihui-intel <kaihui.tang@intel.com>
---
 .../quantization/static_quant/pt2e/run_benchmark.sh             | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_benchmark.sh
index e4e1181e792..60749f4805c 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_benchmark.sh
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_benchmark.sh
@@ -4,7 +4,7 @@ set -x
 function main {
 
   init_params "$@"
-  run_tuning
+  run_benchmark
 
 }
 

From 9f693c0c669a92d6f5fcf0de9f287f97e2eff931 Mon Sep 17 00:00:00 2001
From: Kaihui-intel <kaihui.tang@intel.com>
Date: Wed, 17 Jul 2024 15:56:50 +0800
Subject: [PATCH 05/17] fix output_dir assignment syntax (no spaces around '=') in shell scripts

Signed-off-by: Kaihui-intel <kaihui.tang@intel.com>
---
 examples/3.x_api/pytorch/cv/static_quant/run_quant.sh           | 2 +-
 .../quantization/static_quant/pt2e/run_benchmark.sh             | 2 +-
 .../quantization/static_quant/pt2e/run_quant.sh                 | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/3.x_api/pytorch/cv/static_quant/run_quant.sh b/examples/3.x_api/pytorch/cv/static_quant/run_quant.sh
index 2bd1f288ec7..940e70175c6 100644
--- a/examples/3.x_api/pytorch/cv/static_quant/run_quant.sh
+++ b/examples/3.x_api/pytorch/cv/static_quant/run_quant.sh
@@ -38,7 +38,7 @@ function init_params {
 function run_tuning {
     if [ "${topology}" = "resnet18_pt2e_static" ]; then
         model_name_or_path="resnet18"
-        output_dir = "saved_results"
+        output_dir="saved_results"
     fi
     python main.py -a ${model_name_or_path} ${dataset_location} -q -o ${output_dir}
 }
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_benchmark.sh
index 60749f4805c..17b49c65482 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_benchmark.sh
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_benchmark.sh
@@ -39,7 +39,7 @@ function run_benchmark {
 
     if [ "${topology}" = "opt_125m_pt2e_static" ]; then
         model_name_or_path="facebook/opt-125m"
-        output_dir = "saved_results"
+        output_dir="saved_results"
     fi
     python run_clm_no_trainer.py --model ${model_name_or_path} --accuracy --output_dir ${output_dir} --tasks "lambada_openai"
 }
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_quant.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_quant.sh
index 8f7f499b13a..9e995ec8869 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_quant.sh
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_quant.sh
@@ -39,7 +39,7 @@ function run_tuning {
 
     if [ "${topology}" = "opt_125m_pt2e_static" ]; then
         model_name_or_path="facebook/opt-125m"
-        output_dir = "saved_results"
+        output_dir="saved_results"
     fi
     python run_clm_no_trainer.py --model ${model_name_or_path} --quantize --output_dir ${output_dir} --tasks "lambada_openai"
 }

From 5706780c1b09b138e0d79d011edf48a743bcac7e Mon Sep 17 00:00:00 2001
From: Kaihui-intel <kaihui.tang@intel.com>
Date: Fri, 19 Jul 2024 10:54:53 +0800
Subject: [PATCH 06/17] add cv run_benchmark.sh and accuracy/performance modes to pt2e run_benchmark.sh

Signed-off-by: Kaihui-intel <kaihui.tang@intel.com>
---
 .../pytorch/cv/static_quant/run_benchmark.sh  | 86 +++++++++++++++++++
 .../static_quant/pt2e/run_benchmark.sh        | 59 +++++++++++--
 2 files changed, 139 insertions(+), 6 deletions(-)
 create mode 100644 examples/3.x_api/pytorch/cv/static_quant/run_benchmark.sh

diff --git a/examples/3.x_api/pytorch/cv/static_quant/run_benchmark.sh b/examples/3.x_api/pytorch/cv/static_quant/run_benchmark.sh
new file mode 100644
index 00000000000..5e1ed19fc6e
--- /dev/null
+++ b/examples/3.x_api/pytorch/cv/static_quant/run_benchmark.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+set -x
+
+function main {
+
+  init_params "$@"
+  run_benchmark
+
+}
+
+# init params
+function init_params {
+  iters=100
+  batch_size=16
+  tuned_checkpoint=saved_results
+  echo ${max_eval_samples}
+  for var in "$@"
+  do
+    case $var in
+      --topology=*)
+          topology=$(echo $var |cut -f2 -d=)
+      ;;
+      --dataset_location=*)
+          dataset_location=$(echo $var |cut -f2 -d=)
+      ;;
+      --input_model=*)
+          input_model=$(echo $var |cut -f2 -d=)
+      ;;
+      --mode=*)
+          mode=$(echo $var |cut -f2 -d=)
+      ;;
+      --batch_size=*)
+          batch_size=$(echo $var |cut -f2 -d=)
+      ;;
+      --iters=*)
+          iters=$(echo ${var} |cut -f2 -d=)
+      ;;
+      --int8=*)
+          int8=$(echo ${var} |cut -f2 -d=)
+      ;;
+      --config=*)
+          tuned_checkpoint=$(echo $var |cut -f2 -d=)
+      ;;
+      *)
+          echo "Error: No such parameter: ${var}"
+          exit 1
+      ;;
+    esac
+  done
+
+}
+
+
+# run_benchmark
+function run_benchmark {
+    extra_cmd=''
+
+    # if [[ ${mode} == "accuracy" ]]; then
+    #     mode_cmd=" --accuracy "
+    # elif [[ ${mode} == "performance" ]]; then
+    #     mode_cmd=" --performance --iters "${iters}
+    # else
+    #     echo "Error: No such mode: ${mode}"
+    #     exit 1
+    # fi
+
+    echo $extra_cmd
+
+    if [ "${topology}" = "resnet18_pt2e_static" ]; then
+        model_name_or_path="resnet18"
+    fi
+    python main.py -a ${model_name_or_path} ${dataset_location} -q -o ${tuned_checkpoint}
+
+
+    if [[ ${mode} == "accuracy" ]]; then
+        python main.py -a ${model_name_or_path} ${dataset_location} -e -o ${tuned_checkpoint} ${extra_cmd}
+    elif [[ ${mode} == "performance" ]]; then
+        incbench --num_cores_per_instance 4 main.py -a ${model_name_or_path} 
+          ${dataset_location} -e -o ${tuned_checkpoint} ${extra_cmd}
+    else
+        echo "Error: No such mode: ${mode}"
+        exit 1
+    fi
+}
+
+main "$@"
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_benchmark.sh
index 17b49c65482..93272f4b4f5 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_benchmark.sh
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_benchmark.sh
@@ -10,6 +10,11 @@ function main {
 
 # init params
 function init_params {
+  iters=100
+  batch_size=16
+  tuned_checkpoint=saved_results
+  task=lambada_openai
+  echo ${max_eval_samples}
   for var in "$@"
   do
     case $var in
@@ -22,9 +27,21 @@ function init_params {
       --input_model=*)
           input_model=$(echo $var |cut -f2 -d=)
       ;;
-       --output_model=*)
-           tuned_checkpoint=$(echo $var |cut -f2 -d=)
-       ;;
+      --mode=*)
+          mode=$(echo $var |cut -f2 -d=)
+      ;;
+      --batch_size=*)
+          batch_size=$(echo $var |cut -f2 -d=)
+      ;;
+      --iters=*)
+          iters=$(echo ${var} |cut -f2 -d=)
+      ;;
+      --int8=*)
+          int8=$(echo ${var} |cut -f2 -d=)
+      ;;
+      --config=*)
+          tuned_checkpoint=$(echo $var |cut -f2 -d=)
+      ;;
       *)
           echo "Error: No such parameter: ${var}"
           exit 1
@@ -34,14 +51,44 @@ function init_params {
 
 }
 
-# run_tuning
+
+# run_benchmark
 function run_benchmark {
+    extra_cmd=''
 
+    if [[ ${mode} == "accuracy" ]]; then
+        mode_cmd=" --accuracy "
+        extra_cmd=$extra_cmd
+    elif [[ ${mode} == "performance" ]]; then
+        mode_cmd=" --performance --iters "${iters}
+        extra_cmd=$extra_cmd
+    else
+        echo "Error: No such mode: ${mode}"
+        exit 1
+    fi
+    echo $extra_cmd
+    
     if [ "${topology}" = "opt_125m_pt2e_static" ]; then
         model_name_or_path="facebook/opt-125m"
-        output_dir="saved_results"
+        tuned_checkpoint="saved"
+    fi
+    if [[ ${mode} == "accuracy" ]]; then
+        python -u run_clm_no_trainer.py \
+            --model ${model_name_or_path} \
+            --output_dir ${tuned_checkpoint} \
+            --task ${task} \
+            --batch_size ${batch_size} \
+            ${extra_cmd} ${mode_cmd}
+    elif [[ ${mode} == "performance" ]]; then
+        incbench --num_cores_per_instance 4 run_clm_no_trainer.py \
+            --model ${model_name_or_path} \
+            --batch_size ${batch_size} \
+            --output_dir ${tuned_checkpoint} \
+            ${extra_cmd} ${mode_cmd}
+    else
+        echo "Error: No such mode: ${mode}"
+        exit 1
     fi
-    python run_clm_no_trainer.py --model ${model_name_or_path} --accuracy --output_dir ${output_dir} --tasks "lambada_openai"
 }
 
 main "$@"

From a7b7d88002d660f71a95c82e329a6ee3c917b16c Mon Sep 17 00:00:00 2001
From: Kaihui-intel <kaihui.tang@intel.com>
Date: Mon, 22 Jul 2024 14:02:58 +0800
Subject: [PATCH 07/17] update cv benchmark & script

Signed-off-by: Kaihui-intel <kaihui.tang@intel.com>
---
 .../3.x_api/pytorch/cv/static_quant/main.py   | 69 +++++++++++++++++--
 .../pytorch/cv/static_quant/run_benchmark.sh  | 31 +++++----
 2 files changed, 81 insertions(+), 19 deletions(-)

diff --git a/examples/3.x_api/pytorch/cv/static_quant/main.py b/examples/3.x_api/pytorch/cv/static_quant/main.py
index 99cc6467091..3a9e88d1a1f 100644
--- a/examples/3.x_api/pytorch/cv/static_quant/main.py
+++ b/examples/3.x_api/pytorch/cv/static_quant/main.py
@@ -79,10 +79,17 @@
 parser.add_argument('--dummy', action='store_true', help="use fake data to benchmark")
 parser.add_argument('-q', '--quantize', dest='quantize', action='store_true',
                     help='quantize model')
-parser.add_argument("--calib_iters", default=2, type=int,
+parser.add_argument("--calib_iters", default=-1, type=int,
                     help="For calibration only.")
 parser.add_argument('-o', '--output_dir', default='', type=str, metavar='PATH',
                     help='path to quantized result.')
+parser.add_argument('--performance', dest='performance', action='store_true',
+                    help='do benchmark')
+parser.add_argument("--iters", default=-1, type=int,
+                    help="For benchmark only.")
+parser.add_argument('--int8', dest='int8', action='store_true',
+                    help='Load quantized model')
+
 
 best_acc1 = 0
 
@@ -290,8 +297,20 @@ def main_worker(gpu, ngpus_per_node, args):
         
         prepared_model = prepare(exported_model, quant_config=quant_config)
         # Calibrate
-        for i in range(args.calib_iters):
-            prepared_model(*example_inputs)
+        with torch.no_grad():
+            for i, (images, target) in enumerate(val_loader):
+                if i == args.calib_iters:
+                    break
+                if args.gpu is not None and torch.cuda.is_available():
+                    images = images.cuda(args.gpu, non_blocking=True)
+                if torch.backends.mps.is_available():
+                    images = images.to('mps')
+                    target = target.to('mps')
+                if torch.cuda.is_available():
+                    target = target.cuda(args.gpu, non_blocking=True)
+                # compute output
+                model(images)
+                
         q_model = convert(prepared_model)
         # Compile the quantized model and replace the Q/DQ pattern with Q-operator
         from torch._inductor import config
@@ -299,16 +318,56 @@ def main_worker(gpu, ngpus_per_node, args):
         config.freezing = True
         opt_model = torch.compile(q_model)
         model = opt_model
+
         if args.output_dir:
             model.save(example_inputs=example_inputs, output_dir = args.output_dir)
-    
-    if args.evaluate:
+ 
+    if args.int8:
         if args.output_dir:
+            print("load int8 model")
             from neural_compressor.torch.quantization import load
             model = load(args.output_dir)
+ 
+      
+    if args.evaluate:
         validate(val_loader, model, criterion, args)
         return
+        
+    if args.performance:
+        benchmark(val_loader, model, args)
+        return
+
+def benchmark(val_loader, model, args): 
 
+    total_iters = args.iters
+    warmup_iters = 5
+    with torch.no_grad():
+        
+        for i, (images, target) in enumerate(val_loader):
+            if i == total_iters:
+                break
+            if i == warmup_iters:
+                start = time.time()
+
+            if args.gpu is not None and torch.cuda.is_available():
+                images = images.cuda(args.gpu, non_blocking=True)
+            if torch.backends.mps.is_available():
+                images = images.to('mps')
+                target = target.to('mps')
+            if torch.cuda.is_available():
+                target = target.cuda(args.gpu, non_blocking=True)
+            
+            # model inference
+            model(images)
+            
+            if i % args.print_freq == 0:
+                print(f"benchmarking... {i+1}/{total_iters}")
+            
+        end = time.time()
+    latency = (end - start) / ((total_iters - warmup_iters) * args.batch_size)
+    throughput = ((total_iters - warmup_iters) * args.batch_size) / (end - start)
+    print("Latency: {:.3f} ms".format(latency * 10**3))
+    print("Throughput: {:.3f} samples/sec".format(throughput))
 
 def validate(val_loader, model, criterion, args):
 
diff --git a/examples/3.x_api/pytorch/cv/static_quant/run_benchmark.sh b/examples/3.x_api/pytorch/cv/static_quant/run_benchmark.sh
index 5e1ed19fc6e..554856bb8ef 100644
--- a/examples/3.x_api/pytorch/cv/static_quant/run_benchmark.sh
+++ b/examples/3.x_api/pytorch/cv/static_quant/run_benchmark.sh
@@ -53,30 +53,33 @@ function init_params {
 
 # run_benchmark
 function run_benchmark {
-    extra_cmd=''
+    extra_cmd=' --int8 '
+
+    if [[ ${mode} == "accuracy" ]]; then
+        mode_cmd=" -e "
+    elif [[ ${mode} == "performance" ]]; then
+        mode_cmd=" --performance --iters "${iters}
+    else
+        echo "Error: No such mode: ${mode}"
+        exit 1
+    fi
+    if [[ ${int8} == "true" ]]; then
+        extra_cmd=$extra_cmd" --int8"
+    fi
+    echo $extra_cmd
 
-    # if [[ ${mode} == "accuracy" ]]; then
-    #     mode_cmd=" --accuracy "
-    # elif [[ ${mode} == "performance" ]]; then
-    #     mode_cmd=" --performance --iters "${iters}
-    # else
-    #     echo "Error: No such mode: ${mode}"
-    #     exit 1
-    # fi
 
     echo $extra_cmd
 
     if [ "${topology}" = "resnet18_pt2e_static" ]; then
         model_name_or_path="resnet18"
     fi
-    python main.py -a ${model_name_or_path} ${dataset_location} -q -o ${tuned_checkpoint}
-
 
     if [[ ${mode} == "accuracy" ]]; then
-        python main.py -a ${model_name_or_path} ${dataset_location} -e -o ${tuned_checkpoint} ${extra_cmd}
+        python main.py -a ${model_name_or_path} ${dataset_location} -e -o ${tuned_checkpoint} ${extra_cmd} ${mode_cmd}
     elif [[ ${mode} == "performance" ]]; then
-        incbench --num_cores_per_instance 4 main.py -a ${model_name_or_path} 
-          ${dataset_location} -e -o ${tuned_checkpoint} ${extra_cmd}
+        incbench --num_cores_per_instance 4 main.py -a ${model_name_or_path} \
+          ${dataset_location} -o ${tuned_checkpoint} ${extra_cmd} ${mode_cmd}
     else
         echo "Error: No such mode: ${mode}"
         exit 1

From 9aab1593cd88825e1e3e8c4edb62bf7dde1dfaeb Mon Sep 17 00:00:00 2001
From: Kaihui-intel <kaihui.tang@intel.com>
Date: Mon, 22 Jul 2024 15:43:27 +0800
Subject: [PATCH 08/17] update llm & set logger level

Signed-off-by: Kaihui-intel <kaihui.tang@intel.com>
---
 .../pytorch/cv/static_quant/run_benchmark.sh      |  2 +-
 .../static_quant/pt2e/run_benchmark.sh            |  7 ++++++-
 .../static_quant/pt2e/run_clm_no_trainer.py       | 15 ++++++++++++---
 neural_compressor/common/__init__.py              |  2 ++
 neural_compressor/common/utils/logger.py          |  2 ++
 .../pt2e_quant/half_precision_rewriter.py         |  3 ++-
 .../torch/algorithms/pt2e_quant/save_load.py      |  3 ++-
 neural_compressor/torch/export/pt2e_export.py     |  4 +++-
 .../torch/quantization/algorithm_entry.py         |  4 ++++
 9 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/examples/3.x_api/pytorch/cv/static_quant/run_benchmark.sh b/examples/3.x_api/pytorch/cv/static_quant/run_benchmark.sh
index 554856bb8ef..23e04edfcfc 100644
--- a/examples/3.x_api/pytorch/cv/static_quant/run_benchmark.sh
+++ b/examples/3.x_api/pytorch/cv/static_quant/run_benchmark.sh
@@ -53,7 +53,7 @@ function init_params {
 
 # run_benchmark
 function run_benchmark {
-    extra_cmd=' --int8 '
+    extra_cmd=''
 
     if [[ ${mode} == "accuracy" ]]; then
         mode_cmd=" -e "
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_benchmark.sh
index 93272f4b4f5..169142cddb8 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_benchmark.sh
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_benchmark.sh
@@ -66,11 +66,16 @@ function run_benchmark {
         echo "Error: No such mode: ${mode}"
         exit 1
     fi
+
+    if [[ ${int8} == "true" ]]; then
+        extra_cmd=$extra_cmd" --int8"
+    fi
+    echo $extra_cmd
+
     echo $extra_cmd
     
     if [ "${topology}" = "opt_125m_pt2e_static" ]; then
         model_name_or_path="facebook/opt-125m"
-        tuned_checkpoint="saved"
     fi
     if [[ ${mode} == "accuracy" ]]; then
         python -u run_clm_no_trainer.py \
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py
index 8c7cd66c5fe..1fabd101b32 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py
@@ -80,7 +80,7 @@ def get_example_inputs(tokenizer):
     dynamic_shapes = {"input_ids": (batch, seq_len)}
     example_inputs = get_example_inputs(tokenizer)
     exported_model = export(user_model, example_inputs=example_inputs, dynamic_shapes=dynamic_shapes)
-
+    
     quant_config = get_default_static_config()
     # prepare
     prepare_model = prepare(exported_model, quant_config)
@@ -98,15 +98,24 @@ def get_example_inputs(tokenizer):
 
     opt_model.config = user_model.config # for lm eval
     user_model = opt_model
+
+    # save
     if args.output_dir:
         user_model.save(example_inputs=example_inputs, output_dir = args.output_dir)
 
 
-if args.accuracy:
+
+if args.int8:
     if args.output_dir:
+        print("Load int8 model.")
         from neural_compressor.torch.quantization import load
         model = load(args.output_dir)
-        model.config = user_model.config
+
+        model.config = user_model.config # for lm eval
+        user_model = opt_model
+
+if args.accuracy:
+
     from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
     eval_args = LMEvalParser(
         model="hf",
diff --git a/neural_compressor/common/__init__.py b/neural_compressor/common/__init__.py
index e38627d5c7c..cbda53e57b3 100644
--- a/neural_compressor/common/__init__.py
+++ b/neural_compressor/common/__init__.py
@@ -15,6 +15,7 @@
 
 from neural_compressor.common.utils import (
     level,
+    level_name,
     logger,
     Logger,
     TuningLogger,
@@ -31,6 +32,7 @@
 __all__ = [
     "options",
     "level",
+    "level_name",
     "logger",
     "Logger",
     "TuningLogger",
diff --git a/neural_compressor/common/utils/logger.py b/neural_compressor/common/utils/logger.py
index 4c933368fdd..a7f0b06009f 100644
--- a/neural_compressor/common/utils/logger.py
+++ b/neural_compressor/common/utils/logger.py
@@ -24,6 +24,7 @@
 
 __all__ = [
     "level",
+    "level_name",
     "Logger",  # TODO: not expose it
     "logger",
     "TuningLogger",
@@ -138,6 +139,7 @@ def warning(msg, *args, **kwargs):
 
 
 level = Logger().get_logger().level
+level_name = logging.getLevelName(level)
 
 logger = Logger
 
diff --git a/neural_compressor/torch/algorithms/pt2e_quant/half_precision_rewriter.py b/neural_compressor/torch/algorithms/pt2e_quant/half_precision_rewriter.py
index bd1865e674c..74b52a003b4 100644
--- a/neural_compressor/torch/algorithms/pt2e_quant/half_precision_rewriter.py
+++ b/neural_compressor/torch/algorithms/pt2e_quant/half_precision_rewriter.py
@@ -149,7 +149,8 @@ def transformation(gm: torch.fx.GraphModule, node_candidate_list: List[str], tar
     for pattern_pair in HALF_PRECISION_PATTERN_REGISTRY[target_dtype].values():
         apply_single_pattern_pair(gm, pattern_pair, node_candidate_list)
     utils.logger.info("Half precision conversion is done:")
-    gm.print_readable(True)
+    if utils.level_name == "DEBUG": # pragma: no cover
+        gm.print_readable(True)
 
 
 # =============================================================================
diff --git a/neural_compressor/torch/algorithms/pt2e_quant/save_load.py b/neural_compressor/torch/algorithms/pt2e_quant/save_load.py
index 606c31f41c2..fb3473d17a8 100644
--- a/neural_compressor/torch/algorithms/pt2e_quant/save_load.py
+++ b/neural_compressor/torch/algorithms/pt2e_quant/save_load.py
@@ -25,7 +25,8 @@ def save(model, example_inputs, output_dir="./saved_results"):
     os.makedirs(output_dir, exist_ok=True)
     qmodel_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), WEIGHT_NAME)
     qconfig_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), QCONFIG_NAME)
-    quantized_ep = torch.export.export(model, example_inputs)
+    dynamic_shapes = model.dynamic_shapes
+    quantized_ep = torch.export.export(model, example_inputs, dynamic_shapes=dynamic_shapes)
     torch.export.save(quantized_ep, qmodel_file_path)
     for key, op_config in model.qconfig.items():
         model.qconfig[key] = op_config.to_dict()
diff --git a/neural_compressor/torch/export/pt2e_export.py b/neural_compressor/torch/export/pt2e_export.py
index 579e816894f..d187f9b5289 100644
--- a/neural_compressor/torch/export/pt2e_export.py
+++ b/neural_compressor/torch/export/pt2e_export.py
@@ -67,7 +67,9 @@ def export(
     dynamic_shapes: Optional[Union[Dict[str, Any], Tuple[Any]]] = None,
 ) -> Optional[GraphModule]:
     if not is_ipex_imported():
-        return export_model_for_pt2e_quant(model, example_inputs, dynamic_shapes)
+        model = export_model_for_pt2e_quant(model, example_inputs, dynamic_shapes)
+        model.dynamic_shapes = dynamic_shapes
+        return model
     else:
         # TODO, add `export` for ipex
         pass
diff --git a/neural_compressor/torch/quantization/algorithm_entry.py b/neural_compressor/torch/quantization/algorithm_entry.py
index 2a3eada9bf5..9281dd305e2 100644
--- a/neural_compressor/torch/quantization/algorithm_entry.py
+++ b/neural_compressor/torch/quantization/algorithm_entry.py
@@ -215,6 +215,7 @@ def pt2e_dynamic_quant_entry(model: torch.nn.Module, configs_mapping, mode: Mode
     run_fn = kwargs.get("run_fn", None)
     example_inputs = kwargs.get("example_inputs", None)
     inplace = kwargs.get("inplace", True)
+    dynamic_shapes = model.dynamic_shapes
     W8A8PT2EQuantizer.is_dynamic = True
     for _, quant_config in configs_mapping.items():
         if quant_config.name == PT2E_DYNAMIC_QUANT:
@@ -222,6 +223,7 @@ def pt2e_dynamic_quant_entry(model: torch.nn.Module, configs_mapping, mode: Mode
             model = w8a8_quantizer.execute(
                 model, mode=mode, run_fn=run_fn, example_inputs=example_inputs, inplace=inplace
             )
+            model.dynamic_shapes = dynamic_shapes
             model.qconfig = configs_mapping
             model.save = MethodType(save, model)
             return model
@@ -238,12 +240,14 @@ def pt2e_static_quant_entry(model: torch.nn.Module, configs_mapping, mode: Mode,
     run_fn = kwargs.get("run_fn", None)
     example_inputs = kwargs.get("example_inputs", None)
     inplace = kwargs.get("inplace", True)
+    dynamic_shapes = model.dynamic_shapes
     for _, quant_config in configs_mapping.items():
         if quant_config.name == STATIC_QUANT:
             w8a8_quantizer = W8A8PT2EQuantizer(quant_config=quant_config)
             model = w8a8_quantizer.execute(
                 model, mode=mode, run_fn=run_fn, example_inputs=example_inputs, inplace=inplace
             )
+            model.dynamic_shapes = dynamic_shapes
             model.qconfig = configs_mapping
             model.save = MethodType(save, model)
             return model

From 8d5c10813293d57b1e73e0783b5479c66da6b9ba Mon Sep 17 00:00:00 2001
From: Kaihui-intel <kaihui.tang@intel.com>
Date: Mon, 22 Jul 2024 15:57:51 +0800
Subject: [PATCH 09/17] update nlp performance

Signed-off-by: Kaihui-intel <kaihui.tang@intel.com>
---
 .../static_quant/pt2e/run_clm_no_trainer.py   | 42 ++++++++-----------
 1 file changed, 17 insertions(+), 25 deletions(-)

diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py
index 1fabd101b32..7b4c9a46630 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py
@@ -112,7 +112,7 @@ def get_example_inputs(tokenizer):
         model = load(args.output_dir)
 
         model.config = user_model.config # for lm eval
-        user_model = opt_model
+        user_model = model
 
 if args.accuracy:
 
@@ -135,29 +135,21 @@ def get_example_inputs(tokenizer):
     print('Batch size = %d' % args.batch_size)
 
 if args.performance:
-    # user_model.eval()
-    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+    batch_size, input_leng = args.batch_size, 512
+    example_inputs = torch.ones((batch_size, input_leng), dtype=torch.long)
+    print("Batch size = {:d}".format(batch_size))
+    print("The length of input tokens = {:d}".format(input_leng))
     import time
 
-    samples = args.iters * args.batch_size
-    eval_args = LMEvalParser(
-        model="hf",
-        user_model=user_model,
-        tokenizer=tokenizer,
-        batch_size=args.batch_size,
-        tasks=args.tasks,
-        limit=samples,
-        device="cpu",
-    )
-    start = time.time()
-    results = evaluate(eval_args)
-    end = time.time()
-    for task_name in args.tasks.split(","):
-        if task_name == "wikitext":
-            acc = results["results"][task_name]["word_perplexity,none"]
-        else:
-            acc = results["results"][task_name]["acc,none"]
-    print("Accuracy: %.5f" % acc)
-    print('Throughput: %.3f samples/sec' % (samples / (end - start)))
-    print('Latency: %.3f ms' % ((end - start) * 1000 / samples))
-    print('Batch size = %d' % args.batch_size)
+    total_iters = args.iters
+    warmup_iters = 5
+    with torch.no_grad():
+        for i in range(total_iters):
+            if i == warmup_iters:
+                start = time.time()
+            user_model(example_inputs)
+        end = time.time()
+    latency = (end - start) / ((total_iters - warmup_iters) * args.batch_size)
+    throughput = ((total_iters - warmup_iters) * args.batch_size) / (end - start)
+    print("Latency: {:.3f} ms".format(latency * 10**3))
+    print("Throughput: {:.3f} samples/sec".format(throughput))

From 7753b4046ed059dff6815d835bc594876ffb2f83 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 22 Jul 2024 08:23:48 +0000
Subject: [PATCH 10/17] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../torch/algorithms/pt2e_quant/half_precision_rewriter.py      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/neural_compressor/torch/algorithms/pt2e_quant/half_precision_rewriter.py b/neural_compressor/torch/algorithms/pt2e_quant/half_precision_rewriter.py
index 74b52a003b4..759752b7c80 100644
--- a/neural_compressor/torch/algorithms/pt2e_quant/half_precision_rewriter.py
+++ b/neural_compressor/torch/algorithms/pt2e_quant/half_precision_rewriter.py
@@ -149,7 +149,7 @@ def transformation(gm: torch.fx.GraphModule, node_candidate_list: List[str], tar
     for pattern_pair in HALF_PRECISION_PATTERN_REGISTRY[target_dtype].values():
         apply_single_pattern_pair(gm, pattern_pair, node_candidate_list)
     utils.logger.info("Half precision conversion is done:")
-    if utils.level_name == "DEBUG": # pragma: no cover
+    if utils.level_name == "DEBUG":  # pragma: no cover
         gm.print_readable(True)
 
 

From a2c8364754ab881b2755ad7129c67465f3d8b3de Mon Sep 17 00:00:00 2001
From: Kaihui-intel <kaihui.tang@intel.com>
Date: Tue, 23 Jul 2024 10:08:26 +0800
Subject: [PATCH 11/17] add dynamic_shapes to cv

Signed-off-by: Kaihui-intel <kaihui.tang@intel.com>
---
 examples/3.x_api/pytorch/cv/static_quant/main.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/examples/3.x_api/pytorch/cv/static_quant/main.py b/examples/3.x_api/pytorch/cv/static_quant/main.py
index 3a9e88d1a1f..36e2f1b07d5 100644
--- a/examples/3.x_api/pytorch/cv/static_quant/main.py
+++ b/examples/3.x_api/pytorch/cv/static_quant/main.py
@@ -289,9 +289,17 @@ def main_worker(gpu, ngpus_per_node, args):
         # Prepare the float model and example inputs for exporting model
         x = torch.randn(args.batch_size, 3, 224, 224).contiguous(memory_format=torch.channels_last)
         example_inputs = (x,)
+        
+        # Specify that the first dimension of each input is that batch size
+        from torch.export import Dim
+        print(args.batch_size)
+        batch = Dim("batch", min=16)
+    
+        # Specify that the first dimension of each input is that batch size
+        dynamic_shapes = {"x": {0: batch}}
 
         # Export eager model into FX graph model
-        exported_model = export(model=model, example_inputs=example_inputs)
+        exported_model = export(model=model, example_inputs=example_inputs, dynamic_shapes=dynamic_shapes)
         # Quantize the model
         quant_config = get_default_static_config()
         

From c4b351f8f1787d47c6b8d07e0865c4ab1db71045 Mon Sep 17 00:00:00 2001
From: Kaihui-intel <kaihui.tang@intel.com>
Date: Tue, 23 Jul 2024 13:14:18 +0800
Subject: [PATCH 12/17] fix default iters

Signed-off-by: Kaihui-intel <kaihui.tang@intel.com>
---
 examples/3.x_api/pytorch/cv/static_quant/main.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/3.x_api/pytorch/cv/static_quant/main.py b/examples/3.x_api/pytorch/cv/static_quant/main.py
index 36e2f1b07d5..d5f1ead682c 100644
--- a/examples/3.x_api/pytorch/cv/static_quant/main.py
+++ b/examples/3.x_api/pytorch/cv/static_quant/main.py
@@ -79,13 +79,13 @@
 parser.add_argument('--dummy', action='store_true', help="use fake data to benchmark")
 parser.add_argument('-q', '--quantize', dest='quantize', action='store_true',
                     help='quantize model')
-parser.add_argument("--calib_iters", default=-1, type=int,
+parser.add_argument("--calib_iters", default=128, type=int,
                     help="For calibration only.")
 parser.add_argument('-o', '--output_dir', default='', type=str, metavar='PATH',
                     help='path to quantized result.')
 parser.add_argument('--performance', dest='performance', action='store_true',
                     help='do benchmark')
-parser.add_argument("--iters", default=-1, type=int,
+parser.add_argument("--iters", default=100, type=int,
                     help="For benchmark only.")
 parser.add_argument('--int8', dest='int8', action='store_true',
                     help='Load quantized model')
@@ -347,7 +347,7 @@ def main_worker(gpu, ngpus_per_node, args):
 
 def benchmark(val_loader, model, args): 
 
-    total_iters = args.iters
+    total_iters = args.iters if args.iters > len(val_loader) else len(val_loader)
     warmup_iters = 5
     with torch.no_grad():
         

From ed8637c909286ee32551476102895fc5b1146a9b Mon Sep 17 00:00:00 2001
From: Kaihui-intel <kaihui.tang@intel.com>
Date: Tue, 23 Jul 2024 13:14:47 +0800
Subject: [PATCH 13/17] fix default iters: cap benchmark iters at loader length

Signed-off-by: Kaihui-intel <kaihui.tang@intel.com>
---
 examples/3.x_api/pytorch/cv/static_quant/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/3.x_api/pytorch/cv/static_quant/main.py b/examples/3.x_api/pytorch/cv/static_quant/main.py
index d5f1ead682c..b86987aa3f3 100644
--- a/examples/3.x_api/pytorch/cv/static_quant/main.py
+++ b/examples/3.x_api/pytorch/cv/static_quant/main.py
@@ -347,7 +347,7 @@ def main_worker(gpu, ngpus_per_node, args):
 
 def benchmark(val_loader, model, args): 
 
-    total_iters = args.iters if args.iters > len(val_loader) else len(val_loader)
+    total_iters = args.iters if args.iters < len(val_loader) else len(val_loader)
     warmup_iters = 5
     with torch.no_grad():
         

From 63e0278ef62bb654a02982e041a6cd36ce18fe34 Mon Sep 17 00:00:00 2001
From: Kaihui-intel <kaihui.tang@intel.com>
Date: Fri, 26 Jul 2024 17:30:49 +0800
Subject: [PATCH 14/17] fix cv acc

Signed-off-by: Kaihui-intel <kaihui.tang@intel.com>
---
 .../3.x_api/pytorch/cv/static_quant/main.py   | 459 +++++++-----------
 .../pytorch/cv/static_quant/run_benchmark.sh  |  22 +-
 .../pytorch/cv/static_quant/run_quant.sh      |  10 +-
 3 files changed, 202 insertions(+), 289 deletions(-)

diff --git a/examples/3.x_api/pytorch/cv/static_quant/main.py b/examples/3.x_api/pytorch/cv/static_quant/main.py
index b86987aa3f3..71b7a99aa18 100644
--- a/examples/3.x_api/pytorch/cv/static_quant/main.py
+++ b/examples/3.x_api/pytorch/cv/static_quant/main.py
@@ -4,30 +4,25 @@
 import shutil
 import time
 import warnings
-from enum import Enum
+import sys
 
 import torch
-import torch.backends.cudnn as cudnn
-import torch.distributed as dist
-import torch.multiprocessing as mp
 import torch.nn as nn
 import torch.nn.parallel
+import torch.distributed as dist
 import torch.optim
+import torch.multiprocessing as mp
 import torch.utils.data
 import torch.utils.data.distributed
+import torchvision.transforms as transforms
 import torchvision.datasets as datasets
 import torchvision.models as models
-import torchvision.transforms as transforms
-from torch.optim.lr_scheduler import StepLR
-from torch.utils.data import Subset
 
-model_names = sorted(name for name in models.__dict__
-    if name.islower() and not name.startswith("__")
-    and callable(models.__dict__[name]))
+model_names = models.list_models(module=models)
 
 parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
-parser.add_argument('data', metavar='DIR', nargs='?', default='imagenet',
-                    help='path to dataset (default: imagenet)')
+parser.add_argument('data', metavar='DIR',
+                    help='path to dataset')
 parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18',
                     choices=model_names,
                     help='model architecture: ' +
@@ -57,6 +52,8 @@
                     help='path to latest checkpoint (default: none)')
 parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                     help='evaluate model on validation set')
+parser.add_argument('-t', '--tune', dest='tune', action='store_true',
+                    help='tune best int8 model on calibration dataset')
 parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                     help='use pre-trained model')
 parser.add_argument('--world-size', default=-1, type=int,
@@ -71,88 +68,45 @@
                     help='seed for initializing training. ')
 parser.add_argument('--gpu', default=None, type=int,
                     help='GPU id to use.')
+parser.add_argument('--ppn', default=1, type=int,
+                    help='number of processes on each node of distributed training')
 parser.add_argument('--multiprocessing-distributed', action='store_true',
                     help='Use multi-processing distributed training to launch '
                          'N processes per node, which has N GPUs. This is the '
                          'fastest way to use PyTorch for either single node or '
                          'multi node data parallel training')
-parser.add_argument('--dummy', action='store_true', help="use fake data to benchmark")
-parser.add_argument('-q', '--quantize', dest='quantize', action='store_true',
-                    help='quantize model')
+parser.add_argument('-i', "--iter", default=0, type=int,
+                    help='For accuracy measurement only.')
+parser.add_argument('-w', "--warmup_iter", default=5, type=int,
+                    help='Number of warm-up iterations excluded from timing (benchmark and accuracy runs).')
+parser.add_argument('--performance', dest='performance', action='store_true',
+                    help='run benchmark')
+parser.add_argument('-r', "--accuracy", dest='accuracy', action='store_true',
+                    help='For accuracy measurement only.')
+parser.add_argument("--tuned_checkpoint", default='./saved_results', type=str, metavar='PATH',
+                    help='path to checkpoint tuned by Neural Compressor (default: ./saved_results)')
+parser.add_argument('--int8', dest='int8', action='store_true',
+                    help='Load int8 model.')
 parser.add_argument("--calib_iters", default=128, type=int,
                     help="For calibration only.")
-parser.add_argument('-o', '--output_dir', default='', type=str, metavar='PATH',
-                    help='path to quantized result.')
-parser.add_argument('--performance', dest='performance', action='store_true',
-                    help='do benchmark')
 parser.add_argument("--iters", default=100, type=int,
                     help="For benchmark only.")
-parser.add_argument('--int8', dest='int8', action='store_true',
-                    help='Load quantized model')
-
 
 best_acc1 = 0
 
 
 def main():
     args = parser.parse_args()
+    
+    if 'mobilenet' in args.arch:
+        import torchvision.models.quantization as models
+    else:
+        import torchvision.models as models
 
     if args.seed is not None:
         random.seed(args.seed)
         torch.manual_seed(args.seed)
-        cudnn.deterministic = True
-        cudnn.benchmark = False
-        warnings.warn('You have chosen to seed training. '
-                      'This will turn on the CUDNN deterministic setting, '
-                      'which can slow down your training considerably! '
-                      'You may see unexpected behavior when restarting '
-                      'from checkpoints.')
-
-    if args.gpu is not None:
-        warnings.warn('You have chosen a specific GPU. This will completely '
-                      'disable data parallelism.')
-
-    if args.dist_url == "env://" and args.world_size == -1:
-        args.world_size = int(os.environ["WORLD_SIZE"])
-
-    args.distributed = args.world_size > 1 or args.multiprocessing_distributed
-
-    if torch.cuda.is_available():
-        ngpus_per_node = torch.cuda.device_count()
-        if ngpus_per_node == 1 and args.dist_backend == "nccl":
-            warnings.warn("nccl backend >=2.5 requires GPU count>1, see https://github.com/NVIDIA/nccl/issues/103 perhaps use 'gloo'")
-    else:
-        ngpus_per_node = 1
-
-    if args.multiprocessing_distributed:
-        # Since we have ngpus_per_node processes per node, the total world_size
-        # needs to be adjusted accordingly
-        args.world_size = ngpus_per_node * args.world_size
-        # Use torch.multiprocessing.spawn to launch distributed processes: the
-        # main_worker process function
-        mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
-    else:
-        # Simply call main_worker function
-        main_worker(args.gpu, ngpus_per_node, args)
-
-
-def main_worker(gpu, ngpus_per_node, args):
-    global best_acc1
-    args.gpu = gpu
-
-    if args.gpu is not None:
-        print("Use GPU: {} for training".format(args.gpu))
-
-    if args.distributed:
-        if args.dist_url == "env://" and args.rank == -1:
-            args.rank = int(os.environ["RANK"])
-        if args.multiprocessing_distributed:
-            # For multiprocessing distributed training, rank needs to be the
-            # global rank among all the processes
-            args.rank = args.rank * ngpus_per_node + gpu
-        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
-                                world_size=args.world_size, rank=args.rank)
-    # create model
+
     if args.pretrained:
         print("=> using pre-trained model '{}'".format(args.arch))
         model = models.__dict__[args.arch](pretrained=True)
@@ -160,70 +114,18 @@ def main_worker(gpu, ngpus_per_node, args):
         print("=> creating model '{}'".format(args.arch))
         model = models.__dict__[args.arch]()
 
-    if not torch.cuda.is_available() and not torch.backends.mps.is_available():
-        print('using CPU, this will be slow')
-    elif args.distributed:
-        # For multiprocessing distributed, DistributedDataParallel constructor
-        # should always set the single device scope, otherwise,
-        # DistributedDataParallel will use all available devices.
-        if torch.cuda.is_available():
-            if args.gpu is not None:
-                torch.cuda.set_device(args.gpu)
-                model.cuda(args.gpu)
-                # When using a single GPU per process and per
-                # DistributedDataParallel, we need to divide the batch size
-                # ourselves based on the total number of GPUs of the current node.
-                args.batch_size = int(args.batch_size / ngpus_per_node)
-                args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
-                model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
-            else:
-                model.cuda()
-                # DistributedDataParallel will divide and allocate batch_size to all
-                # available GPUs if device_ids are not set
-                model = torch.nn.parallel.DistributedDataParallel(model)
-    elif args.gpu is not None and torch.cuda.is_available():
-        torch.cuda.set_device(args.gpu)
-        model = model.cuda(args.gpu)
-    elif torch.backends.mps.is_available():
-        device = torch.device("mps")
-        model = model.to(device)
-    else:
-        # DataParallel will divide and allocate batch_size to all available GPUs
-        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
-            model.features = torch.nn.DataParallel(model.features)
-            model.cuda()
-        else:
-            model = torch.nn.DataParallel(model).cuda()
-
-    if torch.cuda.is_available():
-        if args.gpu:
-            device = torch.device('cuda:{}'.format(args.gpu))
-        else:
-            device = torch.device("cuda")
-    elif torch.backends.mps.is_available():
-        device = torch.device("mps")
-    else:
-        device = torch.device("cpu")
-    # define loss function (criterion), optimizer, and learning rate scheduler
-    criterion = nn.CrossEntropyLoss().to(device)
+    # define loss function (criterion) and optimizer
+    criterion = nn.CrossEntropyLoss()
 
     optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                 momentum=args.momentum,
                                 weight_decay=args.weight_decay)
-    
-    """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
-    scheduler = StepLR(optimizer, step_size=30, gamma=0.1)
-    
+
     # optionally resume from a checkpoint
     if args.resume:
         if os.path.isfile(args.resume):
             print("=> loading checkpoint '{}'".format(args.resume))
-            if args.gpu is None:
-                checkpoint = torch.load(args.resume)
-            elif torch.cuda.is_available():
-                # Map model to be loaded to specified single gpu.
-                loc = 'cuda:{}'.format(args.gpu)
-                checkpoint = torch.load(args.resume, map_location=loc)
+            checkpoint = torch.load(args.resume)
             args.start_epoch = checkpoint['epoch']
             best_acc1 = checkpoint['best_acc1']
             if args.gpu is not None:
@@ -231,58 +133,51 @@ def main_worker(gpu, ngpus_per_node, args):
                 best_acc1 = best_acc1.to(args.gpu)
             model.load_state_dict(checkpoint['state_dict'])
             optimizer.load_state_dict(checkpoint['optimizer'])
-            scheduler.load_state_dict(checkpoint['scheduler'])
             print("=> loaded checkpoint '{}' (epoch {})"
                   .format(args.resume, checkpoint['epoch']))
         else:
             print("=> no checkpoint found at '{}'".format(args.resume))
 
-
     # Data loading code
-    if args.dummy:
-        print("=> Dummy data is used!")
-        train_dataset = datasets.FakeData(1281167, (3, 224, 224), 1000, transforms.ToTensor())
-        val_dataset = datasets.FakeData(50000, (3, 224, 224), 1000, transforms.ToTensor())
-    else:
-        traindir = os.path.join(args.data, 'train')
-        valdir = os.path.join(args.data, 'val')
-        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
+    traindir = os.path.join(args.data, 'train')
+    valdir = os.path.join(args.data, 'val')
+    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                      std=[0.229, 0.224, 0.225])
 
-        train_dataset = datasets.ImageFolder(
-            traindir,
-            transforms.Compose([
-                transforms.RandomResizedCrop(224),
-                transforms.RandomHorizontalFlip(),
-                transforms.ToTensor(),
-                normalize,
-            ]))
-
-        val_dataset = datasets.ImageFolder(
-            valdir,
-            transforms.Compose([
-                transforms.Resize(256),
-                transforms.CenterCrop(224),
-                transforms.ToTensor(),
-                normalize,
-            ]))
-
-    if args.distributed:
-        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
-        val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False, drop_last=True)
-    else:
-        train_sampler = None
-        val_sampler = None
+    train_dataset = datasets.ImageFolder(
+        traindir,
+        transforms.Compose([
+            transforms.RandomResizedCrop(224),
+            transforms.RandomHorizontalFlip(),
+            transforms.ToTensor(),
+            normalize,
+        ]))
 
     train_loader = torch.utils.data.DataLoader(
-        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
-        num_workers=args.workers, pin_memory=True, sampler=train_sampler)
+        train_dataset, batch_size=args.batch_size, shuffle=True,
+        num_workers=args.workers, pin_memory=True, sampler=None)
+
+    val_dataset = datasets.ImageFolder(valdir, transforms.Compose([
+            transforms.Resize(256),
+            transforms.CenterCrop(224),
+            transforms.ToTensor(),
+            normalize,
+        ]))
 
     val_loader = torch.utils.data.DataLoader(
-        val_dataset, batch_size=args.batch_size, shuffle=False,
-        num_workers=args.workers, pin_memory=True, sampler=val_sampler)
+        val_dataset,
+        batch_size=args.batch_size, shuffle=False,
+        num_workers=args.workers, pin_memory=True)
+
+    if args.evaluate:
+        validate(val_loader, model, criterion, args)
+        return
+
+    def eval_func(model):
+        accu = validate(val_loader, model, criterion, args)
+        return float(accu)
     
-    if args.quantize:
+    if args.tune:
         from neural_compressor.torch.export import export
         from neural_compressor.torch.quantization import prepare, convert, get_default_static_config
 
@@ -317,7 +212,7 @@ def main_worker(gpu, ngpus_per_node, args):
                 if torch.cuda.is_available():
                     target = target.cuda(args.gpu, non_blocking=True)
                 # compute output
-                model(images)
+                prepared_model(images)
                 
         q_model = convert(prepared_model)
         # Compile the quantized model and replace the Q/DQ pattern with Q-operator
@@ -327,28 +222,29 @@ def main_worker(gpu, ngpus_per_node, args):
         opt_model = torch.compile(q_model)
         model = opt_model
 
-        if args.output_dir:
-            model.save(example_inputs=example_inputs, output_dir = args.output_dir)
- 
-    if args.int8:
-        if args.output_dir:
-            print("load int8 model")
-            from neural_compressor.torch.quantization import load
-            model = load(args.output_dir)
- 
-      
-    if args.evaluate:
-        validate(val_loader, model, criterion, args)
+        if args.tuned_checkpoint:
+            model.save(example_inputs=example_inputs, output_dir = args.tuned_checkpoint)
         return
-        
-    if args.performance:
-        benchmark(val_loader, model, args)
+    
+    if args.performance or args.accuracy:
+        # model.eval()
+        if args.int8:
+            from neural_compressor.torch.quantization import load
+            new_model = load(args.tuned_checkpoint)
+        else:
+            new_model = model
+        if args.performance:
+            benchmark(val_loader, new_model, args)
+            return
+        if args.accuracy:
+            validate(val_loader, new_model, criterion, args)
         return
 
+
 def benchmark(val_loader, model, args): 
 
     total_iters = args.iters if args.iters < len(val_loader) else len(val_loader)
-    warmup_iters = 5
+    warmup_iters = args.warmup_iter
     with torch.no_grad():
         
         for i, (images, target) in enumerate(val_loader):
@@ -377,66 +273,94 @@ def benchmark(val_loader, model, args):
     print("Latency: {:.3f} ms".format(latency * 10**3))
     print("Throughput: {:.3f} samples/sec".format(throughput))
 
-def validate(val_loader, model, criterion, args):
+def train(train_loader, model, criterion, optimizer, epoch, args):
+    batch_time = AverageMeter('Time', ':6.3f')
+    data_time = AverageMeter('Data', ':6.3f')
+    losses = AverageMeter('Loss', ':.4e')
+    top1 = AverageMeter('Acc@1', ':6.2f')
+    top5 = AverageMeter('Acc@5', ':6.2f')
+    progress = ProgressMeter(len(train_loader), batch_time, data_time, losses, top1,
+                             top5, prefix="Epoch: [{}]".format(epoch))
+
+    # switch to train mode
+    model.train()
+
+    end = time.time()
+    for i, (input, target) in enumerate(train_loader):
+        # measure data loading time
+        data_time.update(time.time() - end)
+
+        if args.gpu is not None:
+            input = input.cuda(args.gpu, non_blocking=True)
+            target = target.cuda(args.gpu, non_blocking=True)
+
+        # compute output
+        output = model(input)
+        loss = criterion(output, target)
+
+        # measure accuracy and record loss
+        acc1, acc5 = accuracy(output, target, topk=(1, 5))
+        losses.update(loss.item(), input.size(0))
+        top1.update(acc1[0], input.size(0))
+        top5.update(acc5[0], input.size(0))
+
+        # compute gradient and do SGD step
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+
+        # measure elapsed time
+        batch_time.update(time.time() - end)
+        end = time.time()
 
-    def run_validate(loader, base_progress=0):
-        with torch.no_grad():
-            end = time.time()
-            for i, (images, target) in enumerate(loader):
-                i = base_progress + i
-                if args.gpu is not None and torch.cuda.is_available():
-                    images = images.cuda(args.gpu, non_blocking=True)
-                if torch.backends.mps.is_available():
-                    images = images.to('mps')
-                    target = target.to('mps')
-                if torch.cuda.is_available():
-                    target = target.cuda(args.gpu, non_blocking=True)
+        if i % args.print_freq == 0:
+            progress.print(i)
 
-                # compute output
-                output = model(images)
-                loss = criterion(output, target)
-
-                # measure accuracy and record loss
-                acc1, acc5 = accuracy(output, target, topk=(1, 5))
-                losses.update(loss.item(), images.size(0))
-                top1.update(acc1[0], images.size(0))
-                top5.update(acc5[0], images.size(0))
-
-                # measure elapsed time
-                batch_time.update(time.time() - end)
-                end = time.time()
-
-                if i % args.print_freq == 0:
-                    progress.display(i + 1)
-
-    batch_time = AverageMeter('Time', ':6.3f', Summary.NONE)
-    losses = AverageMeter('Loss', ':.4e', Summary.NONE)
-    top1 = AverageMeter('Acc@1', ':6.2f', Summary.AVERAGE)
-    top5 = AverageMeter('Acc@5', ':6.2f', Summary.AVERAGE)
-    progress = ProgressMeter(
-        len(val_loader) + (args.distributed and (len(val_loader.sampler) * args.world_size < len(val_loader.dataset))),
-        [batch_time, losses, top1, top5],
-        prefix='Test: ')
-
-    # switch to evaluate mode, pt2e no eval() or train()
+
+def validate(val_loader, model, criterion, args):
+    batch_time = AverageMeter('Time', ':6.3f')
+    losses = AverageMeter('Loss', ':.4e')
+    top1 = AverageMeter('Acc@1', ':6.2f')
+    top5 = AverageMeter('Acc@5', ':6.2f')
+    progress = ProgressMeter(len(val_loader), batch_time, losses, top1, top5,
+                             prefix='Test: ')
+
+    # switch to evaluate mode
     # model.eval()
 
-    run_validate(val_loader)
-    if args.distributed:
-        top1.all_reduce()
-        top5.all_reduce()
+    with torch.no_grad():
+        for i, (input, target) in enumerate(val_loader):
+            if i >= args.warmup_iter:
+                start = time.time()
+            if args.gpu is not None:
+                input = input.cuda(args.gpu, non_blocking=True)
+                target = target.cuda(args.gpu, non_blocking=True)
 
-    if args.distributed and (len(val_loader.sampler) * args.world_size < len(val_loader.dataset)):
-        aux_val_dataset = Subset(val_loader.dataset,
-                                 range(len(val_loader.sampler) * args.world_size, len(val_loader.dataset)))
-        aux_val_loader = torch.utils.data.DataLoader(
-            aux_val_dataset, batch_size=args.batch_size, shuffle=False,
-            num_workers=args.workers, pin_memory=True)
-        run_validate(aux_val_loader, len(val_loader))
+            # compute output
+            output = model(input)
+            loss = criterion(output, target)
 
-    progress.display_summary()
+            # measure accuracy and record loss
+            acc1, acc5 = accuracy(output, target, topk=(1, 5))
+            losses.update(loss.item(), input.size(0))
+            top1.update(acc1[0], input.size(0))
+            top5.update(acc5[0], input.size(0))
 
-    return top1.avg
+            # measure elapsed time
+            if i >= args.warmup_iter:
+                batch_time.update(time.time() - start)
+
+            if i % args.print_freq == 0:
+                progress.print(i)
+
+            if args.iter > 0 and i >= (args.warmup_iter + args.iter - 1):
+                break
+
+        print('Batch size = %d' % args.batch_size)
+        print('Accuracy: {top1:.5f} Accuracy@5 {top5:.5f}'
+              .format(top1=(top1.avg / 100), top5=(top5.avg / 100)))
+
+    return top1.avg/100
 
 
 def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
@@ -444,18 +368,11 @@ def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
     if is_best:
         shutil.copyfile(filename, 'model_best.pth.tar')
 
-class Summary(Enum):
-    NONE = 0
-    AVERAGE = 1
-    SUM = 2
-    COUNT = 3
-
 class AverageMeter(object):
     """Computes and stores the average and current value"""
-    def __init__(self, name, fmt=':f', summary_type=Summary.AVERAGE):
+    def __init__(self, name, fmt=':f'):
         self.name = name
         self.fmt = fmt
-        self.summary_type = summary_type
         self.reset()
 
     def reset(self):
@@ -470,59 +387,35 @@ def update(self, val, n=1):
         self.count += n
         self.avg = self.sum / self.count
 
-    def all_reduce(self):
-        if torch.cuda.is_available():
-            device = torch.device("cuda")
-        elif torch.backends.mps.is_available():
-            device = torch.device("mps")
-        else:
-            device = torch.device("cpu")
-        total = torch.tensor([self.sum, self.count], dtype=torch.float32, device=device)
-        dist.all_reduce(total, dist.ReduceOp.SUM, async_op=False)
-        self.sum, self.count = total.tolist()
-        self.avg = self.sum / self.count
-
     def __str__(self):
         fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
         return fmtstr.format(**self.__dict__)
-    
-    def summary(self):
-        fmtstr = ''
-        if self.summary_type is Summary.NONE:
-            fmtstr = ''
-        elif self.summary_type is Summary.AVERAGE:
-            fmtstr = '{name} {avg:.3f}'
-        elif self.summary_type is Summary.SUM:
-            fmtstr = '{name} {sum:.3f}'
-        elif self.summary_type is Summary.COUNT:
-            fmtstr = '{name} {count:.3f}'
-        else:
-            raise ValueError('invalid summary type %r' % self.summary_type)
-        
-        return fmtstr.format(**self.__dict__)
 
 
 class ProgressMeter(object):
-    def __init__(self, num_batches, meters, prefix=""):
+    def __init__(self, num_batches, *meters, prefix=""):
         self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
         self.meters = meters
         self.prefix = prefix
 
-    def display(self, batch):
+    def print(self, batch):
         entries = [self.prefix + self.batch_fmtstr.format(batch)]
         entries += [str(meter) for meter in self.meters]
         print('\t'.join(entries))
-        
-    def display_summary(self):
-        entries = [" *"]
-        entries += [meter.summary() for meter in self.meters]
-        print(' '.join(entries))
 
     def _get_batch_fmtstr(self, num_batches):
         num_digits = len(str(num_batches // 1))
         fmt = '{:' + str(num_digits) + 'd}'
         return '[' + fmt + '/' + fmt.format(num_batches) + ']'
 
+
+def adjust_learning_rate(optimizer, epoch, args):
+    """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
+    lr = args.lr * (0.1 ** (epoch // 30))
+    for param_group in optimizer.param_groups:
+        param_group['lr'] = lr
+
+
 def accuracy(output, target, topk=(1,)):
     """Computes the accuracy over the k top predictions for the specified values of k"""
     with torch.no_grad():
@@ -541,4 +434,4 @@ def accuracy(output, target, topk=(1,)):
 
 
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()
diff --git a/examples/3.x_api/pytorch/cv/static_quant/run_benchmark.sh b/examples/3.x_api/pytorch/cv/static_quant/run_benchmark.sh
index 23e04edfcfc..6f6b69c35df 100644
--- a/examples/3.x_api/pytorch/cv/static_quant/run_benchmark.sh
+++ b/examples/3.x_api/pytorch/cv/static_quant/run_benchmark.sh
@@ -56,7 +56,7 @@ function run_benchmark {
     extra_cmd=''
 
     if [[ ${mode} == "accuracy" ]]; then
-        mode_cmd=" -e "
+        mode_cmd=" --accuracy "
     elif [[ ${mode} == "performance" ]]; then
         mode_cmd=" --performance --iters "${iters}
     else
@@ -76,10 +76,24 @@ function run_benchmark {
     fi
 
     if [[ ${mode} == "accuracy" ]]; then
-        python main.py -a ${model_name_or_path} ${dataset_location} -e -o ${tuned_checkpoint} ${extra_cmd} ${mode_cmd}
+        python main.py \
+                --pretrained \
+                -a resnet18 \
+                -b 30 \
+                --tuned_checkpoint ${tuned_checkpoint} \
+                ${dataset_location} \
+                ${extra_cmd} \
+                ${mode_cmd}
     elif [[ ${mode} == "performance" ]]; then
-        incbench --num_cores_per_instance 4 main.py -a ${model_name_or_path} \
-          ${dataset_location} -o ${tuned_checkpoint} ${extra_cmd} ${mode_cmd}
+        incbench --num_cores_per_instance 4 \
+                main.py \
+                --pretrained \
+                -a resnet18 \
+                -b 30 \
+                --tuned_checkpoint ${tuned_checkpoint} \
+                ${dataset_location} \
+                ${extra_cmd} \
+                ${mode_cmd}
     else
         echo "Error: No such mode: ${mode}"
         exit 1
diff --git a/examples/3.x_api/pytorch/cv/static_quant/run_quant.sh b/examples/3.x_api/pytorch/cv/static_quant/run_quant.sh
index 940e70175c6..1f4588e933c 100644
--- a/examples/3.x_api/pytorch/cv/static_quant/run_quant.sh
+++ b/examples/3.x_api/pytorch/cv/static_quant/run_quant.sh
@@ -10,6 +10,7 @@ function main {
 
 # init params
 function init_params {
+  tuned_checkpoint="saved_results"
   for var in "$@"
   do
     case $var in
@@ -38,9 +39,14 @@ function init_params {
 function run_tuning {
     if [ "${topology}" = "resnet18_pt2e_static" ]; then
         model_name_or_path="resnet18"
-        output_dir="saved_results"
     fi
-    python main.py -a ${model_name_or_path} ${dataset_location} -q -o ${output_dir}
+    python main.py \
+            --pretrained \
+            -t \
+            -a resnet18 \
+            -b 30 \
+            --tuned_checkpoint ${tuned_checkpoint} \
+            ${dataset_location}
 }
 
 main "$@"

From c628331c846b9aad8f9adcde46b9317b5871c98f Mon Sep 17 00:00:00 2001
From: Kaihui-intel <kaihui.tang@intel.com>
Date: Mon, 29 Jul 2024 08:54:45 +0800
Subject: [PATCH 15/17] fix cv fp32 acc

Signed-off-by: Kaihui-intel <kaihui.tang@intel.com>
---
 examples/3.x_api/pytorch/cv/static_quant/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/3.x_api/pytorch/cv/static_quant/main.py b/examples/3.x_api/pytorch/cv/static_quant/main.py
index 71b7a99aa18..e6273ec1314 100644
--- a/examples/3.x_api/pytorch/cv/static_quant/main.py
+++ b/examples/3.x_api/pytorch/cv/static_quant/main.py
@@ -227,12 +227,12 @@ def eval_func(model):
         return
     
     if args.performance or args.accuracy:
-        # model.eval()
         if args.int8:
             from neural_compressor.torch.quantization import load
             new_model = load(args.tuned_checkpoint)
         else:
             new_model = model
+            new_model.eval()
         if args.performance:
             benchmark(val_loader, new_model, args)
             return

From 61d9325c08c574d054bd9f9a8d630cedcdf4b7e7 Mon Sep 17 00:00:00 2001
From: Kaihui-intel <kaihui.tang@intel.com>
Date: Mon, 29 Jul 2024 13:58:18 +0800
Subject: [PATCH 16/17] enhance benchmark

Signed-off-by: Kaihui-intel <kaihui.tang@intel.com>
---
 .../3.x_api/pytorch/cv/static_quant/main.py   | 40 +++++++++----------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/examples/3.x_api/pytorch/cv/static_quant/main.py b/examples/3.x_api/pytorch/cv/static_quant/main.py
index e6273ec1314..3d7af7827e3 100644
--- a/examples/3.x_api/pytorch/cv/static_quant/main.py
+++ b/examples/3.x_api/pytorch/cv/static_quant/main.py
@@ -215,21 +215,22 @@ def eval_func(model):
                 prepared_model(images)
                 
         q_model = convert(prepared_model)
-        # Compile the quantized model and replace the Q/DQ pattern with Q-operator
-        from torch._inductor import config
-
-        config.freezing = True
-        opt_model = torch.compile(q_model)
-        model = opt_model
 
         if args.tuned_checkpoint:
-            model.save(example_inputs=example_inputs, output_dir = args.tuned_checkpoint)
+            q_model.save(example_inputs=example_inputs, output_dir = args.tuned_checkpoint)
         return
     
     if args.performance or args.accuracy:
         if args.int8:
             from neural_compressor.torch.quantization import load
-            new_model = load(args.tuned_checkpoint)
+            q_model = load(args.tuned_checkpoint)
+            
+            # Compile the quantized model and replace the Q/DQ pattern with Q-operator
+            from torch._inductor import config
+
+            config.freezing = True
+            opt_model = torch.compile(q_model)
+            new_model = opt_model
         else:
             new_model = model
             new_model.eval()
@@ -243,23 +244,21 @@ def eval_func(model):
 
 def benchmark(val_loader, model, args): 
 
-    total_iters = args.iters if args.iters < len(val_loader) else len(val_loader)
+    total_iters = args.iters
     warmup_iters = args.warmup_iter
-    with torch.no_grad():
-        
-        for i, (images, target) in enumerate(val_loader):
+    for i, (images, target) in enumerate(val_loader):
+        if args.gpu is not None and torch.cuda.is_available():
+            images = images.cuda(args.gpu, non_blocking=True)
+        if torch.backends.mps.is_available():
+            images = images.to('mps')
+        break
+    
+    with torch.no_grad():        
+        for i in range(total_iters):
             if i == total_iters:
                 break
             if i == warmup_iters:
                 start = time.time()
-
-            if args.gpu is not None and torch.cuda.is_available():
-                images = images.cuda(args.gpu, non_blocking=True)
-            if torch.backends.mps.is_available():
-                images = images.to('mps')
-                target = target.to('mps')
-            if torch.cuda.is_available():
-                target = target.cuda(args.gpu, non_blocking=True)
             
             # model inference
             model(images)
@@ -435,3 +434,4 @@ def accuracy(output, target, topk=(1,)):
 
 if __name__ == '__main__':
     main()
+

From c760cf1de3541be98ecf4cde5a233cabcb2193e7 Mon Sep 17 00:00:00 2001
From: Kaihui-intel <kaihui.tang@intel.com>
Date: Mon, 29 Jul 2024 14:03:32 +0800
Subject: [PATCH 17/17] fix nlp benchmark

Signed-off-by: Kaihui-intel <kaihui.tang@intel.com>
---
 .../static_quant/pt2e/run_clm_no_trainer.py   | 22 +++++++++----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py
index 7b4c9a46630..395bc6f9b57 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py
@@ -90,18 +90,10 @@ def get_example_inputs(tokenizer):
         prepare_model(*example_inputs)
     # convert
     converted_model = convert(prepare_model)
-    # inference
-    from torch._inductor import config
-
-    config.freezing = True
-    opt_model = torch.compile(converted_model)
-
-    opt_model.config = user_model.config # for lm eval
-    user_model = opt_model
-
+    
     # save
     if args.output_dir:
-        user_model.save(example_inputs=example_inputs, output_dir = args.output_dir)
+        converted_model.save(example_inputs=example_inputs, output_dir = args.output_dir)
 
 
 
@@ -112,7 +104,15 @@ def get_example_inputs(tokenizer):
         model = load(args.output_dir)
 
         model.config = user_model.config # for lm eval
-        user_model = model
+        
+        # Compile the quantized model and replace the Q/DQ pattern with Q-operator
+        from torch._inductor import config
+
+        config.freezing = True
+        opt_model = torch.compile(model)
+
+        opt_model.config = user_model.config # for lm eval
+        user_model = opt_model
 
 if args.accuracy: