
Commit 2bb257e

Add woq examples (#1982)
Signed-off-by: Kaihui-intel <kaihui.tang@intel.com>
Signed-off-by: Sun, Xuehao <xuehao.sun@intel.com>
Co-authored-by: Sun, Xuehao <xuehao.sun@intel.com>
1 parent: 586eb88

File tree

5 files changed: +309 -45 lines

examples/.config/model_params_pytorch_3x.json

Lines changed: 28 additions & 0 deletions

@@ -84,6 +84,34 @@
         "main_script": "run_clm_no_trainer.py",
         "batch_size": 8
     },
+    "gpt_j_woq_awq_int4":{
+        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+        "dataset_location": "",
+        "input_model": "",
+        "main_script": "run_clm_no_trainer.py",
+        "batch_size": 1
+    },
+    "opt_125m_woq_awq_int4":{
+        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+        "dataset_location": "",
+        "input_model": "",
+        "main_script": "run_clm_no_trainer.py",
+        "batch_size": 1
+    },
+    "opt_125m_woq_autoround_int4":{
+        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+        "dataset_location": "",
+        "input_model": "",
+        "main_script": "run_clm_no_trainer.py",
+        "batch_size": 1
+    },
+    "opt_125m_woq_autotune_int4":{
+        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+        "dataset_location": "",
+        "input_model": "",
+        "main_script": "run_clm_no_trainer.py",
+        "batch_size": 1
+    },
     "gpt_j_ipex":{
         "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex",
         "dataset_location": "",

examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/README.md

Lines changed: 61 additions & 12 deletions

@@ -35,9 +35,8 @@ python run_clm_no_trainer.py \
     --woq_group_size 128 \
     --gptq_max_seq_length 2048 \
     --gptq_use_max_length \
-    --accuracy \
-    --tasks "lambada_openai" \
-    --double_quant_type "BNB_NF4"
+    --double_quant_type "BNB_NF4" \
+    --output_dir saved_results

 # "--woq_algo RTN" is used to enable RTN algorithms
 python run_clm_no_trainer.py \
@@ -48,9 +47,38 @@ python run_clm_no_trainer.py \
     --woq_bits 4 \
     --woq_scheme asym \
     --woq_group_size 128 \
+    --double_quant_type "BNB_NF4" \
+    --output_dir saved_results
+
+# "--woq_algo AWQ" is used to enable AWQ algorithms
+python run_clm_no_trainer.py \
+    --model EleutherAI/gpt-j-6B \
+    --dataset NeelNanda/pile-10k \
+    --quantize \
+    --woq_algo AWQ \
+    --woq_bits 4 \
+    --woq_scheme asym \
+    --woq_group_size 128 \
+    --calib_iters 128
+
+# "--woq_algo AutoRound" is used to enable AutoRound algorithms
+python run_clm_no_trainer.py \
+    --model EleutherAI/gpt-j-6B \
+    --dataset NeelNanda/pile-10k \
+    --quantize \
+    --woq_algo AutoRound \
+    --woq_bits 4 \
+    --woq_scheme asym \
+    --woq_group_size 128
+
+# "--accuracy" for eval
+python run_clm_no_trainer.py \
+    --model EleutherAI/gpt-j-6B \
+    --dataset NeelNanda/pile-10k \
+    --int8 \
     --accuracy \
     --tasks "lambada_openai" \
-    --double_quant_type "BNB_NF4"
+    --output_dir saved_results
 ```

 **Notes**: Weight-only quantization based on fake quantization is supported as a preview feature and currently covers the RTN, GPTQ[1], AWQ[2], and TEQ algorithms. For more details, please refer to [link](https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md). Our GPTQ API supports various CLMs, including GPTJ, OPTs, Blooms, Llamas, Falcons, MPTs, ChatGLMs, etc. Simply replace the "--model" argument with other models to quantize different CLMs with GPTQ.
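The recurring change in these hunks decouples quantization from evaluation: a quantize run now saves the model via `--output_dir`, and a separate `--int8 --accuracy` run reloads it from the same directory. A sketch of the resulting two-step flow for GPT-J, composed from the RTN and eval commands as they read after this change:

```bash
# 1) Quantize GPT-J with RTN and save the result to saved_results/
python run_clm_no_trainer.py \
    --model EleutherAI/gpt-j-6B \
    --dataset NeelNanda/pile-10k \
    --quantize \
    --woq_algo RTN \
    --woq_bits 4 \
    --woq_scheme asym \
    --woq_group_size 128 \
    --double_quant_type "BNB_NF4" \
    --output_dir saved_results

# 2) Reload the saved quantized model and evaluate it on lambada_openai
python run_clm_no_trainer.py \
    --model EleutherAI/gpt-j-6B \
    --dataset NeelNanda/pile-10k \
    --int8 \
    --accuracy \
    --tasks "lambada_openai" \
    --output_dir saved_results
```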

@@ -72,8 +100,6 @@ python run_clm_no_trainer.py \
     --woq_group_size 128 \
     --gptq_max_seq_length 2048 \
     --gptq_use_max_length \
-    --accuracy \
-    --tasks "lambada_openai" \
     --double_quant_type "BNB_NF4"

 # "--woq_algo RTN" is used to enable RTN algorithms
@@ -85,13 +111,40 @@ python run_clm_no_trainer.py \
     --woq_bits 4 \
     --woq_scheme asym \
     --woq_group_size 128 \
+    --double_quant_type "BNB_NF4"
+
+# "--woq_algo AWQ" is used to enable AWQ algorithms
+python run_clm_no_trainer.py \
+    --model facebook/opt-125m \
+    --dataset NeelNanda/pile-10k \
+    --quantize \
+    --woq_algo AWQ \
+    --woq_bits 4 \
+    --woq_scheme asym \
+    --woq_group_size 128 \
+    --calib_iters 128
+
+# "--woq_algo AutoRound" is used to enable AutoRound algorithms
+python run_clm_no_trainer.py \
+    --model facebook/opt-125m \
+    --dataset NeelNanda/pile-10k \
+    --quantize \
+    --woq_algo AutoRound \
+    --woq_bits 4 \
+    --woq_scheme asym \
+    --woq_group_size 128
+
+# "--accuracy" for eval
+python run_clm_no_trainer.py \
+    --model facebook/opt-125m \
+    --dataset NeelNanda/pile-10k \
+    --int8 \
     --accuracy \
     --tasks "lambada_openai" \
-    --double_quant_type "BNB_NF4"
+    --output_dir saved_results
 ```

 ### LLAMA2-7b/13b/70b
->Note: LLAMA requires IPEX requirements >= 2.1 to get better accuracy.
 #### Quantization

 ```bash
@@ -107,8 +160,6 @@ python run_clm_no_trainer.py \
     --woq_group_size 128 \
     --gptq_max_seq_length 2048 \
     --gptq_use_max_length \
-    --accuracy \
-    --tasks "lambada_openai" \
     --double_quant_type "BNB_NF4"

 # "--woq_algo RTN" is used to enable RTN algorithms
@@ -120,8 +171,6 @@ python run_clm_no_trainer.py \
     --woq_bits 4 \
     --woq_scheme asym \
     --woq_group_size 128 \
-    --accuracy \
-    --tasks "lambada_openai" \
     --double_quant_type "BNB_NF4"
 ```

examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh

Lines changed: 30 additions & 29 deletions

@@ -70,58 +70,59 @@ function run_benchmark {
     fi
     echo $extra_cmd

-    if [ "${topology}" = "opt_125m_woq_gptq_int4" ]; then
+    if [ "${topology}" = "opt_125m_woq_gptq_int4" ]; then
         model_name_or_path="facebook/opt-125m"
-        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
     elif [ "${topology}" = "opt_125m_woq_gptq_int4_dq_bnb" ]; then
         model_name_or_path="facebook/opt-125m"
-        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
-        extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
     elif [ "${topology}" = "opt_125m_woq_gptq_int4_dq_ggml" ]; then
         model_name_or_path="facebook/opt-125m"
-        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length --gptq_percdamp 0.1 --gptq_actorder"
-        extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
     elif [ "${topology}" = "llama2_7b_gptq_int4" ]; then
         model_name_or_path="meta-llama/Llama-2-7b-hf"
-        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
     elif [ "${topology}" = "llama2_7b_gptq_int4_dq_bnb" ]; then
         model_name_or_path="meta-llama/Llama-2-7b-hf"
-        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
-        extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
     elif [ "${topology}" = "llama2_7b_gptq_int4_dq_ggml" ]; then
         model_name_or_path="meta-llama/Llama-2-7b-hf"
-        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
-        extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
     elif [ "${topology}" = "gpt_j_woq_rtn_int4" ]; then
         model_name_or_path="EleutherAI/gpt-j-6b"
-        extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
     elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_bnb" ]; then
-        model_name_or_path="EleutherAI/gpt-j-6b"\
-        extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
-        extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
+        model_name_or_path="EleutherAI/gpt-j-6b"
     elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_ggml" ]; then
-        model_name_or_path="EleutherAI/gpt-j-6b"\
-        extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
-        extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
+        model_name_or_path="EleutherAI/gpt-j-6b"
     elif [ "${topology}" = "gpt_j_woq_gptq_int4" ]; then
         model_name_or_path="EleutherAI/gpt-j-6b"
-        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
     elif [ "${topology}" = "gpt_j_woq_gptq_int4_dq_bnb" ]; then
         model_name_or_path="EleutherAI/gpt-j-6b"
-        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
-        extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
     elif [ "${topology}" = "gpt_j_woq_gptq_int4_dq_ggml" ]; then
         model_name_or_path="EleutherAI/gpt-j-6b"
-        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
-        extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
+    elif [ "${topology}" = "gpt_j_woq_awq_int4" ]; then
+        model_name_or_path="EleutherAI/gpt-j-6b"
+    elif [ "${topology}" = "opt_125m_woq_awq_int4" ]; then
+        model_name_or_path="facebook/opt-125m"
+    elif [ "${topology}" = "opt_125m_woq_autoround_int4" ]; then
+        model_name_or_path="facebook/opt-125m"
+        extra_cmd=$extra_cmd" --woq_algo AutoRound"
+    elif [ "${topology}" = "opt_125m_woq_autotune_int4" ]; then
+        model_name_or_path="facebook/opt-125m"
     fi

-    python -u run_clm_no_trainer.py \
-        --model ${model_name_or_path} \
-        --output_dir ${tuned_checkpoint} \
-        --task ${task} \
-        --batch_size ${batch_size} \
-        ${extra_cmd} ${mode_cmd}
+    if [[ ${mode} == "accuracy" ]]; then
+        python -u run_clm_no_trainer.py \
+            --model ${model_name_or_path} \
+            --output_dir ${tuned_checkpoint} \
+            --task ${task} \
+            --batch_size ${batch_size} \
+            ${extra_cmd} ${mode_cmd}
+    elif [[ ${mode} == "performance" ]]; then
+        incbench --num_cores_per_instance 4 run_clm_no_trainer.py \
+            --model ${model_name_or_path} \
+            --batch_size ${batch_size} \
+            --output_dir ${tuned_checkpoint} \
+            ${extra_cmd} ${mode_cmd}
+    else
+        echo "Error: No such mode: ${mode}"
+        exit 1
+    fi
+
 }

 main "$@"
