From 24b3bbdef590f9f5329005c1cc83315d7ee0ebcc Mon Sep 17 00:00:00 2001
From: Guanghua Yu <742925032@qq.com>
Date: Thu, 4 Aug 2022 17:52:40 +0800
Subject: [PATCH] [cherry-pick] fix QuantizeLinear pass and support reduce_max
 in quantization (#44872)

* fix QuantizeLinear kernel and pass in QAT (#44784)

* Add Reduce Max in Quant (#44825)

Co-authored-by: Chang Xu
---
 paddle/fluid/operators/fake_quantize_op.h     |  3 +-
 paddle/fluid/operators/quantize_linear_op.cc  | 29 +++++-
 paddle/fluid/operators/quantize_linear_op.h   | 27 +++++-
 .../post_training_quantization.py             | 18 ++--
 .../slim/quantization/quantization_pass.py    | 88 +++++++++++--------
 .../fluid/contrib/slim/quantization/utils.py  |  2 +
 .../tests/unittests/test_fake_quantize_op.py  | 37 ++++++--
 7 files changed, 148 insertions(+), 56 deletions(-)

diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h
index 6931ac4325b7b..9d0527d710412 100644
--- a/paddle/fluid/operators/fake_quantize_op.h
+++ b/paddle/fluid/operators/fake_quantize_op.h
@@ -139,7 +139,8 @@ struct FindMovingAverageAbsMaxFunctor {
   void operator()(const DeviceContext &ctx,
                   const framework::Tensor &in_accum,
                   const framework::Tensor &in_state,
-                  const framework::Tensor &cur_scale,
+                  const T *cur_scale,
+                  const float rate,
                   framework::Tensor *out_state,
                   framework::Tensor *out_accum,
                   framework::Tensor *out_scale);
diff --git a/paddle/fluid/operators/quantize_linear_op.cc b/paddle/fluid/operators/quantize_linear_op.cc
index 4580acbe3fc83..98ae8abe083bc 100644
--- a/paddle/fluid/operators/quantize_linear_op.cc
+++ b/paddle/fluid/operators/quantize_linear_op.cc
@@ -93,6 +93,12 @@ class QuantizeLinearOp : public framework::OperatorWithKernel {
         ctx->SetOutputDim("OutScale", {ctx->GetInputDim("X")[quant_axis]});
       }
     }
+    if (ctx->HasOutput("OutState")) {
+      ctx->SetOutputDim("OutState", {1});
+    }
+    if (ctx->HasOutput("OutAccum")) {
+      ctx->SetOutputDim("OutAccum", {1});
+    }
     ctx->ShareLoD("X", /*->*/ "Y");
   }
 
@@ -113,7 +119,25 @@ class QuantizeLinearOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Y",
               "(Tensor) Output of quantized low level tensor, "
               "but also saved as float data type.");
-    AddOutput("OutScale", "(Tensor) Current scale").AsDispensable().AsExtra();
+    AddInput("InAccum", "Last accum.")
+        .AsDispensable()
+        .AsExtra();  // only qat use
+    AddInput("InState", "Last state.")
+        .AsDispensable()
+        .AsExtra();  // only qat use
+    AddOutput("OutState", "(Tensor) state buffer.")
+        .AsDispensable()
+        .AsExtra();  // only qat use
+    AddOutput("OutAccum", "(Tensor) accum buffer.")
+        .AsDispensable()
+        .AsExtra();  // only qat use
+    AddOutput("OutScale", "(Tensor) Current scale")
+        .AsDispensable()
+        .AsExtra();  // only qat use
+    AddAttr<float>("moving_rate",
+                   "(float, default 0.9) moving rate.")  // only qat use
+        .SetDefault(0.9)
+        .AsExtra();
     AddAttr<int>("quant_axis",
                  "(int, default 0) The axis for quantization. "
                  "For conv2d, depthwise_conv2d, conv2d_transpose "
@@ -154,8 +178,7 @@ class QuantizeLinearOpMaker : public framework::OpProtoAndCheckerMaker {
                   "nearest ties to even and 1 is rounding to nearest "
                   "ties away from zero.but the received is %d",
                   round_type));
-        })
-        .AsExtra();
+        });
     AddAttr<bool>("is_test",
                   "(bool, default false) Set to true for inference only, false "
                   "for training. Some layers may run faster when this is true.")
diff --git a/paddle/fluid/operators/quantize_linear_op.h b/paddle/fluid/operators/quantize_linear_op.h
index 47e65784b6b57..278e347aaceda 100644
--- a/paddle/fluid/operators/quantize_linear_op.h
+++ b/paddle/fluid/operators/quantize_linear_op.h
@@ -56,10 +56,31 @@ class QuantizeLinearKernel : public framework::OpKernel<T> {
 
     if (quant_axis < 0) {
       if (!is_test) {
-        auto* out_scale = context.Output<framework::Tensor>("OutScale");
-        T* out_s = out_scale->mutable_data<T>(context.GetPlace());
+        // training
+        auto* in_accum = context.Input<framework::Tensor>("InAccum");
+        auto* in_state = context.Input<framework::Tensor>("InState");
+        auto cur_scale = memory::Alloc(dev_ctx, sizeof(T));
+        T* cur_scale_data = static_cast<T*>(cur_scale->ptr());
+
         FindAbsMaxFunctor<DeviceContext, T>()(
-            dev_ctx, in->data<T>(), in->numel(), out_s);
+            dev_ctx, in->data<T>(), in->numel(), cur_scale_data);
+
+        auto* out_state = context.Output<framework::Tensor>("OutState");
+        auto* out_accum = context.Output<framework::Tensor>("OutAccum");
+        auto* out_scale = context.Output<framework::Tensor>("OutScale");
+        out_state->mutable_data<T>(context.GetPlace());
+        out_accum->mutable_data<T>(context.GetPlace());
+        out_scale->mutable_data<T>(context.GetPlace());
+        float moving_rate = context.Attr<float>("moving_rate");
+
+        FindMovingAverageAbsMaxFunctor<DeviceContext, T>()(dev_ctx,
+                                                           *in_accum,
+                                                           *in_state,
+                                                           cur_scale_data,
+                                                           moving_rate,
+                                                           out_state,
+                                                           out_accum,
+                                                           out_scale);
         ClipAndFakeQuantFunctor<DeviceContext, T>()(
             dev_ctx, *in, *out_scale, bin_cnt, round_type, out);
       } else {
diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
index f1da3990a36be..bfd76a44b4dcc 100644
--- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
+++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
@@ -418,8 +418,7 @@ def quantize(self):
         self._update_program()
 
         # save out_threshold for quantized ops.
-        if not self._onnx_format:
-            self._save_output_threshold()
+        self._save_output_threshold()
 
         if any(op_type in self._quantizable_op_type
                for op_type in self._dynamic_quantize_op_type):
@@ -996,16 +995,23 @@ def _save_output_threshold(self):
         '''
         Save output threshold to the quantized op.
         '''
+        self._calibration_scales = {}
 
         def save_info(op_node, out_var_name, threshold_map, out_info_name,
                       quantized_type):
             assert out_var_name in threshold_map, \
                 "The output ({}) of {} node does not have threshold.".format(
                 out_var_name, op_node.type)
-            op_node._set_attr(out_info_name, threshold_map[var_name])
-            op_node._set_attr("with_quant_attr", True)
-            if op_node.type in self._quantizable_op_type:
-                op._set_attr("quantization_type", quantized_type)
+            if self._onnx_format:
+                # For easy extension, every var_node set a dict to save parameters of quant.
+                self._calibration_scales[var_name] = {}
+                self._calibration_scales[var_name]['scale'] = threshold_map[
+                    var_name]
+            else:
+                op_node._set_attr(out_info_name, threshold_map[var_name])
+                op_node._set_attr("with_quant_attr", True)
+                if op_node.type in self._quantizable_op_type:
+                    op._set_attr("quantization_type", quantized_type)
 
         def analysis_and_save_info(op_node, out_var_name):
             argname_index = utils._get_output_name_index(op_node, out_var_name)
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
index 3a316e9192e39..5abb1d382b324 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
@@ -1785,6 +1785,7 @@ class InsertQuantizeLinear(object):
             equal to 0, it will quantization with per channel, else quantization with per layer.
             Default is -1.
         channel_wise(bool, optional): Whether quantization with per channel or not. Default is False.
+        moving_rate(float): the rate for 'moving average' method.
         is_test(bool, optional): Whether quantization with training or not. Default is True.
     """
 
@@ -1794,6 +1795,7 @@ def __init__(self,
                  quant_bits=8,
                  quant_axis=-1,
                  channel_wise=False,
+                 moving_rate=0.9,
                  is_test=True):
         self._place = place
         self._scope = scope
@@ -1801,15 +1803,16 @@ def __init__(self,
         self.quant_axis = quant_axis
         self.channel_wise = channel_wise
         self._is_test = is_test
+        self._moving_rate = moving_rate
 
-    def insert_quant_op(self, graph, var_node):
+    def insert_quant_op(self, graph, var_node, var_name=None):
         assert var_node.is_var(), '{} is not a var'.format(var_node.name())
-
-        quant_var_node = graph.create_var_node(name=self._quantized_var_name(
-            var_node.name()),
-                                               var_type=var_node.type(),
-                                               shape=var_node.shape(),
-                                               var_dtype=var_node.dtype())
+        var_name = var_node.name() if not var_name else var_name
+        quant_var_node = graph.create_var_node(
+            name=self._quantized_var_name(var_name),
+            var_type=var_node.type(),
+            shape=var_node.shape(),
+            var_dtype=var_node.dtype())
         data_type = 'float64' if var_node.dtype(
         ) == core.VarDesc.VarType.FP64 else 'float32'
         if self.channel_wise:
@@ -1821,7 +1824,7 @@ def insert_quant_op(self, graph, var_node):
             scale_var_type = var_node.type()
             init_scale_value = np.array([_SCALE_DEFAULT_VALUE], dtype=data_type)
         scale_var_node = graph.create_persistable_node(
-            name=self._quantized_scale_name(var_node.name()),
+            name=self._quantized_scale_name(var_name),
             var_type=scale_var_type,
             shape=[scale_var_shape],
             var_dtype=var_node.dtype())
@@ -1844,13 +1847,39 @@ def insert_quant_op(self, graph, var_node):
             inputs["ZeroPoint"] = zero_point_node
 
         attrs = {"quant_axis": self.quant_axis, "bit_length": self.quant_bits}
+        attrs["op_role"] = core.op_proto_and_checker_maker.OpRole.Forward
         outputs = {"Y": quant_var_node}
         if not self._is_test:
-            attrs["is_test"] = self._is_test
-            attrs["op_role"] = core.op_proto_and_checker_maker.OpRole.Forward
             scale_out_node = graph.create_var_node_from_desc(
                 scale_var_node.var())
+            state_in_node = graph.create_persistable_node(
+                name=unique_name.generate('state'),
+                var_type=core.VarDesc.VarType.LOD_TENSOR,
+                var_dtype=var_node.dtype(),
+                shape=[1])
+            data_type = 'float64' if var_node.dtype(
+            ) == core.VarDesc.VarType.FP64 else 'float32'
+            _init_var_node(state_in_node, np.ones([1], dtype=data_type),
+                           self._scope, self._place)
+            accum_in_node = graph.create_persistable_node(
+                name=unique_name.generate('accum'),
+                var_type=core.VarDesc.VarType.LOD_TENSOR,
+                var_dtype=var_node.dtype(),
+                shape=[1])
+            _init_var_node(accum_in_node, np.ones([1], dtype=data_type),
+                           self._scope, self._place)
+            state_out_node = graph.create_var_node_from_desc(
+                state_in_node.var())
+            accum_out_node = graph.create_var_node_from_desc(
+                accum_in_node.var())
+
             outputs["OutScale"] = scale_out_node
+            inputs['InState'] = state_in_node
+            inputs['InAccum'] = accum_in_node
+            outputs['OutState'] = state_out_node
+            outputs['OutAccum'] = accum_out_node
+            attrs["is_test"] = self._is_test
+            attrs['moving_rate'] = self._moving_rate
 
         quant_op_node = graph.create_op_node(op_type="quantize_linear",
                                              attrs=attrs,
@@ -1863,6 +1892,10 @@ def insert_quant_op(self, graph, var_node):
         graph.link_to(zero_point_node, quant_op_node)
         graph.link_to(quant_op_node, quant_var_node)
         if not self._is_test:
+            graph.link_to(state_in_node, quant_op_node)
+            graph.link_to(accum_in_node, quant_op_node)
+            graph.link_to(quant_op_node, state_out_node)
+            graph.link_to(quant_op_node, accum_out_node)
             graph.link_to(quant_op_node, scale_out_node)
         return quant_var_node, scale_var_node
 
@@ -1891,8 +1924,7 @@ def insert_dequant_op(self, graph, var_node, scale_var_node):
             inputs["ZeroPoint"] = zero_point_node
 
         attrs = {"quant_axis": self.quant_axis, "bit_length": self.quant_bits}
-        if not self._is_test:
-            attrs["op_role"] = core.op_proto_and_checker_maker.OpRole.Forward
+        attrs["op_role"] = core.op_proto_and_checker_maker.OpRole.Forward
 
         quant_op_node = graph.create_op_node(op_type="dequantize_linear",
                                              attrs=attrs,
@@ -1931,10 +1963,10 @@ def _zero_point_name(self, var_name):
         return "%s@zero_point" % (var_name)
 
 
-class QuantizationTransformPassV2(object):
+class QuantizationTransformPassV2(QuantizationTransformPass):
     """
     Quantize the ops that have weights. Add quant and dequant ops for
-    the quantized ops's inputs.
+    the quantized ops's inputs. It is used in the new format of quantization.
     """
 
     def __init__(self,
@@ -2130,13 +2162,13 @@ def _transform_forward(self, graph, op):
                 if is_weight and self._weight_quantize_func is not None:
                     target_out_node = self._insert_func(
                         graph, self._weight_quantize_func, var_node, op)
-                    processed_vars.append(name)
+                    self.processed_vars.append(name)
                     continue
                 elif not is_weight and self._act_quantize_func is not None:
                     target_out_node = self._insert_func(graph,
                                                         self._act_quantize_func,
                                                         var_node, op)
-                    processed_vars.append(name)
+                    self.processed_vars.append(name)
                     continue
 
                 quant_bits = self._weight_bits if var_node.name() in self.persistable_vars \
@@ -2155,9 +2187,10 @@ def _transform_forward(self, graph, op):
                     quant_bits=quant_bits,
                     quant_axis=quant_axis,
                     channel_wise=channel_wise,
+                    moving_rate=self._moving_rate,
                     is_test=self._is_test)
                 quant_var_node, scale_var_node = insert_quant_pass.insert_quant_op(
-                    graph, var_node)
+                    graph, var_node, var_name=name)
                 dequant_var_node = insert_quant_pass.insert_dequant_op(
                     graph, quant_var_node, scale_var_node)
 
@@ -2182,24 +2215,6 @@ def _has_weight(self, op):
                 has_weight = True
         return has_weight
 
-    def _is_skip_quant(self, graph, op_node):
-        """
-        Analyse whether the op node skips quantization.
-        """
-        is_skip = False
-        if op_node.op().has_attr("skip_quant") and \
-           op_node.op().attr("skip_quant"):
-            is_skip = True
-        # if the inputs of mul and matmul are not all persistable, use
-        # AddQuantDequantPassV2 to quantize them.
-        if op_node.name() in ["mul", "matmul", "matmul_v2"] and \
-           _is_input_all_not_persistable(graph, op_node):
-            is_skip = True
-        if op_node.op().has_attr("quantization_type") and \
-           op_node.op().attr("quantization_type") == "qat_without_weight":
-            is_skip = True
-        return is_skip
-
     def apply(self, graph):
         """
         Quantize the graph for training process. According to weight and
@@ -2250,7 +2265,7 @@
 class AddQuantDequantPassV2(object):
     """
     Quantize the ops that do not have weights, and add quant_linear and dequant_linear
-    op for the quantized ops's inputs.
+    op for the quantized ops's inputs. It is used in the new format of quantization.
     """
 
     # To be compatible with PaddleSlim, not remove _activation_type for now
@@ -2377,6 +2392,7 @@ def apply(self, graph):
                             quant_bits=self._quant_bits,
                             quant_axis=-1,
                             channel_wise=False,
+                            moving_rate=self._moving_rate,
                             is_test=self._is_test)
                         quant_var_node, scale_var_node = insert_quant_pass.insert_quant_op(
                             graph, in_node)
diff --git a/python/paddle/fluid/contrib/slim/quantization/utils.py b/python/paddle/fluid/contrib/slim/quantization/utils.py
index 28efcd2d59157..e7187018c8d5b 100644
--- a/python/paddle/fluid/contrib/slim/quantization/utils.py
+++ b/python/paddle/fluid/contrib/slim/quantization/utils.py
@@ -109,6 +109,7 @@
     "square",
     "softplus",
     "shuffle_channel",
+    "reduce_max",
 ]
 
 _out_scale_op_list = list(
@@ -213,6 +214,7 @@
     "square": [["X"], ["Out"]],
     "softplus": [["X"], ["Out"]],
     "shuffle_channel": [["X"], ["Out"]],
+    "reduce_max": [["X"], ["Out"]],
 }
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
index 02fff35fec71f..71415128be658 100644
--- a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
@@ -550,18 +550,41 @@ def set_args(self):
     def setUp(self):
         self.set_args()
         self.op_type = "quantize_linear"
-        x = np.random.randn(31, 65).astype(self.data_type)
-        yq, scale = quantize_max_abs(x, self.max_range)
-        scale = np.array(scale).astype(self.data_type)
-        zero_point = np.zeros(scale.shape, dtype="int32")
-
-        self.inputs = {'X': x, 'Scale': scale, 'ZeroPoint': zero_point}
         self.attrs = {
             'bit_length': self.bit_length,
             'quant_axis': self.quant_axis,
+            'moving_rate': 0.9,
             'is_test': self.is_test
         }
-        self.outputs = {'Y': yq, 'OutScale': scale}
+
+        x = np.random.randn(31, 65).astype(self.data_type)
+        scale = np.array([0.001]).astype(self.data_type)
+        zero_point = np.zeros(scale.shape, dtype="int32")
+        in_accum = np.ones(1).astype(self.data_type)
+        in_state = np.ones(1).astype(self.data_type)
+        out_accum = np.zeros(1).astype(self.data_type)
+        out_state = np.zeros(1).astype(self.data_type)
+        out_accum[0] = self.attrs['moving_rate'] * in_accum[0] + np.max(
+            np.abs(x))
+        out_state[0] = self.attrs['moving_rate'] * in_state[0] + 1.0
+        out_scale = out_accum / out_state
+
+        round_out = np.round(x / out_scale * self.max_range)
+        quant_data = np.clip(round_out, -self.max_range - 1, self.max_range)
+
+        self.inputs = {
+            'X': x,
+            'Scale': scale,
+            'ZeroPoint': zero_point,
+            'InAccum': in_accum,
+            'InState': in_state,
+        }
+        self.outputs = {
+            'Y': quant_data,
+            'OutScale': out_scale,
+            'OutAccum': out_accum,
+            'OutState': out_state,
+        }
 
     def test_check_output(self):
         self.check_output()
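
For reviewers who want to sanity-check the new training branch: the moving-average abs-max update that `quantize_linear` now performs, and that the updated `setUp` in `test_fake_quantize_op.py` reproduces, reduces to a few lines of NumPy. The sketch below is illustrative only; the helper name and signature are mine and are not part of the patch.

```python
import numpy as np


def moving_average_abs_max_quant(x, in_accum, in_state, moving_rate=0.9, bit_length=8):
    # Decayed running sum of per-batch abs-max values (OutAccum).
    out_accum = moving_rate * in_accum + np.max(np.abs(x))
    # Decayed count of how many batches have contributed so far (OutState).
    out_state = moving_rate * in_state + 1.0
    # The moving-average scale (OutScale) is their ratio.
    out_scale = out_accum / out_state
    # Fake-quantize X with the new scale, matching what the unit test expects for Y.
    max_range = (1 << (bit_length - 1)) - 1  # 127 for 8 bits
    y = np.clip(np.round(x / out_scale * max_range), -max_range - 1, max_range)
    return y, out_scale, out_accum, out_state
```

With `in_accum = in_state = 1.0` (how the pass initializes the persistable `state`/`accum` nodes), one call of this helper produces exactly the `Y`, `OutScale`, `OutAccum`, and `OutState` reference values built in the test above.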