[MKLDNN] add quantized sum (#14614)
* add quantized sum

* fix gpu compiler error and cpu testcase fail

* add default forward function for quantized_sum

* skip quantized_sum for gpu ctx

* fix comments

* fix indentation and comments

* retrigger CI

* alloc memory through TmpMemMgr

* fix comments (Apr. 12)

* change sum to elemwise_add

* change Sum to ElemwiseAdd

* fix indents

* retrigger CI

* trigger CI

* fix indentation and typo

* trigger CI

* fix typo

* fix typo

* remove USE_MKLDNN macro for requantize params

* rename param same as its op

* trigger CI

* trigger CI

* trigger CI
rongzha1 authored and pengzhao-intel committed Apr 30, 2019
1 parent cdd7087 commit 84c1635
Showing 8 changed files with 518 additions and 28 deletions.
206 changes: 206 additions & 0 deletions src/operator/quantization/mkldnn/mkldnn_quantized_elemwise_add.cc
@@ -0,0 +1,206 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

/*!
* Copyright (c) 2019 by Contributors
* \file mkldnn_quantized_elemwise_add.cc
* \brief
*/

#if MXNET_USE_MKLDNN == 1
#include "../quantized_elemwise_add-inl.h"
#include "../../nn/mkldnn/mkldnn_ops-inl.h"
#include "../../nn/mkldnn/mkldnn_base-inl.h"
#include "../quantization_utils.h"

namespace mxnet {
namespace op {

DMLC_REGISTER_PARAMETER(QuantizeElemwiseAddParam);

static inline float GetScale(const NDArray& data, float min, float max) {
  auto data_range = (data.dtype() == mshadow::kInt8) ? kInt8Range : kUint8Range;
  return data_range / MaxAbs(min, max);
}
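
For intuition, here is a minimal NumPy sketch of the same computation (get_scale is a hypothetical helper mirroring GetScale; kInt8Range and kUint8Range correspond to 127 and 255 as defined in quantization_utils.h):

import numpy as np

def get_scale(dtype, data_min, data_max):
    # Range of the integer dtype divided by the largest absolute
    # float value the tensor may hold, as in GetScale above.
    data_range = 127.0 if dtype == np.int8 else 255.0
    return data_range / max(abs(data_min), abs(data_max))

# An int8 tensor calibrated to [-0.5, 2.0] maps 2.0 onto 127:
print(get_scale(np.int8, -0.5, 2.0))  # 63.5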

static void MKLDNNQuantizedElemwiseAddForward(const nnvm::NodeAttrs& attrs, const OpContext& ctx,
                                              const std::vector<NDArray>& in_data,
                                              const std::vector<OpReqType>& req,
                                              const std::vector<NDArray>& out_data) {
  const QuantizeElemwiseAddParam& params = nnvm::get<QuantizeElemwiseAddParam>(attrs.parsed);
  // A, B, A_min, A_max, B_min, B_max
  CHECK_EQ(in_data.size(), 6U) << "should be A, B, A_min, A_max, B_min, B_max";
  // C, C_min, C_max
  CHECK_EQ(out_data.size(), 3U) << "should be C, C_min, C_max";
  // Collect data min, max and absmax
  const float dataA_min = in_data[quantized_elemwise_add_enum::kAMin].data().dptr<float>()[0];
  const float dataB_min = in_data[quantized_elemwise_add_enum::kBMin].data().dptr<float>()[0];
  const float dataA_max = in_data[quantized_elemwise_add_enum::kAMax].data().dptr<float>()[0];
  const float dataB_max = in_data[quantized_elemwise_add_enum::kBMax].data().dptr<float>()[0];
  const float dataA_absmax = MaxAbs(dataA_min, dataA_max);
  const float dataB_absmax = MaxAbs(dataB_min, dataB_max);

  auto dataA_mem = in_data[quantized_elemwise_add_enum::kDataA].GetMKLDNNData();
  auto dataB_mem = in_data[quantized_elemwise_add_enum::kDataB].GetMKLDNNData();
  const bool is_dataA_int8 = (in_data[quantized_elemwise_add_enum::kDataA].dtype()
                              == mshadow::kInt8);
  const size_t dataA_range = is_dataA_int8 ? kInt8Range : kUint8Range;

  const float A_scale = GetScale(in_data[quantized_elemwise_add_enum::kDataA],
                                 dataA_min,
                                 dataA_max);
  const float B_scale = GetScale(in_data[quantized_elemwise_add_enum::kDataB],
                                 dataB_min,
                                 dataB_max);
  // rescaled_mem holds the reordered mkldnn memory when the input data types differ
  mkldnn::memory *rescaled_mem;

  // output dtype defaults to int32
  size_t output_data_range = kInt32Range;
  auto output_data_type = mkldnn::memory::s32;
  // int8/uint8 output only occurs when a requantize op has been fused in
  if (out_data[quantized_elemwise_add_enum::kOut].dtype() == mshadow::kInt8) {
    output_data_range = kInt8Range;
    output_data_type = mkldnn::memory::s8;
  } else if (out_data[quantized_elemwise_add_enum::kOut].dtype() == mshadow::kUint8) {
    output_data_range = kUint8Range;
    output_data_type = mkldnn::memory::u8;
  } else {
    output_data_range = kInt32Range;
    output_data_type = mkldnn::memory::s32;
  }

  float output_min = 0;
  float output_max = 0;
  float out_data_scale = 0;
  if (params.max_calib_range.has_value() && params.min_calib_range.has_value()) {
    output_min = params.min_calib_range.value();
    output_max = params.max_calib_range.value();
    out_data_scale = output_data_range / MaxAbs(output_min, output_max);
  } else {
    output_max = dataA_absmax + dataB_absmax;
    output_min = -output_max;
  }
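
Put differently: with calibration the output range comes straight from the calibrated values; without it, the sum is conservatively bounded by dataA_absmax + dataB_absmax. A NumPy sketch of this selection (output_range is a hypothetical helper; the range constants mirror quantization_utils.h):

import numpy as np

RANGES = {np.int8: 127.0, np.uint8: 255.0, np.int32: 2147483647.0}

def output_range(out_dtype, calib=None, a_absmax=0.0, b_absmax=0.0):
    # calib: optional (min_calib_range, max_calib_range) pair.
    if calib is not None:
        out_min, out_max = calib
        out_scale = RANGES[out_dtype] / max(abs(out_min), abs(out_max))
        return out_min, out_max, out_scale
    # No calibration: |A + B| <= |A|max + |B|max.
    out_max = a_absmax + b_absmax
    return -out_max, out_max, None

print(output_range(np.int32, a_absmax=1.0, b_absmax=3.0))  # (-4.0, 4.0, None)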
  // two scales: scales[0] for dataA, scales[1] for dataB
  const int scales_num = 2;
  std::vector<float> scales(scales_num, 1);
  if (in_data[quantized_elemwise_add_enum::kDataA].dtype()
      != in_data[quantized_elemwise_add_enum::kDataB].dtype()) {
    auto s8_pd = (is_dataA_int8 == true)
                 ? dataA_mem->get_primitive_desc()
                 : dataB_mem->get_primitive_desc();
    rescaled_mem = TmpMemMgr::Get()->Alloc(s8_pd);
    float u8_reorder_scale = 0;
    if (params.max_calib_range.has_value() && params.min_calib_range.has_value()) {
      if (is_dataA_int8 == true) {
        u8_reorder_scale = out_data_scale / B_scale;
        scales[0] = out_data_scale / A_scale;
      } else {
        u8_reorder_scale = out_data_scale / A_scale;
        scales[1] = out_data_scale / B_scale;
      }
    } else {
      // x * dataA_absmax / dataA_range = y * (dataA_absmax + dataB_absmax) / output_range
      if (is_dataA_int8 == true) {
        u8_reorder_scale = dataB_absmax * output_data_range
                           / ((dataA_absmax + dataB_absmax) * kUint8Range);
        scales[0] = dataA_absmax * output_data_range
                    / ((dataA_absmax + dataB_absmax) * dataA_range);
      } else {
        u8_reorder_scale = dataA_absmax * output_data_range
                           / ((dataA_absmax + dataB_absmax) * dataA_range);
        scales[1] = dataB_absmax * output_data_range
                    / ((dataA_absmax + dataB_absmax) * kInt8Range);
      }
    }
    std::vector<float> reorder_scale = {u8_reorder_scale};
    primitive_attr reorder_attr;
    reorder_attr.set_int_output_round_mode(round_mode::round_nearest);
    reorder_attr.set_output_scales(0, reorder_scale);
    auto u8_mem = (is_dataA_int8 == true) ? dataB_mem : dataA_mem;
    const auto reorder_pd = mkldnn::reorder::primitive_desc(u8_mem->get_primitive_desc(),
                                                            s8_pd,
                                                            reorder_attr);
    MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(reorder_pd, *u8_mem, *rescaled_mem));

    if (is_dataA_int8 == true) {
      dataB_mem = rescaled_mem;
    } else {
      dataA_mem = rescaled_mem;
    }
  } else {
    // same data type, so both inputs share the same data range
    if (params.max_calib_range.has_value() && params.min_calib_range.has_value()) {
      scales[0] = out_data_scale / A_scale;
      scales[1] = out_data_scale / B_scale;
    } else {
      scales[0] = dataA_absmax * output_data_range / ((dataA_absmax + dataB_absmax) * dataA_range);
      scales[1] = dataB_absmax * output_data_range / ((dataA_absmax + dataB_absmax) * dataA_range);
    }
  }
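
As a sanity check on the uncalibrated same-dtype branch, a worked example in Python (values are illustrative):

# Both inputs int8 (range 127), output int32, no calibration.
a_absmax, b_absmax = 1.0, 3.0
data_range = 127.0
out_range = float(2**31 - 1)

# scales[i] re-grids each input so that its float value is preserved on the
# int32 output grid, whose full range represents a_absmax + b_absmax.
scale_a = a_absmax * out_range / ((a_absmax + b_absmax) * data_range)
scale_b = b_absmax * out_range / ((a_absmax + b_absmax) * data_range)

# int8 value 127 in A represents float 1.0; on the output grid 1.0 sits at
# out_range / 4, since the grid's maximum represents 4.0.
assert abs(127 * scale_a - out_range / 4.0) < 1e-3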

  std::vector<mkldnn::primitive::at> in_prims;
  std::vector<mkldnn::memory::primitive_desc> in_pds;
  in_prims.push_back(*dataA_mem);
  in_prims.push_back(*dataB_mem);
  in_pds.push_back(dataA_mem->get_primitive_desc());
  in_pds.push_back(dataB_mem->get_primitive_desc());
  size_t i_ndim = in_data[quantized_elemwise_add_enum::kDataA].shape().ndim();
  mkldnn::memory::dims i_dims = mkldnn::memory::dims(i_ndim);
  for (size_t i = 0; i < i_ndim; i++) {
    i_dims[i] = static_cast<int>(in_data[quantized_elemwise_add_enum::kDataA].shape()[i]);
  }
  mkldnn::memory::format i_fmt = static_cast<mkldnn::memory::format>(
      in_pds[quantized_elemwise_add_enum::kDataA].desc().data.format);
  auto output_desc = mkldnn::memory::desc(i_dims, output_data_type, i_fmt);
  mkldnn::sum::primitive_desc pdesc(output_desc, scales, in_pds);
  auto mem = CreateMKLDNNMem(out_data[quantized_elemwise_add_enum::kOut],
                             pdesc.dst_primitive_desc(),
                             req[0],
                             &in_data[0]);
  MKLDNNStream *stream = MKLDNNStream::Get();
  stream->RegisterPrim(mkldnn::sum(pdesc, in_prims, *mem.second));
  CommitOutput(out_data[quantized_elemwise_add_enum::kOut], mem);
  stream->Submit();

  out_data[quantized_elemwise_add_enum::kMin].data().dptr<float>()[0] = output_min;
  out_data[quantized_elemwise_add_enum::kMax].data().dptr<float>()[0] = output_max;
}
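
End to end, the sum primitive computes round(scales[0]*qA + scales[1]*qB) on the output grid. A NumPy emulation of the uncalibrated int8 path (a reference sketch under the assumptions above, not the MKLDNN code path):

import numpy as np

def quantized_elemwise_add_ref(qa, qb, a_absmax, b_absmax):
    # Mirrors the uncalibrated int8 + int8 -> int32 branch above.
    out_range = float(2**31 - 1)
    out_max = a_absmax + b_absmax
    scale_a = a_absmax * out_range / (out_max * 127.0)
    scale_b = b_absmax * out_range / (out_max * 127.0)
    q_out = np.rint(scale_a * qa.astype(np.float64)
                    + scale_b * qb.astype(np.float64)).astype(np.int64)
    return q_out, -out_max, out_max

qa = np.array([127, -64], dtype=np.int8)  # floats [1.0, -0.504] at absmax 1.0
qb = np.array([42, 42], dtype=np.int8)    # floats [0.99, 0.99] at absmax 3.0
q_out, out_min, out_max = quantized_elemwise_add_ref(qa, qb, 1.0, 3.0)
print(q_out * out_max / (2**31 - 1))      # ~[1.992, 0.488]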

inline static bool ElemwiseAddStorageType(const nnvm::NodeAttrs& attrs, const int dev_mask,
                                          DispatchMode* dispatch_mode, std::vector<int>* in_attrs,
                                          std::vector<int>* out_attrs) {
  // Check num of inputs: A, B, A_min, A_max, B_min, B_max
  CHECK_EQ(in_attrs->size(), 6U);
  // Check num of outputs: C, C_min, C_max
  CHECK_EQ(out_attrs->size(), 3U);

  return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs);
}

NNVM_REGISTER_OP(_contrib_quantized_elemwise_add)
.set_attr<FInferStorageType>("FInferStorageType", ElemwiseAddStorageType)
.set_attr<FComputeEx>("FComputeEx<cpu>", MKLDNNQuantizedElemwiseAddForward)
.set_attr<bool>("TIsMKLDNN", true)
.set_attr_parser(ParamParser<QuantizeElemwiseAddParam>)
.add_arguments(QuantizeElemwiseAddParam::__FIELDS__());
} // namespace op
} // namespace mxnet

#endif // MXNET_USE_MKLDNN == 1
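
For context, the op registered above is exposed to Python as mx.nd.contrib.quantized_elemwise_add. A hedged usage sketch (argument order assumed from the A, B, A_min, A_max, B_min, B_max convention; requires an MKLDNN-enabled MXNet build):

import mxnet as mx

a = mx.nd.array([[100, -50, 3]], dtype='int8')
b = mx.nd.array([[10, 20, 30]], dtype='int8')
out, out_min, out_max = mx.nd.contrib.quantized_elemwise_add(
    a, b,
    mx.nd.array([-1.0]), mx.nd.array([1.0]),  # A_min, A_max
    mx.nd.array([-3.0]), mx.nd.array([3.0]))  # B_min, B_max
# With no min/max_calib_range set, the output is int32 and
# (out_min, out_max) report (-4.0, 4.0).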
1 change: 1 addition & 0 deletions src/operator/quantization/quantization_utils.h
@@ -34,6 +34,7 @@ namespace op {

static const size_t kUint8Range = 255;
static const size_t kInt8Range = 127;
static const size_t kInt32Range = 0x7fffffff;

template<typename T>
MSHADOW_XINLINE int Sign(T val) {
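
One way to read these constants: kInt8Range is 127 rather than 128, which keeps the representable interval symmetric so a single scale serves both signs. A quick illustration:

absmax = 2.0
scale = 127 / absmax                    # kInt8Range / MaxAbs(min, max)
assert round(absmax * scale) == 127     # +absmax -> +127
assert round(-absmax * scale) == -127   # -absmax -> -127; -128 is left unused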
58 changes: 58 additions & 0 deletions src/operator/quantization/quantized_elemwise_add-inl.h
@@ -0,0 +1,58 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file quantized_elemwise_add-inl.h
* \brief
* \author Rong Zhang
*/

#ifndef MXNET_OPERATOR_QUANTIZATION_QUANTIZED_ELEMWISE_ADD_INL_H_
#define MXNET_OPERATOR_QUANTIZATION_QUANTIZED_ELEMWISE_ADD_INL_H_

#include "../tensor/elemwise_unary_op.h"

namespace mxnet {
namespace op {
/* This structure is used for requantization only when the op is fused with requantize */
struct QuantizeElemwiseAddParam : public dmlc::Parameter<QuantizeElemwiseAddParam> {
  dmlc::optional<float> min_calib_range;
  dmlc::optional<float> max_calib_range;
  DMLC_DECLARE_PARAMETER(QuantizeElemwiseAddParam) {
    DMLC_DECLARE_FIELD(min_calib_range)
    .set_default(dmlc::optional<float>())
    .describe("The minimum scalar value in the form of float32 obtained "
              "through calibration. If present, it will be used to requantize the "
              "int8 output data.");
    DMLC_DECLARE_FIELD(max_calib_range)
    .set_default(dmlc::optional<float>())
    .describe("The maximum scalar value in the form of float32 obtained "
              "through calibration. If present, it will be used to requantize the "
              "int8 output data.");
  }
};
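
When both calibration fields are set, the forward pass requantizes straight to the calibrated range instead of emitting int32. A short sketch of the resulting scale (names mirror the .cc file above; values are illustrative):

min_calib_range, max_calib_range = -2.5, 2.5   # from calibration
out_range = 127.0                              # int8 output after fused requantize
out_data_scale = out_range / max(abs(min_calib_range), abs(max_calib_range))
# Each input is then mapped with scales[i] = out_data_scale / input_scale_i.
print(out_data_scale)  # 50.8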

namespace quantized_elemwise_add_enum {
enum QuantizedElemwiseAddOutputs { kOut, kMin, kMax };
enum QuantizedElemwiseAddInputs { kDataA, kDataB, kAMin, kAMax, kBMin, kBMax };
}  // namespace quantized_elemwise_add_enum

} // namespace op
} // namespace mxnet

#endif // MXNET_OPERATOR_QUANTIZATION_QUANTIZED_ELEMWISE_ADD_INL_H_
