From 1552dd401d8f46ebae97fef9c56b8e68f6deb5e7 Mon Sep 17 00:00:00 2001
From: chenxiny
Date: Fri, 11 Oct 2019 15:59:04 +0800
Subject: [PATCH 1/8] add quantized embedding

---
 .../quantization/quantized_indexing_op.cc | 189 ++++++++++++++++++
 src/operator/tensor/indexing_op.h         |   6 +
 2 files changed, 195 insertions(+)
 create mode 100644 src/operator/quantization/quantized_indexing_op.cc

diff --git a/src/operator/quantization/quantized_indexing_op.cc b/src/operator/quantization/quantized_indexing_op.cc
new file mode 100644
index 000000000000..667730da91ed
--- /dev/null
+++ b/src/operator/quantization/quantized_indexing_op.cc
@@ -0,0 +1,189 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2017 by Contributors
+ * \file quantized_indexing_op.cc
+*/
+#include <mxnet/op_attr_types.h>
+#include "../tensor/indexing_op.h"
+
+namespace mxnet {
+namespace op {
+
+
+inline bool QuantizedEmbeddingOpShape(const nnvm::NodeAttrs& attrs,
+                                      mxnet::ShapeVector *in_attrs,
+                                      mxnet::ShapeVector *out_attrs) {
+  using namespace mshadow;
+  const mxnet::TShape &dshape = (*in_attrs)[quantized_embedding::kData];
+  if (!ndim_is_known(dshape)) return false;
+  const EmbeddingParam& param = nnvm::get<EmbeddingParam>(attrs.parsed);
+  SHAPE_ASSIGN_CHECK(*in_attrs, quantized_embedding::kWeight, Shape2(param.input_dim,
+                                                                     param.output_dim));
+  SHAPE_ASSIGN_CHECK(*in_attrs, quantized_embedding::kWeightMin, mxnet::TShape(1, 1));
+  SHAPE_ASSIGN_CHECK(*in_attrs, quantized_embedding::kWeightMax, mxnet::TShape(1, 1));
+  out_attrs->clear();
+
+  mxnet::TShape oshape(dshape.ndim()+1, -1);
+  for (int i = 0; i < dshape.ndim(); ++i) {
+    oshape[i] = dshape[i];
+  }
+  oshape[dshape.ndim()] = param.output_dim;
+  out_attrs->push_back(oshape);
+  out_attrs->push_back(mxnet::TShape(1, 1));
+  out_attrs->push_back(mxnet::TShape(1, 1));
+  return shape_is_known(oshape);
+}
+
+inline bool QuantizedEmbeddingOpType(const nnvm::NodeAttrs& attrs,
+                                     std::vector<int> *in_type,
+                                     std::vector<int> *out_type) {
+  CHECK_EQ(in_type->size(), 4U);
+  CHECK_GE(out_type->size(), 3U);
+  int itype = (*in_type)[0];
+  CHECK_NE(itype, -1) << "First input must have specified type";
+  TYPE_ASSIGN_CHECK(*in_type, 1, mshadow::kInt8);
+  TYPE_ASSIGN_CHECK(*in_type, 2, mshadow::kFloat32);
+  TYPE_ASSIGN_CHECK(*in_type, 3, mshadow::kFloat32);
+  out_type->clear();
+  out_type->push_back(mshadow::kInt8);
+  int dtype_out_min = 0;
+  int dtype_out_max = 0;
+  out_type->push_back(dtype_out_min);
+  out_type->push_back(dtype_out_max);
+  return true;
+}
+
+// storage type inference function for Embedding
+inline bool QuantizedEmbeddingOpForwardStorageType(const nnvm::NodeAttrs& attrs,
+                                                   const int dev_mask,
+                                                   DispatchMode* dispatch_mode,
+                                                   std::vector<int>* in_attrs,
+                                                   std::vector<int>* out_attrs) {
+  CHECK_EQ(in_attrs->size(), 4U);
+  CHECK_EQ(out_attrs->size(), 3U);
+  const int& data_stype = in_attrs->at(quantized_embedding::kData);
+  const int& weight_stype = in_attrs->at(quantized_embedding::kWeight);
+  const int& weight_min_stype = in_attrs->at(quantized_embedding::kWeightMin);
+  const int& weight_max_stype = in_attrs->at(quantized_embedding::kWeightMax);
+  int& out_stype = out_attrs->at(quantized_embedding::kOut);
+  int& out_stype_min = out_attrs->at(quantized_embedding::kOutMin);
+  int& out_stype_max = out_attrs->at(quantized_embedding::kOutMax);
+  bool dispatched = false;
+  CHECK_EQ(weight_min_stype, kDefaultStorage);
+  CHECK_EQ(weight_max_stype, kDefaultStorage);
+  if (!dispatched && data_stype == kDefaultStorage && weight_stype == kDefaultStorage) {
+    // dns, dns -> dns
+    dispatched = storage_type_assign(&out_stype, kDefaultStorage,
+                                     dispatch_mode, DispatchMode::kFCompute);
+    dispatched = storage_type_assign(&out_stype_min, kDefaultStorage,
+                                     dispatch_mode, DispatchMode::kFCompute);
+    dispatched = storage_type_assign(&out_stype_max, kDefaultStorage,
+                                     dispatch_mode, DispatchMode::kFCompute);
+  }
+  if (!dispatched && data_stype == kDefaultStorage && weight_stype == kRowSparseStorage) {
+    // dns, rsp -> dns
+    dispatched = storage_type_assign(&out_stype, kDefaultStorage,
+                                     dispatch_mode, DispatchMode::kFComputeEx);
+  }
+  return dispatched;
+}
+
+void QuantizedEmbeddingOpForward(const nnvm::NodeAttrs& attrs,
+                                 const OpContext& ctx,
+                                 const std::vector<TBlob>& inputs,
+                                 const std::vector<OpReqType>& req,
+                                 const std::vector<TBlob>& outputs) {
+  CHECK_EQ(req[quantized_embedding::kOut], kWriteTo);
+  CHECK_EQ(inputs.size(), 4U);
+  CHECK_EQ(outputs.size(), 3U);
+  CHECK_EQ(inputs[quantized_embedding::kWeight].ndim(), 2U)
+          << "Embedding layer expects its weight to be two-dimensional. "
+          << inputs[quantized_embedding::kWeight].ndim()
+          << " dimensional input is given instead";
+  mshadow::Stream<cpu> *s = ctx.get_stream<cpu>();
+  EmbeddingOpForwardDnsImpl<cpu>(s, inputs[quantized_embedding::kData], inputs[quantized_embedding::kWeight],
+                                 req[quantized_embedding::kOut], outputs[quantized_embedding::kOut]);
+  float min_weight = inputs[quantized_embedding::kWeightMin].dptr<float>()[0];
+  float max_weight = inputs[quantized_embedding::kWeightMax].dptr<float>()[0];
+  outputs[quantized_embedding::kOutMin].dptr<float>()[0] = min_weight;
+  outputs[quantized_embedding::kOutMax].dptr<float>()[0] = max_weight;
+}
+
+NNVM_REGISTER_OP(_contrib_quantized_embedding)
+.describe(R"code(Maps integer indices to int8 vector representations (embeddings).
+)code" ADD_FILELINE)
+.set_num_inputs(4)
+.set_num_outputs(3)
+.set_attr_parser(ParamParser<EmbeddingParam>)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+  [](const NodeAttrs& attrs) {
+    return std::vector<std::string>{"data", "weight", "min_weight", "max_weight"};
+  })
+.set_attr<nnvm::FListOutputNames>("FListOutputNames",
+  [](const NodeAttrs& attrs) {
+    return std::vector<std::string>{"output", "min_output", "max_output"};
+  })
+.set_attr<mxnet::FInferShape>("FInferShape", QuantizedEmbeddingOpShape)
+.set_attr<nnvm::FInferType>("FInferType", QuantizedEmbeddingOpType)
+.set_attr<FInferStorageType>("FInferStorageType", QuantizedEmbeddingOpForwardStorageType)
+.set_attr<FResourceRequest>("FResourceRequest",
+  [](const NodeAttrs& attrs) {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  })
+.set_attr<FCompute>("FCompute<cpu>", QuantizedEmbeddingOpForward)
+// TODO(Xinyu): a temp solution to enable GluonCV INT8 flow,
+// will be reverted after the improvement of CachedOP is done.
+.set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes)
+.add_argument("data", "NDArray-or-Symbol", "The input array to the embedding operator.")
+.add_argument("weight", "NDArray-or-Symbol", "The embedding weight matrix.")
+.add_argument("min_weight", "NDArray-or-Symbol", "Minimum value of data.")
+.add_argument("max_weight", "NDArray-or-Symbol", "Maximum value of data.")
+.add_arguments(EmbeddingParam::__FIELDS__());
+
+NNVM_REGISTER_OP(Embedding)
+.set_attr<FQuantizable>("FQuantizable", [](const NodeAttrs& attrs) {
+    return QuantizeType::kMust;
+})
+.set_attr<FQuantizedOp>("FQuantizedOp", [](const NodeAttrs& attrs) {
+    EmbeddingParam param;
+    param.Init(attrs.dict);
+    nnvm::NodePtr node = nnvm::Node::Create();
+    if (param.dtype == mshadow::kFloat32) {
+      node->attrs.op = Op::Get("_contrib_quantized_embedding");
+      node->attrs.name = "quantized_" + attrs.name;
+    } else {
+      node->attrs.op = Op::Get("Embedding");
+      node->attrs.name = attrs.name;
+    }
+    node->attrs.dict = attrs.dict;
+    if (node->op()->attr_parser != nullptr) {
+      node->op()->attr_parser(&(node->attrs));
+    }
+    return node;
+  })
+.set_attr<FAvoidQuantizeInput>("FAvoidQuantizeInput", [](const NodeAttrs &attrs, size_t index) {
+  if (index == 0)
+    return true;
+  else
+    return false;
+});
+}  // namespace op
+}  // namespace mxnet

diff --git a/src/operator/tensor/indexing_op.h b/src/operator/tensor/indexing_op.h
index 828d761fefd4..004308ca7588 100644
--- a/src/operator/tensor/indexing_op.h
+++ b/src/operator/tensor/indexing_op.h
@@ -58,6 +58,12 @@ enum EmbeddingOpOutputs {kOut};
 enum EmbeddingOpResource {kTempSpace};
 }  // namespace embedding
 
+namespace quantized_embedding {
+enum QuantizedEmbeddingOpInputs {kData, kWeight, kWeightMin, kWeightMax};
+enum QuantizedEmbeddingOpOutputs {kOut, kOutMin, kOutMax};
+enum QuantizedEmbeddingOpResource {kTempSpace};
+}  // namespace quantized embedding
+
 struct SparseEmbeddingParam: public dmlc::Parameter<SparseEmbeddingParam> {
   int input_dim;
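Taken together, patch 1's forward path is just the ordinary embedding row gather, kept in int8, with the weight's min/max thresholds copied through unchanged as the output's quantization range. A minimal NumPy sketch of those semantics (an illustration with made-up names, not the MXNet kernel itself):

    import numpy as np

    def quantized_embedding_ref(data, weight_q, weight_min, weight_max):
        """Reference semantics of _contrib_quantized_embedding (hypothetical model)."""
        out = weight_q[data]  # same gather as float Embedding; values stay int8
        # Thresholds pass through untouched so a later dequantize step can map
        # int8 back to float, e.g. real = q * max(|weight_min|, |weight_max|) / 127.
        return out, np.float32(weight_min), np.float32(weight_max)

    idx = np.array([3, 7, 7], dtype=np.int32)
    table = np.random.randint(-127, 128, size=(10, 4)).astype(np.int8)
    out, omin, omax = quantized_embedding_ref(idx, table, -0.5, 0.5)
    assert out.dtype == np.int8 and out.shape == (3, 4)

This is also why the op needs no calibration of its own: a lookup cannot produce values outside the weight table, so the output range is exactly the weight range.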
From 087c1ee69fbd725439827f9fd4b7f1d4bdbb763a Mon Sep 17 00:00:00 2001
From: chenxiny
Date: Fri, 1 Nov 2019 11:28:42 +0800
Subject: [PATCH 4/8] improve lint

---
 src/operator/quantization/quantized_indexing_op.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/operator/quantization/quantized_indexing_op.cc b/src/operator/quantization/quantized_indexing_op.cc
index 667730da91ed..b47d4502e18e 100644
--- a/src/operator/quantization/quantized_indexing_op.cc
+++ b/src/operator/quantization/quantized_indexing_op.cc
@@ -187,3 +187,4 @@ NNVM_REGISTER_OP(Embedding)
 });
 }  // namespace op
 }  // namespace mxnet
+

From eb6af74dd3563328def8e6a2a52668994a04b4fa Mon Sep 17 00:00:00 2001
From: chenxiny
Date: Fri, 1 Nov 2019 11:35:14 +0800
Subject: [PATCH 5/8] change to ksupport

---
 src/operator/quantization/quantized_indexing_op.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/operator/quantization/quantized_indexing_op.cc b/src/operator/quantization/quantized_indexing_op.cc
index b47d4502e18e..e69ec0f454ea 100644
--- a/src/operator/quantization/quantized_indexing_op.cc
+++ b/src/operator/quantization/quantized_indexing_op.cc
@@ -160,7 +160,7 @@
 NNVM_REGISTER_OP(Embedding)
 .set_attr<FQuantizable>("FQuantizable", [](const NodeAttrs& attrs) {
-    return QuantizeType::kMust;
+    return QuantizeType::kSupport;
 })
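Patch 5 relaxes FQuantizable from kMust to kSupport, so the quantization pass may quantize Embedding rather than being forced to. The FQuantizedOp hook registered in patch 1 still performs the substitution, and FAvoidQuantizeInput keeps input 0 (the integer indices) out of the quantize/dequantize chain. A hedged Python mirror of that substitution logic, using plain dicts as stand-ins for nnvm nodes (not an MXNet API):

    def to_quantized_node(op_name, attrs):
        """Mirror of the FQuantizedOp lambda: a float32 Embedding becomes the
        contrib int8 op; any other dtype keeps the original operator."""
        if op_name == 'Embedding' and attrs.get('dtype', 'float32') == 'float32':
            return {'op': '_contrib_quantized_embedding',
                    'name': 'quantized_' + attrs['name'],
                    'attrs': dict(attrs)}
        return {'op': op_name, 'name': attrs['name'], 'attrs': dict(attrs)}

    node = to_quantized_node('Embedding', {'name': 'embed0', 'dtype': 'float32',
                                           'input_dim': '1000', 'output_dim': '256'})
    assert node['op'] == '_contrib_quantized_embedding'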
From ba4cac6fc201d1a89249a7fb7f891d949c47df0a Mon Sep 17 00:00:00 2001
From: chenxiny
Date: Fri, 1 Nov 2019 13:38:29 +0800
Subject: [PATCH 6/8] fix lint

---
 src/operator/quantization/quantized_indexing_op.cc | 6 ++++--
 src/operator/tensor/indexing_op.h                  | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/operator/quantization/quantized_indexing_op.cc b/src/operator/quantization/quantized_indexing_op.cc
index e69ec0f454ea..b4af3ecb704f 100644
--- a/src/operator/quantization/quantized_indexing_op.cc
+++ b/src/operator/quantization/quantized_indexing_op.cc
@@ -119,8 +119,10 @@ void QuantizedEmbeddingOpForward(const nnvm::NodeAttrs& attrs,
           << inputs[quantized_embedding::kWeight].ndim()
           << " dimensional input is given instead";
   mshadow::Stream<cpu> *s = ctx.get_stream<cpu>();
-  EmbeddingOpForwardDnsImpl<cpu>(s, inputs[quantized_embedding::kData], inputs[quantized_embedding::kWeight],
-                                 req[quantized_embedding::kOut], outputs[quantized_embedding::kOut]);
+  EmbeddingOpForwardDnsImpl<cpu>(s, inputs[quantized_embedding::kData],
+                                 inputs[quantized_embedding::kWeight],
+                                 req[quantized_embedding::kOut],
+                                 outputs[quantized_embedding::kOut]);
   float min_weight = inputs[quantized_embedding::kWeightMin].dptr<float>()[0];
   float max_weight = inputs[quantized_embedding::kWeightMax].dptr<float>()[0];
   outputs[quantized_embedding::kOutMin].dptr<float>()[0] = min_weight;
diff --git a/src/operator/tensor/indexing_op.h b/src/operator/tensor/indexing_op.h
index 004308ca7588..f81d831b562b 100644
--- a/src/operator/tensor/indexing_op.h
+++ b/src/operator/tensor/indexing_op.h
@@ -62,7 +62,7 @@ namespace quantized_embedding {
 enum QuantizedEmbeddingOpInputs {kData, kWeight, kWeightMin, kWeightMax};
 enum QuantizedEmbeddingOpOutputs {kOut, kOutMin, kOutMax};
 enum QuantizedEmbeddingOpResource {kTempSpace};
-}  // namespace quantized embedding
+}  // namespace quantized_embedding
 
 struct SparseEmbeddingParam: public dmlc::Parameter<SparseEmbeddingParam> {
   int input_dim;
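The test added in patch 7 can compare the fp32 and int8 paths with assert_almost_equal because it draws the weights as whole numbers inside [-127, 127]: the int8 cast is lossless and the lookup does no arithmetic, so both executors gather identical values. The maxabs helper builds the symmetric range convention for the min_weight/max_weight inputs. The key invariant, in plain NumPy:

    import numpy as np

    w = np.random.uniform(-127.0, 127.0, size=(8, 4)).astype(np.int32)  # whole numbers only
    weight_range = max(abs(int(w.min())), abs(int(w.max())))            # maxabs(min, max)
    # The fp32 path looks rows up in w; the int8 path in w.astype(np.int8).
    # The cast is exact because every value already fits in [-127, 127]:
    np.testing.assert_array_equal(w.astype(np.int8).astype(np.int32), w)
    idx = np.array([0, 5, 5])
    np.testing.assert_array_equal(w[idx], w.astype(np.int8)[idx].astype(np.int32))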
From 9e5e10c7ec69218a0bf57cbedfc897e3130f295a Mon Sep 17 00:00:00 2001
From: chenxiny
Date: Mon, 9 Dec 2019 10:25:36 +0800
Subject: [PATCH 7/8] add quantized embedding test case

---
 .../python/quantization/test_quantization.py | 43 +++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/tests/python/quantization/test_quantization.py b/tests/python/quantization/test_quantization.py
index 723873ac6fb3..9ea79307e618 100644
--- a/tests/python/quantization/test_quantization.py
+++ b/tests/python/quantization/test_quantization.py
@@ -516,6 +516,49 @@ def maxabs(a, b):
     check_quantized_fc((256, 2048, 2, 2), 800, True, qdtype)
     check_quantized_fc((256, 111, 2, 2), 800, True, qdtype)
 
+@with_seed()
+def test_quantized_embedding():
+    def check_quantized_embedding(data_shape, input_dim, output_dim):
+
+        def maxabs(a, b):
+            return mx.nd.maximum(mx.nd.abs(a), mx.nd.abs(b))
+
+        data0 = mx.sym.Variable(name='data', shape=data_shape, dtype='int32')
+        embedding_fp32 = mx.sym.Embedding(data=data0, input_dim=input_dim, output_dim=output_dim)
+        arg_shapes, _, _ = embedding_fp32.infer_shape(data=data_shape)
+        arg_names = embedding_fp32.list_arguments()
+        embedding_fp32_exe = embedding_fp32.simple_bind(ctx=mx.current_context(), grad_req='null')
+        int8_range = 127.0
+        data = mx.nd.random.uniform(low=0, high=input_dim,
+                                    shape=arg_shapes[0]).astype('int32')
+        weight = mx.nd.random.uniform(low=-int8_range, high=int8_range,
+                                      shape=arg_shapes[1]).astype('int32')
+        embedding_fp32_exe.arg_dict[arg_names[0]][:] = data
+        embedding_fp32_exe.arg_dict[arg_names[1]][:] = weight
+
+        weight_min = mx.nd.min(weight).astype('float32')
+        weight_max = mx.nd.max(weight).astype('float32')
+        weight_range = maxabs(weight_min, weight_max)
+
+        output = embedding_fp32_exe.forward()[0]
+
+        embedding_int8 = mx.sym.contrib.quantized_embedding(data=data0, input_dim=input_dim,
+                                                            output_dim=output_dim)
+        qarg_names = embedding_int8.list_arguments()
+        type_dict = {qarg_names[1]: 'int8'}
+        embedding_int8_exe = embedding_int8.simple_bind(ctx=mx.current_context(), type_dict=type_dict, grad_req='null')
+        embedding_int8_exe.arg_dict[qarg_names[0]][:] = embedding_fp32_exe.arg_dict[arg_names[0]]
+        embedding_int8_exe.arg_dict[qarg_names[1]][:] = embedding_fp32_exe.arg_dict[arg_names[1]].astype('int8')
+        embedding_int8_exe.arg_dict[qarg_names[2]][:] = -weight_range
+        embedding_int8_exe.arg_dict[qarg_names[3]][:] = weight_range
+        qoutput, min_range, max_range = embedding_int8_exe.forward()
+
+        assert_almost_equal(output.asnumpy(), qoutput.asnumpy())
+
+    check_quantized_embedding((1,), 1000, 256)
+    check_quantized_embedding((1,), 1024, 512)
+    check_quantized_embedding((32,), 1000, 256)
+    check_quantized_embedding((32,), 1024, 512)
+
 @with_seed()
 def test_quantized_flatten():
     def check_quantized_flatten(shape, qdtype):

From c0246cb32f0a31e4b91cd515d7d54cb16947dd15 Mon Sep 17 00:00:00 2001
From: chenxiny
Date: Mon, 9 Dec 2019 13:16:52 +0800
Subject: [PATCH 8/8] skip gpu ut

---
 tests/python/quantization/test_quantization.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/python/quantization/test_quantization.py b/tests/python/quantization/test_quantization.py
index 9ea79307e618..a46fa787eea1 100644
--- a/tests/python/quantization/test_quantization.py
+++ b/tests/python/quantization/test_quantization.py
@@ -519,6 +519,9 @@ def test_quantized_embedding():
     def check_quantized_embedding(data_shape, input_dim, output_dim):
+        if is_test_for_gpu():
+            print('skipped testing test_quantized_embedding for gpu since it is not supported yet')
+            return
 
         def maxabs(a, b):
             return mx.nd.maximum(mx.nd.abs(a), mx.nd.abs(b))
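With the series applied, the op is reachable from Python as mx.sym.contrib.quantized_embedding, which is exactly what the new test exercises. A quick smoke check of the symbol interface (a sketch; the auto-generated output and argument names carry an operator prefix that may differ):

    import mxnet as mx

    data = mx.sym.Variable('data', shape=(32,), dtype='int32')
    embed = mx.sym.contrib.quantized_embedding(data=data, input_dim=1000, output_dim=256)
    # Three outputs come back: the int8 embeddings plus the forwarded thresholds.
    print(embed.list_outputs())    # e.g. [..._output, ..._min_output, ..._max_output]
    print(embed.list_arguments())  # data, weight, min_weight, max_weight (with prefix)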