This repository has been archived by the owner on Nov 17, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 6.8k
Sparse square sum #7206
Merged
Merged
Sparse square sum #7206
Changes from 4 commits
Commits
Show all changes
6 commits
Select commit
Hold shift + click to select a range
3c991d6
Add square_sum op
reminisce 96445bb
Add unit test and fix check_numeric_gradient
reminisce 9b400d1
Add .cu file and example
reminisce a05d0e5
Fix lint
reminisce e5b80c9
Remove gpu registration
reminisce 1b78707
Use square_sum in test_module_fm
reminisce File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,340 @@ | ||
/*! | ||
* Copyright (c) 2017 by Contributors | ||
* \file square_sum-inl.h | ||
* \brief This is a temporary solution for fusing operators | ||
* square and sum together as a composite op for row sparse tensors. | ||
* The purpose for fusing square and sum for row sparse tensors | ||
* is that the gradient of the fused operator depends on the input | ||
* ndarray and thus its gradient is a row-sparse ndarray too. | ||
* This fused op will become deprecated after the functionality | ||
* of fusing operators is finished in the future. | ||
*/ | ||
|
||
#ifndef MXNET_OPERATOR_TENSOR_SQUARE_SUM_INL_H_ | ||
#define MXNET_OPERATOR_TENSOR_SQUARE_SUM_INL_H_ | ||
|
||
#include <vector> | ||
#include <algorithm> | ||
#include <utility> | ||
#include "../mxnet_op.h" | ||
#include "./broadcast_reduce_op.h" | ||
|
||
namespace mxnet { | ||
namespace op { | ||
|
||
/*!
 * \brief Storage-type inference for the forward pass of _square_sum.
 * The input must be row-sparse. The output stays row-sparse only when
 * reducing per row (axis=1) with keepdims=true; every other supported
 * configuration produces a dense output.
 */
inline bool SquareSumForwardInferStorageType(const nnvm::NodeAttrs& attrs,
                                             const Context& ctx,
                                             std::vector<int>* in_attrs,
                                             std::vector<int>* out_attrs) {
  CHECK_EQ(in_attrs->size(), 1U);
  CHECK_EQ(out_attrs->size(), 1U);
  CHECK_EQ((*in_attrs)[0], kRowSparseStorage)
    << "_square_sum only supports row-sparse ndarray as input";
  const ReduceAxesParam& param = nnvm::get<ReduceAxesParam>(attrs.parsed);
  // Per-row sum with kept dims preserves the input's sparsity pattern.
  const bool output_is_rsp = (param.axis[0] == 1 && param.keepdims);
  if (output_is_rsp) {
    STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, 0, kRowSparseStorage);
  } else {
    STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, 0, kDefaultStorage);
  }
  return true;
}
|
||
/*!
 * \brief Storage-type inference for the backward pass of _square_sum.
 * Expects a dense output gradient (input 0) and the row-sparse forward
 * input (input 1); the computed input gradient is row-sparse, matching
 * the forward input's sparsity pattern.
 */
inline bool SquareSumBackwardInferStorageType(const nnvm::NodeAttrs& attrs,
                                              const Context& ctx,
                                              std::vector<int>* in_attrs,
                                              std::vector<int>* out_attrs) {
  CHECK_EQ(in_attrs->size(), 2U);
  CHECK_EQ(out_attrs->size(), 1U);
  STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, 0, kDefaultStorage);    // ograd
  STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, 1, kRowSparseStorage);  // forward input
  STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, 0, kRowSparseStorage);  // igrad
  return true;
}
|
||
/*! | ||
* \brief square sum of a rsp | ||
* if axis = -1, same as mx.nd.sum(tensor*tensor) | ||
* if axis = 0, same as mx.nd.sum(tensor*tensor, axis=0) | ||
* if axis = 1, same as mx.nd.sum(tensor*tensor, axis=1) | ||
* where tensor*tensor is elemwise multiplication of two ndarrays. | ||
*/ | ||
template<int req, int axis, bool keepdim> | ||
struct SquareSumRspKernel; | ||
|
||
/*! | ||
* \brief square sum of a rsp on axis=0 without keeping the dim | ||
*/ | ||
template<int req> | ||
struct SquareSumRspKernel<req, 0, false> { | ||
/*! | ||
* \param j the element index in out_data and column id of in_data | ||
*/ | ||
template<typename DType> | ||
MSHADOW_XINLINE static void Map(int j, DType* out_data, const DType* in_data, | ||
const int64_t nnr, const int64_t num_cols) { | ||
DType sum = 0; | ||
for (int64_t i = 0; i < nnr; ++i) { | ||
const DType val = in_data[i*num_cols+j]; | ||
sum += val * val; | ||
} | ||
KERNEL_ASSIGN(out_data[j], req, sum); | ||
} | ||
}; | ||
|
||
/*! | ||
* \brief square sum of a rsp on axis=1 without keeping the dim | ||
*/ | ||
template<int req> | ||
struct SquareSumRspKernel<req, 1, false> { | ||
/*! | ||
* \param i the i-th non-zero row of in_data | ||
*/ | ||
template<typename IType, typename DType> | ||
MSHADOW_XINLINE static void Map(int i, DType* out_data, const IType* in_row_idx, | ||
const DType* in_data, const int64_t num_cols) { | ||
DType sum = 0; | ||
const int64_t offset = i * num_cols; | ||
for (int64_t j = 0; j < num_cols; ++j) { | ||
const DType val = in_data[offset+j]; | ||
sum += val * val; | ||
} | ||
KERNEL_ASSIGN(out_data[in_row_idx[i]], req, sum); | ||
} | ||
}; | ||
|
||
/*! | ||
* \brief square sum of a rsp on axis=1 keeping the dim | ||
*/ | ||
template<int req> | ||
struct SquareSumRspKernel<req, 1, true> { | ||
/*! | ||
* \param i the i-th non-zero row of in_data | ||
*/ | ||
template<typename IType, typename DType> | ||
MSHADOW_XINLINE static void Map(int i, IType* out_row_idx, DType* out_data, | ||
const IType* in_row_idx, const DType* in_data, | ||
const int64_t num_cols) { | ||
DType sum = 0; | ||
out_row_idx[i] = in_row_idx[i]; | ||
const int64_t offset = i * num_cols; | ||
for (int64_t j = 0; j < num_cols; ++j) { | ||
const DType val = in_data[offset+j]; | ||
sum += val * val; | ||
} | ||
KERNEL_ASSIGN(out_data[i], req, sum); | ||
} | ||
}; | ||
|
||
template<int req, int axis> | ||
struct SquareSumRspGradKernel; | ||
|
||
template<int req> | ||
struct SquareSumRspGradKernel<req, 0> { | ||
/*! | ||
* \param i element index in in_grad and in_data | ||
* \param in_grad_row_idx row_idx of the gradient of the op's input | ||
* \param in_grad gradient of the op's input | ||
* \param out_grad gradient of the op's output | ||
* \param in_row_idx row idx of the op's input | ||
* \param in_data op's input | ||
*/ | ||
template<typename IType, typename DType> | ||
MSHADOW_XINLINE static void Map(int i, IType* in_grad_row_idx, DType* in_grad, | ||
const DType* out_grad, const IType* in_row_idx, | ||
const DType* in_data, const int64_t num_cols) { | ||
const int64_t row = i / num_cols; | ||
in_grad_row_idx[row] = in_row_idx[row]; | ||
KERNEL_ASSIGN(in_grad[i], req, 2*in_data[i]*out_grad[i%num_cols]); | ||
} | ||
}; | ||
|
||
template<int req> | ||
struct SquareSumRspGradKernel<req, 1> { | ||
/*! | ||
* \param i element index in in_grad and in_data | ||
* \param in_grad_row_idx row_idx of the gradient of the op's input | ||
* \param in_grad gradient of the op's input | ||
* \param out_grad gradient of the op's output | ||
* \param in_row_idx row idx of the op's input | ||
* \param in_data op's input | ||
*/ | ||
template<typename IType, typename DType> | ||
MSHADOW_XINLINE static void Map(int i, IType* in_grad_row_idx, DType* in_grad, | ||
const DType* out_grad, const IType* in_row_idx, | ||
const DType* in_data, const int64_t num_cols) { | ||
const int64_t row = i / num_cols; | ||
in_grad_row_idx[row] = in_row_idx[row]; | ||
KERNEL_ASSIGN(in_grad[i], req, 2*in_data[i]*out_grad[in_row_idx[row]]); | ||
} | ||
}; | ||
|
||
template<typename xpu> | ||
void SquareSumRspImpl(const nnvm::NodeAttrs& attrs, | ||
mshadow::Stream<xpu>* s, | ||
const NDArray& input, | ||
const OpReqType req, | ||
NDArray* output) { | ||
const ReduceAxesParam& param = nnvm::get<ReduceAxesParam>(attrs.parsed); | ||
CHECK_EQ(param.axis.ndim(), 1U) << "_square_sum(row_sparse_matrix) only supports axis=0 or 1"; | ||
CHECK(param.axis[0] == 0 || param.axis[0] == 1) | ||
<< "_square_sum(row_sparse_matrix) only supports axis=0 or 1"; | ||
CHECK_EQ(input.storage_type(), kRowSparseStorage) | ||
<< "_square_sum op only supports row-sparse matrix as input"; | ||
int64_t out_data_size = 0; | ||
if (param.axis[0] == 0) { // axis = 0 | ||
CHECK_EQ(output->storage_type(), kDefaultStorage); | ||
out_data_size = input.storage_shape()[1]; | ||
} else if (param.keepdims) { // axis = 1, keepdims = true | ||
CHECK_EQ(output->storage_type(), kRowSparseStorage); | ||
out_data_size = input.storage_shape()[0]; | ||
} else { // axis = 1, keepdims = false | ||
CHECK_EQ(output->storage_type(), kDefaultStorage); | ||
out_data_size = input.shape()[0]; | ||
} | ||
CHECK_NE(req, kWriteInplace); | ||
|
||
using namespace mxnet_op; | ||
if (!input.storage_initialized()) { | ||
if (req == kWriteTo && output->storage_type() == kDefaultStorage) { | ||
MSHADOW_TYPE_SWITCH(output->data().type_flag_, DType, { | ||
Kernel<set_zero, xpu>::Launch(s, out_data_size, output->data().dptr<DType>()); | ||
}) | ||
} | ||
return; | ||
} | ||
|
||
if (output->storage_type() == kRowSparseStorage) { | ||
output->CheckAndAlloc({input.aux_shape(rowsparse::kIdx)}); | ||
} | ||
const TBlob& out_data = output->data(); | ||
const int64_t nnr = input.storage_shape()[0]; | ||
const int64_t num_cols = input.storage_shape()[1]; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is it still correct for k-dimensional inputs? Are you assuming 2d inputs here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I assume the storage_shape for row-sparse storage type is 2D regardless of the dimension of the shape, which is more intuitively understandable and avoids converting k-dim row-sparse shape to 2-D in FComputeEx functions. |
||
const TBlob& in_data = input.data(); | ||
if (0 == param.axis[0]) { // axis = 0, output is dense | ||
MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, { | ||
MXNET_ASSIGN_REQ_SWITCH(req, req_type, { | ||
Kernel<SquareSumRspKernel<req_type, 0, false>, xpu>::Launch(s, num_cols, | ||
out_data.dptr<DType>(), input.data().dptr<DType>(), nnr, num_cols); | ||
}) | ||
}) | ||
} else { // axis = 1 | ||
const TBlob in_row_idx = input.aux_data(rowsparse::kIdx); | ||
if (param.keepdims) { // output is rsp | ||
const TBlob out_row_idx = output->aux_data(rowsparse::kIdx); | ||
MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, { | ||
MSHADOW_IDX_TYPE_SWITCH(in_row_idx.type_flag_, IType, { | ||
MXNET_ASSIGN_REQ_SWITCH(req, req_type, { | ||
Kernel<SquareSumRspKernel<req_type, 1, true>, xpu>::Launch(s, nnr, | ||
out_row_idx.dptr<IType>(), out_data.dptr<DType>(), in_row_idx.dptr<IType>(), | ||
in_data.dptr<DType>(), num_cols); | ||
}) | ||
}) | ||
}) | ||
} else { // output is dense | ||
if (req == kWriteTo) { | ||
MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, { | ||
Kernel<set_zero, xpu>::Launch(s, out_data_size, out_data.dptr<DType>()); | ||
}) | ||
} | ||
MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, { | ||
MSHADOW_IDX_TYPE_SWITCH(in_row_idx.type_flag_, IType, { | ||
MXNET_ASSIGN_REQ_SWITCH(req, req_type, { | ||
Kernel<SquareSumRspKernel<req_type, 1, false>, xpu>::Launch(s, nnr, | ||
out_data.dptr<DType>(), in_row_idx.dptr<IType>(), in_data.dptr<DType>(), num_cols); | ||
}) | ||
}) | ||
}) | ||
} | ||
} | ||
} | ||
|
||
template<typename xpu> | ||
void SquareSumRspGradImpl(const nnvm::NodeAttrs& attrs, | ||
mshadow::Stream<xpu>* s, | ||
const NDArray& ograd, | ||
const NDArray& input, | ||
const OpReqType req, | ||
NDArray* igrad) { | ||
const ReduceAxesParam& param = nnvm::get<ReduceAxesParam>(attrs.parsed); | ||
CHECK_EQ(param.axis.ndim(), 1U) << "_square_sum(row_sparse_matrix) only supports axis=0"; | ||
CHECK(param.axis[0] == 0 || param.axis[0] == 1) | ||
<< "_square_sum(row_sparse_matrix) only supports axis=0 or 1"; | ||
CHECK_EQ(ograd.storage_type(), kDefaultStorage); | ||
CHECK_EQ(input.storage_type(), kRowSparseStorage); | ||
CHECK_EQ(igrad->storage_type(), kRowSparseStorage); | ||
CHECK_NE(req, kWriteInplace); | ||
if (!input.storage_initialized()) return; | ||
|
||
using namespace mxnet_op; | ||
igrad->CheckAndAlloc({input.aux_shape(rowsparse::kIdx)}); | ||
const int64_t num_cols = input.storage_shape()[1]; | ||
const TBlob& igrad_data = igrad->data(); | ||
const TBlob igrad_row_idx = igrad->aux_data(rowsparse::kIdx); | ||
const TBlob& ograd_data = ograd.data(); | ||
const TBlob in_data = input.data(); | ||
const TBlob in_row_idx = input.aux_data(rowsparse::kIdx); | ||
MSHADOW_TYPE_SWITCH(igrad_data.type_flag_, DType, { | ||
MSHADOW_IDX_TYPE_SWITCH(igrad_row_idx.type_flag_, IType, { | ||
MXNET_ASSIGN_REQ_SWITCH(req, req_type, { | ||
if (0 == param.axis[0]) { // forward is sum per column | ||
Kernel<SquareSumRspGradKernel<req_type, 0>, xpu>::Launch(s, igrad_data.Size(), | ||
igrad_row_idx.dptr<IType>(), igrad_data.dptr<DType>(), ograd_data.dptr<DType>(), | ||
in_row_idx.dptr<IType>(), in_data.dptr<DType>(), num_cols); | ||
} else { // forward is sum per row | ||
Kernel<SquareSumRspGradKernel<req_type, 1>, xpu>::Launch(s, igrad_data.Size(), | ||
igrad_row_idx.dptr<IType>(), igrad_data.dptr<DType>(), ograd_data.dptr<DType>(), | ||
in_row_idx.dptr<IType>(), in_data.dptr<DType>(), num_cols); | ||
} | ||
}) | ||
}) | ||
}) | ||
} | ||
|
||
template<typename xpu> | ||
void SquareSumOpForwardEx(const nnvm::NodeAttrs& attrs, | ||
const OpContext& ctx, | ||
const std::vector<NDArray>& inputs, | ||
const std::vector<OpReqType>& req, | ||
const std::vector<NDArray>& outputs) { | ||
CHECK_EQ(inputs.size(), 1U); | ||
CHECK_EQ(outputs.size(), 1U); | ||
CHECK_EQ(req.size(), 1U); | ||
mshadow::Stream<xpu>* s = ctx.get_stream<xpu>(); | ||
const NDArrayStorageType istype = inputs[0].storage_type(); | ||
if (istype == kRowSparseStorage) { | ||
NDArray output = outputs[0]; | ||
SquareSumRspImpl(attrs, s, inputs[0], req[0], &output); | ||
} else { | ||
LOG(FATAL) << "_square_sum op only supports row-sparse ndarray" | ||
" as input, while input stype = " | ||
<< istype; | ||
} | ||
} | ||
|
||
template<typename xpu> | ||
void SquareSumOpBackwardEx(const nnvm::NodeAttrs& attrs, | ||
const OpContext& ctx, | ||
const std::vector<NDArray>& inputs, | ||
const std::vector<OpReqType>& req, | ||
const std::vector<NDArray>& outputs) { | ||
CHECK_EQ(inputs.size(), 2U); | ||
CHECK_EQ(outputs.size(), 1U); | ||
CHECK_EQ(req.size(), 1U); | ||
mshadow::Stream<xpu>* s = ctx.get_stream<xpu>(); | ||
const NDArrayStorageType ograd_stype = inputs[0].storage_type(); | ||
const NDArrayStorageType input_stype = inputs[1].storage_type(); | ||
if (input_stype == kRowSparseStorage && ograd_stype == kDefaultStorage) { | ||
NDArray output = outputs[0]; | ||
SquareSumRspGradImpl(attrs, s, inputs[0], inputs[1], req[0], &output); | ||
} else { | ||
LOG(FATAL) << "_square_sum op backward only supports dense ndarray as ograd," | ||
" row-sparse ndarray as input and row-sparse ndarray as igrad," | ||
" while ograd_stype = " << ograd_stype | ||
<< " input_stype = " << input_stype; | ||
} | ||
} | ||
|
||
} // namespace op | ||
} // namespace mxnet | ||
|
||
#endif // MXNET_OPERATOR_TENSOR_SQUARE_SUM_INL_H_ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
/*! | ||
* Copyright (c) 2017 by Contributors | ||
* \file square_sum.cc | ||
* \brief CPU Implementation of square_sum op. | ||
*/ | ||
#include "./square_sum-inl.h" | ||
|
||
namespace mxnet { | ||
namespace op { | ||
// Forward operator: reduce-style registration with sparse storage inference
// and a CPU FComputeEx implementation.
MXNET_OPERATOR_REGISTER_REDUCE(_square_sum)
.describe(R"code(Computes the square sum of array elements over a given axis
for row-sparse matrix. This is a temporary solution for fusing ops square and
sum together for row-sparse matrix to save memory for storing gradients.
It will become deprecated once the functionality of fusing operators is finished
in the future.

Example::

  dns = mx.nd.array([[0, 0], [1, 2], [0, 0], [3, 4], [0, 0]])
  rsp = mx.nd.cast_storage(dns, stype='row_sparse')
  sum = mx.nd._internal._square_sum(rsp, axis=1)
  sum = [0, 5, 0, 25, 0]
)code" ADD_FILELINE)
.set_attr<FInferStorageType>("FInferStorageType", SquareSumForwardInferStorageType)
.set_attr<FComputeEx>("FComputeEx<cpu>", SquareSumOpForwardEx<cpu>)
.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_square_sum"});

// Backward operator: takes (ograd, forward input) and produces a row-sparse
// input gradient.
MXNET_OPERATOR_REGISTER_REDUCE_BACKWARD(_backward_square_sum)
.set_num_inputs(2)
.set_attr<FInferStorageType>("FInferStorageType", SquareSumBackwardInferStorageType)
.set_attr<FComputeEx>("FComputeEx<cpu>", SquareSumOpBackwardEx<cpu>);
|
||
} // namespace op | ||
} // namespace mxnet |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this necessary? Was the previous version using cast_storage not working?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, it is the correct way to do it. The previous version is not correct. The args_grad are outputs in backward FComputeEx functions. They must be CheckAndAlloc inside the FComputeEx functions. The cast_storage would actually create an initialized sparse tensor for the backward which is not right. One corner case illustrating this is that, if the input is uninitialized, the square_sum backward function returns immediately, so the output should be uninitialized as well.