Speedup _contrib_index_copy #14359

Merged: 4 commits, Mar 17, 2019
93 changes: 2 additions & 91 deletions src/operator/contrib/index_copy-inl.h
@@ -37,108 +37,19 @@
namespace mxnet {
namespace op {

template<int req>
struct index_copy_forward {
template<typename DType, typename IType>
MSHADOW_XINLINE static void Map(int i,
int dim,
IType* index,
DType* new_tensor,
DType* out_tensor) {
DType* out_ptr = out_tensor + static_cast<int>(index[i]) * dim;
DType* new_ptr = new_tensor + i * dim;
for (int idx = 0; idx < dim; ++idx) {
KERNEL_ASSIGN(out_ptr[idx], req, new_ptr[idx]);
}
}
};

template<typename xpu>
void IndexCopyForward(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
CHECK_EQ(inputs.size(), 3U);
CHECK_EQ(outputs.size(), 1U);
CHECK_EQ(req.size(), 1U);
mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
const TBlob& out = outputs[0];
const TBlob& original_tensor = inputs[0];
const TBlob& idx_vector = inputs[1];
const TBlob& copied_tensor = inputs[2];
int dim = inputs[2].Size() / inputs[1].Size();
// copy original tensor to output
mxnet_op::copy(s, out, original_tensor);
// index copy
MSHADOW_TYPE_SWITCH(out.type_flag_, DType, {
MSHADOW_TYPE_SWITCH(idx_vector.type_flag_, IType, {
MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
mxnet_op::Kernel<index_copy_forward<req_type>, xpu>::Launch(s,
idx_vector.Size(), dim,
idx_vector.dptr<IType>(),
copied_tensor.dptr<DType>(),
out.dptr<DType>());
});
});
});
}

struct index_copy_backward {
template<typename DType, typename IType>
MSHADOW_XINLINE static void Map(int i,
int dim,
int index_size,
int req1, int req2,
DType* out_grad,
IType* index,
DType* in_grad_1,
DType* in_grad_2) {
// Copy to in_grad_2
for (int p = 0; p < index_size; ++p) {
int idx = static_cast<int>(index[p]);
if (i >= idx*dim && i < (idx+1)*dim) {
int offset = i - idx*dim;
KERNEL_ASSIGN(in_grad_2[p*dim+offset], req2, out_grad[i]);
return;
}
}
// Copy to in_grad_1
KERNEL_ASSIGN(in_grad_1[i], req1, out_grad[i]);
}
};
// The header now keeps only the declaration; the cpu and gpu specializations
// are defined in index_copy.cc and index_copy.cu below.
template<typename xpu>
void IndexCopyForward(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs);

template<typename xpu>
void IndexCopyBackward(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
CHECK_EQ(inputs.size(), 4U);
CHECK_EQ(outputs.size(), 3U);
mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
const TBlob& out_grad = inputs[0];
const TBlob& index = inputs[2];
const TBlob& in_grad_1 = outputs[0];
const TBlob& in_grad_2 = outputs[2];
int dim = inputs[3].Size() / inputs[2].Size();
int index_size = inputs[2].Size();
Fill<false>(s, outputs[0], req[0], 0);
Fill<false>(s, outputs[2], req[2], 0);
// index_copy_backward
MSHADOW_TYPE_SWITCH(out_grad.type_flag_, DType, {
MSHADOW_TYPE_SWITCH(index.type_flag_, IType, {
mxnet_op::Kernel<index_copy_backward, xpu>::Launch(s,
out_grad.Size(),
dim, index_size,
req[0], req[2],
out_grad.dptr<DType>(),
index.dptr<IType>(),
in_grad_1.dptr<DType>(),
in_grad_2.dptr<DType>());
});
});
}
// Likewise, only the declaration of IndexCopyBackward remains in the header.
template<typename xpu>
void IndexCopyBackward(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs);

inline bool IndexCopyShape(const nnvm::NodeAttrs& attrs,
mxnet::ShapeVector *in_attrs,
120 changes: 120 additions & 0 deletions src/operator/contrib/index_copy.cc
@@ -26,6 +26,122 @@
namespace mxnet {
namespace op {

struct index_copy_fwd_cpu {
template<typename DType, typename IType>
static void Map(int i,
const DType* new_tensor,
const IType* idx,
DType* out_tensor,
int dim_size) {
DType* out_ptr = out_tensor + static_cast<int>(idx[i]) * dim_size;
const DType* new_ptr = new_tensor + i * dim_size;
std::memcpy(out_ptr, new_ptr, sizeof(DType) * dim_size);
}
};
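
The CPU speedup comes from this kernel's shape: one Map call per index entry, with the whole dim_size-long row copied by a single contiguous std::memcpy, instead of the old per-element KERNEL_ASSIGN loop in index_copy-inl.h. A minimal standalone sketch of that per-row strategy (toy data and names, not PR code):

// Toy sketch of the per-row copy strategy; data and names are illustrative.
// Each loop iteration corresponds to one Kernel Map call in the PR.
#include <cstdio>
#include <cstring>

int main() {
  const int dim_size = 3;
  float out[9] = {0};                              // 3 output rows, zero-filled
  const float new_tensor[6] = {1, 2, 3, 4, 5, 6};  // 2 rows to scatter
  const int idx[2] = {2, 0};                       // destination row per source row
  for (int i = 0; i < 2; ++i) {                    // parallelized by Kernel::Launch
    std::memcpy(out + idx[i] * dim_size, new_tensor + i * dim_size,
                sizeof(float) * dim_size);         // one contiguous row copy
  }
  for (float v : out) std::printf("%g ", v);       // prints: 4 5 6 0 0 0 1 2 3
  return 0;
}
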

template<>
void IndexCopyForward<cpu>(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
using namespace mshadow;
using namespace mxnet_op;
CHECK_EQ(inputs.size(), 3U);
CHECK_EQ(outputs.size(), 1U);
CHECK_EQ(req.size(), 1U);
CHECK(req[0] != kAddTo);
if (req[0] == kNullOp) return;
mshadow::Stream<cpu> *s = ctx.get_stream<cpu>();
const TBlob& out = outputs[0];
const TBlob& original_tensor = inputs[0];
const TBlob& idx_vector = inputs[1];
const TBlob& copied_tensor = inputs[2];
int dim_size = inputs[2].Size() / inputs[1].Size();
// copy original tensor to output
copy(s, out, original_tensor);
// index copy
MSHADOW_TYPE_SWITCH(out.type_flag_, DType, {
MSHADOW_TYPE_SWITCH(idx_vector.type_flag_, IType, {
Kernel<index_copy_fwd_cpu, cpu>::Launch(
s, idx_vector.Size(), copied_tensor.dptr<DType>(),
idx_vector.dptr<IType>(), out.dptr<DType>(), dim_size);
});
});
}

struct index_copy_bwd_cpu {
template<typename DType, typename IType>
static void Map(int i,
const DType* out_tensor_grad,
DType* orig_tensor_grad,
DType* new_tensor_grad,
const IType* idx,
int dim_size,
int idx_size,
OpReqType orig_req,
OpReqType new_req) {
const int index = static_cast<int>(idx[i]);
DType* new_ptr = new_tensor_grad + i * dim_size;
DType* orig_ptr = orig_tensor_grad + index * dim_size;
const DType* src_ptr = out_tensor_grad + index * dim_size;
for (int iter = 0; iter < dim_size; ++iter) {
KERNEL_ASSIGN(new_ptr[iter], new_req, src_ptr[iter]);
}
if (orig_req == kAddTo) {
for (int iter = 0; iter < dim_size; ++iter) {
orig_ptr[iter] -= src_ptr[iter];
}
} else if (orig_req == kNullOp) {
return;
} else {
std::memset(orig_ptr, 0, sizeof(DType) * dim_size);
}
}
};
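
The backward rule implemented here: the gradient for new_tensor is the slice of out_grad at the indexed rows, while the gradient for the original tensor is out_grad with those rows zeroed, since the forward pass overwrote them (kAddTo instead subtracts their contribution back off, handled below). A small self-contained check of the kWriteTo case, with assumed toy shapes:

// Toy check of the kWriteTo backward rule; values are illustrative only.
#include <cstdio>

int main() {
  const int dim_size = 2, idx[1] = {1};            // one copied row: row 1
  const float out_grad[6] = {1, 2, 3, 4, 5, 6};    // 3 rows of gradient
  float in_grad_1[6], in_grad_2[2];
  for (int i = 0; i < 6; ++i) in_grad_1[i] = out_grad[i];  // start from out_grad
  for (int p = 0; p < 1; ++p) {
    for (int d = 0; d < dim_size; ++d) {
      in_grad_2[p * dim_size + d] = out_grad[idx[p] * dim_size + d];
      in_grad_1[idx[p] * dim_size + d] = 0;        // overwritten row: no gradient
    }
  }
  // prints: in_grad_1 = 1 2 0 0 5 6 | in_grad_2 = 3 4
  for (float v : in_grad_1) std::printf("%g ", v);
  std::printf("| ");
  for (float v : in_grad_2) std::printf("%g ", v);
  std::printf("\n");
  return 0;
}
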

template<>
void IndexCopyBackward<cpu>(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
using namespace mshadow;
using namespace mxnet_op;
CHECK_EQ(inputs.size(), 4U);
CHECK_EQ(outputs.size(), 3U);
Stream<cpu> *s = ctx.get_stream<cpu>();
const TBlob& out_grad = inputs[0];
const TBlob& index = inputs[2];
const TBlob& in_grad_1 = outputs[0];
const TBlob& in_grad_2 = outputs[2];
int dim_size = inputs[3].Size() / inputs[2].Size();
int index_size = inputs[2].Size();
OpReqType orig_req = req[0];
OpReqType new_req = req[2];
// index_copy_backward
MSHADOW_TYPE_SWITCH(out_grad.type_flag_, DType, {
MSHADOW_TYPE_SWITCH(index.type_flag_, IType, {
switch (orig_req) {
case kNullOp:
break;
case kWriteTo:
case kWriteInplace:
copy(s, in_grad_1, out_grad);
break;
case kAddTo:
Kernel<op_with_req<op::mshadow_op::plus, kWriteInplace>, cpu>::Launch(
s, out_grad.Size(), in_grad_1.dptr<DType>(),
out_grad.dptr<DType>(), in_grad_1.dptr<DType>());
}
Kernel<index_copy_bwd_cpu, cpu>::Launch(
s, index_size, out_grad.dptr<DType>(),
in_grad_1.dptr<DType>(), in_grad_2.dptr<DType>(),
index.dptr<IType>(), dim_size, index_size, orig_req, new_req);
});
});
}
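
Note the two-phase kAddTo handling: the op_with_req<plus> launch first accumulates all of out_grad into in_grad_1, and index_copy_bwd_cpu then subtracts out_grad back off each indexed row, so the net effect is accumulation everywhere except the overwritten rows. A quick arithmetic check of that fix-up, on toy scalar rows:

// Arithmetic check of the kAddTo fix-up (toy values, dim_size = 1): adding
// out_grad everywhere and then subtracting it from the indexed row leaves
// that row's previously accumulated gradient unchanged.
#include <cstdio>

int main() {
  float in_grad_1[3] = {10, 20, 30};               // previously accumulated grads
  const float out_grad[3] = {1, 2, 3};
  const int idx = 1;                               // row 1 was overwritten
  for (int i = 0; i < 3; ++i) in_grad_1[i] += out_grad[i];  // phase 1: add all
  in_grad_1[idx] -= out_grad[idx];                 // phase 2: undo indexed row
  // prints: 11 20 33  (row 1 untouched, others accumulated)
  for (float v : in_grad_1) std::printf("%g ", v);
  return 0;
}
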

static bool IndexCopyType(const nnvm::NodeAttrs& attrs,
std::vector<int> *in_attrs,
std::vector<int> *out_attrs) {
@@ -71,6 +71,10 @@ Examples::
.set_attr<nnvm::FInferType>("FInferType", IndexCopyType)
.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_contrib_backward_index_copy"})
.set_attr<FCompute>("FCompute<cpu>", IndexCopyForward<cpu>)
.set_attr<nnvm::FListInputNames>("FListInputNames",
[](const NodeAttrs& attrs) {
return std::vector<std::string>{"old_tensor", "index_vector", "new_tensor"};
})
.add_argument("old_tensor", "NDArray-or-Symbol", "Old tensor")
.add_argument("index_vector", "NDArray-or-Symbol", "Index vector")
.add_argument("new_tensor", "NDArray-or-Symbol", "New tensor to be copied");
110 changes: 109 additions & 1 deletion src/operator/contrib/index_copy.cu
@@ -18,14 +18,122 @@
*/

/*!
* \file index_copy.cc
* \file index_copy.cu
* \brief
*/
#include "./index_copy-inl.h"

namespace mxnet {
namespace op {

struct index_copy_fwd_gpu {
template<typename DType, typename IType>
MSHADOW_XINLINE static void Map(int i,
const DType* new_tensor,
const IType* idx,
DType* out_tensor,
int dim_size) {
int index = static_cast<int>(idx[i / dim_size]);
out_tensor[index * dim_size + i % dim_size] = new_tensor[i];
}
};
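
Unlike the CPU kernel, which parallelizes over index entries, this one is launched with one thread per element of the new tensor (copied_tensor.Size() in the launch below), keeping per-thread work uniform and fine-grained on the GPU. A toy trace of the element-wise mapping (illustrative data and names):

// Toy trace of the GPU element-wise mapping: thread i handles one element of
// new_tensor and scatters it into out. Data and names are illustrative.
#include <cstdio>

int main() {
  const int dim_size = 3;
  const int idx[2] = {2, 0};
  const float new_tensor[6] = {1, 2, 3, 4, 5, 6};
  float out[9] = {0};
  for (int i = 0; i < 6; ++i) {                    // each i would be one GPU thread
    const int index = idx[i / dim_size];           // index entry for this element
    out[index * dim_size + i % dim_size] = new_tensor[i];
  }
  for (float v : out) std::printf("%g ", v);       // prints: 4 5 6 0 0 0 1 2 3
  return 0;
}
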

template<>
void IndexCopyForward<gpu>(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
using namespace mshadow;
using namespace mxnet_op;
CHECK_EQ(inputs.size(), 3U);
CHECK_EQ(outputs.size(), 1U);
CHECK_EQ(req.size(), 1U);
CHECK(req[0] != kAddTo);
if (req[0] == kNullOp) return;
mshadow::Stream<gpu> *s = ctx.get_stream<gpu>();
const TBlob& out = outputs[0];
const TBlob& original_tensor = inputs[0];
const TBlob& idx_vector = inputs[1];
const TBlob& copied_tensor = inputs[2];
int dim_size = inputs[2].Size() / inputs[1].Size();
// copy original tensor to output
copy(s, out, original_tensor);
// index copy
MSHADOW_TYPE_SWITCH(out.type_flag_, DType, {
MSHADOW_TYPE_SWITCH(idx_vector.type_flag_, IType, {
Kernel<index_copy_fwd_gpu, gpu>::Launch(
s, copied_tensor.Size(), copied_tensor.dptr<DType>(),
idx_vector.dptr<IType>(), out.dptr<DType>(), dim_size);
});
});
}

struct index_copy_bwd_gpu {
template<typename DType, typename IType>
MSHADOW_XINLINE static void Map(int i,
const DType* out_grad,
DType* orig_grad,
DType* new_grad,
const IType* idx,
int dim_size,
int idx_size,
OpReqType orig_req,
OpReqType new_req) {
const int index = static_cast<int>(idx[i / dim_size]);
KERNEL_ASSIGN(new_grad[i], new_req, out_grad[index * dim_size + i % dim_size]);
if (orig_req == kAddTo) {
// Subtract out_grad directly (as the CPU path does via src_ptr) so the
// fix-up stays correct even when new_req is kNullOp and new_grad is stale.
orig_grad[index * dim_size + i % dim_size] -= out_grad[index * dim_size + i % dim_size];
} else if (orig_req == kNullOp) {
return;
} else {
orig_grad[index * dim_size + i % dim_size] = 0;
}
}
};

template<>
void IndexCopyBackward<gpu>(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
using namespace mshadow;
using namespace mxnet_op;
CHECK_EQ(inputs.size(), 4U);
CHECK_EQ(outputs.size(), 3U);
Stream<gpu> *s = ctx.get_stream<gpu>();
const TBlob& out_grad = inputs[0];
const TBlob& index = inputs[2];
const TBlob& in_grad_1 = outputs[0];
const TBlob& in_grad_2 = outputs[2];
int dim_size = inputs[3].Size() / inputs[2].Size();
int index_size = inputs[2].Size();
OpReqType orig_req = req[0];
OpReqType new_req = req[2];
// index_copy_backward
MSHADOW_TYPE_SWITCH(out_grad.type_flag_, DType, {
MSHADOW_TYPE_SWITCH(index.type_flag_, IType, {
switch (orig_req) {
case kNullOp:
break;
case kWriteTo:
case kWriteInplace:
copy(s, in_grad_1, out_grad);
break;
case kAddTo:
Kernel<op_with_req<op::mshadow_op::plus, kWriteInplace>, gpu>::Launch(
s, out_grad.Size(), in_grad_1.dptr<DType>(),
out_grad.dptr<DType>(), in_grad_1.dptr<DType>());
}
Kernel<index_copy_bwd_gpu, gpu>::Launch(
s, in_grad_2.Size(), out_grad.dptr<DType>(),
in_grad_1.dptr<DType>(), in_grad_2.dptr<DType>(),
index.dptr<IType>(), dim_size, index_size, orig_req, new_req);
});
});
}

NNVM_REGISTER_OP(_contrib_index_copy)
.set_attr<FCompute>("FCompute<gpu>", IndexCopyForward<gpu>);
