Support Add Sub Mul Max Min Pow binary functors in elementwise system #33050

Merged
merged 27 commits on Jun 2, 2021
Changes from 12 commits

Commits (27)
e4d29f3
First Commit.
JamesLim-sy May 21, 2021
3f243ed
First Commit.
JamesLim-sy May 21, 2021
bc12b1b
Debugging the multiplier bugs
JamesLim-sy May 23, 2021
8290346
Adding Max_Min operators
JamesLim-sy May 23, 2021
49c221b
Fix multiplier bugs and support Max/Min OPs
JamesLim-sy May 23, 2021
dd24d12
Fix multiplier bugs and support Max/Min OPs
JamesLim-sy May 23, 2021
173ee57
Delete the useless code in elementwise_mul_op.cu
JamesLim-sy May 23, 2021
0a7bfef
Delete the useless code in elementwise_mul_op.cu
JamesLim-sy May 23, 2021
07b3797
Delete the useless code in elementwise_mul_op.cu
JamesLim-sy May 23, 2021
a16ba39
Merge branch 'Adding_binary_functor_support' of https://github.com/Ja…
JamesLim-sy May 23, 2021
9bca0af
Merge broadcast update with OutType template argument.
JamesLim-sy May 24, 2021
b5182f1
Adjust elementwise-functor location
JamesLim-sy May 24, 2021
9d46543
First commit
JamesLim-sy May 25, 2021
74e4179
Trigger a rerun
JamesLim-sy May 25, 2021
656ac99
To avoid partial specialization bugs which happened in PR-CI-ROCM
JamesLim-sy May 26, 2021
585566f
Avoid kUnary instantiation of LaunchElementwiseCudaKernel at compile …
JamesLim-sy May 30, 2021
b9c5ea5
Refine the wrapper of binary ops
JamesLim-sy May 30, 2021
25d290e
Refine the wrapper of binary ops
JamesLim-sy May 30, 2021
0e4a011
Fix bugs
JamesLim-sy May 31, 2021
d9c70ec
Refine wrapper of broadcast and add CUDA op
JamesLim-sy May 31, 2021
ce5a717
Fix bugs
JamesLim-sy May 31, 2021
950965b
adding pow
JamesLim-sy Jun 1, 2021
1f72b51
adding pow
JamesLim-sy Jun 1, 2021
90a0b29
Merge branch 'Adding_binary_functor_support' of https://github.com/Ja…
JamesLim-sy Jun 1, 2021
5b146cd
Fix header quote sort
JamesLim-sy Jun 1, 2021
f5a2ce7
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
JamesLim-sy Jun 2, 2021
cd40092
Refine wrapper
JamesLim-sy Jun 2, 2021
4 changes: 2 additions & 2 deletions paddle/fluid/operators/elementwise/elementwise_add_op.cu
@@ -29,11 +29,11 @@ namespace operators {
1. For Unary Op, the length of input array is 1,
e.g. Relu: return args[0] > 0 ? args[0] : 0;
2. For Binary Op, the length of input array is 2,
e.g. Add: return args[0] + args[1];
e.g. Add: return args[0] expr args[1];
*/
template <typename T>
struct CudaAddFunctor {
__device__ __forceinline__ T operator()(const T* args) const {
inline HOSTDEVICE T operator()(const T* args) const {
return args[0] + args[1];
}
};
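
As a minimal sketch of the calling convention described in the comment above (illustrative only; the real LaunchElementwiseCudaKernel also handles vectorized loads and broadcasting), a naive same-dims launcher would invoke such a functor like this:

template <typename T, typename Functor>
__global__ void NaiveBinaryKernel(const T* x, const T* y, T* out, int n,
                                  Functor func) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    T args[2] = {x[i], y[i]};  // length-2 input array for a binary op
    out[i] = func(args);       // e.g. CudaAddFunctor returns args[0] + args[1]
  }
}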
37 changes: 37 additions & 0 deletions paddle/fluid/operators/elementwise/elementwise_max_op.cu
@@ -12,9 +12,46 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_max_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h"

namespace ops = paddle::operators;

namespace paddle {
namespace operators {

template <typename T>
struct CudaMaxFunctor {
inline HOSTDEVICE T operator()(const T* args) const {
return (args[0] > args[1] ? args[0] : args[1]);
}
};

template <typename T>
class ElementwiseMaxKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<framework::LoDTensor>("X");
auto* y = ctx.Input<framework::LoDTensor>("Y");
auto* z = ctx.Output<framework::LoDTensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
axis = axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis;
Contributor

Could this axis conversion also be moved into LaunchElementwiseCudaKernel?
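
A rough sketch of that suggestion (hypothetical variant; it assumes the launcher otherwise keeps the parameters used in this PR):

// Hypothetical: let the launcher derive the default axis itself, so each
// kernel stops repeating the |rank(x) - rank(y)| conversion.
template <ElementwiseType ET, typename InT, typename OutT, typename Functor>
void LaunchElementwiseCudaKernel(
    const platform::CUDADeviceContext& ctx,
    const std::vector<const framework::Tensor*>& ins,
    std::vector<framework::Tensor*>* outs, int axis, Functor func) {
  if (axis == -1) {
    axis = std::abs(ins[0]->dims().size() - ins[1]->dims().size());
  }
  // ... same-dims / broadcast dispatch as before ...
}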


std::vector<const framework::Tensor*> ins = {x, y};
std::vector<framework::Tensor*> outs = {z};
const auto& cuda_ctx =
ctx.template device_context<platform::CUDADeviceContext>();

LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
cuda_ctx, ins, &outs, axis, CudaMaxFunctor<T>());
}
};

} // namespace operators
} // namespace paddle

REGISTER_OP_CUDA_KERNEL(
elementwise_max,
ops::ElementwiseMaxKernel<paddle::platform::CUDADeviceContext, float>,
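
For readers following the axis logic above: when the attribute is left at -1, the default is the rank difference, which aligns the lower-rank operand with the trailing dimensions of the higher-rank one. An illustrative case:

// x dims: [2, 3, 4, 5] (rank 4), y dims: [4, 5] (rank 2), axis attr = -1
// => axis = |4 - 2| = 2, so y broadcasts against x's trailing [4, 5] block.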
4 changes: 2 additions & 2 deletions paddle/fluid/operators/elementwise/elementwise_max_op.h
@@ -35,8 +35,8 @@ class ElementwiseMaxKernel : public framework::OpKernel<T> {

z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<MaxFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
MaxFunctor<T>(), z);
ElementwiseComputeEx<MaxFunctor<T>, platform::CPUDeviceContext, T>(
Contributor

This is a generic implementation, not a specialized one; don't change it here. If you later want to remove the GPU implementation of ElementwiseComputeEx, you can declare ElementwiseMaxKernel in the .h file, specialize the CPU kernel in the .cc file, and specialize the GPU kernel in the .cu file.

Contributor Author

Revised as suggested.
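
A sketch of the layout the reviewer describes (the file split shown is illustrative, not the final patch):

// elementwise_max_op.h: generic declaration only.
template <typename DeviceContext, typename T>
class ElementwiseMaxKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override;
};

// elementwise_max_op.cc: CPU specialization via ElementwiseComputeEx.
// elementwise_max_op.cu: GPU specialization via LaunchElementwiseCudaKernel.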

ctx, x, y, axis, MaxFunctor<T>(), z);
}
};

37 changes: 37 additions & 0 deletions paddle/fluid/operators/elementwise/elementwise_min_op.cu
@@ -12,9 +12,46 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_min_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h"

namespace ops = paddle::operators;

namespace paddle {
namespace operators {

template <typename T>
struct CudaMinFunctor {
inline HOSTDEVICE T operator()(const T* args) const {
return (args[0] > args[1] ? args[1] : args[0]);
}
};

template <typename T>
class ElementwiseMinKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<framework::LoDTensor>("X");
auto* y = ctx.Input<framework::LoDTensor>("Y");
auto* z = ctx.Output<framework::LoDTensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
axis = axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis;

std::vector<const framework::Tensor*> ins = {x, y};
std::vector<framework::Tensor*> outs = {z};
const auto& cuda_ctx =
ctx.template device_context<platform::CUDADeviceContext>();

LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
cuda_ctx, ins, &outs, axis, CudaMinFunctor<T>());
}
};

} // namespace operators
} // namespace paddle

REGISTER_OP_CUDA_KERNEL(
elementwise_min,
ops::ElementwiseMinKernel<paddle::platform::CUDADeviceContext, float>,
4 changes: 2 additions & 2 deletions paddle/fluid/operators/elementwise/elementwise_min_op.h
@@ -35,8 +35,8 @@ class ElementwiseMinKernel : public framework::OpKernel<T> {

z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<MinFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
MinFunctor<T>(), z);
ElementwiseComputeEx<MinFunctor<T>, platform::CPUDeviceContext, T>(
ctx, x, y, axis, MinFunctor<T>(), z);
}
};

80 changes: 52 additions & 28 deletions paddle/fluid/operators/elementwise/elementwise_mul_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h"
#include "paddle/fluid/platform/complex128.h"
#include "paddle/fluid/platform/complex64.h"
@@ -25,37 +26,60 @@ namespace paddle {
namespace operators {

template <typename T>
struct SameDimsElemwiseMul<platform::CUDADeviceContext, T> {
void operator()(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
framework::Tensor* z) {
MulRangeFunctor<T> functor(x->data<T>(), y->data<T>(), z->data<T>());
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
platform::ForRange<platform::CUDADeviceContext> for_range(dev_ctx,
x->numel());
for_range(functor);
struct CudaMulFunctor {
inline HOSTDEVICE T operator()(const T* args) const {
return args[0] * args[1];
}
};

template <>
struct SameDimsElemwiseMul<platform::CUDADeviceContext, platform::float16> {
void operator()(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
framework::Tensor* z) {
auto size = x->numel();
dim3 grid_size = dim3(((size + 7) / 8 + PADDLE_CUDA_THREAD_SIZE - 1) /
PADDLE_CUDA_THREAD_SIZE,
1);
dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1);
const half* x2 =
reinterpret_cast<const half*>(x->data<platform::float16>());
const half* y2 =
reinterpret_cast<const half*>(y->data<platform::float16>());
half* z2 = reinterpret_cast<half*>(z->data<platform::float16>());
SameDimsElemwiseMulCUDAKernel<<<
grid_size, block_size, 0,
ctx.template device_context<platform::CUDADeviceContext>().stream()>>>(
x2, y2, z2, size);
template <typename T>
class ElementwiseMulKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto x_var = ctx.InputVar("X");
PADDLE_ENFORCE_EQ(x_var != nullptr, true,
Contributor

This check could use PADDLE_ENFORCE_NOT_NULL.

Contributor Author

Revised as suggested.
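
The suggested check would read roughly as follows (a sketch; the message text is carried over from the diff):

auto* x_var = ctx.InputVar("X");
PADDLE_ENFORCE_NOT_NULL(
    x_var, platform::errors::InvalidArgument(
               "Cannot get input Variable X, Variable name = %s.",
               ctx.InputName("X")));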

platform::errors::InvalidArgument(
"Cannot get input Variable X, Variable name = %s.",
ctx.InputName("X")));
auto* y = ctx.Input<framework::LoDTensor>("Y");
framework::Tensor x, *z;

if (x_var->IsType<framework::SelectedRows>()) {
PADDLE_ENFORCE_EQ(y->dims().size() == 1 && y->dims()[0] == 1, true,
platform::errors::InvalidArgument(
"For elementwise_op, if X is Sparse, Y must be "
"scalar. But reveived the size of Y = %s.",
y->dims().size()));
auto& x_sele = x_var->Get<framework::SelectedRows>();
auto out_sele = ctx.Output<framework::SelectedRows>("Out");
x = x_sele.value();
out_sele->set_rows(x_sele.rows());
out_sele->set_height(x_sele.height());
out_sele->mutable_value()->Resize(x_sele.value().dims());
out_sele->mutable_value()->mutable_data(ctx.GetPlace(), x.type());
z = ctx.Output<framework::SelectedRows>("Out")->mutable_value();
} else if (x_var->IsType<framework::LoDTensor>()) {
x = x_var->Get<framework::LoDTensor>();
z = ctx.Output<framework::LoDTensor>("Out");
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"X's type[%s] is not supported by elementwise_op. X's type should be "
"LoDTensor or SelectedRows.",
framework::ToTypeName(x_var->Type())));
}
Contributor

Don't copy and paste large blocks of code; wrap L41 - L70 in a function.

Contributor

L41 - L70 could also be wrapped into the PackTensorsIntoVector function.
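
A sketch of the helper the reviewers have in mind (the name comes from the review; the signature shown here is an assumption):

// Hypothetical: hoists the X-unpacking (LoDTensor vs. SelectedRows) and
// output allocation out of each Compute, so kernels only build ins/outs.
template <typename T>
void PackTensorsIntoVector(const framework::ExecutionContext& ctx,
                           std::vector<const framework::Tensor*>* ins,
                           std::vector<framework::Tensor*>* outs) {
  // resolve X (dense or sparse) and Y, allocate Out, then:
  //   ins->emplace_back(x); ins->emplace_back(y); outs->emplace_back(z);
}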

z->mutable_data<T>(ctx.GetPlace());

int axis = ctx.Attr<int>("axis");
axis = axis == -1 ? std::abs(x.dims().size() - y->dims().size()) : axis;

std::vector<const framework::Tensor*> ins = {&x, y};
std::vector<framework::Tensor*> outs = {z};
const auto& cuda_ctx =
ctx.template device_context<platform::CUDADeviceContext>();

LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
cuda_ctx, ins, &outs, axis, CudaMulFunctor<T>());
}
};

4 changes: 2 additions & 2 deletions paddle/fluid/operators/elementwise/elementwise_mul_op.h
@@ -119,10 +119,10 @@ class ElementwiseMulKernel : public framework::OpKernel<T> {
z->mutable_data<T>(ctx.GetPlace());
auto dims_equal = x.dims() == y->dims();
if (dims_equal) {
SameDimsElemwiseMul<DeviceContext, T> same_dims_mul;
SameDimsElemwiseMul<platform::CPUDeviceContext, T> same_dims_mul;
same_dims_mul(ctx, &x, y, z);
} else {
default_elementwise_mul<DeviceContext, T>(ctx, &x, y, z);
default_elementwise_mul<platform::CPUDeviceContext, T>(ctx, &x, y, z);
}
}
};
51 changes: 23 additions & 28 deletions paddle/fluid/operators/elementwise/elementwise_sub_op.cu
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h"
@@ -25,37 +26,31 @@ namespace paddle {
namespace operators {

template <typename T>
struct SameDimsElemwiseSub<platform::CUDADeviceContext, T> {
void operator()(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
framework::Tensor* z) {
SubRangeFunctor<T> functor(x->data<T>(), y->data<T>(), z->data<T>());
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
platform::ForRange<platform::CUDADeviceContext> for_range(dev_ctx,
x->numel());
for_range(functor);
struct CudaSubFunctor {
inline HOSTDEVICE T operator()(const T* args) const {
return args[0] - args[1];
}
};

template <>
struct SameDimsElemwiseSub<platform::CUDADeviceContext, platform::float16> {
void operator()(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
framework::Tensor* z) {
auto size = x->numel();
dim3 grid_size = dim3(((size + 7) / 8 + PADDLE_CUDA_THREAD_SIZE - 1) /
PADDLE_CUDA_THREAD_SIZE,
1);
dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1);
const half* x2 =
reinterpret_cast<const half*>(x->data<platform::float16>());
const half* y2 =
reinterpret_cast<const half*>(y->data<platform::float16>());
half* z2 = reinterpret_cast<half*>(z->data<platform::float16>());
SameDimsElemwiseSubCUDAKernel<<<
grid_size, block_size, 0,
ctx.template device_context<platform::CUDADeviceContext>().stream()>>>(
x2, y2, z2, size);
template <typename T>
class ElementwiseSubKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<framework::LoDTensor>("X");
auto* y = ctx.Input<framework::LoDTensor>("Y");
auto* z = ctx.Output<framework::LoDTensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
axis = axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis;

std::vector<const framework::Tensor*> ins = {x, y};
std::vector<framework::Tensor*> outs = {z};
const auto& cuda_ctx =
ctx.template device_context<platform::CUDADeviceContext>();

LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
cuda_ctx, ins, &outs, axis, CudaSubFunctor<T>());
}
};

4 changes: 2 additions & 2 deletions paddle/fluid/operators/elementwise/elementwise_sub_op.h
@@ -55,10 +55,10 @@ class ElementwiseSubKernel : public framework::OpKernel<T> {

auto dims_equal = x->dims() == y->dims();
if (dims_equal) {
SameDimsElemwiseSub<DeviceContext, T> same_dims_sub;
SameDimsElemwiseSub<platform::CPUDeviceContext, T> same_dims_sub;
same_dims_sub(ctx, x, y, z);
} else {
default_elementwise_sub<DeviceContext, T>(ctx, x, y, z);
default_elementwise_sub<platform::CPUDeviceContext, T>(ctx, x, y, z);
}
}
};