Support Add Sub Mul Max Min Pow binary functors in elementwise system #33050

Merged
merged 27 commits on Jun 2, 2021
Changes from 12 commits

Commits (27)
e4d29f3
First Commit.
JamesLim-sy May 21, 2021
3f243ed
First Commit.
JamesLim-sy May 21, 2021
bc12b1b
Debugging the multiplier bugs
JamesLim-sy May 23, 2021
8290346
Adding Max_Min operators
JamesLim-sy May 23, 2021
49c221b
Fix multiplier bugs and support Max/Min OPs
JamesLim-sy May 23, 2021
dd24d12
Fix multiplier bugs and support Max/Min OPs
JamesLim-sy May 23, 2021
173ee57
Delete the useless code in elementwise_mul_op.cu
JamesLim-sy May 23, 2021
0a7bfef
Delete the useless code in elementwise_mul_op.cu
JamesLim-sy May 23, 2021
07b3797
Delete the useless code in elementwise_mul_op.cu
JamesLim-sy May 23, 2021
a16ba39
Merge branch 'Adding_binary_functor_support' of https://github.com/Ja…
JamesLim-sy May 23, 2021
9bca0af
Merge broadcast update with OutType template argument.
JamesLim-sy May 24, 2021
b5182f1
Adjust elementwise-functor location
JamesLim-sy May 24, 2021
9d46543
First commit
JamesLim-sy May 25, 2021
74e4179
Trigger a rerun
JamesLim-sy May 25, 2021
656ac99
To avoid partial specialization bugs which happened in PR-CI-ROCM
JamesLim-sy May 26, 2021
585566f
Avoid kUnary instantiation of LaunchElementwiseCudaKernel at compile …
JamesLim-sy May 30, 2021
b9c5ea5
Refine the wrapper of binary ops
JamesLim-sy May 30, 2021
25d290e
Refine the wrapper of binary ops
JamesLim-sy May 30, 2021
0e4a011
Fix bugs
JamesLim-sy May 31, 2021
d9c70ec
Refine wrapper of broadcast and add CUDA op
JamesLim-sy May 31, 2021
ce5a717
Fix bugs
JamesLim-sy May 31, 2021
950965b
adding pow
JamesLim-sy Jun 1, 2021
1f72b51
adding pow
JamesLim-sy Jun 1, 2021
90a0b29
Merge branch 'Adding_binary_functor_support' of https://github.com/Ja…
JamesLim-sy Jun 1, 2021
5b146cd
Fix header quote sort
JamesLim-sy Jun 1, 2021
f5a2ce7
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
JamesLim-sy Jun 2, 2021
cd40092
Refine wrapper
JamesLim-sy Jun 2, 2021
4 changes: 2 additions & 2 deletions paddle/fluid/operators/elementwise/elementwise_add_op.cu
@@ -29,11 +29,11 @@ namespace operators {
1. For Unary Op, the length of input array is 1,
e.g. Relu: return args[0] > 0 ? args[0] : 0;
2. For Binary Op, the length of input array is 2,
e.g. Add: return args[0] + args[1];
e.g. Add: return args[0] expr args[1];
*/
template <typename T>
struct CudaAddFunctor {
__device__ __forceinline__ T operator()(const T* args) const {
inline HOSTDEVICE T operator()(const T* args) const {
return args[0] + args[1];
}
};
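
As a minimal sketch of the calling convention described in the comment above (illustrative only; the real LaunchElementwiseCudaKernel also handles vectorized loads and broadcasting), a naive same-dims launcher would invoke such a functor like this:

template <typename T, typename Functor>
__global__ void NaiveBinaryKernel(const T* x, const T* y, T* out, int n,
                                  Functor func) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    T args[2] = {x[i], y[i]};  // length-2 input array for a binary op
    out[i] = func(args);       // e.g. CudaAddFunctor returns args[0] + args[1]
  }
}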
37 changes: 37 additions & 0 deletions paddle/fluid/operators/elementwise/elementwise_max_op.cu
@@ -12,9 +12,46 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_max_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h"

namespace ops = paddle::operators;

namespace paddle {
namespace operators {

template <typename T>
struct CudaMaxFunctor {
inline HOSTDEVICE T operator()(const T* args) const {
return (args[0] > args[1] ? args[0] : args[1]);
}
};

template <typename T>
class ElementwiseMaxKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<framework::LoDTensor>("X");
auto* y = ctx.Input<framework::LoDTensor>("Y");
auto* z = ctx.Output<framework::LoDTensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
axis = axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis;
Contributor

Could this axis conversion also be moved into LaunchElementwiseCudaKernel?
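
A rough sketch of that suggestion (hypothetical variant; it assumes the launcher otherwise keeps the parameters used in this PR):

// Hypothetical: let the launcher derive the default axis itself, so each
// kernel stops repeating the |rank(x) - rank(y)| conversion.
template <ElementwiseType ET, typename InT, typename OutT, typename Functor>
void LaunchElementwiseCudaKernel(
    const platform::CUDADeviceContext& ctx,
    const std::vector<const framework::Tensor*>& ins,
    std::vector<framework::Tensor*>* outs, int axis, Functor func) {
  if (axis == -1) {
    axis = std::abs(ins[0]->dims().size() - ins[1]->dims().size());
  }
  // ... same-dims / broadcast dispatch as before ...
}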


std::vector<const framework::Tensor*> ins = {x, y};
std::vector<framework::Tensor*> outs = {z};
const auto& cuda_ctx =
ctx.template device_context<platform::CUDADeviceContext>();

LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
cuda_ctx, ins, &outs, axis, CudaMaxFunctor<T>());
}
};

} // namespace operators
} // namespace paddle

REGISTER_OP_CUDA_KERNEL(
elementwise_max,
ops::ElementwiseMaxKernel<paddle::platform::CUDADeviceContext, float>,
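
For readers following the axis logic above: when the attribute is left at -1, the default is the rank difference, which aligns the lower-rank operand with the trailing dimensions of the higher-rank one. An illustrative case:

// x dims: [2, 3, 4, 5] (rank 4), y dims: [4, 5] (rank 2), axis attr = -1
// => axis = |4 - 2| = 2, so y broadcasts against x's trailing [4, 5] block.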
4 changes: 2 additions & 2 deletions paddle/fluid/operators/elementwise/elementwise_max_op.h
@@ -35,8 +35,8 @@ class ElementwiseMaxKernel : public framework::OpKernel<T> {

z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<MaxFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
MaxFunctor<T>(), z);
ElementwiseComputeEx<MaxFunctor<T>, platform::CPUDeviceContext, T>(
Contributor

This is a generic implementation, not a specialized one; don't change it here. If you later want to remove the GPU implementation of ElementwiseComputeEx, you can declare ElementwiseMaxKernel in the .h file, specialize the CPU kernel in the .cc file, and specialize the GPU kernel in the .cu file.

Contributor Author

Revised as suggested.
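
A sketch of the layout the reviewer describes (the file split shown is illustrative, not the final patch):

// elementwise_max_op.h: generic declaration only.
template <typename DeviceContext, typename T>
class ElementwiseMaxKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override;
};

// elementwise_max_op.cc: CPU specialization via ElementwiseComputeEx.
// elementwise_max_op.cu: GPU specialization via LaunchElementwiseCudaKernel.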

ctx, x, y, axis, MaxFunctor<T>(), z);
}
};

37 changes: 37 additions & 0 deletions paddle/fluid/operators/elementwise/elementwise_min_op.cu
@@ -12,9 +12,46 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_min_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h"

namespace ops = paddle::operators;

namespace paddle {
namespace operators {

template <typename T>
struct CudaMinFunctor {
inline HOSTDEVICE T operator()(const T* args) const {
return (args[0] > args[1] ? args[1] : args[0]);
}
};

template <typename T>
class ElementwiseMinKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<framework::LoDTensor>("X");
auto* y = ctx.Input<framework::LoDTensor>("Y");
auto* z = ctx.Output<framework::LoDTensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
axis = axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis;

std::vector<const framework::Tensor*> ins = {x, y};
std::vector<framework::Tensor*> outs = {z};
const auto& cuda_ctx =
ctx.template device_context<platform::CUDADeviceContext>();

LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
cuda_ctx, ins, &outs, axis, CudaMinFunctor<T>());
}
};

} // namespace operators
} // namespace paddle

REGISTER_OP_CUDA_KERNEL(
elementwise_min,
ops::ElementwiseMinKernel<paddle::platform::CUDADeviceContext, float>,
4 changes: 2 additions & 2 deletions paddle/fluid/operators/elementwise/elementwise_min_op.h
@@ -35,8 +35,8 @@ class ElementwiseMinKernel : public framework::OpKernel<T> {

z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<MinFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
MinFunctor<T>(), z);
ElementwiseComputeEx<MinFunctor<T>, platform::CPUDeviceContext, T>(
ctx, x, y, axis, MinFunctor<T>(), z);
}
};

80 changes: 52 additions & 28 deletions paddle/fluid/operators/elementwise/elementwise_mul_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h"
#include "paddle/fluid/platform/complex128.h"
#include "paddle/fluid/platform/complex64.h"
@@ -25,37 +26,60 @@ namespace paddle {
namespace operators {

template <typename T>
struct SameDimsElemwiseMul<platform::CUDADeviceContext, T> {
void operator()(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
framework::Tensor* z) {
MulRangeFunctor<T> functor(x->data<T>(), y->data<T>(), z->data<T>());
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
platform::ForRange<platform::CUDADeviceContext> for_range(dev_ctx,
x->numel());
for_range(functor);
struct CudaMulFunctor {
inline HOSTDEVICE T operator()(const T* args) const {
return args[0] * args[1];
}
};

template <>
struct SameDimsElemwiseMul<platform::CUDADeviceContext, platform::float16> {
void operator()(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
framework::Tensor* z) {
auto size = x->numel();
dim3 grid_size = dim3(((size + 7) / 8 + PADDLE_CUDA_THREAD_SIZE - 1) /
PADDLE_CUDA_THREAD_SIZE,
1);
dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1);
const half* x2 =
reinterpret_cast<const half*>(x->data<platform::float16>());
const half* y2 =
reinterpret_cast<const half*>(y->data<platform::float16>());
half* z2 = reinterpret_cast<half*>(z->data<platform::float16>());
SameDimsElemwiseMulCUDAKernel<<<
grid_size, block_size, 0,
ctx.template device_context<platform::CUDADeviceContext>().stream()>>>(
x2, y2, z2, size);
template <typename T>
class ElementwiseMulKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto x_var = ctx.InputVar("X");
PADDLE_ENFORCE_EQ(x_var != nullptr, true,
Contributor

This check could use PADDLE_ENFORCE_NOT_NULL.

Contributor Author

Revised as suggested.
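
The suggested check would read roughly as follows (a sketch; the message text is carried over from the diff):

auto* x_var = ctx.InputVar("X");
PADDLE_ENFORCE_NOT_NULL(
    x_var, platform::errors::InvalidArgument(
               "Cannot get input Variable X, Variable name = %s.",
               ctx.InputName("X")));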

platform::errors::InvalidArgument(
"Cannot get input Variable X, Variable name = %s.",
ctx.InputName("X")));
auto* y = ctx.Input<framework::LoDTensor>("Y");
framework::Tensor x, *z;

if (x_var->IsType<framework::SelectedRows>()) {
PADDLE_ENFORCE_EQ(y->dims().size() == 1 && y->dims()[0] == 1, true,
platform::errors::InvalidArgument(
"For elementwise_op, if X is Sparse, Y must be "
"scalar. But reveived the size of Y = %s.",
y->dims().size()));
auto& x_sele = x_var->Get<framework::SelectedRows>();
auto out_sele = ctx.Output<framework::SelectedRows>("Out");
x = x_sele.value();
out_sele->set_rows(x_sele.rows());
out_sele->set_height(x_sele.height());
out_sele->mutable_value()->Resize(x_sele.value().dims());
out_sele->mutable_value()->mutable_data(ctx.GetPlace(), x.type());
z = ctx.Output<framework::SelectedRows>("Out")->mutable_value();
} else if (x_var->IsType<framework::LoDTensor>()) {
x = x_var->Get<framework::LoDTensor>();
z = ctx.Output<framework::LoDTensor>("Out");
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"X's type[%s] is not supported by elementwise_op. X's type should be "
"LoDTensor or SelectedRows.",
framework::ToTypeName(x_var->Type())));
}
Contributor

Don't copy and paste large blocks of code; wrap L41 - L70 in a function.

Contributor

L41 - L70 could also be wrapped into the PackTensorsIntoVector function.
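
A sketch of the helper the reviewers have in mind (the name comes from the review; the signature shown here is an assumption):

// Hypothetical: hoists the X-unpacking (LoDTensor vs. SelectedRows) and
// output allocation out of each Compute, so kernels only build ins/outs.
template <typename T>
void PackTensorsIntoVector(const framework::ExecutionContext& ctx,
                           std::vector<const framework::Tensor*>* ins,
                           std::vector<framework::Tensor*>* outs) {
  // resolve X (dense or sparse) and Y, allocate Out, then:
  //   ins->emplace_back(x); ins->emplace_back(y); outs->emplace_back(z);
}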

z->mutable_data<T>(ctx.GetPlace());

int axis = ctx.Attr<int>("axis");
axis = axis == -1 ? std::abs(x.dims().size() - y->dims().size()) : axis;

std::vector<const framework::Tensor*> ins = {&x, y};
std::vector<framework::Tensor*> outs = {z};
const auto& cuda_ctx =
ctx.template device_context<platform::CUDADeviceContext>();

LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
cuda_ctx, ins, &outs, axis, CudaMulFunctor<T>());
}
};

4 changes: 2 additions & 2 deletions paddle/fluid/operators/elementwise/elementwise_mul_op.h
@@ -119,10 +119,10 @@ class ElementwiseMulKernel : public framework::OpKernel<T> {
z->mutable_data<T>(ctx.GetPlace());
auto dims_equal = x.dims() == y->dims();
if (dims_equal) {
SameDimsElemwiseMul<DeviceContext, T> same_dims_mul;
SameDimsElemwiseMul<platform::CPUDeviceContext, T> same_dims_mul;
same_dims_mul(ctx, &x, y, z);
} else {
default_elementwise_mul<DeviceContext, T>(ctx, &x, y, z);
default_elementwise_mul<platform::CPUDeviceContext, T>(ctx, &x, y, z);
}
}
};
51 changes: 23 additions & 28 deletions paddle/fluid/operators/elementwise/elementwise_sub_op.cu
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h"
@@ -25,37 +26,31 @@ namespace paddle {
namespace operators {

template <typename T>
struct SameDimsElemwiseSub<platform::CUDADeviceContext, T> {
void operator()(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
framework::Tensor* z) {
SubRangeFunctor<T> functor(x->data<T>(), y->data<T>(), z->data<T>());
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
platform::ForRange<platform::CUDADeviceContext> for_range(dev_ctx,
x->numel());
for_range(functor);
struct CudaSubFunctor {
inline HOSTDEVICE T operator()(const T* args) const {
return args[0] - args[1];
}
};

template <>
struct SameDimsElemwiseSub<platform::CUDADeviceContext, platform::float16> {
void operator()(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
framework::Tensor* z) {
auto size = x->numel();
dim3 grid_size = dim3(((size + 7) / 8 + PADDLE_CUDA_THREAD_SIZE - 1) /
PADDLE_CUDA_THREAD_SIZE,
1);
dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1);
const half* x2 =
reinterpret_cast<const half*>(x->data<platform::float16>());
const half* y2 =
reinterpret_cast<const half*>(y->data<platform::float16>());
half* z2 = reinterpret_cast<half*>(z->data<platform::float16>());
SameDimsElemwiseSubCUDAKernel<<<
grid_size, block_size, 0,
ctx.template device_context<platform::CUDADeviceContext>().stream()>>>(
x2, y2, z2, size);
template <typename T>
class ElementwiseSubKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<framework::LoDTensor>("X");
auto* y = ctx.Input<framework::LoDTensor>("Y");
auto* z = ctx.Output<framework::LoDTensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
axis = axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis;

std::vector<const framework::Tensor*> ins = {x, y};
std::vector<framework::Tensor*> outs = {z};
const auto& cuda_ctx =
ctx.template device_context<platform::CUDADeviceContext>();

LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
cuda_ctx, ins, &outs, axis, CudaSubFunctor<T>());
}
};

4 changes: 2 additions & 2 deletions paddle/fluid/operators/elementwise/elementwise_sub_op.h
@@ -55,10 +55,10 @@ class ElementwiseSubKernel : public framework::OpKernel<T> {

auto dims_equal = x->dims() == y->dims();
if (dims_equal) {
SameDimsElemwiseSub<DeviceContext, T> same_dims_sub;
SameDimsElemwiseSub<platform::CPUDeviceContext, T> same_dims_sub;
same_dims_sub(ctx, x, y, z);
} else {
default_elementwise_sub<DeviceContext, T>(ctx, x, y, z);
default_elementwise_sub<platform::CPUDeviceContext, T>(ctx, x, y, z);
}
}
};