From 598435b9a5c0d504d4052f0f937f7fff52af33bc Mon Sep 17 00:00:00 2001
From: ZzSean <18818272991@163.com>
Date: Wed, 21 Apr 2021 07:32:29 +0000
Subject: [PATCH 1/2] Modify some contents for elementwise op impl

---
 .../elementwise/elementwise_add_op.cu         |  5 +++--
 .../elementwise/elementwise_op_impl.cu.h      | 21 +++++++++----------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu
index 0ca03fc32fbf6..5c444e752e797 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
 #include "paddle/fluid/platform/complex128.h"
 #include "paddle/fluid/platform/complex64.h"
@@ -34,7 +33,9 @@ namespace operators {
 */
 template <typename T>
 struct CudaAddFunctor {
-  inline HOSTDEVICE T operator()(T args[]) const { return args[0] + args[1]; }
+  __device__ __forceinline__ T operator()(const T* args) const {
+    return args[0] + args[1];
+  }
 };
 
 template <typename T>
diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h
index 36add2112974d..0869001d50b7f 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
+
 namespace paddle {
 namespace operators {
 
@@ -90,8 +92,7 @@ struct ElementwiseDataWrapper {
 
 template <ElementwiseType ET, int VecSize, typename T, typename Functor>
 __device__ void VectorizedKernelImpl(
-    ElementwiseDataWrapper<ET, VecSize, T> data, int size, Functor func,
-    int tid) {
+    ElementwiseDataWrapper<ET, VecSize, T> data, Functor func, int tid) {
   using VecType = CudaAlignedVector<T, VecSize>;
   VecType ins_vec[ET];
   VecType out_vec;
@@ -121,10 +122,9 @@ __device__ void VectorizedKernelImpl(
   data.store_vector(out_vec, tid);
 }
 
-template <ElementwiseType ET, typename T, typename Functor>
-__device__ void ScalarKernelImpl(ElementwiseDataWrapper<ET, 1, T> data,
-                                 int size, Functor func, int start,
-                                 int remain) {
+template <ElementwiseType ET, int VecSize, typename T, typename Functor>
+__device__ void ScalarKernelImpl(ElementwiseDataWrapper<ET, VecSize, T> data,
+                                 Functor func, int start, int remain) {
   T ins[ET];
   T out;
 
@@ -146,12 +146,11 @@ __global__ void VectorizedKernel(const T *__restrict__ in0,
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   int remain = size - VecSize * tid;
   remain = remain > 0 ? remain : 0;
+  auto data = ElementwiseDataWrapper<ET, VecSize, T>(out, in0, in1);
   if (remain >= VecSize) {
-    auto data = ElementwiseDataWrapper<ET, VecSize, T>(out, in0, in1);
-    VectorizedKernelImpl(data, size, func, tid);
+    VectorizedKernelImpl(data, func, tid);
   } else {
-    auto data = ElementwiseDataWrapper<ET, 1, T>(out, in0, in1);
-    ScalarKernelImpl(data, size, func, tid * VecSize, remain);
+    ScalarKernelImpl(data, func, tid * VecSize, remain);
   }
 }
 
@@ -162,7 +161,7 @@ __global__ void ScalarKernel(const T *__restrict__ in0,
   auto data = ElementwiseDataWrapper<ET, 1, T>(out, in0, in1);
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   int remain = tid < size ? 1 : 0;
-  ScalarKernelImpl(data, size, func, tid, remain);
+  ScalarKernelImpl(data, func, tid, remain);
 }
 
 template <ElementwiseType ET, typename T, typename Functor>

From fef72c268621feb145ff3600aea63e840516948d Mon Sep 17 00:00:00 2001
From: ZzSean <18818272991@163.com>
Date: Wed, 21 Apr 2021 12:07:21 +0000
Subject: [PATCH 2/2] change include files

---
 .../operators/elementwise/elementwise_op_impl.cu.h | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h
index 0869001d50b7f..321826ec647c9 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h
@@ -13,7 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 
-#include "paddle/fluid/operators/elementwise/elementwise_op.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/float16.h"
+
+#ifdef __HIPCC__
+#define ELEMENTWISE_BLOCK_SIZE 256
+#else
+#define ELEMENTWISE_BLOCK_SIZE 512
+#endif
 
 namespace paddle {
 namespace operators {
@@ -172,7 +181,7 @@ void LaunchElementwiseCudaKernel(
   // calculate the max vec_size for all ins and outs
   auto size = ins[0]->numel();
   int vec_size = GetVectorizedSize<T>(ins, *outs);
-  int block_size = PADDLE_CUDA_THREAD_SIZE;
+  int block_size = ELEMENTWISE_BLOCK_SIZE;
   int grid_size =
       ((size + vec_size - 1) / vec_size + block_size - 1) / block_size;
   const T *in0 = ins[0]->data<T>();
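
Illustrative note, not part of either patch: the series above refines a vectorized
elementwise pattern, an alignas'd vector type for wide loads and stores, a
vectorized fast path, a scalar tail for the final partial vector, and a block
size of 256 under HIP versus 512 under CUDA. The self-contained CUDA sketch
below reproduces that pattern under those assumptions; every name in it
(AlignedVec, AddFunctor, VecBinaryKernel, BLOCK_SIZE) is a hypothetical
stand-in, not Paddle's API.

// Illustrative sketch only; hypothetical names throughout.
#include <cstdio>

#include <cuda_runtime.h>

#define BLOCK_SIZE 512  // the patches use 256 under __HIPCC__, 512 otherwise

// Aligned vector type, same idea as CudaAlignedVector in the patch: alignas
// lets the compiler emit one wide (e.g. 128-bit) load/store for val[].
template <typename T, int Size>
struct alignas(sizeof(T) * Size) AlignedVec {
  T val[Size];
};

// Functor with the pointer-based __device__ signature PATCH 1 introduces.
template <typename T>
struct AddFunctor {
  __device__ __forceinline__ T operator()(const T *args) const {
    return args[0] + args[1];
  }
};

template <int VecSize, typename T, typename Functor>
__global__ void VecBinaryKernel(const T *__restrict__ in0,
                                const T *__restrict__ in1, T *out, int size,
                                Functor func) {
  using VecType = AlignedVec<T, VecSize>;
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  int remain = size - VecSize * tid;  // elements left for this thread
  if (remain >= VecSize) {
    // Vectorized path: one wide load per input, one wide store.
    VecType v0 = reinterpret_cast<const VecType *>(in0)[tid];
    VecType v1 = reinterpret_cast<const VecType *>(in1)[tid];
    VecType vout;
#pragma unroll
    for (int i = 0; i < VecSize; ++i) {
      T args[2] = {v0.val[i], v1.val[i]};
      vout.val[i] = func(args);
    }
    reinterpret_cast<VecType *>(out)[tid] = vout;
  } else if (remain > 0) {
    // Scalar tail: the last active thread finishes element by element.
    for (int idx = VecSize * tid; idx < size; ++idx) {
      T args[2] = {in0[idx], in1[idx]};
      out[idx] = func(args);
    }
  }
}

int main() {
  constexpr int kVecSize = 4;
  const int n = 1003;  // deliberately not a multiple of kVecSize
  float *a, *b, *c;
  cudaMallocManaged(&a, n * sizeof(float));
  cudaMallocManaged(&b, n * sizeof(float));
  cudaMallocManaged(&c, n * sizeof(float));
  for (int i = 0; i < n; ++i) {
    a[i] = 1.0f;
    b[i] = static_cast<float>(i);
  }
  // Same grid arithmetic as LaunchElementwiseCudaKernel after PATCH 2:
  // ceil(n / VecSize) threads total, BLOCK_SIZE threads per block.
  int grid = ((n + kVecSize - 1) / kVecSize + BLOCK_SIZE - 1) / BLOCK_SIZE;
  VecBinaryKernel<kVecSize>
      <<<grid, BLOCK_SIZE>>>(a, b, c, n, AddFunctor<float>());
  cudaDeviceSynchronize();
  printf("c[%d] = %f (expect %f)\n", n - 1, c[n - 1], 1.0f + (n - 1));
  cudaFree(a);
  cudaFree(b);
  cudaFree(c);
  return 0;
}

The sketch also mirrors the structural choice PATCH 1 makes: the input/output
access setup is hoisted above the vectorized/scalar branch (as the single
ElementwiseDataWrapper construction is in VectorizedKernel), so both paths
share one set of pointers and only the access width differs.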