From 598435b9a5c0d504d4052f0f937f7fff52af33bc Mon Sep 17 00:00:00 2001
From: ZzSean <18818272991@163.com>
Date: Wed, 21 Apr 2021 07:32:29 +0000
Subject: [PATCH 1/2] Modify some contents for elementwise op impl

---
 .../elementwise/elementwise_add_op.cu         |  5 +++--
 .../elementwise/elementwise_op_impl.cu.h      | 21 +++++++++----------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu
index 0ca03fc32fbf6..5c444e752e797 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
 #include "paddle/fluid/platform/complex128.h"
 #include "paddle/fluid/platform/complex64.h"
@@ -34,7 +33,9 @@ namespace operators {
 */
 template <typename T>
 struct CudaAddFunctor {
-  inline HOSTDEVICE T operator()(T args[]) const { return args[0] + args[1]; }
+  __device__ __forceinline__ T operator()(const T* args) const {
+    return args[0] + args[1];
+  }
 };
 
 template <typename T>
diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h
index 36add2112974d..0869001d50b7f 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
+
 namespace paddle {
 namespace operators {
 
@@ -90,8 +92,7 @@ struct ElementwiseDataWrapper {
 
 template <ElementwiseType ET, int VecSize, typename T, typename Functor>
 __device__ void VectorizedKernelImpl(
-    ElementwiseDataWrapper<ET, VecSize, T> data, int size, Functor func,
-    int tid) {
+    ElementwiseDataWrapper<ET, VecSize, T> data, Functor func, int tid) {
   using VecType = CudaAlignedVector<T, VecSize>;
   VecType ins_vec[ET];
   VecType out_vec;
@@ -121,10 +122,9 @@ __device__ void VectorizedKernelImpl(
   data.store_vector(out_vec, tid);
 }
 
-template <ElementwiseType ET, typename T, typename Functor>
-__device__ void ScalarKernelImpl(ElementwiseDataWrapper<ET, 1, T> data,
-                                 int size, Functor func, int start,
-                                 int remain) {
+template <ElementwiseType ET, int VecSize, typename T, typename Functor>
+__device__ void ScalarKernelImpl(ElementwiseDataWrapper<ET, VecSize, T> data,
+                                 Functor func, int start, int remain) {
   T ins[ET];
   T out;
 
@@ -146,12 +146,11 @@ __global__ void VectorizedKernel(const T *__restrict__ in0,
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   int remain = size - VecSize * tid;
   remain = remain > 0 ? remain : 0;
+  auto data = ElementwiseDataWrapper<ET, VecSize, T>(out, in0, in1);
   if (remain >= VecSize) {
-    auto data = ElementwiseDataWrapper<ET, VecSize, T>(out, in0, in1);
-    VectorizedKernelImpl(data, size, func, tid);
+    VectorizedKernelImpl(data, func, tid);
   } else {
-    auto data = ElementwiseDataWrapper<ET, 1, T>(out, in0, in1);
-    ScalarKernelImpl(data, size, func, tid * VecSize, remain);
+    ScalarKernelImpl(data, func, tid * VecSize, remain);
   }
 }
 
@@ -162,7 +161,7 @@ __global__ void ScalarKernel(const T *__restrict__ in0,
   auto data = ElementwiseDataWrapper<ET, 1, T>(out, in0, in1);
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   int remain = tid < size ? 1 : 0;
-  ScalarKernelImpl(data, size, func, tid, remain);
+  ScalarKernelImpl(data, func, tid, remain);
 }
 
 template <ElementwiseType ET, typename T, typename Functor>

From fef72c268621feb145ff3600aea63e840516948d Mon Sep 17 00:00:00 2001
From: ZzSean <18818272991@163.com>
Date: Wed, 21 Apr 2021 12:07:21 +0000
Subject: [PATCH 2/2] change include files

---
 .../operators/elementwise/elementwise_op_impl.cu.h | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h
index 0869001d50b7f..321826ec647c9 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h
@@ -13,7 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 
-#include "paddle/fluid/operators/elementwise/elementwise_op.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/float16.h"
+
+#ifdef __HIPCC__
+#define ELEMENTWISE_BLOCK_SIZE 256
+#else
+#define ELEMENTWISE_BLOCK_SIZE 512
+#endif
 
 namespace paddle {
 namespace operators {
@@ -172,7 +181,7 @@ void LaunchElementwiseCudaKernel(
   // calculate the max vec_size for all ins and outs
   auto size = ins[0]->numel();
   int vec_size = GetVectorizedSize<T>(ins, *outs);
-  int block_size = PADDLE_CUDA_THREAD_SIZE;
+  int block_size = ELEMENTWISE_BLOCK_SIZE;
   int grid_size =
       ((size + vec_size - 1) / vec_size + block_size - 1) / block_size;
   const T *in0 = ins[0]->data<T>();
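
Illustrative note, not part of either patch: the series above refines a vectorized
elementwise pattern, an alignas'd vector type for wide loads and stores, a
vectorized fast path, a scalar tail for the final partial vector, and a block
size of 256 under HIP versus 512 under CUDA. The self-contained CUDA sketch
below reproduces that pattern under those assumptions; every name in it
(AlignedVec, AddFunctor, VecBinaryKernel, BLOCK_SIZE) is a hypothetical
stand-in, not Paddle's API.

// Illustrative sketch only; hypothetical names throughout.
#include <cstdio>

#include <cuda_runtime.h>

#define BLOCK_SIZE 512  // the patches use 256 under __HIPCC__, 512 otherwise

// Aligned vector type, same idea as CudaAlignedVector in the patch: alignas
// lets the compiler emit one wide (e.g. 128-bit) load/store for val[].
template <typename T, int Size>
struct alignas(sizeof(T) * Size) AlignedVec {
  T val[Size];
};

// Functor with the pointer-based __device__ signature PATCH 1 introduces.
template <typename T>
struct AddFunctor {
  __device__ __forceinline__ T operator()(const T *args) const {
    return args[0] + args[1];
  }
};

template <int VecSize, typename T, typename Functor>
__global__ void VecBinaryKernel(const T *__restrict__ in0,
                                const T *__restrict__ in1, T *out, int size,
                                Functor func) {
  using VecType = AlignedVec<T, VecSize>;
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  int remain = size - VecSize * tid;  // elements left for this thread
  if (remain >= VecSize) {
    // Vectorized path: one wide load per input, one wide store.
    VecType v0 = reinterpret_cast<const VecType *>(in0)[tid];
    VecType v1 = reinterpret_cast<const VecType *>(in1)[tid];
    VecType vout;
#pragma unroll
    for (int i = 0; i < VecSize; ++i) {
      T args[2] = {v0.val[i], v1.val[i]};
      vout.val[i] = func(args);
    }
    reinterpret_cast<VecType *>(out)[tid] = vout;
  } else if (remain > 0) {
    // Scalar tail: the last active thread finishes element by element.
    for (int idx = VecSize * tid; idx < size; ++idx) {
      T args[2] = {in0[idx], in1[idx]};
      out[idx] = func(args);
    }
  }
}

int main() {
  constexpr int kVecSize = 4;
  const int n = 1003;  // deliberately not a multiple of kVecSize
  float *a, *b, *c;
  cudaMallocManaged(&a, n * sizeof(float));
  cudaMallocManaged(&b, n * sizeof(float));
  cudaMallocManaged(&c, n * sizeof(float));
  for (int i = 0; i < n; ++i) {
    a[i] = 1.0f;
    b[i] = static_cast<float>(i);
  }
  // Same grid arithmetic as LaunchElementwiseCudaKernel after PATCH 2:
  // ceil(n / VecSize) threads total, BLOCK_SIZE threads per block.
  int grid = ((n + kVecSize - 1) / kVecSize + BLOCK_SIZE - 1) / BLOCK_SIZE;
  VecBinaryKernel<kVecSize>
      <<<grid, BLOCK_SIZE>>>(a, b, c, n, AddFunctor<float>());
  cudaDeviceSynchronize();
  printf("c[%d] = %f (expect %f)\n", n - 1, c[n - 1], 1.0f + (n - 1));
  cudaFree(a);
  cudaFree(b);
  cudaFree(c);
  return 0;
}

The sketch also mirrors the structural choice PATCH 1 makes: the input/output
access setup is hoisted above the vectorized/scalar branch (as the single
ElementwiseDataWrapper construction is in VectorizedKernel), so both paths
share one set of pointers and only the access width differs.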