From 5b67bf95b4b7e23a91ffb3b2e963751c1fbe3239 Mon Sep 17 00:00:00 2001
From: zhoujianqian <15205085056@163.com>
Date: Thu, 16 Dec 2021 04:31:08 +0000
Subject: [PATCH 1/3] relu forward opt

---
 paddle/fluid/operators/gelu_op.cu | 40 ++++++++++++++++++++
 paddle/fluid/operators/gelu_op.h  | 63 +++++++++++++++++++++++++++----
 2 files changed, 96 insertions(+), 7 deletions(-)

diff --git a/paddle/fluid/operators/gelu_op.cu b/paddle/fluid/operators/gelu_op.cu
index 5bb2fd247934b4..993f96e382bacd 100644
--- a/paddle/fluid/operators/gelu_op.cu
+++ b/paddle/fluid/operators/gelu_op.cu
@@ -12,9 +12,49 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
 #include "paddle/fluid/operators/gelu_op.h"
+#include "paddle/fluid/operators/math/functors.h"
 #include "paddle/fluid/platform/float16.h"
 
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct GeluXFunctor {
+  using MT = typename details::MPTypeTrait<T>::Type;
+  inline HOSTDEVICE T operator()(T x) {
+    MT mx = static_cast<MT>(x);
+    MT temp = erf(mx * static_cast<MT>(M_SQRT1_2));
+    MT out = mx * static_cast<MT>(0.5) * (static_cast<MT>(1) + temp);
+    return static_cast<T>(out);
+  }
+};
+
+template <typename DeviceContext, typename T>
+typename std::enable_if<
+    std::is_same<DeviceContext, platform::CUDADeviceContext>::value>::type
+default_gelu_fw(const framework::ExecutionContext& ctx,
+                const framework::Tensor* in, const bool approximate,
+                framework::Tensor* out) {
+  std::vector<const framework::Tensor*> ins;
+  std::vector<framework::Tensor*> outs;
+  ins = {in};
+  outs = {out};
+  const auto& dev_ctx =
+      ctx.template device_context<platform::CUDADeviceContext>();
+  if (approximate) {
+    LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(
+        dev_ctx, ins, &outs, 0, paddle::operators::math::GeluFunctor<T>());
+  } else {
+    LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(
+        dev_ctx, ins, &outs, 0, GeluXFunctor<T>());
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle
+
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
     gelu, ops::GeluKernel<paddle::platform::CUDADeviceContext, float>,
diff --git a/paddle/fluid/operators/gelu_op.h b/paddle/fluid/operators/gelu_op.h
index 0446d7d284b223..4699633924148f 100644
--- a/paddle/fluid/operators/gelu_op.h
+++ b/paddle/fluid/operators/gelu_op.h
@@ -184,6 +184,31 @@ struct GeluGradFunctor {
   }
 };
 
+template <typename DeviceContext, typename T>
+typename std::enable_if<
+    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
+default_gelu_fw(const framework::ExecutionContext& ctx,
+                const framework::Tensor* in, const bool approximate,
+                framework::Tensor* out) {
+  auto eigen_out = framework::EigenVector<T>::Flatten(*out);
+  auto eigen_in = framework::EigenVector<T>::Flatten(*in);
+
+  auto& place =
+      *ctx.template device_context<DeviceContext>().eigen_device();
+
+  GeluFunctor<T> functor;
+  functor(place, eigen_in, eigen_out, approximate);
+}
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+template <typename DeviceContext, typename T>
+typename std::enable_if<
+    std::is_same<DeviceContext, platform::CUDADeviceContext>::value>::type
+default_gelu_fw(const framework::ExecutionContext& ctx,
+                const framework::Tensor* in, const bool approximate,
+                framework::Tensor* out);
+#endif
+
 template <typename DeviceContext, typename T>
 class GeluKernel : public framework::OpKernel<T> {
  public:
@@ -193,16 +218,40 @@ class GeluKernel : public framework::OpKernel<T> {
     auto approximate = context.Attr<bool>("approximate");
     out->mutable_data<T>(in->place());
 
-    auto eigen_out = framework::EigenVector<T>::Flatten(*out);
-    auto eigen_in = framework::EigenVector<T>::Flatten(*in);
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-
-    GeluFunctor<T> functor;
-    functor(place, eigen_in, eigen_out, approximate);
+    default_gelu_fw<DeviceContext, T>(context, in, approximate, out);
   }
 };
 
+// template <typename DeviceContext, typename T>
+// typename std::enable_if<
+//     std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
+// default_gelu_bw(const framework::ExecutionContext& ctx,
+//                 const framework::Tensor* in,
+//                 const framework::Tensor* dout,
+//                 const bool approximate,
+//                 framework::Tensor* dx
+// ){
+//   auto eigen_x = framework::EigenVector<T>::Flatten(*in);
+//   auto eigen_dout = framework::EigenVector<T>::Flatten(*dout);
+//   auto eigen_dx = framework::EigenVector<T>::Flatten(*dx);
+//   auto& place =
+//       *ctx.template device_context<DeviceContext>().eigen_device();
+
+//   GeluGradFunctor<T> functor;
+//   functor(place, eigen_x, eigen_dout, eigen_dx, approximate);
+// }
+
+// #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+// template <typename DeviceContext, typename T>
+// typename std::enable_if<
+//     std::is_same<DeviceContext, platform::CUDADeviceContext>::value>::type
+// default_gelu_bw(const framework::ExecutionContext& ctx,
+//                 const framework::Tensor* in,
+//                 const framework::Tensor* dout,
+//                 const bool approximate,
+//                 framework::Tensor* dx);
+// #endif
+
 template <typename DeviceContext, typename T>
 class GeluGradKernel : public framework::OpKernel<T> {
  public:

From 875e6ac4b08951f086526f0b656f809fac95e507 Mon Sep 17 00:00:00 2001
From: zhoujianqian <15205085056@163.com>
Date: Thu, 16 Dec 2021 12:57:18 +0000
Subject: [PATCH 2/3] add gelu functor

---
 paddle/fluid/operators/gelu_op.cu | 30 ++++++++++++++++++++--------
 paddle/fluid/operators/gelu_op.h  | 30 ------------------------------
 2 files changed, 22 insertions(+), 38 deletions(-)

diff --git a/paddle/fluid/operators/gelu_op.cu b/paddle/fluid/operators/gelu_op.cu
index 993f96e382bacd..1d57dfb4dd1d97 100644
--- a/paddle/fluid/operators/gelu_op.cu
+++ b/paddle/fluid/operators/gelu_op.cu
@@ -12,18 +12,34 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/operators/amp/fp16_type_traits.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
 #include "paddle/fluid/operators/gelu_op.h"
-#include "paddle/fluid/operators/math/functors.h"
 #include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace operators {
 
 template <typename T>
-struct GeluXFunctor {
+struct GeluWithApproximateFunctor {
+  using MT = typename details::MPTypeTrait<T>::Type;
+  inline HOSTDEVICE T operator()(T x) {
+    // this function is tanh approximation of gelu
+    MT mx = static_cast<MT>(x);
+    MT out = mx * static_cast<MT>(0.5) *
+             (static_cast<MT>(1.0) +
+              tanh(static_cast<MT>(0.79788456) * mx *
+                   (static_cast<MT>(1) + static_cast<MT>(0.044715) * mx * mx)));
+    return static_cast<T>(out);
+  }
+};
+
+template <typename T>
+struct GeluNoApproximateFunctor {
   using MT = typename details::MPTypeTrait<T>::Type;
   inline HOSTDEVICE T operator()(T x) {
+    // actual gelu with approximation=false
+    // x * 0.5 * (1.0 + erf(x * 0.70710678))
     MT mx = static_cast<MT>(x);
     MT temp = erf(mx * static_cast<MT>(M_SQRT1_2));
     MT out = mx * static_cast<MT>(0.5) * (static_cast<MT>(1) + temp);
@@ -37,18 +53,16 @@ typename std::enable_if<
 default_gelu_fw(const framework::ExecutionContext& ctx,
                 const framework::Tensor* in, const bool approximate,
                 framework::Tensor* out) {
-  std::vector<const framework::Tensor*> ins;
-  std::vector<framework::Tensor*> outs;
-  ins = {in};
-  outs = {out};
+  std::vector<const framework::Tensor*> ins = {in};
+  std::vector<framework::Tensor*> outs = {out};
   const auto& dev_ctx =
       ctx.template device_context<platform::CUDADeviceContext>();
   if (approximate) {
     LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(
-        dev_ctx, ins, &outs, 0, paddle::operators::math::GeluFunctor<T>());
+        dev_ctx, ins, &outs, 0, GeluWithApproximateFunctor<T>());
   } else {
     LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(
-        dev_ctx, ins, &outs, 0, GeluXFunctor<T>());
+        dev_ctx, ins, &outs, 0, GeluNoApproximateFunctor<T>());
   }
 }
 
diff --git a/paddle/fluid/operators/gelu_op.h b/paddle/fluid/operators/gelu_op.h
index 4699633924148f..94cf1ab6ef660c 100644
--- a/paddle/fluid/operators/gelu_op.h
+++ b/paddle/fluid/operators/gelu_op.h
@@ -222,36 +222,6 @@ class GeluKernel : public framework::OpKernel<T> {
   }
 };
 
-// template <typename DeviceContext, typename T>
-// typename std::enable_if<
-//     std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
-// default_gelu_bw(const framework::ExecutionContext& ctx,
-//                 const framework::Tensor* in,
-//                 const framework::Tensor* dout,
-//                 const bool approximate,
-//                 framework::Tensor* dx
-// ){
-//   auto eigen_x = framework::EigenVector<T>::Flatten(*in);
-//   auto eigen_dout = framework::EigenVector<T>::Flatten(*dout);
-//   auto eigen_dx = framework::EigenVector<T>::Flatten(*dx);
-//   auto& place =
-//       *ctx.template device_context<DeviceContext>().eigen_device();
-
-//   GeluGradFunctor<T> functor;
-//   functor(place, eigen_x, eigen_dout, eigen_dx, approximate);
-// }
-
-// #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-// template <typename DeviceContext, typename T>
-// typename std::enable_if<
-//     std::is_same<DeviceContext, platform::CUDADeviceContext>::value>::type
-// default_gelu_bw(const framework::ExecutionContext& ctx,
-//                 const framework::Tensor* in,
-//                 const framework::Tensor* dout,
-//                 const bool approximate,
-//                 framework::Tensor* dx);
-// #endif
-
 template <typename DeviceContext, typename T>
 class GeluGradKernel : public framework::OpKernel<T> {
  public:

From 141c05abdca29b54a2e052a46cf846cd8edd8b8e Mon Sep 17 00:00:00 2001
From: zhoujianqian <15205085056@163.com>
Date: Mon, 20 Dec 2021 11:50:40 +0000
Subject: [PATCH 3/3] optimize code

---
 paddle/fluid/operators/gelu_op.cu | 69 +++++++++++++++++--------------
 paddle/fluid/operators/gelu_op.h  | 33 ++++-----------
 2 files changed, 44 insertions(+), 58 deletions(-)

diff --git a/paddle/fluid/operators/gelu_op.cu b/paddle/fluid/operators/gelu_op.cu
index 1d57dfb4dd1d97..6a4a322b327a0d 100644
--- a/paddle/fluid/operators/gelu_op.cu
+++ b/paddle/fluid/operators/gelu_op.cu
@@ -22,49 +22,54 @@ namespace operators {
 
 template <typename T>
 struct GeluWithApproximateFunctor {
-  using MT = typename details::MPTypeTrait<T>::Type;
-  inline HOSTDEVICE T operator()(T x) {
+  using MPType = typename details::MPTypeTrait<T>::Type;
+  inline HOSTDEVICE T operator()(T arg_x) {
     // this function is tanh approximation of gelu
-    MT mx = static_cast<MT>(x);
-    MT out = mx * static_cast<MT>(0.5) *
-             (static_cast<MT>(1.0) +
-              tanh(static_cast<MT>(0.79788456) * mx *
-                   (static_cast<MT>(1) + static_cast<MT>(0.044715) * mx * mx)));
+    MPType x = static_cast<MPType>(arg_x);
+    MPType one = static_cast<MPType>(1);
+    MPType out = x * static_cast<MPType>(0.5) *
+                 (one + tanh(static_cast<MPType>(0.79788456) * x *
+                             (one + static_cast<MPType>(0.044715) * x * x)));
     return static_cast<T>(out);
   }
 };
 
 template <typename T>
-struct GeluNoApproximateFunctor {
-  using MT = typename details::MPTypeTrait<T>::Type;
-  inline HOSTDEVICE T operator()(T x) {
-    // actual gelu with approximation=false
-    // x * 0.5 * (1.0 + erf(x * 0.70710678))
-    MT mx = static_cast<MT>(x);
-    MT temp = erf(mx * static_cast<MT>(M_SQRT1_2));
-    MT out = mx * static_cast<MT>(0.5) * (static_cast<MT>(1) + temp);
+struct GeluWithoutApproximateFunctor {
+  using MPType = typename details::MPTypeTrait<T>::Type;
+  inline HOSTDEVICE T operator()(T arg_x) {
+    // actual gelu with approximation = false
+    MPType x = static_cast<MPType>(arg_x);
+    MPType erf_out = erf(x * static_cast<MPType>(M_SQRT1_2));
+    MPType out =
+        x * static_cast<MPType>(0.5) * (static_cast<MPType>(1) + erf_out);
     return static_cast<T>(out);
   }
 };
 
-template <typename DeviceContext, typename T>
-typename std::enable_if<
-    std::is_same<DeviceContext, platform::CUDADeviceContext>::value>::type
-default_gelu_fw(const framework::ExecutionContext& ctx,
-                const framework::Tensor* in, const bool approximate,
-                framework::Tensor* out) {
-  std::vector<const framework::Tensor*> ins = {in};
-  std::vector<framework::Tensor*> outs = {out};
-  const auto& dev_ctx =
-      ctx.template device_context<platform::CUDADeviceContext>();
-  if (approximate) {
-    LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(
-        dev_ctx, ins, &outs, 0, GeluWithApproximateFunctor<T>());
-  } else {
-    LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(
-        dev_ctx, ins, &outs, 0, GeluNoApproximateFunctor<T>());
+template <typename T>
+class GeluKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* out = context.Output<framework::Tensor>("Out");
+    auto* in = context.Input<framework::Tensor>("X");
+    auto approximate = context.Attr<bool>("approximate");
+    out->mutable_data<T>(in->place());
+
+    std::vector<const framework::Tensor*> ins = {in};
+    std::vector<framework::Tensor*> outs = {out};
+    const auto& dev_ctx =
+        context.template device_context<platform::CUDADeviceContext>();
+    if (approximate) {
+      LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(
+          dev_ctx, ins, &outs, 0, GeluWithApproximateFunctor<T>());
+    } else {
+      LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(
+          dev_ctx, ins, &outs, 0, GeluWithoutApproximateFunctor<T>());
+    }
   }
-}
+};
 
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/gelu_op.h b/paddle/fluid/operators/gelu_op.h
index 94cf1ab6ef660c..0446d7d284b223 100644
--- a/paddle/fluid/operators/gelu_op.h
+++ b/paddle/fluid/operators/gelu_op.h
@@ -184,31 +184,6 @@ struct GeluGradFunctor {
   }
 };
 
-template <typename DeviceContext, typename T>
-typename std::enable_if<
-    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
-default_gelu_fw(const framework::ExecutionContext& ctx,
-                const framework::Tensor* in, const bool approximate,
-                framework::Tensor* out) {
-  auto eigen_out = framework::EigenVector<T>::Flatten(*out);
-  auto eigen_in = framework::EigenVector<T>::Flatten(*in);
-
-  auto& place =
-      *ctx.template device_context<DeviceContext>().eigen_device();
-
-  GeluFunctor<T> functor;
-  functor(place, eigen_in, eigen_out, approximate);
-}
-
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-template <typename DeviceContext, typename T>
-typename std::enable_if<
-    std::is_same<DeviceContext, platform::CUDADeviceContext>::value>::type
-default_gelu_fw(const framework::ExecutionContext& ctx,
-                const framework::Tensor* in, const bool approximate,
-                framework::Tensor* out);
-#endif
-
 template <typename DeviceContext, typename T>
 class GeluKernel : public framework::OpKernel<T> {
  public:
@@ -218,7 +193,13 @@ class GeluKernel : public framework::OpKernel<T> {
     auto approximate = context.Attr<bool>("approximate");
     out->mutable_data<T>(in->place());
 
-    default_gelu_fw<DeviceContext, T>(context, in, approximate, out);
+    auto eigen_out = framework::EigenVector<T>::Flatten(*out);
+    auto eigen_in = framework::EigenVector<T>::Flatten(*in);
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
+
+    GeluFunctor<T> functor;
+    functor(place, eigen_in, eigen_out, approximate);
   }
 };