From 5b67bf95b4b7e23a91ffb3b2e963751c1fbe3239 Mon Sep 17 00:00:00 2001
From: zhoujianqian <15205085056@163.com>
Date: Thu, 16 Dec 2021 04:31:08 +0000
Subject: [PATCH 1/3] relu forward opt

---
 paddle/fluid/operators/gelu_op.cu | 40 ++++++++++++++++++++
 paddle/fluid/operators/gelu_op.h  | 63 +++++++++++++++++++++++++++----
 2 files changed, 96 insertions(+), 7 deletions(-)

diff --git a/paddle/fluid/operators/gelu_op.cu b/paddle/fluid/operators/gelu_op.cu
index 5bb2fd247934b4..993f96e382bacd 100644
--- a/paddle/fluid/operators/gelu_op.cu
+++ b/paddle/fluid/operators/gelu_op.cu
@@ -12,9 +12,49 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
 #include "paddle/fluid/operators/gelu_op.h"
+#include "paddle/fluid/operators/math/functors.h"
 #include "paddle/fluid/platform/float16.h"
 
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct GeluXFunctor {
+  using MT = typename details::MPTypeTrait<T>::Type;
+  inline HOSTDEVICE T operator()(T x) {
+    MT mx = static_cast<MT>(x);
+    MT temp = erf(mx * static_cast<MT>(M_SQRT1_2));
+    MT out = mx * static_cast<MT>(0.5) * (static_cast<MT>(1) + temp);
+    return static_cast<T>(out);
+  }
+};
+
+template <typename DeviceContext, typename T>
+typename std::enable_if<
+    std::is_same<DeviceContext, platform::CUDADeviceContext>::value>::type
+default_gelu_fw(const framework::ExecutionContext& ctx,
+                const framework::Tensor* in, const bool approximate,
+                framework::Tensor* out) {
+  std::vector<const framework::Tensor*> ins;
+  std::vector<framework::Tensor*> outs;
+  ins = {in};
+  outs = {out};
+  const auto& dev_ctx =
+      ctx.template device_context<platform::CUDADeviceContext>();
+  if (approximate) {
+    LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(
+        dev_ctx, ins, &outs, 0, paddle::operators::math::GeluFunctor<T>());
+  } else {
+    LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(
+        dev_ctx, ins, &outs, 0, GeluXFunctor<T>());
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle
+
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
     gelu, ops::GeluKernel<paddle::platform::CUDADeviceContext, float>,
diff --git a/paddle/fluid/operators/gelu_op.h b/paddle/fluid/operators/gelu_op.h
index 0446d7d284b223..4699633924148f 100644
--- a/paddle/fluid/operators/gelu_op.h
+++ b/paddle/fluid/operators/gelu_op.h
@@ -184,6 +184,31 @@ struct GeluGradFunctor {
   }
 };
 
+template <typename DeviceContext, typename T>
+typename std::enable_if<
+    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
+default_gelu_fw(const framework::ExecutionContext& ctx,
+                const framework::Tensor* in, const bool approximate,
+                framework::Tensor* out) {
+  auto eigen_out = framework::EigenVector<T>::Flatten(*out);
+  auto eigen_in = framework::EigenVector<T>::Flatten(*in);
+
+  auto& place =
+      *ctx.template device_context<DeviceContext>().eigen_device();
+
+  GeluFunctor<T> functor;
+  functor(place, eigen_in, eigen_out, approximate);
+}
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+template <typename DeviceContext, typename T>
+typename std::enable_if<
+    std::is_same<DeviceContext, platform::CUDADeviceContext>::value>::type
+default_gelu_fw(const framework::ExecutionContext& ctx,
+                const framework::Tensor* in, const bool approximate,
+                framework::Tensor* out);
+#endif
+
 template <typename DeviceContext, typename T>
 class GeluKernel : public framework::OpKernel<T> {
  public:
@@ -193,16 +218,40 @@ class GeluKernel : public framework::OpKernel<T> {
     auto approximate = context.Attr<bool>("approximate");
     out->mutable_data<T>(in->place());
 
-    auto eigen_out = framework::EigenVector<T>::Flatten(*out);
-    auto eigen_in = framework::EigenVector<T>::Flatten(*in);
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-
-    GeluFunctor<T> functor;
-    functor(place, eigen_in, eigen_out, approximate);
+    default_gelu_fw<DeviceContext, T>(context, in, approximate, out);
   }
 };
 
+// template <typename DeviceContext, typename T>
+// typename std::enable_if<
+//     std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
+// default_gelu_bw(const framework::ExecutionContext& ctx,
+//                 const framework::Tensor* in,
+//                 const framework::Tensor* dout,
+//                 const bool approximate,
+//                 framework::Tensor* dx
+// ){
+//   auto eigen_x = framework::EigenVector<T>::Flatten(*in);
+//   auto eigen_dout = framework::EigenVector<T>::Flatten(*dout);
+//   auto eigen_dx = framework::EigenVector<T>::Flatten(*dx);
+//   auto& place =
+//       *ctx.template device_context<DeviceContext>().eigen_device();
+
+//   GeluGradFunctor<T> functor;
+//   functor(place, eigen_x, eigen_dout, eigen_dx, approximate);
+// }
+
+// #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+// template <typename DeviceContext, typename T>
+// typename std::enable_if<
+//     std::is_same<DeviceContext, platform::CUDADeviceContext>::value>::type
+// default_gelu_bw(const framework::ExecutionContext& ctx,
+//                 const framework::Tensor* in,
+//                 const framework::Tensor* dout,
+//                 const bool approximate,
+//                 framework::Tensor* dx);
+// #endif
+
 template <typename DeviceContext, typename T>
 class GeluGradKernel : public framework::OpKernel<T> {
  public:

From 875e6ac4b08951f086526f0b656f809fac95e507 Mon Sep 17 00:00:00 2001
From: zhoujianqian <15205085056@163.com>
Date: Thu, 16 Dec 2021 12:57:18 +0000
Subject: [PATCH 2/3] add gelu functor

---
 paddle/fluid/operators/gelu_op.cu | 30 ++++++++++++++++++++--------
 paddle/fluid/operators/gelu_op.h  | 30 ------------------------------
 2 files changed, 22 insertions(+), 38 deletions(-)

diff --git a/paddle/fluid/operators/gelu_op.cu b/paddle/fluid/operators/gelu_op.cu
index 993f96e382bacd..1d57dfb4dd1d97 100644
--- a/paddle/fluid/operators/gelu_op.cu
+++ b/paddle/fluid/operators/gelu_op.cu
@@ -12,18 +12,34 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/operators/amp/fp16_type_traits.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
 #include "paddle/fluid/operators/gelu_op.h"
-#include "paddle/fluid/operators/math/functors.h"
 #include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace operators {
 
 template <typename T>
-struct GeluXFunctor {
+struct GeluWithApproximateFunctor {
+  using MT = typename details::MPTypeTrait<T>::Type;
+  inline HOSTDEVICE T operator()(T x) {
+    // this function is tanh approximation of gelu
+    MT mx = static_cast<MT>(x);
+    MT out = mx * static_cast<MT>(0.5) *
+             (static_cast<MT>(1.0) +
+              tanh(static_cast<MT>(0.79788456) * mx *
+                   (static_cast<MT>(1) + static_cast<MT>(0.044715) * mx * mx)));
+    return static_cast<T>(out);
+  }
+};
+
+template <typename T>
+struct GeluNoApproximateFunctor {
   using MT = typename details::MPTypeTrait<T>::Type;
   inline HOSTDEVICE T operator()(T x) {
+    // actual gelu with approximation=false
+    // x * 0.5 * (1.0 + erf(x * 0.70710678))
     MT mx = static_cast<MT>(x);
     MT temp = erf(mx * static_cast<MT>(M_SQRT1_2));
     MT out = mx * static_cast<MT>(0.5) * (static_cast<MT>(1) + temp);
@@ -37,18 +53,16 @@ typename std::enable_if<
 default_gelu_fw(const framework::ExecutionContext& ctx,
                 const framework::Tensor* in, const bool approximate,
                 framework::Tensor* out) {
-  std::vector<const framework::Tensor*> ins;
-  std::vector<framework::Tensor*> outs;
-  ins = {in};
-  outs = {out};
+  std::vector<const framework::Tensor*> ins = {in};
+  std::vector<framework::Tensor*> outs = {out};
   const auto& dev_ctx =
       ctx.template device_context<platform::CUDADeviceContext>();
   if (approximate) {
     LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(
-        dev_ctx, ins, &outs, 0, paddle::operators::math::GeluFunctor<T>());
+        dev_ctx, ins, &outs, 0, GeluWithApproximateFunctor<T>());
   } else {
     LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(
-        dev_ctx, ins, &outs, 0, GeluXFunctor<T>());
+        dev_ctx, ins, &outs, 0, GeluNoApproximateFunctor<T>());
   }
 }
 
diff --git a/paddle/fluid/operators/gelu_op.h b/paddle/fluid/operators/gelu_op.h
index 4699633924148f..94cf1ab6ef660c 100644
--- a/paddle/fluid/operators/gelu_op.h
+++ b/paddle/fluid/operators/gelu_op.h
@@ -222,36 +222,6 @@ class GeluKernel : public framework::OpKernel<T> {
   }
 };
 
-// template <typename DeviceContext, typename T>
-// typename std::enable_if<
-//     std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
-// default_gelu_bw(const framework::ExecutionContext& ctx,
-//                 const framework::Tensor* in,
-//                 const framework::Tensor* dout,
-//                 const bool approximate,
-//                 framework::Tensor* dx
-// ){
-//   auto eigen_x = framework::EigenVector<T>::Flatten(*in);
-//   auto eigen_dout = framework::EigenVector<T>::Flatten(*dout);
-//   auto eigen_dx = framework::EigenVector<T>::Flatten(*dx);
-//   auto& place =
-//       *ctx.template device_context<DeviceContext>().eigen_device();
-
-//   GeluGradFunctor<T> functor;
-//   functor(place, eigen_x, eigen_dout, eigen_dx, approximate);
-// }
-
-// #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-// template <typename DeviceContext, typename T>
-// typename std::enable_if<
-//     std::is_same<DeviceContext, platform::CUDADeviceContext>::value>::type
-// default_gelu_bw(const framework::ExecutionContext& ctx,
-//                 const framework::Tensor* in,
-//                 const framework::Tensor* dout,
-//                 const bool approximate,
-//                 framework::Tensor* dx);
-// #endif
-
 template <typename DeviceContext, typename T>
 class GeluGradKernel : public framework::OpKernel<T> {
  public:

From 141c05abdca29b54a2e052a46cf846cd8edd8b8e Mon Sep 17 00:00:00 2001
From: zhoujianqian <15205085056@163.com>
Date: Mon, 20 Dec 2021 11:50:40 +0000
Subject: [PATCH 3/3] optimize code

---
 paddle/fluid/operators/gelu_op.cu | 69 +++++++++++++++++--------------
 paddle/fluid/operators/gelu_op.h  | 33 ++++-----------
 2 files changed, 44 insertions(+), 58 deletions(-)

diff --git a/paddle/fluid/operators/gelu_op.cu b/paddle/fluid/operators/gelu_op.cu
index 1d57dfb4dd1d97..6a4a322b327a0d 100644
--- a/paddle/fluid/operators/gelu_op.cu
+++ b/paddle/fluid/operators/gelu_op.cu
@@ -22,49 +22,54 @@ namespace operators {
 
 template <typename T>
 struct GeluWithApproximateFunctor {
-  using MT = typename details::MPTypeTrait<T>::Type;
-  inline HOSTDEVICE T operator()(T x) {
+  using MPType = typename details::MPTypeTrait<T>::Type;
+  inline HOSTDEVICE T operator()(T arg_x) {
     // this function is tanh approximation of gelu
-    MT mx = static_cast<MT>(x);
-    MT out = mx * static_cast<MT>(0.5) *
-             (static_cast<MT>(1.0) +
-              tanh(static_cast<MT>(0.79788456) * mx *
-                   (static_cast<MT>(1) + static_cast<MT>(0.044715) * mx * mx)));
+    MPType x = static_cast<MPType>(arg_x);
+    MPType one = static_cast<MPType>(1);
+    MPType out = x * static_cast<MPType>(0.5) *
+                 (one + tanh(static_cast<MPType>(0.79788456) * x *
+                             (one + static_cast<MPType>(0.044715) * x * x)));
     return static_cast<T>(out);
   }
 };
 
 template <typename T>
-struct GeluNoApproximateFunctor {
-  using MT = typename details::MPTypeTrait<T>::Type;
-  inline HOSTDEVICE T operator()(T x) {
-    // actual gelu with approximation=false
-    // x * 0.5 * (1.0 + erf(x * 0.70710678))
-    MT mx = static_cast<MT>(x);
-    MT temp = erf(mx * static_cast<MT>(M_SQRT1_2));
-    MT out = mx * static_cast<MT>(0.5) * (static_cast<MT>(1) + temp);
+struct GeluWithoutApproximateFunctor {
+  using MPType = typename details::MPTypeTrait<T>::Type;
+  inline HOSTDEVICE T operator()(T arg_x) {
+    // actual gelu with approximation = false
+    MPType x = static_cast<MPType>(arg_x);
+    MPType erf_out = erf(x * static_cast<MPType>(M_SQRT1_2));
+    MPType out =
+        x * static_cast<MPType>(0.5) * (static_cast<MPType>(1) + erf_out);
     return static_cast<T>(out);
   }
 };
 
-template <typename DeviceContext, typename T>
-typename std::enable_if<
-    std::is_same<DeviceContext, platform::CUDADeviceContext>::value>::type
-default_gelu_fw(const framework::ExecutionContext& ctx,
-                const framework::Tensor* in, const bool approximate,
-                framework::Tensor* out) {
-  std::vector<const framework::Tensor*> ins = {in};
-  std::vector<framework::Tensor*> outs = {out};
-  const auto& dev_ctx =
-      ctx.template device_context<platform::CUDADeviceContext>();
-  if (approximate) {
-    LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(
-        dev_ctx, ins, &outs, 0, GeluWithApproximateFunctor<T>());
-  } else {
-    LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(
-        dev_ctx, ins, &outs, 0, GeluNoApproximateFunctor<T>());
+template <typename T>
+class GeluKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* out = context.Output<framework::Tensor>("Out");
+    auto* in = context.Input<framework::Tensor>("X");
+    auto approximate = context.Attr<bool>("approximate");
+    out->mutable_data<T>(in->place());
+
+    std::vector<const framework::Tensor*> ins = {in};
+    std::vector<framework::Tensor*> outs = {out};
+    const auto& dev_ctx =
+        context.template device_context<platform::CUDADeviceContext>();
+    if (approximate) {
+      LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(
+          dev_ctx, ins, &outs, 0, GeluWithApproximateFunctor<T>());
+    } else {
+      LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(
+          dev_ctx, ins, &outs, 0, GeluWithoutApproximateFunctor<T>());
+    }
   }
-}
+};
 
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/gelu_op.h b/paddle/fluid/operators/gelu_op.h
index 94cf1ab6ef660c..0446d7d284b223 100644
--- a/paddle/fluid/operators/gelu_op.h
+++ b/paddle/fluid/operators/gelu_op.h
@@ -184,31 +184,6 @@ struct GeluGradFunctor {
   }
 };
 
-template <typename DeviceContext, typename T>
-typename std::enable_if<
-    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
-default_gelu_fw(const framework::ExecutionContext& ctx,
-                const framework::Tensor* in, const bool approximate,
-                framework::Tensor* out) {
-  auto eigen_out = framework::EigenVector<T>::Flatten(*out);
-  auto eigen_in = framework::EigenVector<T>::Flatten(*in);
-
-  auto& place =
-      *ctx.template device_context<DeviceContext>().eigen_device();
-
-  GeluFunctor<T> functor;
-  functor(place, eigen_in, eigen_out, approximate);
-}
-
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-template <typename DeviceContext, typename T>
-typename std::enable_if<
-    std::is_same<DeviceContext, platform::CUDADeviceContext>::value>::type
-default_gelu_fw(const framework::ExecutionContext& ctx,
-                const framework::Tensor* in, const bool approximate,
-                framework::Tensor* out);
-#endif
-
 template <typename DeviceContext, typename T>
 class GeluKernel : public framework::OpKernel<T> {
  public:
@@ -218,7 +193,13 @@ class GeluKernel : public framework::OpKernel<T> {
     auto approximate = context.Attr<bool>("approximate");
     out->mutable_data<T>(in->place());
 
-    default_gelu_fw<DeviceContext, T>(context, in, approximate, out);
+    auto eigen_out = framework::EigenVector<T>::Flatten(*out);
+    auto eigen_in = framework::EigenVector<T>::Flatten(*in);
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
+
+    GeluFunctor<T> functor;
+    functor(place, eigen_in, eigen_out, approximate);
   }
 };