Speedup FP16 Gelu op using fast math and vectorized 8 kernel #38980

Merged 2 commits on Jan 18, 2022
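The kernels added below load FP16 values eight at a time through platform::AlignedVector<__half, 8>, do the GELU math in FP32, and use the tanh.approx.f32 PTX instruction on sm_75+ (non-Windows) when FLAGS_use_fast_math is enabled. For reference, the constants in FP32GeluFwd/FP32GeluBwd are 0.79788456 = sqrt(2/pi) and 0.1070322243 = 3 * 0.044715 * sqrt(2/pi); the NumPy sketch below restates the same forward/backward math and is an illustration only, not part of the diff.

# NumPy reference for the tanh-approximate GELU computed by the new kernels.
# Illustration only; not part of this PR.
import numpy as np

SQRT_2_OVER_PI = 0.79788456  # sqrt(2 / pi), same constant as the kernels

def gelu_tanh_fwd(x):
    # y = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
    t = np.tanh(SQRT_2_OVER_PI * x * (1.0 + 0.044715 * x * x))
    return 0.5 * x * (1.0 + t)

def gelu_tanh_bwd(x, y_grad):
    # dy/dx = 0.5 * (1 + tanh) + 0.5 * x * (1 - tanh^2) * (sqrt(2/pi) + 0.1070322243 * x^2),
    # where 0.1070322243 = 3 * 0.044715 * sqrt(2/pi).
    t = np.tanh(SQRT_2_OVER_PI * x * (1.0 + 0.044715 * x * x))
    dydx = 0.5 * x * (1.0 - t * t) * (SQRT_2_OVER_PI + 0.1070322243 * x * x) + 0.5 * (1.0 + t)
    return dydx * y_grad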
171 changes: 171 additions & 0 deletions paddle/fluid/operators/gelu_op.cu
@@ -16,9 +16,156 @@ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
#include "paddle/fluid/operators/gelu_op.h"

DECLARE_bool(use_fast_math);

namespace paddle {
namespace operators {

#ifdef __NVCC__
template <bool FastMode>
static __device__ __forceinline__ float FP32FastTanh(float x) {
#if __CUDA_ARCH__ >= 750 && !defined(_WIN32)
if (FastMode) {
float y;
asm("tanh.approx.f32 %0,%1; \n\t" : "=f"(y) : "f"(x));
return y;
}
#endif
return tanhf(x);
}

template <bool FastMode>
static __device__ __forceinline__ float FP32GeluFwd(float x) {
auto tanh_out =
FP32FastTanh<FastMode>(0.79788456f * x * (1.0f + 0.044715f * x * x));
return x * 0.5f * (1.0f + tanh_out);
}

template <bool FastMode>
static __device__ __forceinline__ float FP32GeluBwd(float x, float y_g) {
auto tanh_out =
FP32FastTanh<FastMode>(0.79788456f * x * (1.0f + 0.044715f * x * x));
auto tmp = 0.5f * x * ((1.0f - tanh_out * tanh_out) *
(0.79788456f + 0.1070322243f * x * x)) +
0.5f * (1.0f + tanh_out);
return tmp * y_g;
}

template <int VecSize, bool FastMode>
static __global__ void FP16FastGeluFwdCUDAKernel(const __half* x, __half* y,
size_t n) {
size_t offset =
static_cast<size_t>(threadIdx.x + blockIdx.x * blockDim.x) * VecSize;
size_t stride = static_cast<size_t>(blockDim.x * gridDim.x) * VecSize;
for (; offset < n; offset += stride) {
using ArrT = platform::AlignedVector<__half, VecSize>;
ArrT in_arr = *reinterpret_cast<const ArrT*>(x + offset);
#pragma unroll
for (int i = 0; i < VecSize; ++i) {
float tmp = __half2float(in_arr[i]);
in_arr[i] = __float2half(FP32GeluFwd<FastMode>(tmp));
}
*reinterpret_cast<ArrT*>(y + offset) = in_arr;
}
}

template <int VecSize, bool FastMode>
static __global__ void FP16FastGeluBwdCUDAKernel(const __half* x,
const __half* y_g, __half* x_g,
size_t n) {
size_t offset =
static_cast<size_t>(threadIdx.x + blockIdx.x * blockDim.x) * VecSize;
size_t stride = static_cast<size_t>(blockDim.x * gridDim.x) * VecSize;
for (; offset < n; offset += stride) {
using ArrT = platform::AlignedVector<__half, VecSize>;
ArrT x_in_arr = *reinterpret_cast<const ArrT*>(x + offset);
ArrT y_g_in_arr = *reinterpret_cast<const ArrT*>(y_g + offset);
#pragma unroll
for (int i = 0; i < VecSize; ++i) {
__half2 tmp_fp16_2;
tmp_fp16_2.x = x_in_arr[i];
tmp_fp16_2.y = y_g_in_arr[i];
float2 tmp_fp32_2 = __half22float2(tmp_fp16_2);
x_in_arr[i] =
__float2half(FP32GeluBwd<FastMode>(tmp_fp32_2.x, tmp_fp32_2.y));
}
*reinterpret_cast<ArrT*>(x_g + offset) = x_in_arr;
}
}

static bool TryLaunchFP16FastGeluFwdVectorizeCUDAKernel(
const platform::CUDADeviceContext& dev_ctx, const __half* x, __half* y,
size_t n) {
auto is_aligned = [](const void* p, size_t alignment) {
return reinterpret_cast<uintptr_t>(p) % alignment == 0;
};

#define PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(__vec_size, __use_fast_math) \
do { \
constexpr auto kAlignment = \
alignof(platform::AlignedVector<__half, __vec_size>); \
if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \
is_aligned(y, kAlignment)) { \
size_t thread = std::min<size_t>(512, dev_ctx.GetMaxThreadsPerBlock()); \
size_t block = (n / __vec_size + thread - 1) / thread; \
block = std::min<size_t>(block, dev_ctx.GetCUDAMaxGridDimSize().x); \
VLOG(10) << "Use FP16 fast gelu fwd kernel, block = " << block \
<< " , thread = " << thread; \
FP16FastGeluFwdCUDAKernel< \
__vec_size, \
__use_fast_math><<<block, thread, 0, dev_ctx.stream()>>>(x, y, n); \
return true; \
} \
} while (0)

if (FLAGS_use_fast_math) {
PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(8, true);
} else {
PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(8, false);
}

#undef PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL
return false;
}

static bool TryLaunchFP16FastGeluBwdVectorizeCUDAKernel(
const platform::CUDADeviceContext& dev_ctx, const __half* x,
const __half* y_g, __half* x_g, size_t n) {
auto is_aligned = [](const void* p, size_t alignment) {
return reinterpret_cast<uintptr_t>(p) % alignment == 0;
};

#define PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(__vec_size, __use_fast_math) \
do { \
constexpr auto kAlignment = \
alignof(platform::AlignedVector<__half, __vec_size>); \
if (n % __vec_size == 0 && is_aligned(x, kAlignment) &&                   \
    is_aligned(y_g, kAlignment) && is_aligned(x_g, kAlignment)) {         \
size_t thread = std::min<size_t>(512, dev_ctx.GetMaxThreadsPerBlock()); \
size_t block = (n / __vec_size + thread - 1) / thread; \
block = std::min<size_t>(block, dev_ctx.GetCUDAMaxGridDimSize().x); \
VLOG(10) << "Use FP16 fast gelu bwd kernel, block = " << block \
<< " , thread = " << thread; \
FP16FastGeluBwdCUDAKernel< \
__vec_size, \
__use_fast_math><<<block, thread, 0, dev_ctx.stream()>>>(x, y_g, \
x_g, n); \
return true; \
} \
} while (0)

if (FLAGS_use_fast_math) {
PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(8, true);
} else {
PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(8, false);
}

#undef PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL
return false;
}
#endif

template <typename T>
struct GeluWithApproximateFunctor {
using MPType = typename details::MPTypeTrait<T>::Type;
@@ -59,7 +206,19 @@ class GeluKernel<platform::CUDADeviceContext, T>
std::vector<framework::Tensor*> outs = {out};
const auto& dev_ctx =
context.template device_context<platform::CUDADeviceContext>();

if (approximate) {
#ifdef __NVCC__
if (std::is_same<T, platform::float16>::value) {
size_t n = in->numel();
const auto* in_ptr = reinterpret_cast<const __half*>(in->data<T>());
auto* out_ptr = reinterpret_cast<__half*>(out->data<T>());
if (TryLaunchFP16FastGeluFwdVectorizeCUDAKernel(dev_ctx, in_ptr,
out_ptr, n)) {
return;
}
}
#endif
LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
dev_ctx, ins, &outs, 0, GeluWithApproximateFunctor<T>());
} else {
@@ -120,6 +279,18 @@ class GeluGradKernel<platform::CUDADeviceContext, T>
const auto& dev_ctx =
context.template device_context<platform::CUDADeviceContext>();
if (approximate) {
#ifdef __NVCC__
if (std::is_same<T, platform::float16>::value) {
size_t n = x->numel();
const auto* x_ptr = reinterpret_cast<const __half*>(x->data<T>());
const auto* y_g_ptr = reinterpret_cast<const __half*>(dout->data<T>());
auto* x_g_ptr = reinterpret_cast<__half*>(dx->data<T>());
if (TryLaunchFP16FastGeluBwdVectorizeCUDAKernel(dev_ctx, x_ptr, y_g_ptr,
x_g_ptr, n)) {
return;
}
}
#endif
LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
dev_ctx, ins, &outs, 0, GeluWithApproximateGradFunctor<T>());
} else {
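The vectorized kernels above run only when the element count is a multiple of 8 and the __half pointers satisfy the AlignedVector<__half, 8> alignment; otherwise TryLaunchFP16FastGeluFwd/BwdVectorizeCUDAKernel returns false and the existing elementwise kernel is used. The Python sketch below exercises both paths; pointer alignment is assumed to be provided by the allocator, and running with GLOG_v=10 should print the "Use FP16 fast gelu ..." VLOG message when the fast path is taken.

# Sketch: trigger the vectorized path (numel divisible by 8) and the
# elementwise fallback (numel not divisible by 8) on a CUDA build.
import numpy as np
import paddle
import paddle.nn.functional as F

def run(shape):
    x = paddle.to_tensor(np.random.uniform(-1, 1, shape).astype(np.float16))
    x.stop_gradient = False
    y = F.gelu(x, approximate=True)  # forward: FP16FastGeluFwdCUDAKernel if eligible
    x_grad = paddle.grad([y], [x], [paddle.ones_like(y)])[0]  # backward kernel
    return y.numpy(), x_grad.numpy()

run([11, 17, 8])  # 1496 elements, divisible by 8 -> vectorized kernels
run([11, 17, 7])  # 1309 elements, not divisible by 8 -> generic fallback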
3 changes: 3 additions & 0 deletions paddle/fluid/platform/flags.cc
@@ -652,6 +652,9 @@ PADDLE_DEFINE_EXPORTED_bool(
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_DEFINE_EXPORTED_bool(conv2d_disable_cudnn, false,
"Disable cudnn in conv2d");

PADDLE_DEFINE_EXPORTED_bool(use_fast_math, false,
"Whether to use fast math GPU functions.");
#endif

/**
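FLAGS_use_fast_math defaults to false, so the tanh.approx.f32 path is opt-in. It can be toggled at runtime from Python, as the unit test below does; setting an environment variable of the same name before startup should also work, as with other exported Paddle flags. A minimal sketch (paddle.get_flags is assumed to be available in this Paddle version):

# Sketch: opt in to the fast-math tanh approximation at runtime.
import paddle

paddle.set_flags({'FLAGS_use_fast_math': True})   # enable tanh.approx.f32 on sm_75+
print(paddle.get_flags(['FLAGS_use_fast_math']))  # expected: {'FLAGS_use_fast_math': True}
paddle.set_flags({'FLAGS_use_fast_math': False})  # back to the accurate tanhf path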
32 changes: 32 additions & 0 deletions python/paddle/fluid/tests/unittests/test_gelu_op.py
@@ -19,6 +19,8 @@
from scipy.special import erf
import paddle.fluid as fluid
import paddle.fluid.dygraph as dg
import paddle
import paddle.nn.functional as F


def gelu(x, approximate):
@@ -59,6 +61,36 @@ def test_cases(self):
if fluid.is_compiled_with_cuda():
self._test_case1_gpu(approximate)

def test_fast_math(self):
if not paddle.is_compiled_with_cuda():
return

def use_fast_math(enabled):
paddle.set_flags({'FLAGS_use_fast_math': enabled})

shape = [11, 17, 8]
x_np = np.random.uniform(-1, 1, size=shape).astype(np.float16)
y_g_np = np.random.uniform(-1, 1, size=shape).astype(np.float16)

def run_gelu_op(approximate):
with dg.guard():
x = paddle.to_tensor(x_np)
x.stop_gradient = False
y = F.gelu(x, approximate=approximate)
x_grad = paddle.grad([y], [x], [paddle.to_tensor(y_g_np)])[0]
return y.numpy(), x_grad.numpy()

use_fast_math(True)
y_fast_math, x_g_fast_math = run_gelu_op(True)
use_fast_math(False)

y_ref, x_g_ref = run_gelu_op(True)
self.assertTrue(np.allclose(y_ref, y_fast_math, rtol=1e-5, atol=5e-4))

self.assertTrue(
np.allclose(
x_g_ref, x_g_fast_math, rtol=1e-5, atol=5e-4))


if __name__ == '__main__':
unittest.main()
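A rough way to measure the effect of the fast-math path on a CUDA build is sketched below; it is not part of the test, the shape and iteration counts are arbitrary, and paddle.device.cuda.synchronize() is assumed to be available.

# Rough timing sketch for FP16 approximate GELU with and without fast math.
# Results depend on the GPU; the fast path needs sm_75+ and a non-Windows build.
import time
import numpy as np
import paddle
import paddle.nn.functional as F

x = paddle.to_tensor(np.random.uniform(-1, 1, [4096, 4096]).astype(np.float16))

def bench(use_fast_math, iters=100):
    paddle.set_flags({'FLAGS_use_fast_math': use_fast_math})
    for _ in range(10):  # warm-up
        F.gelu(x, approximate=True)
    paddle.device.cuda.synchronize()
    start = time.time()
    for _ in range(iters):
        F.gelu(x, approximate=True)
    paddle.device.cuda.synchronize()
    return (time.time() - start) / iters

print('fast math:', bench(True))
print('default  :', bench(False))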