From a1f15cb96e7694541d896ec819425d938669dc30 Mon Sep 17 00:00:00 2001
From: zhangting2020
Date: Mon, 20 Jun 2022 11:15:17 +0000
Subject: [PATCH 1/2] slice large tensor for cudnn_softmax

---
 paddle/phi/kernels/gpudnn/softmax_gpudnn.h | 286 ++++++++++++---------
 1 file changed, 159 insertions(+), 127 deletions(-)

diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h
index ca3574de77170..a93151fe8e653 100644
--- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h
+++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h
@@ -772,15 +772,12 @@ static std::vector<int> GetSoftmaxTensorDims(const phi::DDim& dims,

 template <typename T>
 void SoftmaxForwardCudnnKernel(const GPUContext& dev_ctx,
-                               const DenseTensor& x,
+                               const T* x_data,
                                const int axis,
+                               const int rank,
                                const bool log_mode,
-                               DenseTensor* out) {
-  auto* out_data = out->data<T>();
-
-  const int rank = x.dims().size();
-  std::vector<int> tensor_dims = GetSoftmaxTensorDims(x.dims(), axis);
-
+                               const std::vector<int>& tensor_dims,
+                               T* out_data) {
   auto handle = dev_ctx.cudnn_handle();
   GPUDNNDataLayout layout = GPUDNNDataLayout::kNCHW;

@@ -795,7 +792,7 @@ void SoftmaxForwardCudnnKernel(const GPUContext& dev_ctx,
         handle,
         paddle::platform::CudnnDataType<T>::kOne(),
         desc,
-        x.data<T>(),
+        x_data,
         paddle::platform::CudnnDataType<T>::kZero(),
         desc,
         out_data,
@@ -812,7 +809,7 @@ void SoftmaxForwardCudnnKernel(const GPUContext& dev_ctx,
         mode,
         paddle::platform::CudnnDataType<T>::kOne(),
         desc,
-        x.data<T>(),
+        x_data,
         paddle::platform::CudnnDataType<T>::kZero(),
         desc,
         out_data));
@@ -821,16 +818,13 @@ void SoftmaxForwardCudnnKernel(const GPUContext& dev_ctx,

 template <typename T>
 void SoftmaxBackwardCudnnKernel(const GPUContext& dev_ctx,
-                                const DenseTensor& out,
-                                const DenseTensor& dout,
+                                const T* out_data,
+                                const T* dout_data,
                                 const int axis,
+                                const int rank,
                                 const bool log_mode,
-                                DenseTensor* dx) {
-  auto* dx_data = dx->data<T>();
-
-  int rank = out.dims().size();
-  std::vector<int> tensor_dims = GetSoftmaxTensorDims(out.dims(), axis);
-
+                                const std::vector<int>& tensor_dims,
+                                T* dx_data) {
   auto handle = dev_ctx.cudnn_handle();
   GPUDNNDataLayout layout = GPUDNNDataLayout::kNCHW;

@@ -846,9 +840,9 @@ void SoftmaxBackwardCudnnKernel(const GPUContext& dev_ctx,
         handle,
         paddle::platform::CudnnDataType<T>::kOne(),
         desc,
-        out.data<T>(),
+        out_data,
         desc,
-        dout.data<T>(),
+        dout_data,
         paddle::platform::CudnnDataType<T>::kZero(),
         desc,
         dx_data,
@@ -865,9 +859,9 @@ void SoftmaxBackwardCudnnKernel(const GPUContext& dev_ctx,
         mode,
         paddle::platform::CudnnDataType<T>::kOne(),
         desc,
-        out.data<T>(),
+        out_data,
         desc,
-        dout.data<T>(),
+        dout_data,
         paddle::platform::CudnnDataType<T>::kZero(),
         desc,
         dx_data));
@@ -878,10 +872,12 @@
 template <>
 inline void SoftmaxForwardCudnnKernel<phi::dtype::bfloat16>(
     const GPUContext& dev_ctx,
-    const DenseTensor& x,
+    const T* x_data,
     const int axis,
+    const int rank,
     const bool log_mode,
-    DenseTensor* out) {
+    const std::vector<int>& tensor_dims,
+    T* out_data) {
   PADDLE_THROW(errors::Unavailable(
       "This kernel is not supported when the dtype is bf16 and CUDNN_VERSION < "
       "8100."));
@@ -889,11 +885,13 @@
 template <>
 inline void SoftmaxBackwardCudnnKernel<phi::dtype::bfloat16>(
     const GPUContext& dev_ctx,
-    const DenseTensor& out,
-    const DenseTensor& dout,
+    const T* out_data,
+    const T* dout_data,
     const int axis,
+    const int rank,
     const bool log_mode,
-    DenseTensor* dx) {
+    const std::vector<int>& tensor_dims,
+    T* dx_data) {
   PADDLE_THROW(errors::Unavailable(
       "This kernel is not supported when the dtype is bf16 and CUDNN_VERSION < "
       "8100."));
@@ -933,60 +931,73 @@ void SoftmaxForwardCUDAKernelDriver(const GPUContext& dev_ctx,
   int dim = tensor_dims[1];
   int D = tensor_dims[2];

-  if (D == 1 && !UseCudnnSoftmax<T>(dev_ctx, dim, true)) {
-    int dim_log2 = static_cast<int>(Log2Ceil(dim));
-    int dim_ceil = 1 << dim_log2;
-    int warp_size = (dim_ceil < 32) ? dim_ceil : 32;
-    int batches_per_warp = (dim_ceil <= 32) ? 2 : 1;
-
-    // use 128 threads per block to maximimize gpu utilization
-    constexpr int threads_per_block = 128;
-
-    int warps_per_block = (threads_per_block / warp_size);
-    int batches_per_block = warps_per_block * batches_per_warp;
-    int blocks = (N + batches_per_block - 1) / batches_per_block;
-    dim3 threads(warp_size, warps_per_block, 1);
-
-    // vectorization read/write
-    using T4 = typename VecT4<T>::Type;
-    using T2 = typename VecT2<T>::Type;
-
-    if (dim % 4 == 0) {
-      SwitchWarpSoftmaxForward<T, T4, LogMode>(blocks,
-                                               threads,
-                                               dev_ctx,
-                                               out_data,
-                                               x.data<T>(),
-                                               N,
-                                               dim,
-                                               dim,
-                                               dim_log2);
-    } else if (dim % 2 == 0) {
-      SwitchWarpSoftmaxForward<T, T2, LogMode>(blocks,
-                                               threads,
-                                               dev_ctx,
-                                               out_data,
-                                               x.data<T>(),
-                                               N,
-                                               dim,
-                                               dim,
-                                               dim_log2);
+  if (D == 1) {
+    if (!UseCudnnSoftmax<T>(dev_ctx, dim, true)) {
+      int dim_log2 = static_cast<int>(Log2Ceil(dim));
+      int dim_ceil = 1 << dim_log2;
+      int warp_size = (dim_ceil < 32) ? dim_ceil : 32;
+      int batches_per_warp = (dim_ceil <= 32) ? 2 : 1;
+
+      // use 128 threads per block to maximimize gpu utilization
+      constexpr int threads_per_block = 128;
+
+      int warps_per_block = (threads_per_block / warp_size);
+      int batches_per_block = warps_per_block * batches_per_warp;
+      int blocks = (N + batches_per_block - 1) / batches_per_block;
+      dim3 threads(warp_size, warps_per_block, 1);
+
+      // vectorization read/write
+      using T4 = typename VecT4<T>::Type;
+      using T2 = typename VecT2<T>::Type;
+
+      if (dim % 4 == 0) {
+        SwitchWarpSoftmaxForward<T, T4, LogMode>(blocks,
+                                                 threads,
+                                                 dev_ctx,
+                                                 out_data,
+                                                 x.data<T>(),
+                                                 N,
+                                                 dim,
+                                                 dim,
+                                                 dim_log2);
+      } else if (dim % 2 == 0) {
+        SwitchWarpSoftmaxForward<T, T2, LogMode>(blocks,
+                                                 threads,
+                                                 dev_ctx,
+                                                 out_data,
+                                                 x.data<T>(),
+                                                 N,
+                                                 dim,
+                                                 dim,
+                                                 dim_log2);
+      } else {
+        SwitchWarpSoftmaxForward<T, T, LogMode>(blocks,
+                                                threads,
+                                                dev_ctx,
+                                                out_data,
+                                                x.data<T>(),
+                                                N,
+                                                dim,
+                                                dim,
+                                                dim_log2);
+      }
     } else {
-      SwitchWarpSoftmaxForward<T, T, LogMode>(blocks,
-                                              threads,
-                                              dev_ctx,
-                                              out_data,
-                                              x.data<T>(),
-                                              N,
-                                              dim,
-                                              dim,
-                                              dim_log2);
+      int64_t remaining = N;
+      auto* x_data = x.data<T>();
+      int64_t batch_size = INT_MAX / dim;
+      int offset = batch_size * dim;
+      while (remaining > 0) {
+        tensor_dims[0] = std::min(remaining, batch_size);
+        SoftmaxForwardCudnnKernel<T>(
+            dev_ctx, x_data, axis, rank, LogMode, tensor_dims, out_data);
+        x_data += offset;
+        out_data += offset;
+        remaining -= batch_size;
+      }
     }
-  } else if (D > 1) {
+  } else {
     LaunchNormalSoftmaxForward<T, LogMode>(
         dev_ctx, out_data, x.data<T>(), N, dim, D);
-  } else {
-    SoftmaxForwardCudnnKernel<T>(dev_ctx, x, axis, LogMode, out);
   }
 }
@@ -1005,61 +1016,82 @@ void SoftmaxBackwardCUDAKernelDriver(const GPUContext& dev_ctx,
   int dim = tensor_dims[1];
   int D = tensor_dims[2];

-  if (D == 1 && !UseCudnnSoftmax<T>(dev_ctx, dim, true)) {
-    int dim_log2 = Log2Ceil(dim);
-    int dim_ceil = 1 << dim_log2;
-    int warp_size = (dim_ceil < 32) ? dim_ceil : 32;
-    int batches_per_warp = (dim_ceil <= 128) ? 2 : 1;
-
-    constexpr int threads_per_block = 128;
-
-    int warps_per_block = (threads_per_block / warp_size);
-    int batches_per_block = warps_per_block * batches_per_warp;
-    int blocks = (N + batches_per_block - 1) / batches_per_block;
-    dim3 threads(warp_size, warps_per_block, 1);
-
-    // vectorization read/write
-    using T4 = typename VecT4<T>::Type;
-    using T2 = typename VecT2<T>::Type;
-    if (dim % 4 == 0) {
-      SwitchWarpSoftmaxBackward<T, T4, LogMode>(blocks,
-                                                threads,
-                                                dev_ctx,
-                                                dx_data,
-                                                dout.data<T>(),
-                                                out.data<T>(),
-                                                N,
-                                                dim,
-                                                dim,
-                                                dim_log2);
-    } else if (dim % 2 == 0) {
-      SwitchWarpSoftmaxBackward<T, T2, LogMode>(blocks,
-                                                threads,
-                                                dev_ctx,
-                                                dx_data,
-                                                dout.data<T>(),
-                                                out.data<T>(),
-                                                N,
-                                                dim,
-                                                dim,
-                                                dim_log2);
+  if (D == 1) {
+    if (!UseCudnnSoftmax<T>(dev_ctx, dim, true)) {
+      int dim_log2 = Log2Ceil(dim);
+      int dim_ceil = 1 << dim_log2;
+      int warp_size = (dim_ceil < 32) ? dim_ceil : 32;
+      int batches_per_warp = (dim_ceil <= 128) ? 2 : 1;
+
+      constexpr int threads_per_block = 128;
+
+      int warps_per_block = (threads_per_block / warp_size);
+      int batches_per_block = warps_per_block * batches_per_warp;
+      int blocks = (N + batches_per_block - 1) / batches_per_block;
+      dim3 threads(warp_size, warps_per_block, 1);
+
+      // vectorization read/write
+      using T4 = typename VecT4<T>::Type;
+      using T2 = typename VecT2<T>::Type;
+      if (dim % 4 == 0) {
+        SwitchWarpSoftmaxBackward<T, T4, LogMode>(blocks,
                                                   threads,
                                                   dev_ctx,
                                                   dx_data,
                                                   dout.data<T>(),
                                                   out.data<T>(),
                                                   N,
                                                   dim,
                                                   dim,
                                                   dim_log2);
+      } else if (dim % 2 == 0) {
+        SwitchWarpSoftmaxBackward<T, T2, LogMode>(blocks,
+                                                  threads,
+                                                  dev_ctx,
+                                                  dx_data,
+                                                  dout.data<T>(),
+                                                  out.data<T>(),
+                                                  N,
+                                                  dim,
+                                                  dim,
+                                                  dim_log2);
+      } else {
+        SwitchWarpSoftmaxBackward<T, T, LogMode>(blocks,
+                                                 threads,
+                                                 dev_ctx,
+                                                 dx_data,
+                                                 dout.data<T>(),
+                                                 out.data<T>(),
+                                                 N,
+                                                 dim,
+                                                 dim,
+                                                 dim_log2);
+      }
     } else {
-      SwitchWarpSoftmaxBackward<T, T, LogMode>(blocks,
-                                               threads,
-                                               dev_ctx,
-                                               dx_data,
-                                               dout.data<T>(),
-                                               out.data<T>(),
-                                               N,
-                                               dim,
-                                               dim,
-                                               dim_log2);
+      int64_t remaining = N;
+      auto* out_data = out.data<T>();
+      auto* dout_data = dout.data<T>();
+      int64_t batch_size = INT_MAX / dim;
+      int offset = batch_size * dim;
+      while (remaining > 0) {
+        tensor_dims[0] = std::min(remaining, batch_size);
+        SoftmaxBackwardCudnnKernel<T>(dev_ctx,
+                                      out_data,
+                                      dout_data,
+                                      axis,
+                                      rank,
+                                      LogMode,
+                                      tensor_dims,
+                                      dx_data);
+        out_data += offset;
+        dout_data += offset;
+        dx_data += offset;
+        remaining -= batch_size;
+      }
     }
-  } else if (D > 1) {
+  } else {
     LaunchNormalSoftmaxBackward<T, LogMode>(
         dev_ctx, dx_data, dout.data<T>(), out.data<T>(), N, dim, D);
-  } else {
-    SoftmaxBackwardCudnnKernel<T>(dev_ctx, out, dout, axis, LogMode, dx);
   }
 }
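The core idea of patch 1/2 is the "while (remaining > 0)" loops added above: cuDNN tensor
descriptors take int extents, so when the flattened batch would overflow the int32 range the
input is processed in slices of at most INT_MAX / dim rows, with the input and output pointers
advanced by batch_size * dim elements per slice. The standalone sketch below shows only that
chunking arithmetic; softmax_2d_chunk is a toy CPU stand-in for the cuDNN launch, and all names
here are illustrative, not Paddle or cuDNN APIs.

    // Sketch: process an [n, dim] softmax in row-chunks so every call sees int32-sized extents.
    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <limits>
    #include <vector>

    // Toy CPU softmax over a [rows, dim] slab; stands in for the cuDNN call,
    // whose descriptor dimensions must fit in int.
    static void softmax_2d_chunk(const float* in, float* out, int rows, int dim) {
      for (int r = 0; r < rows; ++r) {
        const float* x = in + static_cast<int64_t>(r) * dim;
        float* y = out + static_cast<int64_t>(r) * dim;
        float mx = *std::max_element(x, x + dim);
        float sum = 0.f;
        for (int c = 0; c < dim; ++c) sum += (y[c] = std::exp(x[c] - mx));
        for (int c = 0; c < dim; ++c) y[c] /= sum;
      }
    }

    // Mirrors the patch: at most INT32_MAX / dim rows per slice, pointers advanced by
    // batch_size * dim elements, until the whole batch of n rows is consumed.
    static void sliced_softmax(const float* in, float* out, int64_t n, int dim) {
      int64_t remaining = n;
      const int64_t batch_size = std::numeric_limits<int32_t>::max() / dim;
      const int64_t offset = batch_size * dim;
      while (remaining > 0) {
        const int rows = static_cast<int>(std::min(remaining, batch_size));
        softmax_2d_chunk(in, out, rows, dim);
        in += offset;
        out += offset;
        remaining -= batch_size;
      }
    }

    int main() {
      const int64_t n = 6;  // tiny demo; slicing only matters when n * dim exceeds int32 range
      const int dim = 4;
      std::vector<float> x(n * dim, 1.0f), y(n * dim, 0.0f);
      sliced_softmax(x.data(), y.data(), n, dim);
      std::printf("y[0] = %f (expect 0.25)\n", y[0]);
      return 0;
    }

The same arithmetic appears twice in patch 1/2 (forward and backward drivers); patch 2/2 below
simply moves it into LaunchSoftmax*CudnnKernel wrappers so each driver calls a single helper.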
From a5f4a39ff60ac2b155950591b859515116f7e453 Mon Sep 17 00:00:00 2001
From: zhangting2020
Date: Mon, 20 Jun 2022 13:17:46 +0000
Subject: [PATCH 2/2] polish code

---
 paddle/phi/kernels/gpudnn/softmax_gpudnn.h | 112 +++++++++++++--------
 1 file changed, 69 insertions(+), 43 deletions(-)

diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h
index a93151fe8e653..ef3406fd7f668 100644
--- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h
+++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h
@@ -816,6 +816,31 @@ void SoftmaxForwardCudnnKernel(const GPUContext& dev_ctx,
 #endif
 }

+template <typename T>
+void LaunchSoftmaxForwardCudnnKernel(const GPUContext& dev_ctx,
+                                     const DenseTensor& x,
+                                     const int axis,
+                                     const bool log_mode,
+                                     DenseTensor* out) {
+  auto* out_data = out->data<T>();
+  auto* x_data = x.data<T>();
+  const int rank = x.dims().size();
+
+  std::vector<int> tensor_dims = GetSoftmaxTensorDims(x.dims(), axis);
+  int64_t remaining = tensor_dims[0];
+  int dim = tensor_dims[1];
+  int64_t batch_size = std::numeric_limits<int32_t>::max() / dim;
+  int offset = batch_size * dim;
+  while (remaining > 0) {
+    tensor_dims[0] = std::min(remaining, batch_size);
+    SoftmaxForwardCudnnKernel<T>(
+        dev_ctx, x_data, axis, rank, log_mode, tensor_dims, out_data);
+    x_data += offset;
+    out_data += offset;
+    remaining -= batch_size;
+  }
+}
+
 template <typename T>
 void SoftmaxBackwardCudnnKernel(const GPUContext& dev_ctx,
                                 const T* out_data,
@@ -868,30 +893,60 @@ void SoftmaxBackwardCudnnKernel(const GPUContext& dev_ctx,
 #endif
 }

+template <typename T>
+void LaunchSoftmaxBackwardCudnnKernel(const GPUContext& dev_ctx,
+                                      const DenseTensor& out,
+                                      const DenseTensor& dout,
+                                      const int axis,
+                                      const bool log_mode,
+                                      DenseTensor* dx) {
+  auto* dx_data = dx->data<T>();
+  auto* out_data = out.data<T>();
+  auto* dout_data = dout.data<T>();
+  int rank = out.dims().size();
+
+  std::vector<int> tensor_dims = GetSoftmaxTensorDims(out.dims(), axis);
+  int64_t remaining = tensor_dims[0];
+  int dim = tensor_dims[1];
+  int64_t batch_size = std::numeric_limits<int32_t>::max() / dim;
+  int offset = batch_size * dim;
+  while (remaining > 0) {
+    tensor_dims[0] = std::min(remaining, batch_size);
+    SoftmaxBackwardCudnnKernel<T>(dev_ctx,
+                                  out_data,
+                                  dout_data,
+                                  axis,
+                                  rank,
+                                  log_mode,
+                                  tensor_dims,
+                                  dx_data);
+    out_data += offset;
+    dout_data += offset;
+    dx_data += offset;
+    remaining -= batch_size;
+  }
+}
+
 #if CUDNN_VERSION < 8100
 template <>
-inline void SoftmaxForwardCudnnKernel<phi::dtype::bfloat16>(
+inline void LaunchSoftmaxForwardCudnnKernel<phi::dtype::bfloat16>(
     const GPUContext& dev_ctx,
-    const T* x_data,
+    const DenseTensor& x,
     const int axis,
-    const int rank,
     const bool log_mode,
-    const std::vector<int>& tensor_dims,
-    T* out_data) {
+    DenseTensor* out) {
   PADDLE_THROW(errors::Unavailable(
       "This kernel is not supported when the dtype is bf16 and CUDNN_VERSION < "
       "8100."));
 }
 template <>
-inline void SoftmaxBackwardCudnnKernel<phi::dtype::bfloat16>(
+inline void LaunchSoftmaxBackwardCudnnKernel<phi::dtype::bfloat16>(
     const GPUContext& dev_ctx,
-    const T* out_data,
-    const T* dout_data,
+    const DenseTensor& out,
+    const DenseTensor& dout,
     const int axis,
-    const int rank,
     const bool log_mode,
-    const std::vector<int>& tensor_dims,
-    T* dx_data) {
+    DenseTensor* dx) {
   PADDLE_THROW(errors::Unavailable(
       "This kernel is not supported when the dtype is bf16 and CUDNN_VERSION < "
       "8100."));
@@ -982,18 +1037,7 @@ void SoftmaxForwardCUDAKernelDriver(const GPUContext& dev_ctx,
                                                 dim_log2);
       }
     } else {
-      int64_t remaining = N;
-      auto* x_data = x.data<T>();
-      int64_t batch_size = INT_MAX / dim;
-      int offset = batch_size * dim;
-      while (remaining > 0) {
-        tensor_dims[0] = std::min(remaining, batch_size);
-        SoftmaxForwardCudnnKernel<T>(
-            dev_ctx, x_data, axis, rank, LogMode, tensor_dims, out_data);
-        x_data += offset;
-        out_data += offset;
-        remaining -= batch_size;
-      }
+      LaunchSoftmaxForwardCudnnKernel<T>(dev_ctx, x, axis, LogMode, out);
     }
   } else {
     LaunchNormalSoftmaxForward<T, LogMode>(
@@ -1068,26 +1112,8 @@ void SoftmaxBackwardCUDAKernelDriver(const GPUContext& dev_ctx,
                                                  dim_log2);
       }
     } else {
-      int64_t remaining = N;
-      auto* out_data = out.data<T>();
-      auto* dout_data = dout.data<T>();
-      int64_t batch_size = INT_MAX / dim;
-      int offset = batch_size * dim;
-      while (remaining > 0) {
-        tensor_dims[0] = std::min(remaining, batch_size);
-        SoftmaxBackwardCudnnKernel<T>(dev_ctx,
-                                      out_data,
-                                      dout_data,
-                                      axis,
-                                      rank,
-                                      LogMode,
-                                      tensor_dims,
-                                      dx_data);
-        out_data += offset;
-        dout_data += offset;
-        dx_data += offset;
-        remaining -= batch_size;
-      }
+      LaunchSoftmaxBackwardCudnnKernel<T>(
+          dev_ctx, out, dout, axis, LogMode, dx);
     }
   } else {
     LaunchNormalSoftmaxBackward(