diff --git a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu
index e8964765ec654..813dce6080130 100644
--- a/paddle/fluid/operators/mean_op.cu
+++ b/paddle/fluid/operators/mean_op.cu
@@ -65,9 +65,10 @@ class MeanCUDAKernel : public framework::OpKernel<T> {
     for (decltype(rank) i = 0; i < rank; ++i) {
       reduce_dims.push_back(i);
     }
-    TensorReduceImpl<T, T, kernel_primitives::AddFunctor, Div>(
-        context.cuda_device_context(), *input, output, Div(numel), reduce_dims,
-        stream);
+    TensorReduceImpl<T, T, kernel_primitives::AddFunctor,
+                     kps::IdentityFunctor<T>>(
+        context.cuda_device_context(), *input, output,
+        kps::IdentityFunctor<T>(), reduce_dims, stream, true);
   }
 };
 
diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h
index 160617695338a..b21e41c5b8548 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h
@@ -33,12 +33,12 @@ void TensorReduceImpl(const platform::CUDADeviceContext& dev_ctx,
                       const framework::Tensor& x, framework::Tensor* y,
                       const TransformOp& transform,
                       const std::vector<int>& origin_reduce_dims,
-                      gpuStream_t stream) {
+                      gpuStream_t stream, bool is_mean = false) {
   y->mutable_data<Ty>(x.place());
 
   phi::funcs::ReduceKernel<Tx, Ty, ReduceOp, TransformOp>(
       static_cast<const phi::GPUContext&>(dev_ctx), x, y, transform,
-      origin_reduce_dims);
+      origin_reduce_dims, is_mean);
 }
 
 }  // namespace operators
diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h
index 85c371e9f9d45..17f5cd67ec957 100644
--- a/paddle/phi/kernels/funcs/reduce_function.h
+++ b/paddle/phi/kernels/funcs/reduce_function.h
@@ -453,25 +453,20 @@ struct ReduceConfig {
   void SetReduceType() {
     int rank = x_dim.size();
     int reduce_rank = reduce_dim.size();
-    bool is_last_dim =
-        (rank == 2) && (reduce_rank == 1) && (reduce_dim[0] == 1);
-    if (rank == reduce_rank || is_last_dim) {
 #ifdef PADDLE_WITH_XPU_KP
-      reduce_type = static_cast<int>(ReduceType::kReduceAny);
+    bool not_higher = x_dim[0] > 1;
 #else
-      reduce_type = static_cast<int>(ReduceType::kReduceLastDim);
+    int device_id = paddle::platform::GetCurrentDeviceId();
+    int max_grid_z = phi::backends::gpu::GetGpuMaxGridDimSize(device_id)[2];
+    bool not_higher = x_dim[0] >= max_grid_z;
 #endif
+    if (reduce_last_dim && (reduce_rank == 1)) {
+      reduce_type = static_cast<int>(ReduceType::kReduceLastDim);
     } else if (reduce_rank == 1) {
-// ReduceFirstDim and reduceSecondDim
-#ifdef PADDLE_WITH_XPU_KP
-      if (reduce_dim[0] == 0) {
-        reduce_type = static_cast<int>(ReduceType::kReduceHigherDim);
-      } else {
+      reduce_type = static_cast<int>(ReduceType::kReduceHigherDim);
+      if (rank == 3 && not_higher) {
         reduce_type = static_cast<int>(ReduceType::kReduceAny);
       }
-#else
-      reduce_type = static_cast<int>(ReduceType::kReduceHigherDim);
-#endif
     } else {
       reduce_type = static_cast<int>(ReduceType::kReduceAny);
     }
@@ -648,7 +643,8 @@ __global__ void ReduceAnyKernel(const Tx* x,
                                 bool reduce_last_dim,
                                 const Calculator reduce_index_calculator,
                                 const Calculator left_index_calculator,
-                                const kps::DimConfig dim) {
+                                const kps::DimConfig dim,
+                                bool is_mean) {
   int input_idx, left_idx, stride;
   int block_size = 0;
   bool need_store = true;
@@ -752,7 +748,9 @@ __global__ void ReduceAnyKernel(const Tx* x,
     kps::Reduce<MPType, 1, 1, 1, ReduceOp,
                 kps::details::ReduceMode::kGlobalMode>(
         &reduce_var, &reduce_var, reducer, reduce_last_dim);
-
+    if (is_mean) {
+      reduce_var = reduce_var / static_cast<MPType>(reduce_num);
+    }
     Ty result = static_cast<Ty>(reduce_var);
     kps::details::WriteData<Ty>(
         y + store_offset + i, &result, static_cast<int>(need_store));
@@ -772,7 +770,9 @@ __global__ void ReduceHigherDimKernel(const Tx* x,
                                       int reduce_num,
                                       int left_num,
                                       int blocking_size,
-                                      const kps::DimConfig dim) {
+                                      const kps::DimConfig dim,
+                                      int mean_div,
+                                      bool is_mean) {
   // when reduce_dim.size() == 1 and reduce_dim[0] != x_dim.size() - 1, this
   // function will be used
   auto block = ReduceIndexMapping<false>(dim);
@@ -806,6 +806,9 @@ __global__ void ReduceHigherDimKernel(const Tx* x,
                   kps::details::ReduceMode::kLocalMode>(
           &reduce_var, &reduce_compute, reducer, false);
     }
+    if (is_mean) {
+      reduce_var = reduce_var / static_cast<MPType>(mean_div);
+    }
     Ty result = static_cast<Ty>(reduce_var);
     kps::WriteData<Ty, 1, 1, 1, false>(
         y + store_offset + idx, &result, block.BlockDimX());
@@ -831,6 +834,10 @@ __global__ void ReduceHigherDimKernel(const Tx* x,
                   kps::details::ReduceMode::kLocalMode>(
           &reduce_var, &reduce_compute, reducer, false);
     }
+
+    if (is_mean) {
+      reduce_var = reduce_var / static_cast<MPType>(mean_div);
+    }
     Ty result = static_cast<Ty>(reduce_var);
     kps::WriteData<Ty, 1, 1, 1, true>(
         y + store_offset + idx, &result, dim.rem_x);
@@ -848,7 +855,8 @@ static void LaunchReduceKernel(const Tx* x_data,
                                const TransformOp& transform,
                                MPType init,
                                KPStream stream,
-                               ReduceConfig<Ty> config) {
+                               ReduceConfig<Ty> config,
+                               bool is_mean = false) {
   if (config.reduce_type == kReduceLastDim) {
     int stride_reduce = 1;
     int stride_left = config.reduce_num;
@@ -887,7 +895,8 @@ static void LaunchReduceKernel(const Tx* x_data,
         config.reduce_last_dim,
         reduce_index_calculator,
         left_index_calculator,
-        dim);
+        dim,
+        is_mean && (!config.should_reduce_again));
 
   } else {
     int reduce_rank = config.reduce_strides.size();
@@ -930,7 +939,8 @@ static void LaunchReduceKernel(const Tx* x_data,
         config.reduce_last_dim,
         reduce_index_calculator,
         left_index_calculator,
-        dim);
+        dim,
+        is_mean && (!config.should_reduce_again));
   }
 
   if (config.should_reduce_again) {
@@ -950,15 +960,18 @@ static void LaunchReduceKernel(const Tx* x_data,
         kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0);
     dim.SetRem(config.left_num % block.x, 0, 0);
 #ifdef PADDLE_WITH_XPU_KP
-    grid = 8;
-    block = 64;
+    int grid_size = 8;
+    int block_size = 64;
+#else
+    auto grid_size = grid;
+    auto block_size = block;
 #endif
     ReduceHigherDimKernel<
         Ty,
         Ty,
         MPType,
         ReduceOp,
-        kps::IdentityFunctor<Ty, MPType>><<<grid, block, 0, stream>>>(
+        kps::IdentityFunctor<Ty, MPType>><<<grid_size, block_size, 0, stream>>>(
         config.output_data,
         y_data,
         reducer,
@@ -967,7 +980,9 @@ static void LaunchReduceKernel(const Tx* x_data,
         config.grid.y,
         config.left_num,
         config.grid.y,
-        dim);
+        dim,
+        config.reduce_num,
+        is_mean);
   }
 }
 
@@ -1034,7 +1049,8 @@ void ReduceKernel(const KPDevice& dev_ctx,
                   const phi::DenseTensor& x,
                   phi::DenseTensor* y,
                   const TransformOp& transform,
-                  const std::vector<int>& origin_reduce_dims) {
+                  const std::vector<int>& origin_reduce_dims,
+                  bool is_mean = false) {
 #ifdef PADDLE_WITH_XPU_KP
   auto stream = dev_ctx.x_context()->xpu_stream;
 #else
@@ -1069,8 +1085,18 @@ void ReduceKernel(const KPDevice& dev_ctx,
   bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16;
 #ifndef PADDLE_WITH_XPU_KP
   if (use_cub_reduce) {
-    CubTensorReduceImpl<Tx, Ty, ReduceOp, TransformOp>(
-        x_data, y_data, transform, config.reduce_num, dev_ctx, stream);
+    if (is_mean) {
+      using Div = kps::DivideFunctor<Tx>;
+      CubTensorReduceImpl<Tx, Ty, ReduceOp, Div>(x_data,
+                                                 y_data,
+                                                 Div(config.reduce_num),
+                                                 config.reduce_num,
+                                                 dev_ctx,
+                                                 stream);
+    } else {
+      CubTensorReduceImpl<Tx, Ty, ReduceOp, TransformOp>(
+          x_data, y_data, transform, config.reduce_num, dev_ctx, stream);
+    }
     return;
   }
 #endif
@@ -1115,7 +1141,9 @@ void ReduceKernel(const KPDevice& dev_ctx,
         config.reduce_num,
         config.left_num,
         config.blocking_size,
-        dim);
+        dim,
+        config.reduce_num,
+        is_mean && (!config.should_reduce_again));
 
     if (config.should_reduce_again) {
       dim3 block = dim3(config.block.x, 1, 1);
@@ -1125,15 +1153,19 @@ void ReduceKernel(const KPDevice& dev_ctx,
           kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0);
       dim2.SetRem(config.left_num % config.block.x, 0, 0);
 #ifdef PADDLE_WITH_XPU_KP
-      grid = 8;
-      block = 64;
+      int grid_size = 8;
+      int block_size = 64;
+#else
+      auto grid_size = grid;
+      auto block_size = block;
 #endif
       ReduceHigherDimKernel<
           Ty,
           Ty,
           MPType,
           ReduceOp<MPType>,
-          kps::IdentityFunctor<Ty, MPType>><<<grid, block, 0, stream>>>(
+          kps::IdentityFunctor<Ty, MPType>><<<grid_size, block_size, 0,
+                                              stream>>>(
           config.output_data,
           y_data,
           reducer,
@@ -1142,7 +1174,9 @@ void ReduceKernel(const KPDevice& dev_ctx,
           config.grid.y,
           config.left_num,
           config.grid.y,
-          dim2);
+          dim2,
+          config.reduce_num,
+          is_mean);
     }
     return;
   }
@@ -1151,7 +1185,14 @@ void ReduceKernel(const KPDevice& dev_ctx,
   // when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this
   // function will be used
   LaunchReduceKernel<Tx, Ty, MPType, ReduceOp<MPType>, TransformOp>(
-      x_data, y_data, reducer, transform, reducer.initial(), stream, config);
+      x_data,
+      y_data,
+      reducer,
+      transform,
+      reducer.initial(),
+      stream,
+      config,
+      is_mean);
 }
 
 }  // namespace funcs
diff --git a/paddle/phi/kernels/gpu/reduce.h b/paddle/phi/kernels/gpu/reduce.h
index da5315f34479f..e47b3afc9c355 100644
--- a/paddle/phi/kernels/gpu/reduce.h
+++ b/paddle/phi/kernels/gpu/reduce.h
@@ -30,7 +30,8 @@ void Reduce(const KPDevice& dev_ctx,
             const std::vector<int64_t>& dims,
             bool keep_dim,
             DataType out_dtype,
-            DenseTensor* out) {
+            DenseTensor* out,
+            bool is_mean = false) {
   std::vector<int> reduce_dims =
       phi::funcs::details::GetReduceDim(dims, x.dims().size(), reduce_all);
 
@@ -57,12 +58,18 @@ void Reduce(const KPDevice& dev_ctx,
               tmp_tensor,
               out,
               TransformOp<data_t, MPType>(reduce_num),
-              reduce_dims);
+              reduce_dims,
+              is_mean);
         }));
   } else {
     using MPType = typename kps::details::MPTypeTrait<T>::Type;
     phi::funcs::ReduceKernel<T, T, ReduceOp, TransformOp<T, MPType>>(
-        dev_ctx, x, out, TransformOp<T, MPType>(reduce_num), reduce_dims);
+        dev_ctx,
+        x,
+        out,
+        TransformOp<T, MPType>(reduce_num),
+        reduce_dims,
+        is_mean);
   }
 }
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/reduce_kernel.cu b/paddle/phi/kernels/gpu/reduce_kernel.cu
index 6cbe699e8e058..fabd13d4a737c 100644
--- a/paddle/phi/kernels/gpu/reduce_kernel.cu
+++ b/paddle/phi/kernels/gpu/reduce_kernel.cu
@@ -27,8 +27,8 @@ void MeanRawKernel(const Context& dev_ctx,
                    bool reduce_all,
                    DenseTensor* out) {
   auto out_dtype = x.dtype();
-  phi::Reduce<T, kps::AddFunctor, kps::DivideFunctor>(
-      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
+  phi::Reduce<T, kps::AddFunctor, kps::IdentityFunctor>(
+      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out, true);
 }
 
 template <typename T, typename Context>
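
Note (illustration, not part of the patch): the is_mean flag threaded from MeanRawKernel through phi::Reduce into phi::funcs::ReduceKernel folds the mean's division into the reduction epilogue, so each output is divided once by the reduce count instead of scaling every input element with a divide transform. A minimal standalone sketch of that pattern, using a hypothetical helper name rather than Paddle's API:

    // ReduceSumMaybeMean is a hypothetical helper mirroring the is_mean
    // epilogue added in the diff; it is not a Paddle function.
    #include <vector>

    float ReduceSumMaybeMean(const std::vector<float>& x, bool is_mean) {
      float acc = 0.0f;
      for (float v : x) {
        acc += v;  // per-element work is a plain sum (identity transform)
      }
      if (is_mean && !x.empty()) {
        acc /= static_cast<float>(x.size());  // single division in the epilogue
      }
      return acc;
    }

    int main() {
      std::vector<float> v = {1.0f, 2.0f, 3.0f, 4.0f};
      float mean = ReduceSumMaybeMean(v, /*is_mean=*/true);  // 2.5f
      return mean == 2.5f ? 0 : 1;
    }

For the same reason, a caller that sets is_mean should pass an identity-style transform rather than a per-element divide, otherwise the result would be scaled twice; the CUB branch above handles this by substituting Div(config.reduce_num) for the caller's transform only when is_mean is set.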