Add is_mean param for mean op #40757

Merged · 4 commits · Mar 24, 2022
7 changes: 4 additions & 3 deletions paddle/fluid/operators/mean_op.cu
@@ -65,9 +65,10 @@ class MeanCUDAKernel : public framework::OpKernel<T> {
for (decltype(rank) i = 0; i < rank; ++i) {
reduce_dims.push_back(i);
}
TensorReduceImpl<T, T, kernel_primitives::AddFunctor, Div>(
context.cuda_device_context(), *input, output, Div(numel), reduce_dims,
Contributor:

Can the Div functor defined earlier be removed now?

Contributor Author:

It has already been removed; the call now uses IdentityFunctor.

stream);
TensorReduceImpl<T, T, kernel_primitives::AddFunctor,
kps::IdentityFunctor<T>>(
context.cuda_device_context(), *input, output,
kps::IdentityFunctor<T>(), reduce_dims, stream, true);
}
};
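As background for the review thread above: with is_mean set to true, the division by the element count moves out of the per-element transform and into the reduce kernels, which is why the previously defined Div functor is no longer needed. A minimal host-side sketch of the equivalence (toy stand-ins only; the real kps functors are device functors with a different interface):

#include <cstdio>
#include <vector>

// Toy stand-ins for the two transforms.
struct ToyDiv {
  explicit ToyDiv(float n) : inv_n(1.0f / n) {}
  float operator()(float x) const { return x * inv_n; }  // pre-divide each element
  float inv_n;
};
struct ToyIdentity {
  float operator()(float x) const { return x; }  // pass through unchanged
};

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f, 4.f};
  const float n = static_cast<float>(x.size());

  float old_path = 0.f, new_path = 0.f;
  for (float v : x) old_path += ToyDiv(n)(v);      // old: sum of pre-divided values
  for (float v : x) new_path += ToyIdentity()(v);  // new: plain sum...
  new_path /= n;                                   // ...divided once in the kernel when is_mean

  std::printf("%f %f\n", old_path, new_path);  // both print 2.500000
  return 0;
}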

4 changes: 2 additions & 2 deletions paddle/fluid/operators/reduce_ops/reduce_op.cu.h
@@ -33,12 +33,12 @@ void TensorReduceImpl(const platform::CUDADeviceContext& dev_ctx,
const framework::Tensor& x, framework::Tensor* y,
const TransformOp& transform,
const std::vector<int>& origin_reduce_dims,
gpuStream_t stream) {
gpuStream_t stream, bool is_mean = false) {
Contributor:

Should is_mean be marked const?

Contributor Author:

OK, I will change that in the next PR.

y->mutable_data<Ty>(x.place());

phi::funcs::ReduceKernel<Tx, Ty, ReduceOp, TransformOp>(
static_cast<const phi::GPUContext&>(dev_ctx), x, y, transform,
origin_reduce_dims);
origin_reduce_dims, is_mean);
}

} // namespace operators
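Because the new is_mean parameter defaults to false, existing TensorReduceImpl call sites keep compiling and behaving exactly as before; only the mean op opts in. A self-contained toy illustration of that default-argument design (the names below are illustrative and not taken from the Paddle tree):

#include <cstdio>
#include <vector>

// Illustrative stand-in for a reduce entry point that grows a trailing
// is_mean flag with a default value, so pre-existing callers need no changes.
template <typename T, typename TransformOp>
T ToyTensorReduce(const std::vector<T>& x, const TransformOp& transform,
                  bool is_mean = false) {
  T acc = T(0);
  for (const T& v : x) acc += transform(v);       // sum of transformed elements
  if (is_mean) acc /= static_cast<T>(x.size());   // fold the mean division in
  return acc;
}

int main() {
  std::vector<float> x = {2.f, 4.f, 6.f};
  auto identity = [](float v) { return v; };
  std::printf("sum  = %f\n", ToyTensorReduce(x, identity));        // old-style call: 12
  std::printf("mean = %f\n", ToyTensorReduce(x, identity, true));  // opted-in call: 4
  return 0;
}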
105 changes: 73 additions & 32 deletions paddle/phi/kernels/funcs/reduce_function.h
@@ -453,25 +453,20 @@ struct ReduceConfig {
void SetReduceType() {
int rank = x_dim.size();
int reduce_rank = reduce_dim.size();
bool is_last_dim =
(rank == 2) && (reduce_rank == 1) && (reduce_dim[0] == 1);
if (rank == reduce_rank || is_last_dim) {
#ifdef PADDLE_WITH_XPU_KP
reduce_type = static_cast<int>(ReduceType::kReduceAny);
bool not_higher = x_dim[0] > 1;
#else
reduce_type = static_cast<int>(ReduceType::kReduceLastDim);
int device_id = paddle::platform::GetCurrentDeviceId();
int max_grid_z = phi::backends::gpu::GetGpuMaxGridDimSize(device_id)[2];
bool not_higher = x_dim[0] >= max_grid_z;
#endif
if (reduce_last_dim && (reduce_rank == 1)) {
reduce_type = static_cast<int>(ReduceType::kReduceLastDim);
} else if (reduce_rank == 1) {
// ReduceFirstDim and reduceSecondDim
#ifdef PADDLE_WITH_XPU_KP
if (reduce_dim[0] == 0) {
reduce_type = static_cast<int>(ReduceType::kReduceHigherDim);
} else {
reduce_type = static_cast<int>(ReduceType::kReduceHigherDim);
if (rank == 3 && not_higher) {
reduce_type = static_cast<int>(ReduceType::kReduceAny);
}
#else
reduce_type = static_cast<int>(ReduceType::kReduceHigherDim);
#endif
} else {
reduce_type = static_cast<int>(ReduceType::kReduceAny);
}
@@ -648,7 +643,8 @@ __global__ void ReduceAnyKernel(const Tx* x,
bool reduce_last_dim,
const Calculator reduce_index_calculator,
const Calculator left_index_calculator,
const kps::DimConfig dim) {
const kps::DimConfig dim,
bool is_mean) {
int input_idx, left_idx, stride;
int block_size = 0;
bool need_store = true;
@@ -752,7 +748,9 @@ __global__ void ReduceAnyKernel(const Tx* x,

kps::Reduce<MPType, 1, 1, 1, ReduceOp, kps::details::kGlobalMode>(
&reduce_var, &reduce_var, reducer, reduce_last_dim);

if (is_mean) {
reduce_var = reduce_var / static_cast<MPType>(reduce_num);
}
Ty result = static_cast<Ty>(reduce_var);
kps::details::WriteData<Ty>(
y + store_offset + i, &result, static_cast<int>(need_store));
@@ -772,7 +770,9 @@ __global__ void ReduceHigherDimKernel(const Tx* x,
int reduce_num,
int left_num,
int blocking_size,
const kps::DimConfig dim) {
const kps::DimConfig dim,
int mean_div,
bool is_mean) {
// when reduce_dim.size() == 1 and reduce_dim[0] != x_dim.size() - 1, this
// function will be used
auto block = ReduceIndexMapping<false>(dim);
@@ -806,6 +806,9 @@ __global__ void ReduceHigherDimKernel(const Tx* x,
kps::details::ReduceMode::kLocalMode>(
&reduce_var, &reduce_compute, reducer, false);
}
if (is_mean) {
reduce_var = reduce_var / static_cast<MPType>(mean_div);
}
Ty result = static_cast<Ty>(reduce_var);
kps::WriteData<Ty, 1, 1, 1, false>(
y + store_offset + idx, &result, block.BlockDimX());
@@ -831,6 +834,10 @@ __global__ void ReduceHigherDimKernel(const Tx* x,
kps::details::ReduceMode::kLocalMode>(
&reduce_var, &reduce_compute, reducer, false);
}

if (is_mean) {
reduce_var = reduce_var / static_cast<MPType>(mean_div);
}
Ty result = static_cast<Ty>(reduce_var);
kps::WriteData<Ty, 1, 1, 1, true>(
y + store_offset + idx, &result, dim.rem_x);
@@ -848,7 +855,8 @@ static void LaunchReduceKernel(const Tx* x_data,
const TransformOp& transform,
MPType init,
KPStream stream,
ReduceConfig<Ty> config) {
ReduceConfig<Ty> config,
bool is_mean = false) {
if (config.reduce_type == kReduceLastDim) {
int stride_reduce = 1;
int stride_left = config.reduce_num;
@@ -887,7 +895,8 @@ static void LaunchReduceKernel(const Tx* x_data,
config.reduce_last_dim,
reduce_index_calculator,
left_index_calculator,
dim);
dim,
is_mean && (!config.should_reduce_again));

} else {
int reduce_rank = config.reduce_strides.size();
@@ -930,7 +939,8 @@ static void LaunchReduceKernel(const Tx* x_data,
config.reduce_last_dim,
reduce_index_calculator,
left_index_calculator,
dim);
dim,
is_mean && (!config.should_reduce_again));
}

if (config.should_reduce_again) {
@@ -950,15 +960,18 @@ static void LaunchReduceKernel(const Tx* x_data,
kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0);
dim.SetRem(config.left_num % block.x, 0, 0);
#ifdef PADDLE_WITH_XPU_KP
grid = 8;
block = 64;
int grid_size = 8;
int block_size = 64;
#else
auto grid_size = grid;
auto block_size = block;
#endif
ReduceHigherDimKernel<
Ty,
Ty,
MPType,
ReduceOp,
kps::IdentityFunctor<Ty, MPType>><<<grid, block, 0, stream>>>(
kps::IdentityFunctor<Ty, MPType>><<<grid_size, block_size, 0, stream>>>(
config.output_data,
y_data,
reducer,
Expand All @@ -967,7 +980,9 @@ static void LaunchReduceKernel(const Tx* x_data,
config.grid.y,
config.left_num,
config.grid.y,
dim);
dim,
config.reduce_num,
is_mean);
}
}

@@ -1034,7 +1049,8 @@ void ReduceKernel(const KPDevice& dev_ctx,
const phi::DenseTensor& x,
phi::DenseTensor* y,
const TransformOp& transform,
const std::vector<int>& origin_reduce_dims) {
const std::vector<int>& origin_reduce_dims,
bool is_mean = false) {
#ifdef PADDLE_WITH_XPU_KP
auto stream = dev_ctx.x_context()->xpu_stream;
#else
@@ -1069,8 +1085,18 @@ void ReduceKernel(const KPDevice& dev_ctx,
bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16;
#ifndef PADDLE_WITH_XPU_KP
if (use_cub_reduce) {
CubTensorReduceImpl<Tx, Ty, ReduceOp, TransformOp>(
x_data, y_data, transform, config.reduce_num, dev_ctx, stream);
if (is_mean) {
using Div = kps::DivideFunctor<Tx>;
CubTensorReduceImpl<Tx, Ty, ReduceOp, Div>(x_data,
y_data,
Div(config.reduce_num),
config.reduce_num,
dev_ctx,
stream);
} else {
CubTensorReduceImpl<Tx, Ty, ReduceOp, TransformOp>(
x_data, y_data, transform, config.reduce_num, dev_ctx, stream);
}
return;
}
#endif
@@ -1115,7 +1141,9 @@ void ReduceKernel(const KPDevice& dev_ctx,
config.reduce_num,
config.left_num,
config.blocking_size,
dim);
dim,
config.reduce_num,
is_mean && (!config.should_reduce_again));

if (config.should_reduce_again) {
dim3 block = dim3(config.block.x, 1, 1);
@@ -1125,15 +1153,19 @@ void ReduceKernel(const KPDevice& dev_ctx,
dim2.SetRem(config.left_num % config.block.x, 0, 0);

#ifdef PADDLE_WITH_XPU_KP
grid = 8;
block = 64;
int grid_size = 8;
int block_size = 64;
#else
auto grid_size = grid;
auto block_size = block;
#endif
ReduceHigherDimKernel<
Ty,
Ty,
MPType,
ReduceOp<MPType>,
kps::IdentityFunctor<Ty, MPType>><<<grid, block, 0, stream>>>(
kps::IdentityFunctor<Ty,
MPType>><<<grid_size, block_size, 0, stream>>>(
config.output_data,
y_data,
reducer,
Expand All @@ -1142,7 +1174,9 @@ void ReduceKernel(const KPDevice& dev_ctx,
config.grid.y,
config.left_num,
config.grid.y,
dim2);
dim2,
config.reduce_num,
is_mean);
}
return;
}
@@ -1151,7 +1185,14 @@ void ReduceKernel(const KPDevice& dev_ctx,
// when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this
// function will be used
LaunchReduceKernel<Tx, Ty, MPType, ReduceOp<MPType>, TransformOp>(
x_data, y_data, reducer, transform, reducer.initial(), stream, config);
x_data,
y_data,
reducer,
transform,
reducer.initial(),
stream,
config,
is_mean);
}

} // namespace funcs
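Note how the kernels above divide only on the pass that writes the final output: the first-pass launches receive is_mean && (!config.should_reduce_again), and when a second ReduceHigherDimKernel pass runs it is handed the original config.reduce_num as mean_div, so the accumulated sum is divided by the full element count exactly once. A host-side toy of that two-pass scheme (simplified, assumed behavior reconstructed from the diff above):

#include <cstdio>
#include <vector>

int main() {
  // 8 elements reduced to a mean in two passes, mimicking the case where
  // should_reduce_again is true and the division is deferred to pass 2.
  std::vector<float> x = {1, 2, 3, 4, 5, 6, 7, 8};
  const int reduce_num = static_cast<int>(x.size());  // plays the role of mean_div
  const int num_blocks = 2;
  const int per_block = reduce_num / num_blocks;

  // Pass 1: each "block" writes a partial sum; the mean division is
  // suppressed here (is_mean && !should_reduce_again is false).
  std::vector<float> partial(num_blocks, 0.f);
  for (int b = 0; b < num_blocks; ++b)
    for (int i = 0; i < per_block; ++i) partial[b] += x[b * per_block + i];

  // Pass 2: combine the partial sums and divide once by the original
  // reduce_num (the mean_div argument), yielding the mean.
  float total = 0.f;
  for (float p : partial) total += p;
  float mean = total / static_cast<float>(reduce_num);

  std::printf("mean = %f\n", mean);  // 4.500000
  return 0;
}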
13 changes: 10 additions & 3 deletions paddle/phi/kernels/gpu/reduce.h
@@ -30,7 +30,8 @@ void Reduce(const KPDevice& dev_ctx,
const std::vector<int64_t>& dims,
bool keep_dim,
DataType out_dtype,
DenseTensor* out) {
DenseTensor* out,
bool is_mean = false) {
std::vector<int> reduce_dims =
phi::funcs::details::GetReduceDim(dims, x.dims().size(), reduce_all);

@@ -57,12 +58,18 @@ void Reduce(const KPDevice& dev_ctx,
tmp_tensor,
out,
TransformOp<data_t, MPType>(reduce_num),
reduce_dims);
reduce_dims,
is_mean);
}));
} else {
using MPType = typename kps::details::MPTypeTrait<T>::Type;
phi::funcs::ReduceKernel<T, T, ReduceOp, TransformOp<T, MPType>>(
dev_ctx, x, out, TransformOp<T, MPType>(reduce_num), reduce_dims);
dev_ctx,
x,
out,
TransformOp<T, MPType>(reduce_num),
reduce_dims,
is_mean);
}
}
} // namespace phi
4 changes: 2 additions & 2 deletions paddle/phi/kernels/gpu/reduce_kernel.cu
@@ -27,8 +27,8 @@ void MeanRawKernel(const Context& dev_ctx,
bool reduce_all,
DenseTensor* out) {
auto out_dtype = x.dtype();
phi::Reduce<T, kps::AddFunctor, kps::DivideFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
phi::Reduce<T, kps::AddFunctor, kps::IdentityFunctor>(
dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out, true);
}

template <typename T, typename Context>