Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a maximum grid-dimension limit for the reduce, elementwise and masked_select kernels #37355

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions paddle/fluid/operators/masked_select_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -100,19 +100,22 @@ class MaskedSelectCUDAKernel : public framework::OpKernel<T> {
int32_t* mask_array_data = mask_array.mutable_data<int32_t>(ctx.GetPlace());
int32_t* mask_prefix_sum_data =
mask_prefix_sum.mutable_data<int32_t>(ctx.GetPlace());
int threads = 512;
int grid = (mask_size + threads - 1) / threads;
unsigned int threads = 512;
unsigned int maxGridDimX =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx)
.GetCUDAMaxGridDimSize().x;
unsigned int num_rows = (mask_size + threads - 1) / threads;
// In the common case num_rows < maxGridDimX and the full grid is used;
// otherwise the launch is capped at the device's grid-dimension limit.
unsigned int grid = num_rows < maxGridDimX ? num_rows : maxGridDimX;
auto stream = ctx.cuda_device_context().stream();
SetMaskArray<<<grid, threads, 0, stream>>>(mask_data, mask_array_data,
mask_size);

thrust::device_ptr<int32_t> mask_array_dev_ptr =
thrust::device_pointer_cast(mask_array_data);
thrust::device_vector<int32_t> mask_array_vec(
mask_array_dev_ptr, mask_array_dev_ptr + mask_size);
thrust::exclusive_scan(thrust::device, mask_array_vec.begin(),
mask_array_vec.end(), mask_prefix_sum_data);

SelectWithPrefixMask<T><<<grid, threads, 0, stream>>>(
mask_prefix_sum_data, mask_data, input_data, out_data, mask_size);
}
Expand Down
11 changes: 8 additions & 3 deletions paddle/pten/kernels/gpu/elementwise.h
Original file line number Diff line number Diff line change
Expand Up @@ -218,8 +218,10 @@ void ElementwiseCudaKernel(const paddle::platform::CUDADeviceContext &ctx,
Functor func) {
auto numel = ins[0]->numel();
int block_size = funcs::GetThreadsConfig(ctx, numel, VecSize);
int grid_size =
unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
unsigned int num_rows =
((numel + VecSize - 1) / VecSize + block_size - 1) / block_size;
unsigned int grid_size = num_rows < maxGridDimX ? num_rows : maxGridDimX;
auto stream = ctx.stream();
paddle::framework::Array<const InT *__restrict__, Arity> ins_data;
paddle::framework::Array<OutT *, NumOuts> outs_data;
Expand Down Expand Up @@ -625,8 +627,11 @@ void LaunchKernel(const paddle::platform::CUDADeviceContext &ctx,
Functor func,
DimensionsTransform merge_dims) {
int numel = (*outs)[0]->numel();
const int threads = 256;
int blocks = ((numel + VecSize - 1) / VecSize + threads - 1) / threads;
const unsigned int threads = 256;
unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
unsigned int num_rows =
((numel + VecSize - 1) / VecSize + threads - 1) / threads;
unsigned int blocks = num_rows < maxGridDimX ? num_rows : maxGridDimX;

int main_offset = (numel / (VecSize * threads)) * VecSize * threads;
int tail_tid = numel % (VecSize * threads);
Expand Down
16 changes: 14 additions & 2 deletions paddle/pten/kernels/gpu/reduce.h
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ struct ReduceConfig {
: reduce_dims_origin(origin_reduce_dims), x_dim(origin_x_dim) {}

// get the parameters of reduceKernel
void Run() {
void Run(const paddle::platform::Place& place) {
// step1: update the reduce_dim left_dim and x_dim
SetReduceDim();

Expand All @@ -319,6 +319,9 @@ struct ReduceConfig {

// step4: set the block and grid for launch kernel
SetBlockDim();

    // step5: clamp the grid dimensions to the device limits to prevent thread overflow
LimitGridDim(place);
}

// when should_reduce_again is true, we need malloc temp space for temp data
Expand Down Expand Up @@ -601,6 +604,15 @@ struct ReduceConfig {
grid = grid_dim;
}

// Clamp each launch-grid dimension to the maximum CUDA grid size of the
// device identified by `place`, so kernel launches never exceed the
// hardware grid-dimension limits.
void LimitGridDim(const paddle::platform::Place& place) {
  auto* dev_ctx = static_cast<paddle::platform::CUDADeviceContext*>(
      paddle::platform::DeviceContextPool::Instance().Get(place));
  const dim3 bound = dev_ctx->GetCUDAMaxGridDimSize();
  if (grid.x > bound.x) grid.x = bound.x;
  if (grid.y > bound.y) grid.y = bound.y;
  if (grid.z > bound.z) grid.z = bound.z;
}

public:
std::vector<int> reduce_dims_origin;
std::vector<int> reduce_dim;
Expand Down Expand Up @@ -1075,7 +1087,7 @@ void TensorReduceFunctorImpl(const pten::DenseTensor& x,

auto x_dim = paddle::framework::vectorize<int>(x.dims());
auto config = ReduceConfig<Ty>(origin_reduce_dims, x_dim);
config.Run();
config.Run(x.place());
int numel = x.numel();
// after config.run()
// SetOutputData for ReduceHigherDim when should_reduce_again is true,
Expand Down