From 552ccb7febd4f660d01b43567104bb4c02e0c7bb Mon Sep 17 00:00:00 2001
From: zlsh80826
Date: Tue, 14 Dec 2021 05:30:03 +0000
Subject: [PATCH] add launch bound to limit the registers usage for volta architecture

---
 paddle/fluid/operators/math/segment_pooling.cu | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/math/segment_pooling.cu b/paddle/fluid/operators/math/segment_pooling.cu
index 67cf3162460073..0cbfaa4c5df7bd 100644
--- a/paddle/fluid/operators/math/segment_pooling.cu
+++ b/paddle/fluid/operators/math/segment_pooling.cu
@@ -120,8 +120,9 @@ __global__ void SegmentMeanKernel(const Index* segment_ids, const T* input,
 }
 
 template <typename T, typename Index, typename Helper, typename Pool>
-__global__ void SegmentOpsKernel(const Index* segment_ids, const T* input,
-                                 T* output, Helper h, Pool pool) {
+__global__ void __launch_bounds__(1024, 1)
+    SegmentOpsKernel(const Index* segment_ids, const T* input, T* output,
+                     Helper h, Pool pool) {
   CUDA_KERNEL_LOOP(stripe_index, h.total_stripe_count) {
     Index segment_offset, dim_index_base, actual_height;
     Index inner_dim_size = h.inner_dim_size;