Add launch bounds to limit register usage on the Volta architecture

PaddlePaddle · Dec 14, 2021 · 552ccb7 · 552ccb7 · paddle-bot-old · Dec 14, 2021
1 parent 4c1e27c
commit 552ccb7
Showing 1 changed file with 3 additions and 2 deletions.
diff --git a/paddle/fluid/operators/math/segment_pooling.cu b/paddle/fluid/operators/math/segment_pooling.cu
@@ -120,8 +120,9 @@ __global__ void SegmentMeanKernel(const Index* segment_ids, const T* input,
 }
 
 template <typename T, typename Index, typename Helper, typename Pool>
-__global__ void SegmentOpsKernel(const Index* segment_ids, const T* input,
-                                 T* output, Helper h, Pool pool) {
+__global__ void __launch_bounds__(1024, 1)
+    SegmentOpsKernel(const Index* segment_ids, const T* input, T* output,
+                     Helper h, Pool pool) {
   CUDA_KERNEL_LOOP(stripe_index, h.total_stripe_count) {
     Index segment_offset, dim_index_base, actual_height;
     Index inner_dim_size = h.inner_dim_size;