PaddlePaddle · XiaoguangHu01 · Jun 22, 2021 · Jun 18, 2021 · jeff41404 · Jun 18, 2021
diff --git a/paddle/fluid/operators/correlation_op.cu b/paddle/fluid/operators/correlation_op.cu
@@ -55,6 +55,7 @@ __forceinline__ __device__ T blockReduceSum(T val) {
   int wid = threadIdx.x / warpSize;
 
   val = warpReduceSum(val);
+  __syncthreads();
   if (lane == 0) shared[wid] = val;
 
   __syncthreads();

diff --git a/paddle/fluid/operators/layer_norm_op.cu b/paddle/fluid/operators/layer_norm_op.cu
@@ -64,17 +64,16 @@ static __forceinline__ __device__ U WarpReduceSum(U val) {
 }
 
 template <typename U>
-__forceinline__ __device__ U BlockReduceSum(U val) {
-  static __shared__ U shared[32];
+__forceinline__ __device__ U BlockReduceSum(U val, U *shared) {
   int lane = threadIdx.x % warpSize;
   int wid = threadIdx.x / warpSize;
 
   val = WarpReduceSum(val);  // Each warp performs partial reduction
 
+  __syncthreads();
   if (lane == 0) shared[wid] = val;  // Write reduced value to shared memory
 
   __syncthreads();  // Wait for all partial reductions
-
   // read from shared memory only if that warp existed
   val =
       (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : static_cast<U>(0);
@@ -183,6 +182,9 @@ __global__ void LayerNormForward(const T *x, const U *scale, const U *bias,
                                  int64_t feature_size) {
   __shared__ U mean_share;
   __shared__ U var_share;
+  __shared__ U shared_mean[32];  // threadIdx.x / warpSize <= kMaxBlockDim /
+                                 // warpSize <= 1024/32 = 32;
+  __shared__ U shared_var[32];
 
   int64_t beg_idx = blockIdx.x * feature_size + threadIdx.x;
   int64_t end_idx = (blockIdx.x + 1) * feature_size;
@@ -196,8 +198,8 @@ __global__ void LayerNormForward(const T *x, const U *scale, const U *bias,
     var_val += (tmp * tmp);
   }
 
-  mean_val = BlockReduceSum<U>(mean_val);
-  var_val = BlockReduceSum<U>(var_val);
+  mean_val = BlockReduceSum<U>(mean_val, shared_mean);
+  var_val = BlockReduceSum<U>(var_val, shared_var);
 
   if (threadIdx.x == 0) {
     auto scale = static_cast<float>(1.) / static_cast<float>(feature_size);
@@ -541,8 +543,11 @@ __global__ void LayerNormBackwardGradientAll(
     }
   }
 
-  d_scale_partial = BlockReduceSum<U>(d_scale_partial);
-  d_bias_partial = BlockReduceSum<U>(d_bias_partial);
+  __shared__ U shared_scale[32];  // threadIdx.x / warpSize <= kMaxBlockDim /
+                                  // warpSize <= 1024/32 = 32;
+  __shared__ U shared_bias[32];
+  d_scale_partial = BlockReduceSum<U>(d_scale_partial, shared_scale);
+  d_bias_partial = BlockReduceSum<U>(d_bias_partial, shared_bias);
 
   if (threadIdx.x == 0) {
     d_scale[blockIdx.x + col_offset] = d_scale_partial;

diff --git a/paddle/fluid/operators/math/math_cuda_utils.h b/paddle/fluid/operators/math/math_cuda_utils.h
@@ -188,6 +188,7 @@ __inline__ __device__ T blockReduceSum(T val, unsigned mask) {
 
   val = warpReduceSum<T>(val, mask);
 
+  __syncthreads();
   if (lane == 0) shared[wid] = val;
 
   __syncthreads();