Optimized Res-block Fusion without SE (#1678)
* misc changes to cudnn backend

- replace all cudaMemcpyAsync calls used for loading weights with cudaMemcpy, as the source (in CPU memory) could be deleted before the async version of the function actually performs the copy (see the sketch after this list).
- minor naming/style changes.
- add a comment explaining what the policy map layer does and how the layout conversion from CHW to HWC works (also illustrated after this list).
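
A minimal sketch of the weight-upload point (illustrative only; LoadWeights, weights_gpu, cpu_weights and num_weights are placeholder names, not identifiers from this commit). A synchronous cudaMemcpy has finished reading the host buffer by the time it returns, so the caller may free that buffer immediately, whereas cudaMemcpyAsync from pageable host memory may still be reading it afterwards:

#include <cuda_runtime.h>
#include <cstddef>

void LoadWeights(float* weights_gpu, const float* cpu_weights, size_t num_weights) {
  const size_t bytes = num_weights * sizeof(float);
  // Pattern removed by the commit (unsafe if cpu_weights is freed right after the call):
  //   cudaMemcpyAsync(weights_gpu, cpu_weights, bytes, cudaMemcpyHostToDevice, stream);
  // Pattern used instead: returns only after the host buffer has been consumed.
  cudaMemcpy(weights_gpu, cpu_weights, bytes, cudaMemcpyHostToDevice);
}

And a toy index mapping for the CHW -> HWC conversion mentioned for the policy map layer, assuming the usual 8x8 board (again an illustration, not the comment added by the commit):

// Element (c, h, w) sits at offset c*64 + h*8 + w in CHW layout and at
// offset (h*8 + w)*C + c in HWC layout (the channel becomes the fastest-varying index).
inline int ChwToHwcIndex(int c, int h, int w, int C) {
  const int square = h * 8 + w;  // board square in row-major order
  return square * C + c;
}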

* fix typo in comment

* clang-format

* address review comment

* Add 320 and 352 channel support for fused SE layer

- just add template instantiations (a sketch of the idea follows this list).
- verified that it works and provides a (very) slight speedup.
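
The idea as a hedged sketch (FusedSEKernel and LaunchFusedSE are made-up stand-ins for the real channel-templated SE kernel and its dispatcher; the point is only that each supported filter count is a separate compile-time instantiation):

#include <cuda_fp16.h>
#include <cuda_runtime.h>

template <int C>
__global__ void FusedSEKernel(half* output, const half* input) {
  // stand-in body; the real kernel performs the fused SE + output transform for C channels
}

bool LaunchFusedSE(int N, int C, half* output, const half* input, cudaStream_t stream) {
  if (C == 320)
    FusedSEKernel<320><<<N, C, 0, stream>>>(output, input);
  else if (C == 352)
    FusedSEKernel<352><<<N, C, 0, stream>>>(output, input);
  else
    return false;  // unsupported filter count; the caller falls back to the unfused path
  return true;
}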

* Update fp16_kernels.cu

* Simpler kernel for res-block fusion without SE

 - use a constant block size of 64, also splitting the channel dimension into multiple blocks as needed (see the launch sketch below).
 - this allows arbitrarily large filter counts without running out of register file.
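
A minimal sketch of that launch shape (the constant name kOpInpTransformBlockSize matches the dispatch code in the diff below; the kernel body here is a placeholder):

#include <cuda_fp16.h>
#include <cuda_runtime.h>

constexpr int kOpInpTransformBlockSize = 64;

inline int DivUp(int a, int b) { return (a + b - 1) / b; }

__global__ void ResBlockFuseNoSeKernel(int N, int C, half* output, const half* input) {
  const int c = blockIdx.x * kOpInpTransformBlockSize + threadIdx.x;  // channel index
  const int n = blockIdx.y;                                           // batch index
  if (c >= C || n >= N) return;
  // ... each thread would process one full 8x8 board for (n, c) here ...
}

void Launch(int N, int C, half* output, const half* input, cudaStream_t stream) {
  // Fixed 64-thread blocks; channels split across blockIdx.x, batch across blockIdx.y,
  // so register usage no longer grows with the filter count.
  dim3 grid_dim(DivUp(C, kOpInpTransformBlockSize), N, 1);
  ResBlockFuseNoSeKernel<<<grid_dim, kOpInpTransformBlockSize, 0, stream>>>(N, C, output,
                                                                            input);
}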

* minor refactoring

 - allow using the res-block fusing optimization for alternate layers (the ones without SE) even on GPUs that don't have enough shared memory (a condensed dispatch sketch follows).
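
Condensed into a hedged sketch (ChoosePath and the enum are invented for illustration; the constants are the ones defined in cuda_common.h in the diff below, with the include path assumed and namespace qualification omitted):

#include "neural/cuda/cuda_common.h"  // assumed location of the kMaxResBlockFusing* constants

enum class ResBlockPath {
  kFusedNoSe,          // new fixed-64-thread-block kernel; no shared-memory requirement
  kFusedSe,            // original fully fused SE kernel; small filter counts only
  kFusedSeAmpereSmem,  // fp16 shared-memory kernel; needs 72 KiB per block (Ampere)
  kUnfusedFallback     // separate OutputTransform + InputTransform
};

ResBlockPath ChoosePath(bool has_se, bool fp16, int C, int shared_mem_size) {
  if (!has_se) return ResBlockPath::kFusedNoSe;
  if (C <= kMaxResBlockFusingChannels) return ResBlockPath::kFusedSe;
  if (fp16 && C <= kMaxResBlockFusingSeKFp16Ampere &&
      static_cast<size_t>(shared_mem_size) >= kMaxResBlockFusingSeFp16AmpereSmem)
    return ResBlockPath::kFusedSeAmpereSmem;
  return ResBlockPath::kUnfusedFallback;
}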

* minor functional fix

* a few more fixes to get correct output

hopefully functionally correct now.

* fix cudnn backend build

 - missed the fact that it also uses Res block fusion :-/

* fix build errors

* some more fixes

* minor cleanup

* remove --use_fast_math

- as it doesn't improve performance.
- some minor cleanup

* fix indentation
ankan-ban authored Mar 3, 2022
1 parent ca01eee commit 025105e
Showing 8 changed files with 333 additions and 126 deletions.
21 changes: 19 additions & 2 deletions src/neural/cuda/common_kernels.cu
@@ -556,16 +556,22 @@ void OutputInputTransform(int N, int C, int se_K, T* output, const T* input,
const T* b1, const T* w2, const T* b2,
cudaStream_t stream) {
// Each thread processes entire chess board
if (C > kMaxResBlockFusingChannels) {
if (use_se == false) {
dim3 grid_dim(DivUp(C, kOpInpTransformBlockSize), N, 1);
OutputTransform_relu_InputTransform_kernel<float, activation, use_bias, use_skip>
<<<grid_dim, kOpInpTransformBlockSize, 0, stream>>>(N, C, output, input,
(float*)skip, bias);
} else if (C > kMaxResBlockFusingChannels) {
throw Exception(
"res block fusing opt not supported for the given data type and no "
"of filters\n");
} else {
OutputTransform_SE_relu_InputTransform_kernel<float, use_se, activation,
OutputTransform_SE_relu_InputTransform_kernel<float, activation,
use_bias, use_skip>
<<<N, C, 0, stream>>>(N, C, se_K, output, input, (float*)skip, bias, w1,
b1, w2, b2);
}

ReportCUDAErrors(cudaGetLastError());
}

@@ -843,6 +849,7 @@ template void OutputTransform<float, true, RELU, true, true, false, false>(
const float* w2, const float* b2, cudaStream_t stream);

template void OutputTransform<float, false, RELU, true, true, false, false>(

int N, int C, int se_K, float* output, const float* input,
const float* skip, const float* bias, const float* w1, const float* b1,
const float* w2, const float* b2, cudaStream_t stream);
@@ -867,6 +874,11 @@ template void OutputTransform<float, false, RELU, true, false, false, true>(
const float* skip, const float* bias, const float* w1, const float* b1,
const float* w2, const float* b2, cudaStream_t stream);

template void OutputTransform<float, true, RELU, true, true, true, true>(
int N, int C, int se_K, float* output, const float* input,
const float* skip, const float* bias, const float* w1, const float* b1,
const float* w2, const float* b2, cudaStream_t stream);

template void OutputTransform<float, true, MISH, true, true, false, false>(
int N, int C, int se_K, float* output, const float* input,
const float* skip, const float* bias, const float* w1, const float* b1,
Expand Down Expand Up @@ -897,6 +909,11 @@ template void OutputTransform<float, false, MISH, true, false, false, true>(
const float* skip, const float* bias, const float* w1, const float* b1,
const float* w2, const float* b2, cudaStream_t stream);

template void OutputTransform<float, true, MISH, true, true, true, true>(
int N, int C, int se_K, float* output, const float* input,
const float* skip, const float* bias, const float* w1, const float* b1,
const float* w2, const float* b2, cudaStream_t stream);

template void OutputTransform<float, false, NONE, true, false, false, false>(
int N, int C, int se_K, float* output, const float* input,
const float* skip, const float* bias, const float* w1, const float* b1,
4 changes: 4 additions & 0 deletions src/neural/cuda/cuda_common.h
@@ -55,6 +55,10 @@ static constexpr int kMaxResBlockFusingSeKFp16Ampere =
512; // (use a different kernel with reduced register pressure)
static constexpr int kMaxResBlockFusingSeK =
128; // limit on (num_filters / se_ratio)
static constexpr int kMaxResBlockFusingSeFp16AmpereSmem =
72 * kMaxResBlockFusingSeKFp16Ampere *
sizeof(half); // shared memory used by the special
// kernel

#ifdef USE_CUDNN
void CudnnError(cudnnStatus_t status, const char* file, const int& line);
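
A quick arithmetic check on the new kMaxResBlockFusingSeFp16AmpereSmem constant above (a note, not part of the commit): with kMaxResBlockFusingSeKFp16Ampere == 512 and sizeof(half) == 2, it comes to 72 * 512 * 2 = 73728 bytes, matching the 72 * 1024 figure used in the fp16 kernel launch code later in this commit. The 72 elements per channel are presumably an 8x8 board padded to rows of 9 to avoid shared-memory bank conflicts (an assumption, not stated in the diff).

#include <cuda_fp16.h>

static_assert(72 * 512 * sizeof(half) == 72 * 1024,
              "shared-memory budget of the fused fp16 SE kernel is 72 KiB per block");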
173 changes: 91 additions & 82 deletions src/neural/cuda/fp16_kernels.cu
@@ -207,7 +207,7 @@ bool Se_Fp16_NHWC(int N, int C, int numFc1Out, half* output, const half* skip,
// 'C' threads per block
// 'N' blocks
// Every thread generates an entire board/plane (8x8 elements).
template <bool use_se, ActivationFunction activation, bool use_bias,
template <ActivationFunction activation, bool use_bias,
bool use_skip>
__global__ __launch_bounds__(kMaxResBlockFusingSeKFp16Ampere,1)
void OutputInputTransformKernel_fp16_shmem_board(
@@ -248,105 +248,97 @@ void OutputInputTransformKernel_fp16_shmem_board(
float S = 0;
float B = 0;

if (use_bias || use_se) {
#pragma unroll
for (int y = 0; y < 8; y++) {
half boardRow[8];
copyAs<uint4>(&boardRow, &BOARD(y, 0));
for (int y = 0; y < 8; y++) {
half boardRow[8];
copyAs<uint4>(&boardRow, &BOARD(y, 0));
#pragma unroll
for (int x = 0; x < 8; x++) {
if (use_bias) boardRow[x] += b;
if (use_se) S += (float)boardRow[x];
}
if (use_bias) copyAs<uint4>(&BOARD(y, 0), &boardRow);
for (int x = 0; x < 8; x++) {
if (use_bias) boardRow[x] += b;
S += (float)boardRow[x];
}
if (use_bias) copyAs<uint4>(&BOARD(y, 0), &boardRow);
}

if (use_se) {
__shared__ float shared_data[kMaxResBlockFusingSeKFp16Ampere];
float avg = S / 64;
shared_data[k] = avg;
__shared__ float shared_data[kMaxResBlockFusingSeKFp16Ampere];
float avg = S / 64;
shared_data[k] = avg;

int lane = k & 0x1F;
int warp = k >> 5;
__syncthreads();
int lane = k & 0x1F;
int warp = k >> 5;
__syncthreads();

// First fully-connected layer for SE
// First fully-connected layer for SE

// As se_K << C, we want to loop over se_K instead of C
// even if it means taking the sum across threads
// As se_K << C, we want to loop over se_K instead of C
// even if it means taking the sum across threads

__shared__ float shared_sums[kMaxResBlockFusingSeKFp16Ampere / 32]
[kMaxResBlockFusingSeK]; // per-warp sums
__shared__ float shared_sums[kMaxResBlockFusingSeKFp16Ampere / 32]
[kMaxResBlockFusingSeK]; // per-warp sums

for (int i = 0; i < se_K; i++) {
float val = shared_data[k] * float(readw1(k, i));
val = warpReduce(val);
if (lane == 0) shared_sums[warp][i] = val;
}
__syncthreads();
if (k < se_K) {
S = 0;
for (int i = 0; i < C / 32; i++) S += shared_sums[i][k];

S += (float)b1[k];
S = activate(S, activation);
shared_data[k] = S;
}
for (int i = 0; i < se_K; i++) {
float val = shared_data[k] * float(readw1(k, i));
val = warpReduce(val);
if (lane == 0) shared_sums[warp][i] = val;
}
__syncthreads();
if (k < se_K) {
S = 0;
for (int i = 0; i < C / 32; i++) S += shared_sums[i][k];

__syncthreads();
S += (float)b1[k];
S = activate(S, activation);
shared_data[k] = S;
}

// Second fully-connected layer for SE
S = 0;
for (int i = 0; i < se_K; i++) {
float val = shared_data[i];
S += val * float(readw2(i, k));
B += val * float(readw2(i, k + C));
}
S += (float)b2[k];
B += (float)b2[k + C];
__syncthreads();

// Sigmoid (only on the scale part).
S = 1.0f / (1.0f + exp(-S));
// Second fully-connected layer for SE
S = 0;
for (int i = 0; i < se_K; i++) {
float val = shared_data[i];
S += val * float(readw2(i, k));
B += val * float(readw2(i, k + C));
}
S += (float)b2[k];
B += (float)b2[k + C];

// Scale/bias, add skip connection, perform relu, and write to output.
if (use_se || use_skip || activation != NONE) {
for (int h = 0; h < 8; h++) {
half boardRow[8];
copyAs<uint4>(&boardRow[0], &BOARD(h, 0));
// Sigmoid (only on the scale part).
S = 1.0f / (1.0f + exp(-S));

// Scale/bias, add skip connection, perform activation, and write to output.
for (int h = 0; h < 8; h++) {
half boardRow[8];
copyAs<uint4>(&boardRow[0], &BOARD(h, 0));

if (use_se) {
#pragma unroll
for (int w = 0; w < 8; w++) {
boardRow[w] = (half)(float(boardRow[w]) * S + B);
}
}

// residual add
if (use_skip) {
half skipInp[8];
copyAs<uint4>(&skipInp[0], &skip[INDEX_NHCW(n, k, h, 0)]);
#pragma unroll
for (int w = 0; w < 8; w++) boardRow[w] += skipInp[w];
}
for (int w = 0; w < 8; w++) {
boardRow[w] = (half)(float(boardRow[w]) * S + B);
}

// relu
if (activation != NONE) {
// residual add
if (use_skip) {
half skipInp[8];
copyAs<uint4>(&skipInp[0], &skip[INDEX_NHCW(n, k, h, 0)]);
#pragma unroll
for (int w = 0; w < 8; w++)
boardRow[w] = (half)activate((float)boardRow[w], activation);
}
for (int w = 0; w < 8; w++) boardRow[w] += skipInp[w];
}

// write un-transformed output to 'skip' if required
if (use_skip) {
copyAs<uint4>(&skip[INDEX_NHCW(n, k, h, 0)], &boardRow[0]);
}
if (activation != NONE) {
#pragma unroll
for (int w = 0; w < 8; w++)
boardRow[w] = (half)activate((float)boardRow[w], activation);
}

copyAs<uint4>(&BOARD(h, 0), &boardRow);
// write un-transformed output to 'skip' if required
if (use_skip) {
copyAs<uint4>(&skip[INDEX_NHCW(n, k, h, 0)], &boardRow[0]);
}

copyAs<uint4>(&BOARD(h, 0), &boardRow);
}


// Perform input transform.

int c = k;
@@ -434,17 +426,24 @@ void OutputInputTransform(int N, int C, int se_K, T* output, const T* input,
const T* b1, const T* w2, const T* b2,
cudaStream_t stream) {
// Each thread processes entire chess board.
if (C > kMaxResBlockFusingChannels) {
if (use_se == false) {
dim3 grid_dim(DivUp(C, kOpInpTransformBlockSize), N, 1);
OutputTransform_relu_InputTransform_kernel<half, activation, use_bias,
use_skip>
<<<grid_dim, kOpInpTransformBlockSize, 0, stream>>>(N, C, output, input,
(half*)skip, bias);
} else if (C > kMaxResBlockFusingChannels) {
// Use special kernel with reduced register pressure - only works on Ampere,
// and only for fp16.
if (C <= kMaxResBlockFusingSeKFp16Ampere) {
cudaFuncSetAttribute(
OutputInputTransformKernel_fp16_shmem_board<use_se, activation,
OutputInputTransformKernel_fp16_shmem_board<activation,
use_bias, use_skip>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 72 * 1024);
OutputInputTransformKernel_fp16_shmem_board<use_se, activation, use_bias,
cudaFuncAttributeMaxDynamicSharedMemorySize,
72 * C * sizeof(half));
OutputInputTransformKernel_fp16_shmem_board<activation, use_bias,
use_skip>
<<<N, C, 72 * 1024, stream>>>(
<<<N, C, kMaxResBlockFusingSeFp16AmpereSmem, stream>>>(
N, C, se_K, (half*)output, (const half*)input, (half*)skip,
(half*)bias, (half*)w1, (half*)b1, (half*)w2, (half*)b2);
} else {
@@ -453,7 +452,7 @@ void OutputInputTransform(int N, int C, int se_K, T* output, const T* input,
"of filters\n");
}
} else {
OutputTransform_SE_relu_InputTransform_kernel<half, use_se, activation,
OutputTransform_SE_relu_InputTransform_kernel<half, activation,
use_bias, use_skip>
<<<N, C, 0, stream>>>(N, C, se_K, output, input, (half*)skip, bias, w1,
b1, w2, b2);
@@ -501,6 +500,11 @@ template void OutputTransform<half, false, RELU, true, false, false, true>(
const half* bias, const half* w1, const half* b1, const half* w2,
const half* b2, cudaStream_t stream);

template void OutputTransform<half, true, RELU, true, true, true, true>(
int N, int C, int se_K, half* output, const half* input, const half* skip,
const half* bias, const half* w1, const half* b1, const half* w2,
const half* b2, cudaStream_t stream);

template void OutputTransform<half, true, MISH, true, true, false, false>(
int N, int C, int se_K, half* output, const half* input, const half* skip,
const half* bias, const half* w1, const half* b1, const half* w2,
@@ -531,6 +535,11 @@ template void OutputTransform<half, false, MISH, true, false, false, true>(
const half* bias, const half* w1, const half* b1, const half* w2,
const half* b2, cudaStream_t stream);

template void OutputTransform<half, true, MISH, true, true, true, true>(
int N, int C, int se_K, half* output, const half* input, const half* skip,
const half* bias, const half* w1, const half* b1, const half* w2,
const half* b2, cudaStream_t stream);

template void OutputTransform<half, false, NONE, true, false, false, false>(
int N, int C, int se_K, half* output, const half* input, const half* skip,
const half* bias, const half* w1, const half* b1, const half* w2,
46 changes: 35 additions & 11 deletions src/neural/cuda/layers.cc
@@ -1055,13 +1055,15 @@ Conv1Layer<DataType>::~Conv1Layer() {
template <typename DataType>
ResidualBlock<DataType>::ResidualBlock(BaseLayer<DataType>* ip, int C, bool se,
int se_k, bool use_gemm_ex, bool first,
bool last, ActivationFunction activation)

bool last, ActivationFunction activation, int shared_mem_size)
: BaseLayer<DataType>(C, 8, 8, ip, ip->isNHWC(), use_gemm_ex),
has_se_(se),
se_k_(se_k),
c_input_(C),
first_block_(first),
last_block_(last),
shared_mem_size_(shared_mem_size),
act_(activation) {
if (act_ != RELU && act_ != MISH) {
throw Exception("Unsupported activation for residual block.");
@@ -1229,6 +1231,12 @@ void ResidualBlock<DataType>::Eval(int N, DataType* output,
transformed_input, transformed_weights1_, transformed_output, N * 4, C, C,
36, cublas);

const bool fp16 = std::is_same<half, DataType>::value;
bool allowFusing =
(C <= kMaxResBlockFusingChannels) ||
(fp16 && (shared_mem_size_ >= kMaxResBlockFusingSeFp16AmpereSmem) &&
(C <= kMaxResBlockFusingSeKFp16Ampere));

if (act_ == RELU) {
if (last_block_) {
if (has_se_)
@@ -1240,11 +1248,19 @@
N, C, se_k_, output, transformed_output, input, biases1_, w1_, b1_,
w2_, b2_, stream);
} else {
if (has_se_)
OutputInputTransform<DataType, true, RELU, true, true>(
N, C, se_k_, output, transformed_output, input, biases1_, w1_, b1_,
w2_, b2_, stream);
else
if (has_se_) {
if (allowFusing) {
OutputInputTransform<DataType, true, RELU, true, true>(
N, C, se_k_, output, transformed_output, input, biases1_, w1_,
b1_, w2_, b2_, stream);
} else {
OutputTransform<DataType, true, RELU, true, true, true, true>(
N, C, se_k_, (DataType*)input, transformed_output, input,
biases1_, w1_, b1_, w2_, b2_, stream);
InputTransform<DataType, true>(N, C, output, (DataType*)input,
stream);
}
} else
OutputInputTransform<DataType, false, RELU, true, true>(
N, C, se_k_, output, transformed_output, input, biases1_, w1_, b1_,
w2_, b2_, stream);
@@ -1260,11 +1276,19 @@
N, C, se_k_, output, transformed_output, input, biases1_, w1_, b1_,
w2_, b2_, stream);
} else {
if (has_se_)
OutputInputTransform<DataType, true, MISH, true, true>(
N, C, se_k_, output, transformed_output, input, biases1_, w1_, b1_,
w2_, b2_, stream);
else
if (has_se_) {
if (allowFusing) {
OutputInputTransform<DataType, true, MISH, true, true>(
N, C, se_k_, output, transformed_output, input, biases1_, w1_,
b1_, w2_, b2_, stream);
} else {
OutputTransform<DataType, true, MISH, true, true, true, true>(
N, C, se_k_, (DataType*)input, transformed_output, input,
biases1_, w1_, b1_, w2_, b2_, stream);
InputTransform<DataType, true>(N, C, output, (DataType*)input,
stream);
}
} else
OutputInputTransform<DataType, false, MISH, true, true>(
N, C, se_k_, output, transformed_output, input, biases1_, w1_, b1_,
w2_, b2_, stream);