Optimized Res-block Fusion without SE (#1678)
* misc changes to cudnn backend

- replace all cudaMemcpyAsync calls used for loading weights with cudaMemcpy, as the source (in CPU memory) could be deleted before the async version of the function actually performs the copy (see the sketch after this list).
- minor naming/style changes.
- add a comment explaining what the policy map layer does and how the layout conversion from CHW to HWC works (also illustrated after this list).
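
A minimal sketch of the weight-upload point (illustrative only; LoadWeights, weights_gpu, cpu_weights and num_weights are placeholder names, not identifiers from this commit). A synchronous cudaMemcpy has finished reading the host buffer by the time it returns, so the caller may free that buffer immediately, whereas cudaMemcpyAsync from pageable host memory may still be reading it afterwards:

#include <cuda_runtime.h>
#include <cstddef>

void LoadWeights(float* weights_gpu, const float* cpu_weights, size_t num_weights) {
  const size_t bytes = num_weights * sizeof(float);
  // Pattern removed by the commit (unsafe if cpu_weights is freed right after the call):
  //   cudaMemcpyAsync(weights_gpu, cpu_weights, bytes, cudaMemcpyHostToDevice, stream);
  // Pattern used instead: returns only after the host buffer has been consumed.
  cudaMemcpy(weights_gpu, cpu_weights, bytes, cudaMemcpyHostToDevice);
}

And a toy index mapping for the CHW -> HWC conversion mentioned for the policy map layer, assuming the usual 8x8 board (again an illustration, not the comment added by the commit):

// Element (c, h, w) sits at offset c*64 + h*8 + w in CHW layout and at
// offset (h*8 + w)*C + c in HWC layout (the channel becomes the fastest-varying index).
inline int ChwToHwcIndex(int c, int h, int w, int C) {
  const int square = h * 8 + w;  // board square in row-major order
  return square * C + c;
}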

* fix typo in comment

* clang-format

* address review comment

* Add 320 and 352 channel support for fused SE layer

- just add template instantiations (a sketch of the idea follows this list).
- verified that it works and provides a (very) slight speedup.
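
The idea as a hedged sketch (FusedSEKernel and LaunchFusedSE are made-up stand-ins for the real channel-templated SE kernel and its dispatcher; the point is only that each supported filter count is a separate compile-time instantiation):

#include <cuda_fp16.h>
#include <cuda_runtime.h>

template <int C>
__global__ void FusedSEKernel(half* output, const half* input) {
  // stand-in body; the real kernel performs the fused SE + output transform for C channels
}

bool LaunchFusedSE(int N, int C, half* output, const half* input, cudaStream_t stream) {
  if (C == 320)
    FusedSEKernel<320><<<N, C, 0, stream>>>(output, input);
  else if (C == 352)
    FusedSEKernel<352><<<N, C, 0, stream>>>(output, input);
  else
    return false;  // unsupported filter count; the caller falls back to the unfused path
  return true;
}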

* Update fp16_kernels.cu

* Simpler kernel for res-block fusion without SE

 - use a constant block size of 64, also splitting the channel dimension into multiple blocks as needed (see the launch sketch below).
 - this allows arbitrarily large filter counts without running out of register file.
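
A minimal sketch of that launch shape (the constant name kOpInpTransformBlockSize matches the dispatch code in the diff below; the kernel body here is a placeholder):

#include <cuda_fp16.h>
#include <cuda_runtime.h>

constexpr int kOpInpTransformBlockSize = 64;

inline int DivUp(int a, int b) { return (a + b - 1) / b; }

__global__ void ResBlockFuseNoSeKernel(int N, int C, half* output, const half* input) {
  const int c = blockIdx.x * kOpInpTransformBlockSize + threadIdx.x;  // channel index
  const int n = blockIdx.y;                                           // batch index
  if (c >= C || n >= N) return;
  // ... each thread would process one full 8x8 board for (n, c) here ...
}

void Launch(int N, int C, half* output, const half* input, cudaStream_t stream) {
  // Fixed 64-thread blocks; channels split across blockIdx.x, batch across blockIdx.y,
  // so register usage no longer grows with the filter count.
  dim3 grid_dim(DivUp(C, kOpInpTransformBlockSize), N, 1);
  ResBlockFuseNoSeKernel<<<grid_dim, kOpInpTransformBlockSize, 0, stream>>>(N, C, output,
                                                                            input);
}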

* minor refactoring

 - allow using the res-block fusing optimization for alternate layers (the ones without SE) even on GPUs that don't have enough shared memory (a condensed dispatch sketch follows).
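
Condensed into a hedged sketch (ChoosePath and the enum are invented for illustration; the constants are the ones defined in cuda_common.h in the diff below, with the include path assumed and namespace qualification omitted):

#include "neural/cuda/cuda_common.h"  // assumed location of the kMaxResBlockFusing* constants

enum class ResBlockPath {
  kFusedNoSe,          // new fixed-64-thread-block kernel; no shared-memory requirement
  kFusedSe,            // original fully fused SE kernel; small filter counts only
  kFusedSeAmpereSmem,  // fp16 shared-memory kernel; needs 72 KiB per block (Ampere)
  kUnfusedFallback     // separate OutputTransform + InputTransform
};

ResBlockPath ChoosePath(bool has_se, bool fp16, int C, int shared_mem_size) {
  if (!has_se) return ResBlockPath::kFusedNoSe;
  if (C <= kMaxResBlockFusingChannels) return ResBlockPath::kFusedSe;
  if (fp16 && C <= kMaxResBlockFusingSeKFp16Ampere &&
      static_cast<size_t>(shared_mem_size) >= kMaxResBlockFusingSeFp16AmpereSmem)
    return ResBlockPath::kFusedSeAmpereSmem;
  return ResBlockPath::kUnfusedFallback;
}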

* minor functional fix

* a few more fixes to get correct output

hopefully functionally correct now.

* fix cudnn backend build

 - missed the fact that it also uses Res block fusion :-/

* fix build errors

* some more fixes

* minor cleanup

* remove --use_fast_math

- as it doesn't improve performance.
- some minor cleanup

* fix indentation
ankan-ban authored Mar 3, 2022
1 parent ca01eee commit 025105e
Showing 8 changed files with 333 additions and 126 deletions.
21 changes: 19 additions & 2 deletions src/neural/cuda/common_kernels.cu
@@ -556,16 +556,22 @@ void OutputInputTransform(int N, int C, int se_K, T* output, const T* input,
const T* b1, const T* w2, const T* b2,
cudaStream_t stream) {
// Each thread processes entire chess board
if (C > kMaxResBlockFusingChannels) {
if (use_se == false) {
dim3 grid_dim(DivUp(C, kOpInpTransformBlockSize), N, 1);
OutputTransform_relu_InputTransform_kernel<float, activation, use_bias, use_skip>
<<<grid_dim, kOpInpTransformBlockSize, 0, stream>>>(N, C, output, input,
(float*)skip, bias);
} else if (C > kMaxResBlockFusingChannels) {
throw Exception(
"res block fusing opt not supported for the given data type and no "
"of filters\n");
} else {
OutputTransform_SE_relu_InputTransform_kernel<float, use_se, activation,
OutputTransform_SE_relu_InputTransform_kernel<float, activation,
use_bias, use_skip>
<<<N, C, 0, stream>>>(N, C, se_K, output, input, (float*)skip, bias, w1,
b1, w2, b2);
}

ReportCUDAErrors(cudaGetLastError());
}

@@ -843,6 +849,7 @@ template void OutputTransform<float, true, RELU, true, true, false, false>(
const float* w2, const float* b2, cudaStream_t stream);

template void OutputTransform<float, false, RELU, true, true, false, false>(

int N, int C, int se_K, float* output, const float* input,
const float* skip, const float* bias, const float* w1, const float* b1,
const float* w2, const float* b2, cudaStream_t stream);
@@ -867,6 +874,11 @@ template void OutputTransform<float, false, RELU, true, false, false, true>(
const float* skip, const float* bias, const float* w1, const float* b1,
const float* w2, const float* b2, cudaStream_t stream);

template void OutputTransform<float, true, RELU, true, true, true, true>(
int N, int C, int se_K, float* output, const float* input,
const float* skip, const float* bias, const float* w1, const float* b1,
const float* w2, const float* b2, cudaStream_t stream);

template void OutputTransform<float, true, MISH, true, true, false, false>(
int N, int C, int se_K, float* output, const float* input,
const float* skip, const float* bias, const float* w1, const float* b1,
Expand Down Expand Up @@ -897,6 +909,11 @@ template void OutputTransform<float, false, MISH, true, false, false, true>(
const float* skip, const float* bias, const float* w1, const float* b1,
const float* w2, const float* b2, cudaStream_t stream);

template void OutputTransform<float, true, MISH, true, true, true, true>(
int N, int C, int se_K, float* output, const float* input,
const float* skip, const float* bias, const float* w1, const float* b1,
const float* w2, const float* b2, cudaStream_t stream);

template void OutputTransform<float, false, NONE, true, false, false, false>(
int N, int C, int se_K, float* output, const float* input,
const float* skip, const float* bias, const float* w1, const float* b1,
4 changes: 4 additions & 0 deletions src/neural/cuda/cuda_common.h
@@ -55,6 +55,10 @@ static constexpr int kMaxResBlockFusingSeKFp16Ampere =
512; // (use a different kernel with reduced register pressure)
static constexpr int kMaxResBlockFusingSeK =
128; // limit on (num_filters / se_ratio)
static constexpr int kMaxResBlockFusingSeFp16AmpereSmem =
72 * kMaxResBlockFusingSeKFp16Ampere *
sizeof(half); // shared memory used by the special
// kernel

#ifdef USE_CUDNN
void CudnnError(cudnnStatus_t status, const char* file, const int& line);
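
A quick arithmetic check on the new kMaxResBlockFusingSeFp16AmpereSmem constant above (a note, not part of the commit): with kMaxResBlockFusingSeKFp16Ampere == 512 and sizeof(half) == 2, it comes to 72 * 512 * 2 = 73728 bytes, matching the 72 * 1024 figure used in the fp16 kernel launch code later in this commit. The 72 elements per channel are presumably an 8x8 board padded to rows of 9 to avoid shared-memory bank conflicts (an assumption, not stated in the diff).

#include <cuda_fp16.h>

static_assert(72 * 512 * sizeof(half) == 72 * 1024,
              "shared-memory budget of the fused fp16 SE kernel is 72 KiB per block");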
173 changes: 91 additions & 82 deletions src/neural/cuda/fp16_kernels.cu
@@ -207,7 +207,7 @@ bool Se_Fp16_NHWC(int N, int C, int numFc1Out, half* output, const half* skip,
// 'C' threads per block
// 'N' blocks
// Every thread generates an entire board/plane (8x8 elements).
template <bool use_se, ActivationFunction activation, bool use_bias,
template <ActivationFunction activation, bool use_bias,
bool use_skip>
__global__ __launch_bounds__(kMaxResBlockFusingSeKFp16Ampere,1)
void OutputInputTransformKernel_fp16_shmem_board(
@@ -248,105 +248,97 @@ void OutputInputTransformKernel_fp16_shmem_board(
float S = 0;
float B = 0;

if (use_bias || use_se) {
#pragma unroll
for (int y = 0; y < 8; y++) {
half boardRow[8];
copyAs<uint4>(&boardRow, &BOARD(y, 0));
for (int y = 0; y < 8; y++) {
half boardRow[8];
copyAs<uint4>(&boardRow, &BOARD(y, 0));
#pragma unroll
for (int x = 0; x < 8; x++) {
if (use_bias) boardRow[x] += b;
if (use_se) S += (float)boardRow[x];
}
if (use_bias) copyAs<uint4>(&BOARD(y, 0), &boardRow);
for (int x = 0; x < 8; x++) {
if (use_bias) boardRow[x] += b;
S += (float)boardRow[x];
}
if (use_bias) copyAs<uint4>(&BOARD(y, 0), &boardRow);
}

if (use_se) {
__shared__ float shared_data[kMaxResBlockFusingSeKFp16Ampere];
float avg = S / 64;
shared_data[k] = avg;
__shared__ float shared_data[kMaxResBlockFusingSeKFp16Ampere];
float avg = S / 64;
shared_data[k] = avg;

int lane = k & 0x1F;
int warp = k >> 5;
__syncthreads();
int lane = k & 0x1F;
int warp = k >> 5;
__syncthreads();

// First fully-connected layer for SE
// First fully-connected layer for SE

// As se_K << C, we want to loop over se_K instead of C
// even if it means taking the sum across threads
// As se_K << C, we want to loop over se_K instead of C
// even if it means taking the sum across threads

__shared__ float shared_sums[kMaxResBlockFusingSeKFp16Ampere / 32]
[kMaxResBlockFusingSeK]; // per-warp sums
__shared__ float shared_sums[kMaxResBlockFusingSeKFp16Ampere / 32]
[kMaxResBlockFusingSeK]; // per-warp sums

for (int i = 0; i < se_K; i++) {
float val = shared_data[k] * float(readw1(k, i));
val = warpReduce(val);
if (lane == 0) shared_sums[warp][i] = val;
}
__syncthreads();
if (k < se_K) {
S = 0;
for (int i = 0; i < C / 32; i++) S += shared_sums[i][k];

S += (float)b1[k];
S = activate(S, activation);
shared_data[k] = S;
}
for (int i = 0; i < se_K; i++) {
float val = shared_data[k] * float(readw1(k, i));
val = warpReduce(val);
if (lane == 0) shared_sums[warp][i] = val;
}
__syncthreads();
if (k < se_K) {
S = 0;
for (int i = 0; i < C / 32; i++) S += shared_sums[i][k];

__syncthreads();
S += (float)b1[k];
S = activate(S, activation);
shared_data[k] = S;
}

// Second fully-connected layer for SE
S = 0;
for (int i = 0; i < se_K; i++) {
float val = shared_data[i];
S += val * float(readw2(i, k));
B += val * float(readw2(i, k + C));
}
S += (float)b2[k];
B += (float)b2[k + C];
__syncthreads();

// Sigmoid (only on the scale part).
S = 1.0f / (1.0f + exp(-S));
// Second fully-connected layer for SE
S = 0;
for (int i = 0; i < se_K; i++) {
float val = shared_data[i];
S += val * float(readw2(i, k));
B += val * float(readw2(i, k + C));
}
S += (float)b2[k];
B += (float)b2[k + C];

// Scale/bias, add skip connection, perform relu, and write to output.
if (use_se || use_skip || activation != NONE) {
for (int h = 0; h < 8; h++) {
half boardRow[8];
copyAs<uint4>(&boardRow[0], &BOARD(h, 0));
// Sigmoid (only on the scale part).
S = 1.0f / (1.0f + exp(-S));

// Scale/bias, add skip connection, perform activation, and write to output.
for (int h = 0; h < 8; h++) {
half boardRow[8];
copyAs<uint4>(&boardRow[0], &BOARD(h, 0));

if (use_se) {
#pragma unroll
for (int w = 0; w < 8; w++) {
boardRow[w] = (half)(float(boardRow[w]) * S + B);
}
}

// residual add
if (use_skip) {
half skipInp[8];
copyAs<uint4>(&skipInp[0], &skip[INDEX_NHCW(n, k, h, 0)]);
#pragma unroll
for (int w = 0; w < 8; w++) boardRow[w] += skipInp[w];
}
for (int w = 0; w < 8; w++) {
boardRow[w] = (half)(float(boardRow[w]) * S + B);
}

// relu
if (activation != NONE) {
// residual add
if (use_skip) {
half skipInp[8];
copyAs<uint4>(&skipInp[0], &skip[INDEX_NHCW(n, k, h, 0)]);
#pragma unroll
for (int w = 0; w < 8; w++)
boardRow[w] = (half)activate((float)boardRow[w], activation);
}
for (int w = 0; w < 8; w++) boardRow[w] += skipInp[w];
}

// write un-transformed output to 'skip' if required
if (use_skip) {
copyAs<uint4>(&skip[INDEX_NHCW(n, k, h, 0)], &boardRow[0]);
}
if (activation != NONE) {
#pragma unroll
for (int w = 0; w < 8; w++)
boardRow[w] = (half)activate((float)boardRow[w], activation);
}

copyAs<uint4>(&BOARD(h, 0), &boardRow);
// write un-transformed output to 'skip' if required
if (use_skip) {
copyAs<uint4>(&skip[INDEX_NHCW(n, k, h, 0)], &boardRow[0]);
}

copyAs<uint4>(&BOARD(h, 0), &boardRow);
}


// Perform input transform.

int c = k;
@@ -434,17 +426,24 @@ void OutputInputTransform(int N, int C, int se_K, T* output, const T* input,
const T* b1, const T* w2, const T* b2,
cudaStream_t stream) {
// Each thread processes entire chess board.
if (C > kMaxResBlockFusingChannels) {
if (use_se == false) {
dim3 grid_dim(DivUp(C, kOpInpTransformBlockSize), N, 1);
OutputTransform_relu_InputTransform_kernel<half, activation, use_bias,
use_skip>
<<<grid_dim, kOpInpTransformBlockSize, 0, stream>>>(N, C, output, input,
(half*)skip, bias);
} else if (C > kMaxResBlockFusingChannels) {
// Use special kernel with reduced register pressure - only works on Ampere,
// and only for fp16.
if (C <= kMaxResBlockFusingSeKFp16Ampere) {
cudaFuncSetAttribute(
OutputInputTransformKernel_fp16_shmem_board<use_se, activation,
OutputInputTransformKernel_fp16_shmem_board<activation,
use_bias, use_skip>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 72 * 1024);
OutputInputTransformKernel_fp16_shmem_board<use_se, activation, use_bias,
cudaFuncAttributeMaxDynamicSharedMemorySize,
72 * C * sizeof(half));
OutputInputTransformKernel_fp16_shmem_board<activation, use_bias,
use_skip>
<<<N, C, 72 * 1024, stream>>>(
<<<N, C, kMaxResBlockFusingSeFp16AmpereSmem, stream>>>(
N, C, se_K, (half*)output, (const half*)input, (half*)skip,
(half*)bias, (half*)w1, (half*)b1, (half*)w2, (half*)b2);
} else {
@@ -453,7 +452,7 @@ void OutputInputTransform(int N, int C, int se_K, T* output, const T* input,
"of filters\n");
}
} else {
OutputTransform_SE_relu_InputTransform_kernel<half, use_se, activation,
OutputTransform_SE_relu_InputTransform_kernel<half, activation,
use_bias, use_skip>
<<<N, C, 0, stream>>>(N, C, se_K, output, input, (half*)skip, bias, w1,
b1, w2, b2);
@@ -501,6 +500,11 @@ template void OutputTransform<half, false, RELU, true, false, false, true>(
const half* bias, const half* w1, const half* b1, const half* w2,
const half* b2, cudaStream_t stream);

template void OutputTransform<half, true, RELU, true, true, true, true>(
int N, int C, int se_K, half* output, const half* input, const half* skip,
const half* bias, const half* w1, const half* b1, const half* w2,
const half* b2, cudaStream_t stream);

template void OutputTransform<half, true, MISH, true, true, false, false>(
int N, int C, int se_K, half* output, const half* input, const half* skip,
const half* bias, const half* w1, const half* b1, const half* w2,
@@ -531,6 +535,11 @@ template void OutputTransform<half, false, MISH, true, false, false, true>(
const half* bias, const half* w1, const half* b1, const half* w2,
const half* b2, cudaStream_t stream);

template void OutputTransform<half, true, MISH, true, true, true, true>(
int N, int C, int se_K, half* output, const half* input, const half* skip,
const half* bias, const half* w1, const half* b1, const half* w2,
const half* b2, cudaStream_t stream);

template void OutputTransform<half, false, NONE, true, false, false, false>(
int N, int C, int se_K, half* output, const half* input, const half* skip,
const half* bias, const half* w1, const half* b1, const half* w2,
46 changes: 35 additions & 11 deletions src/neural/cuda/layers.cc
@@ -1055,13 +1055,15 @@ Conv1Layer<DataType>::~Conv1Layer() {
template <typename DataType>
ResidualBlock<DataType>::ResidualBlock(BaseLayer<DataType>* ip, int C, bool se,
int se_k, bool use_gemm_ex, bool first,
bool last, ActivationFunction activation)

bool last, ActivationFunction activation, int shared_mem_size)
: BaseLayer<DataType>(C, 8, 8, ip, ip->isNHWC(), use_gemm_ex),
has_se_(se),
se_k_(se_k),
c_input_(C),
first_block_(first),
last_block_(last),
shared_mem_size_(shared_mem_size),
act_(activation) {
if (act_ != RELU && act_ != MISH) {
throw Exception("Unsupported activation for residual block.");
@@ -1229,6 +1231,12 @@ void ResidualBlock<DataType>::Eval(int N, DataType* output,
transformed_input, transformed_weights1_, transformed_output, N * 4, C, C,
36, cublas);

const bool fp16 = std::is_same<half, DataType>::value;
bool allowFusing =
(C <= kMaxResBlockFusingChannels) ||
(fp16 && (shared_mem_size_ >= kMaxResBlockFusingSeFp16AmpereSmem) &&
(C <= kMaxResBlockFusingSeKFp16Ampere));

if (act_ == RELU) {
if (last_block_) {
if (has_se_)
@@ -1240,11 +1248,19 @@
N, C, se_k_, output, transformed_output, input, biases1_, w1_, b1_,
w2_, b2_, stream);
} else {
if (has_se_)
OutputInputTransform<DataType, true, RELU, true, true>(
N, C, se_k_, output, transformed_output, input, biases1_, w1_, b1_,
w2_, b2_, stream);
else
if (has_se_) {
if (allowFusing) {
OutputInputTransform<DataType, true, RELU, true, true>(
N, C, se_k_, output, transformed_output, input, biases1_, w1_,
b1_, w2_, b2_, stream);
} else {
OutputTransform<DataType, true, RELU, true, true, true, true>(
N, C, se_k_, (DataType*)input, transformed_output, input,
biases1_, w1_, b1_, w2_, b2_, stream);
InputTransform<DataType, true>(N, C, output, (DataType*)input,
stream);
}
} else
OutputInputTransform<DataType, false, RELU, true, true>(
N, C, se_k_, output, transformed_output, input, biases1_, w1_, b1_,
w2_, b2_, stream);
@@ -1260,11 +1276,19 @@
N, C, se_k_, output, transformed_output, input, biases1_, w1_, b1_,
w2_, b2_, stream);
} else {
if (has_se_)
OutputInputTransform<DataType, true, MISH, true, true>(
N, C, se_k_, output, transformed_output, input, biases1_, w1_, b1_,
w2_, b2_, stream);
else
if (has_se_) {
if (allowFusing) {
OutputInputTransform<DataType, true, MISH, true, true>(
N, C, se_k_, output, transformed_output, input, biases1_, w1_,
b1_, w2_, b2_, stream);
} else {
OutputTransform<DataType, true, MISH, true, true, true, true>(
N, C, se_k_, (DataType*)input, transformed_output, input,
biases1_, w1_, b1_, w2_, b2_, stream);
InputTransform<DataType, true>(N, C, output, (DataType*)input,
stream);
}
} else
OutputInputTransform<DataType, false, MISH, true, true>(
N, C, se_k_, output, transformed_output, input, biases1_, w1_, b1_,
w2_, b2_, stream);