Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add a batch norm inference kernel. #3309

Merged
merged 6 commits into from
Aug 7, 2017

Conversation

qingqing01
Copy link
Contributor

Fix #929

@qingqing01
Copy link
Contributor Author

qingqing01 commented Aug 7, 2017

cudnn lib有bug,在cudnn 5.1上 n > 1024时出错,可以使用下面代码验证: (Translation: the cuDNN library has a bug — on cuDNN 5.1 it fails when n > 1024; this can be verified with the following code:)

#include <cuda.h>
#include <cudnn.h>
#include <iostream>
#include <sstream>
#include <fstream>

#include <stdio.h>

// Stringification helpers: TOSTR_ turns its argument into a string literal;
// TOSTR macro-expands the argument first, then stringifies the expansion.
#define TOSTR_(s)   #s
#define TOSTR(s)    TOSTR_(s)
// Human-readable cuDNN version string (e.g. "5.1.10") assembled from the
// version macros defined in cudnn.h.
#define CUDNN_VERSION_STR  TOSTR(CUDNN_MAJOR) "." TOSTR (CUDNN_MINOR) "." TOSTR(CUDNN_PATCHLEVEL)

// Report a fatal error: print the message together with the file:line of the
// failure, reset the CUDA device, and terminate the process.
// Wrapped in do { } while (0) so the macro expands to a single statement and
// is safe to use inside an unbraced if/else (the original bare-brace form
// produced `{...};`, which breaks `if (x) FatalError(s); else ...`).
#define FatalError(s) do {                                             \
    std::stringstream _where, _message;                                \
    _where << __FILE__ << ':' << __LINE__;                             \
    _message << std::string(s) + "\n" << __FILE__ << ':' << __LINE__;\
    std::cerr << _message.str() << "\nAborting...\n";                  \
    cudaDeviceReset();                                                 \
    exit(EXIT_FAILURE);                                                \
} while (0)

// Abort with a descriptive message if a cuDNN call did not succeed.
// The argument is evaluated exactly once into a local: the original expanded
// `status` twice, so on failure the cuDNN call itself was re-executed inside
// cudnnGetErrorString(status). do/while(0) makes the macro a single statement.
#define checkCUDNN(status) do {                                        \
    cudnnStatus_t _status = (status);                                  \
    if (_status != CUDNN_STATUS_SUCCESS) {                             \
      std::stringstream _error;                                        \
      _error << "CUDNN failure\nError: " << cudnnGetErrorString(_status); \
      FatalError(_error.str());                                        \
    }                                                                  \
} while (0)

// Abort with a descriptive message if a CUDA runtime call did not succeed.
// The argument is evaluated exactly once into a local: the original expanded
// `status` twice, so on failure the runtime call was re-executed inside
// cudaGetErrorString(status). do/while(0) makes the macro a single statement.
#define checkCUDA(status) do {                                         \
    cudaError_t _status = (status);                                    \
    if (_status != cudaSuccess) {                                      \
      std::stringstream _error;                                        \
      _error << "Cuda failure\nError: " << cudaGetErrorString(_status); \
      FatalError(_error.str());                                        \
    }                                                                  \
} while (0)



#include <sys/time.h>
#include <unistd.h>


void create(float** h_v, float** d_v, int n) {
  *h_v = (float *)malloc(n * sizeof(float));
  checkCUDA(cudaMalloc(d_v, n * sizeof(float)));
  for(int i = 0; i < n; i++)
    (*h_v)[i] = 1.0f;
  checkCUDA(cudaMemcpy(*d_v, *h_v, n * sizeof(float), cudaMemcpyHostToDevice));
}

// Standalone repro for a cuDNN 5.1 limitation: spatial batch-norm inference
// fails when the batch size n exceeds 1024. Runs
// cudnnBatchNormalizationForwardInference on an all-ones input with n = 1025
// and relies on the checkCUDNN/checkCUDA macros to abort on any failure.
int main(int argc, char *argv[]) {

  int version = (int)cudnnGetVersion();
  printf("cudnnGetVersion() : %d , CUDNN_VERSION from cudnn.h : %d (%s)\n",
      version, CUDNN_VERSION, CUDNN_VERSION_STR);
  // Fix: check the return value — the file's convention wraps every CUDA
  // runtime call, and cudaSetDevice was previously unchecked.
  checkCUDA(cudaSetDevice(0));

  /* Input dimensions: n > 1024 triggers the cuDNN 5.1 failure. */
  int n, c, h, w;
  n = 1025;
  c = 512;
  h = 1;
  w = 1;

  /* Handles */
  cudnnHandle_t cudnnHandle;
  cudnnTensorDescriptor_t ioDesc, bnDesc;

  /* Create handles and descriptors */
  checkCUDNN( cudnnCreate(&cudnnHandle));
  checkCUDNN( cudnnCreateTensorDescriptor(&ioDesc));
  checkCUDNN( cudnnCreateTensorDescriptor(&bnDesc));

  /* Constant configuration: float data, spatial (per-channel) batch norm. */
  cudnnDataType_t dataType = CUDNN_DATA_FLOAT;
  cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;

  /* Initialize input/output buffers and per-channel BN parameters
   * (all filled with 1.0f by create()). */
  float* h_input;
  float* d_input;
  float* h_scale;
  float* d_scale;
  float* h_bias;
  float* d_bias;
  float* h_estimated_mean;
  float* d_estimated_mean;
  float* h_estimated_var;
  float* d_estimated_var;

  float* h_output;
  float* d_output;

  create(&h_input, &d_input, n * c * h * w);
  create(&h_output, &d_output, n * c * h * w);
  create(&h_scale, &d_scale, c);
  create(&h_bias, &d_bias, c);
  create(&h_estimated_mean, &d_estimated_mean, c);
  create(&h_estimated_var, &d_estimated_var, c);

  /* Fully-packed NCHW strides for the explicit-stride descriptor call. */
  const int stride_w = 1;
  const int stride_h = w * stride_w;
  const int stride_c = h * stride_h;
  const int stride_n = c * stride_c;

  printf("set cudnn tensor\n");
  checkCUDNN(cudnnSetTensor4dDescriptorEx(ioDesc, dataType, n, c,
      h, w, stride_n, stride_c, stride_h, stride_w));
  /* BN parameter tensor is 1 x C x 1 x 1 for spatial mode. */
  checkCUDNN(cudnnSetTensor4dDescriptorEx(bnDesc, dataType, 1, c,
      1, 1, c, 1, 1, 1));

  /* y = alpha * BN(x) + beta * y, i.e. plain overwrite of the output. */
  float alpha, beta;
  alpha = 1.0f;
  beta = 0.0f;
  double epsilon = 1E-5;

  checkCUDNN(cudnnBatchNormalizationForwardInference(cudnnHandle,
                                          mode,
                                          &alpha,
                                          &beta,
                                          ioDesc,
                                          d_input,
                                          ioDesc,
                                          d_output,
                                          bnDesc,
                                          d_scale,
                                          d_bias,
                                          d_estimated_mean,
                                          d_estimated_var,
                                          epsilon));
  /* Blocking copy back to the host also synchronizes, so any asynchronous
   * kernel failure surfaces here. */
  checkCUDA(cudaMemcpy(h_output, d_output, (n * c * h * w) * sizeof(float),
      cudaMemcpyDeviceToHost));

  /* Release host and device buffers. */
  free(h_input);
  free(h_output);
  free(h_scale);
  free(h_bias);
  free(h_estimated_mean);
  free(h_estimated_var);
  checkCUDA(cudaFree(d_input));
  checkCUDA(cudaFree(d_output));
  checkCUDA(cudaFree(d_scale));
  checkCUDA(cudaFree(d_bias));
  checkCUDA(cudaFree(d_estimated_mean));
  checkCUDA(cudaFree(d_estimated_var));

  /* Destroy handles */
  checkCUDNN( cudnnDestroyTensorDescriptor(ioDesc) );
  checkCUDNN( cudnnDestroyTensorDescriptor(bnDesc) );
  checkCUDNN( cudnnDestroy(cudnnHandle) );
  return 0;
}

size_t height,
size_t width) {
dim3 block(256, 1);
dim3 grid(1, batchSize);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

grid(batchSize, 1) is better. The maximum x-dimension of a grid is 2^31 - 1, while the maximum y-dimension is only 65535.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

size_t channel,
size_t height,
size_t width) {
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

blockIdx.x * blockDim.x can be removed, since blockIdx.x is always equal to 0 here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

const int num = channel * height * width;
const int batch = blockIdx.y;
for (int i = tid; i < num; i += blockDim.x) {
const int c = (i / (height * width)) % channel;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The % channel can be removed, since i / (height * width) is always smaller than channel.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

movingVar,
EPS);
if (batchSize > 1024) {
// there is a bug in cudnn library when the batch size
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some places say this is a limitation of CUDNN, not bug.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Modify the comments.

Copy link
Contributor

@hedaoyuan hedaoyuan left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@wangkuiyi wangkuiyi merged commit 81c3136 into PaddlePaddle:develop Aug 7, 2017
@qingqing01 qingqing01 deleted the bn_infer branch March 7, 2018 12:03
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants