Merge pull request #5 from listenlink/opencl
Split the legacy and non-legacy versions of the cll_backward kernel into separate kernels to reduce branching
gongzg committed Mar 3, 2016
2 parents 6ff13d0 + 4c04e01 commit 284fa17
Showing 14 changed files with 3,432 additions and 37 deletions.
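Background: Caffe's ContrastiveLossLayer implements two published variants of the loss for dissimilar pairs, selected by its legacy_version parameter. With d = ||a_n - b_n||_2, the default variant penalizes max(margin - d, 0)^2 and contributes a gradient of -alpha * (margin - d) / (d + 1e-4) * diff, while the legacy variant penalizes max(margin - d^2, 0) and contributes the constant -alpha; both contributions are zeroed when the margin is not violated. Previously a single cll_backward kernel took legacy_version as an argument, so every work-item re-evaluated the same uniform branch. This commit compiles each variant as its own kernel (see the contrastive_loss.cl diff below) so the choice is resolved once on the host.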
1 change: 1 addition & 0 deletions include/caffe/greentea/cl_kernels.hpp
@@ -11,6 +11,7 @@
#include "viennacl/ocl/platform.hpp"
namespace caffe {
viennacl::ocl::program & RegisterKernels(viennacl::ocl::context *ctx);
viennacl::ocl::program & submit_conv_spatial_program(viennacl::ocl::context &ctx, string name, string options);
}
#endif
#endif
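For context, a hypothetical caller of the new entry point would build the spatial-convolution program once and then fetch kernels from it. Everything below except the declared signature is illustrative; the kernel name is made up:

viennacl::ocl::context &ctx = viennacl::ocl::get_context(0);
viennacl::ocl::program &prog =
    caffe::submit_conv_spatial_program(ctx, "conv_spatial", "-cl-fast-relaxed-math");
viennacl::ocl::kernel &kernel = prog.get_kernel("example_kernel");  // hypothetical kernel name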
2 changes: 2 additions & 0 deletions include/caffe/greentea/greentea.hpp
@@ -8,6 +8,8 @@
#ifndef CAFFE_GREENTEA_HPP_
#define CAFFE_GREENTEA_HPP_

#define VIENNACL_PROFILING_ENABLED

#ifdef CMAKE_BUILD
#include "caffe_config.h"
#endif
206 changes: 206 additions & 0 deletions include/caffe/layers/conv_spatial_layer.hpp
@@ -0,0 +1,206 @@
#ifndef CAFFE_CONV_SPATIAL_LAYER_HPP_
#define CAFFE_CONV_SPATIAL_LAYER_HPP_

#include <vector>

#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"

#include "caffe/layers/base_conv_layer.hpp"

namespace caffe {

template <typename Dtype>
class ConvolutionLayerSpatial : public BaseConvolutionLayer<Dtype> {
public:
/**
* @param param provides ConvolutionParameter convolution_param,
* with ConvolutionLayer options:
* - num_output. The number of filters.
* - kernel_size / kernel_h / kernel_w. The filter dimensions, given by
* kernel_size for square filters or kernel_h and kernel_w for rectangular
* filters.
* - stride / stride_h / stride_w (\b optional, default 1). The filter
* stride, given by stride for equal dimensions or stride_h and stride_w
* for different strides. By default the convolution is dense with stride 1.
* - pad / pad_h / pad_w (\b optional, default 0). The zero-padding for
* convolution, given by pad for equal dimensions or pad_h and pad_w for
* different padding. Input padding is computed implicitly instead of
* actually padding.
* - group (\b optional, default 1). The number of filter groups. Group
* convolution is a method for reducing parameterization by selectively
* connecting input and output channels. The input and output channel dimensions must be divisible
* by the number of groups. For group @f$ \geq 1 @f$, the
* convolutional filters' input and output channels are separated s.t. each
* group takes 1 / group of the input channels and makes 1 / group of the
* output channels. Concretely, 4 input channels, 8 output channels, and
* 2 groups separate input channels 1-2 and output channels 1-4 into the
* first group and input channels 3-4 and output channels 5-8 into the second
* group.
* - bias_term (\b optional, default true). Whether to have a bias.
* - engine: convolution has CAFFE (matrix multiplication) and CUDNN (library
* kernels + stream parallelism) engines.
*/
explicit ConvolutionLayerSpatial(const LayerParameter& param) :
BaseConvolutionLayer<Dtype>(param) {
}
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);

virtual inline const char* type() const {
return "Convolution";
}

virtual inline int MinBottomBlobs() const {
return 1;
}
virtual inline int MinTopBlobs() const {
return 1;
}
virtual inline bool EqualNumBottomTopBlobs() const {
return true;
}

protected:
virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

virtual inline bool reverse_dimensions() {
return false;
}
virtual void compute_output_shape();

struct kernelConfig {
string kernelName;
float executionTime;
size_t local_work_size[3];
size_t global_work_size[3];
int workItem_output[3];
bool verified;
bool autoTune;
bool tested;
bool swizzle_weights;
bool batched_execute;
bool use_null_local;
int kernelType;

kernelConfig() {
}
kernelConfig(string name, size_t* global_size, size_t* local_size,
int* workItem, bool tune, bool swizzle, bool batched, bool null_local, int type = 0) {
kernelName = name;
for (int x = 0; x < 3; x++) {
local_work_size[x] = local_size[x];
global_work_size[x] = global_size[x];
workItem_output[x] = workItem[x];
}
autoTune = tune;
swizzle_weights = swizzle;
batched_execute = batched;
use_null_local = null_local;
verified = false;
tested = false;
kernelType = type;
}
};

#ifndef CPU_ONLY
#ifdef USE_GREENTEA
virtual bool generate_kernel(const vector<Blob<float>*>& bottom,
const vector<Blob<float>*>& top, int blockWidth, int blockHeight, int blockDepth);
virtual bool generate_batched_kernel(const vector<Blob<float>*>& bottom,
const vector<Blob<float>*>& top, int blockWidth, int blockHeight, int blockDepth);
virtual void setup_convolution(const vector<Blob<float>*>& bottom,
const vector<Blob<float>*>& top);
virtual void create_convolution_kernel(const vector<Blob<float>*>& bottom,
const vector<Blob<float>*>& top, int kernelType, int blockWidth, int blockHeight, int blockDepth);
virtual bool setup_IDLF(const vector<Blob<float>*>& bottom,
const vector<Blob<float>*>& top, int blockWidth, int blockHeight, int blockDepth);
virtual bool create_basic_kernel(const vector<Blob<float>*>& bottom,
const vector<Blob<float>*>& top, int blockWidth, int blockHeight, int blockDepth);
virtual bool create_verification_kernel(const vector<Blob<float>*>& bottom,
const vector<Blob<float>*>& top);
virtual cl_int convolve(const vector<Blob<float>*>& bottom,
const vector<Blob<float>*>& top, int index, int numImages,
kernelConfig* config);
virtual cl_int batched_convolve(const vector<Blob<float>*>& bottom,
const vector<Blob<float>*>& top, int index, int numImages,
kernelConfig* config);
virtual float timed_convolve(const vector<Blob<float>*>& bottom,
const vector<Blob<float>*>& top, int index, int numImages,
kernelConfig* config);
virtual bool verify_result(const vector<Blob<float>*>& bottom,
const vector<Blob<float>*>& top, int index, int numImages,
kernelConfig* config);
virtual bool tune_local_size(const vector<Blob<float>*>& bottom,
const vector<Blob<float>*>& top, kernelConfig*);
virtual void swizzleWeights(int swizzle_factor);
virtual void pad_image(int image_offset, kernelConfig* config, int imgNum);
virtual void generate_key();
virtual std::string generate_unique_key();
virtual std::string generate_specific_key(int type, int blockWidth, int blockHeight, int blockDepth);
virtual void calculate_global_size(int batch, int* workItemOutput,
size_t* localSizes, size_t* globalSizes);
#endif
#endif

const float* bottom_data;
float* top_data;
float* col_data;
const float* weight;
float* swizzled_weights;
int weight_offset;
int col_offset;
int top_offset;
int output_h_, output_w_;
int padded_height_, padded_width_;
const float* bias_;
int bias_offset_;
int bottom_index_;

int kernel_h_;
int kernel_w_;
int height_;
int width_;
int pad_h_;
int pad_w_;
int stride_h_;
int stride_w_;

/// M_ is the channel dimension of the output for a single group, which is the
/// leading dimension of the filter matrix.
int M_;
/// K_ is the dimension of an unrolled input for a single group, which is the
/// leading dimension of the data matrix.
int K_;
/// N_ is the spatial dimension of the output, the H x W, which are the last
/// dimensions of the data and filter matrices.
int N_;

bool tuned_;

std::string key_;
std::string kernel_name_;
std::string verification_kernel;
Blob<Dtype> col_buffer_;
Blob<Dtype> swizzled_weights_;
Blob<Dtype> bias_multiplier_;

int kernel_index_;
int kernel_uid_;

vector<kernelConfig*> kernelQueue;
};

} // namespace caffe

#endif // CAFFE_CONV_SPATIAL_LAYER_HPP_
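The header above is scaffolding for an auto-tuner: setup_convolution generates candidate kernels, timed_convolve measures each, verify_result checks a candidate against the reference kernel, and the winner is remembered in kernel_name_ under the key from generate_key. A condensed sketch of that flow inside a ConvolutionLayerSpatial<float> method, using only members declared above — the kernelType code, block sizes, and batch count are placeholders, not the actual tuning heuristics:

// Illustrative tuning loop (requires <cfloat> for FLT_MAX).
create_verification_kernel(bottom, top);
create_convolution_kernel(bottom, top, /* kernelType */ 2, 8, 8, 1);   // candidate A
create_convolution_kernel(bottom, top, /* kernelType */ 2, 16, 4, 1);  // candidate B
float best_time = FLT_MAX;
for (size_t i = 0; i < kernelQueue.size(); ++i) {
  float t = timed_convolve(bottom, top, bottom_index_, 1, kernelQueue[i]);
  if (t < best_time && verify_result(bottom, top, bottom_index_, 1, kernelQueue[i])) {
    best_time = t;
    kernel_name_ = kernelQueue[i]->kernelName;
    tuned_ = true;
  }
}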
8 changes: 7 additions & 1 deletion include/caffe/util/benchmark.hpp
@@ -4,7 +4,9 @@
#include <boost/date_time/posix_time/posix_time.hpp>

#include "caffe/util/device_alternate.hpp"

#ifdef USE_GREENTEA
#include "caffe/greentea/greentea.hpp"
#endif
namespace caffe {

class Timer {
@@ -32,6 +34,10 @@ class Timer {
cudaEvent_t start_gpu_;
cudaEvent_t stop_gpu_;
#endif // USE_CUDA
#ifdef USE_GREENTEA
cl_event start_gpu_;
cl_event stop_gpu_;
#endif // USE_GREENTEA
#endif // !CPU_ONLY
boost::posix_time::ptime start_cpu_;
boost::posix_time::ptime stop_cpu_;
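With profiling enabled on the command queue, the new cl_event members can yield device-side timestamps. A minimal sketch using the standard OpenCL profiling API (this code is not part of the diff; it assumes start_gpu_ and stop_gpu_ were recorded around the measured kernels):

// Requires the queue to be created with CL_QUEUE_PROFILING_ENABLE.
cl_ulong t_start = 0, t_end = 0;
clWaitForEvents(1, &stop_gpu_);
clGetEventProfilingInfo(start_gpu_, CL_PROFILING_COMMAND_START,
                        sizeof(cl_ulong), &t_start, NULL);
clGetEventProfilingInfo(stop_gpu_, CL_PROFILING_COMMAND_END,
                        sizeof(cl_ulong), &t_end, NULL);
float elapsed_ms = (t_end - t_start) / 1000000.0f;  // timestamps are in nanoseconds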
2 changes: 1 addition & 1 deletion src/caffe/common.cpp
@@ -187,7 +187,7 @@ root_solver_(true) {}

Caffe::~Caffe() {}

-void Caffe::set_random_seed(const size_t seed) {
+void Caffe::set_random_seed(const size_t seed, device* device_context) {
// RNG seed
Get().random_generator_.reset(new RNG(seed));
}
24 changes: 22 additions & 2 deletions src/caffe/greentea/cl_kernels.cpp

Large diffs are not rendered by default.

17 changes: 17 additions & 0 deletions src/caffe/greentea/cl_kernels.sh
100644 → 100755
@@ -32,6 +32,7 @@ echo "#include <string>" >> $SOURCE
echo "namespace caffe {" >> $SOURCE

echo "viennacl::ocl::program & RegisterKernels(viennacl::ocl::context *ctx);" >> $HEADER
echo "viennacl::ocl::program & submit_conv_spatial_program(viennacl::ocl::context &ctx, string name, string options);" >> $HEADER
echo "}" >> $HEADER
echo "#endif" >> $HEADER

@@ -141,6 +142,22 @@ echo " viennacl::ocl::program &program = ctx->add_program(kernel_program," >> $SOURCE
echo " \"kernel_program\");" >> $SOURCE
echo " return program;" >> $SOURCE
echo "}" >> $SOURCE
echo "viennacl::ocl::program & submit_conv_spatial_program(viennacl::ocl::context &ctx, string name, string options)" >> $SOURCE
echo "{" >> $SOURCE
echo " static const char* core_defines =" >> $SOURCE
echo " \"#define Dtype float\n\"" >> $SOURCE
echo " \"#define Dtype2 float2\n\"" >> $SOURCE
echo " \"#define Dtype4 float4\n\"" >> $SOURCE
echo " \"#define Dtype8 float8\n\"" >> $SOURCE
echo " \"#define Dtype16 float16\n\"" >> $SOURCE
echo " \"#define OCL_KERNEL_LOOP(i, n)\"" >> $SOURCE
echo " \" for (int i = get_global_id(0); i < (n); i += get_global_size(0))\n\";" >> $SOURCE
echo " string sources = core_defines;" >> $SOURCE
echo " sources += conv_layer_spatial_float;" >> $SOURCE
echo " ctx.build_options(options);" >> $SOURCE
echo " viennacl::ocl::program &program = ctx.add_program(sources, name);" >> $SOURCE
echo " return program;" >> $SOURCE
echo "}" >> $SOURCE
echo "} // namespace caffe" >> $SOURCE

echo "#endif" >> $HEADER
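Expanding the echo lines above, the function emitted into cl_kernels.cpp should read approximately as follows (conv_layer_spatial_float is the embedded .cl source string the script generates elsewhere):

viennacl::ocl::program & submit_conv_spatial_program(viennacl::ocl::context &ctx,
                                                     string name, string options) {
  static const char* core_defines =
      "#define Dtype float\n"
      "#define Dtype2 float2\n"
      "#define Dtype4 float4\n"
      "#define Dtype8 float8\n"
      "#define Dtype16 float16\n"
      "#define OCL_KERNEL_LOOP(i, n)"
      " for (int i = get_global_id(0); i < (n); i += get_global_size(0))\n";
  string sources = core_defines;
  sources += conv_layer_spatial_float;
  ctx.build_options(options);
  viennacl::ocl::program &program = ctx.add_program(sources, name);
  return program;
}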
41 changes: 29 additions & 12 deletions src/caffe/greentea/cl_kernels/contrastive_loss.cl
@@ -3,27 +3,44 @@
#endif

__kernel void TEMPLATE(cll_backward,Dtype)(const int_tp count, const int_tp channels,
-                                          const Dtype margin, const int legacy_version,
-                                          const Dtype alpha, __global const Dtype* y,
+                                          const Dtype margin, const Dtype alpha, __global const Dtype* y,
                                           __global const Dtype* diff, __global const Dtype* dist_sq,
                                           __global Dtype *bottom_diff) {
  for (int_tp i = get_global_id(0); i < count;
      i += get_global_size(0)) {
    int_tp n = i / channels;  // the num index, to access y and dist_sq
-   if ((int_tp)(y[n])) {  // similar pairs
+   if (trunc(y[n]) != 0.) {  // similar pairs
      bottom_diff[i] = alpha * diff[i];
    } else {  // dissimilar pairs
-     Dtype mdist = 0.0;
-     Dtype beta = 0.0;
-     if (legacy_version == 1) {
-       mdist = (margin - dist_sq[n]);
-       beta = -alpha;
-     } else {
-       Dtype dist = sqrt(dist_sq[n]);
-       mdist = (margin - dist);
-       beta = -alpha * mdist / (dist + 1e-4) * diff[i];
-     }
-     if (mdist > 0.0) {
+     Dtype mdist = 0.;
+     Dtype beta = 0.;
+     Dtype dist = sqrt(dist_sq[n]);
+     mdist = (margin - dist);
+     beta = -alpha * mdist / (dist + 1e-4) * diff[i];
+     if (mdist > 0.) {
        bottom_diff[i] = beta;
      } else {
        bottom_diff[i] = 0;
      }
    }
  }
}

+__kernel void TEMPLATE(cll_backward_legacy,Dtype)(const int count, const int channels,
+                                                  const Dtype margin, const Dtype alpha, __global Dtype* y,
+                                                  __global Dtype* diff, __global Dtype* dist_sq,
+                                                  __global Dtype* bottom_diff) {
+  for (int_tp i = get_global_id(0); i < count;
+      i += get_global_size(0)) {
+    int n = i / channels;  // the num index, to access y and dist_sq
+    if (trunc(y[n]) != 0.) {  // similar pairs
+      bottom_diff[i] = alpha * diff[i];
+    } else {  // dissimilar pairs
+      Dtype mdist = 0.;
+      Dtype beta = 0.;
+      mdist = (margin - dist_sq[n]);
+      beta = -alpha;
+      if (mdist > 0.) {
+        bottom_diff[i] = beta;
+      } else {
+        bottom_diff[i] = 0;