Merge pull request #5 from listenlink/opencl
Split the legacy and non-legacy versions of the cll_backward kernel into separate kernels to reduce branching
gongzg committed Mar 3, 2016
2 parents 6ff13d0 + 4c04e01 commit 284fa17
Showing 14 changed files with 3,432 additions and 37 deletions.
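Background: Caffe's ContrastiveLossLayer implements two published variants of the loss for dissimilar pairs, selected by its legacy_version parameter. With d = ||a_n - b_n||_2, the default variant penalizes max(margin - d, 0)^2 and contributes a gradient of -alpha * (margin - d) / (d + 1e-4) * diff, while the legacy variant penalizes max(margin - d^2, 0) and contributes the constant -alpha; both contributions are zeroed when the margin is not violated. Previously a single cll_backward kernel took legacy_version as an argument, so every work-item re-evaluated the same uniform branch. This commit compiles each variant as its own kernel (see the contrastive_loss.cl diff below) so the choice is resolved once on the host.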
1 change: 1 addition & 0 deletions include/caffe/greentea/cl_kernels.hpp
@@ -11,6 +11,7 @@
#include "viennacl/ocl/platform.hpp"
namespace caffe {
viennacl::ocl::program & RegisterKernels(viennacl::ocl::context *ctx);
viennacl::ocl::program & submit_conv_spatial_program(viennacl::ocl::context &ctx, string name, string options);
}
#endif
#endif
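For context, a hypothetical caller of the new entry point would build the spatial-convolution program once and then fetch kernels from it. Everything below except the declared signature is illustrative; the kernel name is made up:

viennacl::ocl::context &ctx = viennacl::ocl::get_context(0);
viennacl::ocl::program &prog =
    caffe::submit_conv_spatial_program(ctx, "conv_spatial", "-cl-fast-relaxed-math");
viennacl::ocl::kernel &kernel = prog.get_kernel("example_kernel");  // hypothetical kernel name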
2 changes: 2 additions & 0 deletions include/caffe/greentea/greentea.hpp
@@ -8,6 +8,8 @@
#ifndef CAFFE_GREENTEA_HPP_
#define CAFFE_GREENTEA_HPP_

#define VIENNACL_PROFILING_ENABLED

#ifdef CMAKE_BUILD
#include "caffe_config.h"
#endif
206 changes: 206 additions & 0 deletions include/caffe/layers/conv_spatial_layer.hpp
@@ -0,0 +1,206 @@
#ifndef CAFFE_CONV_SPATIAL_LAYER_HPP_
#define CAFFE_CONV_SPATIAL_LAYER_HPP_

#include <vector>

#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"

#include "caffe/layers/base_conv_layer.hpp"

namespace caffe {

template <typename Dtype>
class ConvolutionLayerSpatial : public BaseConvolutionLayer<Dtype> {
public:
/**
* @param param provides ConvolutionParameter convolution_param,
* with ConvolutionLayer options:
* - num_output. The number of filters.
* - kernel_size / kernel_h / kernel_w. The filter dimensions, given by
* kernel_size for square filters or kernel_h and kernel_w for rectangular
* filters.
* - stride / stride_h / stride_w (\b optional, default 1). The filter
* stride, given by stride for equal dimensions or stride_h and stride_w
* for different strides. By default the convolution is dense with stride 1.
* - pad / pad_h / pad_w (\b optional, default 0). The zero-padding for
* convolution, given by pad for equal dimensions or pad_h and pad_w for
* different padding. Input padding is computed implicitly instead of
* actually padding.
* - group (\b optional, default 1). The number of filter groups. Group
* convolution is a method for reducing parameterization by selectively
* connecting input and output channels. The input and output channel dimensions must be divisible
* by the number of groups. For group @f$ \geq 1 @f$, the
* convolutional filters' input and output channels are separated s.t. each
* group takes 1 / group of the input channels and makes 1 / group of the
* output channels. Concretely, 4 input channels, 8 output channels, and
* 2 groups separate input channels 1-2 and output channels 1-4 into the
* first group and input channels 3-4 and output channels 5-8 into the second
* group.
* - bias_term (\b optional, default true). Whether to have a bias.
* - engine: convolution has CAFFE (matrix multiplication) and CUDNN (library
* kernels + stream parallelism) engines.
*/
explicit ConvolutionLayerSpatial(const LayerParameter& param) :
BaseConvolutionLayer<Dtype>(param) {
}
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);

virtual inline const char* type() const {
return "Convolution";
}

virtual inline int MinBottomBlobs() const {
return 1;
}
virtual inline int MinTopBlobs() const {
return 1;
}
virtual inline bool EqualNumBottomTopBlobs() const {
return true;
}

protected:
virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

virtual inline bool reverse_dimensions() {
return false;
}
virtual void compute_output_shape();

struct kernelConfig {
string kernelName;
float executionTime;
size_t local_work_size[3];
size_t global_work_size[3];
int workItem_output[3];
bool verified;
bool autoTune;
bool tested;
bool swizzle_weights;
bool batched_execute;
bool use_null_local;
int kernelType;

kernelConfig() {
}
kernelConfig(string name, size_t* global_size, size_t* local_size,
int* workItem, bool tune, bool swizzle, bool batched, bool null_local, int type = 0) {
kernelName = name;
for (int x = 0; x < 3; x++) {
local_work_size[x] = local_size[x];
global_work_size[x] = global_size[x];
workItem_output[x] = workItem[x];
}
autoTune = tune;
swizzle_weights = swizzle;
batched_execute = batched;
use_null_local = null_local;
verified = false;
tested = false;
kernelType = type;
}
};

#ifndef CPU_ONLY
#ifdef USE_GREENTEA
virtual bool generate_kernel(const vector<Blob<float>*>& bottom,
const vector<Blob<float>*>& top, int blockWidth, int blockHeight, int blockDepth);
virtual bool generate_batched_kernel(const vector<Blob<float>*>& bottom,
const vector<Blob<float>*>& top, int blockWidth, int blockHeight, int blockDepth);
virtual void setup_convolution(const vector<Blob<float>*>& bottom,
const vector<Blob<float>*>& top);
virtual void create_convolution_kernel(const vector<Blob<float>*>& bottom,
const vector<Blob<float>*>& top, int kernelType, int blockWidth, int blockHeight, int blockDepth);
virtual bool setup_IDLF(const vector<Blob<float>*>& bottom,
const vector<Blob<float>*>& top, int blockWidth, int blockHeight, int blockDepth);
virtual bool create_basic_kernel(const vector<Blob<float>*>& bottom,
const vector<Blob<float>*>& top, int blockWidth, int blockHeight, int blockDepth);
virtual bool create_verification_kernel(const vector<Blob<float>*>& bottom,
const vector<Blob<float>*>& top);
virtual cl_int convolve(const vector<Blob<float>*>& bottom,
const vector<Blob<float>*>& top, int index, int numImages,
kernelConfig* config);
virtual cl_int batched_convolve(const vector<Blob<float>*>& bottom,
const vector<Blob<float>*>& top, int index, int numImages,
kernelConfig* config);
virtual float timed_convolve(const vector<Blob<float>*>& bottom,
const vector<Blob<float>*>& top, int index, int numImages,
kernelConfig* config);
virtual bool verify_result(const vector<Blob<float>*>& bottom,
const vector<Blob<float>*>& top, int index, int numImages,
kernelConfig* config);
virtual bool tune_local_size(const vector<Blob<float>*>& bottom,
const vector<Blob<float>*>& top, kernelConfig*);
virtual void swizzleWeights(int swizzle_factor);
virtual void pad_image(int image_offset, kernelConfig* config, int imgNum);
virtual void generate_key();
virtual std::string generate_unique_key();
virtual std::string generate_specific_key(int type, int blockWidth, int blockHeight, int blockDepth);
virtual void calculate_global_size(int batch, int* workItemOutput,
size_t* localSizes, size_t* globalSizes);
#endif
#endif

const float* bottom_data;
float* top_data;
float* col_data;
const float* weight;
float* swizzled_weights;
int weight_offset;
int col_offset;
int top_offset;
int output_h_, output_w_;
int padded_height_, padded_width_;
const float* bias_;
int bias_offset_;
int bottom_index_;

int kernel_h_;
int kernel_w_;
int height_;
int width_;
int pad_h_;
int pad_w_;
int stride_h_;
int stride_w_;

/// M_ is the channel dimension of the output for a single group, which is the
/// leading dimension of the filter matrix.
int M_;
/// K_ is the dimension of an unrolled input for a single group, which is the
/// leading dimension of the data matrix.
int K_;
/// N_ is the spatial dimension of the output, the H x W, which are the last
/// dimensions of the data and filter matrices.
int N_;

bool tuned_;

std::string key_;
std::string kernel_name_;
std::string verification_kernel;
Blob<Dtype> col_buffer_;
Blob<Dtype> swizzled_weights_;
Blob<Dtype> bias_multiplier_;

int kernel_index_;
int kernel_uid_;

vector<kernelConfig*> kernelQueue;
};

} // namespace caffe

#endif // CAFFE_CONV_SPATIAL_LAYER_HPP_
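The header above is scaffolding for an auto-tuner: setup_convolution generates candidate kernels, timed_convolve measures each, verify_result checks a candidate against the reference kernel, and the winner is remembered in kernel_name_ under the key from generate_key. A condensed sketch of that flow inside a ConvolutionLayerSpatial<float> method, using only members declared above — the kernelType code, block sizes, and batch count are placeholders, not the actual tuning heuristics:

// Illustrative tuning loop (requires <cfloat> for FLT_MAX).
create_verification_kernel(bottom, top);
create_convolution_kernel(bottom, top, /* kernelType */ 2, 8, 8, 1);   // candidate A
create_convolution_kernel(bottom, top, /* kernelType */ 2, 16, 4, 1);  // candidate B
float best_time = FLT_MAX;
for (size_t i = 0; i < kernelQueue.size(); ++i) {
  float t = timed_convolve(bottom, top, bottom_index_, 1, kernelQueue[i]);
  if (t < best_time && verify_result(bottom, top, bottom_index_, 1, kernelQueue[i])) {
    best_time = t;
    kernel_name_ = kernelQueue[i]->kernelName;
    tuned_ = true;
  }
}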
8 changes: 7 additions & 1 deletion include/caffe/util/benchmark.hpp
@@ -4,7 +4,9 @@
#include <boost/date_time/posix_time/posix_time.hpp>

#include "caffe/util/device_alternate.hpp"

#ifdef USE_GREENTEA
#include "caffe/greentea/greentea.hpp"
#endif
namespace caffe {

class Timer {
@@ -32,6 +34,10 @@ class Timer {
cudaEvent_t start_gpu_;
cudaEvent_t stop_gpu_;
#endif // USE_CUDA
#ifdef USE_GREENTEA
cl_event start_gpu_;
cl_event stop_gpu_;
#endif // USE_GREENTEA
#endif // !CPU_ONLY
boost::posix_time::ptime start_cpu_;
boost::posix_time::ptime stop_cpu_;
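With profiling enabled on the command queue, the new cl_event members can yield device-side timestamps. A minimal sketch using the standard OpenCL profiling API (this code is not part of the diff; it assumes start_gpu_ and stop_gpu_ were recorded around the measured kernels):

// Requires the queue to be created with CL_QUEUE_PROFILING_ENABLE.
cl_ulong t_start = 0, t_end = 0;
clWaitForEvents(1, &stop_gpu_);
clGetEventProfilingInfo(start_gpu_, CL_PROFILING_COMMAND_START,
                        sizeof(cl_ulong), &t_start, NULL);
clGetEventProfilingInfo(stop_gpu_, CL_PROFILING_COMMAND_END,
                        sizeof(cl_ulong), &t_end, NULL);
float elapsed_ms = (t_end - t_start) / 1000000.0f;  // timestamps are in nanoseconds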
2 changes: 1 addition & 1 deletion src/caffe/common.cpp
@@ -187,7 +187,7 @@ root_solver_(true) {}

Caffe::~Caffe() {}

-void Caffe::set_random_seed(const size_t seed) {
+void Caffe::set_random_seed(const size_t seed, device* device_context) {
// RNG seed
Get().random_generator_.reset(new RNG(seed));
}
24 changes: 22 additions & 2 deletions src/caffe/greentea/cl_kernels.cpp

Large diffs are not rendered by default.

17 changes: 17 additions & 0 deletions src/caffe/greentea/cl_kernels.sh
100644 → 100755
@@ -32,6 +32,7 @@ echo "#include <string>" >> $SOURCE
echo "namespace caffe {" >> $SOURCE

echo "viennacl::ocl::program & RegisterKernels(viennacl::ocl::context *ctx);" >> $HEADER
echo "viennacl::ocl::program & submit_conv_spatial_program(viennacl::ocl::context &ctx, string name, string options);" >> $HEADER
echo "}" >> $HEADER
echo "#endif" >> $HEADER

@@ -141,6 +142,22 @@ echo " viennacl::ocl::program &program = ctx->add_program(kernel_program," >> $SOURCE
echo " \"kernel_program\");" >> $SOURCE
echo " return program;" >> $SOURCE
echo "}" >> $SOURCE
echo "viennacl::ocl::program & submit_conv_spatial_program(viennacl::ocl::context &ctx, string name, string options)" >> $SOURCE
echo "{" >> $SOURCE
echo " static const char* core_defines =" >> $SOURCE
echo " \"#define Dtype float\n\"" >> $SOURCE
echo " \"#define Dtype2 float2\n\"" >> $SOURCE
echo " \"#define Dtype4 float4\n\"" >> $SOURCE
echo " \"#define Dtype8 float8\n\"" >> $SOURCE
echo " \"#define Dtype16 float16\n\"" >> $SOURCE
echo " \"#define OCL_KERNEL_LOOP(i, n)\"" >> $SOURCE
echo " \" for (int i = get_global_id(0); i < (n); i += get_global_size(0))\n\";" >> $SOURCE
echo " string sources = core_defines;" >> $SOURCE
echo " sources += conv_layer_spatial_float;" >> $SOURCE
echo " ctx.build_options(options);" >> $SOURCE
echo " viennacl::ocl::program &program = ctx.add_program(sources, name);" >> $SOURCE
echo " return program;" >> $SOURCE
echo "}" >> $SOURCE
echo "} // namespace caffe" >> $SOURCE

echo "#endif" >> $HEADER
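Expanding the echo lines above, the function emitted into cl_kernels.cpp should read approximately as follows (conv_layer_spatial_float is the embedded .cl source string the script generates elsewhere):

viennacl::ocl::program & submit_conv_spatial_program(viennacl::ocl::context &ctx,
                                                     string name, string options) {
  static const char* core_defines =
      "#define Dtype float\n"
      "#define Dtype2 float2\n"
      "#define Dtype4 float4\n"
      "#define Dtype8 float8\n"
      "#define Dtype16 float16\n"
      "#define OCL_KERNEL_LOOP(i, n)"
      " for (int i = get_global_id(0); i < (n); i += get_global_size(0))\n";
  string sources = core_defines;
  sources += conv_layer_spatial_float;
  ctx.build_options(options);
  viennacl::ocl::program &program = ctx.add_program(sources, name);
  return program;
}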
41 changes: 29 additions & 12 deletions src/caffe/greentea/cl_kernels/contrastive_loss.cl
@@ -3,27 +3,44 @@
#endif

__kernel void TEMPLATE(cll_backward,Dtype)(const int_tp count, const int_tp channels,
-                                          const Dtype margin, const int legacy_version,
-                                          const Dtype alpha, __global const Dtype* y,
+                                          const Dtype margin, const Dtype alpha, __global const Dtype* y,
                                           __global const Dtype* diff, __global const Dtype* dist_sq,
                                           __global Dtype *bottom_diff) {
  for (int_tp i = get_global_id(0); i < count;
      i += get_global_size(0)) {
    int_tp n = i / channels;  // the num index, to access y and dist_sq
-   if ((int_tp)(y[n])) {  // similar pairs
+   if (trunc(y[n]) != 0.) {  // similar pairs
      bottom_diff[i] = alpha * diff[i];
    } else {  // dissimilar pairs
-     Dtype mdist = 0.0;
-     Dtype beta = 0.0;
-     if (legacy_version == 1) {
-       mdist = (margin - dist_sq[n]);
-       beta = -alpha;
-     } else {
-       Dtype dist = sqrt(dist_sq[n]);
-       mdist = (margin - dist);
-       beta = -alpha * mdist / (dist + 1e-4) * diff[i];
-     }
-     if (mdist > 0.0) {
+     Dtype mdist = 0.;
+     Dtype beta = 0.;
+     Dtype dist = sqrt(dist_sq[n]);
+     mdist = (margin - dist);
+     beta = -alpha * mdist / (dist + 1e-4) * diff[i];
+     if (mdist > 0.) {
        bottom_diff[i] = beta;
      } else {
        bottom_diff[i] = 0;
      }
    }
  }
}

+__kernel void TEMPLATE(cll_backward_legacy,Dtype)(const int count, const int channels,
+                                                  const Dtype margin, const Dtype alpha, __global Dtype* y,
+                                                  __global Dtype* diff, __global Dtype* dist_sq,
+                                                  __global Dtype* bottom_diff) {
+  for (int_tp i = get_global_id(0); i < count;
+      i += get_global_size(0)) {
+    int n = i / channels;  // the num index, to access y and dist_sq
+    if (trunc(y[n]) != 0.) {  // similar pairs
+      bottom_diff[i] = alpha * diff[i];
+    } else {  // dissimilar pairs
+      Dtype mdist = 0.;
+      Dtype beta = 0.;
+      mdist = (margin - dist_sq[n]);
+      beta = -alpha;
+      if (mdist > 0.) {
+        bottom_diff[i] = beta;
+      } else {
+        bottom_diff[i] = 0;