Skip to content

Commit

Permalink
CUB Memory Manager + cuDNN v4 and v5 support
Browse files Browse the repository at this point in the history
  • Loading branch information
drnikolaev committed Apr 5, 2016
1 parent c137188 commit 4044ed7
Show file tree
Hide file tree
Showing 48 changed files with 1,194 additions and 319 deletions.
3 changes: 3 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ env:
- WITH_CUDA=true WITH_CMAKE=true WITH_IO=true
- WITH_CUDA=false WITH_CMAKE=false WITH_IO=false
- WITH_CUDA=false WITH_CMAKE=true WITH_IO=false PYTHON_VERSION=3
# Currently there is no way to install cudnn via apt-get. Uncomment when it's available.
# - WITH_CUDA=true WITH_CMAKE=false WITH_IO=true WITH_CUDNN=true
# - WITH_CUDA=true WITH_CMAKE=true WITH_IO=true WITH_CUDNN=true

language: cpp

Expand Down
3 changes: 2 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ configure_file(cmake/Templates/caffe_config.h.in "${PROJECT_BINARY_DIR}/caffe_co

# ---[ Includes
set(Caffe_INCLUDE_DIR ${PROJECT_SOURCE_DIR}/include)
include_directories(${Caffe_INCLUDE_DIR} ${PROJECT_BINARY_DIR})
set(THIRDPARTY_DIR ${PROJECT_SOURCE_DIR}/3rdparty)
include_directories(${Caffe_INCLUDE_DIR} ${PROJECT_BINARY_DIR} ${THIRDPARTY_DIR})
include_directories(BEFORE src) # This is needed for gtest.

# ---[ Subdirectories
Expand Down
19 changes: 15 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,19 @@ $(error $(CONFIG_FILE) not found. See $(CONFIG_FILE).example.)
endif
include $(CONFIG_FILE)

# Rectify input parameters
ifeq ($(CPU_ONLY),1)
USE_CUDNN=0
endif

PROJECT_DIR=$(PWD)

BUILD_DIR_LINK := $(BUILD_DIR)
ifeq ($(RELEASE_BUILD_DIR),)
RELEASE_BUILD_DIR := .$(BUILD_DIR)_release
RELEASE_BUILD_DIR := $(PROJECT_DIR)/.$(BUILD_DIR)_release
endif
ifeq ($(DEBUG_BUILD_DIR),)
DEBUG_BUILD_DIR := .$(BUILD_DIR)_debug
DEBUG_BUILD_DIR := $(PROJECT_DIR)/.$(BUILD_DIR)_debug
endif

DEBUG ?= 0
Expand All @@ -24,6 +31,8 @@ else
OTHER_BUILD_DIR := $(DEBUG_BUILD_DIR)
endif

THIRDPARTY_DIR=$(PROJECT_DIR)/3rdparty

# All of the directories containing code.
SRC_DIRS := $(shell find * -type d -exec bash -c "find {} -maxdepth 1 \
\( -name '*.cpp' -o -name '*.proto' \) | grep -q ." \; -print)
Expand Down Expand Up @@ -171,7 +180,7 @@ ifneq ("$(wildcard $(CUDA_DIR)/lib64)","")
endif
CUDA_LIB_DIR += $(CUDA_DIR)/lib

INCLUDE_DIRS += $(BUILD_INCLUDE_DIR) ./src ./include
INCLUDE_DIRS += $(BUILD_INCLUDE_DIR) ./src ./include $(THIRDPARTY_DIR)
ifneq ($(CPU_ONLY), 1)
INCLUDE_DIRS += $(CUDA_INCLUDE_DIR)
LIBRARY_DIRS += $(CUDA_LIB_DIR)
Expand Down Expand Up @@ -325,6 +334,8 @@ endif
# cuDNN acceleration configuration.
ifeq ($(USE_CUDNN), 1)
LIBRARIES += cudnn
INCLUDE_DIRS += ${CUDNN_DIR}/include
LIBRARY_DIRS += ${CUDNN_DIR}/install/cuda/lib64
COMMON_FLAGS += -DUSE_CUDNN
endif

Expand Down Expand Up @@ -440,7 +451,7 @@ endif
# Define build targets
##############################
.PHONY: all lib test clean docs linecount lint lintclean tools examples $(DIST_ALIASES) \
py mat py$(PROJECT) mat$(PROJECT) proto runtest \
py mat py$(PROJECT) mat$(PROJECT) thirdparty proto runtest \
superclean supercleanlist supercleanfiles warn everything

all: lib tools examples
Expand Down
1 change: 1 addition & 0 deletions Makefile.config.example
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# Contributions simplifying and improving our build system are welcome!

# cuDNN acceleration switch (uncomment to build with cuDNN).
# cuDNN version 4 or higher is required.
# USE_CUDNN := 1

# CPU-only switch (uncomment to build without GPU support).
Expand Down
6 changes: 6 additions & 0 deletions include/caffe/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,9 @@ class Caffe {
inline static curandGenerator_t curand_generator() {
return Get().curand_generator_;
}
#ifdef USE_CUDNN
inline static cudnnHandle_t cudnn_handle() { return Get().cudnn_handle_; }
#endif
#endif

// Returns the mode: running on CPU or GPU.
Expand Down Expand Up @@ -168,6 +171,9 @@ class Caffe {
#ifndef CPU_ONLY
cublasHandle_t cublas_handle_;
curandGenerator_t curand_generator_;
#ifdef USE_CUDNN
cudnnHandle_t cudnn_handle_;
#endif
#endif
shared_ptr<RNG> random_generator_;

Expand Down
21 changes: 20 additions & 1 deletion include/caffe/layer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ class Layer {
* layer.
*/
explicit Layer(const LayerParameter& param)
: layer_param_(param), is_shared_(false) {
: layer_param_(param), is_shared_(false),
forward_passed_(false), backward_passed_(false) {
// Set phase and copy blobs (if there are any).
phase_ = param.phase();
if (layer_param_.blobs_size() > 0) {
Expand Down Expand Up @@ -316,6 +317,21 @@ class Layer {
param_propagate_down_[param_id] = value;
}

bool IsForwardPassed() const {
return forward_passed_;
}

void ForwardPassed(bool passed) {
forward_passed_ = passed;
}

bool IsBackwardPassed() const {
return backward_passed_;
}

void BackwardPassed(bool passed) {
backward_passed_ = passed;
}

protected:
/** The protobuf that stores the layer parameters */
Expand Down Expand Up @@ -431,6 +447,9 @@ class Layer {
/** Whether this layer is actually shared by other nets*/
bool is_shared_;

bool forward_passed_;
bool backward_passed_;

/** The mutex for sequential forward if this layer is shared */
shared_ptr<boost::mutex> forward_mutex_;

Expand Down
45 changes: 45 additions & 0 deletions include/caffe/layers/cudnn_batch_norm_layer.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#ifndef CAFFE_CUDNN_BATCH_NORM_LAYER_HPP_
#define CAFFE_CUDNN_BATCH_NORM_LAYER_HPP_

#include <vector>

#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"

#include "caffe/layers/batch_norm_layer.hpp"

namespace caffe {

#ifdef USE_CUDNN
template <typename Dtype>
class CuDNNBatchNormLayer : public BatchNormLayer<Dtype> {
public:
explicit CuDNNBatchNormLayer(const LayerParameter& param)
: BatchNormLayer<Dtype>(param), epsilon_(1e-4), handles_setup_(false) {}
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual ~CuDNNBatchNormLayer();

protected:
virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

// cuDNN descriptors / handles
cudnnTensorDescriptor_t bottom_desc_, top_desc_;
cudnnTensorDescriptor_t scale_bias_mean_var_desc_;
cudnnBatchNormMode_t mode_;

double epsilon_;
Blob<Dtype> save_mean_, save_inv_var_;
bool handles_setup_;
};
#endif

} // namespace caffe

#endif // CAFFE_CUDNN_BATCH_NORM_LAYER_HPP_
12 changes: 6 additions & 6 deletions include/caffe/layers/cudnn_conv_layer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
#include "caffe/proto/caffe.pb.h"

#include "caffe/layers/conv_layer.hpp"
#ifndef CPU_ONLY
#include "caffe/util/gpu_memory.hpp"
#endif

namespace caffe {

Expand Down Expand Up @@ -44,8 +47,6 @@ class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

bool handles_setup_;
cudnnHandle_t* handle_;
cudaStream_t* stream_;

// algorithms for forward and backwards convolutions
cudnnConvolutionFwdAlgo_t *fwd_algo_;
Expand All @@ -56,14 +57,13 @@ class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {
cudnnTensorDescriptor_t bias_desc_;
cudnnFilterDescriptor_t filter_desc_;
vector<cudnnConvolutionDescriptor_t> conv_descs_;
int bottom_offset_, top_offset_, bias_offset_;

int bottom_offset_, top_offset_, weight_offset_, bias_offset_;

size_t *workspace_fwd_sizes_;
size_t *workspace_bwd_data_sizes_;
size_t *workspace_bwd_filter_sizes_;
size_t workspaceSizeInBytes; // size of underlying storage
void *workspaceData; // underlying storage
void **workspace; // aliases into workspaceData
gpu_memory::buffer workspace;
};
#endif

Expand Down
11 changes: 6 additions & 5 deletions include/caffe/layers/cudnn_lcn_layer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@

#include "caffe/layers/lrn_layer.hpp"
#include "caffe/layers/power_layer.hpp"
#ifndef CPU_ONLY
#include "caffe/util/gpu_memory.hpp"
#endif

namespace caffe {

Expand All @@ -17,8 +20,7 @@ template <typename Dtype>
class CuDNNLCNLayer : public LRNLayer<Dtype> {
public:
explicit CuDNNLCNLayer(const LayerParameter& param)
: LRNLayer<Dtype>(param), handles_setup_(false), tempDataSize(0),
tempData1(NULL), tempData2(NULL) {}
: LRNLayer<Dtype>(param), handles_setup_(false), tempDataSize_(0) {}
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
Expand All @@ -32,15 +34,14 @@ class CuDNNLCNLayer : public LRNLayer<Dtype> {
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

bool handles_setup_;
cudnnHandle_t handle_;
cudnnLRNDescriptor_t norm_desc_;
cudnnTensorDescriptor_t bottom_desc_, top_desc_;

int size_, pre_pad_;
Dtype alpha_, beta_, k_;

size_t tempDataSize;
void *tempData1, *tempData2;
size_t tempDataSize_;
gpu_memory::buffer temp1_, temp2_;
};
#endif

Expand Down
2 changes: 1 addition & 1 deletion include/caffe/layers/cudnn_relu_layer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@ class CuDNNReLULayer : public ReLULayer<Dtype> {
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

bool handles_setup_;
cudnnHandle_t handle_;
cudnnTensorDescriptor_t bottom_desc_;
cudnnTensorDescriptor_t top_desc_;
cudnnActivationDescriptor_t activ_desc_;
};
#endif

Expand Down
2 changes: 1 addition & 1 deletion include/caffe/layers/cudnn_sigmoid_layer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@ class CuDNNSigmoidLayer : public SigmoidLayer<Dtype> {
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

bool handles_setup_;
cudnnHandle_t handle_;
cudnnTensorDescriptor_t bottom_desc_;
cudnnTensorDescriptor_t top_desc_;
cudnnActivationDescriptor_t activ_desc_;
};
#endif

Expand Down
2 changes: 1 addition & 1 deletion include/caffe/layers/cudnn_tanh_layer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@ class CuDNNTanHLayer : public TanHLayer<Dtype> {
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

bool handles_setup_;
cudnnHandle_t handle_;
cudnnTensorDescriptor_t bottom_desc_;
cudnnTensorDescriptor_t top_desc_;
cudnnActivationDescriptor_t activ_desc_;
};
#endif

Expand Down
2 changes: 2 additions & 0 deletions include/caffe/parallel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ class GPUParams : public Params<Dtype> {
using Params<Dtype>::size_;
using Params<Dtype>::data_;
using Params<Dtype>::diff_;
private:
int buffer_device_;
};

class DevicePair {
Expand Down
20 changes: 16 additions & 4 deletions include/caffe/util/cudnn.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#include "caffe/proto/caffe.pb.h"

#define CUDNN_VERSION_MIN(major, minor, patch) \
(CUDNN_VERSION >= (major * 1000 + minor * 100 + patch))
(CUDNN_VERSION >= (major * 1000 + minor * 100 + patch))

#define CUDNN_CHECK(condition) \
do { \
Expand Down Expand Up @@ -91,8 +91,13 @@ template <typename Dtype>
inline void createFilterDesc(cudnnFilterDescriptor_t* desc,
int n, int c, int h, int w) {
CUDNN_CHECK(cudnnCreateFilterDescriptor(desc));
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnSetFilter4dDescriptor(*desc, dataType<Dtype>::type,
n, c, h, w));
CUDNN_TENSOR_NCHW, n, c, h, w));
#else
CUDNN_CHECK(cudnnSetFilter4dDescriptor_v4(*desc, dataType<Dtype>::type,
CUDNN_TENSOR_NCHW, n, c, h, w));
#endif
}

template <typename Dtype>
Expand Down Expand Up @@ -123,8 +128,15 @@ inline void createPoolingDesc(cudnnPoolingDescriptor_t* pool_desc,
LOG(FATAL) << "Unknown pooling method.";
}
CUDNN_CHECK(cudnnCreatePoolingDescriptor(pool_desc));
CUDNN_CHECK(cudnnSetPooling2dDescriptor(*pool_desc, *mode, h, w,
pad_h, pad_w, stride_h, stride_w));
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnSetPooling2dDescriptor(*pool_desc, *mode,
CUDNN_PROPAGATE_NAN, h, w,
pad_h, pad_w, stride_h, stride_w));
#else
CUDNN_CHECK(cudnnSetPooling2dDescriptor_v4(*pool_desc, *mode,
CUDNN_PROPAGATE_NAN, h, w,
pad_h, pad_w, stride_h, stride_w));
#endif
}

} // namespace cudnn
Expand Down
Loading

0 comments on commit 4044ed7

Please sign in to comment.