[Caffe] Optional NNPACK integration

The approach is pretty straightforward and very similar to the CuDNN integration:

- Add some CMake magic to detect NNPACK if it's installed on the user's system.
- Add some Makefile.config options.
- Add some thread-local state (a threadpool for NNPACK) to caffe::Caffe.
- Add an NNPACK `Engine` parameter to Convolution/Pooling/InnerProduct layers.
- In LayerFactory, ifdef around whether NNPACK is enabled and dispatch to NNPACK*Layer or *Layer depending on the Engine parameter.
- In NNPACK*Layer, ensure the NNPACK preconditions hold and call the appropriate nnpack_* function, otherwise fall back to *Layer.

I'm not sure how common it is for Caffe users to run CNN inference on Haswell-and-later CPUs, but if it is, it might be a nice boost for them.
ajtulloch · Jun 2, 2016 · 05151a4 · 05151a4
1 parent 4f7b90d
commit 05151a4
Show file tree

Hide file tree

Showing 21 changed files with 892 additions and 1 deletion.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -28,6 +28,7 @@ include(cmake/ConfigGen.cmake)
 # ---[ Options
 caffe_option(CPU_ONLY "Build Caffe without CUDA support" OFF) # TODO: rename to USE_CUDA
 caffe_option(USE_CUDNN "Build Caffe with cuDNN library support" ON IF NOT CPU_ONLY)
+caffe_option(USE_NNPACK "Build Caffe with NNPACK library support" OFF)
 caffe_option(BUILD_SHARED_LIBS "Build shared libraries" ON)
 caffe_option(BUILD_python "Build Python wrapper" ON)
 set(python_version "2" CACHE STRING "Specify which Python version to use")

diff --git a/Makefile b/Makefile
@@ -328,6 +328,15 @@ ifeq ($(USE_CUDNN), 1)
  COMMON_FLAGS += -DUSE_CUDNN
 endif
 
+# NNPACK acceleration configuration.
+ifeq ($(USE_NNPACK), 1)
+ LIBRARIES += nnpack
+ COMMON_FLAGS += -DUSE_NNPACK
+ INCLUDE_DIRS += $(NNPACK_INCLUDE)
+ INCLUDE_DIRS += $(NNPACK_INCLUDE)/../third-party/pthreadpool/include
+ LIBRARY_DIRS += $(NNPACK_LIB)
+endif
+
 # configure IO libraries
 ifeq ($(USE_OPENCV), 1)
  COMMON_FLAGS += -DUSE_OPENCV

diff --git a/Makefile.config.example b/Makefile.config.example
@@ -17,6 +17,11 @@
 # possibility of simultaneous read and write
 # ALLOW_LMDB_NOLOCK := 1
 
+# NNPACK acceleration switch (uncomment to build with NNPACK).
+# USE_NNPACK := 1
+# NNPACK_INCLUDE := /path/to/NNPACK/include
+# NNPACK_LIB := /path/to/NNPACK/lib
+
 # Uncomment if you're using OpenCV 3
 # OPENCV_VERSION := 3
 

diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
@@ -67,6 +67,14 @@ if(NOT HAVE_CUDA)
  add_definitions(-DCPU_ONLY)
 endif()
 
+# ---[ NNPACK
+if(USE_NNPACK)
+ find_package(NNPACK REQUIRED)
+ include_directories(SYSTEM ${NNPACK_INCLUDE_DIR})
+ include_directories(SYSTEM ${NNPACK_INCLUDE_DIR}/../third-party/pthreadpool/include)
+ list(APPEND Caffe_LINKER_LIBS ${NNPACK_LIB})
+endif()
+
 # ---[ OpenCV
 if(USE_OPENCV)
  find_package(OpenCV QUIET COMPONENTS core highgui imgproc imgcodecs)

diff --git a/cmake/Modules/FindNNPACK.cmake b/cmake/Modules/FindNNPACK.cmake
@@ -0,0 +1,54 @@
+# FindNNPACK.cmake
+#
+# Locates the NNPACK header (nnpack.h) and library (nnpack).
+#
+# Input:
+#   NNPACK_ROOT (environment variable) - optional extra prefix to search;
+#                                        both the prefix itself and its
+#                                        include/ and lib/ subdirectories
+#                                        are tried.
+#
+# Output:
+#   NNPACK_FOUND       - true when both the header and the library were found.
+#   NNPACK_INCLUDE_DIR - directory containing nnpack.h.
+#   NNPACK_LIB         - full path of the nnpack library.
+#
+# Side effect: adds -DUSE_NNPACK to the compile definitions when found.
+
+set(NNPACK_INCLUDE_SEARCH_PATHS
+  /usr/include
+  /usr/local/include
+  /opt/NNPACK/include
+  $ENV{NNPACK_ROOT}
+  $ENV{NNPACK_ROOT}/include
+)
+
+set(NNPACK_LIB_SEARCH_PATHS
+  /lib/
+  /lib64/
+  /usr/lib
+  /usr/lib64
+  /usr/local/lib
+  /usr/local/lib64
+  /opt/NNPACK/lib
+  $ENV{NNPACK_ROOT}
+  $ENV{NNPACK_ROOT}/lib
+)
+
+find_path(NNPACK_INCLUDE_DIR NAMES nnpack.h PATHS ${NNPACK_INCLUDE_SEARCH_PATHS})
+find_library(NNPACK_LIB NAMES nnpack PATHS ${NNPACK_LIB_SEARCH_PATHS})
+
+# Let the standard helper set NNPACK_FOUND and apply the QUIET / REQUIRED
+# arguments of find_package(), so that nothing is printed for a QUIET lookup
+# and a REQUIRED lookup fails with a message naming the missing variable.
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(NNPACK DEFAULT_MSG NNPACK_LIB NNPACK_INCLUDE_DIR)
+
+if(NNPACK_FOUND)
+  add_definitions(-DUSE_NNPACK)
+  if(NOT NNPACK_FIND_QUIETLY)
+    message(STATUS "Found NNPACK libraries: ${NNPACK_LIB}")
+    message(STATUS "Found NNPACK include: ${NNPACK_INCLUDE_DIR}")
+  endif()
+endif()
+
+# Only the two cache entries created above exist; hide them from the
+# default cache view.
+mark_as_advanced(
+  NNPACK_INCLUDE_DIR
+  NNPACK_LIB
+)
diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake
@@ -136,6 +136,9 @@ function(caffe_print_configuration_summary)
  if(USE_OPENCV)
  caffe_status(" OpenCV : Yes (ver. ${OpenCV_VERSION})")
  endif()
+ if(USE_NNPACK)
+ caffe_status(" NNPACK : Yes")
+ endif()
  caffe_status(" CUDA : " HAVE_CUDA THEN "Yes (ver. ${CUDA_VERSION})" ELSE "No" )
  caffe_status("")
  if(HAVE_CUDA)

diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
@@ -18,6 +18,10 @@
 
 #include "caffe/util/device_alternate.hpp"
 
+#ifdef USE_NNPACK
+#include "caffe/nnpack_pool.h"
+#endif
+
 // Convert macro to string
 #define STRINGIFY(m) #m
 #define AS_STRING(m) STRINGIFY(m)
@@ -138,6 +142,13 @@ class Caffe {
  }
 #endif
 
+#ifdef USE_NNPACK
+ template<typename Dtype> static bool nnpack_supported();
+ inline static pthreadpool_t nnpack_threadpool() {
+ return Get().nnpack_threadpool_.pool();
+ }
+#endif
+
  // Returns the mode: running on CPU or GPU.
  inline static Brew mode() { return Get().mode_; }
  // The setters for the variables
@@ -175,6 +186,10 @@ class Caffe {
  int solver_count_;
  bool root_solver_;
 
+#ifdef USE_NNPACK
+ NNPACKPool nnpack_threadpool_;
+#endif
+
  private:
  // The private constructor to avoid duplicate instantiation.
  Caffe();

diff --git a/include/caffe/nnpack_pool.h b/include/caffe/nnpack_pool.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#include <boost/noncopyable.hpp>
+
+#include "nnpack.h"
+
+namespace caffe {
+// Owns the pthreadpool handle used for NNPACK.
+//
+// The handle is created in the constructor and destroyed in the destructor.
+// No pool is created when only one thread is requested, in which case
+// pool() returns NULL.
+//
+// NOTE(review): mkl_get_max_threads() is called under USE_MKL, but this
+// header only includes boost/noncopyable.hpp and nnpack.h -- confirm that the
+// MKL declaration is in scope wherever this header gets included.
+class NNPACKPool : public boost::noncopyable {
+ public:
+  // Sizes the pool from MKL's thread count when built with USE_MKL. Without
+  // MKL the count is fixed at 1, so no pool is created.
+  NNPACKPool() : pool_(NULL) {
+#ifdef USE_MKL
+    const size_t thread_count = mkl_get_max_threads();
+#else
+    // Can we do better here?
+    const size_t thread_count = 1;
+#endif
+    if (thread_count > 1) {
+      pool_ = pthreadpool_create(thread_count);
+    }
+  }
+
+  // Releases the pool if one was created.
+  ~NNPACKPool() {
+    if (pool_ != NULL) {
+      pthreadpool_destroy(pool_);
+      pool_ = NULL;
+    }
+  }
+
+  // Returns the underlying handle; NULL when no pool was created.
+  pthreadpool_t pool() { return pool_; }
+
+ private:
+  pthreadpool_t pool_;
+};
+
+}
diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp
@@ -49,6 +49,19 @@ void GlobalInit(int* pargc, char*** pargv) {
  ::google::InstallFailureSignalHandler();
 }
 
+#ifdef USE_NNPACK
+// Double precision: NNPACK is unconditionally reported as unsupported.
+template<>
+bool Caffe::nnpack_supported<double>() {
+  return false;
+}
+
+// Single precision: NNPACK is reported as supported when nnp_initialize()
+// returns nnp_status_success. The status lives in a function-local static,
+// so nnp_initialize() is invoked on the first call only and its result is
+// reused by later calls.
+template<>
+bool Caffe::nnpack_supported<float>() {
+  static const enum nnp_status init_status = nnp_initialize();
+  const bool initialized = (init_status == nnp_status_success);
+  return initialized;
+}
+#endif
+
 #ifdef CPU_ONLY // CPU-only Caffe.
 
 Caffe::Caffe()

diff --git a/src/caffe/layer_factory.cpp b/src/caffe/layer_factory.cpp
@@ -16,6 +16,8 @@
 #include "caffe/layers/tanh_layer.hpp"
 #include "caffe/proto/caffe.pb.h"
 
+#include "caffe/layers/inner_product_layer.hpp"
+
 #ifdef USE_CUDNN
 #include "caffe/layers/cudnn_conv_layer.hpp"
 #include "caffe/layers/cudnn_lcn_layer.hpp"
@@ -27,6 +29,12 @@
 #include "caffe/layers/cudnn_tanh_layer.hpp"
 #endif
 
+#ifdef USE_NNPACK
+#include "caffe/layers/nnpack_convolution_layer.hpp"
+#include "caffe/layers/nnpack_pooling_layer.hpp"
+#include "caffe/layers/nnpack_inner_product_layer.hpp"
+#endif
+
 #ifdef WITH_PYTHON_LAYER
 #include "caffe/layers/python_layer.hpp"
 #endif
@@ -57,6 +65,17 @@ shared_ptr<Layer<Dtype> > GetConvolutionLayer(
  }
  if (engine == ConvolutionParameter_Engine_CAFFE) {
  return shared_ptr<Layer<Dtype> >(new ConvolutionLayer<Dtype>(param));
+#ifdef USE_NNPACK
+ } else if (engine == ConvolutionParameter_Engine_NNPACK) {
+ // If we're in CPU mode and on a supported processor, we can use NNPACK.
+ // Otherwise, we can't fall through (since we'd get an unknown layer),
+ // so just return the default ConvolutionLayer.
+ if ((Caffe::mode() == Caffe::CPU) && Caffe::nnpack_supported<Dtype>()) {
+ return shared_ptr<Layer<Dtype> >(
+ new NNPackConvolutionLayer<Dtype>(param));
+ }
+ return shared_ptr<Layer<Dtype> >(new ConvolutionLayer<Dtype>(param));
+#endif
 #ifdef USE_CUDNN
  } else if (engine == ConvolutionParameter_Engine_CUDNN) {
  if (use_dilation) {
@@ -84,6 +103,16 @@ shared_ptr<Layer<Dtype> > GetPoolingLayer(const LayerParameter& param) {
  }
  if (engine == PoolingParameter_Engine_CAFFE) {
  return shared_ptr<Layer<Dtype> >(new PoolingLayer<Dtype>(param));
+#ifdef USE_NNPACK
+ } else if (engine == PoolingParameter_Engine_NNPACK) {
+ // If we're in CPU mode and on a supported processor, we can use NNPACK.
+ // Otherwise, we can't fall through (since we'd get an unknown layer),
+ // so just return the default PoolingLayer.
+ if ((Caffe::mode() == Caffe::CPU) && Caffe::nnpack_supported<Dtype>()) {
+ return shared_ptr<Layer<Dtype> >(new NNPackPoolingLayer<Dtype>(param));
+ }
+ return shared_ptr<Layer<Dtype> >(new PoolingLayer<Dtype>(param));
+#endif
 #ifdef USE_CUDNN
  } else if (engine == PoolingParameter_Engine_CUDNN) {
  if (param.top_size() > 1) {
@@ -109,6 +138,34 @@ shared_ptr<Layer<Dtype> > GetPoolingLayer(const LayerParameter& param) {
 
 REGISTER_LAYER_CREATOR(Pooling, GetPoolingLayer);
 
+// Get inner product layer according to engine.
+//
+// DEFAULT is treated as CAFFE. The NNPACK engine is only recognised when
+// built with USE_NNPACK; any engine that is not handled here is reported
+// through LOG(FATAL).
+template <typename Dtype>
+shared_ptr<Layer<Dtype> > GetInnerProductLayer(const LayerParameter& param) {
+  InnerProductParameter_Engine engine = param.inner_product_param().engine();
+  if (engine == InnerProductParameter_Engine_DEFAULT) {
+    engine = InnerProductParameter_Engine_CAFFE;
+  }
+
+  if (engine == InnerProductParameter_Engine_CAFFE) {
+    return shared_ptr<Layer<Dtype> >(new InnerProductLayer<Dtype>(param));
+  }
+
+#ifdef USE_NNPACK
+  if (engine == InnerProductParameter_Engine_NNPACK) {
+    // NNPACK is used only in CPU mode and when nnpack_supported<Dtype>()
+    // holds. Otherwise hand back the stock InnerProductLayer rather than
+    // reaching the "unknown engine" failure below.
+    const bool use_nnpack =
+        (Caffe::mode() == Caffe::CPU) && Caffe::nnpack_supported<Dtype>();
+    if (!use_nnpack) {
+      return shared_ptr<Layer<Dtype> >(new InnerProductLayer<Dtype>(param));
+    }
+    return shared_ptr<Layer<Dtype> >(
+        new NNPackInnerProductLayer<Dtype>(param));
+  }
+#endif
+
+  LOG(FATAL) << "Layer " << param.name() << " has unknown engine.";
+  // Keeps every control path returning a value.
+  return shared_ptr<Layer<Dtype> >();
+}
+
+REGISTER_LAYER_CREATOR(InnerProduct, GetInnerProductLayer);
+
 // Get LRN layer according to engine
 template <typename Dtype>
 shared_ptr<Layer<Dtype> > GetLRNLayer(const LayerParameter& param) {

diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp
@@ -145,6 +145,5 @@ STUB_GPU(InnerProductLayer);
 #endif
 
 INSTANTIATE_CLASS(InnerProductLayer);
-REGISTER_LAYER_CLASS(InnerProduct);
 
 } // namespace caffe