Backport #16798, #16836 and #16838 to 1.6 (#16874)
* Add unoptimized symbol to executor for sharing (#16798)

* Add unoptimized symbol to executor for sharing

* Copy the symbol in Reshape

* Added test for multiple reshapes

* Mixed precision binary op backward (use in) for numpy (#16791)

* mixed precision binary op backward

* reduce unix cpu runtime

* USE_NVRTC -> ENABLE_CUDA_RTC to fix maven build.  Add compile-guard to fusion. (#16838)

* Rename USE_NVRTC -> ENABLE_CUDA_RTC to fix maven build.  Compile-guard fusion framework.

* Fix fusion-not-supported warning.

* Fix compile guards

* Fix cmake build so -DMXNET_ENABLE_CUDA_RTC=1 is passed to nvcc

* Minimize side-effects of prev change

* Fix InferAttr/InferShapeAttr not calling inference for all nodes in a graph (#16836)

* Fix the attribute inference omitting nodes

* Add test

* Cleaning

* Fix lint

* Fix TransposeShape

* Fix WhileLoopType

* Changing a/b test for fusion to a/(b+1) to increase numerical stability (see the sketch after this commit message)

* Revert "Mixed precison binary op backward (use in) for numpy (#16791)"

This reverts commit 8b58b78.
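A hedged illustration of the a/(b+1) change listed above (the actual fusion test inputs are an assumption here): if b can get arbitrarily close to zero, the gradient of a/b blows up and makes numerical gradient comparisons flaky, whereas a/(b+1) keeps the denominator bounded away from zero for non-negative b.

```cpp
// Hedged illustration of the a/b -> a/(b+1) change: with b near zero,
// d(a/b)/db = -a/b^2 explodes, while d(a/(b+1))/db stays O(1) for b >= 0.
// (That the test draws non-negative b is an assumption, not shown in this diff.)
#include <cstdio>
#include <cmath>

int main() {
  double a = 1.0, b = 1e-3;
  std::printf("d(a/b)/db     = %g\n", -a / (b * b));             // ~ -1e6
  std::printf("d(a/(b+1))/db = %g\n", -a / std::pow(b + 1, 2));  // ~ -1
  return 0;
}
```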
ptrendx committed Nov 22, 2019
1 parent 200f0ec commit e73c186
Showing 21 changed files with 266 additions and 107 deletions.
6 changes: 4 additions & 2 deletions CMakeLists.txt
@@ -633,14 +633,17 @@ if(USE_CUDA)
else()
list(APPEND CUDA_INCLUDE_DIRS ${INCLUDE_DIRECTORIES})
# define preprocessor macro so that we will not include the generated forcelink header
if(ENABLE_CUDA_RTC)
add_definitions(-DMXNET_ENABLE_CUDA_RTC=1)
endif()
# Create '.cmake' files for cuda compiles given definitions added thus far
mshadow_cuda_compile(cuda_objs ${CUDA})
if(MSVC)
if(ENABLE_CUDA_RTC)
FIND_LIBRARY(CUDA_nvrtc_LIBRARY nvrtc "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" "${CUDA_TOOLKIT_ROOT_DIR}/lib/win32")
list(APPEND mxnet_LINKER_LIBS ${CUDA_nvrtc_LIBRARY})
set(CUDA_cuda_LIBRARY "${CUDA_nvrtc_LIBRARY}/../cuda.lib")
list(APPEND mxnet_LINKER_LIBS ${CUDA_cuda_LIBRARY})
add_definitions(-DMXNET_ENABLE_CUDA_RTC=1)
endif()
FIND_LIBRARY(CUDA_cufft_LIBRARY nvrtc "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" "${CUDA_TOOLKIT_ROOT_DIR}/lib/win32")
list(APPEND mxnet_LINKER_LIBS "${CUDA_cufft_LIBRARY}/../cufft.lib") # For fft operator
@@ -652,7 +655,6 @@ if(USE_CUDA)
list(APPEND mxnet_LINKER_LIBS cufft cusolver)
if(ENABLE_CUDA_RTC)
list(APPEND mxnet_LINKER_LIBS nvrtc cuda)
add_definitions(-DMXNET_ENABLE_CUDA_RTC=1)
endif()
link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64")
endif()
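A minimal sketch (not part of this diff) of the compile guard that the -DMXNET_ENABLE_CUDA_RTC=1 definition above makes possible; in `#if`, an undefined macro evaluates to 0, so NVRTC-dependent code compiles out cleanly when ENABLE_CUDA_RTC is OFF:

```cpp
// Illustrative only: shows the guard pattern, not actual MXNet source.
// CMake defines MXNET_ENABLE_CUDA_RTC=1 only when ENABLE_CUDA_RTC is ON.
#include <iostream>

int main() {
#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC && !defined(_WIN32)
  std::cout << "Pointwise fusion compiled in (NVRTC available at build time)\n";
#else
  std::cout << "Pointwise fusion compiled out (unfused fallback path)\n";
#endif
  return 0;
}
```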
2 changes: 1 addition & 1 deletion appveyor.yml
@@ -69,7 +69,7 @@ before_build:
set OpenCV_DIR=%APPVEYOR_BUILD_FOLDER%/%MXNET_OPENCV_DIR%/build
cmake .. -DOPENCV_DIR=%OpenCV_DIR% -DUSE_PROFILER=1 -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -G "Visual Studio 12 2013 Win64"
cmake .. -DOPENCV_DIR=%OpenCV_DIR% -DUSE_PROFILER=1 -DUSE_CUDA=0 -DUSE_CUDNN=0 -DENABLE_CUDA_RTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -G "Visual Studio 12 2013 Win64"
build_script:
- cmd: >-
12 changes: 6 additions & 6 deletions ci/build_windows.py
@@ -54,7 +54,7 @@ class BuildFlavour(Enum):
'WIN_CPU': (
'-DUSE_CUDA=OFF '
'-DUSE_CUDNN=OFF '
'-DUSE_NVRTC=OFF '
'-DENABLE_CUDA_RTC=OFF '
'-DUSE_OPENCV=ON '
'-DUSE_OPENMP=ON '
'-DUSE_BLAS=open '
@@ -67,7 +67,7 @@ class BuildFlavour(Enum):
, 'WIN_CPU_MKLDNN': (
'-DUSE_CUDA=OFF '
'-DUSE_CUDNN=OFF '
'-DUSE_NVRTC=OFF '
'-DENABLE_CUDA_RTC=OFF '
'-DUSE_OPENCV=ON '
'-DUSE_OPENMP=ON '
'-DUSE_BLAS=open '
@@ -80,7 +80,7 @@ class BuildFlavour(Enum):
, 'WIN_CPU_MKLDNN_MKL': (
'-DUSE_CUDA=OFF '
'-DUSE_CUDNN=OFF '
'-DUSE_NVRTC=OFF '
'-DENABLE_CUDA_RTC=OFF '
'-DUSE_OPENCV=ON '
'-DUSE_OPENMP=ON '
'-DUSE_BLAS=mkl '
@@ -93,7 +93,7 @@ class BuildFlavour(Enum):
, 'WIN_CPU_MKL': (
'-DUSE_CUDA=OFF '
'-DUSE_CUDNN=OFF '
'-DUSE_NVRTC=OFF '
'-DENABLE_CUDA_RTC=OFF '
'-DUSE_OPENCV=ON '
'-DUSE_OPENMP=ON '
'-DUSE_BLAS=mkl '
@@ -106,7 +106,7 @@ class BuildFlavour(Enum):
, 'WIN_GPU': (
'-DUSE_CUDA=ON '
'-DUSE_CUDNN=ON '
'-DUSE_NVRTC=ON '
'-DENABLE_CUDA_RTC=ON '
'-DUSE_OPENCV=ON '
'-DUSE_OPENMP=ON '
'-DUSE_BLAS=open '
@@ -122,7 +122,7 @@ class BuildFlavour(Enum):
, 'WIN_GPU_MKLDNN': (
'-DUSE_CUDA=ON '
'-DUSE_CUDNN=ON '
'-DUSE_NVRTC=ON '
'-DENABLE_CUDA_RTC=ON '
'-DUSE_OPENCV=ON '
'-DUSE_OPENMP=ON '
'-DUSE_BLAS=open '
2 changes: 1 addition & 1 deletion make/maven/maven_darwin_mkl.mk
@@ -77,7 +77,7 @@ USE_CUDNN = 0
# CUDA_ARCH :=

# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
USE_NVRTC = 0
ENABLE_CUDA_RTC = 0

# use openmp for parallelization
USE_OPENMP = 0
2 changes: 1 addition & 1 deletion make/maven/maven_linux_cu90mkl.mk
@@ -80,7 +80,7 @@ USE_NCCL = 1

# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
USE_NVTX=1
USE_NVRTC = 1
ENABLE_CUDA_RTC = 1

# use openmp for parallelization
USE_OPENMP = 1
2 changes: 1 addition & 1 deletion make/maven/maven_linux_cu92mkl.mk
@@ -80,7 +80,7 @@ USE_NCCL = 1

# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
USE_NVTX=1
USE_NVRTC = 1
ENABLE_CUDA_RTC = 1

# use openmp for parallelization
USE_OPENMP = 1
2 changes: 1 addition & 1 deletion make/maven/maven_linux_mkl.mk
@@ -76,7 +76,7 @@ USE_CUDNN = 0
# CUDA_ARCH :=

# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
USE_NVRTC = 0
ENABLE_CUDA_RTC = 0

# use openmp for parallelization
USE_OPENMP = 1
5 changes: 5 additions & 0 deletions src/executor/exec_pass.h
@@ -221,6 +221,11 @@ Graph FusePointwiseForward(Graph&& g);
*/
Graph FusePointwiseBackward(Graph&& g);

/*!
* \brief Issue a one-time warning that fusion is not possible for this platform or build.
*/
void WarnFusionNotSupported();

/*!
* \brief Infer shapes in the graph given the information.
* \param graph The input graph.
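A minimal sketch of what a one-time warning helper like the WarnFusionNotSupported() declared above could look like; the actual definition is not part of this diff, so the message text and the std::call_once mechanism are assumptions:

```cpp
// Assumed sketch, not the real MXNet implementation: emit the warning exactly
// once no matter how many executors are created without fusion support.
#include <iostream>
#include <mutex>

void WarnFusionNotSupported() {
  static std::once_flag warned;
  std::call_once(warned, []() {
    std::cerr << "Warning: MXNET_USE_FUSION was requested, but this build does "
                 "not support pointwise fusion (requires CUDA with "
                 "MXNET_ENABLE_CUDA_RTC=1 on a non-Windows platform).\n";
  });
}

int main() {
  WarnFusionNotSupported();
  WarnFusionNotSupported();  // second call is a no-op: the warning prints once
  return 0;
}
```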
29 changes: 19 additions & 10 deletions src/executor/graph_executor.cc
@@ -50,7 +50,7 @@ static const std::string GetDefaultSubgraphBackend() {
#endif
}

GraphExecutor::GraphExecutor() {
GraphExecutor::GraphExecutor(const nnvm::Symbol& symbol) {
log_verbose_ = dmlc::GetEnv("MXNET_EXEC_VERBOSE_LOGGING", false);
need_grad_ = false;
is_dynamic_ = false;
@@ -60,6 +60,7 @@ GraphExecutor::GraphExecutor() {
LOG(INFO) << "MXNET_SUBGRAPH_BACKEND=NONE is detected, subgraph backend is not in use";
}
engine_ref_ = Engine::_GetSharedRef();
symbol_ = symbol.Copy();
}

GraphExecutor::~GraphExecutor() {
@@ -890,10 +891,9 @@ Executor* GraphExecutor::Reshape(const bool partial_shaping,
std::vector<NDArray>* arg_grads,
std::vector<NDArray>* aux_states) {
nnvm::Graph g;
g.outputs = std::vector<nnvm::NodeEntry>(graph_.outputs.begin(),
graph_.outputs.begin() + num_forward_outputs_);
nnvm::Symbol symbol;
symbol.outputs = g.outputs;
symbol.outputs = symbol_.outputs;
g.outputs = symbol_.outputs;
const nnvm::IndexedGraph& idx = g.indexed_graph();
mxnet::ShapeVector arg_shapes(idx.input_nodes().size(), mxnet::TShape());
for (size_t i = 0; i < num_forward_inputs_; ++i) {
@@ -977,8 +977,8 @@
}
}
}
auto exec = new GraphExecutor();
exec->Init(symbol, default_ctx, ctx_map,
auto exec = new GraphExecutor(symbol);
exec->Init(symbol.Copy(), default_ctx, ctx_map,
*in_args, *arg_grads, grad_req_types, *aux_states,
this);
return exec;
@@ -1001,7 +1001,7 @@ Graph GraphExecutor::InitGraph(nnvm::Symbol symbol,
// setup gradient
nnvm::Graph g = InitFullGraph(symbol, grad_req_types);

#if MXNET_USE_CUDA && !defined(_WIN32)
#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC && !defined(_WIN32)
if (default_ctx.dev_mask() == Context::kGPU && dmlc::GetEnv("MXNET_USE_FUSION", true)) {
nnvm::Graph unoptimized_graph;
common::CopyGraph(&unoptimized_graph, g, false);
Expand Down Expand Up @@ -1034,7 +1034,12 @@ Graph GraphExecutor::InitGraph(nnvm::Symbol symbol,
<< "Graph contains duplicate names for some of its inputs - fusion is NOT enabled!";
}
}
#endif // MXNET_USE_CUDA
#else
// Only warn user if MXNET_USE_FUSION env var is explicitly set
if (default_ctx.dev_mask() == Context::kGPU && dmlc::GetEnv("MXNET_USE_FUSION", false)) {
WarnFusionNotSupported();
}
#endif // MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC && !defined(_WIN32)

// create "device" and "context" attrs for the graph
g = AssignContext(g, default_ctx, ctx_map,
@@ -1969,7 +1974,7 @@ Executor *Executor::SimpleBind(nnvm::Symbol symbol,
std::vector<NDArray>* aux_states,
std::unordered_map<std::string, NDArray>* shared_buffer,
Executor* shared_exec) {
auto exec = new exec::GraphExecutor();
auto exec = new exec::GraphExecutor(symbol);
bool init = false;
if (!exec->subgraph_property().empty()) {
static int verbose = dmlc::GetEnv("MXNET_SUBGRAPH_VERBOSE", 1);
@@ -1989,6 +1994,8 @@ Executor *Executor::SimpleBind(nnvm::Symbol symbol,
symbol = exec::BuildSubgraph(symbol, backend, arg_shape_map, arg_dtype_map, arg_stype_map,
default_ctx, group2ctx, &tmp_in_arg_ctxes, &tmp_arg_grad_ctxes,
&tmp_grad_req_types, &tmp_aux_state_ctxes, verbose);
// Subgraph cannot be recreated from unoptimized symbol
exec = new exec::GraphExecutor(symbol);
exec->Init(symbol.Copy(), default_ctx, group2ctx, tmp_in_arg_ctxes, tmp_arg_grad_ctxes,
tmp_aux_state_ctxes, arg_shape_map, arg_dtype_map, arg_stype_map,
tmp_grad_req_types, shared_arg_names, &tmp_in_args, &tmp_arg_grads,
@@ -2043,7 +2050,7 @@ Executor *Executor::Bind(nnvm::Symbol symbol,
const std::vector<OpReqType> &grad_req_type,
const std::vector<NDArray> &aux_states,
Executor* shared_exec) {
auto exec = new exec::GraphExecutor();
auto exec = new exec::GraphExecutor(symbol);
static int verbose = dmlc::GetEnv("MXNET_SUBGRAPH_VERBOSE", 1);
std::vector<NDArray> tmp_in_args = in_args;
std::vector<NDArray> tmp_arg_grad_store = arg_grad_store;
Expand All @@ -2058,6 +2065,8 @@ Executor *Executor::Bind(nnvm::Symbol symbol,
symbol = exec::BuildSubgraph(symbol, backend, default_ctx, group2ctx, &tmp_in_args,
&tmp_arg_grad_store, &tmp_grad_req_type, &tmp_aux_states,
verbose);
// Subgraph cannot be recreated from unoptimized symbol
exec = new exec::GraphExecutor(symbol);
}
}
exec->Init(symbol.Copy(), default_ctx, group2ctx, tmp_in_args, tmp_arg_grad_store,
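A simplified sketch (stand-in types, not MXNet code) of the ownership pattern introduced in graph_executor.cc above: the executor copies the unoptimized symbol at construction, and Reshape() rebuilds a child executor from that pristine copy rather than from the already-optimized (possibly fused) graph_.outputs:

```cpp
// Assumed simplification for illustration; real code uses nnvm::Symbol,
// whose Copy() performs a deep copy of the graph nodes.
#include <string>
#include <vector>

struct Symbol {
  std::vector<std::string> outputs;      // stand-in for nnvm::NodeEntry list
  Symbol Copy() const { return *this; }  // deep copy in the real nnvm::Symbol
};

class GraphExecutorSketch {
 public:
  explicit GraphExecutorSketch(const Symbol& symbol) : symbol_(symbol.Copy()) {}

  GraphExecutorSketch* Reshape() const {
    Symbol fresh;
    fresh.outputs = symbol_.outputs;        // reuse the pristine, unoptimized graph
    return new GraphExecutorSketch(fresh);  // child keeps its own copy as well
  }

 private:
  Symbol symbol_;  // unoptimized copy, shared with child executors
};

int main() {
  Symbol s;
  s.outputs = {"fc_output"};
  GraphExecutorSketch exec(s);
  delete exec.Reshape();
  return 0;
}
```

This also explains why SimpleBind/Bind in the diff construct a fresh GraphExecutor(symbol) after BuildSubgraph: as the added comments note, a subgraph-rewritten symbol cannot be recreated from the unoptimized one.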
5 changes: 4 additions & 1 deletion src/executor/graph_executor.h
@@ -58,7 +58,7 @@ class GraphExecutor : public Executor {
public:
using Executor::MonitorCallback;

GraphExecutor();
explicit GraphExecutor(const nnvm::Symbol& symbol);
virtual ~GraphExecutor();
void Forward(bool is_train) override;
void PartialForward(bool is_train, int step, int *step_left) override;
@@ -267,6 +267,9 @@
std::string subgraph_property_;
// ref of engine
std::shared_ptr<Engine> engine_ref_;
// Unoptimized copy of the symbol for sharing with
// child executors
nnvm::Symbol symbol_;
};

} // namespace exec