Backport #16798, #16836 and #16838 to 1.6 (#16874)
* Add unoptimized symbol to executor for sharing (#16798)

* Add unoptimized symbol to executor for sharing

* Copy the symbol in Reshape

* Added test for multiple reshapes

* Mixed precision binary op backward (use in) for numpy (#16791)

* mixed precision binary op backward

* reduce unix cpu runtime

* USE_NVRTC -> ENABLE_CUDA_RTC to fix maven build.  Add compile-guard to fusion. (#16838)

* Rename USE_NVRTC -> ENABLE_CUDA_RTC to fix maven build.  Compile-guard fusion framework.

* Fix fusion-not-supported warning.

* Fix compile guards

* Fix cmake build so -DMXNET_ENABLE_CUDA_RTC=1 is passed to nvcc

* Minimize side-effects of prev change

* Fix InferAttr/InferShapeAttr not calling inference for all nodes in a graph (#16836)

* Fix the attribute inference omitting nodes

* Add test

* Cleaning

* Fix lint

* Fix TransposeShape

* Fix WhileLoopType

* Changing a/b test for fusion to a/(b+1) to increase numerical stability (see the sketch after this commit message)

* Revert "Mixed precison binary op backward (use in) for numpy (#16791)"

This reverts commit 8b58b78.
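A hedged illustration of the a/(b+1) change listed above (the actual fusion test inputs are an assumption here): if b can get arbitrarily close to zero, the gradient of a/b blows up and makes numerical gradient comparisons flaky, whereas a/(b+1) keeps the denominator bounded away from zero for non-negative b.

```cpp
// Hedged illustration of the a/b -> a/(b+1) change: with b near zero,
// d(a/b)/db = -a/b^2 explodes, while d(a/(b+1))/db stays O(1) for b >= 0.
// (That the test draws non-negative b is an assumption, not shown in this diff.)
#include <cstdio>
#include <cmath>

int main() {
  double a = 1.0, b = 1e-3;
  std::printf("d(a/b)/db     = %g\n", -a / (b * b));             // ~ -1e6
  std::printf("d(a/(b+1))/db = %g\n", -a / std::pow(b + 1, 2));  // ~ -1
  return 0;
}
```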
ptrendx committed Nov 22, 2019
1 parent 200f0ec commit e73c186
Showing 21 changed files with 266 additions and 107 deletions.
6 changes: 4 additions & 2 deletions CMakeLists.txt
@@ -633,14 +633,17 @@ if(USE_CUDA)
else()
list(APPEND CUDA_INCLUDE_DIRS ${INCLUDE_DIRECTORIES})
# define preprocessor macro so that we will not include the generated forcelink header
if(ENABLE_CUDA_RTC)
add_definitions(-DMXNET_ENABLE_CUDA_RTC=1)
endif()
# Create '.cmake' files for cuda compiles given definitions added thus far
mshadow_cuda_compile(cuda_objs ${CUDA})
if(MSVC)
if(ENABLE_CUDA_RTC)
FIND_LIBRARY(CUDA_nvrtc_LIBRARY nvrtc "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" "${CUDA_TOOLKIT_ROOT_DIR}/lib/win32")
list(APPEND mxnet_LINKER_LIBS ${CUDA_nvrtc_LIBRARY})
set(CUDA_cuda_LIBRARY "${CUDA_nvrtc_LIBRARY}/../cuda.lib")
list(APPEND mxnet_LINKER_LIBS ${CUDA_cuda_LIBRARY})
add_definitions(-DMXNET_ENABLE_CUDA_RTC=1)
endif()
FIND_LIBRARY(CUDA_cufft_LIBRARY nvrtc "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" "${CUDA_TOOLKIT_ROOT_DIR}/lib/win32")
list(APPEND mxnet_LINKER_LIBS "${CUDA_cufft_LIBRARY}/../cufft.lib") # For fft operator
@@ -652,7 +655,6 @@ if(USE_CUDA)
list(APPEND mxnet_LINKER_LIBS cufft cusolver)
if(ENABLE_CUDA_RTC)
list(APPEND mxnet_LINKER_LIBS nvrtc cuda)
add_definitions(-DMXNET_ENABLE_CUDA_RTC=1)
endif()
link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64")
endif()
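A minimal sketch (not part of this diff) of the compile guard that the -DMXNET_ENABLE_CUDA_RTC=1 definition above makes possible; in `#if`, an undefined macro evaluates to 0, so NVRTC-dependent code compiles out cleanly when ENABLE_CUDA_RTC is OFF:

```cpp
// Illustrative only: shows the guard pattern, not actual MXNet source.
// CMake defines MXNET_ENABLE_CUDA_RTC=1 only when ENABLE_CUDA_RTC is ON.
#include <iostream>

int main() {
#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC && !defined(_WIN32)
  std::cout << "Pointwise fusion compiled in (NVRTC available at build time)\n";
#else
  std::cout << "Pointwise fusion compiled out (unfused fallback path)\n";
#endif
  return 0;
}
```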
2 changes: 1 addition & 1 deletion appveyor.yml
@@ -69,7 +69,7 @@ before_build:
set OpenCV_DIR=%APPVEYOR_BUILD_FOLDER%/%MXNET_OPENCV_DIR%/build
cmake .. -DOPENCV_DIR=%OpenCV_DIR% -DUSE_PROFILER=1 -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -G "Visual Studio 12 2013 Win64"
cmake .. -DOPENCV_DIR=%OpenCV_DIR% -DUSE_PROFILER=1 -DUSE_CUDA=0 -DUSE_CUDNN=0 -DENABLE_CUDA_RTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -G "Visual Studio 12 2013 Win64"
build_script:
- cmd: >-
12 changes: 6 additions & 6 deletions ci/build_windows.py
@@ -54,7 +54,7 @@ class BuildFlavour(Enum):
'WIN_CPU': (
'-DUSE_CUDA=OFF '
'-DUSE_CUDNN=OFF '
'-DUSE_NVRTC=OFF '
'-DENABLE_CUDA_RTC=OFF '
'-DUSE_OPENCV=ON '
'-DUSE_OPENMP=ON '
'-DUSE_BLAS=open '
@@ -67,7 +67,7 @@ class BuildFlavour(Enum):
, 'WIN_CPU_MKLDNN': (
'-DUSE_CUDA=OFF '
'-DUSE_CUDNN=OFF '
'-DUSE_NVRTC=OFF '
'-DENABLE_CUDA_RTC=OFF '
'-DUSE_OPENCV=ON '
'-DUSE_OPENMP=ON '
'-DUSE_BLAS=open '
@@ -80,7 +80,7 @@ class BuildFlavour(Enum):
, 'WIN_CPU_MKLDNN_MKL': (
'-DUSE_CUDA=OFF '
'-DUSE_CUDNN=OFF '
'-DUSE_NVRTC=OFF '
'-DENABLE_CUDA_RTC=OFF '
'-DUSE_OPENCV=ON '
'-DUSE_OPENMP=ON '
'-DUSE_BLAS=mkl '
@@ -93,7 +93,7 @@ class BuildFlavour(Enum):
, 'WIN_CPU_MKL': (
'-DUSE_CUDA=OFF '
'-DUSE_CUDNN=OFF '
'-DUSE_NVRTC=OFF '
'-DENABLE_CUDA_RTC=OFF '
'-DUSE_OPENCV=ON '
'-DUSE_OPENMP=ON '
'-DUSE_BLAS=mkl '
@@ -106,7 +106,7 @@ class BuildFlavour(Enum):
, 'WIN_GPU': (
'-DUSE_CUDA=ON '
'-DUSE_CUDNN=ON '
'-DUSE_NVRTC=ON '
'-DENABLE_CUDA_RTC=ON '
'-DUSE_OPENCV=ON '
'-DUSE_OPENMP=ON '
'-DUSE_BLAS=open '
@@ -122,7 +122,7 @@ class BuildFlavour(Enum):
, 'WIN_GPU_MKLDNN': (
'-DUSE_CUDA=ON '
'-DUSE_CUDNN=ON '
'-DUSE_NVRTC=ON '
'-DENABLE_CUDA_RTC=ON '
'-DUSE_OPENCV=ON '
'-DUSE_OPENMP=ON '
'-DUSE_BLAS=open '
2 changes: 1 addition & 1 deletion make/maven/maven_darwin_mkl.mk
@@ -77,7 +77,7 @@ USE_CUDNN = 0
# CUDA_ARCH :=

# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
USE_NVRTC = 0
ENABLE_CUDA_RTC = 0

# use openmp for parallelization
USE_OPENMP = 0
2 changes: 1 addition & 1 deletion make/maven/maven_linux_cu90mkl.mk
@@ -80,7 +80,7 @@ USE_NCCL = 1

# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
USE_NVTX=1
USE_NVRTC = 1
ENABLE_CUDA_RTC = 1

# use openmp for parallelization
USE_OPENMP = 1
2 changes: 1 addition & 1 deletion make/maven/maven_linux_cu92mkl.mk
@@ -80,7 +80,7 @@ USE_NCCL = 1

# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
USE_NVTX=1
USE_NVRTC = 1
ENABLE_CUDA_RTC = 1

# use openmp for parallelization
USE_OPENMP = 1
2 changes: 1 addition & 1 deletion make/maven/maven_linux_mkl.mk
@@ -76,7 +76,7 @@ USE_CUDNN = 0
# CUDA_ARCH :=

# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
USE_NVRTC = 0
ENABLE_CUDA_RTC = 0

# use openmp for parallelization
USE_OPENMP = 1
5 changes: 5 additions & 0 deletions src/executor/exec_pass.h
@@ -221,6 +221,11 @@ Graph FusePointwiseForward(Graph&& g);
*/
Graph FusePointwiseBackward(Graph&& g);

/*!
* \brief Issue a one-time warning that fusion is not possible for this platform or build.
*/
void WarnFusionNotSupported();

/*!
* \brief Infer shapes in the graph given the information.
* \param graph The input graph.
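A minimal sketch of what a one-time warning helper like the WarnFusionNotSupported() declared above could look like; the actual definition is not part of this diff, so the message text and the std::call_once mechanism are assumptions:

```cpp
// Assumed sketch, not the real MXNet implementation: emit the warning exactly
// once no matter how many executors are created without fusion support.
#include <iostream>
#include <mutex>

void WarnFusionNotSupported() {
  static std::once_flag warned;
  std::call_once(warned, []() {
    std::cerr << "Warning: MXNET_USE_FUSION was requested, but this build does "
                 "not support pointwise fusion (requires CUDA with "
                 "MXNET_ENABLE_CUDA_RTC=1 on a non-Windows platform).\n";
  });
}

int main() {
  WarnFusionNotSupported();
  WarnFusionNotSupported();  // second call is a no-op: the warning prints once
  return 0;
}
```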
29 changes: 19 additions & 10 deletions src/executor/graph_executor.cc
@@ -50,7 +50,7 @@ static const std::string GetDefaultSubgraphBackend() {
#endif
}

GraphExecutor::GraphExecutor() {
GraphExecutor::GraphExecutor(const nnvm::Symbol& symbol) {
log_verbose_ = dmlc::GetEnv("MXNET_EXEC_VERBOSE_LOGGING", false);
need_grad_ = false;
is_dynamic_ = false;
@@ -60,6 +60,7 @@ GraphExecutor::GraphExecutor() {
LOG(INFO) << "MXNET_SUBGRAPH_BACKEND=NONE is detected, subgraph backend is not in use";
}
engine_ref_ = Engine::_GetSharedRef();
symbol_ = symbol.Copy();
}

GraphExecutor::~GraphExecutor() {
@@ -890,10 +891,9 @@ Executor* GraphExecutor::Reshape(const bool partial_shaping,
std::vector<NDArray>* arg_grads,
std::vector<NDArray>* aux_states) {
nnvm::Graph g;
g.outputs = std::vector<nnvm::NodeEntry>(graph_.outputs.begin(),
graph_.outputs.begin() + num_forward_outputs_);
nnvm::Symbol symbol;
symbol.outputs = g.outputs;
symbol.outputs = symbol_.outputs;
g.outputs = symbol_.outputs;
const nnvm::IndexedGraph& idx = g.indexed_graph();
mxnet::ShapeVector arg_shapes(idx.input_nodes().size(), mxnet::TShape());
for (size_t i = 0; i < num_forward_inputs_; ++i) {
@@ -977,8 +977,8 @@
}
}
}
auto exec = new GraphExecutor();
exec->Init(symbol, default_ctx, ctx_map,
auto exec = new GraphExecutor(symbol);
exec->Init(symbol.Copy(), default_ctx, ctx_map,
*in_args, *arg_grads, grad_req_types, *aux_states,
this);
return exec;
@@ -1001,7 +1001,7 @@ Graph GraphExecutor::InitGraph(nnvm::Symbol symbol,
// setup gradient
nnvm::Graph g = InitFullGraph(symbol, grad_req_types);

#if MXNET_USE_CUDA && !defined(_WIN32)
#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC && !defined(_WIN32)
if (default_ctx.dev_mask() == Context::kGPU && dmlc::GetEnv("MXNET_USE_FUSION", true)) {
nnvm::Graph unoptimized_graph;
common::CopyGraph(&unoptimized_graph, g, false);
Expand Down Expand Up @@ -1034,7 +1034,12 @@ Graph GraphExecutor::InitGraph(nnvm::Symbol symbol,
<< "Graph contains duplicate names for some of its inputs - fusion is NOT enabled!";
}
}
#endif // MXNET_USE_CUDA
#else
// Only warn user if MXNET_USE_FUSION env var is explicitly set
if (default_ctx.dev_mask() == Context::kGPU && dmlc::GetEnv("MXNET_USE_FUSION", false)) {
WarnFusionNotSupported();
}
#endif // MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC && !defined(_WIN32)

// create "device" and "context" attrs for the graph
g = AssignContext(g, default_ctx, ctx_map,
@@ -1969,7 +1974,7 @@ Executor *Executor::SimpleBind(nnvm::Symbol symbol,
std::vector<NDArray>* aux_states,
std::unordered_map<std::string, NDArray>* shared_buffer,
Executor* shared_exec) {
auto exec = new exec::GraphExecutor();
auto exec = new exec::GraphExecutor(symbol);
bool init = false;
if (!exec->subgraph_property().empty()) {
static int verbose = dmlc::GetEnv("MXNET_SUBGRAPH_VERBOSE", 1);
@@ -1989,6 +1994,8 @@ Executor *Executor::SimpleBind(nnvm::Symbol symbol,
symbol = exec::BuildSubgraph(symbol, backend, arg_shape_map, arg_dtype_map, arg_stype_map,
default_ctx, group2ctx, &tmp_in_arg_ctxes, &tmp_arg_grad_ctxes,
&tmp_grad_req_types, &tmp_aux_state_ctxes, verbose);
// Subgraph cannot be recreated from unoptimized symbol
exec = new exec::GraphExecutor(symbol);
exec->Init(symbol.Copy(), default_ctx, group2ctx, tmp_in_arg_ctxes, tmp_arg_grad_ctxes,
tmp_aux_state_ctxes, arg_shape_map, arg_dtype_map, arg_stype_map,
tmp_grad_req_types, shared_arg_names, &tmp_in_args, &tmp_arg_grads,
@@ -2043,7 +2050,7 @@ Executor *Executor::Bind(nnvm::Symbol symbol,
const std::vector<OpReqType> &grad_req_type,
const std::vector<NDArray> &aux_states,
Executor* shared_exec) {
auto exec = new exec::GraphExecutor();
auto exec = new exec::GraphExecutor(symbol);
static int verbose = dmlc::GetEnv("MXNET_SUBGRAPH_VERBOSE", 1);
std::vector<NDArray> tmp_in_args = in_args;
std::vector<NDArray> tmp_arg_grad_store = arg_grad_store;
Expand All @@ -2058,6 +2065,8 @@ Executor *Executor::Bind(nnvm::Symbol symbol,
symbol = exec::BuildSubgraph(symbol, backend, default_ctx, group2ctx, &tmp_in_args,
&tmp_arg_grad_store, &tmp_grad_req_type, &tmp_aux_states,
verbose);
// Subgraph cannot be recreated from unoptimized symbol
exec = new exec::GraphExecutor(symbol);
}
}
exec->Init(symbol.Copy(), default_ctx, group2ctx, tmp_in_args, tmp_arg_grad_store,
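A simplified sketch (stand-in types, not MXNet code) of the ownership pattern introduced in graph_executor.cc above: the executor copies the unoptimized symbol at construction, and Reshape() rebuilds a child executor from that pristine copy rather than from the already-optimized (possibly fused) graph_.outputs:

```cpp
// Assumed simplification for illustration; real code uses nnvm::Symbol,
// whose Copy() performs a deep copy of the graph nodes.
#include <string>
#include <vector>

struct Symbol {
  std::vector<std::string> outputs;      // stand-in for nnvm::NodeEntry list
  Symbol Copy() const { return *this; }  // deep copy in the real nnvm::Symbol
};

class GraphExecutorSketch {
 public:
  explicit GraphExecutorSketch(const Symbol& symbol) : symbol_(symbol.Copy()) {}

  GraphExecutorSketch* Reshape() const {
    Symbol fresh;
    fresh.outputs = symbol_.outputs;        // reuse the pristine, unoptimized graph
    return new GraphExecutorSketch(fresh);  // child keeps its own copy as well
  }

 private:
  Symbol symbol_;  // unoptimized copy, shared with child executors
};

int main() {
  Symbol s;
  s.outputs = {"fc_output"};
  GraphExecutorSketch exec(s);
  delete exec.Reshape();
  return 0;
}
```

This also explains why SimpleBind/Bind in the diff construct a fresh GraphExecutor(symbol) after BuildSubgraph: as the added comments note, a subgraph-rewritten symbol cannot be recreated from the unoptimized one.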
5 changes: 4 additions & 1 deletion src/executor/graph_executor.h
@@ -58,7 +58,7 @@ class GraphExecutor : public Executor {
public:
using Executor::MonitorCallback;

GraphExecutor();
explicit GraphExecutor(const nnvm::Symbol& symbol);
virtual ~GraphExecutor();
void Forward(bool is_train) override;
void PartialForward(bool is_train, int step, int *step_left) override;
@@ -267,6 +267,9 @@
std::string subgraph_property_;
// ref of engine
std::shared_ptr<Engine> engine_ref_;
// Unoptimized copy of the symbol for sharing with
// child executors
nnvm::Symbol symbol_;
};

} // namespace exec