This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Backport #16798, #16836 and #16838 to 1.6 #16874

Merged: 5 commits, Nov 22, 2019
CMakeLists.txt: 6 changes (4 additions, 2 deletions)
@@ -633,14 +633,17 @@ if(USE_CUDA)
else()
list(APPEND CUDA_INCLUDE_DIRS ${INCLUDE_DIRECTORIES})
# define preprocessor macro so that we will not include the generated forcelink header
+if(ENABLE_CUDA_RTC)
+add_definitions(-DMXNET_ENABLE_CUDA_RTC=1)
+endif()
+# Create '.cmake' files for cuda compiles given definitions added thus far
mshadow_cuda_compile(cuda_objs ${CUDA})
if(MSVC)
if(ENABLE_CUDA_RTC)
FIND_LIBRARY(CUDA_nvrtc_LIBRARY nvrtc "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" "${CUDA_TOOLKIT_ROOT_DIR}/lib/win32")
list(APPEND mxnet_LINKER_LIBS ${CUDA_nvrtc_LIBRARY})
set(CUDA_cuda_LIBRARY "${CUDA_nvrtc_LIBRARY}/../cuda.lib")
list(APPEND mxnet_LINKER_LIBS ${CUDA_cuda_LIBRARY})
-add_definitions(-DMXNET_ENABLE_CUDA_RTC=1)
endif()
FIND_LIBRARY(CUDA_cufft_LIBRARY nvrtc "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" "${CUDA_TOOLKIT_ROOT_DIR}/lib/win32")
list(APPEND mxnet_LINKER_LIBS "${CUDA_cufft_LIBRARY}/../cufft.lib") # For fft operator
@@ -652,7 +655,6 @@ if(USE_CUDA)
list(APPEND mxnet_LINKER_LIBS cufft cusolver)
if(ENABLE_CUDA_RTC)
list(APPEND mxnet_LINKER_LIBS nvrtc cuda)
-add_definitions(-DMXNET_ENABLE_CUDA_RTC=1)
endif()
link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64")
endif()
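Moving the add_definitions(-DMXNET_ENABLE_CUDA_RTC=1) call ahead of mshadow_cuda_compile means host and CUDA compiles now agree on the macro. A minimal C++ sketch of how code can key off it, modelled on the #if guard this PR adds in src/executor/graph_executor.cc (the function itself is illustrative, not part of the diff):

    // Illustrative only: mirrors the guard added in graph_executor.cc.
    bool FusionSupported() {
    #if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC && !defined(_WIN32)
      return true;   // NVRTC is linked; kernels can be compiled at runtime
    #else
      return false;  // CPU-only, RTC-disabled, or Windows build
    #endif
    }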
appveyor.yml: 2 changes (1 addition, 1 deletion)
@@ -69,7 +69,7 @@ before_build:

set OpenCV_DIR=%APPVEYOR_BUILD_FOLDER%/%MXNET_OPENCV_DIR%/build

-cmake .. -DOPENCV_DIR=%OpenCV_DIR% -DUSE_PROFILER=1 -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -G "Visual Studio 12 2013 Win64"
+cmake .. -DOPENCV_DIR=%OpenCV_DIR% -DUSE_PROFILER=1 -DUSE_CUDA=0 -DUSE_CUDNN=0 -DENABLE_CUDA_RTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -G "Visual Studio 12 2013 Win64"

build_script:
- cmd: >-
ci/build_windows.py: 12 changes (6 additions, 6 deletions)
@@ -54,7 +54,7 @@ class BuildFlavour(Enum):
'WIN_CPU': (
'-DUSE_CUDA=OFF '
'-DUSE_CUDNN=OFF '
-'-DUSE_NVRTC=OFF '
+'-DENABLE_CUDA_RTC=OFF '
'-DUSE_OPENCV=ON '
'-DUSE_OPENMP=ON '
'-DUSE_BLAS=open '
@@ -67,7 +67,7 @@ class BuildFlavour(Enum):
, 'WIN_CPU_MKLDNN': (
'-DUSE_CUDA=OFF '
'-DUSE_CUDNN=OFF '
-'-DUSE_NVRTC=OFF '
+'-DENABLE_CUDA_RTC=OFF '
'-DUSE_OPENCV=ON '
'-DUSE_OPENMP=ON '
'-DUSE_BLAS=open '
@@ -80,7 +80,7 @@ class BuildFlavour(Enum):
, 'WIN_CPU_MKLDNN_MKL': (
'-DUSE_CUDA=OFF '
'-DUSE_CUDNN=OFF '
-'-DUSE_NVRTC=OFF '
+'-DENABLE_CUDA_RTC=OFF '
'-DUSE_OPENCV=ON '
'-DUSE_OPENMP=ON '
'-DUSE_BLAS=mkl '
@@ -93,7 +93,7 @@ class BuildFlavour(Enum):
, 'WIN_CPU_MKL': (
'-DUSE_CUDA=OFF '
'-DUSE_CUDNN=OFF '
-'-DUSE_NVRTC=OFF '
+'-DENABLE_CUDA_RTC=OFF '
'-DUSE_OPENCV=ON '
'-DUSE_OPENMP=ON '
'-DUSE_BLAS=mkl '
@@ -106,7 +106,7 @@ class BuildFlavour(Enum):
, 'WIN_GPU': (
'-DUSE_CUDA=ON '
'-DUSE_CUDNN=ON '
-'-DUSE_NVRTC=ON '
+'-DENABLE_CUDA_RTC=ON '
'-DUSE_OPENCV=ON '
'-DUSE_OPENMP=ON '
'-DUSE_BLAS=open '
@@ -122,7 +122,7 @@ class BuildFlavour(Enum):
, 'WIN_GPU_MKLDNN': (
'-DUSE_CUDA=ON '
'-DUSE_CUDNN=ON '
-'-DUSE_NVRTC=ON '
+'-DENABLE_CUDA_RTC=ON '
'-DUSE_OPENCV=ON '
'-DUSE_OPENMP=ON '
'-DUSE_BLAS=open '
make/maven/maven_darwin_mkl.mk: 2 changes (1 addition, 1 deletion)
@@ -77,7 +77,7 @@ USE_CUDNN = 0
# CUDA_ARCH :=

# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
-USE_NVRTC = 0
+ENABLE_CUDA_RTC = 0

# use openmp for parallelization
USE_OPENMP = 0
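The "cuda runtime compiling" these Makefiles refer to is NVRTC, CUDA's runtime-compilation library, which the fusion work in this backport depends on. For orientation, a minimal sketch of the standard NVRTC flow that the renamed flag ultimately links in (plain NVRTC API, not code from this repository; error handling omitted):

    #include <nvrtc.h>
    #include <string>

    // Compile CUDA C++ source held in a string down to PTX at runtime.
    std::string CompileToPtx(const char* src) {
      nvrtcProgram prog;
      nvrtcCreateProgram(&prog, src, "fused_kernel.cu", 0, nullptr, nullptr);
      nvrtcCompileProgram(prog, 0, nullptr);
      size_t ptx_size;
      nvrtcGetPTXSize(prog, &ptx_size);
      std::string ptx(ptx_size, '\0');
      nvrtcGetPTX(prog, &ptx[0]);
      nvrtcDestroyProgram(&prog);
      return ptx;
    }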
make/maven/maven_linux_cu90mkl.mk: 2 changes (1 addition, 1 deletion)
@@ -80,7 +80,7 @@ USE_NCCL = 1

# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
USE_NVTX=1
-USE_NVRTC = 1
+ENABLE_CUDA_RTC = 1

# use openmp for parallelization
USE_OPENMP = 1
make/maven/maven_linux_cu92mkl.mk: 2 changes (1 addition, 1 deletion)
@@ -80,7 +80,7 @@ USE_NCCL = 1

# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
USE_NVTX=1
-USE_NVRTC = 1
+ENABLE_CUDA_RTC = 1

# use openmp for parallelization
USE_OPENMP = 1
make/maven/maven_linux_mkl.mk: 2 changes (1 addition, 1 deletion)
@@ -76,7 +76,7 @@ USE_CUDNN = 0
# CUDA_ARCH :=

# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
-USE_NVRTC = 0
+ENABLE_CUDA_RTC = 0

# use openmp for parallelization
USE_OPENMP = 1
src/executor/exec_pass.h: 5 changes (5 additions, 0 deletions)
@@ -221,6 +221,11 @@ Graph FusePointwiseForward(Graph&& g);
*/
Graph FusePointwiseBackward(Graph&& g);

+/*!
+ * \brief Issue a one-time warning that fusion is not possible for this platform or build.
+ */
+void WarnFusionNotSupported();
+
/*!
* \brief Infer shapes in the graph given the information.
* \param graph The input graph.
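The header only declares the hook; one plausible one-shot implementation, assuming dmlc logging (the actual body added elsewhere in this backport may differ):

    #include <dmlc/logging.h>

    void WarnFusionNotSupported() {
      static bool warned = false;  // process-wide: warn at most once
      if (!warned) {
        warned = true;
        LOG(WARNING) << "Pointwise fusion is not available in this build "
                     << "(it requires USE_CUDA with ENABLE_CUDA_RTC on a "
                     << "non-Windows platform); MXNET_USE_FUSION is ignored.";
      }
    }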
src/executor/graph_executor.cc: 29 changes (19 additions, 10 deletions)
@@ -50,7 +50,7 @@ static const std::string GetDefaultSubgraphBackend() {
#endif
}

-GraphExecutor::GraphExecutor() {
+GraphExecutor::GraphExecutor(const nnvm::Symbol& symbol) {
log_verbose_ = dmlc::GetEnv("MXNET_EXEC_VERBOSE_LOGGING", false);
need_grad_ = false;
is_dynamic_ = false;
@@ -60,6 +60,7 @@ GraphExecutor::GraphExecutor() {
LOG(INFO) << "MXNET_SUBGRAPH_BACKEND=NONE is detected, subgraph backend is not in use";
}
engine_ref_ = Engine::_GetSharedRef();
+symbol_ = symbol.Copy();
}

GraphExecutor::~GraphExecutor() {
@@ -890,10 +891,9 @@ Executor* GraphExecutor::Reshape(const bool partial_shaping,
std::vector<NDArray>* arg_grads,
std::vector<NDArray>* aux_states) {
nnvm::Graph g;
-g.outputs = std::vector<nnvm::NodeEntry>(graph_.outputs.begin(),
-graph_.outputs.begin() + num_forward_outputs_);
nnvm::Symbol symbol;
-symbol.outputs = g.outputs;
+symbol.outputs = symbol_.outputs;
+g.outputs = symbol_.outputs;
const nnvm::IndexedGraph& idx = g.indexed_graph();
mxnet::ShapeVector arg_shapes(idx.input_nodes().size(), mxnet::TShape());
for (size_t i = 0; i < num_forward_inputs_; ++i) {
@@ -977,8 +977,8 @@
}
}
}
-auto exec = new GraphExecutor();
-exec->Init(symbol, default_ctx, ctx_map,
+auto exec = new GraphExecutor(symbol);
+exec->Init(symbol.Copy(), default_ctx, ctx_map,
*in_args, *arg_grads, grad_req_types, *aux_states,
this);
return exec;
@@ -1001,7 +1001,7 @@ Graph GraphExecutor::InitGraph(nnvm::Symbol symbol,
// setup gradient
nnvm::Graph g = InitFullGraph(symbol, grad_req_types);

-#if MXNET_USE_CUDA && !defined(_WIN32)
+#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC && !defined(_WIN32)
if (default_ctx.dev_mask() == Context::kGPU && dmlc::GetEnv("MXNET_USE_FUSION", true)) {
nnvm::Graph unoptimized_graph;
common::CopyGraph(&unoptimized_graph, g, false);
@@ -1034,7 +1034,12 @@ Graph GraphExecutor::InitGraph(nnvm::Symbol symbol,
<< "Graph contains duplicate names for some of its inputs - fusion is NOT enabled!";
}
}
-#endif // MXNET_USE_CUDA
+#else
+// Only warn user if MXNET_USE_FUSION env var is explicitly set
+if (default_ctx.dev_mask() == Context::kGPU && dmlc::GetEnv("MXNET_USE_FUSION", false)) {
+WarnFusionNotSupported();
+}
+#endif // MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC && !defined(_WIN32)

// create "device" and "context" attrs for the graph
g = AssignContext(g, default_ctx, ctx_map,
@@ -1969,7 +1974,7 @@ Executor *Executor::SimpleBind(nnvm::Symbol symbol,
std::vector<NDArray>* aux_states,
std::unordered_map<std::string, NDArray>* shared_buffer,
Executor* shared_exec) {
-auto exec = new exec::GraphExecutor();
+auto exec = new exec::GraphExecutor(symbol);
bool init = false;
if (!exec->subgraph_property().empty()) {
static int verbose = dmlc::GetEnv("MXNET_SUBGRAPH_VERBOSE", 1);
@@ -1989,6 +1994,8 @@ Executor *Executor::SimpleBind(nnvm::Symbol symbol,
symbol = exec::BuildSubgraph(symbol, backend, arg_shape_map, arg_dtype_map, arg_stype_map,
default_ctx, group2ctx, &tmp_in_arg_ctxes, &tmp_arg_grad_ctxes,
&tmp_grad_req_types, &tmp_aux_state_ctxes, verbose);
+// Subgraph cannot be recreated from unoptimized symbol
+exec = new exec::GraphExecutor(symbol);
exec->Init(symbol.Copy(), default_ctx, group2ctx, tmp_in_arg_ctxes, tmp_arg_grad_ctxes,
tmp_aux_state_ctxes, arg_shape_map, arg_dtype_map, arg_stype_map,
tmp_grad_req_types, shared_arg_names, &tmp_in_args, &tmp_arg_grads,
@@ -2043,7 +2050,7 @@ Executor *Executor::Bind(nnvm::Symbol symbol,
const std::vector<OpReqType> &grad_req_type,
const std::vector<NDArray> &aux_states,
Executor* shared_exec) {
-auto exec = new exec::GraphExecutor();
+auto exec = new exec::GraphExecutor(symbol);
static int verbose = dmlc::GetEnv("MXNET_SUBGRAPH_VERBOSE", 1);
std::vector<NDArray> tmp_in_args = in_args;
std::vector<NDArray> tmp_arg_grad_store = arg_grad_store;
@@ -2058,6 +2065,8 @@ Executor *Executor::Bind(nnvm::Symbol symbol,
symbol = exec::BuildSubgraph(symbol, backend, default_ctx, group2ctx, &tmp_in_args,
&tmp_arg_grad_store, &tmp_grad_req_type, &tmp_aux_states,
verbose);
+// Subgraph cannot be recreated from unoptimized symbol
+exec = new exec::GraphExecutor(symbol);
}
}
exec->Init(symbol.Copy(), default_ctx, group2ctx, tmp_in_args, tmp_arg_grad_store,
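Note the asymmetric defaults when MXNET_USE_FUSION is read on the two sides of the #if: the supported path defaults fusion to on, while the fallback path defaults to off, so the new warning only fires for users who explicitly opted in. A small sketch of those semantics (dmlc::GetEnv returns its second argument when the variable is unset; illustrative code, not from the diff):

    #include <dmlc/parameter.h>  // provides dmlc::GetEnv

    void FusionDecisionExample() {
      // With MXNET_USE_FUSION unset in the environment:
      bool try_fusion = dmlc::GetEnv("MXNET_USE_FUSION", true);   // true: fuse by default when supported
      bool warn_user  = dmlc::GetEnv("MXNET_USE_FUSION", false);  // false: stay silent unless opted in
      (void)try_fusion; (void)warn_user;
    }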
src/executor/graph_executor.h: 5 changes (4 additions, 1 deletion)
@@ -58,7 +58,7 @@ class GraphExecutor : public Executor {
public:
using Executor::MonitorCallback;

-GraphExecutor();
+explicit GraphExecutor(const nnvm::Symbol& symbol);
virtual ~GraphExecutor();
void Forward(bool is_train) override;
void PartialForward(bool is_train, int step, int *step_left) override;
@@ -267,6 +267,9 @@
std::string subgraph_property_;
// ref of engine
std::shared_ptr<Engine> engine_ref_;
+// Unoptimized copy of the symbol for sharing with
+// child executors
+nnvm::Symbol symbol_;
};

} // namespace exec
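Taken together with the Reshape() change above, the new symbol_ member exists because graph_ may be rewritten in place by fusion or a subgraph backend, so child executors must be seeded from the untouched symbol. A condensed sketch of the pattern (names taken from this diff; everything else abbreviated):

    #include <nnvm/symbolic.h>  // nnvm::Symbol

    // Condensed illustration; the real class carries many more members.
    class GraphExecutor {
     public:
      explicit GraphExecutor(const nnvm::Symbol& symbol)
          : symbol_(symbol.Copy()) {}  // deep copy, taken before any optimization pass

      GraphExecutor* Reshape() {
        nnvm::Symbol symbol;
        symbol.outputs = symbol_.outputs;   // seed from the unoptimized outputs,
                                            // not from the possibly fused graph_
        return new GraphExecutor(symbol);   // the child gets its own pristine copy
      }

     private:
      nnvm::Symbol symbol_;  // unoptimized copy shared with child executors
    };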