From e3e63fe7034254f087fb836d6838e6a44b05eb2e Mon Sep 17 00:00:00 2001 From: Przemyslaw Tredak Date: Tue, 19 Nov 2019 23:24:43 -0800 Subject: [PATCH 1/5] Add unoptimized symbol to executor for sharing (#16798) * Add unoptimized symbol to executor for sharing * Copy the symbol in Reshape * Added test for multiple reshapes --- src/executor/graph_executor.cc | 20 ++++++++++++-------- src/executor/graph_executor.h | 5 ++++- tests/python/gpu/test_fusion.py | 25 +++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 9 deletions(-) diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index 7fa1de373d07..9c6a9c1d8c48 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -50,7 +50,7 @@ static const std::string GetDefaultSubgraphBackend() { #endif } -GraphExecutor::GraphExecutor() { +GraphExecutor::GraphExecutor(const nnvm::Symbol& symbol) { log_verbose_ = dmlc::GetEnv("MXNET_EXEC_VERBOSE_LOGGING", false); need_grad_ = false; is_dynamic_ = false; @@ -60,6 +60,7 @@ GraphExecutor::GraphExecutor() { LOG(INFO) << "MXNET_SUBGRAPH_BACKEND=NONE is detected, subgraph backend is not in use"; } engine_ref_ = Engine::_GetSharedRef(); + symbol_ = symbol.Copy(); } GraphExecutor::~GraphExecutor() { @@ -890,10 +891,9 @@ Executor* GraphExecutor::Reshape(const bool partial_shaping, std::vector* arg_grads, std::vector* aux_states) { nnvm::Graph g; - g.outputs = std::vector(graph_.outputs.begin(), - graph_.outputs.begin() + num_forward_outputs_); nnvm::Symbol symbol; - symbol.outputs = g.outputs; + symbol.outputs = symbol_.outputs; + g.outputs = symbol_.outputs; const nnvm::IndexedGraph& idx = g.indexed_graph(); mxnet::ShapeVector arg_shapes(idx.input_nodes().size(), mxnet::TShape()); for (size_t i = 0; i < num_forward_inputs_; ++i) { @@ -977,8 +977,8 @@ Executor* GraphExecutor::Reshape(const bool partial_shaping, } } } - auto exec = new GraphExecutor(); - exec->Init(symbol, default_ctx, ctx_map, + auto exec = new GraphExecutor(symbol); + exec->Init(symbol.Copy(), default_ctx, ctx_map, *in_args, *arg_grads, grad_req_types, *aux_states, this); return exec; @@ -1969,7 +1969,7 @@ Executor *Executor::SimpleBind(nnvm::Symbol symbol, std::vector* aux_states, std::unordered_map* shared_buffer, Executor* shared_exec) { - auto exec = new exec::GraphExecutor(); + auto exec = new exec::GraphExecutor(symbol); bool init = false; if (!exec->subgraph_property().empty()) { static int verbose = dmlc::GetEnv("MXNET_SUBGRAPH_VERBOSE", 1); @@ -1989,6 +1989,8 @@ Executor *Executor::SimpleBind(nnvm::Symbol symbol, symbol = exec::BuildSubgraph(symbol, backend, arg_shape_map, arg_dtype_map, arg_stype_map, default_ctx, group2ctx, &tmp_in_arg_ctxes, &tmp_arg_grad_ctxes, &tmp_grad_req_types, &tmp_aux_state_ctxes, verbose); + // Subgraph cannot be recreated from unoptimized symbol + exec = new exec::GraphExecutor(symbol); exec->Init(symbol.Copy(), default_ctx, group2ctx, tmp_in_arg_ctxes, tmp_arg_grad_ctxes, tmp_aux_state_ctxes, arg_shape_map, arg_dtype_map, arg_stype_map, tmp_grad_req_types, shared_arg_names, &tmp_in_args, &tmp_arg_grads, @@ -2043,7 +2045,7 @@ Executor *Executor::Bind(nnvm::Symbol symbol, const std::vector &grad_req_type, const std::vector &aux_states, Executor* shared_exec) { - auto exec = new exec::GraphExecutor(); + auto exec = new exec::GraphExecutor(symbol); static int verbose = dmlc::GetEnv("MXNET_SUBGRAPH_VERBOSE", 1); std::vector tmp_in_args = in_args; std::vector tmp_arg_grad_store = arg_grad_store; @@ -2058,6 +2060,8 @@ Executor *Executor::Bind(nnvm::Symbol 
symbol, symbol = exec::BuildSubgraph(symbol, backend, default_ctx, group2ctx, &tmp_in_args, &tmp_arg_grad_store, &tmp_grad_req_type, &tmp_aux_states, verbose); + // Subgraph cannot be recreated from unoptimized symbol + exec = new exec::GraphExecutor(symbol); } } exec->Init(symbol.Copy(), default_ctx, group2ctx, tmp_in_args, tmp_arg_grad_store, diff --git a/src/executor/graph_executor.h b/src/executor/graph_executor.h index f150165796ad..bfa6980a8e29 100644 --- a/src/executor/graph_executor.h +++ b/src/executor/graph_executor.h @@ -58,7 +58,7 @@ class GraphExecutor : public Executor { public: using Executor::MonitorCallback; - GraphExecutor(); + explicit GraphExecutor(const nnvm::Symbol& symbol); virtual ~GraphExecutor(); void Forward(bool is_train) override; void PartialForward(bool is_train, int step, int *step_left) override; @@ -267,6 +267,9 @@ class GraphExecutor : public Executor { std::string subgraph_property_; // ref of engine std::shared_ptr engine_ref_; + // Unoptimized copy of the symbol for sharing with + // child executors + nnvm::Symbol symbol_; }; } // namespace exec diff --git a/tests/python/gpu/test_fusion.py b/tests/python/gpu/test_fusion.py index 5606eb19a9c5..693336f22496 100644 --- a/tests/python/gpu/test_fusion.py +++ b/tests/python/gpu/test_fusion.py @@ -239,6 +239,31 @@ def test_fusion_compiler_cache(): check_fused_symbol(a+b, ctx=mx.gpu(1), a=arr1, b=arr2) +@with_seed() +def test_fusion_reshape_executor(): + a = mx.sym.Variable("data1") + b = mx.sym.Variable("data2") + c = a + b + 1 + sym = mx.sym.relu(c) + orig_shape = (10,10) + e = sym.simple_bind(ctx=mx.gpu(), data1=orig_shape, data2=orig_shape) + data = mx.nd.zeros(orig_shape, ctx=mx.gpu()) + out = e.forward(is_train=False) + assert out[0].sum().asscalar() == 100 + changed_shape = (80, 2) + new_shape = {'data1': changed_shape, 'data2': changed_shape} + data = mx.nd.zeros(new_shape['data1'], ctx=mx.gpu()) + f = e.reshape(allow_up_sizing=True, **new_shape) + out = f.forward(is_train=False, data1=data, data2=data) + assert out[0].sum().asscalar() == 160 + # Reshape again + changed_shape = (30, 5) + new_shape = {'data1': changed_shape, 'data2': changed_shape} + data = mx.nd.zeros(new_shape['data1'], ctx=mx.gpu()) + f = e.reshape(allow_up_sizing=True, **new_shape) + out = f.forward(is_train=False, data1=data, data2=data) + assert out[0].sum().asscalar() == 150 + if __name__ == '__main__': import nose nose.runmodule() From 8b58b78c78c080650ea10bb205eb897da81b07b2 Mon Sep 17 00:00:00 2001 From: Hao Jin Date: Wed, 20 Nov 2019 15:15:58 -0800 Subject: [PATCH 2/5] Mixed precision binary op backward (use in) for numpy (#16791) * mixed precision binary op backward * reduce unix cpu runtime --- .../numpy/np_elemwise_broadcast_op.cc | 17 ++- .../numpy/np_elemwise_broadcast_op.cu | 4 + src/operator/numpy/np_elemwise_broadcast_op.h | 104 +++++++++++++++++- .../tensor/elemwise_binary_broadcast_op.h | 26 +++++ src/operator/tensor/elemwise_unary_op.h | 4 +- tests/python/unittest/test_numpy_op.py | 20 ++-- 6 files changed, 162 insertions(+), 13 deletions(-) diff --git a/src/operator/numpy/np_elemwise_broadcast_op.cc b/src/operator/numpy/np_elemwise_broadcast_op.cc index a76e59d30dc6..acf0395123fc 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op.cc +++ b/src/operator/numpy/np_elemwise_broadcast_op.cc @@ -147,7 +147,22 @@ MXNET_OPERATOR_REGISTER_NP_BINARY_MIXED_PRECISION(_npi_multiply) "FCompute", NumpyBinaryBroadcastComputeWithBool) #endif -.set_attr("FGradient", ElemwiseGradUseIn{"_backward_broadcast_mul"});
+.set_attr("FGradient", ElemwiseGradUseIn{"_backward_npi_broadcast_mul"}); + +NNVM_REGISTER_OP(_backward_npi_broadcast_mul) +.set_num_inputs(3) +.set_num_outputs(2) +.set_attr("TIsBackward", true) +.set_attr("FInplaceOption", + [](const NodeAttrs& attrs){ + return std::vector >{{0, 1}}; + }) +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) +.set_attr("FCompute", NumpyBinaryBackwardUseIn); MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_npi_mod) .set_attr("FCompute", BinaryBroadcastCompute) diff --git a/src/operator/numpy/np_elemwise_broadcast_op.cu b/src/operator/numpy/np_elemwise_broadcast_op.cu index a0a277df211f..d9499625e34d 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op.cu +++ b/src/operator/numpy/np_elemwise_broadcast_op.cu @@ -64,6 +64,10 @@ NNVM_REGISTER_OP(_npi_multiply) NumpyBinaryBroadcastComputeWithBool); #endif +NNVM_REGISTER_OP(_backward_npi_broadcast_mul) +.set_attr("FCompute", NumpyBinaryBackwardUseIn); + NNVM_REGISTER_OP(_npi_mod) .set_attr("FCompute", BinaryBroadcastCompute); diff --git a/src/operator/numpy/np_elemwise_broadcast_op.h b/src/operator/numpy/np_elemwise_broadcast_op.h index 1a4596fba91c..179acd279ab6 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op.h +++ b/src/operator/numpy/np_elemwise_broadcast_op.h @@ -25,6 +25,7 @@ #ifndef MXNET_OPERATOR_NUMPY_NP_ELEMWISE_BROADCAST_OP_H_ #define MXNET_OPERATOR_NUMPY_NP_ELEMWISE_BROADCAST_OP_H_ +#include #include #include @@ -381,11 +382,13 @@ void NumpyBinaryBroadcastComputeWithBool(const nnvm::NodeAttrs& attrs, } template -void MixedBinaryBackwardUseIn(const nnvm::NodeAttrs& attrs, +void NumpyBinaryBackwardUseIn(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { + using namespace mshadow; + using namespace mxnet_op; CHECK_EQ(inputs.size(), 3U); CHECK_EQ(outputs.size(), 2U); @@ -396,7 +399,104 @@ void MixedBinaryBackwardUseIn(const nnvm::NodeAttrs& attrs, return; } - PrintErrorMessage(attrs.op->name, lhs.type_flag_, rhs.type_flag_); + const TBlob& ograd = inputs[0]; + const TBlob& lgrad = outputs[0]; + const TBlob& rgrad = outputs[1]; + + if (common::is_float(lhs.type_flag_) || common::is_float(rhs.type_flag_)) { + // If any of the inputs is a float, it's the same type as the output + // So 2 of the 3 tensors have the same data type + Stream *s = ctx.get_stream(); + mxnet::TShape new_lshape, new_rshape, new_oshape; + using namespace broadcast; + const bool need_bc = BinaryBroadcastShapeCompact(lgrad.shape_, rgrad.shape_, ograd.shape_, + &new_lshape, &new_rshape, &new_oshape) != 0; + + // Prepare all the temporary memory + size_t workspace_size_l = 0, workspace_size_r = 0; + TBlob temp_tblob; // The TBlob for casted input data + TBlob temp_igrad; // The TBlob for casted grad results + size_t tensor_size = (lgrad.type_flag_ != ograd.type_flag_) ? 
lgrad.Size() : rgrad.Size(); + Tensor workspace; + + MSHADOW_TYPE_SWITCH(ograd.type_flag_, OType, { + BROADCAST_NDIM_SWITCH(new_oshape.ndim(), ndim, { + workspace_size_l = ReduceWorkspaceSize( + s, new_lshape, req[0], new_oshape, new_lshape, new_rshape); + workspace_size_r = ReduceWorkspaceSize( + s, new_rshape, req[1], new_oshape, new_lshape, new_rshape); + }); + size_t workspace_size = std::max(workspace_size_l, workspace_size_r); + size_t cast_tensor_size = tensor_size * sizeof(OType); + // Allocate the temporary memories now + Tensor temp_space = + ctx.requested[0].get_space_typed( + Shape1(workspace_size + cast_tensor_size * 2), s); + // Tensor for temp_tblob + Tensor temp_tblob_tensor( + reinterpret_cast(temp_space.dptr_), + Shape1(tensor_size), s); + // Tensor for temp_igrad + Tensor temp_igrad_tensor( + reinterpret_cast(temp_space.dptr_) + tensor_size, + Shape1(tensor_size), s); + temp_tblob = + TBlob(temp_tblob_tensor) + .reshape(((lgrad.type_flag_ != ograd.type_flag_) ? lhs.shape_ : rhs.shape_)); + temp_igrad = + TBlob(temp_igrad_tensor) + .reshape(((lgrad.type_flag_ != ograd.type_flag_) ? lhs.shape_ : rhs.shape_)); + if (temp_igrad.Size() != 0) { + Kernel::Launch(s, temp_igrad.Size(), temp_igrad.dptr()); + } + workspace = + Tensor(temp_space.dptr_ + 2 * cast_tensor_size, Shape1(workspace_size), s); + }); + // Cast the input that does not have consistent type to temp_tblob + CastCompute( + attrs, ctx, {((lgrad.type_flag_ != ograd.type_flag_) ? lhs : rhs)}, {kWriteTo}, {temp_tblob}); + if (!need_bc) { + if (lhs.type_flag_ != ograd.type_flag_) { + ElemwiseBinaryOp::BackwardUseIn( + attrs, ctx, {ograd, temp_tblob, rhs}, {kWriteTo, req[1]}, {temp_igrad, rgrad}); + } else { + ElemwiseBinaryOp::BackwardUseIn( + attrs, ctx, {ograd, lhs, temp_tblob}, {req[0], kWriteTo}, {lgrad, temp_igrad}); + } + } else { + if (lhs.type_flag_ != ograd.type_flag_) { + MSHADOW_TYPE_SWITCH(ograd.type_flag_, DType, { + BROADCAST_NDIM_SWITCH(new_oshape.ndim(), NDim, { + BinaryBroadcastBackwardUseInImplWithWorkspace( + ctx, {ograd, temp_tblob, rhs}, {kWriteTo, req[1]}, {temp_igrad, rgrad}, + workspace, new_lshape, new_rshape, new_oshape); + }); + }); + } else { + MSHADOW_TYPE_SWITCH(ograd.type_flag_, DType, { + BROADCAST_NDIM_SWITCH(new_oshape.ndim(), NDim, { + BinaryBroadcastBackwardUseInImplWithWorkspace( + ctx, {ograd, lhs, temp_tblob}, {req[0], kWriteTo}, {lgrad, temp_igrad}, + workspace, new_lshape, new_rshape, new_oshape); + }); + }); + } + } + + // If both inputs are floating numbers, cast the igrad to the input that has + // the different data type + if (common::is_float(lhs.type_flag_) && common::is_float(rhs.type_flag_)) { + if (lhs.type_flag_ != ograd.type_flag_) { + CastCompute(attrs, ctx, {temp_igrad}, {req[0]}, {lgrad}); + } else { + CastCompute(attrs, ctx, {temp_igrad}, {req[1]}, {rgrad}); + } + } + } else { + // Case where both inputs are integer types, should not even do + // backward computation for this case. 
+ PrintErrorMessage(attrs.op->name, lhs.type_flag_, rhs.type_flag_); + } } } // namespace op diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.h b/src/operator/tensor/elemwise_binary_broadcast_op.h index b48ed389ba98..69fc965c03b8 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op.h +++ b/src/operator/tensor/elemwise_binary_broadcast_op.h @@ -671,6 +671,32 @@ BinaryBroadcastBackwardUseNone(const nnvm::NodeAttrs& attrs, const std::vector& req, const std::vector& outputs); +template +void BinaryBroadcastBackwardUseInImplWithWorkspace(const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs, + const mshadow::Tensor& workspace, + const mxnet::TShape& new_lshape, + const mxnet::TShape& new_rshape, + const mxnet::TShape& new_oshape) { + using namespace mshadow; + using namespace mshadow::expr; + using namespace broadcast; + Stream *s = ctx.get_stream(); + const TBlob lgrad = outputs[0].reshape(new_lshape); + const TBlob rgrad = outputs[1].reshape(new_rshape); + const TBlob ograd = inputs[0].reshape(new_oshape); + const TBlob lhs = inputs[1].reshape(new_lshape); + const TBlob rhs = inputs[2].reshape(new_rshape); + if (ograd.Size() != 0) { + Reduce(s, lgrad, req[0], workspace, + ograd, lhs, rhs); + Reduce(s, rgrad, req[1], workspace, + ograd, lhs, rhs); + } +} + template inline void BinaryBroadcastBackwardUseInImpl(const OpContext& ctx, const std::vector& inputs, diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h index 188ccd68a340..8886e15e3972 100644 --- a/src/operator/tensor/elemwise_unary_op.h +++ b/src/operator/tensor/elemwise_unary_op.h @@ -453,8 +453,8 @@ void CastCompute(const nnvm::NodeAttrs& attrs, Tensor out = outputs[0].FlatTo1D(s); MSHADOW_TYPE_SWITCH_WITH_BOOL(inputs[0].type_flag_, SrcDType, { Tensor data = inputs[0].FlatTo1D(s); - if (outputs[0].type_flag_ != inputs[0].type_flag_ || - req[0] != kWriteInplace) { + if ((outputs[0].type_flag_ != inputs[0].type_flag_ || + req[0] != kWriteInplace) && outputs[0].Size() != 0) { Assign(out, req[0], tcast(data)); } }); diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 9a36e061ed4c..85eaadd4ffe8 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -1683,7 +1683,9 @@ def hybrid_forward(self, F, a, b, *args, **kwargs): @with_seed() @use_np def test_np_mixed_precision_binary_funcs(): - def check_mixed_precision_binary_func(func, low, high, lshape, rshape, ltype, rtype): + itypes = [np.bool, np.int8, np.int32, np.int64] + ftypes = [np.float16, np.float32, np.float64] + def check_mixed_precision_binary_func(func, low, high, lshape, rshape, lgrad, rgrad, ltype, rtype): class TestMixedBinary(HybridBlock): def __init__(self, func): super(TestMixedBinary, self).__init__() @@ -1717,13 +1719,15 @@ def hybrid_forward(self, F, a, b, *args, **kwargs): use_broadcast=False, equal_nan=True) funcs = { - 'add': (-1.0, 1.0), - 'subtract': (-1.0, 1.0), - 'multiply': (-1.0, 1.0), + 'add': (-1.0, 1.0, None, None), + 'subtract': (-1.0, 1.0, None, None), + 'multiply': (-1.0, 1.0, lambda y, x1, x2: _np.broadcast_to(x2, y.shape), + lambda y, x1, x2: _np.broadcast_to(x1, y.shape)) } shape_pairs = [((3, 2), (3, 2)), ((3, 2), (3, 1)), + ((3, 0), (3, 0)), ((3, 1), (3, 0)), ((0, 2), (1, 2)), ((2, 3, 4), (3, 1)), @@ -1733,16 +1737,16 @@ def hybrid_forward(self, F, a, b, *args, **kwargs): itypes = [np.bool, np.int8, np.int32, np.int64] ftypes = [np.float16, 
np.float32, np.float64] for func, func_data in funcs.items(): - low, high = func_data + low, high, lgrad, rgrad = func_data for lshape, rshape in shape_pairs: for type1, type2 in itertools.product(itypes, ftypes): - check_mixed_precision_binary_func(func, low, high, lshape, rshape, type1, type2) - check_mixed_precision_binary_func(func, low, high, lshape, rshape, type2, type1) + check_mixed_precision_binary_func(func, low, high, lshape, rshape, lgrad, rgrad, type1, type2) + check_mixed_precision_binary_func(func, low, high, lshape, rshape, lgrad, rgrad, type2, type1) for type1, type2 in itertools.product(ftypes, ftypes): if type1 == type2: continue - check_mixed_precision_binary_func(func, low, high, lshape, rshape, type1, type2) + check_mixed_precision_binary_func(func, low, high, lshape, rshape, lgrad, rgrad, type1, type2) @with_seed() From 79f9f84eadd85e118d409f1e10d4fba3ff226c81 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Wed, 20 Nov 2019 15:43:44 -0800 Subject: [PATCH 3/5] USE_NVRTC -> ENABLE_CUDA_RTC to fix maven build. Add compile-guard to fusion. (#16838) * Rename USE_NVRTC -> ENABLE_CUDA_RTC to fix maven build. Compile-guard fusion framework. * Fix fusion-not-supported warning. * Fix compile guards * Fix cmake build so -DMXNET_ENABLE_CUDA_RTC=1 is passed to nvcc * Minimize side-effects of prev change --- CMakeLists.txt | 6 ++++-- appveyor.yml | 2 +- ci/build_windows.py | 12 ++++++------ make/maven/maven_darwin_mkl.mk | 2 +- make/maven/maven_linux_cu90mkl.mk | 2 +- make/maven/maven_linux_cu92mkl.mk | 2 +- make/maven/maven_linux_mkl.mk | 2 +- src/executor/exec_pass.h | 5 +++++ src/executor/graph_executor.cc | 9 +++++++-- src/executor/pointwise_fusion_pass.cc | 22 +++++++++++++++++++--- src/imperative/cached_op.cc | 13 ++++++++----- src/operator/fusion/fused_op-inl.h | 4 ++-- src/operator/fusion/fused_op.cc | 4 ++-- src/operator/fusion/fused_op.cu | 5 +++++ src/operator/fusion/fused_op.h | 7 +++---- 15 files changed, 66 insertions(+), 31 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0eba24f61d14..6d329f5f1079 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -633,6 +633,10 @@ if(USE_CUDA) else() list(APPEND CUDA_INCLUDE_DIRS ${INCLUDE_DIRECTORIES}) # define preprocessor macro so that we will not include the generated forcelink header + if(ENABLE_CUDA_RTC) + add_definitions(-DMXNET_ENABLE_CUDA_RTC=1) + endif() + # Create '.cmake' files for cuda compiles given definitions added thus far mshadow_cuda_compile(cuda_objs ${CUDA}) if(MSVC) if(ENABLE_CUDA_RTC) @@ -640,7 +644,6 @@ if(USE_CUDA) list(APPEND mxnet_LINKER_LIBS ${CUDA_nvrtc_LIBRARY}) set(CUDA_cuda_LIBRARY "${CUDA_nvrtc_LIBRARY}/../cuda.lib") list(APPEND mxnet_LINKER_LIBS ${CUDA_cuda_LIBRARY}) - add_definitions(-DMXNET_ENABLE_CUDA_RTC=1) endif() FIND_LIBRARY(CUDA_cufft_LIBRARY nvrtc "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" "${CUDA_TOOLKIT_ROOT_DIR}/lib/win32") list(APPEND mxnet_LINKER_LIBS "${CUDA_cufft_LIBRARY}/../cufft.lib") # For fft operator @@ -652,7 +655,6 @@ if(USE_CUDA) list(APPEND mxnet_LINKER_LIBS cufft cusolver) if(ENABLE_CUDA_RTC) list(APPEND mxnet_LINKER_LIBS nvrtc cuda) - add_definitions(-DMXNET_ENABLE_CUDA_RTC=1) endif() link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64") endif() diff --git a/appveyor.yml b/appveyor.yml index d44f52a0a9a9..9fa495002a1f 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -69,7 +69,7 @@ before_build: set OpenCV_DIR=%APPVEYOR_BUILD_FOLDER%/%MXNET_OPENCV_DIR%/build - cmake .. 
-DOPENCV_DIR=%OpenCV_DIR% -DUSE_PROFILER=1 -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -G "Visual Studio 12 2013 Win64" + cmake .. -DOPENCV_DIR=%OpenCV_DIR% -DUSE_PROFILER=1 -DUSE_CUDA=0 -DUSE_CUDNN=0 -DENABLE_CUDA_RTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -G "Visual Studio 12 2013 Win64" build_script: - cmd: >- diff --git a/ci/build_windows.py b/ci/build_windows.py index 4673bd535e3e..ce77c316ab20 100755 --- a/ci/build_windows.py +++ b/ci/build_windows.py @@ -54,7 +54,7 @@ class BuildFlavour(Enum): 'WIN_CPU': ( '-DUSE_CUDA=OFF ' '-DUSE_CUDNN=OFF ' - '-DUSE_NVRTC=OFF ' + '-DENABLE_CUDA_RTC=OFF ' '-DUSE_OPENCV=ON ' '-DUSE_OPENMP=ON ' '-DUSE_BLAS=open ' @@ -67,7 +67,7 @@ class BuildFlavour(Enum): , 'WIN_CPU_MKLDNN': ( '-DUSE_CUDA=OFF ' '-DUSE_CUDNN=OFF ' - '-DUSE_NVRTC=OFF ' + '-DENABLE_CUDA_RTC=OFF ' '-DUSE_OPENCV=ON ' '-DUSE_OPENMP=ON ' '-DUSE_BLAS=open ' @@ -80,7 +80,7 @@ class BuildFlavour(Enum): , 'WIN_CPU_MKLDNN_MKL': ( '-DUSE_CUDA=OFF ' '-DUSE_CUDNN=OFF ' - '-DUSE_NVRTC=OFF ' + '-DENABLE_CUDA_RTC=OFF ' '-DUSE_OPENCV=ON ' '-DUSE_OPENMP=ON ' '-DUSE_BLAS=mkl ' @@ -93,7 +93,7 @@ class BuildFlavour(Enum): , 'WIN_CPU_MKL': ( '-DUSE_CUDA=OFF ' '-DUSE_CUDNN=OFF ' - '-DUSE_NVRTC=OFF ' + '-DENABLE_CUDA_RTC=OFF ' '-DUSE_OPENCV=ON ' '-DUSE_OPENMP=ON ' '-DUSE_BLAS=mkl ' @@ -106,7 +106,7 @@ class BuildFlavour(Enum): , 'WIN_GPU': ( '-DUSE_CUDA=ON ' '-DUSE_CUDNN=ON ' - '-DUSE_NVRTC=ON ' + '-DENABLE_CUDA_RTC=ON ' '-DUSE_OPENCV=ON ' '-DUSE_OPENMP=ON ' '-DUSE_BLAS=open ' @@ -122,7 +122,7 @@ class BuildFlavour(Enum): , 'WIN_GPU_MKLDNN': ( '-DUSE_CUDA=ON ' '-DUSE_CUDNN=ON ' - '-DUSE_NVRTC=ON ' + '-DENABLE_CUDA_RTC=ON ' '-DUSE_OPENCV=ON ' '-DUSE_OPENMP=ON ' '-DUSE_BLAS=open ' diff --git a/make/maven/maven_darwin_mkl.mk b/make/maven/maven_darwin_mkl.mk index a7f2bdb027d4..9bf3fc46ce0b 100644 --- a/make/maven/maven_darwin_mkl.mk +++ b/make/maven/maven_darwin_mkl.mk @@ -77,7 +77,7 @@ USE_CUDNN = 0 # CUDA_ARCH := # whether use cuda runtime compiling for writing kernels in native language (i.e. Python) -USE_NVRTC = 0 +ENABLE_CUDA_RTC = 0 # use openmp for parallelization USE_OPENMP = 0 diff --git a/make/maven/maven_linux_cu90mkl.mk b/make/maven/maven_linux_cu90mkl.mk index e9ba46509973..e8caf73f186e 100644 --- a/make/maven/maven_linux_cu90mkl.mk +++ b/make/maven/maven_linux_cu90mkl.mk @@ -80,7 +80,7 @@ USE_NCCL = 1 # whether use cuda runtime compiling for writing kernels in native language (i.e. Python) USE_NVTX=1 -USE_NVRTC = 1 +ENABLE_CUDA_RTC = 1 # use openmp for parallelization USE_OPENMP = 1 diff --git a/make/maven/maven_linux_cu92mkl.mk b/make/maven/maven_linux_cu92mkl.mk index caa1c59c01d5..930341e71cb1 100644 --- a/make/maven/maven_linux_cu92mkl.mk +++ b/make/maven/maven_linux_cu92mkl.mk @@ -80,7 +80,7 @@ USE_NCCL = 1 # whether use cuda runtime compiling for writing kernels in native language (i.e. Python) USE_NVTX=1 -USE_NVRTC = 1 +ENABLE_CUDA_RTC = 1 # use openmp for parallelization USE_OPENMP = 1 diff --git a/make/maven/maven_linux_mkl.mk b/make/maven/maven_linux_mkl.mk index 3c8534a7e2aa..10aee5f35a46 100644 --- a/make/maven/maven_linux_mkl.mk +++ b/make/maven/maven_linux_mkl.mk @@ -76,7 +76,7 @@ USE_CUDNN = 0 # CUDA_ARCH := # whether use cuda runtime compiling for writing kernels in native language (i.e. 
Python) -USE_NVRTC = 0 +ENABLE_CUDA_RTC = 0 # use openmp for parallelization USE_OPENMP = 1 diff --git a/src/executor/exec_pass.h b/src/executor/exec_pass.h index a5f125affcb0..55d431cf3298 100644 --- a/src/executor/exec_pass.h +++ b/src/executor/exec_pass.h @@ -221,6 +221,11 @@ Graph FusePointwiseForward(Graph&& g); */ Graph FusePointwiseBackward(Graph&& g); +/*! + * \brief Issue a one-time warning that fusion is not possible for this platform or build. + */ +void WarnFusionNotSupported(); + /*! * \brief Infer shapes in the graph given the information. * \param graph The input graph. diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index 9c6a9c1d8c48..508fbba97be3 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -1001,7 +1001,7 @@ Graph GraphExecutor::InitGraph(nnvm::Symbol symbol, // setup gradient nnvm::Graph g = InitFullGraph(symbol, grad_req_types); -#if MXNET_USE_CUDA && !defined(_WIN32) +#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC && !defined(_WIN32) if (default_ctx.dev_mask() == Context::kGPU && dmlc::GetEnv("MXNET_USE_FUSION", true)) { nnvm::Graph unoptimized_graph; common::CopyGraph(&unoptimized_graph, g, false); @@ -1034,7 +1034,12 @@ Graph GraphExecutor::InitGraph(nnvm::Symbol symbol, << "Graph contains duplicate names for some of its inputs - fusion is NOT enabled!"; } } -#endif // MXNET_USE_CUDA +#else + // Only warn user if MXNET_USE_FUSION env var is explicitly set + if (default_ctx.dev_mask() == Context::kGPU && dmlc::GetEnv("MXNET_USE_FUSION", false)) { + WarnFusionNotSupported(); + } +#endif // MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC && !defined(_WIN32) // create "device" and "context" attrs for the graph g = AssignContext(g, default_ctx, ctx_map, diff --git a/src/executor/pointwise_fusion_pass.cc b/src/executor/pointwise_fusion_pass.cc index 6fe21402cb3a..6a0d5f4efe87 100644 --- a/src/executor/pointwise_fusion_pass.cc +++ b/src/executor/pointwise_fusion_pass.cc @@ -36,10 +36,26 @@ #include "../operator/fusion/fused_op.h" #include "../operator/operator_common.h" -#if MXNET_USE_CUDA - namespace mxnet { namespace exec { + +void WarnFusionNotSupported() { + static bool issued_warning = false; + if (!issued_warning) { + issued_warning = true; +#if defined(_WIN32) + LOG(WARNING) << "Omitting dynamic fused op creation- not enabled on Windows. " + << "Unset env var MXNET_USE_FUSION=1 to quiet this message."; +#else + LOG(WARNING) << "Omitting dynamic fused op creation- needs MXNet lib built with " + << "USE_CUDA=1 and ENABLE_CUDA_RTC=1. 
Unset env var MXNET_USE_FUSION=1 " + << "to quiet this message."; +#endif // defined(_WIN32) + } +} + +#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC + namespace { bool IsFusionCompatible(nnvm::Node* n) { using namespace mxnet::fusion; @@ -304,8 +320,8 @@ Graph FusePointwiseBackward(Graph &&g) { ret.outputs = g.outputs; return ret; } +#endif // MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC } // namespace exec } // namespace mxnet -#endif // MXNET_USE_CUDA diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc index 269729c18f58..24270f210888 100644 --- a/src/imperative/cached_op.cc +++ b/src/imperative/cached_op.cc @@ -167,10 +167,8 @@ void SetRefCounts(nnvm::Graph* fwd_graph, const nnvm::Graph& full_graph) { void OptimizeGraph(nnvm::Graph * full_graph, nnvm::Graph * fwd_graph, nnvm::Graph * grad_graph, const Context& context, size_t num_forward_outputs, const bool inlining) { -#if MXNET_USE_CUDA && !defined(_WIN32) - if (context.dev_mask() == kGPU && - !inlining && - dmlc::GetEnv("MXNET_USE_FUSION", true)) { +#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC && !defined(_WIN32) + if (context.dev_mask() == kGPU && !inlining && dmlc::GetEnv("MXNET_USE_FUSION", true)) { nnvm::Graph unoptimized_graph; common::CopyGraph(&unoptimized_graph, *full_graph, false); @@ -202,7 +200,12 @@ void OptimizeGraph(nnvm::Graph * full_graph, nnvm::Graph * fwd_graph, nnvm::Grap << "Graph contains duplicate names for some of its inputs - fusion is NOT enabled!"; } } -#endif // MXNET_USE_CUDA +#else + // Only warn user if MXNET_USE_FUSION env var is explicitly set + if (context.dev_mask() == kGPU && !inlining && dmlc::GetEnv("MXNET_USE_FUSION", false)) { + exec::WarnFusionNotSupported(); + } +#endif // MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC && !defined(_WIN32) *fwd_graph = nnvm::Graph(); fwd_graph->outputs = std::vector(full_graph->outputs.begin(), diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index 2966fe2ae910..e86ce7682ad8 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -24,7 +24,7 @@ #include #include -#if MXNET_USE_CUDA +#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC namespace mxnet { @@ -992,6 +992,6 @@ const char kernel_end[] = R"code(} } // namespace mxnet -#endif // MXNET_USE_CUDA +#endif // MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC #endif // MXNET_OPERATOR_FUSION_FUSED_OP_INL_H_ diff --git a/src/operator/fusion/fused_op.cc b/src/operator/fusion/fused_op.cc index 5c83c30308c7..5e2d782dd9e0 100644 --- a/src/operator/fusion/fused_op.cc +++ b/src/operator/fusion/fused_op.cc @@ -23,7 +23,7 @@ #include "../operator_common.h" #include "../../executor/exec_pass.h" -#if MXNET_USE_CUDA +#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC namespace mxnet { @@ -302,4 +302,4 @@ NNVM_REGISTER_OP(_FusedOpOutHelper) } // namespace mxnet -#endif // MXNET_USE_CUDA +#endif // MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 78988f13510e..62f340d0e00b 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -17,6 +17,9 @@ * under the License. */ +// Additional use of MXNET_USE_CUDA is not needed to guard a '.cu' file. 
+#if MXNET_ENABLE_CUDA_RTC + #include #include #include @@ -787,3 +790,5 @@ NNVM_REGISTER_OP(_FusedOp) .set_attr("FCompute", FusedOpForwardGPU); } // namespace mxnet + +#endif // MXNET_ENABLE_CUDA_RTC diff --git a/src/operator/fusion/fused_op.h b/src/operator/fusion/fused_op.h index 24603ac1932f..7d714677e941 100644 --- a/src/operator/fusion/fused_op.h +++ b/src/operator/fusion/fused_op.h @@ -20,7 +20,6 @@ #ifndef MXNET_OPERATOR_FUSION_FUSED_OP_H_ #define MXNET_OPERATOR_FUSION_FUSED_OP_H_ - #include #include #include @@ -29,8 +28,7 @@ #include #include -#if MXNET_USE_CUDA - +#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC namespace mxnet { @@ -202,5 +200,6 @@ using FusedOpHelperParamPtr = std::shared_ptr; } // namespace mxnet -#endif // MXNET_USE_CUDA +#endif // MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC + #endif // MXNET_OPERATOR_FUSION_FUSED_OP_H_ From bf849d618afd4283c12f2a886349f4ad8f9ce783 Mon Sep 17 00:00:00 2001 From: Przemyslaw Tredak Date: Wed, 20 Nov 2019 20:53:42 -0800 Subject: [PATCH 4/5] Fix InferAttr/InferShapeAttr not calling inference for all nodes in a graph (#16836) * Fix the attribute inference omitting nodes * Add test * Cleaning * Fix lint * Fix TransposeShape * Fix WhileLoopType * Changing a/b test for fusion to a/(b+1) to increase numerical stability --- src/executor/infer_graph_attr_pass.cc | 201 ++++++++++++++++++-------- src/operator/control_flow.cc | 2 +- src/operator/tensor/matrix_op-inl.h | 4 +- tests/python/gpu/test_fusion.py | 2 +- tests/python/unittest/test_symbol.py | 17 ++- 5 files changed, 159 insertions(+), 67 deletions(-) diff --git a/src/executor/infer_graph_attr_pass.cc b/src/executor/infer_graph_attr_pass.cc index 80e4084c478e..4b6ee2e1dc0d 100644 --- a/src/executor/infer_graph_attr_pass.cc +++ b/src/executor/infer_graph_attr_pass.cc @@ -67,6 +67,7 @@ template inline void GetAttrFromForwardNode(const uint32_t nid, const nnvm::IndexedGraph &idx, std::vector* rshape_ptr, + std::vector* inference_finished, IsNone fis_none) { std::vector& rshape = *rshape_ptr; const nnvm::IndexedGraph::Node& inode = idx[nid]; @@ -83,18 +84,23 @@ inline void GetAttrFromForwardNode(const uint32_t nid, // input gradient list const std::vector& igrad = fgrad[fwd_ptr->op()](fwd_ptr, ograd); const nnvm::Node* igrad_node = nullptr; + bool all_attrs_known = true; // Input gradient assignement for (size_t i = 0; i < igrad.size(); ++i) { if (igrad[i].node->op() == inode.source->op()) { uint32_t eid = idx.entry_id(nid, igrad[i].index); - if (fis_none(rshape[eid])) { - rshape[eid] = rshape[idx.entry_id(fnode.inputs[i])]; - } else if (!fis_none(rshape[idx.entry_id(fnode.inputs[i])])) { + if (fis_none(rshape[idx.entry_id(fnode.inputs[i])])) { // Need to skip empty forward shape, because it may not be // available now and it is possible to infer the forward // shape in one of the next a few passes - CHECK_EQ(rshape[eid], rshape[idx.entry_id(fnode.inputs[i])]) - << "Backward shape inconsistent with the forward shape"; + all_attrs_known = false; + } else { + if (fis_none(rshape[eid])) { + rshape[eid] = rshape[idx.entry_id(fnode.inputs[i])]; + } else { + CHECK_EQ(rshape[eid], rshape[idx.entry_id(fnode.inputs[i])]) + << "Backward shape inconsistent with the forward shape"; + } } if (igrad_node == nullptr) { igrad_node = igrad[i].node.get(); @@ -113,14 +119,20 @@ inline void GetAttrFromForwardNode(const uint32_t nid, if (fis_none(rshape[eid])) { rshape[eid] = rshape[idx.entry_id(inode.control_deps[0], e.index)]; } + if (fis_none(rshape[eid])) { + // If the attr is still unknown + all_attrs_known = 
false; + } } } + (*inference_finished)[nid] = all_attrs_known; } template void GetAttrFromFusedNode(uint32_t nid, const nnvm::IndexedGraph& idx, std::vector* rshape_ptr, + std::vector* inference_finished, IsNone fis_none, const std::string& infer_fusion_name) { std::vector& rshape = *rshape_ptr; @@ -147,19 +159,24 @@ void GetAttrFromFusedNode(uint32_t nid, // input gradient list const std::vector& igrad = fgrad[fwd_ptr->op()](fwd_ptr, ograd); const nnvm::Node* igrad_node = nullptr; + bool all_attrs_known = true; // Set the attributes of output gradients // using attributes of forward node inputs for (size_t i = 0; i < igrad.size(); ++i) { if (igrad[i].node->op() == inode.source->op()) { uint32_t eid = idx.entry_id(nid, igrad[i].index); - if (fis_none(rshape[eid])) { - rshape[eid] = input_attrs[i]; - } else if (!fis_none(input_attrs[i])) { + if (fis_none(input_attrs[i])) { // Need to skip empty forward shape, because it may not be // available now and it is possible to infer the forward // shape in one of the next a few passes - CHECK_EQ(rshape[eid], input_attrs[i]) - << "Backward shape inconsistent with the forward shape"; + all_attrs_known = false; + } else { + if (fis_none(rshape[eid])) { + rshape[eid] = input_attrs[i]; + } else { + CHECK_EQ(rshape[eid], input_attrs[i]) + << "Backward shape inconsistent with the forward shape"; + } } if (igrad_node == nullptr) { igrad_node = igrad[i].node.get(); @@ -180,8 +197,13 @@ void GetAttrFromFusedNode(uint32_t nid, if (fis_none(rshape[eid])) { rshape[eid] = output_attrs[e.index]; } + if (fis_none(rshape[eid])) { + // If the attr is still unknown + all_attrs_known = false; + } } } + (*inference_finished)[nid] = all_attrs_known; } template @@ -270,6 +292,9 @@ nnvm::Graph InferAttr(nnvm::Graph &&ret, Op::GetAttr("TIsBackward"); // reshape shape vector AttrVector rshape; + // vector holding information which operators + // finished attribute inference + std::vector inference_finished(idx.num_nodes(), false); // dispatch mode vector DispatchModeVector dispatch_modes; if (ret.attrs.count(attr_name) != 0) { @@ -340,6 +365,7 @@ nnvm::Graph InferAttr(nnvm::Graph &&ret, // inference step function for nid auto infer_step = [&](uint32_t nid, bool last_iter) { + if (inference_finished[nid]) return; const auto& inode = idx[nid]; const uint32_t num_inputs = inode.inputs.size(); const uint32_t num_outputs = inode.source->num_outputs(); @@ -355,6 +381,9 @@ nnvm::Graph InferAttr(nnvm::Graph &&ret, CHECK(is >> rshape[out_ent_id]) << "Invalid attribute"; } } + if (!fis_none(rshape[out_ent_id])) { + inference_finished[nid] = true; + } // assign a default value to node attribute if (dispatch_mode_name != nullptr) { op::dispatch_mode_assign(&dispatch_modes[nid], default_mode_val); @@ -370,47 +399,66 @@ nnvm::Graph InferAttr(nnvm::Graph &&ret, static auto& is_fusion_helper = Op::GetAttr("TIsFusionHelper"); if (!is_fusion_helper.get(fwd_ptr->op(), false)) { - GetAttrFromForwardNode(nid, idx, &rshape, fis_none); + GetAttrFromForwardNode(nid, idx, &rshape, &inference_finished, fis_none); } else { - GetAttrFromFusedNode(nid, idx, &rshape, fis_none, infer_fusion_name); + GetAttrFromFusedNode(nid, idx, &rshape, &inference_finished, + fis_none, infer_fusion_name); } } else { DispatchMode* dispatch_mode = nullptr; - bool forward_known = true; // Forward operator inference. 
ishape.resize(num_inputs, empty_val); for (uint32_t i = 0; i < ishape.size(); ++i) { ishape[i] = rshape[idx.entry_id(inode.inputs[i])]; - if (fis_none(ishape[i])) forward_known = false; } oshape.resize(num_outputs, empty_val); for (uint32_t i = 0; i < oshape.size(); ++i) { oshape[i] = rshape[idx.entry_id(nid, i)]; - if (fis_none(oshape[i])) forward_known = false; } if (dispatch_mode_name != nullptr) { dispatch_mode = &dispatch_modes[nid]; - if (dispatch_modes[nid] == DispatchMode::kUndefined) forward_known = false; } auto finfer = finfer_shape.get(inode.source->op(), fdefault); - if (!forward_known) { - if (finfer != nullptr) { - // Call inference function of the operator. - try { - static auto& is_fusion = Op::GetAttr("TIsFusion"); - if (is_fusion.get(inode.source->op(), false)) { - ProvideAttrToFusion(nid, idx, rshape, provide_fusion_name); - } - forward_known = ApplyOpInferAttr(ret, finfer, inode.source->attrs, - nid, &ishape, &oshape, dispatch_mode); - } catch (const std::exception& e) { - throw dmlc::Error("Error in operator " + inode.source->attrs.name + ": " + e.what()); + if (finfer != nullptr) { + // Call inference function of the operator. + try { + static auto& is_fusion = Op::GetAttr("TIsFusion"); + if (is_fusion.get(inode.source->op(), false)) { + ProvideAttrToFusion(nid, idx, rshape, provide_fusion_name); } - } else { + ApplyOpInferAttr(ret, finfer, inode.source->attrs, + nid, &ishape, &oshape, dispatch_mode); + bool finished = true; + for (const auto& attr : ishape) { + if (fis_none(attr)) finished = false; + } + for (const auto& attr : oshape) { + if (fis_none(attr)) finished = false; + } + inference_finished[nid] = finished; + } catch (const std::exception& e) { + throw dmlc::Error("Error in operator " + inode.source->attrs.name + ": " + e.what()); + } + } else { + // Operator does not provide attribute inference function, + // so we need to test if everything was inferred by other operators + bool all_attrs_known = true; + for (const auto& attr : ishape) { + if (fis_none(attr)) { + all_attrs_known = false; + } + } + for (const auto& attr : oshape) { + if (fis_none(attr)) { + all_attrs_known = false; + } + } + inference_finished[nid] = all_attrs_known; + if (!all_attrs_known) { CHECK(!last_iter) << "Attribute " << infer_name - << " is not registed by op " << inode.source->op()->name - << " we are not able to complete the inference because of this"; + << " is not registered by op " << inode.source->op()->name + << ". We are not able to complete the inference because of this"; } } // Save to the result map. @@ -427,16 +475,18 @@ nnvm::Graph InferAttr(nnvm::Graph &&ret, size_t num_unknown_dispatch_mode = dispatch_mode_name ?
node_end - node_start : 0; size_t num_unknown_entry_attr = entry_end - entry_start; size_t num_unknown = num_unknown_entry_attr + num_unknown_dispatch_mode; + bool last_iter = false; + bool do_next_iteration = true; int i = 0; do { if (i % 2 == 0) { for (uint32_t nid = node_start; nid < node_end; ++nid) { - infer_step(nid, false); + infer_step(nid, last_iter); } } else { // backward inference for (uint32_t i = node_end; i != node_start; --i) { - infer_step(i - 1, false); + infer_step(i - 1, last_iter); } } last_num_unknown = num_unknown; @@ -451,8 +501,18 @@ nnvm::Graph InferAttr(nnvm::Graph &&ret, if (dispatch_modes[i] == DispatchMode::kUndefined) ++num_unknown; } } + do_next_iteration = num_unknown > 0 && last_num_unknown > num_unknown; + if (!do_next_iteration && !last_iter) { + // Check if every op agrees that it should be + // the end of attribute inference. If not, + // perform one final step + for (const bool done : inference_finished) { + do_next_iteration = do_next_iteration || !done; + } + last_iter = true; + } ++i; - } while (num_unknown > 0 && last_num_unknown > num_unknown); + } while (do_next_iteration); // set the shapes ret.attrs[attr_name] = std::make_shared(std::move(rshape)); // set the shapes @@ -517,6 +577,9 @@ nnvm::Graph InferShapeAttr(nnvm::Graph &&ret, Op::GetAttr("TIsBackward"); // reshape shape vector AttrVector rshape; + // vector holding information which operators + // finished attribute inference + std::vector inference_finished(idx.num_nodes(), false); // dispatch mode vector DispatchModeVector dispatch_modes; if (ret.attrs.count(attr_name) != 0) { @@ -594,6 +657,7 @@ nnvm::Graph InferShapeAttr(nnvm::Graph &&ret, // inference step function for nid auto infer_step = [&](uint32_t nid, bool last_iter) { + if (inference_finished[nid]) return; const auto& inode = idx[nid]; const std::string name = inode.source->attrs.name; const uint32_t num_inputs = inode.inputs.size(); @@ -613,6 +677,9 @@ nnvm::Graph InferShapeAttr(nnvm::Graph &&ret, } } } + if (!fis_none(rshape[out_ent_id])) { + inference_finished[nid] = true; + } // assign a default value to node attribute if (dispatch_mode_name != nullptr) { op::dispatch_mode_assign(&dispatch_modes[nid], default_mode_val); @@ -628,14 +695,15 @@ nnvm::Graph InferShapeAttr(nnvm::Graph &&ret, static auto& is_fusion_helper = Op::GetAttr("TIsFusionHelper"); if (!is_fusion_helper.get(fwd_ptr->op(), false)) { - GetAttrFromForwardNode(nid, idx, &rshape, fis_none); + GetAttrFromForwardNode(nid, idx, &rshape, &inference_finished, fis_none); } else { - GetAttrFromFusedNode(nid, idx, &rshape, fis_none, + GetAttrFromFusedNode(nid, idx, &rshape, + &inference_finished, + fis_none, "FAccessSubgraphShape"); } } else { DispatchMode* dispatch_mode = nullptr; - bool forward_known = true; // Forward operator inference. 
ishape.resize(num_inputs, empty_val); bool is_input_dynamic_shape = false; @@ -644,16 +712,13 @@ nnvm::Graph InferShapeAttr(nnvm::Graph &&ret, if (!mxnet::ndim_is_known(ishape[i]) && is_dynamic[idx.entry_id(inode.inputs[i])]) { is_input_dynamic_shape = true; } - if (fis_none(ishape[i])) forward_known = false; } oshape.resize(num_outputs, empty_val); for (uint32_t i = 0; i < oshape.size(); ++i) { oshape[i] = rshape[idx.entry_id(nid, i)]; - if (fis_none(oshape[i])) forward_known = false; } if (dispatch_mode_name != nullptr) { dispatch_mode = &dispatch_modes[nid]; - if (dispatch_modes[nid] == DispatchMode::kUndefined) forward_known = false; } auto finfer = finfer_shape.get(inode.source->op(), fdefault); if (finfer == nullptr || is_input_dynamic_shape) { @@ -662,25 +727,27 @@ nnvm::Graph InferShapeAttr(nnvm::Graph &&ret, is_dynamic[idx.entry_id(nid, i)] = 1; } } - } else if (!forward_known) { - if (finfer != nullptr) { - // Call inference function of the operator. - try { - static auto& is_fusion = Op::GetAttr("TIsFusion"); - if (is_fusion.get(inode.source->op(), false)) { - ProvideAttrToFusion(nid, idx, rshape, - "FProvideSubgraphShape"); - } - forward_known = ApplyOpInferAttr(ret, finfer, inode.source->attrs, - nid, &ishape, &oshape, dispatch_mode); - } catch (const std::exception& e) { - throw dmlc::Error("Error in operator " + inode.source->attrs.name + ": " + e.what()); + inference_finished[nid] = true; + } else { + // Call inference function of the operator. + try { + static auto& is_fusion = Op::GetAttr("TIsFusion"); + if (is_fusion.get(inode.source->op(), false)) { + ProvideAttrToFusion(nid, idx, rshape, + "FProvideSubgraphShape"); } - } else { - CHECK(!last_iter) - << "Attribute " << infer_name - << " is not registed by op " << inode.source->op()->name - << " we are not able to complete the inference because of this"; + ApplyOpInferAttr(ret, finfer, inode.source->attrs, + nid, &ishape, &oshape, dispatch_mode); + bool finished = true; + for (const auto& attr : ishape) { + if (fis_none(attr)) finished = false; + } + for (const auto& attr : oshape) { + if (fis_none(attr)) finished = false; + } + inference_finished[nid] = finished; + } catch (const std::exception& e) { + throw dmlc::Error("Error in operator " + inode.source->attrs.name + ": " + e.what()); } } // Save to the result map. @@ -695,18 +762,20 @@ nnvm::Graph InferShapeAttr(nnvm::Graph &&ret, size_t last_num_unknown; size_t num_unknown = static_cast(-1); // Infinity + bool last_iter = false; + bool do_next_iteration = true; int i = 0; do { if (i % 2 == 0) { // forward inference for (uint32_t nid = node_start; nid < node_end; ++nid) { - infer_step(nid, false); + infer_step(nid, last_iter); } } else { // backward inference for (uint32_t i = node_end; i != node_start; --i) { - infer_step(i - 1, false); + infer_step(i - 1, last_iter); } } last_num_unknown = num_unknown; @@ -723,8 +792,18 @@ nnvm::Graph InferShapeAttr(nnvm::Graph &&ret, } } } + do_next_iteration = num_unknown > 0 && last_num_unknown > num_unknown; + if (!do_next_iteration && !last_iter) { + // Check if every op agrees that it should be + // the end of attribute inference. 
If not, + // perform one final step + for (const bool done : inference_finished) { + do_next_iteration = do_next_iteration || !done; + } + last_iter = true; + } ++i; - } while (num_unknown > 0 && last_num_unknown > num_unknown); + } while (do_next_iteration); // set the shapes ret.attrs[attr_name] = std::make_shared(std::move(rshape)); // set the shapes diff --git a/src/operator/control_flow.cc b/src/operator/control_flow.cc index fd087ef39679..a9e9038e6c51 100644 --- a/src/operator/control_flow.cc +++ b/src/operator/control_flow.cc @@ -761,7 +761,7 @@ static bool WhileLoopType(const nnvm::NodeAttrs& attrs, std::vector func_in_type; extract_by_loc(*in_type, params.cond_input_locs, &cond_in_type); extract_by_loc(*in_type, params.func_input_locs, &func_in_type); - std::vector cond_out_type = {0}; + std::vector cond_out_type = {-1}; CHECK(params.sync_in_out(in_type, out_type, is_udf)); bool succ_0 = InferSubgraphDataType(*attrs.subgraphs[0], &cond_in_type, &cond_out_type); CHECK(params.sync_in_out(in_type, out_type, is_udf)); diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h index 801e4e7126b4..0fee2a26c0ed 100644 --- a/src/operator/tensor/matrix_op-inl.h +++ b/src/operator/tensor/matrix_op-inl.h @@ -420,11 +420,9 @@ inline bool TransposeShape(const nnvm::NodeAttrs& attrs, mxnet::TShape& shp = (*in_attrs)[0]; mxnet::TShape& out_shp = (*out_attrs)[0]; CHECK_LE(shp.ndim(), 6) << "Transpose support at most 6 dimensions"; - CHECK_NE(shp.ndim(), 0) << "Number of dimensions cannot be 0"; - CHECK_NE(out_shp.ndim(), 0) << "Number of dimensions cannot be 0"; if (shp.ndim() == -1 && out_shp.ndim() == -1) return false; // none of the shapes is known - if (out_shp.ndim() > 0 && shp.ndim() > 0) + if (out_shp.ndim() >= 0 && shp.ndim() >= 0) CHECK_EQ(out_shp.ndim(), shp.ndim()); mxnet::TShape get(std::max(shp.ndim(), out_shp.ndim()), -1); mxnet::TShape ret(std::max(shp.ndim(), out_shp.ndim()), -1); diff --git a/tests/python/gpu/test_fusion.py b/tests/python/gpu/test_fusion.py index 693336f22496..24e33019f617 100644 --- a/tests/python/gpu/test_fusion.py +++ b/tests/python/gpu/test_fusion.py @@ -172,7 +172,7 @@ def check_binary_ops(): check_fused_symbol(3-a, a=arr1) check_fused_symbol(a*b, a=arr1, b=arr2) check_fused_symbol(a*3, a=arr1) - check_fused_symbol(a/b, a=arr1, b=arr2) + check_fused_symbol(a/(b+1), a=arr1, b=arr2) check_fused_symbol(a/3, a=arr1) check_fused_symbol(3/a, a=arr1) check_fused_symbol(a**b, a=arr1, b=arr2) diff --git a/tests/python/unittest/test_symbol.py b/tests/python/unittest/test_symbol.py index a2aad2c079fc..8e4fe11905cf 100644 --- a/tests/python/unittest/test_symbol.py +++ b/tests/python/unittest/test_symbol.py @@ -413,7 +413,7 @@ def test_gen_atomic_symbol_multiple_outputs(): p = mx.sym.Variable('param') h0 = mx.sym.Variable('h0') h1 = mx.sym.Variable('h1') - s = mx.sym.RNN(data, p, h0, h1, state_size=10, num_layers=2, + s = mx.sym.RNN(data, p, h0, h1, state_size=10, num_layers=2, bidirectional=True, state_outputs=True, mode='lstm') atomic_sym = s._gen_atomic_symbol() @@ -542,6 +542,21 @@ def get_net(): assert out_shapes[0] == (batch_size, num_hdidden) # output assert len(aux_shapes) == 0 +def test_infershape_happens_for_all_ops_in_graph(): + v = mx.sym.Variable('V') + s = mx.sym.transpose(v) + x = mx.sym.Variable('x') + s2 = x + v + s3 = s + s2 + with discard_stderr(): + try: + # This should throw an exception as you cannot add arrays + # with shapes [2,3] and [3,2] + e = s3.simple_bind(ctx=mx.cpu(), x=(2,3), grad_req='null') + except: + return 
+ + assert False if __name__ == '__main__': import nose From 391695daf314ff39ec44732639968f7ddb1e5ccf Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 21 Nov 2019 15:49:46 -0800 Subject: [PATCH 5/5] Revert "Mixed precison binary op backward (use in) for numpy (#16791)" This reverts commit 8b58b78c78c080650ea10bb205eb897da81b07b2. --- .../numpy/np_elemwise_broadcast_op.cc | 17 +-- .../numpy/np_elemwise_broadcast_op.cu | 4 - src/operator/numpy/np_elemwise_broadcast_op.h | 104 +----------------- .../tensor/elemwise_binary_broadcast_op.h | 26 ----- src/operator/tensor/elemwise_unary_op.h | 4 +- tests/python/unittest/test_numpy_op.py | 20 ++-- 6 files changed, 13 insertions(+), 162 deletions(-) diff --git a/src/operator/numpy/np_elemwise_broadcast_op.cc b/src/operator/numpy/np_elemwise_broadcast_op.cc index acf0395123fc..a76e59d30dc6 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op.cc +++ b/src/operator/numpy/np_elemwise_broadcast_op.cc @@ -147,22 +147,7 @@ MXNET_OPERATOR_REGISTER_NP_BINARY_MIXED_PRECISION(_npi_multiply) "FCompute", NumpyBinaryBroadcastComputeWithBool) #endif -.set_attr("FGradient", ElemwiseGradUseIn{"_backward_npi_broadcast_mul"}); - -NNVM_REGISTER_OP(_backward_npi_broadcast_mul) -.set_num_inputs(3) -.set_num_outputs(2) -.set_attr("TIsBackward", true) -.set_attr("FInplaceOption", - [](const NodeAttrs& attrs){ - return std::vector >{{0, 1}}; - }) -.set_attr("FResourceRequest", - [](const NodeAttrs& attrs) { - return std::vector{ResourceRequest::kTempSpace}; - }) -.set_attr("FCompute", NumpyBinaryBackwardUseIn); +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_broadcast_mul"}); MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_npi_mod) .set_attr("FCompute", BinaryBroadcastCompute) diff --git a/src/operator/numpy/np_elemwise_broadcast_op.cu b/src/operator/numpy/np_elemwise_broadcast_op.cu index d9499625e34d..a0a277df211f 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op.cu +++ b/src/operator/numpy/np_elemwise_broadcast_op.cu @@ -64,10 +64,6 @@ NNVM_REGISTER_OP(_npi_multiply) NumpyBinaryBroadcastComputeWithBool); #endif -NNVM_REGISTER_OP(_backward_npi_broadcast_mul) -.set_attr("FCompute", NumpyBinaryBackwardUseIn); - NNVM_REGISTER_OP(_npi_mod) .set_attr("FCompute", BinaryBroadcastCompute); diff --git a/src/operator/numpy/np_elemwise_broadcast_op.h b/src/operator/numpy/np_elemwise_broadcast_op.h index 179acd279ab6..1a4596fba91c 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op.h +++ b/src/operator/numpy/np_elemwise_broadcast_op.h @@ -25,7 +25,6 @@ #ifndef MXNET_OPERATOR_NUMPY_NP_ELEMWISE_BROADCAST_OP_H_ #define MXNET_OPERATOR_NUMPY_NP_ELEMWISE_BROADCAST_OP_H_ -#include #include #include @@ -382,13 +381,11 @@ void NumpyBinaryBroadcastComputeWithBool(const nnvm::NodeAttrs& attrs, } template -void NumpyBinaryBackwardUseIn(const nnvm::NodeAttrs& attrs, +void MixedBinaryBackwardUseIn(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - using namespace mshadow; - using namespace mxnet_op; CHECK_EQ(inputs.size(), 3U); CHECK_EQ(outputs.size(), 2U); @@ -399,104 +396,7 @@ void NumpyBinaryBackwardUseIn(const nnvm::NodeAttrs& attrs, return; } - const TBlob& ograd = inputs[0]; - const TBlob& lgrad = outputs[0]; - const TBlob& rgrad = outputs[1]; - - if (common::is_float(lhs.type_flag_) || common::is_float(rhs.type_flag_)) { - // If any of the inputs is a float, it's the same type as the output - // So 2 of the 3 tensors have the same data type - Stream *s = ctx.get_stream(); - 
mxnet::TShape new_lshape, new_rshape, new_oshape; - using namespace broadcast; - const bool need_bc = BinaryBroadcastShapeCompact(lgrad.shape_, rgrad.shape_, ograd.shape_, - &new_lshape, &new_rshape, &new_oshape) != 0; - - // Prepare all the temporary memory - size_t workspace_size_l = 0, workspace_size_r = 0; - TBlob temp_tblob; // The TBlob for casted input data - TBlob temp_igrad; // The TBlob for casted grad results - size_t tensor_size = (lgrad.type_flag_ != ograd.type_flag_) ? lgrad.Size() : rgrad.Size(); - Tensor workspace; - - MSHADOW_TYPE_SWITCH(ograd.type_flag_, OType, { - BROADCAST_NDIM_SWITCH(new_oshape.ndim(), ndim, { - workspace_size_l = ReduceWorkspaceSize( - s, new_lshape, req[0], new_oshape, new_lshape, new_rshape); - workspace_size_r = ReduceWorkspaceSize( - s, new_rshape, req[1], new_oshape, new_lshape, new_rshape); - }); - size_t workspace_size = std::max(workspace_size_l, workspace_size_r); - size_t cast_tensor_size = tensor_size * sizeof(OType); - // Allocate the temporary memories now - Tensor temp_space = - ctx.requested[0].get_space_typed( - Shape1(workspace_size + cast_tensor_size * 2), s); - // Tensor for temp_tblob - Tensor temp_tblob_tensor( - reinterpret_cast(temp_space.dptr_), - Shape1(tensor_size), s); - // Tensor for temp_igrad - Tensor temp_igrad_tensor( - reinterpret_cast(temp_space.dptr_) + tensor_size, - Shape1(tensor_size), s); - temp_tblob = - TBlob(temp_tblob_tensor) - .reshape(((lgrad.type_flag_ != ograd.type_flag_) ? lhs.shape_ : rhs.shape_)); - temp_igrad = - TBlob(temp_igrad_tensor) - .reshape(((lgrad.type_flag_ != ograd.type_flag_) ? lhs.shape_ : rhs.shape_)); - if (temp_igrad.Size() != 0) { - Kernel::Launch(s, temp_igrad.Size(), temp_igrad.dptr()); - } - workspace = - Tensor(temp_space.dptr_ + 2 * cast_tensor_size, Shape1(workspace_size), s); - }); - // Cast the input that does not have consistent type to temp_tblob - CastCompute( - attrs, ctx, {((lgrad.type_flag_ != ograd.type_flag_) ? lhs : rhs)}, {kWriteTo}, {temp_tblob}); - if (!need_bc) { - if (lhs.type_flag_ != ograd.type_flag_) { - ElemwiseBinaryOp::BackwardUseIn( - attrs, ctx, {ograd, temp_tblob, rhs}, {kWriteTo, req[1]}, {temp_igrad, rgrad}); - } else { - ElemwiseBinaryOp::BackwardUseIn( - attrs, ctx, {ograd, lhs, temp_tblob}, {req[0], kWriteTo}, {lgrad, temp_igrad}); - } - } else { - if (lhs.type_flag_ != ograd.type_flag_) { - MSHADOW_TYPE_SWITCH(ograd.type_flag_, DType, { - BROADCAST_NDIM_SWITCH(new_oshape.ndim(), NDim, { - BinaryBroadcastBackwardUseInImplWithWorkspace( - ctx, {ograd, temp_tblob, rhs}, {kWriteTo, req[1]}, {temp_igrad, rgrad}, - workspace, new_lshape, new_rshape, new_oshape); - }); - }); - } else { - MSHADOW_TYPE_SWITCH(ograd.type_flag_, DType, { - BROADCAST_NDIM_SWITCH(new_oshape.ndim(), NDim, { - BinaryBroadcastBackwardUseInImplWithWorkspace( - ctx, {ograd, lhs, temp_tblob}, {req[0], kWriteTo}, {lgrad, temp_igrad}, - workspace, new_lshape, new_rshape, new_oshape); - }); - }); - } - } - - // If both inputs are floating numbers, cast the igrad to the input that has - // the different data type - if (common::is_float(lhs.type_flag_) && common::is_float(rhs.type_flag_)) { - if (lhs.type_flag_ != ograd.type_flag_) { - CastCompute(attrs, ctx, {temp_igrad}, {req[0]}, {lgrad}); - } else { - CastCompute(attrs, ctx, {temp_igrad}, {req[1]}, {rgrad}); - } - } - } else { - // Case where both inputs are integer types, should not even do - // backward computation for this case. 
- PrintErrorMessage(attrs.op->name, lhs.type_flag_, rhs.type_flag_); - } + PrintErrorMessage(attrs.op->name, lhs.type_flag_, rhs.type_flag_); } } // namespace op diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.h b/src/operator/tensor/elemwise_binary_broadcast_op.h index 69fc965c03b8..b48ed389ba98 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op.h +++ b/src/operator/tensor/elemwise_binary_broadcast_op.h @@ -671,32 +671,6 @@ BinaryBroadcastBackwardUseNone(const nnvm::NodeAttrs& attrs, const std::vector& req, const std::vector& outputs); -template -void BinaryBroadcastBackwardUseInImplWithWorkspace(const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs, - const mshadow::Tensor& workspace, - const mxnet::TShape& new_lshape, - const mxnet::TShape& new_rshape, - const mxnet::TShape& new_oshape) { - using namespace mshadow; - using namespace mshadow::expr; - using namespace broadcast; - Stream *s = ctx.get_stream(); - const TBlob lgrad = outputs[0].reshape(new_lshape); - const TBlob rgrad = outputs[1].reshape(new_rshape); - const TBlob ograd = inputs[0].reshape(new_oshape); - const TBlob lhs = inputs[1].reshape(new_lshape); - const TBlob rhs = inputs[2].reshape(new_rshape); - if (ograd.Size() != 0) { - Reduce(s, lgrad, req[0], workspace, - ograd, lhs, rhs); - Reduce(s, rgrad, req[1], workspace, - ograd, lhs, rhs); - } -} - template inline void BinaryBroadcastBackwardUseInImpl(const OpContext& ctx, const std::vector& inputs, diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h index 8886e15e3972..188ccd68a340 100644 --- a/src/operator/tensor/elemwise_unary_op.h +++ b/src/operator/tensor/elemwise_unary_op.h @@ -453,8 +453,8 @@ void CastCompute(const nnvm::NodeAttrs& attrs, Tensor out = outputs[0].FlatTo1D(s); MSHADOW_TYPE_SWITCH_WITH_BOOL(inputs[0].type_flag_, SrcDType, { Tensor data = inputs[0].FlatTo1D(s); - if ((outputs[0].type_flag_ != inputs[0].type_flag_ || - req[0] != kWriteInplace) && outputs[0].Size() != 0) { + if (outputs[0].type_flag_ != inputs[0].type_flag_ || + req[0] != kWriteInplace) { Assign(out, req[0], tcast(data)); } }); diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 85eaadd4ffe8..9a36e061ed4c 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -1683,9 +1683,7 @@ def hybrid_forward(self, F, a, b, *args, **kwargs): @with_seed() @use_np def test_np_mixed_precision_binary_funcs(): - itypes = [np.bool, np.int8, np.int32, np.int64] - ftypes = [np.float16, np.float32, np.float64] - def check_mixed_precision_binary_func(func, low, high, lshape, rshape, lgrad, rgrad, ltype, rtype): + def check_mixed_precision_binary_func(func, low, high, lshape, rshape, ltype, rtype): class TestMixedBinary(HybridBlock): def __init__(self, func): super(TestMixedBinary, self).__init__() @@ -1719,15 +1717,13 @@ def hybrid_forward(self, F, a, b, *args, **kwargs): use_broadcast=False, equal_nan=True) funcs = { - 'add': (-1.0, 1.0, None, None), - 'subtract': (-1.0, 1.0, None, None), - 'multiply': (-1.0, 1.0, lambda y, x1, x2: _np.broadcast_to(x2, y.shape), - lambda y, x1, x2: _np.broadcast_to(x1, y.shape)) + 'add': (-1.0, 1.0), + 'subtract': (-1.0, 1.0), + 'multiply': (-1.0, 1.0), } shape_pairs = [((3, 2), (3, 2)), ((3, 2), (3, 1)), - ((3, 0), (3, 0)), ((3, 1), (3, 0)), ((0, 2), (1, 2)), ((2, 3, 4), (3, 1)), @@ -1737,16 +1733,16 @@ def hybrid_forward(self, F, a, b, *args, **kwargs): itypes = 
[np.bool, np.int8, np.int32, np.int64] ftypes = [np.float16, np.float32, np.float64] for func, func_data in funcs.items(): - low, high, lgrad, rgrad = func_data + low, high = func_data for lshape, rshape in shape_pairs: for type1, type2 in itertools.product(itypes, ftypes): - check_mixed_precision_binary_func(func, low, high, lshape, rshape, lgrad, rgrad, type1, type2) - check_mixed_precision_binary_func(func, low, high, lshape, rshape, lgrad, rgrad, type2, type1) + check_mixed_precision_binary_func(func, low, high, lshape, rshape, type1, type2) + check_mixed_precision_binary_func(func, low, high, lshape, rshape, type2, type1) for type1, type2 in itertools.product(ftypes, ftypes): if type1 == type2: continue - check_mixed_precision_binary_func(func, low, high, lshape, rshape, lgrad, rgrad, type1, type2) + check_mixed_precision_binary_func(func, low, high, lshape, rshape, type1, type2) @with_seed()