diff --git a/src/operator/nn/cudnn/cudnn_convolution-inl.h b/src/operator/nn/cudnn/cudnn_convolution-inl.h index 66df82e4395e..55b263896339 100644 --- a/src/operator/nn/cudnn/cudnn_convolution-inl.h +++ b/src/operator/nn/cudnn/cudnn_convolution-inl.h @@ -521,7 +521,19 @@ class CuDNNConvolutionOp { wshape[1], wshape[2], wshape[3])); - +#if CUDNN_VERSION >= 7301 && CUDNN_VERSION < 7500 + auto kernel_h = wshape[2]; + auto kernel_w = wshape[3]; + auto stride_h = stride[0]; + auto stride_w = stride[1]; + auto pad_h = pad[0]; + auto pad_w = pad[1]; + if (param_.layout.value() == kNCHW && + (((stride_h == 2) && (kernel_h % 2 == 0) && (pad_h % 2 == 0)) || + ((stride_w == 2) && (kernel_w % 2 == 0) && (pad_w % 2 == 0)))) { + exclude_dgrad_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING; + } +#endif } else if (param_.kernel.ndim() == 3) { // 3d conv #if CUDNN_MAJOR >= 5 @@ -714,7 +726,7 @@ class CuDNNConvolutionOp { bwd_data_results.resize(actual_bwd_data_algos); AlgoFinalSelect(bwd_data_results, "backprop-to-data", - workspace_byte, bwd); + workspace_byte, bwd, exclude_dgrad_algo_); #else // CUDNN_MAJOR < 7 const int kMaxAlgos = 10; @@ -910,12 +922,14 @@ class CuDNNConvolutionOp { // workspace constraints. template <typename PerfType, typename AlgoType> void AlgoFinalSelect(const std::vector<PerfType> &perf_results, std::string kernel_name, - size_t workspace_byte, CuDNNAlgo<AlgoType> *algo) { + size_t workspace_byte, CuDNNAlgo<AlgoType> *algo, + int32_t algo_exclude = -1) { // Determine the fastest acceptable algo that matches the algo_preference (-1 = any), // regardless of mathType. 
bool enforce_determinism = dmlc::GetEnv("MXNET_ENFORCE_DETERMINISM", false); for (decltype(perf_results.size()) i = 0; i != perf_results.size(); ++i) { const auto &result = perf_results[i]; + bool algo_exclusion = static_cast<int32_t>(result.algo) == algo_exclude; bool algo_is_tensor_core = false; #if CUDNN_MAJOR >= 7 algo_is_tensor_core = result.mathType == CUDNN_TENSOR_OP_MATH; @@ -924,7 +938,8 @@ class CuDNNConvolutionOp { #if CUDNN_MAJOR >= 7 (!enforce_determinism || result.determinism == cudnnDeterminism_t::CUDNN_DETERMINISTIC) && #endif - (param_.cudnn_tune.value() == conv::kFastest || result.memory <= workspace_byte)) { + (param_.cudnn_tune.value() == conv::kFastest || result.memory <= workspace_byte) && + !algo_exclusion) { algo->Set(result.algo, algo_is_tensor_core); return; } @@ -1104,6 +1119,8 @@ class CuDNNConvolutionOp { bool cudnn_tensor_core_; // Is req[kWeight] == conv::kAddTo ? bool add_to_weight_; + // Is there a dgrad algo that should be avoided (-1 == none)? + int32_t exclude_dgrad_algo_ = -1; ConvolutionParam param_; }; #endif // __CUDACC__ && CUDNN diff --git a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h index ec95d2be3309..47f688c8ab9c 100644 --- a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h +++ b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h @@ -446,6 +446,19 @@ class CuDNNDeconvolutionOp { wshape[1], wshape[2], wshape[3])); +#if CUDNN_VERSION >= 7301 && CUDNN_VERSION < 7500 + auto kernel_h = wshape[2]; + auto kernel_w = wshape[3]; + auto stride_h = stride[0]; + auto stride_w = stride[1]; + auto pad_h = o_pad[0]; + auto pad_w = o_pad[1]; + if (param_.layout.value() == kNCHW && + (((stride_h == 2) && (kernel_h % 2 == 0) && (pad_h % 2 == 0)) || + ((stride_w == 2) && (kernel_w % 2 == 0) && (pad_w % 2 == 0)))) { + exclude_dgrad_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING; + } +#endif } else if (param_.kernel.ndim() == 3) { // 3d conv index_t o_pad[3]; @@ -618,7 +631,7 @@ class 
CuDNNDeconvolutionOp { bwd_data_results.resize(actual_bwd_data_algos); AlgoFinalSelect(bwd_data_results, "backprop-to-data", - workspace_byte, bwd); + workspace_byte, bwd, exclude_dgrad_algo_); #else // CUDNN_MAJOR < 7 const int kMaxAlgos = 10; @@ -829,11 +842,13 @@ class CuDNNDeconvolutionOp { // workspace constraints and a possible user algo preference. template <typename PerfType, typename AlgoType> void AlgoFinalSelect(const std::vector<PerfType> &perf_results, std::string kernel_name, - size_t workspace_byte, CuDNNAlgo<AlgoType> *algo) { + size_t workspace_byte, CuDNNAlgo<AlgoType> *algo, + int32_t algo_exclude = -1) { // Determine the fastest acceptable algo regardless of mathType. bool enforce_determinism = dmlc::GetEnv("MXNET_ENFORCE_DETERMINISM", false); for (decltype(perf_results.size()) i = 0; i != perf_results.size(); ++i) { const auto &result = perf_results[i]; + bool algo_exclusion = static_cast<int32_t>(result.algo) == algo_exclude; bool algo_is_tensor_core = false; #if CUDNN_MAJOR >= 7 algo_is_tensor_core = result.mathType == CUDNN_TENSOR_OP_MATH; @@ -842,7 +857,8 @@ class CuDNNDeconvolutionOp { #if CUDNN_MAJOR >= 7 (!enforce_determinism || result.determinism == cudnnDeterminism_t::CUDNN_DETERMINISTIC) && #endif - (param_.cudnn_tune.value() != conv::kLimited || result.memory <= workspace_byte)) { + (param_.cudnn_tune.value() != conv::kLimited || result.memory <= workspace_byte) && + !algo_exclusion) { algo->Set(result.algo, algo_is_tensor_core); return; } @@ -1025,6 +1041,8 @@ class CuDNNDeconvolutionOp { bool cudnn_tensor_core_; // Is req[kWeight] == deconv::kAddTo ? bool add_to_weight_; + // Is there a dgrad algo that should be avoided (-1 == none)? 
+ int32_t exclude_dgrad_algo_ = -1; DeconvolutionParam param_; }; #endif // CUDNN diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index fdc8a5b30828..ca4bbcae3168 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -522,6 +522,32 @@ def test_convolution_options(): check_consistency_NxM([sym, sym_no_cudnn], ctx_list) +@with_seed() +def test_conv_deconv_guards(): + # Test cases for convolution and deconvolution via strided fft. Ensure that the framework + # guards against problematic CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING in cuDNN [7.3.1,7.5) + # see https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/rel_750.html#rel_750 + tol = 1e-1 + for (op, opname) in [(mx.sym.Convolution, 'conv'), (mx.sym.Deconvolution, 'deconv')]: + dataname = opname + '_data' + ctx = {'ctx': mx.gpu(0), dataname: (32, 32, 64, 64), 'type_dict': {dataname: np.float32}} + test_cases = [ + {'num_filter':32, 'kernel':(6,6), 'pad':(0,0), 'stride':(2,2), 'name': opname}, + {'num_filter':32, 'kernel':(6,6), 'pad':(1,1), 'stride':(2,2), 'name': opname}, + {'num_filter':32, 'kernel':(6,7), 'pad':(0,1), 'stride':(2,2), 'name': opname}, + {'num_filter':32, 'kernel':(7,6), 'pad':(1,0), 'stride':(2,2), 'name': opname}, + {'num_filter':32, 'kernel':(7,7), 'pad':(0,0), 'stride':(2,2), 'name': opname}, + {'num_filter':32, 'kernel':(7,7), 'pad':(1,1), 'stride':(2,2), 'name': opname}] + for test_case_args in test_cases: + try: + sym = op(**test_case_args) + sym_no_cudnn = op(cudnn_off=True, **test_case_args) + check_consistency([sym, sym_no_cudnn], [ctx, ctx], tol=tol) + except: + print('Test failure of mx.sym.{} with args: {}'.format(op.__name__, test_case_args)) + raise + + def _conv_with_num_streams(seed): with random_seed(seed): # Try to expose timing-dependent improper workspace sharing by parallel dgrad and wgrad