From 817bad0c9b0250255c8035f88e4a10525a051b2a Mon Sep 17 00:00:00 2001
From: Dick Carter
Date: Sun, 3 Mar 2019 14:40:59 -0800
Subject: [PATCH 1/9] Add test exposing issue with conv dgrad algo 3 for some cudnn's.

---
 tests/python/gpu/test_operator_gpu.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index 4dbf82edd3f5..12796c712fe9 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -521,6 +521,31 @@ def test_convolution_options():
     check_consistency_NxM([sym, sym_no_cudnn], ctx_list)
 
 
+@with_seed()
+def test_conv_deconv_guards():
+    # Test cases for convolution and deconvolution via strided fft. Ensure that the framework
+    # guards against problematic CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING in cuDNN [7.3.1,7.5)
+    tol = 1e-2
+    for (op, opname) in [(mx.sym.Convolution, 'conv'), (mx.sym.Deconvolution, 'deconv')]:
+        dataname = opname + '_data'
+        ctx = {'ctx': mx.gpu(0), dataname: (32, 32, 64, 64), 'type_dict': {dataname: np.float32}}
+        test_cases = [
+            {'num_filter':32, 'kernel':(6,6), 'pad':(0,0), 'stride':(2,2), 'name': opname},
+            {'num_filter':32, 'kernel':(6,6), 'pad':(1,1), 'stride':(2,2), 'name': opname},
+            {'num_filter':32, 'kernel':(6,7), 'pad':(0,1), 'stride':(2,2), 'name': opname},
+            {'num_filter':32, 'kernel':(7,6), 'pad':(1,0), 'stride':(2,2), 'name': opname},
+            {'num_filter':32, 'kernel':(7,7), 'pad':(0,0), 'stride':(2,2), 'name': opname},
+            {'num_filter':32, 'kernel':(7,7), 'pad':(1,1), 'stride':(2,2), 'name': opname}]
+        for test_case_args in test_cases:
+            try:
+                sym = op(**test_case_args)
+                sym_no_cudnn = op(cudnn_off=True, **test_case_args)
+                check_consistency([sym, sym_no_cudnn], [ctx, ctx], tol=tol)
+            except:
+                print('Test failure of mx.sym.{} with args: {}'.format(op.__name__, test_case_args))
+                raise
+
+
 # Helper function to run tests in a subprocess to avoid save/restore of os.environ.
 # Also avoids issues of cached environment variable lookups in the backend.
 def _test_in_separate_process(func, env, *args):

From 1cb743bd2313ca5d1845dbdda6cccd9dcaa5d30c Mon Sep 17 00:00:00 2001
From: Dick Carter
Date: Sun, 3 Mar 2019 15:23:38 -0800
Subject: [PATCH 2/9] Add test temporarily to tests run with tensorrt CI build (cuda10, cudnn7.4.2)

---
 ci/docker/runtime_functions.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index de1b7795ce69..848f15267270 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -832,6 +832,7 @@ unittest_ubuntu_tensorrt_gpu() {
     export CUDNN_VERSION=7.0.3
     python tests/python/tensorrt/lenet5_train.py
     nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_trt_gpu.xml --verbose --nocapture tests/python/tensorrt/
+    nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_trt_gpu.xml --verbose --nocapture tests/python/gpu/test_operator_gpu.py:test_conv_deconv_guards
 }
 
 # quantization gpu currently only runs on P3 instances

From 0cbd0c783f31f8fbf72cad146c2507849b03f538 Mon Sep 17 00:00:00 2001
From: Dick Carter
Date: Mon, 4 Mar 2019 10:05:14 -0800
Subject: [PATCH 3/9] Relax tol of new test.

---
 tests/python/gpu/test_operator_gpu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index 12796c712fe9..b77229655a6b 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -525,7 +525,7 @@ def test_convolution_options():
 def test_conv_deconv_guards():
     # Test cases for convolution and deconvolution via strided fft. Ensure that the framework
     # guards against problematic CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING in cuDNN [7.3.1,7.5)
-    tol = 1e-2
+    tol = 1e-1
     for (op, opname) in [(mx.sym.Convolution, 'conv'), (mx.sym.Deconvolution, 'deconv')]:
         dataname = opname + '_data'
         ctx = {'ctx': mx.gpu(0), dataname: (32, 32, 64, 64), 'type_dict': {dataname: np.float32}}

From 8c82498b3098379d2a2e97979186fa01c3aa84ab Mon Sep 17 00:00:00 2001
From: Dick Carter
Date: Sun, 3 Mar 2019 14:50:07 -0800
Subject: [PATCH 4/9] Fix for problematic conv dgrad algo 3 for some cuDNNs.

---
 src/operator/nn/cudnn/cudnn_convolution-inl.h | 22 ++++++++++++++++---
 .../nn/cudnn/cudnn_deconvolution-inl.h        | 21 ++++++++++++++++--
 2 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/src/operator/nn/cudnn/cudnn_convolution-inl.h b/src/operator/nn/cudnn/cudnn_convolution-inl.h
index e11f7cc81d25..ea1c18932f31 100644
--- a/src/operator/nn/cudnn/cudnn_convolution-inl.h
+++ b/src/operator/nn/cudnn/cudnn_convolution-inl.h
@@ -521,7 +521,19 @@ class CuDNNConvolutionOp {
                                             wshape[1],
                                             wshape[2],
                                             wshape[3]));
-
+#if CUDNN_VERSION >= 7301 && CUDNN_VERSION < 7500
+      auto kernel_h = wshape[2];
+      auto kernel_w = wshape[3];
+      auto stride_h = stride[0];
+      auto stride_w = stride[1];
+      auto pad_h = pad[0];
+      auto pad_w = pad[1];
+      if (param_.layout.value() == kNCHW &&
+          (((stride_h == 2) && (kernel_h % 2 == 0) && (pad_h % 2 == 0)) ||
+           ((stride_w == 2) && (kernel_w % 2 == 0) && (pad_w % 2 == 0)))) {
+        exclude_dgrad_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING;
+      }
+#endif
     } else if (param_.kernel.ndim() == 3) {
       // 3d conv
 #if CUDNN_MAJOR >= 5
@@ -714,7 +726,7 @@ class CuDNNConvolutionOp {
         bwd_data_results.resize(actual_bwd_data_algos);
         AlgoFinalSelect(bwd_data_results, "backprop-to-data",
-                        workspace_byte, bwd);
+                        workspace_byte, bwd, exclude_dgrad_algo_);
 #else  // CUDNN_MAJOR < 7
       const int kMaxAlgos = 10;
@@ -910,12 +922,14 @@ class CuDNNConvolutionOp {
   // workspace constraints.
   template <typename PerfType, typename AlgoType>
   void AlgoFinalSelect(const std::vector<PerfType> &perf_results, std::string kernel_name,
-                       size_t workspace_byte, CuDNNAlgo<AlgoType> *algo) {
+                       size_t workspace_byte, CuDNNAlgo<AlgoType> *algo,
+                       int32_t algo_exclude = -1) {
     // Determine the fastest acceptable algo that matches the algo_preference (-1 = any),
     // regardless of mathType.
     bool enforce_determinism = dmlc::GetEnv("MXNET_ENFORCE_DETERMINISM", false);
     for (decltype(perf_results.size()) i = 0; i != perf_results.size(); ++i) {
       const auto &result = perf_results[i];
+      bool algo_exclusion = static_cast<int32_t>(result.algo) == algo_exclude;
       bool algo_is_tensor_core = false;
 #if CUDNN_MAJOR >= 7
       algo_is_tensor_core = result.mathType == CUDNN_TENSOR_OP_MATH;
 #endif
@@ -1104,6 +1118,8 @@ class CuDNNConvolutionOp {
   bool cudnn_tensor_core_;
   // Is req[kWeight] == conv::kAddTo ?
   bool add_to_weight_;
+  // Is there a dgrad algo that should be avoided (-1 == none)?
+  int32_t exclude_dgrad_algo_ = -1;
   ConvolutionParam param_;
 };
 #endif  // __CUDACC__ && CUDNN
diff --git a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h
index ec95d2be3309..e25a9a9e1c3b 100644
--- a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h
+++ b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h
@@ -446,6 +446,19 @@ class CuDNNDeconvolutionOp {
                                             wshape[1],
                                             wshape[2],
                                             wshape[3]));
+#if CUDNN_VERSION >= 7301 && CUDNN_VERSION < 7500
+      auto kernel_h = wshape[2];
+      auto kernel_w = wshape[3];
+      auto stride_h = stride[0];
+      auto stride_w = stride[1];
+      auto pad_h = o_pad[0];
+      auto pad_w = o_pad[1];
+      if (param_.layout.value() == kNCHW &&
+          (((stride_h == 2) && (kernel_h % 2 == 0) && (pad_h % 2 == 0)) ||
+           ((stride_w == 2) && (kernel_w % 2 == 0) && (pad_w % 2 == 0)))) {
+        exclude_dgrad_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING;
+      }
+#endif
     } else if (param_.kernel.ndim() == 3) {
       // 3d conv
       index_t o_pad[3];
@@ -618,7 +631,7 @@ class CuDNNDeconvolutionOp {
         bwd_data_results.resize(actual_bwd_data_algos);
         AlgoFinalSelect(bwd_data_results, "backprop-to-data",
-                        workspace_byte, bwd);
+                        workspace_byte, bwd, exclude_dgrad_algo_);
 #else  // CUDNN_MAJOR < 7
       const int kMaxAlgos = 10;
@@ -829,11 +842,13 @@ class CuDNNDeconvolutionOp {
   // workspace constraints and a possible user algo preference.
   template <typename PerfType, typename AlgoType>
   void AlgoFinalSelect(const std::vector<PerfType> &perf_results, std::string kernel_name,
-                       size_t workspace_byte, CuDNNAlgo<AlgoType> *algo) {
+                       size_t workspace_byte, CuDNNAlgo<AlgoType> *algo,
+                       int32_t algo_exclude = -1) {
     // Determine the fastest acceptable algo regardless of mathType.
     bool enforce_determinism = dmlc::GetEnv("MXNET_ENFORCE_DETERMINISM", false);
     for (decltype(perf_results.size()) i = 0; i != perf_results.size(); ++i) {
       const auto &result = perf_results[i];
+      bool algo_exclusion = static_cast<int32_t>(result.algo) == algo_exclude;
       bool algo_is_tensor_core = false;
 #if CUDNN_MAJOR >= 7
       algo_is_tensor_core = result.mathType == CUDNN_TENSOR_OP_MATH;
 #endif
@@ -1025,6 +1040,8 @@ class CuDNNDeconvolutionOp {
   bool cudnn_tensor_core_;
   // Is req[kWeight] == deconv::kAddTo ?
   bool add_to_weight_;
+  // Is there a dgrad algo that should be avoided (-1 == none)?
+  int32_t exclude_dgrad_algo_ = -1;
   DeconvolutionParam param_;
 };
 #endif  // CUDNN

From 5516b64e5cc1ca509e7fb6ef25ee816af040b6ce Mon Sep 17 00:00:00 2001
From: Dick Carter
Date: Mon, 4 Mar 2019 13:23:05 -0800
Subject: [PATCH 5/9] Add algo exclusion term to cudnnFind result processing.

---
 src/operator/nn/cudnn/cudnn_convolution-inl.h   | 3 ++-
 src/operator/nn/cudnn/cudnn_deconvolution-inl.h | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/operator/nn/cudnn/cudnn_convolution-inl.h b/src/operator/nn/cudnn/cudnn_convolution-inl.h
index ea1c18932f31..0260c95e9db9 100644
--- a/src/operator/nn/cudnn/cudnn_convolution-inl.h
+++ b/src/operator/nn/cudnn/cudnn_convolution-inl.h
@@ -938,7 +938,8 @@ class CuDNNConvolutionOp {
 #if CUDNN_MAJOR >= 7
           (!enforce_determinism || result.determinism == cudnnDeterminism_t::CUDNN_DETERMINISTIC) &&
 #endif
-          (param_.cudnn_tune.value() != conv::kLimited || result.memory <= workspace_byte)) {
+          (param_.cudnn_tune.value() != conv::kLimited || result.memory <= workspace_byte) &&
+          !algo_exclusion) {
         algo->Set(result.algo, algo_is_tensor_core);
         return;
       }
diff --git a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h
index e25a9a9e1c3b..47f688c8ab9c 100644
--- a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h
+++ b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h
@@ -857,7 +857,8 @@ class CuDNNDeconvolutionOp {
 #if CUDNN_MAJOR >= 7
          (!enforce_determinism || result.determinism == cudnnDeterminism_t::CUDNN_DETERMINISTIC) &&
 #endif
-          (param_.cudnn_tune.value() != conv::kLimited || result.memory <= workspace_byte)) {
+          (param_.cudnn_tune.value() != conv::kLimited || result.memory <= workspace_byte) &&
+          !algo_exclusion) {
         algo->Set(result.algo, algo_is_tensor_core);
         return;
       }

From 989802dd1e8415d700ab15667afcdc7ef520e80e Mon Sep 17 00:00:00 2001
From: Dick Carter
Date: Mon, 4 Mar 2019 14:22:18 -0800
Subject: [PATCH 6/9] Revert "Add test temporarily to tests run with tensorrt CI build (cuda10, cudnn7.4.2)"

This reverts commit 1cb743bd2313ca5d1845dbdda6cccd9dcaa5d30c.
---
 ci/docker/runtime_functions.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 848f15267270..de1b7795ce69 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -832,7 +832,6 @@ unittest_ubuntu_tensorrt_gpu() {
     export CUDNN_VERSION=7.0.3
     python tests/python/tensorrt/lenet5_train.py
     nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_trt_gpu.xml --verbose --nocapture tests/python/tensorrt/
-    nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_trt_gpu.xml --verbose --nocapture tests/python/gpu/test_operator_gpu.py:test_conv_deconv_guards
 }
 
 # quantization gpu currently only runs on P3 instances

From 32d733d3875a7838c28efe90be7d673e57401ff1 Mon Sep 17 00:00:00 2001
From: Dick Carter
Date: Tue, 5 Mar 2019 11:34:22 -0800
Subject: [PATCH 7/9] Trigger CI.

From 95cc6ae0304784ed1b4aed6b97d023dd4f347ca7 Mon Sep 17 00:00:00 2001
From: Dick Carter
Date: Tue, 5 Mar 2019 17:09:20 -0800
Subject: [PATCH 8/9] Add link to cuDNN release notes.

---
 tests/python/gpu/test_operator_gpu.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index b77229655a6b..0261dca378be 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -525,6 +525,7 @@ def test_convolution_options():
 def test_conv_deconv_guards():
     # Test cases for convolution and deconvolution via strided fft. Ensure that the framework
     # guards against problematic CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING in cuDNN [7.3.1,7.5)
+    # see https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/rel_750.html#rel_750
     tol = 1e-1
     for (op, opname) in [(mx.sym.Convolution, 'conv'), (mx.sym.Deconvolution, 'deconv')]:
         dataname = opname + '_data'
         ctx = {'ctx': mx.gpu(0), dataname: (32, 32, 64, 64), 'type_dict': {dataname: np.float32}}

From 85866b86cd492f5f4156e801f081884109746e29 Mon Sep 17 00:00:00 2001
From: Dick Carter
Date: Tue, 12 Mar 2019 11:47:15 -0700
Subject: [PATCH 9/9] Trigger CI.
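
Addendum (not part of the patch series): the guard added in PATCH 4/9 marks CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING for exclusion only when, in NCHW layout, a spatial dimension combines stride 2 with an even kernel and even padding; AlgoFinalSelect then skips that algo when choosing among the cudnnFind results. Below is a minimal Python sketch of that predicate applied to the shapes swept by test_conv_deconv_guards; the helper name and the standalone loop are illustrative only, not MXNet API.

# Illustrative sketch of the dgrad-exclusion predicate from cudnn_convolution-inl.h
# (hypothetical helper, not part of MXNet).
def excludes_fft_tiling_dgrad(kernel, stride, pad, layout='NCHW'):
    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    pad_h, pad_w = pad
    return layout == 'NCHW' and (
        (stride_h == 2 and kernel_h % 2 == 0 and pad_h % 2 == 0) or
        (stride_w == 2 and kernel_w % 2 == 0 and pad_w % 2 == 0))

# The test's six cases deliberately mix shapes that do and do not hit the guard:
for kernel, pad in [((6, 6), (0, 0)), ((6, 6), (1, 1)), ((6, 7), (0, 1)),
                    ((7, 6), (1, 0)), ((7, 7), (0, 0)), ((7, 7), (1, 1))]:
    print(kernel, pad, excludes_fft_tiling_dgrad(kernel, (2, 2), pad))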