From 817bad0c9b0250255c8035f88e4a10525a051b2a Mon Sep 17 00:00:00 2001
From: Dick Carter
Date: Sun, 3 Mar 2019 14:40:59 -0800
Subject: [PATCH 1/9] Add test exposing issue with conv dgrad algo 3 for some cudnn's.

---
 tests/python/gpu/test_operator_gpu.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index 4dbf82edd3f5..12796c712fe9 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -521,6 +521,31 @@ def test_convolution_options():
     check_consistency_NxM([sym, sym_no_cudnn], ctx_list)
 
 
+@with_seed()
+def test_conv_deconv_guards():
+    # Test cases for convolution and deconvolution via strided fft. Ensure that the framework
+    # guards against problematic CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING in cuDNN [7.3.1,7.5)
+    tol = 1e-2
+    for (op, opname) in [(mx.sym.Convolution, 'conv'), (mx.sym.Deconvolution, 'deconv')]:
+        dataname = opname + '_data'
+        ctx = {'ctx': mx.gpu(0), dataname: (32, 32, 64, 64), 'type_dict': {dataname: np.float32}}
+        test_cases = [
+            {'num_filter':32, 'kernel':(6,6), 'pad':(0,0), 'stride':(2,2), 'name': opname},
+            {'num_filter':32, 'kernel':(6,6), 'pad':(1,1), 'stride':(2,2), 'name': opname},
+            {'num_filter':32, 'kernel':(6,7), 'pad':(0,1), 'stride':(2,2), 'name': opname},
+            {'num_filter':32, 'kernel':(7,6), 'pad':(1,0), 'stride':(2,2), 'name': opname},
+            {'num_filter':32, 'kernel':(7,7), 'pad':(0,0), 'stride':(2,2), 'name': opname},
+            {'num_filter':32, 'kernel':(7,7), 'pad':(1,1), 'stride':(2,2), 'name': opname}]
+        for test_case_args in test_cases:
+            try:
+                sym = op(**test_case_args)
+                sym_no_cudnn = op(cudnn_off=True, **test_case_args)
+                check_consistency([sym, sym_no_cudnn], [ctx, ctx], tol=tol)
+            except:
+                print('Test failure of mx.sym.{} with args: {}'.format(op.__name__, test_case_args))
+                raise
+
+
 # Helper function to run tests in a subprocess to avoid save/restore of os.environ.
 # Also avoids issues of cached environment variable lookups in the backend.
 def _test_in_separate_process(func, env, *args):

From 1cb743bd2313ca5d1845dbdda6cccd9dcaa5d30c Mon Sep 17 00:00:00 2001
From: Dick Carter
Date: Sun, 3 Mar 2019 15:23:38 -0800
Subject: [PATCH 2/9] Add test temporarily to tests run with tensorrt CI build (cuda10, cudnn7.4.2)

---
 ci/docker/runtime_functions.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index de1b7795ce69..848f15267270 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -832,6 +832,7 @@ unittest_ubuntu_tensorrt_gpu() {
     export CUDNN_VERSION=7.0.3
     python tests/python/tensorrt/lenet5_train.py
     nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_trt_gpu.xml --verbose --nocapture tests/python/tensorrt/
+    nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_trt_gpu.xml --verbose --nocapture tests/python/gpu/test_operator_gpu.py:test_conv_deconv_guards
 }
 
 # quantization gpu currently only runs on P3 instances

From 0cbd0c783f31f8fbf72cad146c2507849b03f538 Mon Sep 17 00:00:00 2001
From: Dick Carter
Date: Mon, 4 Mar 2019 10:05:14 -0800
Subject: [PATCH 3/9] Relax tol of new test.

---
 tests/python/gpu/test_operator_gpu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index 12796c712fe9..b77229655a6b 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -525,7 +525,7 @@ def test_convolution_options():
 def test_conv_deconv_guards():
     # Test cases for convolution and deconvolution via strided fft. Ensure that the framework
     # guards against problematic CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING in cuDNN [7.3.1,7.5)
-    tol = 1e-2
+    tol = 1e-1
     for (op, opname) in [(mx.sym.Convolution, 'conv'), (mx.sym.Deconvolution, 'deconv')]:
         dataname = opname + '_data'
         ctx = {'ctx': mx.gpu(0), dataname: (32, 32, 64, 64), 'type_dict': {dataname: np.float32}}

From 8c82498b3098379d2a2e97979186fa01c3aa84ab Mon Sep 17 00:00:00 2001
From: Dick Carter
Date: Sun, 3 Mar 2019 14:50:07 -0800
Subject: [PATCH 4/9] Fix for problematic conv dgrad algo 3 for some cuDNNs.

---
 src/operator/nn/cudnn/cudnn_convolution-inl.h | 22 ++++++++++++++++---
 .../nn/cudnn/cudnn_deconvolution-inl.h        | 21 ++++++++++++++++--
 2 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/src/operator/nn/cudnn/cudnn_convolution-inl.h b/src/operator/nn/cudnn/cudnn_convolution-inl.h
index e11f7cc81d25..ea1c18932f31 100644
--- a/src/operator/nn/cudnn/cudnn_convolution-inl.h
+++ b/src/operator/nn/cudnn/cudnn_convolution-inl.h
@@ -521,7 +521,19 @@ class CuDNNConvolutionOp {
                                             wshape[1],
                                             wshape[2],
                                             wshape[3]));
-
+#if CUDNN_VERSION >= 7301 && CUDNN_VERSION < 7500
+      auto kernel_h = wshape[2];
+      auto kernel_w = wshape[3];
+      auto stride_h = stride[0];
+      auto stride_w = stride[1];
+      auto pad_h = pad[0];
+      auto pad_w = pad[1];
+      if (param_.layout.value() == kNCHW &&
+          (((stride_h == 2) && (kernel_h % 2 == 0) && (pad_h % 2 == 0)) ||
+           ((stride_w == 2) && (kernel_w % 2 == 0) && (pad_w % 2 == 0)))) {
+        exclude_dgrad_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING;
+      }
+#endif
     } else if (param_.kernel.ndim() == 3) {
       // 3d conv
 #if CUDNN_MAJOR >= 5
@@ -714,7 +726,7 @@ class CuDNNConvolutionOp {
         bwd_data_results.resize(actual_bwd_data_algos);
         AlgoFinalSelect(bwd_data_results, "backprop-to-data",
-                        workspace_byte, bwd);
+                        workspace_byte, bwd, exclude_dgrad_algo_);
 #else  // CUDNN_MAJOR < 7
       const int kMaxAlgos = 10;
@@ -910,12 +922,14 @@ class CuDNNConvolutionOp {
   // workspace constraints.
   template <typename PerfType, typename AlgoType>
   void AlgoFinalSelect(const std::vector<PerfType> &perf_results, std::string kernel_name,
-                       size_t workspace_byte, CuDNNAlgo<AlgoType> *algo) {
+                       size_t workspace_byte, CuDNNAlgo<AlgoType> *algo,
+                       int32_t algo_exclude = -1) {
     // Determine the fastest acceptable algo that matches the algo_preference (-1 = any),
     // regardless of mathType.
     bool enforce_determinism = dmlc::GetEnv("MXNET_ENFORCE_DETERMINISM", false);
     for (decltype(perf_results.size()) i = 0; i != perf_results.size(); ++i) {
       const auto &result = perf_results[i];
+      bool algo_exclusion = static_cast<int32_t>(result.algo) == algo_exclude;
       bool algo_is_tensor_core = false;
 #if CUDNN_MAJOR >= 7
       algo_is_tensor_core = result.mathType == CUDNN_TENSOR_OP_MATH;
 #endif
@@ -1104,6 +1118,8 @@ class CuDNNConvolutionOp {
   bool cudnn_tensor_core_;
   // Is req[kWeight] == conv::kAddTo ?
   bool add_to_weight_;
+  // Is there a dgrad algo that should be avoided (-1 == none)?
+  int32_t exclude_dgrad_algo_ = -1;
   ConvolutionParam param_;
 };
 #endif  // __CUDACC__ && CUDNN
diff --git a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h
index ec95d2be3309..e25a9a9e1c3b 100644
--- a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h
+++ b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h
@@ -446,6 +446,19 @@ class CuDNNDeconvolutionOp {
                                             wshape[1],
                                             wshape[2],
                                             wshape[3]));
+#if CUDNN_VERSION >= 7301 && CUDNN_VERSION < 7500
+      auto kernel_h = wshape[2];
+      auto kernel_w = wshape[3];
+      auto stride_h = stride[0];
+      auto stride_w = stride[1];
+      auto pad_h = o_pad[0];
+      auto pad_w = o_pad[1];
+      if (param_.layout.value() == kNCHW &&
+          (((stride_h == 2) && (kernel_h % 2 == 0) && (pad_h % 2 == 0)) ||
+           ((stride_w == 2) && (kernel_w % 2 == 0) && (pad_w % 2 == 0)))) {
+        exclude_dgrad_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING;
+      }
+#endif
     } else if (param_.kernel.ndim() == 3) {
       // 3d conv
       index_t o_pad[3];
@@ -618,7 +631,7 @@ class CuDNNDeconvolutionOp {
         bwd_data_results.resize(actual_bwd_data_algos);
         AlgoFinalSelect(bwd_data_results, "backprop-to-data",
-                        workspace_byte, bwd);
+                        workspace_byte, bwd, exclude_dgrad_algo_);
 #else  // CUDNN_MAJOR < 7
       const int kMaxAlgos = 10;
@@ -829,11 +842,13 @@ class CuDNNDeconvolutionOp {
   // workspace constraints and a possible user algo preference.
   template <typename PerfType, typename AlgoType>
   void AlgoFinalSelect(const std::vector<PerfType> &perf_results, std::string kernel_name,
-                       size_t workspace_byte, CuDNNAlgo<AlgoType> *algo) {
+                       size_t workspace_byte, CuDNNAlgo<AlgoType> *algo,
+                       int32_t algo_exclude = -1) {
     // Determine the fastest acceptable algo regardless of mathType.
     bool enforce_determinism = dmlc::GetEnv("MXNET_ENFORCE_DETERMINISM", false);
     for (decltype(perf_results.size()) i = 0; i != perf_results.size(); ++i) {
       const auto &result = perf_results[i];
+      bool algo_exclusion = static_cast<int32_t>(result.algo) == algo_exclude;
       bool algo_is_tensor_core = false;
 #if CUDNN_MAJOR >= 7
       algo_is_tensor_core = result.mathType == CUDNN_TENSOR_OP_MATH;
 #endif
@@ -1025,6 +1040,8 @@ class CuDNNDeconvolutionOp {
   bool cudnn_tensor_core_;
   // Is req[kWeight] == deconv::kAddTo ?
   bool add_to_weight_;
+  // Is there a dgrad algo that should be avoided (-1 == none)?
+  int32_t exclude_dgrad_algo_ = -1;
   DeconvolutionParam param_;
 };
 #endif  // CUDNN

From 5516b64e5cc1ca509e7fb6ef25ee816af040b6ce Mon Sep 17 00:00:00 2001
From: Dick Carter
Date: Mon, 4 Mar 2019 13:23:05 -0800
Subject: [PATCH 5/9] Add algo exclusion term to cudnnFind result processing.

---
 src/operator/nn/cudnn/cudnn_convolution-inl.h   | 3 ++-
 src/operator/nn/cudnn/cudnn_deconvolution-inl.h | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/operator/nn/cudnn/cudnn_convolution-inl.h b/src/operator/nn/cudnn/cudnn_convolution-inl.h
index ea1c18932f31..0260c95e9db9 100644
--- a/src/operator/nn/cudnn/cudnn_convolution-inl.h
+++ b/src/operator/nn/cudnn/cudnn_convolution-inl.h
@@ -938,7 +938,8 @@ class CuDNNConvolutionOp {
 #if CUDNN_MAJOR >= 7
           (!enforce_determinism || result.determinism == cudnnDeterminism_t::CUDNN_DETERMINISTIC) &&
 #endif
-          (param_.cudnn_tune.value() != conv::kLimited || result.memory <= workspace_byte)) {
+          (param_.cudnn_tune.value() != conv::kLimited || result.memory <= workspace_byte) &&
+          !algo_exclusion) {
         algo->Set(result.algo, algo_is_tensor_core);
         return;
       }
diff --git a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h
index e25a9a9e1c3b..47f688c8ab9c 100644
--- a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h
+++ b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h
@@ -857,7 +857,8 @@ class CuDNNDeconvolutionOp {
 #if CUDNN_MAJOR >= 7
          (!enforce_determinism || result.determinism == cudnnDeterminism_t::CUDNN_DETERMINISTIC) &&
 #endif
-          (param_.cudnn_tune.value() != conv::kLimited || result.memory <= workspace_byte)) {
+          (param_.cudnn_tune.value() != conv::kLimited || result.memory <= workspace_byte) &&
+          !algo_exclusion) {
         algo->Set(result.algo, algo_is_tensor_core);
         return;
       }

From 989802dd1e8415d700ab15667afcdc7ef520e80e Mon Sep 17 00:00:00 2001
From: Dick Carter
Date: Mon, 4 Mar 2019 14:22:18 -0800
Subject: [PATCH 6/9] Revert "Add test temporarily to tests run with tensorrt CI build (cuda10, cudnn7.4.2)"

This reverts commit 1cb743bd2313ca5d1845dbdda6cccd9dcaa5d30c.
---
 ci/docker/runtime_functions.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 848f15267270..de1b7795ce69 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -832,7 +832,6 @@ unittest_ubuntu_tensorrt_gpu() {
     export CUDNN_VERSION=7.0.3
     python tests/python/tensorrt/lenet5_train.py
     nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_trt_gpu.xml --verbose --nocapture tests/python/tensorrt/
-    nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_trt_gpu.xml --verbose --nocapture tests/python/gpu/test_operator_gpu.py:test_conv_deconv_guards
 }
 
 # quantization gpu currently only runs on P3 instances

From 32d733d3875a7838c28efe90be7d673e57401ff1 Mon Sep 17 00:00:00 2001
From: Dick Carter
Date: Tue, 5 Mar 2019 11:34:22 -0800
Subject: [PATCH 7/9] Trigger CI.

From 95cc6ae0304784ed1b4aed6b97d023dd4f347ca7 Mon Sep 17 00:00:00 2001
From: Dick Carter
Date: Tue, 5 Mar 2019 17:09:20 -0800
Subject: [PATCH 8/9] Add link to cuDNN release notes.

---
 tests/python/gpu/test_operator_gpu.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index b77229655a6b..0261dca378be 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -525,6 +525,7 @@ def test_convolution_options():
 def test_conv_deconv_guards():
     # Test cases for convolution and deconvolution via strided fft. Ensure that the framework
     # guards against problematic CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING in cuDNN [7.3.1,7.5)
+    # see https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/rel_750.html#rel_750
     tol = 1e-1
     for (op, opname) in [(mx.sym.Convolution, 'conv'), (mx.sym.Deconvolution, 'deconv')]:
         dataname = opname + '_data'
         ctx = {'ctx': mx.gpu(0), dataname: (32, 32, 64, 64), 'type_dict': {dataname: np.float32}}

From 85866b86cd492f5f4156e801f081884109746e29 Mon Sep 17 00:00:00 2001
From: Dick Carter
Date: Tue, 12 Mar 2019 11:47:15 -0700
Subject: [PATCH 9/9] Trigger CI.
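
Addendum (not part of the patch series): the guard added in PATCH 4/9 marks CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING for exclusion only when, in NCHW layout, a spatial dimension combines stride 2 with an even kernel and even padding; AlgoFinalSelect then skips that algo when choosing among the cudnnFind results. Below is a minimal Python sketch of that predicate applied to the shapes swept by test_conv_deconv_guards; the helper name and the standalone loop are illustrative only, not MXNet API.

# Illustrative sketch of the dgrad-exclusion predicate from cudnn_convolution-inl.h
# (hypothetical helper, not part of MXNet).
def excludes_fft_tiling_dgrad(kernel, stride, pad, layout='NCHW'):
    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    pad_h, pad_w = pad
    return layout == 'NCHW' and (
        (stride_h == 2 and kernel_h % 2 == 0 and pad_h % 2 == 0) or
        (stride_w == 2 and kernel_w % 2 == 0 and pad_w % 2 == 0))

# The test's six cases deliberately mix shapes that do and do not hit the guard:
for kernel, pad in [((6, 6), (0, 0)), ((6, 6), (1, 1)), ((6, 7), (0, 1)),
                    ((7, 6), (1, 0)), ((7, 7), (0, 0)), ((7, 7), (1, 1))]:
    print(kernel, pad, excludes_fft_tiling_dgrad(kernel, (2, 2), pad))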