From 73431ddf82d62eda70a8cf137ff5e2b5556e953c Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Sat, 5 Feb 2022 13:09:26 -0800 Subject: [PATCH 01/19] Add g5 instance to jenkinsfiles where both p3 and g4 are mentioned --- ci/jenkins/Jenkinsfile_unix_gpu | 2 +- ci/jenkins/Jenkinsfile_website_beta | 2 +- ci/jenkins/Jenkinsfile_website_full | 2 +- ci/jenkins/Jenkinsfile_website_version_artifacts | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ci/jenkins/Jenkinsfile_unix_gpu b/ci/jenkins/Jenkinsfile_unix_gpu index 46d455f1db3e..d6af8b885d9f 100644 --- a/ci/jenkins/Jenkinsfile_unix_gpu +++ b/ci/jenkins/Jenkinsfile_unix_gpu @@ -29,7 +29,7 @@ node('utility') { utils = load('ci/Jenkinsfile_utils.groovy') custom_steps = load('ci/jenkins/Jenkins_steps.groovy') } -utils.assign_node_labels(utility: 'utility', linux_cpu: 'mxnetlinux-cpu', linux_gpu: 'mxnetlinux-gpu', linux_gpu_p3: 'mxnetlinux-gpu-p3', linux_gpu_g4: 'mxnetlinux-gpu-g4') +utils.assign_node_labels(utility: 'utility', linux_cpu: 'mxnetlinux-cpu', linux_gpu: 'mxnetlinux-gpu', linux_gpu_p3: 'mxnetlinux-gpu-p3', linux_gpu_g4: 'mxnetlinux-gpu-g4', linux_gpu_g5: 'mxnetlinux-gpu-g5') utils.main_wrapper( core_logic: { diff --git a/ci/jenkins/Jenkinsfile_website_beta b/ci/jenkins/Jenkinsfile_website_beta index 7b3f689a7e70..11741dac392e 100644 --- a/ci/jenkins/Jenkinsfile_website_beta +++ b/ci/jenkins/Jenkinsfile_website_beta @@ -31,7 +31,7 @@ node('restricted-utility') { custom_steps = load('ci/jenkins/Jenkins_steps.groovy') } -utils.assign_node_labels(utility: 'restricted-utility', linux_cpu: 'restricted-mxnetlinux-cpu', linux_gpu: 'restricted-mxnetlinux-gpu-g4', linux_gpu_p3: 'restricted-mxnetlinux-gpu-p3', windows_cpu: 'restricted-mxnetwindows-cpu', windows_gpu: 'restricted-mxnetwindows-gpu') +utils.assign_node_labels(utility: 'restricted-utility', linux_cpu: 'restricted-mxnetlinux-cpu', linux_gpu: 'restricted-mxnetlinux-gpu-g4', linux_gpu_g5: 'restricted-mxnetlinux-gpu-g5', linux_gpu_p3: 'restricted-mxnetlinux-gpu-p3', windows_cpu: 'restricted-mxnetwindows-cpu', windows_gpu: 'restricted-mxnetwindows-gpu') utils.main_wrapper( core_logic: { diff --git a/ci/jenkins/Jenkinsfile_website_full b/ci/jenkins/Jenkinsfile_website_full index 7e237b50883b..a7a1fc0a1d7e 100644 --- a/ci/jenkins/Jenkinsfile_website_full +++ b/ci/jenkins/Jenkinsfile_website_full @@ -30,7 +30,7 @@ node('restricted-utility') { custom_steps = load('ci/jenkins/Jenkins_steps.groovy') } -utils.assign_node_labels(utility: 'restricted-utility', linux_cpu: 'restricted-mxnetlinux-cpu', linux_gpu: 'restricted-mxnetlinux-gpu-g4', linux_gpu_p3: 'restricted-mxnetlinux-gpu-p3', windows_cpu: 'restricted-mxnetwindows-cpu', windows_gpu: 'restricted-mxnetwindows-gpu') +utils.assign_node_labels(utility: 'restricted-utility', linux_cpu: 'restricted-mxnetlinux-cpu', linux_gpu: 'restricted-mxnetlinux-gpu-g4', linux_gpu_g5: 'restricted-mxnetlinux-gpu-g5', linux_gpu_p3: 'restricted-mxnetlinux-gpu-p3', windows_cpu: 'restricted-mxnetwindows-cpu', windows_gpu: 'restricted-mxnetwindows-gpu') utils.main_wrapper( core_logic: { diff --git a/ci/jenkins/Jenkinsfile_website_version_artifacts b/ci/jenkins/Jenkinsfile_website_version_artifacts index 57fcbefeade1..8eda4fabb788 100644 --- a/ci/jenkins/Jenkinsfile_website_version_artifacts +++ b/ci/jenkins/Jenkinsfile_website_version_artifacts @@ -30,7 +30,7 @@ node('restricted-utility') { custom_steps = load('ci/jenkins/Jenkins_steps.groovy') } -utils.assign_node_labels(utility: 'restricted-utility', linux_cpu: 'restricted-mxnetlinux-cpu', 
linux_gpu: 'restricted-mxnetlinux-gpu-g4', linux_gpu_p3: 'restricted-mxnetlinux-gpu-p3', windows_cpu: 'restricted-mxnetwindows-cpu', windows_gpu: 'restricted-mxnetwindows-gpu') +utils.assign_node_labels(utility: 'restricted-utility', linux_cpu: 'restricted-mxnetlinux-cpu', linux_gpu: 'restricted-mxnetlinux-gpu-g4', linux_gpu_g5: 'restricted-mxnetlinux-gpu-g5', linux_gpu_p3: 'restricted-mxnetlinux-gpu-p3', windows_cpu: 'restricted-mxnetwindows-cpu', windows_gpu: 'restricted-mxnetwindows-gpu') utils.main_wrapper( core_logic: { From 143391137e955fc82e4a299f3e3a623a17e72466 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Mon, 7 Feb 2022 15:19:04 -0800 Subject: [PATCH 02/19] Remove reference to non-existent restricted-mxnetlinux-gpu-g5 --- ci/jenkins/Jenkinsfile_website_beta | 2 +- ci/jenkins/Jenkinsfile_website_full | 2 +- ci/jenkins/Jenkinsfile_website_version_artifacts | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/jenkins/Jenkinsfile_website_beta b/ci/jenkins/Jenkinsfile_website_beta index 11741dac392e..7b3f689a7e70 100644 --- a/ci/jenkins/Jenkinsfile_website_beta +++ b/ci/jenkins/Jenkinsfile_website_beta @@ -31,7 +31,7 @@ node('restricted-utility') { custom_steps = load('ci/jenkins/Jenkins_steps.groovy') } -utils.assign_node_labels(utility: 'restricted-utility', linux_cpu: 'restricted-mxnetlinux-cpu', linux_gpu: 'restricted-mxnetlinux-gpu-g4', linux_gpu_g5: 'restricted-mxnetlinux-gpu-g5', linux_gpu_p3: 'restricted-mxnetlinux-gpu-p3', windows_cpu: 'restricted-mxnetwindows-cpu', windows_gpu: 'restricted-mxnetwindows-gpu') +utils.assign_node_labels(utility: 'restricted-utility', linux_cpu: 'restricted-mxnetlinux-cpu', linux_gpu: 'restricted-mxnetlinux-gpu-g4', linux_gpu_p3: 'restricted-mxnetlinux-gpu-p3', windows_cpu: 'restricted-mxnetwindows-cpu', windows_gpu: 'restricted-mxnetwindows-gpu') utils.main_wrapper( core_logic: { diff --git a/ci/jenkins/Jenkinsfile_website_full b/ci/jenkins/Jenkinsfile_website_full index a7a1fc0a1d7e..7e237b50883b 100644 --- a/ci/jenkins/Jenkinsfile_website_full +++ b/ci/jenkins/Jenkinsfile_website_full @@ -30,7 +30,7 @@ node('restricted-utility') { custom_steps = load('ci/jenkins/Jenkins_steps.groovy') } -utils.assign_node_labels(utility: 'restricted-utility', linux_cpu: 'restricted-mxnetlinux-cpu', linux_gpu: 'restricted-mxnetlinux-gpu-g4', linux_gpu_g5: 'restricted-mxnetlinux-gpu-g5', linux_gpu_p3: 'restricted-mxnetlinux-gpu-p3', windows_cpu: 'restricted-mxnetwindows-cpu', windows_gpu: 'restricted-mxnetwindows-gpu') +utils.assign_node_labels(utility: 'restricted-utility', linux_cpu: 'restricted-mxnetlinux-cpu', linux_gpu: 'restricted-mxnetlinux-gpu-g4', linux_gpu_p3: 'restricted-mxnetlinux-gpu-p3', windows_cpu: 'restricted-mxnetwindows-cpu', windows_gpu: 'restricted-mxnetwindows-gpu') utils.main_wrapper( core_logic: { diff --git a/ci/jenkins/Jenkinsfile_website_version_artifacts b/ci/jenkins/Jenkinsfile_website_version_artifacts index 8eda4fabb788..57fcbefeade1 100644 --- a/ci/jenkins/Jenkinsfile_website_version_artifacts +++ b/ci/jenkins/Jenkinsfile_website_version_artifacts @@ -30,7 +30,7 @@ node('restricted-utility') { custom_steps = load('ci/jenkins/Jenkins_steps.groovy') } -utils.assign_node_labels(utility: 'restricted-utility', linux_cpu: 'restricted-mxnetlinux-cpu', linux_gpu: 'restricted-mxnetlinux-gpu-g4', linux_gpu_g5: 'restricted-mxnetlinux-gpu-g5', linux_gpu_p3: 'restricted-mxnetlinux-gpu-p3', windows_cpu: 'restricted-mxnetwindows-cpu', windows_gpu: 'restricted-mxnetwindows-gpu') +utils.assign_node_labels(utility: 
'restricted-utility', linux_cpu: 'restricted-mxnetlinux-cpu', linux_gpu: 'restricted-mxnetlinux-gpu-g4', linux_gpu_p3: 'restricted-mxnetlinux-gpu-p3', windows_cpu: 'restricted-mxnetwindows-cpu', windows_gpu: 'restricted-mxnetwindows-gpu') utils.main_wrapper( core_logic: { From dd3b58a6fb0201de79729d1654bc3ee4ed4c7406 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Mon, 7 Feb 2022 17:28:43 -0800 Subject: [PATCH 03/19] Enable unittest job on g5 --- ci/Jenkinsfile_utils.groovy | 1 + ci/jenkins/Jenkins_steps.groovy | 16 ++++++++++++++++ ci/jenkins/Jenkinsfile_unix_gpu | 1 + tests/python/gpu/test_operator_gpu.py | 8 +++++++- 4 files changed, 25 insertions(+), 1 deletion(-) diff --git a/ci/Jenkinsfile_utils.groovy b/ci/Jenkinsfile_utils.groovy index 17d4e847bb82..2ea78f2fcd1e 100644 --- a/ci/Jenkinsfile_utils.groovy +++ b/ci/Jenkinsfile_utils.groovy @@ -250,6 +250,7 @@ def assign_node_labels(args) { NODE_LINUX_CPU = args.linux_cpu NODE_LINUX_GPU = args.linux_gpu NODE_LINUX_GPU_G4 = args.linux_gpu_g4 + NODE_LINUX_GPU_G5 = args.linux_gpu_g5 NODE_LINUX_GPU_P3 = args.linux_gpu_p3 NODE_WINDOWS_CPU = args.windows_cpu NODE_WINDOWS_GPU = args.windows_gpu diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy index 92d126612b50..aec2b6564f55 100644 --- a/ci/jenkins/Jenkins_steps.groovy +++ b/ci/jenkins/Jenkins_steps.groovy @@ -716,6 +716,22 @@ def test_unix_python3_gpu(lib_name) { }] } +def test_unix_python3_ampere_gpu(lib_name) { + return ['Python3: Ampere-GPU': { + node(NODE_LINUX_GPU_G5) { + ws('workspace/ut-python3-gpu') { + try { + utils.unpack_and_init(lib_name, mx_lib_cython) + python3_gpu_ut_cython('ubuntu_gpu_cu111') + utils.publish_test_coverage() + } finally { + utils.collect_test_results_unix('tests_gpu.xml', 'tests_python3_ampere_gpu.xml') + } + } + } + }] +} + def test_unix_python3_debug_cpu() { return ['Python3: CPU debug': { node(NODE_LINUX_CPU) { diff --git a/ci/jenkins/Jenkinsfile_unix_gpu b/ci/jenkins/Jenkinsfile_unix_gpu index d6af8b885d9f..e182cd6963d3 100644 --- a/ci/jenkins/Jenkinsfile_unix_gpu +++ b/ci/jenkins/Jenkinsfile_unix_gpu @@ -44,6 +44,7 @@ core_logic: { utils.parallel_stage('Tests', [ custom_steps.test_unix_python3_gpu('gpu'), + custom_steps.test_unix_python3_gpu('ampere_gpu'), custom_steps.test_unix_python3_onednn_gpu('onednn_gpu'), custom_steps.test_unix_python3_onednn_nocudnn_gpu('onednn_gpu_nocudnn'), custom_steps.test_unix_cpp_package_gpu('gpu'), diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 6592cd490dac..aaaab23fdfa2 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -27,7 +27,7 @@ import mxnet.ndarray.sparse as mxsps from mxnet.test_utils import check_consistency, set_default_device, assert_almost_equal, assert_allclose from mxnet.test_utils import check_symbolic_forward, check_symbolic_backward, discard_stderr -from mxnet.test_utils import default_device, rand_shape_2d, rand_ndarray, same, environment, get_rtc_compile_opts +from mxnet.test_utils import default_device, rand_shape_2d, rand_ndarray, same, environment, get_rtc_compile_opts, get_cuda_compute_capability from mxnet.base import MXNetError from mxnet import autograd @@ -54,6 +54,12 @@ set_default_device(mx.gpu(0)) +# Log GPU compute cababilities even if output is captured and not displayed for a passing test +def test_report_compute_capabilities(capsys): + with capsys.disabled(): + sys.stdout.write('= {} '.format( + [get_cuda_compute_capability(mx.gpu(i)) for i in 
range(mx.device.num_gpus())] )) + def check_countsketch(in_dim,out_dim,n): data = mx.sym.Variable("data") h = mx.sym.Variable("h") From 48b6173a30b07301088eca143f66289716f49b0f Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Mon, 7 Feb 2022 21:42:02 -0800 Subject: [PATCH 04/19] Fix Jenkinsfile_unix_gpu syntax --- ci/jenkins/Jenkinsfile_unix_gpu | 2 +- tests/python/gpu/test_operator_gpu.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/ci/jenkins/Jenkinsfile_unix_gpu b/ci/jenkins/Jenkinsfile_unix_gpu index e182cd6963d3..69ce5a3a0d87 100644 --- a/ci/jenkins/Jenkinsfile_unix_gpu +++ b/ci/jenkins/Jenkinsfile_unix_gpu @@ -44,7 +44,7 @@ core_logic: { utils.parallel_stage('Tests', [ custom_steps.test_unix_python3_gpu('gpu'), - custom_steps.test_unix_python3_gpu('ampere_gpu'), + custom_steps.test_unix_python3_ampere_gpu('gpu'), custom_steps.test_unix_python3_onednn_gpu('onednn_gpu'), custom_steps.test_unix_python3_onednn_nocudnn_gpu('onednn_gpu_nocudnn'), custom_steps.test_unix_cpp_package_gpu('gpu'), diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index aaaab23fdfa2..387084164111 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -54,7 +54,8 @@ set_default_device(mx.gpu(0)) -# Log GPU compute cababilities even if output is captured and not displayed for a passing test +# For info purposes, log GPU compute cababilities. Run serially so output appears in log. +@pytest.mark.serial def test_report_compute_capabilities(capsys): with capsys.disabled(): sys.stdout.write('= {} '.format( From 1c116348d1083eed67dfc6f87a937363ee6a21e8 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Tue, 8 Feb 2022 16:13:53 -0800 Subject: [PATCH 05/19] Include A10G arch 86 in build for g5 --- ci/docker/runtime_functions.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 05f80032cd15..acf45f2af087 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -22,8 +22,9 @@ set -ex -CI_CUDA_COMPUTE_CAPABILITIES="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_70,code=sm_70" -CI_CMAKE_CUDA_ARCH="5.2 7.0" +# compute capabilities for p3, g4 and g5 instances +CI_CUDA_COMPUTE_CAPABILITIES="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_86,code=sm_86" +CI_CMAKE_CUDA_ARCH="5.2 7.0 8.6" clean_repo() { set -ex From ec30697cc485bb00730ecb78706cbbc2e8513b3a Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Tue, 15 Feb 2022 14:34:29 -0800 Subject: [PATCH 06/19] Update is_TF32_enabled() for SM arch > 80 --- python/mxnet/test_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index c80417347d77..dc9167aeff9a 100644 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -112,15 +112,15 @@ def effective_dtype(dat): ---------- dat : np.ndarray or mx.nd.array or mx.np.ndarray """ - # On arch 80 gpus, a float32-io gemm or conv op will trim the mantissa of data - # inputs to be of comparable precision to a float16, so float16 becomes the + # On arch 80 gpus or later, a float32-io gemm or conv op will trim the mantissa of + # data inputs to be of comparable precision to a float16, so float16 becomes the # 'effective dtype' for tolerance tests involving such op outputs. 
# Is TF32 enabled in the device (the default on arch 80 GPUs) def is_TF32_enabled(device): try: return (device.device_type == 'gpu' and - get_cuda_compute_capability(device) == 80 and + get_cuda_compute_capability(device) >= 80 and os.environ.get('NVIDIA_TF32_OVERRIDE') != '0') except: # pylint: disable=bare-except return False From 1c96ef9b938bfd2aa6c2c0eb07fbcc3f21cb2d7b Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Tue, 15 Feb 2022 16:31:21 -0800 Subject: [PATCH 07/19] Remove gpu arch 86 from centos builds on cuda 10 --- ci/docker/runtime_functions.sh | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index acf45f2af087..d68ce9649dcf 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -22,9 +22,11 @@ set -ex -# compute capabilities for p3, g4 and g5 instances -CI_CUDA_COMPUTE_CAPABILITIES="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_86,code=sm_86" -CI_CMAKE_CUDA_ARCH="5.2 7.0 8.6" +# compute capabilities for CI instances supported by CUDA 10.x (i.e. p3, g4) +CI_CMAKE_CUDA10_ARCH="5.2 7.5" + +# compute capabilities for CI instances supported by CUDA >= 11.1 (i.e. p3, g4, g5) +CI_CMAKE_CUDA_ARCH="5.2 7.5 8.6" clean_repo() { set -ex @@ -299,7 +301,7 @@ build_centos7_gpu() { -DUSE_BLAS=Open \ -DUSE_ONEDNN=ON \ -DUSE_CUDA=ON \ - -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \ + -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA10_ARCH" \ -DUSE_DIST_KVSTORE=ON \ -DBUILD_EXTENSION_PATH=/work/mxnet/example/extensions/lib_external_ops \ -DUSE_INT64_TENSOR_SIZE=OFF \ From 4b93cdb5d2a052bce8a73a9c3cdc2824df2924a7 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Wed, 12 Jan 2022 14:12:07 -0800 Subject: [PATCH 08/19] Fix test_convolution_{grouping,dilated_impulse_response}, test_np_linalg_qr --- tests/python/unittest/test_numpy_op.py | 3 +++ tests/python/unittest/test_operator.py | 2 ++ 2 files changed, 5 insertions(+) diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 8008c053cd2b..b71f599cacd2 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -6477,6 +6477,9 @@ def check_qr(q, r, a_np): data_np = onp.array(data_np, dtype=dtype) data = np.array(data_np, dtype=dtype) + if effective_dtype(data) == onp.dtype(np.float16): + print('Skipping test on this platform: {} has a float16 effective dtype'.format(dtype)) + pytest.skip() data.attach_grad() with mx.autograd.record(): diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 5f290318824b..2f32806ae939 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -1724,6 +1724,7 @@ def np_groupnorm_grad(ograd, data, gamma, beta, mean, std, num_groups, eps): atol=5e-2 if dtype == np.float16 else 1e-4, dtype=dtype) +@pytest.mark.serial def test_convolution_grouping(): for dim in [1, 2, 3]: num_filter = 4 @@ -2285,6 +2286,7 @@ def test_run_convolution_dilated_impulse_response(dil=(1,1), kernel_shape=(3,3), assert(out[center] - np.sum(kernel_gradient) - out_orig[center] < 0.001) +@pytest.mark.serial def test_convolution_dilated_impulse_response(): # 1D for dil in [ (1,), (2,), (3,) ]: From 5583ae6722b18771ef8cad78dc2655ef68495ac7 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Tue, 18 Jan 2022 10:49:21 -0800 Subject: [PATCH 09/19] Fix test_convolution_grouping on A100 --- tests/python/unittest/test_operator.py | 4 ++-- 1 file 
changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 2f32806ae939..ef0d98b65a89 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -1746,7 +1746,7 @@ def test_convolution_grouping(): exe1 = y1._simple_bind(default_device(), x=shape) exe2 = y2._simple_bind(default_device(), x=shape, w=(num_filter, shape[1]//num_group) + kernel, b=(num_filter,)) for arr1, arr2 in zip(exe1.arg_arrays, exe2.arg_arrays): - arr1[:] = np.float32(np.random.normal(size=arr1.shape)) + arr1[:] = np.random.normal(size=arr1.shape).astype(effective_dtype(mx.nd.array([1.,]))) arr2[:] = arr1 exe1.forward(is_train=True) exe1.backward(exe1.outputs[0]) @@ -1754,7 +1754,7 @@ def test_convolution_grouping(): exe2.backward(exe2.outputs[0]) for arr1, arr2 in zip(exe1.outputs + exe1.grad_arrays, exe2.outputs + exe2.grad_arrays): - np.testing.assert_allclose(arr1.asnumpy(), arr2.asnumpy(), rtol=1e-3, atol=1e-3) + assert_almost_equal(arr1, arr2) @pytest.mark.skip(reason="Flaky test https://github.com/apache/incubator-mxnet/issues/14052") From 00cfc897e234325a4903d39a63ad055f78ad1a70 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Tue, 18 Jan 2022 15:20:25 -0800 Subject: [PATCH 10/19] Fix test_rnn_unroll_variant_length --- tests/python/unittest/test_gluon_rnn.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/tests/python/unittest/test_gluon_rnn.py b/tests/python/unittest/test_gluon_rnn.py index 2911f9165244..cbb68d57aa19 100644 --- a/tests/python/unittest/test_gluon_rnn.py +++ b/tests/python/unittest/test_gluon_rnn.py @@ -844,14 +844,12 @@ def test_rnn_unroll_variant_length(): inputs=data_nd[i:(i+1), :ele_length, :], merge_outputs=True, layout='NTC') - assert_allclose(ele_out.asnumpy(), outs[i:(i+1), :ele_length, :].asnumpy(), - atol=1E-4, rtol=1E-4) + assert_almost_equal(ele_out, outs[i:(i+1), :ele_length, :]) if ele_length < max_length: # Check the padded outputs are all zero - assert_allclose(outs[i:(i+1), ele_length:max_length, :].asnumpy(), 0) + assert_almost_equal(outs[i:(i+1), ele_length:max_length, :], 0) for valid_out_state, gt_state in zip(states, ele_states): - assert_allclose(valid_out_state[i:(i+1)].asnumpy(), gt_state.asnumpy(), - atol=1E-4, rtol=1E-4) + assert_almost_equal(valid_out_state[i:(i+1)], gt_state) # Test for TNC layout data_nd = mx.np.random.normal(0, 1, size=(max_length, batch_size, 20)) @@ -864,14 +862,12 @@ def test_rnn_unroll_variant_length(): inputs=data_nd[:ele_length, i:(i+1), :], merge_outputs=True, layout='TNC') - assert_allclose(ele_out.asnumpy(), outs[:ele_length, i:(i + 1), :].asnumpy(), - atol=1E-4, rtol=1E-4) + assert_almost_equal(ele_out, outs[:ele_length, i:(i + 1), :]) if ele_length < max_length: # Check the padded outputs are all zero - assert_allclose(outs[ele_length:max_length, i:(i+1), :].asnumpy(), 0) + assert_almost_equal(outs[ele_length:max_length, i:(i+1), :], 0) for valid_out_state, gt_state in zip(states, ele_states): - assert_allclose(valid_out_state[i:(i+1)].asnumpy(), gt_state.asnumpy(), - atol=1E-4, rtol=1E-4) + assert_almost_equal(valid_out_state[i:(i+1)], gt_state) def test_cell_fill_shape(): From 91614955bfd2e32943c60c8670af747b24a2763e Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Sun, 22 Aug 2021 20:30:58 -0700 Subject: [PATCH 11/19] Fix test_convolution_dilated_impulse_response --- tests/python/unittest/test_operator.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff 
--git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index ef0d98b65a89..f0e0e0977a58 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -2217,7 +2217,8 @@ def test_bxor(a, b): test_bor(a, b) test_bxor(a, b) -def test_run_convolution_dilated_impulse_response(dil=(1,1), kernel_shape=(3,3), verbose=False): + +def run_convolution_dilated_impulse_response(dil, kernel_shape, tol): dim = len(dil) assert(len(kernel_shape) == dim) # Input for spike response @@ -2260,7 +2261,7 @@ def test_run_convolution_dilated_impulse_response(dil=(1,1), kernel_shape=(3,3), out_o = be.outputs[0].asnumpy() assert_allclose(out_o[center],np.prod(kernel_shape),atol=1e-5) - rnd_kernel_s = np.random.uniform(low=0.0, high=1.0, size=tuple([1,1]+list(kernel_shape))).astype(np.float32) + rnd_kernel_s = np.random.uniform(low=-0.5, high=0.5, size=tuple([1,1]+list(kernel_shape))).astype(np.float32) impulse_error = mx.nd.array(out_o/np.sum(out_o)) # This should be 1.0 at [0,0,16,16] rnd_kernel = mx.nd.array(rnd_kernel_s) @@ -2283,23 +2284,27 @@ def test_run_convolution_dilated_impulse_response(dil=(1,1), kernel_shape=(3,3), be.forward(True) out = be.outputs[0].asnumpy() # Now do a simple check of the kernel gradient - assert(out[center] - np.sum(kernel_gradient) - out_orig[center] < 0.001) - + d = np.abs(out[center] - np.sum(kernel_gradient) - out_orig[center]) + assert d < tol, f'd: {d}' @pytest.mark.serial def test_convolution_dilated_impulse_response(): + tol = 1e-3 # 1D for dil in [ (1,), (2,), (3,) ]: for ks in [ (1,), (2,), (3,), (4,)]: - test_run_convolution_dilated_impulse_response(dil=dil, kernel_shape=ks) + run_convolution_dilated_impulse_response(dil=dil, kernel_shape=ks, tol=tol) # 2D for dil in [ (1,1), (2,2), (3,3) ]: for ks in [ (3,3), (4,4), (2,3), (3,2), (1,1) ]: - test_run_convolution_dilated_impulse_response(dil=dil, kernel_shape=ks) + run_convolution_dilated_impulse_response(dil=dil, kernel_shape=ks, tol=tol) # 3D + # On Ampere, autotuning might select a TensorCore conv engine, which effectively + # does a cast to fp16 of the weights and data. Expand tol in these 3D cases. + tol3D = 1e-2 if effective_dtype(mx.nd.array([1.,])) == np.float16 else tol for dil in [ (1,1,1), (2,2,2), (3,3,3) ]: for ks in [ (3,3,3), (4,4,4), (2,3,4), (3,2,4), (1,1,1) ]: - test_run_convolution_dilated_impulse_response(dil=dil, kernel_shape=ks) + run_convolution_dilated_impulse_response(dil=dil, kernel_shape=ks, tol=tol3D) @pytest.mark.serial From f4e2f4064e3f9e044bad6c404f55faa8c0735c99 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Wed, 16 Feb 2022 22:40:35 -0800 Subject: [PATCH 12/19] Skip test_np_standard_binary_funcs test of 0-dim array broadcast --- tests/python/unittest/test_numpy_op.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index b71f599cacd2..e3a2fd8036c4 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -11715,8 +11715,12 @@ def array_values(low, high, shape): ((3, 1), (3, 0)), ((0, 2), (1, 2)), ((2, 3, 4), (3, 1)), - ((2, 3), ()), - ((), (2, 3)) +# MXNet numpy does not match original numpy behavior when broadcasting 0-dim arrays. +# See https://github.com/apache/incubator-mxnet/issues/20898. 
+# ((2, 3), ()), +# ((), (2, 3)) + ((2, 3), (1,)), + ((1,), (2, 3)) ]) def test_np_standard_binary_funcs(func, func2, promoted, dtypes, ref_grad_a, ref_grad_b, low, high, lshape, rshape): class TestStandardBinary(HybridBlock): From 4a9056a26f8c210497e3b5ed2318e30c8c2dbc5e Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Fri, 18 Feb 2022 01:53:07 -0800 Subject: [PATCH 13/19] Temporarily add '-s' to pytest cpu tests --- ci/docker/runtime_functions.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index d68ce9649dcf..cca0af642b4c 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -803,8 +803,9 @@ cd_unittest_ubuntu() { local mxnet_variant=${1:?"This function requires a mxnet variant as the first argument"} - OMP_NUM_THREADS=$(expr $(nproc) / 4) pytest -m 'not serial' -n 4 --durations=50 --verbose tests/python/unittest - pytest -m 'serial' --durations=50 --verbose tests/python/unittest + # Temporarily tell pytest to not capture output ('-s') to get more insight into Python: Aborted error + OMP_NUM_THREADS=$(expr $(nproc) / 4) pytest -m 'not serial' -n 4 --durations=50 --verbose -s --log-cli-level=DEBUG tests/python/unittest + pytest -m 'serial' --durations=50 --verbose -s --log-cli-level=DEBUG tests/python/unittest # https://github.com/apache/incubator-mxnet/issues/11801 # if [[ ${mxnet_variant} = "cpu" ]] || [[ ${mxnet_variant} = "mkl" ]]; then From 6c316fcf29a70e5e9a7f5efe3034921e119e2c0e Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Sun, 20 Feb 2022 20:02:44 -0800 Subject: [PATCH 14/19] Revert "Temporarily add '-s' to pytest cpu tests" This reverts commit 4a9056a26f8c210497e3b5ed2318e30c8c2dbc5e. --- ci/docker/runtime_functions.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index cca0af642b4c..d68ce9649dcf 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -803,9 +803,8 @@ cd_unittest_ubuntu() { local mxnet_variant=${1:?"This function requires a mxnet variant as the first argument"} - # Temporarily tell pytest to not capture output ('-s') to get more insight into Python: Aborted error - OMP_NUM_THREADS=$(expr $(nproc) / 4) pytest -m 'not serial' -n 4 --durations=50 --verbose -s --log-cli-level=DEBUG tests/python/unittest - pytest -m 'serial' --durations=50 --verbose -s --log-cli-level=DEBUG tests/python/unittest + OMP_NUM_THREADS=$(expr $(nproc) / 4) pytest -m 'not serial' -n 4 --durations=50 --verbose tests/python/unittest + pytest -m 'serial' --durations=50 --verbose tests/python/unittest # https://github.com/apache/incubator-mxnet/issues/11801 # if [[ ${mxnet_variant} = "cpu" ]] || [[ ${mxnet_variant} = "mkl" ]]; then From 02a09443673edb93736ed5905e375c2a70f0378f Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Sun, 20 Feb 2022 20:44:57 -0800 Subject: [PATCH 15/19] Improve test_rnn_layers_fp{16,32} invocation --- tests/python/unittest/test_gluon_rnn.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/python/unittest/test_gluon_rnn.py b/tests/python/unittest/test_gluon_rnn.py index cbb68d57aa19..1429c4d17ffc 100644 --- a/tests/python/unittest/test_gluon_rnn.py +++ b/tests/python/unittest/test_gluon_rnn.py @@ -606,7 +606,8 @@ def check_rnn_layer_forward(layer, inputs, states=None, run_only=False, device=m @mx.util.use_np -def run_rnn_layers(dtype, dtype2, device=mx.cpu()): +def run_rnn_layers(dtype, dtype2): + 
device = default_device() check_rnn_layer_forward(gluon.rnn.RNN(10, 2, dtype=dtype), mx.np.ones((8, 3, 20), dtype=dtype), device=device) check_rnn_layer_forward(gluon.rnn.RNN(10, 2, dtype=dtype, bidirectional=True), mx.np.ones((8, 3, 20), dtype=dtype), mx.np.ones((4, 3, 10), dtype=dtype), device=device) @@ -673,10 +674,12 @@ def test_rnn_layers_fp32(): run_rnn_layers('float32', 'float32') @assert_raises_cudnn_not_satisfied(min_version='5.1.10') -@pytest.mark.skipif(mx.device.num_gpus() == 0, reason="RNN FP16 only implemented for GPU for now") @pytest.mark.serial def test_rnn_layers_fp16(): - run_rnn_layers('float16', 'float32', mx.gpu()) + # Dynamic skip condition is best handled this way, rather than with pytest.mark.skipIf + if default_device().device_type == 'cpu': + pytest.skip('RNN FP16 only implemented for GPU for now') + run_rnn_layers('float16', 'float32') def check_rnn_consistency(fused_layer, stack_layer, loss, mode, num_layers, input_size, hidden_size, bidirectional=False, rtol=1e-2, atol=1e-4): From ae17b1f2af787427740c66a05ee1fb733ea56dd3 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Mon, 21 Feb 2022 00:54:41 -0800 Subject: [PATCH 16/19] Pin MarkupSafe==2.0.1 to avoid soft_unicode import failure --- docs/python_docs/requirements | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/python_docs/requirements b/docs/python_docs/requirements index 84b569cd19e7..62d453b2f156 100644 --- a/docs/python_docs/requirements +++ b/docs/python_docs/requirements @@ -18,6 +18,7 @@ numpy>=1.17,<1.20.0 jupyter Jinja2==2.11.3 +MarkupSafe==2.0.1 sphinx==2.4.0 matplotlib notebook From 223d9bf513cb73fb3f340ad4dee893af31ed4334 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Mon, 21 Feb 2022 10:42:28 -0800 Subject: [PATCH 17/19] Run test_rnn_layers_fp32 only when cuDNN is present --- tests/python/unittest/test_gluon_rnn.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/python/unittest/test_gluon_rnn.py b/tests/python/unittest/test_gluon_rnn.py index 1429c4d17ffc..ac38b73ac4f8 100644 --- a/tests/python/unittest/test_gluon_rnn.py +++ b/tests/python/unittest/test_gluon_rnn.py @@ -669,6 +669,7 @@ def run_rnn_layers(dtype, dtype2): out.backward() out = out.asnumpy() +@assert_raises_cudnn_not_satisfied(min_version='5.1.10') @pytest.mark.serial def test_rnn_layers_fp32(): run_rnn_layers('float32', 'float32') From 9114332377472701331c3a3b323b734b285ae9af Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Mon, 21 Feb 2022 16:58:32 -0800 Subject: [PATCH 18/19] Fix potential out-of-bounds write in count_sketch.cu --- src/operator/contrib/count_sketch.cu | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/operator/contrib/count_sketch.cu b/src/operator/contrib/count_sketch.cu index 24ca7970e064..bb16695caa74 100644 --- a/src/operator/contrib/count_sketch.cu +++ b/src/operator/contrib/count_sketch.cu @@ -93,6 +93,9 @@ __global__ void sketch_backward_kernel(const int nthreads, // only calculate gradient regarding x // can also calculate gradient regarding s if needed const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= nthreads) { + return; + } const int i_indim = index % in_dim; const int i_sample = index / in_dim; const int i_outdim = i_sample * out_dim + h[i_indim]; From 3b9a520a6da05314f97e14d39d0d05092764febf Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Wed, 23 Feb 2022 18:22:30 -0800 Subject: [PATCH 19/19] Revert "Pin MarkupSafe==2.0.1 to avoid soft_unicode import failure" This reverts commit ae17b1f2af787427740c66a05ee1fb733ea56dd3. 
--- docs/python_docs/requirements | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/python_docs/requirements b/docs/python_docs/requirements index 62d453b2f156..84b569cd19e7 100644 --- a/docs/python_docs/requirements +++ b/docs/python_docs/requirements @@ -18,7 +18,6 @@ numpy>=1.17,<1.20.0 jupyter Jinja2==2.11.3 -MarkupSafe==2.0.1 sphinx==2.4.0 matplotlib notebook
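
Note on the effective-dtype / TF32 changes (patch 06 and the test fixes in patches 08, 09 and 11): the comments added to python/mxnet/test_utils.py explain that on compute-capability >= 80 GPUs, TF32 makes float32 GEMM/conv outputs carry roughly float16 precision, so tests must widen their tolerances accordingly. A minimal sketch of how a test would consume this; the helper name check_float32_conv_output and the tolerance values are illustrative only, while effective_dtype and assert_almost_equal are the mxnet.test_utils helpers the patches themselves use:

    import numpy as onp
    import mxnet as mx
    from mxnet.test_utils import effective_dtype, assert_almost_equal

    def check_float32_conv_output(out_mx, out_ref):
        # On arch >= 80 GPUs with TF32 enabled, a float32 conv/gemm output only
        # carries ~float16 precision, so pick tolerances from the effective dtype
        # of the result rather than from its nominal float32 dtype.
        if effective_dtype(out_mx) == onp.dtype(onp.float16):
            rtol, atol = 1e-2, 1e-2
        else:
            rtol, atol = 1e-4, 1e-4
        assert_almost_equal(out_mx, out_ref, rtol=rtol, atol=atol)

This mirrors the pattern the series applies in test_convolution_grouping and the 3D cases of test_convolution_dilated_impulse_response, where tolerances are widened whenever effective_dtype reports float16.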