From 8e3f0f31c6c281d82833c2ed06f611b52cc811ad Mon Sep 17 00:00:00 2001 From: dw_sjtu <46704444+sjtuWangDing@users.noreply.github.com> Date: Tue, 14 Apr 2020 12:57:58 +0800 Subject: [PATCH 01/14] * impl - linalg matrix_rank for cpu and gpu implemented (#18020) * fix - python interface * impl - ffi for matrix_rank * impl - ffi benchmark Co-authored-by: Ubuntu --- benchmark/python/ffi/benchmark_ffi.py | 9 +- python/mxnet/ndarray/numpy/linalg.py | 47 +- python/mxnet/numpy/fallback_linalg.py | 2 - python/mxnet/numpy/linalg.py | 43 +- python/mxnet/numpy_dispatch_protocol.py | 1 + python/mxnet/symbol/numpy/linalg.py | 35 +- .../operator/numpy/linalg/np_matrix_rank.cc | 76 +++ .../numpy/linalg/np_matrix_rank-inl.h | 449 ++++++++++++++++++ src/operator/numpy/linalg/np_matrix_rank.cc | 165 +++++++ src/operator/numpy/linalg/np_matrix_rank.cu | 38 ++ .../unittest/test_numpy_interoperability.py | 29 +- tests/python/unittest/test_numpy_op.py | 77 +++ 12 files changed, 956 insertions(+), 15 deletions(-) create mode 100644 src/api/operator/numpy/linalg/np_matrix_rank.cc create mode 100644 src/operator/numpy/linalg/np_matrix_rank-inl.h create mode 100644 src/operator/numpy/linalg/np_matrix_rank.cc create mode 100644 src/operator/numpy/linalg/np_matrix_rank.cu diff --git a/benchmark/python/ffi/benchmark_ffi.py b/benchmark/python/ffi/benchmark_ffi.py index 01534f1c949a..099bfadbe6d2 100644 --- a/benchmark/python/ffi/benchmark_ffi.py +++ b/benchmark/python/ffi/benchmark_ffi.py @@ -65,6 +65,7 @@ def prepare_workloads(): OpArgMngr.add_workload("linalg.eigh", pool['3x3']) OpArgMngr.add_workload("linalg.det", pool['3x3']) OpArgMngr.add_workload("linalg.slogdet", pool['3x3']) + OpArgMngr.add_workload("linalg.matrix_rank", pool['3x3'], pool['1'], hermitian=False) OpArgMngr.add_workload("linalg.svd", pool['3x3']) OpArgMngr.add_workload("linalg.cholesky", pool['1x1']) OpArgMngr.add_workload("linalg.eigvals", pool['1x1']) @@ -127,10 +128,10 @@ def prepare_workloads(): out=dnp.array([False, False], dtype=bool), keepdims=False) OpArgMngr.add_workload("roll", pool["2x2"], 1, axis=0) OpArgMngr.add_workload("rot90", pool["2x2"], 2) - OpArgMngr.add_workload("array_split", pool['2X2'], 2, axis=1) - OpArgMngr.add_workload("vsplit", pool['2X2'], 2) - OpArgMngr.add_workload("hsplit", pool['2X2'], 2) - OpArgMngr.add_workload("dsplit", pool['2X2x2'], 2) + OpArgMngr.add_workload("array_split", pool['2x2'], 2, axis=1) + OpArgMngr.add_workload("vsplit", pool['2x2'], 2) + OpArgMngr.add_workload("hsplit", pool['2x2'], 2) + OpArgMngr.add_workload("dsplit", pool['2x2x2'], 2) OpArgMngr.add_workload("arange", 10) OpArgMngr.add_workload("concatenate", (pool['1x2'], pool['1x2'], pool['1x2']), axis=0) OpArgMngr.add_workload("append", pool['2x2'], pool['1x2'], axis=0) diff --git a/python/mxnet/ndarray/numpy/linalg.py b/python/mxnet/ndarray/numpy/linalg.py index 7d7d6fc064c7..86bf11a00b02 100644 --- a/python/mxnet/ndarray/numpy/linalg.py +++ b/python/mxnet/ndarray/numpy/linalg.py @@ -23,7 +23,52 @@ from . import _api_internal __all__ = ['norm', 'svd', 'cholesky', 'qr', 'inv', 'det', 'slogdet', 'solve', 'tensorinv', 'tensorsolve', - 'pinv', 'eigvals', 'eig', 'eigvalsh', 'eigh', 'lstsq'] + 'pinv', 'eigvals', 'eig', 'eigvalsh', 'eigh', 'lstsq', 'matrix_rank'] + + +def matrix_rank(M, tol=None, hermitian=False): + """ + Return matrix rank of array using SVD method + + Rank of the array is the number of singular values of the array that are + greater than `tol`. + + Parameters + M : {(M,), (..., M, N)} ndarray + Input vector or stack of matrices. 
+ tol : (...) ndarray, float, optional + Threshold below which SVD values are considered zero. If `tol` is + None, and ``S`` is an array with singular values for `M`, and + ``eps`` is the epsilon value for datatype of ``S``, then `tol` is + set to ``S.max() * max(M.shape) * eps``. + hermitian : bool, optional + If True, `M` is assumed to be Hermitian (symmetric if real-valued), + enabling a more efficient method for finding singular values. + Defaults to False. + + Returns + ------- + rank : (...) ndarray + Rank of M. + + Examples + -------- + >>> from mxnet import np + >>> np.matrix_rank(np.eye(4)) # Full rank matrix + 4 + >>> I=np.eye(4); I[-1,-1] = 0. # rank deficient matrix + >>> np.matrix_rank(I) + 3 + >>> np.matrix_rank(np.ones((4,))) # 1 dimension - rank 1 unless all 0 + 1 + >>> np.matrix_rank(np.zeros((4,))) + 0 + """ + finfo_eps_32 = _np.finfo(_np.float32).eps + finfo_eps_64 = _np.finfo(_np.float64).eps + if hermitian is True: + raise NotImplementedError("hermitian is not supported yet...") + return _api_internal.matrix_rank(M, tol, hermitian, finfo_eps_32, finfo_eps_64) def lstsq(a, b, rcond='warn'): diff --git a/python/mxnet/numpy/fallback_linalg.py b/python/mxnet/numpy/fallback_linalg.py index 5e06ff94a4ce..79d6b83062ec 100644 --- a/python/mxnet/numpy/fallback_linalg.py +++ b/python/mxnet/numpy/fallback_linalg.py @@ -24,11 +24,9 @@ __all__ = [ 'cond', 'matrix_power', - 'matrix_rank', 'multi_dot' ] cond = onp.linalg.cond matrix_power = onp.linalg.matrix_power -matrix_rank = onp.linalg.matrix_rank multi_dot = onp.linalg.multi_dot diff --git a/python/mxnet/numpy/linalg.py b/python/mxnet/numpy/linalg.py index 445adfdeaeae..d2756d531b7c 100644 --- a/python/mxnet/numpy/linalg.py +++ b/python/mxnet/numpy/linalg.py @@ -22,10 +22,51 @@ from . import fallback_linalg __all__ = ['norm', 'svd', 'cholesky', 'qr', 'inv', 'det', 'slogdet', 'solve', 'tensorinv', 'tensorsolve', - 'pinv', 'eigvals', 'eig', 'eigvalsh', 'eigh', 'lstsq'] + 'pinv', 'eigvals', 'eig', 'eigvalsh', 'eigh', 'lstsq', 'matrix_rank'] __all__ += fallback_linalg.__all__ +def matrix_rank(M, tol=None, hermitian=False): + """ + Return matrix rank of array using SVD method + + Rank of the array is the number of singular values of the array that are + greater than `tol`. + + Parameters + M : {(M,), (..., M, N)} ndarray + Input vector or stack of matrices. + tol : (...) ndarray, float, optional + Threshold below which SVD values are considered zero. If `tol` is + None, and ``S`` is an array with singular values for `M`, and + ``eps`` is the epsilon value for datatype of ``S``, then `tol` is + set to ``S.max() * max(M.shape) * eps``. + hermitian : bool, optional + If True, `M` is assumed to be Hermitian (symmetric if real-valued), + enabling a more efficient method for finding singular values. + Defaults to False. + + Returns + ------- + rank : (...) ndarray + Rank of M. + + Examples + -------- + >>> from mxnet import np + >>> np.matrix_rank(np.eye(4)) # Full rank matrix + 4 + >>> I=np.eye(4); I[-1,-1] = 0. # rank deficient matrix + >>> np.matrix_rank(I) + 3 + >>> np.matrix_rank(np.ones((4,))) # 1 dimension - rank 1 unless all 0 + 1 + >>> np.matrix_rank(np.zeros((4,))) + 0 + """ + return _mx_nd_np.linalg.matrix_rank(M, tol, hermitian) + + def lstsq(a, b, rcond='warn'): r""" Return the least-squares solution to a linear matrix equation. 
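For reference, the following is a minimal NumPy-only sketch of the default-tolerance behaviour described in the matrix_rank docstrings above. It is illustrative only (plain NumPy, a single 2-D input assumed) and is not part of the changed files:

    import numpy as onp

    def reference_matrix_rank(M, tol=None):
        # Singular values of a single 2-D matrix.
        S = onp.linalg.svd(M, compute_uv=False)
        if tol is None:
            # Default threshold from the docstring: S.max() * max(M.shape) * eps.
            tol = S.max() * max(M.shape) * onp.finfo(S.dtype).eps
        # Rank is the number of singular values above the threshold.
        return int((S > tol).sum())
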
diff --git a/python/mxnet/numpy_dispatch_protocol.py b/python/mxnet/numpy_dispatch_protocol.py index 9d973e492f04..e693a00ea1a5 100644 --- a/python/mxnet/numpy_dispatch_protocol.py +++ b/python/mxnet/numpy_dispatch_protocol.py @@ -166,6 +166,7 @@ def _run_with_array_ufunc_proto(*args, **kwargs): 'linalg.eigvalsh', 'linalg.eigh', 'linalg.qr', + 'linalg.matrix_rank', 'shape', 'trace', 'tril', diff --git a/python/mxnet/symbol/numpy/linalg.py b/python/mxnet/symbol/numpy/linalg.py index 81740dd0ac2e..da7095520674 100644 --- a/python/mxnet/symbol/numpy/linalg.py +++ b/python/mxnet/symbol/numpy/linalg.py @@ -23,7 +23,40 @@ from . import _internal as _npi __all__ = ['norm', 'svd', 'cholesky', 'qr', 'inv', 'det', 'slogdet', 'solve', 'tensorinv', 'tensorsolve', - 'pinv', 'eigvals', 'eig', 'eigvalsh', 'eigh', 'lstsq'] + 'pinv', 'eigvals', 'eig', 'eigvalsh', 'eigh', 'lstsq', 'matrix_rank'] + + +def matrix_rank(M, tol=None, hermitian=False): + """ + Return matrix rank of array using SVD method + + Rank of the array is the number of singular values of the array that are + greater than `tol`. + + Parameters + M : {(M,), (..., M, N)} _Symbol + Input vector or stack of matrices. + tol : (...) _Symbol, float, optional + Threshold below which SVD values are considered zero. If `tol` is + None, and ``S`` is an array with singular values for `M`, and + ``eps`` is the epsilon value for datatype of ``S``, then `tol` is + set to ``S.max() * max(M.shape) * eps``. + hermitian : bool, optional + If True, `M` is assumed to be Hermitian (symmetric if real-valued), + enabling a more efficient method for finding singular values. + Defaults to False. + + Returns + ------- + rank : (...) _Symbol + Rank of M. + """ + finfo_eps_32 = _np.finfo(_np.float32).eps + finfo_eps_64 = _np.finfo(_np.float64).eps + if tol is None: + return _npi.matrix_rank_none_tol(M, finfo_eps_32, finfo_eps_64, hermitian) + else: + return _npi.matrix_rank(M, tol, hermitian) def lstsq(a, b, rcond='warn'): diff --git a/src/api/operator/numpy/linalg/np_matrix_rank.cc b/src/api/operator/numpy/linalg/np_matrix_rank.cc new file mode 100644 index 000000000000..4bfe66664ef8 --- /dev/null +++ b/src/api/operator/numpy/linalg/np_matrix_rank.cc @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file np_pinv.cc + * \brief Implementation of the API of functions in src/operator/numpy/linalg/np_matrix_rank.cc + */ +#include +#include +#include "../../utils.h" +#include "../../../../operator/numpy/linalg/np_matrix_rank-inl.h" + +namespace mxnet { + +inline static void _npi_matrix_rank_none_tol(runtime::MXNetArgs args, + runtime::MXNetRetValue* ret) { + using namespace runtime; + const nnvm::Op* op = Op::Get("_npi_matrix_rank_none_tol"); + op::MatrixRankNoneTolParam param; + nnvm::NodeAttrs attrs; + param.hermitian = args[2].operator bool(); + param.finfoEps32 = args[3].operator double(); + param.finfoEps64 = args[4].operator double(); + attrs.parsed = param; + attrs.op = op; + SetAttrDict(&attrs); + int num_inputs = 1; + int num_outputs = 0; + NDArray* inputs[] = {args[0].operator mxnet::NDArray*()}; + auto ndoutputs = Invoke(op, &attrs, num_inputs, inputs, &num_outputs, nullptr); + *ret = reinterpret_cast(ndoutputs[0]); +} + +inline static void _npi_matrix_rank(runtime::MXNetArgs args, + runtime::MXNetRetValue* ret) { + using namespace runtime; + const nnvm::Op* op = Op::Get("_npi_matrix_rank"); + op::MatrixRankParam param; + nnvm::NodeAttrs attrs; + param.hermitian = args[2].operator bool(); + attrs.parsed = param; + attrs.op = op; + SetAttrDict(&attrs); + int num_inputs = 2; + int num_outputs = 0; + NDArray* inputs[] = {args[0].operator mxnet::NDArray*(), args[1].operator mxnet::NDArray*()}; + auto ndoutputs = Invoke(op, &attrs, num_inputs, inputs, &num_outputs, nullptr); + *ret = reinterpret_cast(ndoutputs[0]); +} + +MXNET_REGISTER_API("_npi.matrix_rank") +.set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { + if (args[1].type_code() == kNull) { + _npi_matrix_rank_none_tol(args, ret); + } else { + _npi_matrix_rank(args, ret); + } +}); + +} // namespace mxnet diff --git a/src/operator/numpy/linalg/np_matrix_rank-inl.h b/src/operator/numpy/linalg/np_matrix_rank-inl.h new file mode 100644 index 000000000000..8ccecb57db11 --- /dev/null +++ b/src/operator/numpy/linalg/np_matrix_rank-inl.h @@ -0,0 +1,449 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2020 by Contributors + * \file np_matrix_rank-inl.h + * \brief Placeholder for matrix_rank + */ +#ifndef MXNET_OPERATOR_NUMPY_LINALG_NP_MATRIX_RANK_INL_H_ +#define MXNET_OPERATOR_NUMPY_LINALG_NP_MATRIX_RANK_INL_H_ + +#include +#include +#include +#include +#include +#include "../../operator_common.h" +#include "../../mshadow_op.h" +#include "./np_pinv-inl.h" + +namespace mxnet { +namespace op { + +using namespace mshadow; + +struct MatrixRankNoneTolParam : public dmlc::Parameter { + float finfoEps32; + double finfoEps64; + bool hermitian; + DMLC_DECLARE_PARAMETER(MatrixRankNoneTolParam) { + DMLC_DECLARE_FIELD(finfoEps32) + .set_default(0) + .describe("Machine limits for float32 type"); + DMLC_DECLARE_FIELD(finfoEps64) + .set_default(0) + .describe("Machine limits for float64 type"); + DMLC_DECLARE_FIELD(hermitian) + .set_default(false) + .describe("If True, M is assumed to be Hermitian (symmetric if real-valued)."); + } + void SetAttrDict(std::unordered_map* dict) { + std::ostringstream finfoEps32_s, finfoEps64_s, hermitian_s; + finfoEps32_s << finfoEps32; + finfoEps64_s << finfoEps64; + hermitian_s << hermitian; + (*dict)["finfoEps32"] = finfoEps32_s.str(); + (*dict)["finfoEps64"] = finfoEps64_s.str(); + (*dict)["hermitian"] = hermitian_s.str(); + } +}; + +struct MatrixRankParam : public dmlc::Parameter { + bool hermitian; + DMLC_DECLARE_PARAMETER(MatrixRankParam) { + DMLC_DECLARE_FIELD(hermitian) + .set_default(false) + .describe("If True, M is assumed to be Hermitian (symmetric if real-valued)."); + } + void SetAttrDict(std::unordered_map* dict) { + std::ostringstream hermitian_s; + hermitian_s << hermitian; + (*dict)["hermitian"] = hermitian_s.str(); + } +}; + +template +struct VectorRankKernel { + template + MSHADOW_XINLINE static void Map(int i, const DType *in_data, + int64_t *out_data, const int& data_size) { + bool all_nozero = true; + for (int j = 0; j < data_size; ++j) { + if (!((in_data[j] > 0 ? in_data[j] : -in_data[j]) > 0)) { + all_nozero = false; + break; + } + } + KERNEL_ASSIGN(*out_data, req, static_cast(all_nozero ? 1 : 0)); + } +}; + +template +struct MatrixRankNoneTolKernel { + template + MSHADOW_XINLINE static void Map(int i, const DType *in_data, int64_t *out_data, + const int& nrow, const int& ncol, const double& finfoEps, + const int& data_size, const int& batch_size) { + if (i < batch_size) { + DType max_singular_value = 0; + for (int j = 0; j < data_size; ++j) { + DType sv = in_data[j + i * data_size]; + max_singular_value = sv > max_singular_value ? sv : max_singular_value; + } + double tol = (nrow > ncol ? nrow : ncol) * static_cast(max_singular_value) * finfoEps; + int64_t rank_num = 0; + for (int j = 0; j < data_size; ++j) { + rank_num += in_data[j + i * data_size] > tol ? 1 : 0; + } + KERNEL_ASSIGN(out_data[i], req, rank_num); + } + } +}; + +template +struct MatrixRankKernel { + template + MSHADOW_XINLINE static void Map(int i, const DType *in_data, int64_t *out_data, + const int& data_size, const int& batch_size) { + if (i < batch_size) { + int64_t rank_num = 0; + for (int j = 0; j < data_size; ++j) { + rank_num += in_data[j + i * data_size] > 0 ? 
1 : 0; + } + KERNEL_ASSIGN(out_data[i], req, rank_num); + } + } +}; + +struct SVDWrapper { + template + static void op(const TBlob& a, const TBlob& s, + const TBlob& u, const mxnet::TShape& ut_shape, + const TBlob& v, const mxnet::TShape& vt_shape, + const TBlob& work, const OpContext& ctx) { + Stream *s_xpu = ctx.get_stream(); + const mxnet::TShape& a_shape = a.shape_; + const mxnet::TShape& ut_axis = GetTransAxis(u.shape_); + const int a_ndim = a.ndim(); + const int nrow = a_shape[a_ndim - 2]; + const int ncol = a_shape[a_ndim - 1]; + if (nrow > ncol) { + const_cast(u) = u.reshape(ut_shape); + const_cast(v) = v.reshape(vt_shape); + mxnet::op::TransposeImpl(ctx.run_ctx, a, u, ut_axis); + BatchSVDImpl(ncol, nrow, + v.FlatToKD(s_xpu), + s.FlatToKD(s_xpu), + u.FlatToKD(s_xpu), + work.FlatToKD(s_xpu), s_xpu); + } else { + if (a.dptr() != v.dptr()) { + Copy(v.FlatToKD(s_xpu), a.FlatToKD(s_xpu), s_xpu); + } + BatchSVDImpl(nrow, ncol, + u.FlatToKD(s_xpu), + s.FlatToKD(s_xpu), + v.FlatToKD(s_xpu), + work.FlatToKD(s_xpu), s_xpu); + } + } +}; + +inline void GetOrCheckBroadcastShape(const nnvm::NodeAttrs& attrs, + const mxnet::TShape& a_shape, + const mxnet::TShape& tol_shape, + mxnet::TShape *broadcast_shape = nullptr, + mxnet::TShape *new_tol_shape = nullptr) { + CHECK_GE(a_shape.ndim(), 2); + const int a_ndim = a_shape.ndim(); + const int tol_ndim = tol_shape.ndim(); + const int nrow = a_shape[a_ndim - 2]; + const int ncol = a_shape[a_ndim - 1]; + // Get new tol shape. + mxnet::TShape temp_new_tol_shape(tol_ndim + 1, 1); + for (int i = 0; i < tol_ndim; ++i) { temp_new_tol_shape[i] = tol_shape[i]; } + // Get singular value shape. + mxnet::TShape temp_s_shape(a_ndim - 1, 0); + for (int i = 0; i < a_ndim - 2; ++i) { + temp_s_shape[i] = a_shape[i]; + } + temp_s_shape[a_ndim - 2] = std::min(nrow, ncol); + // Check binary broadcast shape. + mxnet::ShapeVector in_shape_vec({ temp_s_shape, temp_new_tol_shape }); + mxnet::ShapeVector out_shape_vec(1, mxnet::TShape()); + mxnet::op::BinaryBroadcastShape(attrs, &in_shape_vec, &out_shape_vec); + // Assign shape. + if (broadcast_shape) { + *broadcast_shape = out_shape_vec[0]; + } + if (new_tol_shape) { + *new_tol_shape = temp_new_tol_shape; + } +} + +template +struct WSQ { + static size_t SVDWorkspaceSizeQuery(const TBlob& a, + const mxnet::TShape& u_shape, + const mxnet::TShape& s_shape, + const mxnet::TShape& v_shape, + const OpContext& ctx) { + size_t workspace_size = 0; + Stream *s = ctx.get_stream(); + const int a_ndim = a.shape_.ndim(); + const int u_ndim = u_shape.ndim(); + const int s_ndim = s_shape.ndim(); + const int v_ndim = v_shape.ndim(); + mxnet::TShape u_shape2 = Shape2(u_shape[u_ndim - 2], u_shape[u_ndim - 1]); + mxnet::TShape s_shape1 = Shape1(s_shape[s_ndim - 1]); + mxnet::TShape v_shape2 = Shape2(v_shape[v_ndim - 2], v_shape[v_ndim - 1]); + if (xpu::kDevCPU) { + std::vector u_vec(u_shape2.Size(), 0); + std::vector s_vec(s_shape1.Size(), 0); + std::vector v_vec(v_shape2.Size(), 0); + // Get workspace size in linalg_gesdd. 
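+          // Note: the u_vec/s_vec/v_vec buffers above are scratch storage only; they
+          // give the workspace-size query below correctly shaped tensors, and their
+          // values are not used afterwards.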
+ workspace_size += linalg_gesdd_workspace_query( + a.shape_[a_ndim - 2], a.shape_[a_ndim - 1], + TBlob(u_vec.data(), u_shape2, a.dev_mask(), a.dev_id()).get(s), + TBlob(s_vec.data(), s_shape1, a.dev_mask(), a.dev_id()).get(s), + TBlob(v_vec.data(), v_shape2, a.dev_mask(), a.dev_id()).get(s), s); + } else { + Storage::Handle u_handle = + Storage::Get()->Alloc(sizeof(DType) * u_shape2.Size(), Context::GPU()); + Storage::Handle s_handle = + Storage::Get()->Alloc(sizeof(DType) * s_shape1.Size(), Context::GPU()); + Storage::Handle v_handle = + Storage::Get()->Alloc(sizeof(DType) * v_shape2.Size(), Context::GPU()); + TBlob u_data(static_cast(u_handle.dptr), u_shape2, a.dev_mask(), a.dev_id()); + TBlob s_data(static_cast(s_handle.dptr), s_shape1, a.dev_mask(), a.dev_id()); + TBlob v_data(static_cast(v_handle.dptr), v_shape2, a.dev_mask(), a.dev_id()); + // Get workspace size in linalg_gesvd. + if (a.shape_[a_ndim - 2] >= a.shape_[a_ndim - 1]) { + workspace_size += linalg_gesvd_workspace_query(v_data.get(s), + s_data.get(s), + u_data.get(s), s); + } else { + workspace_size += linalg_gesvd_workspace_query(u_data.get(s), + s_data.get(s), + v_data.get(s), s); + } + Storage::Get()->Free(u_handle); + Storage::Get()->Free(s_handle); + Storage::Get()->Free(v_handle); + } + return workspace_size; + } + + static size_t MatrixRankNoneTolForwardWSQ(size_t *svd_workspace_size, + const TBlob& a, + const OpContext& ctx) { + size_t workspace_size = 0; + mxnet::TShape u_shape, s_shape, v_shape; + GetPinvShape(a.shape_, &u_shape, &s_shape, &v_shape); + *svd_workspace_size = SVDWorkspaceSizeQuery(a, u_shape, s_shape, v_shape, ctx); + workspace_size += *svd_workspace_size; // For #gesdd_ or #gesvd work space. + workspace_size += u_shape.Size(); // For UT. + workspace_size += s_shape.Size(); // For S. + workspace_size += v_shape.Size(); // For V. + return workspace_size * sizeof(DType); + } + + static size_t MatrixRankForwardWSQ(size_t *svd_workspace_size, + const TBlob& a, + const TBlob& tol, + const nnvm::NodeAttrs& attrs, + const OpContext& ctx) { + const mxnet::TShape a_shape = a.shape_; + const mxnet::TShape tol_shape = tol.shape_; + size_t workspace_size = 0; + mxnet::TShape u_shape, s_shape, v_shape; + GetPinvShape(a.shape_, &u_shape, &s_shape, &v_shape); + mxnet::TShape broadcast_shape, new_tol_shape; + GetOrCheckBroadcastShape(attrs, a_shape, tol_shape, &broadcast_shape, &new_tol_shape); + *svd_workspace_size = SVDWorkspaceSizeQuery(a, u_shape, s_shape, v_shape, ctx); + workspace_size += *svd_workspace_size; // For #gesdd_ or #gesvd work space. + workspace_size += u_shape.Size(); // For UT. + workspace_size += s_shape.Size(); // For S. + workspace_size += v_shape.Size(); // For V. + workspace_size += new_tol_shape.Size(); // For tol with newaxis. + workspace_size += broadcast_shape.Size(); // For binary broadcast shape. 
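+    // The counts accumulated above are element counts; the return below converts
+    // them to bytes via sizeof(DType).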
+ return workspace_size * sizeof(DType); + } +}; + +template +void MatrixRankNoneTolForwardImpl(const TBlob& a, + const TBlob& rank, + const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& req) { + Stream *s = ctx.get_stream(); + const mxnet::TShape& a_shape = a.shape_; + const int a_ndim = a.ndim(); + MSHADOW_SGL_DBL_TYPE_SWITCH(a.type_flag_, DType, { + MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { + if (a_ndim < 2) { + mxnet_op::Kernel, xpu>::Launch( + s, 1, a.dptr(), rank.dptr(), a.Size()); + return; + } + // a_ndim >= 2 + const int nrow = a_shape[a_ndim - 2]; + const int ncol = a_shape[a_ndim - 1]; + const MatrixRankNoneTolParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(param.hermitian, false) + << "matrix_rank not support param.hermitian = true at present."; + double finfoEps = a.type_flag_ == mshadow::kFloat32 ? param.finfoEps32 : param.finfoEps64; + // Step1: Calculate workspace size. + size_t svd_workspace_size = 0; + size_t workspace_size = + WSQ::MatrixRankNoneTolForwardWSQ(&svd_workspace_size, a, ctx); + Tensor workspace = + ctx.requested[0].get_space_typed(Shape1(workspace_size), s); + // Step2: Allocate memory. + mxnet::TShape s_shape, u_shape, v_shape, ut_shape, vt_shape; + GetPinvShape(a_shape, &u_shape, &s_shape, &v_shape, &ut_shape, &vt_shape); + DType *s_ptr = reinterpret_cast(workspace.dptr_); + DType *u_ptr = s_ptr + s_shape.Size(); + DType *v_ptr = u_ptr + u_shape.Size(); + DType *work_ptr = v_ptr + v_shape.Size(); + TBlob s_data(s_ptr, s_shape, a.dev_mask(), a.dev_id()); + TBlob u_data(u_ptr, u_shape, a.dev_mask(), a.dev_id()); + TBlob v_data(v_ptr, v_shape, a.dev_mask(), a.dev_id()); + TBlob work_data(work_ptr, Shape1(svd_workspace_size), a.dev_mask(), a.dev_id()); + // Step3: SVD. + SVDWrapper::op(a, s_data, u_data, ut_shape, v_data, vt_shape, work_data, ctx); + // Step4: Calculate rank. + const int data_size = s_data.size(s_data.ndim() - 1); + const int batch_size = a_ndim == 2 ? 
1 : s_shape.ProdShape(0, s_shape.ndim() - 1); + mxnet_op::Kernel, xpu>::Launch(s, batch_size, + s_data.dptr(), + rank.dptr(), + nrow, ncol, finfoEps, + data_size, batch_size); + }); + }); +} + +template +void MatrixRankNoneTolForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + if (kNullOp == req[0]) { return; } + CHECK(req[0] == kWriteTo || req[0] == kWriteInplace); + + const TBlob& a = inputs[0]; + const TBlob& rank = outputs[0]; + MatrixRankNoneTolForwardImpl(a, rank, attrs, ctx, req); +} + +template +void MatrixRankForwardImpl(const TBlob& a, + const TBlob& tol, + const TBlob& rank, + const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& req) { + Stream *s = ctx.get_stream(); + const mxnet::TShape& a_shape = a.shape_; + const mxnet::TShape& tol_shape = tol.shape_; + const int a_ndim = a.ndim(); + MSHADOW_SGL_DBL_TYPE_SWITCH(a.type_flag_, DType, { + MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { + if (a_ndim < 2) { + mxnet_op::Kernel, xpu>::Launch( + s, 1, a.dptr(), rank.dptr(), a.Size()); + return; + } + // a_ndim >= 2 + const MatrixRankParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(param.hermitian, false) + << "matrix_rank not support param.hermitian = true at present."; + mxnet::TShape s_shape, u_shape, v_shape, ut_shape, vt_shape; + GetPinvShape(a_shape, &u_shape, &s_shape, &v_shape, &ut_shape, &vt_shape); + mxnet::TShape broadcast_shape, new_tol_shape; + GetOrCheckBroadcastShape(attrs, a_shape, tol_shape, &broadcast_shape, &new_tol_shape); + // Step1: Calculate workspace size. + size_t svd_workspace_size = 0; + size_t workspace_size = + WSQ::MatrixRankForwardWSQ(&svd_workspace_size, a, tol, attrs, ctx); + Tensor workspace = + ctx.requested[0].get_space_typed(Shape1(workspace_size), s); + // Step2: Allocate memory. + DType *s_ptr = reinterpret_cast(workspace.dptr_); + DType *u_ptr = s_ptr + s_shape.Size(); + DType *v_ptr = u_ptr + u_shape.Size(); + DType *work_ptr = v_ptr + v_shape.Size(); + DType *new_tol_ptr = work_ptr + svd_workspace_size; + DType *broadcast_ptr = new_tol_ptr + new_tol_shape.Size(); + TBlob s_data(s_ptr, s_shape, a.dev_mask(), a.dev_id()); + TBlob u_data(u_ptr, u_shape, a.dev_mask(), a.dev_id()); + TBlob v_data(v_ptr, v_shape, a.dev_mask(), a.dev_id()); + TBlob work_data(work_ptr, Shape1(svd_workspace_size), a.dev_mask(), a.dev_id()); + TBlob new_tol_data(new_tol_ptr, new_tol_shape, a.dev_mask(), a.dev_id()); + TBlob broadcast_data(broadcast_ptr, broadcast_shape, a.dev_mask(), a.dev_id()); + // Step3: SVD. + SVDWrapper::op(a, s_data, u_data, ut_shape, v_data, vt_shape, work_data, ctx); + // Step4: Calculate broadcast data. + if (new_tol_data.dptr() != tol.dptr()) { + Copy(new_tol_data.FlatTo1D(s), tol.FlatTo1D(s), s); + } + mxnet::op::BinaryBroadcastCompute(attrs, ctx, + {s_data, new_tol_data}, + {kWriteTo}, {broadcast_data}); + // Step5: Calculate rank. + const int b_ndim = broadcast_shape.ndim(); + const int data_size = broadcast_data.size(b_ndim - 1); + const int batch_size = b_ndim == 1 ? 
1 : broadcast_shape.ProdShape(0, b_ndim - 1); + mxnet_op::Kernel, xpu>::Launch(s, batch_size, + broadcast_data.dptr(), + rank.dptr(), + data_size, batch_size); + }); + }); +} + +template +void MatrixRankForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + if (kNullOp == req[0]) { return; } + CHECK(req[0] == kWriteTo || req[0] == kWriteInplace); + + const TBlob& a = inputs[0]; + const TBlob& tol = inputs[1]; + const TBlob& rank = outputs[0]; + MatrixRankForwardImpl(a, tol, rank, attrs, ctx, req); +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_NUMPY_LINALG_NP_MATRIX_RANK_INL_H_ diff --git a/src/operator/numpy/linalg/np_matrix_rank.cc b/src/operator/numpy/linalg/np_matrix_rank.cc new file mode 100644 index 000000000000..d3794a1de0e9 --- /dev/null +++ b/src/operator/numpy/linalg/np_matrix_rank.cc @@ -0,0 +1,165 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2020 by Contributors + * \file np_matrix_rank.cc + * \brief CPU implementation of the matrix_rank Operator + */ +#include "./np_matrix_rank-inl.h" + +namespace mxnet { +namespace op { + +inline bool MatrixRankNoneTolShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector *in_attrs, + mxnet::ShapeVector *out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + const mxnet::TShape& a_shape = (*in_attrs)[0]; + const int a_ndim = a_shape.ndim(); + + if (shape_is_known(a_shape)) { + CHECK_GT(a_shape.Size(), 0U) + << "Not support zero-size input array which has no identity"; + if (a_ndim < 2) { + SHAPE_ASSIGN_CHECK(*out_attrs, 0, mxnet::TShape(0, 0)); + } else { + mxnet::TShape rank_shape(a_ndim - 2, 0); + for (int i = 0; i < a_ndim - 2; ++i) { rank_shape[i] = a_shape[i]; } + SHAPE_ASSIGN_CHECK(*out_attrs, 0, rank_shape); + } + } + return shape_is_known(*in_attrs) && shape_is_known(*out_attrs); +} + +inline bool MatrixRankNoneTolType(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + int a_type = in_attrs->at(0); + + CHECK_NE(a_type, mshadow::kFloat16) + << "array type float16 is unsupported in linalg."; + CHECK(a_type == mshadow::kFloat32 || a_type == mshadow::kFloat64) + << "array type should be float32 or float64."; + TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt64); + return out_attrs->at(0) != -1; +} + +DMLC_REGISTER_PARAMETER(MatrixRankNoneTolParam); + +NNVM_REGISTER_OP(_npi_matrix_rank_none_tol) +.describe(R"code()code" ADD_FILELINE) +.set_attr_parser(mxnet::op::ParamParser) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr("FListInputNames", [](const NodeAttrs& attrs){ + return std::vector{"M"}; +}) +.set_attr("FInferShape", MatrixRankNoneTolShape) +.set_attr("FInferType", MatrixRankNoneTolType) +.set_attr("FResourceRequest", [](const NodeAttrs& attrs){ + return std::vector{ResourceRequest::kTempSpace}; +}) +.set_attr("FCompute", MatrixRankNoneTolForward) +.set_attr("FGradient", MakeZeroGradNodes) +.add_argument("M", "NDArray-or-Symbol", "Tensor of matrix") +.add_arguments(MatrixRankNoneTolParam::__FIELDS__()); + +inline bool MatrixRankShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector *in_attrs, + mxnet::ShapeVector *out_attrs) { + CHECK_EQ(in_attrs->size(), 2U); + CHECK_EQ(out_attrs->size(), 1U); + const mxnet::TShape& a_shape = (*in_attrs)[0]; + const mxnet::TShape& tol_shape = (*in_attrs)[1]; + const int a_ndim = a_shape.ndim(); + const int tol_ndim = tol_shape.ndim(); + + if (shape_is_known(a_shape) && shape_is_known(tol_shape)) { + CHECK_GT(a_shape.Size(), 0U) + << "Not support zero-size input array which has no identity"; + if (a_ndim < 2) { + SHAPE_ASSIGN_CHECK(*out_attrs, 0, mxnet::TShape(0, 0)); + } else { + mxnet::TShape broadcast_shape; + GetOrCheckBroadcastShape(attrs, a_shape, tol_shape, &broadcast_shape); + if (broadcast_shape.ndim() == 1) { + if (tol_ndim == 0) { + SHAPE_ASSIGN_CHECK(*out_attrs, 0, mxnet::TShape(0, 0)); + } else { + SHAPE_ASSIGN_CHECK(*out_attrs, 0, mxnet::TShape(1, 1)); + } + } else { + mxnet::TShape rank_shape(broadcast_shape.ndim() - 1, 0); + for (int i = 0; i < broadcast_shape.ndim() - 1; ++i) { + rank_shape[i] = broadcast_shape[i]; + } + SHAPE_ASSIGN_CHECK(*out_attrs, 0, rank_shape); + } + } + } + return shape_is_known(*in_attrs) && shape_is_known(*out_attrs); +} + +inline bool MatrixRankType(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + 
CHECK_EQ(in_attrs->size(), 2U); + CHECK_EQ(out_attrs->size(), 1U); + int a_type = in_attrs->at(0); + int tol_type = in_attrs->at(1); + + CHECK_NE(a_type, mshadow::kFloat16) + << "array type float16 is unsupported in linalg."; + CHECK(a_type == mshadow::kFloat32 || a_type == mshadow::kFloat64) + << "array type should be float32 or float64."; + CHECK(tol_type == mshadow::kFloat32 || tol_type == mshadow::kFloat64) + << "tol type should be float32 or float64."; + CHECK_EQ(a_type, tol_type) + << "array type and tol type should be the same."; + TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt64); + return out_attrs->at(0) != -1; +} + +DMLC_REGISTER_PARAMETER(MatrixRankParam); + +NNVM_REGISTER_OP(_npi_matrix_rank) +.describe(R"code()code" ADD_FILELINE) +.set_attr_parser(mxnet::op::ParamParser) +.set_num_inputs(2) +.set_num_outputs(1) +.set_attr("FListInputNames", [](const NodeAttrs& attrs){ + return std::vector{"M", "tol"}; +}) +.set_attr("FInferShape", MatrixRankShape) +.set_attr("FInferType", MatrixRankType) +.set_attr("FResourceRequest", [](const NodeAttrs& attrs){ + return std::vector{ResourceRequest::kTempSpace}; +}) +.set_attr("FCompute", MatrixRankForward) +.set_attr("FGradient", MakeZeroGradNodes) +.add_argument("M", "NDArray-or-Symbol", "Tensor of matrix") +.add_argument("tol", "NDArray-or-Symbol", "Tensor of matrix") +.add_arguments(MatrixRankParam::__FIELDS__()); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/numpy/linalg/np_matrix_rank.cu b/src/operator/numpy/linalg/np_matrix_rank.cu new file mode 100644 index 000000000000..9528f698d35c --- /dev/null +++ b/src/operator/numpy/linalg/np_matrix_rank.cu @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2020 by Contributors + * \file np_matrix_rank.cu + * \brief GPU implementation of the matrix_rank Operator + */ +#include +#include "./np_matrix_rank-inl.h" + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(_npi_matrix_rank_none_tol) +.set_attr("FCompute", MatrixRankNoneTolForward); + +NNVM_REGISTER_OP(_npi_matrix_rank) +.set_attr("FCompute", MatrixRankForward); + +} // namespace op +} // namespace mxnet diff --git a/tests/python/unittest/test_numpy_interoperability.py b/tests/python/unittest/test_numpy_interoperability.py index 1f795721b820..79a45f5e46f6 100644 --- a/tests/python/unittest/test_numpy_interoperability.py +++ b/tests/python/unittest/test_numpy_interoperability.py @@ -2113,12 +2113,29 @@ def _add_workload_linalg_matrix_power(): def _add_workload_linalg_matrix_rank(): - a = np.eye(4) - b = a; b[-1,-1] = 0 - c = np.ones((4,)) - OpArgMngr.add_workload('linalg.matrix_rank', a) - OpArgMngr.add_workload('linalg.matrix_rank', b) - OpArgMngr.add_workload('linalg.matrix_rank', c) + shapes = [ + ((4, 3), ()), + ((4, 3), (1,)), + ((4, 3), (2, 3,)), + ((2, 1, 1), (1,)), + ((2, 3, 3), (2,)), + ((2, 3, 1, 1), ()), + ((2, 3, 4, 4), (1, 3)), + ((2, 3, 4, 5), (2, 3)), + ((2, 3, 5, 4), (2, 3)), + ] + dtypes = (np.float32, np.float64) + for dtype in dtypes: + for a_shape, tol_shape in shapes: + for tol_is_none in [True, False]: + a_np = _np.asarray(_np.random.uniform(-10., 10., a_shape)) + a = np.array(a_np, dtype=dtype) + if tol_is_none: + OpArgMngr.add_workload('linalg.matrix_rank', a, None, False) + else: + tol_np = _np.random.uniform(10., 20., tol_shape) + tol = np.array(tol_np, dtype=dtype) + OpArgMngr.add_workload('linalg.matrix_rank', a, tol, False) def _add_workload_linalg_multi_dot(): diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 063c3b7a58a6..111f0282283e 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -5625,6 +5625,83 @@ def check_lstsq(a_np, b_np, rcond_np, x, residuals, rank, s): check_lstsq(a_np, b_np, rcond, x, residuals, rank, s) +@with_seed() +@use_np +def test_np_linalg_matrix_rank(): + class TestMatrixRank(HybridBlock): + def __init__(self, hermitian): + super(TestMatrixRank, self).__init__() + self._hermitian = hermitian + + def hybrid_forward(self, F, M, tol=None): + return F.np.linalg.matrix_rank(M, tol, hermitian=self._hermitian) + + def check_matrix_rank(rank, a_np, tol, hermitian): + try: + rank_expected = _np.linalg.matrix_rank(a_np, tol=tol, hermitian=hermitian) + except Exception as e: + print("a:", a_np) + print("a shape:", a_np.shape) + print(e) + else: + if a_np.ndim < 2: + assert rank.shape == _np.asarray(rank_expected).shape + else: + assert rank.shape == rank_expected.shape + assert_almost_equal(rank.asnumpy(), rank_expected, rtol=rtol, atol=atol) + + shapes = [ + ((), ()), + ((1,), (1,)), + ((3,), (1,)), + ((1, 1), ()), + ((1, 1), (1,)), + ((3, 3), (1,)), + ((3, 4), (1,)), + ((4, 3), ()), + ((4, 3), (1,)), + ((4, 3), (2,)), + ((4, 3), (2, 3,)), + ((2, 1, 1), ()), + ((2, 1, 1), (1,)), + ((2, 3, 3), (2,)), + ((2, 3, 4), (1,)), + ((2, 4, 3), (2,)), + ((2, 3, 1, 1), ()), + ((2, 3, 1, 1), (1, 1)), + ((2, 3, 1, 1), (2, 1)), + ((2, 3, 4, 4), (1, 3)), + ((2, 3, 4, 5), (2, 1)), + ((2, 3, 5, 4), (1, 3)), + ((2, 3, 1, 1), (2, 3)), + ((2, 3, 4, 4), (2, 3)), + ((2, 3, 4, 5), (2, 3)), + ((2, 3, 5, 4), (2, 3)), + ] + dtypes = ['float32', 'float64'] + for dtype in dtypes: + for a_shape, tol_shape in shapes: + for tol_is_none, hybridize in 
itertools.product([True, False], [True, False]): + rtol = 1e-3 + atol = 1e-5 + test_matrix_rank = TestMatrixRank(hermitian=False) + if hybridize: + test_matrix_rank.hybridize() + + a_np = _np.asarray(_np.random.uniform(-10., 10., a_shape)) + a = np.array(a_np, dtype=dtype) + if tol_is_none: + rank = test_matrix_rank(a) + # check matrix_rank validity + check_matrix_rank(rank, a.asnumpy(), tol=None, hermitian=False) + else: + tol_np = _np.random.uniform(10., 20., tol_shape) + tol = np.array(tol_np, dtype=dtype) + rank = test_matrix_rank(a, tol) + # check matrix_rank validity + check_matrix_rank(rank, a.asnumpy(), tol.asnumpy(), hermitian=False) + + @with_seed() @use_np def test_np_linalg_pinv(): From a044744884271f82465dfd2ddd04fe37870da51c Mon Sep 17 00:00:00 2001 From: dw_sjtu <46704444+sjtuWangDing@users.noreply.github.com> Date: Tue, 14 Apr 2020 12:58:11 +0800 Subject: [PATCH 02/14] [Numpy] FFI for linalg.qr and linalg.lstsq (#18040) * impl - ffi for linalg.qr/lstsq * impl - ffi benchmark Co-authored-by: Ubuntu --- benchmark/python/ffi/benchmark_ffi.py | 2 + python/mxnet/ndarray/numpy/linalg.py | 12 ++--- python/mxnet/symbol/numpy/linalg.py | 6 ++- src/api/operator/numpy/linalg/np_lstsq.cc | 65 +++++++++++++++++++++++ src/api/operator/numpy/linalg/np_qr.cc | 44 +++++++++++++++ src/operator/numpy/linalg/np_lstsq-inl.h | 27 ++++++++-- 6 files changed, 143 insertions(+), 13 deletions(-) create mode 100644 src/api/operator/numpy/linalg/np_lstsq.cc create mode 100644 src/api/operator/numpy/linalg/np_qr.cc diff --git a/benchmark/python/ffi/benchmark_ffi.py b/benchmark/python/ffi/benchmark_ffi.py index 099bfadbe6d2..26d20e6e0aac 100644 --- a/benchmark/python/ffi/benchmark_ffi.py +++ b/benchmark/python/ffi/benchmark_ffi.py @@ -68,6 +68,8 @@ def prepare_workloads(): OpArgMngr.add_workload("linalg.matrix_rank", pool['3x3'], pool['1'], hermitian=False) OpArgMngr.add_workload("linalg.svd", pool['3x3']) OpArgMngr.add_workload("linalg.cholesky", pool['1x1']) + OpArgMngr.add_workload("linalg.qr", pool['3x3']) + OpArgMngr.add_workload("linalg.lstsq", pool['2x1'], pool['2'], rcond=None) OpArgMngr.add_workload("linalg.eigvals", pool['1x1']) OpArgMngr.add_workload("linalg.eigvalsh", pool['1x1'], UPLO='L') OpArgMngr.add_workload("linalg.inv", pool['1x1']) diff --git a/python/mxnet/ndarray/numpy/linalg.py b/python/mxnet/ndarray/numpy/linalg.py index 86bf11a00b02..d31e8ea7921d 100644 --- a/python/mxnet/ndarray/numpy/linalg.py +++ b/python/mxnet/ndarray/numpy/linalg.py @@ -139,13 +139,9 @@ def lstsq(a, b, rcond='warn'): >>> m, c (1.0 -0.95) # may vary """ - new_default = False - if rcond is None: - rcond = _np.finfo(a.dtype).eps - new_default = True - if rcond == "warn": - rcond = -1 - x, residuals, rank, s = _npi.lstsq(a, b, rcond=rcond, new_default=new_default) + finfo_eps_32 = _np.finfo(_np.float32).eps + finfo_eps_64 = _np.finfo(_np.float64).eps + x, residuals, rank, s = _api_internal.lstsq(a, b, rcond, finfo_eps_32, finfo_eps_64) return (x, residuals, rank, s) @@ -570,7 +566,7 @@ def qr(a, mode='reduced'): """ if mode is not None and mode != 'reduced': raise NotImplementedError("Only default mode='reduced' is implemented.") - return tuple(_npi.qr(a)) + return tuple(_api_internal.qr(a)) def inv(a): diff --git a/python/mxnet/symbol/numpy/linalg.py b/python/mxnet/symbol/numpy/linalg.py index da7095520674..3cea6ddae157 100644 --- a/python/mxnet/symbol/numpy/linalg.py +++ b/python/mxnet/symbol/numpy/linalg.py @@ -114,12 +114,14 @@ def lstsq(a, b, rcond='warn'): If `b` is a matrix, then all array results are 
returned as matrices. """ new_default = False + finfo_eps_32 = _np.finfo(_np.float32).eps + finfo_eps_64 = _np.finfo(_np.float64).eps if rcond is None: - rcond = _np.finfo(_np.float64).eps + rcond = 1 new_default = True if rcond == "warn": rcond = -1 - x, residuals, rank, s = _npi.lstsq(a, b, rcond=rcond, new_default=new_default) + x, residuals, rank, s = _npi.lstsq(a, b, rcond=rcond, finfoEps32=finfo_eps_32, finfoEps64=finfo_eps_64, new_default=new_default) # pylint: disable=line-too-long return (x, residuals, rank, s) diff --git a/src/api/operator/numpy/linalg/np_lstsq.cc b/src/api/operator/numpy/linalg/np_lstsq.cc new file mode 100644 index 000000000000..fbeafbee6054 --- /dev/null +++ b/src/api/operator/numpy/linalg/np_lstsq.cc @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file np_lstsq.cc + * \brief Implementation of the API of functions in src/operator/numpy/linalg/np_lstsq.cc + */ +#include +#include +#include "../../utils.h" +#include "../../../../operator/numpy/linalg/np_lstsq-inl.h" + +namespace mxnet { + +MXNET_REGISTER_API("_npi.lstsq") +.set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { + using namespace runtime; + const nnvm::Op* op = Op::Get("_npi_lstsq"); + nnvm::NodeAttrs attrs; + op::LstsqParam param; + if (args[2].type_code() == kNull) { + param.rcond = static_cast(1); + } else if (args[2].type_code() == kStr) { + const std::string rcond_str = args[2].operator std::string(); + if (rcond_str == "warn") { + param.rcond = static_cast(-1); + } else { + CHECK(false) << "ValueError: wrong parameter rcond = " << rcond_str; + } + } else { + param.rcond = args[2].operator double(); + } + param.finfoEps32 = args[3].operator double(); + param.finfoEps64 = args[4].operator double(); + param.new_default = args[2].type_code() == kNull ? true : false; + attrs.parsed = param; + attrs.op = op; + SetAttrDict(&attrs); + int num_inputs = 2; + int num_outputs = 0; + NDArray* inputs[] = {args[0].operator mxnet::NDArray*(), args[1].operator mxnet::NDArray*()}; + auto ndoutputs = Invoke(op, &attrs, num_inputs, inputs, &num_outputs, nullptr); + *ret = ADT(0, {NDArrayHandle(ndoutputs[0]), + NDArrayHandle(ndoutputs[1]), + NDArrayHandle(ndoutputs[2]), + NDArrayHandle(ndoutputs[3])}); +}); + +} // namespace mxnet diff --git a/src/api/operator/numpy/linalg/np_qr.cc b/src/api/operator/numpy/linalg/np_qr.cc new file mode 100644 index 000000000000..e9c0ec5d66d3 --- /dev/null +++ b/src/api/operator/numpy/linalg/np_qr.cc @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file np_qr.cc + * \brief Implementation of the API of functions in src/operator/numpy/linalg/np_qr.cc + */ +#include +#include +#include "../../utils.h" + +namespace mxnet { + +MXNET_REGISTER_API("_npi.qr") +.set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { + using namespace runtime; + const nnvm::Op* op = Op::Get("_npi_qr"); + nnvm::NodeAttrs attrs; + attrs.op = op; + int num_inputs = 1; + NDArray* inputs[] = {args[0].operator mxnet::NDArray*()}; + int num_outputs = 0; + auto ndoutputs = Invoke(op, &attrs, num_inputs, inputs, &num_outputs, nullptr); + *ret = ADT(0, {NDArrayHandle(ndoutputs[0]), + NDArrayHandle(ndoutputs[1])}); +}); + +} // namespace mxnet diff --git a/src/operator/numpy/linalg/np_lstsq-inl.h b/src/operator/numpy/linalg/np_lstsq-inl.h index 0389b7a5d92c..00fc19d9ed80 100644 --- a/src/operator/numpy/linalg/np_lstsq-inl.h +++ b/src/operator/numpy/linalg/np_lstsq-inl.h @@ -41,15 +41,34 @@ using namespace mshadow; struct LstsqParam : public dmlc::Parameter { double rcond; + float finfoEps32; + double finfoEps64; bool new_default; DMLC_DECLARE_PARAMETER(LstsqParam) { DMLC_DECLARE_FIELD(rcond) .set_default(-1) .describe("Cut-off ratio for small singular values"); + DMLC_DECLARE_FIELD(finfoEps32) + .set_default(0) + .describe("Machine limits for float32 type"); + DMLC_DECLARE_FIELD(finfoEps64) + .set_default(0) + .describe("Machine limits for float64 type"); DMLC_DECLARE_FIELD(new_default) .set_default(false) .describe("Specifies whether rcond is default which is machine precision"); } + void SetAttrDict(std::unordered_map* dict) { + std::ostringstream rcond_s, finfoEps32_s, finfoEps64_s, new_default_s; + rcond_s << rcond; + finfoEps32_s << finfoEps32; + finfoEps64_s << finfoEps64; + new_default_s << new_default; + (*dict)["rcond"] = rcond_s.str(); + (*dict)["finfoEps32"] = finfoEps32_s.str(); + (*dict)["finfoEps64"] = finfoEps64_s.str(); + (*dict)["new_default"] = new_default_s.str(); + } }; template @@ -335,10 +354,12 @@ void LstsqOpForwardImpl(const TBlob& a, const OpContext& ctx, const std::vector& req) { // Get param. - double rcond = nnvm::get(attrs.parsed).rcond; - bool new_default = nnvm::get(attrs.parsed).new_default; + const LstsqParam& param = nnvm::get(attrs.parsed); + double rcond = param.rcond; + bool new_default = param.new_default; + double finfoEps = a.type_flag_ == mshadow::kFloat32 ? 
param.finfoEps32 : param.finfoEps64; if (new_default) { - rcond *= std::max(a.shape_[0], a.shape_[1]); + rcond = finfoEps * std::max(a.shape_[0], a.shape_[1]); } const mxnet::TShape& a_shape = a.shape_; const mxnet::TShape& b_shape = b.shape_; From f3cfaf9b01b5c4dd99b4412e1c63c59f839d6cdf Mon Sep 17 00:00:00 2001 From: Yiyan66 <57363390+Yiyan66@users.noreply.github.com> Date: Tue, 14 Apr 2020 15:23:22 +0800 Subject: [PATCH 03/14] ffi random (#18051) --- benchmark/python/ffi/benchmark_ffi.py | 7 + python/mxnet/ndarray/numpy/random.py | 101 ++++-------- python/mxnet/numpy/random.py | 39 +---- python/mxnet/symbol/numpy/random.py | 11 +- .../numpy/random/np_exponential_op.cc | 71 +++++++++ .../numpy/random/np_location_scale_op.cc | 150 ++++++++++++++++++ src/api/operator/numpy/random/np_pareto_op.cc | 72 +++++++++ src/api/operator/numpy/random/np_power_op.cc | 72 +++++++++ .../operator/numpy/random/np_rayleigh_op.cc | 72 +++++++++ .../operator/numpy/random/np_weibull_op.cc | 72 +++++++++ src/operator/numpy/random/np_exponential_op.h | 8 + .../numpy/random/np_location_scale_op.h | 10 ++ src/operator/numpy/random/np_pareto_op.h | 9 ++ src/operator/numpy/random/np_power_op.h | 13 ++ src/operator/numpy/random/np_rayleigh_op.h | 9 ++ src/operator/numpy/random/np_weibull_op.h | 9 ++ 16 files changed, 611 insertions(+), 114 deletions(-) create mode 100644 src/api/operator/numpy/random/np_exponential_op.cc create mode 100644 src/api/operator/numpy/random/np_location_scale_op.cc create mode 100644 src/api/operator/numpy/random/np_pareto_op.cc create mode 100644 src/api/operator/numpy/random/np_power_op.cc create mode 100644 src/api/operator/numpy/random/np_rayleigh_op.cc create mode 100644 src/api/operator/numpy/random/np_weibull_op.cc diff --git a/benchmark/python/ffi/benchmark_ffi.py b/benchmark/python/ffi/benchmark_ffi.py index 26d20e6e0aac..328a74c98c83 100644 --- a/benchmark/python/ffi/benchmark_ffi.py +++ b/benchmark/python/ffi/benchmark_ffi.py @@ -111,6 +111,13 @@ def prepare_workloads(): OpArgMngr.add_workload("hypot", pool['2x2'], pool['2x2']) OpArgMngr.add_workload("ldexp", pool['2x2'].astype(int), pool['2x2'].astype(int)) OpArgMngr.add_workload("random.uniform", low=0, high=1, size=1) + OpArgMngr.add_workload("random.exponential", scale=2, size=(2,2)) + OpArgMngr.add_workload("random.rayleigh", scale=2, size=(2,2)) + OpArgMngr.add_workload("random.weibull", a=2, size=(2,2)) + OpArgMngr.add_workload("random.pareto", a=2, size=(2,2)) + OpArgMngr.add_workload("random.power", a=2, size=(2,2)) + OpArgMngr.add_workload("random.logistic", loc=2, scale=2, size=(2,2)) + OpArgMngr.add_workload("random.gumbel", loc=2, scale=2, size=(2,2)) OpArgMngr.add_workload("where", pool['2x3'], pool['2x3'], pool['2x1']) OpArgMngr.add_workload("fmax", pool['2x2'], pool['2x2']) OpArgMngr.add_workload("fmin", pool['2x2'], pool['2x2']) diff --git a/python/mxnet/ndarray/numpy/random.py b/python/mxnet/ndarray/numpy/random.py index 8449852a6e72..f6e5bce00e75 100644 --- a/python/mxnet/ndarray/numpy/random.py +++ b/python/mxnet/ndarray/numpy/random.py @@ -242,24 +242,13 @@ def logistic(loc=0.0, scale=1.0, size=None, ctx=None, out=None): out : ndarray or scalar Drawn samples from the parameterized logistic distribution. 
""" - from ...numpy import ndarray as np_ndarray - input_type = (isinstance(loc, np_ndarray), isinstance(scale, np_ndarray)) if ctx is None: - ctx = current_context() + ctx = str(current_context()) + else: + ctx = str(ctx) if size == (): size = None - if input_type == (True, True): - return _npi.logistic(loc, scale, loc=None, scale=None, size=size, - ctx=ctx, out=out) - elif input_type == (False, True): - return _npi.logistic(scale, loc=loc, scale=None, size=size, - ctx=ctx, out=out) - elif input_type == (True, False): - return _npi.logistic(loc, loc=None, scale=scale, size=size, - ctx=ctx, out=out) - else: - return _npi.logistic(loc=loc, scale=scale, size=size, - ctx=ctx, out=out) + return _api_internal.logistic(loc, scale, size, ctx, out) def gumbel(loc=0.0, scale=1.0, size=None, ctx=None, out=None): @@ -290,24 +279,13 @@ def gumbel(loc=0.0, scale=1.0, size=None, ctx=None, out=None): out : ndarray or scalar Drawn samples from the parameterized Gumbel distribution. """ - from ...numpy import ndarray as np_ndarray - input_type = (isinstance(loc, np_ndarray), isinstance(scale, np_ndarray)) if ctx is None: - ctx = current_context() + ctx = str(current_context()) + else: + ctx = str(ctx) if size == (): size = None - if input_type == (True, True): - return _npi.gumbel(loc, scale, loc=None, scale=None, size=size, - ctx=ctx, out=out) - elif input_type == (False, True): - return _npi.gumbel(scale, loc=loc, scale=None, size=size, - ctx=ctx, out=out) - elif input_type == (True, False): - return _npi.gumbel(loc, loc=None, scale=scale, size=size, - ctx=ctx, out=out) - else: - return _npi.gumbel(loc=loc, scale=scale, size=size, - ctx=ctx, out=out) + return _api_internal.gumbel(loc, scale, size, ctx, out) def multinomial(n, pvals, size=None): @@ -387,17 +365,13 @@ def rayleigh(scale=1.0, size=None, ctx=None, out=None): out : ndarray or scalar Drawn samples from the parameterized Rayleigh distribution. """ - from ...numpy import ndarray as np_ndarray - tensor_type_name = np_ndarray if ctx is None: - ctx = current_context() + ctx = str(current_context()) + else: + ctx = str(ctx) if size == (): size = None - is_tensor = isinstance(scale, tensor_type_name) - if is_tensor: - return _npi.rayleigh(scale, scale=None, size=size, ctx=ctx, out=out) - else: - return _npi.rayleigh(scale=scale, size=size, ctx=ctx, out=out) + return _api_internal.rayleigh(scale, size, ctx, out) def multivariate_normal(mean, cov, size=None, check_valid=None, tol=None): @@ -570,18 +544,13 @@ def exponential(scale=1.0, size=None, ctx=None, out=None): out : ndarray or scalar Drawn samples from the parameterized exponential distribution. """ - from ...numpy import ndarray as np_ndarray - tensor_type_name = np_ndarray if ctx is None: - ctx = current_context() + ctx = str(current_context()) + else: + ctx = str(ctx) if size == (): size = None - is_tensor = isinstance(scale, tensor_type_name) - if is_tensor: - return _npi.exponential(scale, scale=None, size=size, - ctx=ctx, out=out) - else: - return _npi.exponential(scale=scale, size=size, ctx=ctx, out=out) + return _api_internal.exponential(scale, size, ctx, out) def weibull(a, size=None, ctx=None, out=None): @@ -626,17 +595,13 @@ def weibull(a, size=None, ctx=None, out=None): model time to failure, in modeling particle sizes, in information retrieval to model dwell time on pages, in quantitative finance to model risk etc. 
""" - from ...numpy import ndarray as np_ndarray - tensor_type_name = np_ndarray if ctx is None: - ctx = current_context() + ctx = str(current_context()) + else: + ctx = str(ctx) if size == (): size = None - is_tensor = isinstance(a, tensor_type_name) - if is_tensor: - return _npi.weibull(a, a=None, size=size, ctx=ctx, out=out) - else: - return _npi.weibull(a=a, size=size, ctx=ctx, out=out) + return _api_internal.weibull(a, size, ctx, out) def pareto(a, size=None, ctx=None, out=None): @@ -671,20 +636,16 @@ def pareto(a, size=None, ctx=None, out=None): where a is the shape and m the scale. Here m is assumed 1. The Pareto distribution is a power law distribution. Pareto created it to describe the wealth in the economy. """ - from ...numpy import ndarray as np_ndarray - tensor_type_name = np_ndarray if ctx is None: - ctx = current_context() + ctx = str(current_context()) + else: + ctx = str(ctx) if size == (): size = None - is_tensor = isinstance(a, tensor_type_name) - if is_tensor: - return _npi.pareto(a, a=None, size=size, ctx=ctx, out=out) - else: - return _npi.pareto(a=a, size=size, ctx=ctx, out=out) + return _api_internal.pareto(a, size, ctx, out) -def power(a, size=None): +def power(a, size=None, ctx=None, out=None): r"""Draw samples in [0, 1] from a power distribution with given parameter a. Parameters @@ -716,15 +677,13 @@ def power(a, size=None): The power distribution is just the inverse of the Pareto distribution and a special case of the Beta distribution. """ - from ...numpy import ndarray as np_ndarray - tensor_type_name = np_ndarray + if ctx is None: + ctx = str(current_context()) + else: + ctx = str(ctx) if size == (): size = None - is_tensor = isinstance(a, tensor_type_name) - if is_tensor: - return _npi.powerd(a, a=None, size=size) - else: - return _npi.powerd(a=a, size=size) + return _api_internal.powerd(a, size, ctx, out) def gamma(shape, scale=1.0, size=None, dtype=None, ctx=None, out=None): diff --git a/python/mxnet/numpy/random.py b/python/mxnet/numpy/random.py index c6690f149fe8..6d46b2d314aa 100644 --- a/python/mxnet/numpy/random.py +++ b/python/mxnet/numpy/random.py @@ -270,10 +270,8 @@ def lognormal(mean=0.0, sigma=1.0, size=None, dtype=None, ctx=None, out=None): def logistic(loc=0.0, scale=1.0, size=None, ctx=None, out=None): r"""Draw samples from a logistic distribution. - Samples are drawn from a logistic distribution with specified parameters, loc (location or mean, also median), and scale (>0). - Parameters ---------- loc : float or array_like of floats, optional @@ -290,23 +288,18 @@ def logistic(loc=0.0, scale=1.0, size=None, ctx=None, out=None): Device context of output, default is current context. out : ``ndarray``, optional Store output to an existing ``ndarray``. - Returns ------- out : ndarray or scalar Drawn samples from the parameterized logistic distribution. - Examples -------- Draw samples from the distribution: - >>> loc, scale = 10, 1 >>> s = np.random.logistic(loc, scale, 10000) >>> import matplotlib.pyplot as plt >>> count, bins, ignored = plt.hist(s, bins=50) - # plot against distribution - >>> def logist(x, loc, scale): ... return np.exp((loc-x)/scale)/(scale*(1+np.exp((loc-x)/scale))**2) >>> lgst_val = logist(bins, loc, scale) @@ -318,10 +311,8 @@ def logistic(loc=0.0, scale=1.0, size=None, ctx=None, out=None): def gumbel(loc=0.0, scale=1.0, size=None, ctx=None, out=None): r"""Draw samples from a Gumbel distribution. - Draw samples from a Gumbel distribution with specified location and scale. 
- Parameters ---------- loc : float or array_like of floats, optional @@ -338,32 +329,25 @@ def gumbel(loc=0.0, scale=1.0, size=None, ctx=None, out=None): Device context of output, default is current context. out : ``ndarray``, optional Store output to an existing ``ndarray``. - Returns ------- out : ndarray or scalar Drawn samples from the parameterized Gumbel distribution. - Examples -------- Draw samples from the distribution: - >>> mu, beta = 0, 0.1 # location and scale >>> s = np.random.gumbel(mu, beta, 1000) - Display the histogram of the samples, along with the probability density function: - >>> import matplotlib.pyplot as plt >>> count, bins, ignored = plt.hist(s, 30, density=True) >>> plt.plot(bins, (1/beta)*np.exp(-(bins - mu)/beta) ... * np.exp( -np.exp( -(bins - mu) /beta) ), ... linewidth=2, color='r') >>> plt.show() - Show how an extreme value distribution can arise from a Gaussian process and compare to a Gaussian: - >>> means = [] >>> maxima = [] >>> for i in range(0,1000) : @@ -561,10 +545,8 @@ def choice(a, size=None, replace=True, p=None, ctx=None, out=None): def rayleigh(scale=1.0, size=None, ctx=None, out=None): r"""Draw samples from a Rayleigh distribution. - The :math:`\chi` and Weibull distributions are generalizations of the Rayleigh. - Parameters ---------- scale : float, optional @@ -578,7 +560,6 @@ def rayleigh(scale=1.0, size=None, ctx=None, out=None): Device context of output, default is current context. out : ``ndarray``, optional Store output to an existing ``ndarray``. - Returns ------- out : ndarray or scalar @@ -616,7 +597,6 @@ def rand(*size, **kwargs): def exponential(scale=1.0, size=None, ctx=None, out=None): r"""Draw samples from an exponential distribution. - Parameters ---------- scale : float or array_like of floats @@ -631,7 +611,6 @@ def exponential(scale=1.0, size=None, ctx=None, out=None): Device context of output, default is current context. out : ``ndarray``, optional Store output to an existing ``ndarray``. - Returns ------- out : ndarray or scalar @@ -643,7 +622,6 @@ def exponential(scale=1.0, size=None, ctx=None, out=None): def weibull(a, size=None, ctx=None, out=None): r"""Draw samples from a 1-parameter Weibull distribution with given parameter a via inversion. - Parameters ---------- a : float or array_like of floats @@ -661,23 +639,18 @@ def weibull(a, size=None, ctx=None, out=None): -------- >>> np.random.weibull(a=5) array(0.9553641) - >>> np.random.weibull(a=5, size=[2,3]) array([[1.0466299 , 1.1320982 , 0.98415005], [1.1430776 , 0.9532727 , 1.1344457 ]]) - >>> np.random.weibull(a=np.array([2,3]) array([0.98843634, 1.0125613 ]) - The Weibull distribution is one of a class of Generalized Extreme Value (GEV) distributions. This class includes the Gumbel and Frechet distributions. - The probability density for the Weibull distribution is f(x) = \frac{a}{\lambda}(\frac{x}{\lambda})^{a-1}e^{-(x/\lambda)^a}, where a is the shape and \lambda the scale. The generated 1-parameter Weibull sample has the scale parameter \lambda = 1. - The Weibull distribution is commonly used in reliability engineering to model time to failure, in modeling particle sizes, in information retrieval to model dwell time on pages, in quantitative finance to model risk etc. @@ -687,7 +660,6 @@ def weibull(a, size=None, ctx=None, out=None): def pareto(a, size=None, ctx=None, out=None): r"""Draw samples from a Pareto II or Lomax distribution with specified shape a. 
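The exponential and Rayleigh docstrings earlier in this hunk admit the same treatment. What follows is a minimal NumPy sketch of the standard inversions with arbitrary scale values; the actual `_npi` operators are free to sample differently.

```
import numpy as np

rng = np.random.default_rng(0)
u = rng.uniform(size=10000)

# Exponential(scale): F(x) = 1 - exp(-x / scale)  ->  x = -scale * log(u)
exponential_samples = -2.0 * np.log(u)              # scale = 2, mean ~= 2

# Rayleigh(scale): F(x) = 1 - exp(-x**2 / (2 * scale**2))
#   ->  x = scale * sqrt(-2 * log(u))
rayleigh_samples = 1.0 * np.sqrt(-2.0 * np.log(u))  # scale = 1, mean ~= 1.25

print(exponential_samples.mean(), rayleigh_samples.mean())
```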
- Parameters ---------- a : float or array_like of floats @@ -697,12 +669,10 @@ def pareto(a, size=None, ctx=None, out=None): ``m * n * k`` samples are drawn. If size is ``None`` (default), a single value is returned if ``a`` is a scalar. Otherwise, ``np.array(a).size`` samples are drawn. - Returns ------- out : ndarray or scalar Drawn samples from the Pareto distribution. - Examples -------- >>> np.random.pareto(a=5) @@ -712,7 +682,6 @@ def pareto(a, size=None, ctx=None, out=None): [0.0311172 , 0.12911797, 0.03370714]]) >>> np.random.pareto(a=np.array([2,3]) array([0.26636696, 0.15685666]) - The probability density for the Pareto distribution is f(x) = \frac{am^a}{x^{a+1}} where a is the shape and m the scale. Here m is assumed 1. The Pareto distribution is a power law distribution. Pareto created it to describe the wealth in the economy. @@ -720,9 +689,8 @@ def pareto(a, size=None, ctx=None, out=None): return _mx_nd_np.random.pareto(a, size=size, ctx=ctx, out=out) -def power(a, size=None): +def power(a, size=None, ctx=None, out=None): r"""Draw samples in [0, 1] from a power distribution with given parameter a. - Parameters ---------- a : float or array_like of floats @@ -732,12 +700,10 @@ def power(a, size=None): ``m * n * k`` samples are drawn. If size is ``None`` (default), a single value is returned if ``a`` is a scalar. Otherwise, ``np.array(a).size`` samples are drawn. - Returns ------- out : ndarray or scalar Drawn samples from the power distribution. - Examples -------- >>> np.random.power(a=5) @@ -747,12 +713,11 @@ def power(a, size=None): [0.9078098 , 0.87819266, 0.730635]]) >>> np.random.power(a=np.array([2,3]) array([0.7499419 , 0.88894516]) - The probability density function is f(x; a) = ax^{a-1}, 0 \le x \le 1, a>0. The power distribution is just the inverse of the Pareto distribution and a special case of the Beta distribution. """ - return _mx_nd_np.random.power(a, size) + return _mx_nd_np.random.power(a, size=size, ctx=ctx, out=out) def shuffle(x): diff --git a/python/mxnet/symbol/numpy/random.py b/python/mxnet/symbol/numpy/random.py index 6abf2172177f..db3338494a43 100644 --- a/python/mxnet/symbol/numpy/random.py +++ b/python/mxnet/symbol/numpy/random.py @@ -527,10 +527,8 @@ def gamma(shape, scale=1.0, size=None, dtype=None, ctx=None, out=None): def rayleigh(scale=0.0, size=None, ctx=None, out=None): r"""Draw samples from a Rayleigh distribution. - The :math:`\chi` and Weibull distributions are generalizations of the Rayleigh. - Parameters ---------- scale : float or _Symbol @@ -542,7 +540,6 @@ def rayleigh(scale=0.0, size=None, ctx=None, out=None): ``np.array(scale).size`` samples are drawn. ctx : Context, optional Device context of output. Default is current context. - Returns ------- out : _Symbol @@ -863,7 +860,7 @@ def pareto(a, size=None, ctx=None, out=None): return _npi.pareto(a=a, size=size, ctx=ctx, out=out) -def power(a, size=None): +def power(a, size=None, ctx=None, out=None): r"""Draw samples in [0, 1] from a power distribution with given parameter a. 
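The Pareto and power docstrings above note that the power distribution is the inverse of the Pareto. The hedged NumPy sketch below makes that relationship explicit for the Lomax form with m = 1; the shape value is chosen arbitrarily, and this is not the `_npi.pareto` / `_npi.powerd` implementation.

```
import numpy as np

rng = np.random.default_rng(0)
a = 3.0
u = rng.uniform(size=10000)

# Pareto II / Lomax with m = 1: survival S(x) = (1 + x)**(-a)
#   ->  x = u**(-1/a) - 1
pareto_samples = u ** (-1.0 / a) - 1.0

# Power distribution: F(x) = x**a on [0, 1]  ->  x = u**(1/a)
power_samples = u ** (1.0 / a)

# "Inverse of the Pareto": 1 / (1 + Lomax(a)) follows exactly the power(a) law,
# and here both arrays are built from the same uniforms, so they match elementwise.
print(np.allclose(power_samples, 1.0 / (1.0 + pareto_samples)))  # True
```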
Parameters @@ -897,13 +894,15 @@ def power(a, size=None): """ from ..numpy import _Symbol as np_symbol tensor_type_name = np_symbol + if ctx is None: + ctx = current_context() if size == (): size = None is_tensor = isinstance(a, tensor_type_name) if is_tensor: - return _npi.powerd(a, a=None, size=size) + return _npi.powerd(a, a=None, size=size, ctx=ctx, out=out) else: - return _npi.powerd(a=a, size=size) + return _npi.powerd(a=a, size=size, ctx=ctx, out=out) def multivariate_normal(mean, cov, size=None, check_valid=None, tol=None): diff --git a/src/api/operator/numpy/random/np_exponential_op.cc b/src/api/operator/numpy/random/np_exponential_op.cc new file mode 100644 index 000000000000..fbb1644c6c5a --- /dev/null +++ b/src/api/operator/numpy/random/np_exponential_op.cc @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file np_exponential_op.cc + * \brief Implementation of the API of functions in src/operator/numpy/random/np_exponential_op.h + */ +#include +#include +#include "../../utils.h" +#include "../../../../operator/numpy/random/np_exponential_op.h" + +namespace mxnet { + +MXNET_REGISTER_API("_npi.exponential") +.set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { + using namespace runtime; + const nnvm::Op* op = Op::Get("_npi_exponential"); + op::NumpyExponentialParam param; + nnvm::NodeAttrs attrs; + attrs.op = op; + if (args[1].type_code() == kDLInt) { + param.size = Tuple(1, args[1].operator int64_t()); + } else if (args[1].type_code() == kNull) { + param.size = dmlc::nullopt; + } else { + param.size = Tuple(args[1].operator ObjectRef()); + } + if (args[2].type_code() != kNull) { + attrs.dict["ctx"] = args[2].operator std::string(); + } + NDArray* out = args[3].operator mxnet::NDArray*(); + NDArray** outputs = out == nullptr ? nullptr : &out; + int num_outputs = out != nullptr; + NDArray* inputs[1]; + int num_inputs = 0; + if (args[0].type_code() == kDLFloat || args[0].type_code() == kDLInt) { + param.scale = args[0].operator double(); + num_inputs = 0; + } else { + param.scale = dmlc::nullopt; + inputs[0] = args[0].operator mxnet::NDArray*(); + num_inputs = 1; + } + attrs.parsed = std::move(param); + SetAttrDict(&attrs); + auto ndoutputs = Invoke(op, &attrs, num_inputs, inputs, + &num_outputs, outputs); + if (out) { + *ret = PythonArg(3); + } else { + *ret = ndoutputs[0]; + } +}); +} // namespace mxnet diff --git a/src/api/operator/numpy/random/np_location_scale_op.cc b/src/api/operator/numpy/random/np_location_scale_op.cc new file mode 100644 index 000000000000..d4702fc96404 --- /dev/null +++ b/src/api/operator/numpy/random/np_location_scale_op.cc @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file np_location_scale_op.cc + * \brief Implementation of the API of functions in src/operator/numpy/random/np_location_scale_op.h + */ +#include +#include +#include "../../utils.h" +#include "../../../../operator/numpy/random/np_location_scale_op.h" + +namespace mxnet { + +int scalar_number(const runtime::MXNetArgs& args) { + int result = 0; + if (args[0].type_code() == kDLFloat || args[0].type_code() == kDLInt) + result++; + if (args[1].type_code() == kDLFloat || args[1].type_code() == kDLInt) + result++; + return result; +} + +MXNET_REGISTER_API("_npi.gumbel") +.set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { + using namespace runtime; + const nnvm::Op* op = Op::Get("_npi_gumbel"); + op::NumpyLocationScaleParam param; + nnvm::NodeAttrs attrs; + attrs.op = op; + if (args[2].type_code() == kDLInt) { + param.size = Tuple(1, args[2].operator int64_t()); + } else if (args[2].type_code() == kNull) { + param.size = Tuple({1}); + } else { + param.size = Tuple(args[2].operator ObjectRef()); + } + if (args[3].type_code() != kNull) { + attrs.dict["ctx"] = args[3].operator std::string(); + } + NDArray* out = args[4].operator mxnet::NDArray*(); + NDArray** outputs = out == nullptr ? 
nullptr : &out; + int num_outputs = out != nullptr; + int scalar = scalar_number(args); + NDArray* inputs[2]; + int num_inputs = 0; + if (scalar == 2) { + param.loc = args[0].operator double(); + param.scale = args[1].operator double(); + } else if (scalar == 0) { + param.loc = dmlc::nullopt; + param.scale = dmlc::nullopt; + inputs[0] = args[0].operator mxnet::NDArray*(); + inputs[1] = args[1].operator mxnet::NDArray*(); + num_inputs = 2; + } else { + if (args[0].type_code() == kDLFloat || args[0].type_code() == kDLInt) { + param.loc = dmlc::nullopt; + param.scale = args[1].operator double(); + inputs[0] = args[0].operator mxnet::NDArray*(); + } else { + param.loc = args[0].operator double(); + param.scale = dmlc::nullopt; + inputs[0] = args[1].operator mxnet::NDArray*(); + } + num_inputs = 1; + } + attrs.parsed = std::move(param); + SetAttrDict(&attrs); + auto ndoutputs = Invoke(op, &attrs, num_inputs, inputs, + &num_outputs, outputs); + if (out) { + *ret = PythonArg(4); + } else { + *ret = ndoutputs[0]; + } +}); + +MXNET_REGISTER_API("_npi.logistic") +.set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { + using namespace runtime; + const nnvm::Op* op = Op::Get("_npi_logistic"); + op::NumpyLocationScaleParam param; + nnvm::NodeAttrs attrs; + attrs.op = op; + if (args[2].type_code() == kDLInt) { + param.size = Tuple(1, args[2].operator int64_t()); + } else if (args[2].type_code() == kNull) { + param.size = dmlc::nullopt; + } else { + param.size = Tuple(args[2].operator ObjectRef()); + } + if (args[3].type_code() != kNull) { + attrs.dict["ctx"] = args[3].operator std::string(); + } + NDArray* out = args[4].operator mxnet::NDArray*(); + NDArray** outputs = out == nullptr ? nullptr : &out; + int num_outputs = out != nullptr; + int scalar = scalar_number(args); + NDArray* inputs[2]; + int num_inputs = 0; + if (scalar == 2) { + param.loc = args[0].operator double(); + param.scale = args[1].operator double(); + } else if (scalar == 0) { + param.loc = dmlc::nullopt; + param.scale = dmlc::nullopt; + inputs[0] = args[0].operator mxnet::NDArray*(); + inputs[1] = args[1].operator mxnet::NDArray*(); + num_inputs = 2; + } else { + if (args[0].type_code() == kDLFloat || args[0].type_code() == kDLInt) { + param.loc = dmlc::nullopt; + param.scale = args[1].operator double(); + inputs[0] = args[0].operator mxnet::NDArray*(); + } else { + param.loc = args[0].operator double(); + param.scale = dmlc::nullopt; + inputs[0] = args[1].operator mxnet::NDArray*(); + } + num_inputs = 1; + } + attrs.parsed = std::move(param); + SetAttrDict(&attrs); + auto ndoutputs = Invoke(op, &attrs, num_inputs, inputs, + &num_outputs, outputs); + if (out) { + *ret = PythonArg(4); + } else { + *ret = ndoutputs[0]; + } +}); + +} // namespace mxnet diff --git a/src/api/operator/numpy/random/np_pareto_op.cc b/src/api/operator/numpy/random/np_pareto_op.cc new file mode 100644 index 000000000000..92e3645b75bd --- /dev/null +++ b/src/api/operator/numpy/random/np_pareto_op.cc @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file np_pareto_op.cc + * \brief Implementation of the API of functions in src/operator/numpy/random/np_pareto_op.h + */ +#include +#include +#include "../../utils.h" +#include "../../../../operator/numpy/random/np_pareto_op.h" + +namespace mxnet { + +MXNET_REGISTER_API("_npi.pareto") +.set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { + using namespace runtime; + const nnvm::Op* op = Op::Get("_npi_pareto"); + op::NumpyParetoParam param; + nnvm::NodeAttrs attrs; + attrs.op = op; + if (args[1].type_code() == kDLInt) { + param.size = Tuple(1, args[1].operator int64_t()); + } else if (args[1].type_code() == kNull) { + param.size = dmlc::nullopt; + } else { + param.size = Tuple(args[1].operator ObjectRef()); + } + if (args[2].type_code() != kNull) { + attrs.dict["ctx"] = args[2].operator std::string(); + } + NDArray* out = args[3].operator mxnet::NDArray*(); + NDArray** outputs = out == nullptr ? nullptr : &out; + int num_outputs = out != nullptr; + NDArray* inputs[1]; + int num_inputs = 0; + if (args[0].type_code() == kDLFloat || args[0].type_code() == kDLInt) { + param.a = args[0].operator double(); + num_inputs = 0; + } else { + param.a = dmlc::nullopt; + inputs[0] = args[0].operator mxnet::NDArray*(); + num_inputs = 1; + } + attrs.parsed = std::move(param); + SetAttrDict(&attrs); + auto ndoutputs = Invoke(op, &attrs, num_inputs, inputs, + &num_outputs, outputs); + if (out) { + *ret = PythonArg(3); + } else { + *ret = ndoutputs[0]; + } +}); + +} // namespace mxnet diff --git a/src/api/operator/numpy/random/np_power_op.cc b/src/api/operator/numpy/random/np_power_op.cc new file mode 100644 index 000000000000..12a621726cd2 --- /dev/null +++ b/src/api/operator/numpy/random/np_power_op.cc @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file np_power_op.cc + * \brief Implementation of the API of functions in src/operator/numpy/random/np_power_op.h + */ +#include +#include +#include "../../utils.h" +#include "../../../../operator/numpy/random/np_power_op.h" + +namespace mxnet { + +MXNET_REGISTER_API("_npi.powerd") +.set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { + using namespace runtime; + const nnvm::Op* op = Op::Get("_npi_powerd"); + op::NumpyPowerParam param; + nnvm::NodeAttrs attrs; + attrs.op = op; + if (args[1].type_code() == kDLInt) { + param.size = Tuple(1, args[1].operator int64_t()); + } else if (args[1].type_code() == kNull) { + param.size = dmlc::nullopt; + } else { + param.size = Tuple(args[1].operator ObjectRef()); + } + if (args[2].type_code() != kNull) { + attrs.dict["ctx"] = args[2].operator std::string(); + } + NDArray* out = args[3].operator mxnet::NDArray*(); + NDArray** outputs = out == nullptr ? nullptr : &out; + int num_outputs = out != nullptr; + NDArray* inputs[1]; + int num_inputs = 0; + if (args[0].type_code() == kDLFloat || args[0].type_code() == kDLInt) { + param.a = args[0].operator double(); + num_inputs = 0; + } else { + param.a = dmlc::nullopt; + inputs[0] = args[0].operator mxnet::NDArray*(); + num_inputs = 1; + } + attrs.parsed = std::move(param); + SetAttrDict(&attrs); + auto ndoutputs = Invoke(op, &attrs, num_inputs, inputs, + &num_outputs, outputs); + if (out) { + *ret = PythonArg(3); + } else { + *ret = ndoutputs[0]; + } +}); + +} // namespace mxnet diff --git a/src/api/operator/numpy/random/np_rayleigh_op.cc b/src/api/operator/numpy/random/np_rayleigh_op.cc new file mode 100644 index 000000000000..428e433763ad --- /dev/null +++ b/src/api/operator/numpy/random/np_rayleigh_op.cc @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file np_rayleigh_op.cc + * \brief Implementation of the API of functions in src/operator/numpy/random/np_rayleigh_op.h + */ +#include +#include +#include "../../utils.h" +#include "../../../../operator/numpy/random/np_rayleigh_op.h" + +namespace mxnet { + +MXNET_REGISTER_API("_npi.rayleigh") +.set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { + using namespace runtime; + const nnvm::Op* op = Op::Get("_npi_rayleigh"); + op::NumpyRayleighParam param; + nnvm::NodeAttrs attrs; + attrs.op = op; + if (args[1].type_code() == kDLInt) { + param.size = Tuple(1, args[1].operator int64_t()); + } else if (args[1].type_code() == kNull) { + param.size = dmlc::nullopt; + } else { + param.size = Tuple(args[1].operator ObjectRef()); + } + if (args[2].type_code() != kNull) { + attrs.dict["ctx"] = args[2].operator std::string(); + } + NDArray* out = args[3].operator mxnet::NDArray*(); + NDArray** outputs = out == nullptr ? 
nullptr : &out; + int num_outputs = out != nullptr; + NDArray* inputs[1]; + int num_inputs = 0; + if (args[0].type_code() == kDLFloat || args[0].type_code() == kDLInt) { + param.scale = args[0].operator double(); + num_inputs = 0; + } else { + param.scale = dmlc::nullopt; + inputs[0] = args[0].operator mxnet::NDArray*(); + num_inputs = 1; + } + attrs.parsed = std::move(param); + SetAttrDict(&attrs); + auto ndoutputs = Invoke(op, &attrs, num_inputs, inputs, + &num_outputs, outputs); + if (out) { + *ret = PythonArg(3); + } else { + *ret = ndoutputs[0]; + } +}); + +} // namespace mxnet diff --git a/src/api/operator/numpy/random/np_weibull_op.cc b/src/api/operator/numpy/random/np_weibull_op.cc new file mode 100644 index 000000000000..ef3b7e6ed7b6 --- /dev/null +++ b/src/api/operator/numpy/random/np_weibull_op.cc @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file np_weibull_op.cc + * \brief Implementation of the API of functions in src/operator/numpy/random/np_weibull_op.h + */ +#include +#include +#include "../../utils.h" +#include "../../../../operator/numpy/random/np_weibull_op.h" + +namespace mxnet { + +MXNET_REGISTER_API("_npi.weibull") +.set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { + using namespace runtime; + const nnvm::Op* op = Op::Get("_npi_weibull"); + op::NumpyWeibullParam param; + nnvm::NodeAttrs attrs; + attrs.op = op; + if (args[1].type_code() == kDLInt) { + param.size = Tuple(1, args[1].operator int64_t()); + } else if (args[1].type_code() == kNull) { + param.size = dmlc::nullopt; + } else { + param.size = Tuple(args[1].operator ObjectRef()); + } + if (args[2].type_code() != kNull) { + attrs.dict["ctx"] = args[2].operator std::string(); + } + NDArray* out = args[3].operator mxnet::NDArray*(); + NDArray** outputs = out == nullptr ? 
nullptr : &out; + int num_outputs = out != nullptr; + NDArray* inputs[1]; + int num_inputs = 0; + if (args[0].type_code() == kDLFloat || args[0].type_code() == kDLInt) { + param.a = args[0].operator double(); + num_inputs = 0; + } else { + param.a = dmlc::nullopt; + inputs[0] = args[0].operator mxnet::NDArray*(); + num_inputs = 1; + } + attrs.parsed = std::move(param); + SetAttrDict(&attrs); + auto ndoutputs = Invoke(op, &attrs, num_inputs, inputs, + &num_outputs, outputs); + if (out) { + *ret = PythonArg(3); + } else { + *ret = ndoutputs[0]; + } +}); + +} // namespace mxnet diff --git a/src/operator/numpy/random/np_exponential_op.h b/src/operator/numpy/random/np_exponential_op.h index 25593063872d..36d29ff842e3 100644 --- a/src/operator/numpy/random/np_exponential_op.h +++ b/src/operator/numpy/random/np_exponential_op.h @@ -31,6 +31,7 @@ #include #include #include +#include #include "../../elemwise_op_common.h" #include "../../mshadow_op.h" #include "../../mxnet_op.h" @@ -57,6 +58,13 @@ struct NumpyExponentialParam : public dmlc::Parameter { "Context of output, in format [cpu|gpu|cpu_pinned](n)." " Only used for imperative calls."); } + void SetAttrDict(std::unordered_map* dict) { + std::ostringstream scale_s, size_s; + scale_s << scale; + size_s << size; + (*dict)["scale"] = scale_s.str(); + (*dict)["size"] = size_s.str(); + } }; template diff --git a/src/operator/numpy/random/np_location_scale_op.h b/src/operator/numpy/random/np_location_scale_op.h index 558bdcd5b267..00c89c149c5c 100644 --- a/src/operator/numpy/random/np_location_scale_op.h +++ b/src/operator/numpy/random/np_location_scale_op.h @@ -31,6 +31,7 @@ #include #include #include +#include #include "../../elemwise_op_common.h" #include "../../mshadow_op.h" #include "../../mxnet_op.h" @@ -59,6 +60,15 @@ struct NumpyLocationScaleParam : public dmlc::Parameter "Context of output, in format [cpu|gpu|cpu_pinned](n)." " Only used for imperative calls."); } + void SetAttrDict(std::unordered_map* dict) { + std::ostringstream loc_s, scale_s, size_s; + loc_s << loc; + scale_s << scale; + size_s << size; + (*dict)["loc"] = loc_s.str(); + (*dict)["scale"] = scale_s.str(); + (*dict)["size"] = size_s.str(); + } }; inline bool NumpyLocationScaleOpType(const nnvm::NodeAttrs &attrs, diff --git a/src/operator/numpy/random/np_pareto_op.h b/src/operator/numpy/random/np_pareto_op.h index 85eab97aef8c..a8a5d7f411c0 100644 --- a/src/operator/numpy/random/np_pareto_op.h +++ b/src/operator/numpy/random/np_pareto_op.h @@ -31,6 +31,7 @@ #include #include #include +#include #include "../../elemwise_op_common.h" #include "../../mshadow_op.h" #include "../../mxnet_op.h" @@ -57,6 +58,14 @@ struct NumpyParetoParam : public dmlc::Parameter { "Context of output, in format [cpu|gpu|cpu_pinned](n)." 
" Only used for imperative calls."); } + + void SetAttrDict(std::unordered_map* dict) { + std::ostringstream a_s, size_s; + a_s << a; + size_s << size; + (*dict)["a"] = a_s.str(); + (*dict)["size"] = size_s.str(); + } }; template diff --git a/src/operator/numpy/random/np_power_op.h b/src/operator/numpy/random/np_power_op.h index a8835fd62957..dae730285902 100644 --- a/src/operator/numpy/random/np_power_op.h +++ b/src/operator/numpy/random/np_power_op.h @@ -31,6 +31,7 @@ #include #include #include +#include #include "../../elemwise_op_common.h" #include "../../mshadow_op.h" #include "../../mxnet_op.h" @@ -43,6 +44,7 @@ namespace op { struct NumpyPowerParam : public dmlc::Parameter { dmlc::optional a; + std::string ctx; dmlc::optional> size; DMLC_DECLARE_PARAMETER(NumpyPowerParam) { DMLC_DECLARE_FIELD(a) @@ -52,6 +54,17 @@ struct NumpyPowerParam : public dmlc::Parameter { .describe("Output shape. If the given shape is, " "e.g., (m, n, k), then m * n * k samples are drawn. " "Default is None, in which case a single value is returned."); + DMLC_DECLARE_FIELD(ctx).set_default("cpu").describe( + "Context of output, in format [cpu|gpu|cpu_pinned](n)." + " Only used for imperative calls."); + } + + void SetAttrDict(std::unordered_map* dict) { + std::ostringstream a_s, size_s; + a_s << a; + size_s << size; + (*dict)["a"] = a_s.str(); + (*dict)["size"] = size_s.str(); } }; diff --git a/src/operator/numpy/random/np_rayleigh_op.h b/src/operator/numpy/random/np_rayleigh_op.h index 9b222684188b..3444f3b74af5 100644 --- a/src/operator/numpy/random/np_rayleigh_op.h +++ b/src/operator/numpy/random/np_rayleigh_op.h @@ -31,6 +31,7 @@ #include #include #include +#include #include "../../elemwise_op_common.h" #include "../../mshadow_op.h" #include "../../mxnet_op.h" @@ -57,6 +58,14 @@ struct NumpyRayleighParam : public dmlc::Parameter { "Context of output, in format [cpu|gpu|cpu_pinned](n)." " Only used for imperative calls."); } + + void SetAttrDict(std::unordered_map* dict) { + std::ostringstream scale_s, size_s; + scale_s << scale; + size_s << size; + (*dict)["scale"] = scale_s.str(); + (*dict)["size"] = size_s.str(); + } }; template diff --git a/src/operator/numpy/random/np_weibull_op.h b/src/operator/numpy/random/np_weibull_op.h index afb37288b04e..ff4c40ae8db5 100644 --- a/src/operator/numpy/random/np_weibull_op.h +++ b/src/operator/numpy/random/np_weibull_op.h @@ -31,6 +31,7 @@ #include #include #include +#include #include "../../elemwise_op_common.h" #include "../../mshadow_op.h" #include "../../mxnet_op.h" @@ -57,6 +58,14 @@ struct NumpyWeibullParam : public dmlc::Parameter { "Context of output, in format [cpu|gpu|cpu_pinned](n)." " Only used for imperative calls."); } + + void SetAttrDict(std::unordered_map* dict) { + std::ostringstream a_s, size_s; + a_s << a; + size_s << size; + (*dict)["a"] = a_s.str(); + (*dict)["size"] = size_s.str(); + } }; template From fb73a1717acad61caeaeef010faed9e9fcc05f0e Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Tue, 14 Apr 2020 10:29:29 -0700 Subject: [PATCH 04/14] Switch to C++17 and modernize toolchain + CI (#17984) As per #17968, require C++17 compatible compiler. For cuda code, use C++14 mode introduced in Cuda 9. C++17 support for Cuda will be available in Cuda 11. Switching to C++17 requires modernizing the toolchain, which exposed a number of technical debt issues in the codebase. All blocking issues are fixed as part of this PR. See the full list below. 
This PR contains the following specific changes: Switch CI pipeline to use gcc7 on Ubuntu and CentOS Switch CD pipeline to CentOS 7 with https://www.softwarecollections.org/en/scls/rhscl/devtoolset-7/ This enables us to build with gcc7 C++17 compiler while keeping a relatively old glibc requirement for distribution. Simplify ARM Edge builds Switch to standard Ubuntu / Debian cross-compilation toolchain for ARMv7, ARMv8 Switch to https://toolchains.bootlin.com/ toolchain for ARMv6 (the Debian ARMv6 toolchain is for ARMv4 + ARMv5 + ARMv6, but we wish to only target ARMv6 and make use of ARMv6 features) Remove reliance on dockcross for cross compilation. Simplify Jetson build Use standard Ubuntu / Debian cross-compilation toolchain for ARMv8 Upgrade to Cuda 10 and Jetpack 4.3 Simplify build setup Simplify QEMU ARM virtualization test setup on CI Remove complex "Virtual Machine in Docker" logic and run a QEMU based Docker container instead based on arm32v7/ubuntu Fix out of bounds vector accesses in SoftmaxGradOpType MKLDNNFCBackward Fix use of non-standard rand_r function (which is not available on anymore on newer Android toolchains and shouldn't be use in any case). Fix reproducibility of RNN with Dropout Fix reproducibility of DGL Graph Sampling Operators Update tests for Android Edge build to NDK19. The previously used standalone toolchain is obsolete. Those Dockerfiles that required refactoring as part of the effort were refactored based on the following consideration Maximize the use of system dependencies provided by the distribution instead of manually installing dependencies from source or from third party vendors. This reduces the complexity of the installation process and essentially pins the dependency versions, increasing CI stability. Further, Dockerfile build speed is improved. To facilitate this, use recent distribution versions. We still ensure backwards compatibility via CentOS7 based build and test stages Minimize the number of layers in the Dockerfile. Don't have 5 different script files executed, each calling apt-get update and install, but just execute once. Speeds up the build and reduces image size. Keep each Dockerfile simple and tailored to a purpose, instead of running 20 scripts to install dependencies for every thinkable scenario, which is unmaintainable. Some more small changes: Remove outdated references to Cuda 7 and Cuda 8 in various files. Remove C++03 support in mshadow Disable broken tests NumpyBooleanAssignForwardCPU #17990 test_init.test_rsp_const_init #17988 quantized_elemwise_mul #18034 List of squashed commits * cpp standard * Remove leftover files of Cuda 7 and Cuda 8 support * thrust 1.9.8 for clang10 * compiler warnings * Disable broken test_init.test_rsp_const_init * Disable tests invoking NumpyBooleanAssignForwardCPU * Fix out of bounds access in SoftmaxGradOpType * Use CentOS 7 for staticbuilds CentOS 7 fullfills the requirements for PEP 599 manylinux-2014 and provides a C++17 toolchain. 
* Fix MKLDNNFCBackward * Update edge toolchain * Support platforms without rand_r * Cleanup random.h * Greatly simplify qemu setup * Remove unused functions in Jenkins_steps.groovy * Skip quantized_elemwise_mul due QuantizedElemwiseMulOpShape bug * Fix R package installation https://github.com/apache/incubator-mxnet/issues/18042 * Fix centos ccache * Fix GPU Makefile staticbuild on CentOS7 * CentOS7 NCCL * CentOS7 staticbuild fix link with libculibos --- 3rdparty/dmlc-core | 2 +- 3rdparty/mshadow/guide/Makefile | 2 +- 3rdparty/mshadow/guide/mshadow-ps/Makefile | 2 +- 3rdparty/mshadow/make/mshadow.mk | 4 +- 3rdparty/mshadow/mshadow/base.h | 24 -- 3rdparty/mshadow/mshadow/logging.h | 5 + 3rdparty/mshadow/mshadow/packet-inl.h | 4 + 3rdparty/mshadow/mshadow/random.h | 103 +---- 3rdparty/mshadow/test/Makefile | 2 +- CMakeLists.txt | 44 +-- Makefile | 30 +- amalgamation/Makefile | 7 +- amalgamation/amalgamation.py | 2 +- cd/mxnet_lib/mxnet_lib_pipeline.groovy | 8 +- cd/mxnet_lib/static/Jenkins_pipeline.groovy | 4 +- cd/python/docker/Jenkins_pipeline.groovy | 5 +- cd/python/pypi/Jenkins_pipeline.groovy | 5 +- cd/python/pypi/pypi_package.sh | 2 +- cd/utils/docker_tag.sh | 2 +- cd/utils/mxnet_base_image.sh | 3 - ci/README.md | 95 +---- ci/build.py | 11 +- ci/dev_menu.py | 2 +- ci/docker/Dockerfile.build.android_armv7 | 94 ++--- ci/docker/Dockerfile.build.android_armv8 | 92 ++--- ci/docker/Dockerfile.build.armv6 | 45 ++- ci/docker/Dockerfile.build.armv7 | 54 ++- ci/docker/Dockerfile.build.armv8 | 56 +-- ci/docker/Dockerfile.build.jetson | 96 +++-- ci/docker/Dockerfile.build.test.arm_qemu | 47 --- ...tu1404_gpu => Dockerfile.build.test.armv7} | 24 +- ...tu1404_gpu => Dockerfile.build.test.armv8} | 25 +- ci/docker/Dockerfile.build.ubuntu_build_cuda | 11 +- ci/docker/Dockerfile.build.ubuntu_cpu | 8 +- ci/docker/Dockerfile.build.ubuntu_cpu_julia | 8 +- ci/docker/Dockerfile.build.ubuntu_cpu_r | 3 + ci/docker/Dockerfile.build.ubuntu_cpu_scala | 3 + ci/docker/Dockerfile.build.ubuntu_gpu_cu100 | 84 ---- ci/docker/Dockerfile.build.ubuntu_gpu_cu101 | 14 +- ci/docker/Dockerfile.build.ubuntu_gpu_cu102 | 85 ----- ci/docker/Dockerfile.build.ubuntu_gpu_cu80 | 79 ---- ci/docker/Dockerfile.build.ubuntu_gpu_cu90 | 85 ----- ci/docker/Dockerfile.build.ubuntu_gpu_cu92 | 84 ---- ci/docker/Dockerfile.build.ubuntu_nightly_cpu | 8 +- ci/docker/Dockerfile.build.ubuntu_nightly_gpu | 8 +- ...404_cpu => Dockerfile.publish.centos7_cpu} | 28 +- .../Dockerfile.publish.centos7_gpu_cu100 | 43 +++ .../Dockerfile.publish.centos7_gpu_cu101 | 43 +++ .../Dockerfile.publish.centos7_gpu_cu102 | 43 +++ ...pu => Dockerfile.publish.centos7_gpu_cu90} | 32 +- ci/docker/Dockerfile.publish.centos7_gpu_cu92 | 43 +++ ci/docker/install/android_armv7_openblas.sh | 31 -- ci/docker/install/android_ndk.sh | 38 -- ci/docker/install/arm64_openblas.sh | 35 -- ci/docker/install/centos7_base.sh | 8 + ci/docker/install/centos7_ccache.sh | 8 +- ci/docker/install/centos7_core.sh | 8 + .../{arm_openblas.sh => centos7_nccl.sh} | 19 +- ci/docker/install/deb_ubuntu_ccache.sh | 26 +- .../install/{ubuntu_arm.sh => thrust.sh} | 15 +- ci/docker/install/ubuntu_arm_qemu_bin.sh | 40 -- ci/docker/install/ubuntu_gcc8.sh | 2 +- ci/docker/install/ubuntu_publish.sh | 92 ----- ci/docker/install/ubuntu_r.sh | 5 +- ci/docker/install/ubuntu_scala.sh | 26 +- ci/docker/qemu/README.md | 18 - ci/docker/qemu/runtime_functions.py | 134 ------- ci/docker/qemu/vmcontrol.py | 360 ------------------ ci/docker/runtime_functions.sh | 197 ++++++---- .../aarch64-linux-gnu-toolchain.cmake} | 29 
+- .../arm-linux-gnueabihf-toolchain.cmake} | 23 +- ci/jenkins/Jenkins_steps.groovy | 109 +++--- ci/jenkins/Jenkinsfile_clang | 4 +- ci/jenkins/Jenkinsfile_edge | 7 +- ci/publish/Jenkinsfile | 2 +- ci/publish/README.md | 7 +- ci/qemu/README.md | 92 ----- ci/qemu/copy.sh | 23 -- ci/qemu/init.sh | 23 -- ci/qemu/initrd_modif/inittab | 38 -- ci/qemu/install.sh | 32 -- ci/qemu/mxnet_requirements.txt | 7 - ci/qemu/preseed.cfg | 68 ---- ci/qemu/preseed.sh | 29 -- ci/qemu/run.sh | 33 -- ci/qemu/test_requirements.txt | 3 - cmake/Modules/FindNCCL.cmake | 10 +- cmake/upstream/FindCUDAToolkit.cmake | 205 +++++++--- config/distribution/linux_cu100.cmake | 3 +- config/distribution/linux_cu101.cmake | 3 +- config/distribution/linux_cu102.cmake | 3 +- config/distribution/linux_cu75.cmake | 35 -- config/distribution/linux_cu80.cmake | 35 -- config/distribution/linux_cu90.cmake | 3 +- config/distribution/linux_cu91.cmake | 3 +- config/distribution/linux_cu92.cmake | 3 +- cpp-package/example/Makefile | 2 +- cpp-package/example/example.mk | 4 +- cpp-package/example/feature_extract/Makefile | 4 +- cpp-package/example/inference/Makefile | 2 +- cpp-package/example/inference/inference.mk | 4 +- .../image-classification/predict-cpp/Makefile | 6 +- example/multi_threaded_inference/Makefile | 2 +- .../multi_threaded_inference.cc | 5 +- example/rnn/large_word_lm/setup.py | 2 +- include/mxnet/base.h | 12 - make/crosscompile.jetson.mk | 216 ----------- make/staticbuild/linux_cu100.mk | 8 + make/staticbuild/linux_cu101.mk | 8 + make/staticbuild/linux_cu102.mk | 8 + make/staticbuild/linux_cu75.mk | 167 -------- make/staticbuild/linux_cu80.mk | 170 --------- make/staticbuild/linux_cu90.mk | 8 + make/staticbuild/linux_cu91.mk | 8 + make/staticbuild/linux_cu92.mk | 8 + perl-package/AI-MXNet/t/test_init.t | 5 +- python/setup.py | 4 +- src/c_api/c_api_executor.cc | 2 +- src/operator/contrib/dgl_graph.cc | 73 +++- src/operator/fusion/fused_op.cu | 2 +- .../nn/mkldnn/mkldnn_fully_connected.cc | 4 +- src/operator/nn/mkldnn/mkldnn_rnn.cc | 4 +- src/operator/nn/softmax-inl.h | 2 +- src/operator/numpy/np_boolean_mask_assign.cc | 2 +- src/operator/random/shuffle_op.cc | 2 +- src/operator/rnn-inl.h | 20 +- src/operator/rnn.cc | 1 + src/operator/rnn_impl.h | 29 +- tests/cpp/engine/threaded_engine_test.cc | 14 +- tests/cpp/thread_safety/thread_safety_test.cc | 16 +- tests/cpp/unittest.mk | 24 +- tests/jenkins/run_test_pip_installations.sh | 12 +- .../python/quantization/test_quantization.py | 4 + tests/python/unittest/test_init.py | 6 +- tests/python/unittest/test_numpy_ndarray.py | 1 + tests/python/unittest/test_numpy_op.py | 2 + tools/dependencies/README.md | 1 - .../dependencies/make_shared_dependencies.sh | 3 +- tools/dependencies/zmq.sh | 6 + tools/pip/doc/CPU_ADDITIONAL.md | 5 +- tools/pip/doc/CU100_ADDITIONAL.md | 7 +- tools/pip/doc/CU101_ADDITIONAL.md | 7 +- tools/pip/doc/CU102_ADDITIONAL.md | 6 +- tools/pip/doc/CU75_ADDITIONAL.md | 38 -- tools/pip/doc/CU80_ADDITIONAL.md | 38 -- tools/pip/doc/CU90_ADDITIONAL.md | 6 +- tools/pip/doc/CU92_ADDITIONAL.md | 6 +- tools/pip/doc/NATIVE_ADDITIONAL.md | 8 +- tools/pip/doc/PYPI_README.md | 2 +- tools/pip/setup.py | 9 +- tools/setup_gpu_build_tools.sh | 59 +-- tools/staticbuild/build.sh | 2 +- tools/staticbuild/build_lib.sh | 10 +- tools/staticbuild/build_lib_cmake.sh | 10 +- 154 files changed, 1252 insertions(+), 3351 deletions(-) delete mode 100644 ci/docker/Dockerfile.build.test.arm_qemu rename ci/docker/{Dockerfile.publish.ubuntu1404_gpu => Dockerfile.build.test.armv7} (72%) rename 
ci/docker/{Dockerfile.publish.test.ubuntu1404_gpu => Dockerfile.build.test.armv8} (72%) delete mode 100644 ci/docker/Dockerfile.build.ubuntu_gpu_cu100 delete mode 100644 ci/docker/Dockerfile.build.ubuntu_gpu_cu102 delete mode 100644 ci/docker/Dockerfile.build.ubuntu_gpu_cu80 delete mode 100644 ci/docker/Dockerfile.build.ubuntu_gpu_cu90 delete mode 100644 ci/docker/Dockerfile.build.ubuntu_gpu_cu92 rename ci/docker/{Dockerfile.publish.test.ubuntu1404_cpu => Dockerfile.publish.centos7_cpu} (68%) create mode 100644 ci/docker/Dockerfile.publish.centos7_gpu_cu100 create mode 100644 ci/docker/Dockerfile.publish.centos7_gpu_cu101 create mode 100644 ci/docker/Dockerfile.publish.centos7_gpu_cu102 rename ci/docker/{Dockerfile.publish.ubuntu1404_cpu => Dockerfile.publish.centos7_gpu_cu90} (63%) create mode 100644 ci/docker/Dockerfile.publish.centos7_gpu_cu92 delete mode 100755 ci/docker/install/android_armv7_openblas.sh delete mode 100755 ci/docker/install/android_ndk.sh delete mode 100755 ci/docker/install/arm64_openblas.sh rename ci/docker/install/{arm_openblas.sh => centos7_nccl.sh} (53%) rename ci/docker/install/{ubuntu_arm.sh => thrust.sh} (75%) delete mode 100755 ci/docker/install/ubuntu_arm_qemu_bin.sh delete mode 100755 ci/docker/install/ubuntu_publish.sh delete mode 100644 ci/docker/qemu/README.md delete mode 100755 ci/docker/qemu/runtime_functions.py delete mode 100644 ci/docker/qemu/vmcontrol.py rename ci/docker/{install/ubuntu_arm_qemu.sh => toolchains/aarch64-linux-gnu-toolchain.cmake} (64%) mode change 100755 => 100644 rename ci/docker/{install/android_arm64_openblas.sh => toolchains/arm-linux-gnueabihf-toolchain.cmake} (65%) mode change 100755 => 100644 delete mode 100644 ci/qemu/README.md delete mode 100755 ci/qemu/copy.sh delete mode 100755 ci/qemu/init.sh delete mode 100644 ci/qemu/initrd_modif/inittab delete mode 100755 ci/qemu/install.sh delete mode 100644 ci/qemu/mxnet_requirements.txt delete mode 100644 ci/qemu/preseed.cfg delete mode 100755 ci/qemu/preseed.sh delete mode 100755 ci/qemu/run.sh delete mode 100644 ci/qemu/test_requirements.txt delete mode 100644 config/distribution/linux_cu75.cmake delete mode 100644 config/distribution/linux_cu80.cmake delete mode 100644 make/crosscompile.jetson.mk delete mode 100644 make/staticbuild/linux_cu75.mk delete mode 100644 make/staticbuild/linux_cu80.mk delete mode 100644 tools/pip/doc/CU75_ADDITIONAL.md delete mode 100644 tools/pip/doc/CU80_ADDITIONAL.md diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core index 14bf7e884328..5df8305fe699 160000 --- a/3rdparty/dmlc-core +++ b/3rdparty/dmlc-core @@ -1 +1 @@ -Subproject commit 14bf7e884328eb97bfde160ec6f64c20f5337459 +Subproject commit 5df8305fe699d3b503d10c60a231ab0223142407 diff --git a/3rdparty/mshadow/guide/Makefile b/3rdparty/mshadow/guide/Makefile index bad7a8e94b1d..c8b828c3834b 100644 --- a/3rdparty/mshadow/guide/Makefile +++ b/3rdparty/mshadow/guide/Makefile @@ -4,7 +4,7 @@ export CXX = g++ export NVCC =nvcc include config.mk include ../make/mshadow.mk -export CFLAGS = -Wall -O3 -std=c++11 -I../ $(MSHADOW_CFLAGS) +export CFLAGS = -Wall -O3 -std=c++17 -I../ $(MSHADOW_CFLAGS) export LDFLAGS= -lm $(MSHADOW_LDFLAGS) export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX) $(MSHADOW_NVCCFLAGS) diff --git a/3rdparty/mshadow/guide/mshadow-ps/Makefile b/3rdparty/mshadow/guide/mshadow-ps/Makefile index 58d64a304009..e16f0a059ad8 100644 --- a/3rdparty/mshadow/guide/mshadow-ps/Makefile +++ b/3rdparty/mshadow/guide/mshadow-ps/Makefile @@ -4,7 +4,7 @@ export CXX = g++ export NVCC =nvcc include 
config.mk include ../../make/mshadow.mk -export CFLAGS = -Wall -O3 -std=c++11 -fopenmp -I../../ $(MSHADOW_CFLAGS) +export CFLAGS = -Wall -O3 -std=c++17 -fopenmp -I../../ $(MSHADOW_CFLAGS) export LDFLAGS= -lm $(MSHADOW_LDFLAGS) export NVCCFLAGS = -O3 --use_fast_math -ccbin $(CXX) $(MSHADOW_NVCCFLAGS) diff --git a/3rdparty/mshadow/make/mshadow.mk b/3rdparty/mshadow/make/mshadow.mk index 86155eaaadcf..cce3db5fe093 100644 --- a/3rdparty/mshadow/make/mshadow.mk +++ b/3rdparty/mshadow/make/mshadow.mk @@ -149,13 +149,13 @@ else endif ifeq ($(USE_DIST_PS),1) -MSHADOW_CFLAGS += -DMSHADOW_DIST_PS=1 -std=c++11 \ +MSHADOW_CFLAGS += -DMSHADOW_DIST_PS=1 -std=c++17 \ -I$(PS_PATH)/src -I$(PS_THIRD_PATH)/include PS_LIB = $(addprefix $(PS_PATH)/build/, libps.a libps_main.a) \ $(addprefix $(PS_THIRD_PATH)/lib/, libgflags.a libzmq.a libprotobuf.a \ libglog.a libz.a libsnappy.a) # -L$(PS_THIRD_PATH)/lib -lgflags -lzmq -lprotobuf -lglog -lz -lsnappy -MSHADOW_NVCCFLAGS += --std=c++11 +MSHADOW_NVCCFLAGS += --std=c++14 else MSHADOW_CFLAGS+= -DMSHADOW_DIST_PS=0 endif diff --git a/3rdparty/mshadow/mshadow/base.h b/3rdparty/mshadow/mshadow/base.h index a99838422348..cdca74b04f84 100755 --- a/3rdparty/mshadow/mshadow/base.h +++ b/3rdparty/mshadow/mshadow/base.h @@ -119,18 +119,6 @@ typedef unsigned __int64 uint64_t; #define MSHADOW_OLD_CUDA 0 #endif -/*! - * \brief macro to decide existence of c++11 compiler - */ -#ifndef MSHADOW_IN_CXX11 - #if (defined(__GXX_EXPERIMENTAL_CXX0X__) ||\ - __cplusplus >= 201103L || defined(_MSC_VER)) - #define MSHADOW_IN_CXX11 1 - #else - #define MSHADOW_IN_CXX11 0 - #endif -#endif - /*! \brief whether use SSE */ #ifndef MSHADOW_USE_SSE #define MSHADOW_USE_SSE 1 @@ -207,13 +195,6 @@ extern "C" { /*! \brief cpu force inline */ #define MSHADOW_CINLINE MSHADOW_FORCE_INLINE -#if defined(__GXX_EXPERIMENTAL_CXX0X) ||\ - defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L - #define MSHADOW_CONSTEXPR constexpr -#else - #define MSHADOW_CONSTEXPR const -#endif - /*! * \brief default data type for tensor string * in code release, change it to default_real_t @@ -231,13 +212,8 @@ extern "C" { #define MSHADOW_USE_GLOG DMLC_USE_GLOG #endif // MSHADOW_USE_GLOG -#if DMLC_USE_CXX11 #define MSHADOW_THROW_EXCEPTION noexcept(false) #define MSHADOW_NO_EXCEPTION noexcept(true) -#else -#define MSHADOW_THROW_EXCEPTION -#define MSHADOW_NO_EXCEPTION -#endif #if defined(_MSC_VER) #define MSHADOW_ALIGNED(x) __declspec(align(x)) diff --git a/3rdparty/mshadow/mshadow/logging.h b/3rdparty/mshadow/mshadow/logging.h index 5fc56aff3bae..6aede0d69725 100644 --- a/3rdparty/mshadow/mshadow/logging.h +++ b/3rdparty/mshadow/mshadow/logging.h @@ -204,7 +204,12 @@ class LogMessageFatal { ~LogMessageFatal() MSHADOW_THROW_EXCEPTION { // throwing out of destructor is evil // hopefully we can do it here +#pragma GCC diagnostic push +#if __GNUC__ >= 7 +#pragma GCC diagnostic ignored "-Wterminate" +#endif throw Error(log_stream_.str()); +#pragma GCC diagnostic pop } private: diff --git a/3rdparty/mshadow/mshadow/packet-inl.h b/3rdparty/mshadow/mshadow/packet-inl.h index 58cbc4005aaf..69a41b50e08a 100644 --- a/3rdparty/mshadow/mshadow/packet-inl.h +++ b/3rdparty/mshadow/mshadow/packet-inl.h @@ -74,7 +74,11 @@ inline void* AlignedMallocPitch(size_t *out_pitch, if (res == NULL) { LOG(FATAL) << "AlignedMallocPitch failed"; } +#if __GNUC__ >= 6 +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif return res; +#pragma GCC diagnostic pop } /*! 
diff --git a/3rdparty/mshadow/mshadow/random.h b/3rdparty/mshadow/mshadow/random.h index e6e620cc32e0..259dbe163227 100644 --- a/3rdparty/mshadow/mshadow/random.h +++ b/3rdparty/mshadow/mshadow/random.h @@ -14,14 +14,7 @@ #include "./base.h" #include "./tensor.h" #include "./tensor_container.h" - -#if MSHADOW_IN_CXX11 -#include // use cxx11 random by default -#endif - -#if _MSC_VER -#define rand_r(x) rand() -#endif +#include namespace mshadow { @@ -52,9 +45,7 @@ class Random { * \param seed seed of prng */ inline void Seed(int seed) { -#if MSHADOW_IN_CXX11 rnd_engine_.seed(seed); -#endif this->rseed_ = static_cast(seed); } /*! @@ -71,9 +62,6 @@ class Random { inline void set_stream(Stream *stream) { } -// These samplers are only avail in C++11. -#if MSHADOW_IN_CXX11 - /*! * \brief get some random integer * \return integer as unsigned @@ -226,7 +214,6 @@ class Random { return static_cast(dist_poisson(rnd_engine_));}); } } -#endif /*! * \brief return a temporal expression storing standard gaussian random variables @@ -270,98 +257,10 @@ class Random { } private: -#if MSHADOW_IN_CXX11 /*! \brief use c++11 random engine. */ std::mt19937 rnd_engine_; /*! \brief random number seed used in random engine */ uint64_t rseed_; - -#else - - /*! \brief random number seed used by PRNG */ - unsigned rseed_; - // functions - template - inline void SampleUniform(Tensor *dst, - DType a = 0.0f, DType b = 1.0f) { - if (dst->CheckContiguous()) { - this->GenUniform(dst->dptr_, dst->shape_.Size(), a, b); - } else { - Tensor mat = dst->FlatTo2D(); - for (index_t i = 0; i < mat.size(0); ++i) { - this->GenUniform(mat[i].dptr_, mat.size(1), a, b); - } - } - } - template - inline void SampleGaussian(Tensor *dst, - DType mu = 0.0f, DType sigma = 1.0f) { - if (sigma <= 0.0f) { - *dst = mu; return; - } - if (dst->CheckContiguous()) { - this->GenGaussian(dst->dptr_, dst->shape_.Size(), mu, sigma); - } else { - Tensor mat = dst->FlatTo2D(); - for (index_t i = 0; i < mat.size(0); ++i) { - this->GenGaussian(mat[i].dptr_, mat.size(1), mu, sigma); - } - } - } - inline void GenUniform(float *dptr, index_t size, float a, float b) { - for (index_t j = 0; j < size; ++j) { - dptr[j] = static_cast(RandNext()) * (b - a) + a; - } - } - inline void GenUniform(double *dptr, index_t size, double a, double b) { - for (index_t j = 0; j < size; ++j) { - dptr[j] = static_cast(RandNext()) * (b - a) + a; - } - } - inline void GenGaussian(float *dptr, index_t size, float mu, float sigma) { - this->GenGaussianX(dptr, size, mu, sigma); - } - inline void GenGaussian(double *dptr, index_t size, double mu, double sigma) { - this->GenGaussianX(dptr, size, mu, sigma); - } - inline void GenGaussianX(DType *dptr, index_t size, DType mu, DType sigma) { - DType g1 = 0.0f, g2 = 0.0f; - for (index_t j = 0; j < size; ++j) { - if ((j & 1) == 0) { - this->SampleNormal2D(&g1, &g2); - dptr[j] = mu + g1 * sigma; - } else { - dptr[j] = mu + g2 * sigma; - } - } - } - /*! \brief get next random number from rand */ - inline DType RandNext(void) { - return static_cast(rand_r(&rseed_)) / - (static_cast(RAND_MAX) + 1.0f); - } - /*! \brief return a real numer uniform in (0,1) */ - inline DType RandNext2(void) { - return (static_cast(rand_r(&rseed_)) + 1.0f) / - (static_cast(RAND_MAX) + 2.0f); - } - /*! 
- * \brief sample iid xx,yy ~N(0,1) - * \param xx first gaussian output - * \param yy second gaussian output - */ - inline void SampleNormal2D(DType *xx_, DType *yy_) { - DType &xx = *xx_, &yy = *yy_; - DType x, y, s; - do { - x = 2.0f * RandNext2() - 1.0f; - y = 2.0f * RandNext2() - 1.0f; - s = x * x + y * y; - } while (s >= 1.0f || s == 0.0f); - DType t = std::sqrt(-2.0f * std::log(s) / s); - xx = x * t; yy = y * t; - } -#endif /*! \brief temporal space used to store random numbers */ TensorContainer buffer_; }; // class Random diff --git a/3rdparty/mshadow/test/Makefile b/3rdparty/mshadow/test/Makefile index dc2d0552deb4..ec11128e949f 100644 --- a/3rdparty/mshadow/test/Makefile +++ b/3rdparty/mshadow/test/Makefile @@ -20,7 +20,7 @@ test: test.cu test_tblob: test_tblob.cc $(BIN) : - $(CXX) $(CFLAGS) -std=c++0x -o $@ $(filter %.cpp %.o %.c %.cc, $^) $(LDFLAGS) + $(CXX) $(CFLAGS) -std=c++17 -o $@ $(filter %.cpp %.o %.c %.cc, $^) $(LDFLAGS) $(OBJ) : $(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) ) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9dce131473b6..1ca92ff19a93 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,6 +7,9 @@ if(CMAKE_CROSSCOMPILING) endif() project(mxnet C CXX) +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS ON) # GNU extensions used by src/operator/random/shuffle_op.cc if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/config.cmake) # Load config.cmake only if mxnet is not compiled as a dependency of another project @@ -59,7 +62,6 @@ option(USE_PLUGIN_CAFFE "Use Caffe Plugin" OFF) option(USE_CPP_PACKAGE "Build C++ Package" OFF) option(USE_MXNET_LIB_NAMING "Use MXNet library naming conventions." ON) option(USE_GPROF "Compile with gprof (profiling) flag" OFF) -option(USE_CXX14_IF_AVAILABLE "Build with C++14 if the compiler supports it" OFF) option(USE_VTUNE "Enable use of Intel Amplifier XE (VTune)" OFF) # one could set VTUNE_ROOT for search path option(USE_TVM_OP "Enable use of TVM operator build system." OFF) option(ENABLE_CUDA_RTC "Build with CUDA runtime compilation support" ON) @@ -98,14 +100,7 @@ if(USE_CUDA) "Please fix your cuda installation: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#mandatory-post") endif() enable_language(CUDA) - set(CMAKE_CUDA_STANDARD 11) - include(CheckCXXCompilerFlag) - if(USE_CXX14_IF_AVAILABLE) - check_cxx_compiler_flag("-std=c++14" SUPPORT_CXX14) - if (SUPPORT_CXX14) - set(CMAKE_CUDA_STANDARD 14) - endif() - endif() + set(CMAKE_CUDA_STANDARD 14) set(CMAKE_CUDA_STANDARD_REQUIRED ON) endif() @@ -153,24 +148,21 @@ add_definitions(-DDMLC_MODERN_THREAD_LOCAL=0) # disable stack trace in exception by default. 
add_definitions(-DDMLC_LOG_STACK_TRACE_SIZE=0) +add_definitions(-DDMLC_USE_CXX11) +add_definitions(-DDMLC_STRICT_CXX11) +add_definitions(-DDMLC_USE_CXX14) +add_definitions(-DMSHADOW_IN_CXX11) if(MSVC) add_definitions(-DWIN32_LEAN_AND_MEAN) - add_definitions(-DDMLC_USE_CXX11) add_definitions(-D_SCL_SECURE_NO_WARNINGS) add_definitions(-D_CRT_SECURE_NO_WARNINGS) add_definitions(-DMXNET_EXPORTS) add_definitions(-DNNVM_EXPORTS) - add_definitions(-DDMLC_STRICT_CXX11) add_definitions(-DNOMINMAX) set(CMAKE_C_FLAGS "/MP") set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} /bigobj") else() include(CheckCXXCompilerFlag) - if(USE_CXX14_IF_AVAILABLE) - check_cxx_compiler_flag("-std=c++14" SUPPORT_CXX14) - endif() - check_cxx_compiler_flag("-std=c++11" SUPPORT_CXX11) - check_cxx_compiler_flag("-std=c++0x" SUPPORT_CXX0X) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wno-sign-compare") if(CMAKE_BUILD_TYPE STREQUAL "Debug") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O0 -g") @@ -184,25 +176,11 @@ else() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3") endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_C_FLAGS}") - if(SUPPORT_CXX14) - add_definitions(-DDMLC_USE_CXX11=1) - add_definitions(-DDMLC_USE_CXX14=1) - add_definitions(-DMSHADOW_IN_CXX11) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") - elseif(SUPPORT_CXX11) - add_definitions(-DDMLC_USE_CXX11=1) - add_definitions(-DMSHADOW_IN_CXX11) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") - elseif(SUPPORT_CXX0X) - add_definitions(-DDMLC_USE_CXX11=1) - add_definitions(-DMSHADOW_IN_CXX11) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++0x") - endif() -endif(MSVC) +endif() if(NOT mxnet_LINKER_LIBS) set(mxnet_LINKER_LIBS "") -endif(NOT mxnet_LINKER_LIBS) +endif() if(USE_GPROF) message(STATUS "Using GPROF") @@ -530,8 +508,6 @@ if(USE_PLUGIN_CAFFE) endif() if(NOT DEFINED CAFFE_PATH) if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/caffe) - # Need newer FindCUDA.cmake that correctly handles -std=c++11 - cmake_minimum_required(VERSION 3.3) set(CAFFE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/caffe) else() set(CAFFE_PATH $ENV{CAFFE_PATH}) diff --git a/Makefile b/Makefile index e5d6bb288134..f51199589354 100644 --- a/Makefile +++ b/Makefile @@ -92,6 +92,8 @@ include $(DMLC_CORE)/make/dmlc.mk # all tge possible warning tread WARNFLAGS= -Wall -Wsign-compare CFLAGS = -DMSHADOW_FORCE_STREAM $(WARNFLAGS) +# C++ standard +CFLAGS+= -DDMLC_USE_CXX11=1 -DDMLC_USE_CXX11=1 -DDMLC_USE_CXX14=1 # use old thread local implementation in DMLC-CORE CFLAGS += -DDMLC_MODERN_THREAD_LOCAL=0 # disable stack trace in exception by default. 
@@ -99,7 +101,9 @@ CFLAGS += -DDMLC_LOG_STACK_TRACE_SIZE=0 CFLAGS += -DDMLC_LOG_FATAL_THROW=1 ifeq ($(DEV), 1) - CFLAGS += -g -Werror + # Excluded from Werror: + # 1) variables used in '#pragma omp parallel' are considered unused + CFLAGS += -g -Werror -Wno-error=unused-variable -Wno-error=maybe-uninitialized -Wno-error=unused-function NVCCFLAGS += -Werror cross-execution-space-call endif @@ -131,9 +135,9 @@ endif # -L/usr/local/lib ifeq ($(DEBUG), 1) - NVCCFLAGS += -std=c++11 -Xcompiler -D_FORCE_INLINES -g -G -O0 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS) + NVCCFLAGS += -std=c++14 -Xcompiler -D_FORCE_INLINES -g -G -O0 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS) else - NVCCFLAGS += -std=c++11 -Xcompiler -D_FORCE_INLINES -O3 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS) + NVCCFLAGS += -std=c++14 -Xcompiler -D_FORCE_INLINES -O3 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS) endif # CFLAGS for segfault logger @@ -537,7 +541,11 @@ ifeq ($(USE_CUDA), 1) CFLAGS += -I$(USE_NCCL_PATH)/include LDFLAGS += -L$(USE_NCCL_PATH)/lib endif + ifdef USE_SYSTEM_CUDA + LDFLAGS += -lnccl_static + else LDFLAGS += -lnccl + endif CFLAGS += -DMXNET_USE_NCCL=1 else CFLAGS += -DMXNET_USE_NCCL=0 @@ -567,7 +575,7 @@ ALLX_DEP= $(ALL_DEP) build/src/%.o: src/%.cc | mkldnn @mkdir -p $(@D) - $(CXX) -std=c++11 -c $(CFLAGS) -MMD -c $< -o $@ + $(CXX) -std=c++17 -c $(CFLAGS) -MMD -c $< -o $@ build/src/%_gpu.o: src/%.cu | mkldnn @mkdir -p $(@D) @@ -578,12 +586,12 @@ build/src/%_gpu.o: src/%.cu | mkldnn # Use CXX to generate dependency instead. build/plugin/%_gpu.o: plugin/%.cu @mkdir -p $(@D) - $(CXX) -std=c++11 $(CFLAGS) -MM -MT build/plugin/$*_gpu.o $< >build/plugin/$*_gpu.d + $(CXX) -std=c++17 $(CFLAGS) -MM -MT build/plugin/$*_gpu.o $< >build/plugin/$*_gpu.d $(NVCC) -c -o $@ $(NVCCFLAGS) $(CUDA_ARCH) -Xcompiler "$(CFLAGS)" $< build/plugin/%.o: plugin/%.cc | mkldnn @mkdir -p $(@D) - $(CXX) -std=c++11 -c $(CFLAGS) -MMD -c $< -o $@ + $(CXX) -std=c++17 -c $(CFLAGS) -MMD -c $< -o $@ %_gpu.o: %.cu @mkdir -p $(@D) @@ -592,7 +600,7 @@ build/plugin/%.o: plugin/%.cc | mkldnn %.o: %.cc $(CORE_INC) @mkdir -p $(@D) - $(CXX) -std=c++11 -c $(CFLAGS) -MMD -Isrc/operator -c $< -o $@ + $(CXX) -std=c++17 -c $(CFLAGS) -MMD -Isrc/operator -c $< -o $@ # Set install path for libmxnet.so on Mac OS ifeq ($(UNAME_S), Darwin) @@ -653,7 +661,7 @@ bin/im2rec: tools/im2rec.cc $(ALLX_DEP) $(BIN) : @mkdir -p $(@D) - $(CXX) $(CFLAGS) -std=c++11 -o $@ $(filter %.cpp %.o %.c %.a %.cc, $^) $(LDFLAGS) + $(CXX) $(CFLAGS) -std=c++17 -o $@ $(filter %.cpp %.o %.c %.a %.cc, $^) $(LDFLAGS) # CPP Package ifeq ($(USE_CPP_PACKAGE), 1) @@ -686,13 +694,13 @@ extension_libs: $(EXT_LIBS) build/libcustomop_lib.so: @mkdir -p $(@D) - $(CXX) -shared -fPIC -std=c++11 example/extensions/lib_custom_op/gemm_lib.cc -o $@ -I include/mxnet + $(CXX) -shared -fPIC -std=c++17 example/extensions/lib_custom_op/gemm_lib.cc -o $@ -I include/mxnet build/libcustomop_gpu_lib.so: @mkdir -p $(@D) - $(NVCC) -shared -std=c++11 -Xcompiler -fPIC example/extensions/lib_custom_op/relu_lib.cu -o $@ -I include/mxnet + $(NVCC) -shared -std=c++14 -Xcompiler -fPIC example/extensions/lib_custom_op/relu_lib.cu -o $@ -I include/mxnet build/libsubgraph_lib.so: @mkdir -p $(@D) - $(CXX) -shared -fPIC -std=c++11 example/extensions/lib_subgraph/subgraph_lib.cc -o $@ -I include/mxnet + $(CXX) -shared -fPIC -std=c++17 example/extensions/lib_subgraph/subgraph_lib.cc -o $@ -I include/mxnet # Cython build cython: diff --git a/amalgamation/Makefile b/amalgamation/Makefile index 701c1f155e47..55aad1d470a2 100644 --- a/amalgamation/Makefile +++ 
b/amalgamation/Makefile @@ -50,7 +50,8 @@ endif DEFS+=-DMSHADOW_USE_CUDA=0 -DMSHADOW_USE_MKL=0 -DMSHADOW_RABIT_PS=0 -DMSHADOW_DIST_PS=0 -DDMLC_LOG_STACK_TRACE=0 DEFS+=-DMSHADOW_FORCE_STREAM -DMXNET_USE_OPENCV=0 -DMXNET_PREDICT_ONLY=1 -CFLAGS=-std=c++11 -Wno-unknown-pragmas -Wall $(DEFS) +DEFS+=-DDMLC_USE_CXX11=1 -DDMLC_STRICT_CXX11=1 -DDMLC_USE_CXX14=1 +CFLAGS=-std=c++17 -Wno-unknown-pragmas -Wall $(DEFS) # if architecture of the CPU supports F16C instruction set, enable USE_F16C for fast fp16 computation on CPU ifeq ($(USE_F16C), 1) @@ -63,7 +64,7 @@ ifneq ($(MIN), 1) CFLAGS += -I${OPENBLAS_ROOT} -I${OPENBLAS_ROOT}/include LDFLAGS+= -L${OPENBLAS_ROOT} -L${OPENBLAS_ROOT}/lib - # Define which blas is installed. Uses OpenBLAS by default. +# Define which blas is installed. Uses OpenBLAS by default. ifeq ($(USE_BLAS), atlas) LDFLAGS += -lcblas else ifeq ($(USE_BLAS), blas) @@ -120,7 +121,7 @@ else endif libmxnet_predict.js: mxnet_predict-all.cc - ${EMCC} -std=c++11 -O2 $(DEFS) -DMSHADOW_USE_SSE=0 -D__MXNET_JS__ -o $@ $+ \ + ${EMCC} -std=c++17 -O2 $(DEFS) -DMSHADOW_USE_SSE=0 -D__MXNET_JS__ -o $@ $+ \ -s EXPORTED_FUNCTIONS="['_MXPredCreate', \ '_MXPredGetOutputShape', \ '_MXPredSetInput', \ diff --git a/amalgamation/amalgamation.py b/amalgamation/amalgamation.py index 5f825de77483..cb961c699fe8 100644 --- a/amalgamation/amalgamation.py +++ b/amalgamation/amalgamation.py @@ -30,7 +30,7 @@ 'opencv2/opencv.hpp', 'sys/stat.h', 'sys/types.h', 'cuda.h', 'cuda_fp16.h', 'omp.h', 'onnx/onnx.pb.h', 'execinfo.h', 'packet/sse-inl.h', 'emmintrin.h', 'thrust/device_vector.h', 'cusolverDn.h', 'internal/concurrentqueue_internal_debug.h', 'relacy/relacy_std.hpp', - 'relacy_shims.h', 'ittnotify.h', 'shared_mutex', 'nvToolsExt.h', 'dmlc/build_config.h', + 'relacy_shims.h', 'ittnotify.h', 'nvToolsExt.h', 'dmlc/build_config.h', 'sys/isa_defs.h' ] diff --git a/cd/mxnet_lib/mxnet_lib_pipeline.groovy b/cd/mxnet_lib/mxnet_lib_pipeline.groovy index 0310dd991651..0c49bfa8e2c8 100644 --- a/cd/mxnet_lib/mxnet_lib_pipeline.groovy +++ b/cd/mxnet_lib/mxnet_lib_pipeline.groovy @@ -42,8 +42,7 @@ def get_pipeline(mxnet_variant, build_fn) { } } - // Add quantization tests for all cu variants except cu80 - if (mxnet_variant.startsWith('cu') && !mxnet_variant.startsWith('cu80')) { + if (mxnet_variant.startsWith('cu')) { tests["${mxnet_variant}: Quantization Python 3"] = { stage("${mxnet_variant}: Quantization Python 3") { timeout(time: max_time, unit: 'MINUTES') { @@ -76,10 +75,9 @@ def get_stash(mxnet_variant) { // The environment corresponds to the docker files in the 'docker' directory def get_environment(mxnet_variant) { if (mxnet_variant.startsWith("cu")) { - // Remove 'mkl' suffix from variant to properly format test environment - return "ubuntu_gpu_${mxnet_variant.replace('mkl', '')}" + return "publish.centos7_gpu_${mxnet_variant}" } - return "ubuntu_cpu" + return "publish.centos7_cpu" } // Returns the variant appropriate jenkins node test in which diff --git a/cd/mxnet_lib/static/Jenkins_pipeline.groovy b/cd/mxnet_lib/static/Jenkins_pipeline.groovy index abbafdbef075..61d18083e314 100644 --- a/cd/mxnet_lib/static/Jenkins_pipeline.groovy +++ b/cd/mxnet_lib/static/Jenkins_pipeline.groovy @@ -46,9 +46,7 @@ def build(mxnet_variant) { node(NODE_LINUX_CPU) { ws("workspace/mxnet_${libtype}/${mxnet_variant}/${env.BUILD_NUMBER}") { ci_utils.init_git() - // Compiling in Ubuntu14.04 due to glibc issues. - // This should be updates once we have clarity on this issue.
- ci_utils.docker_run('publish.ubuntu1404_cpu', "build_static_libmxnet ${mxnet_variant}", false) + ci_utils.docker_run('publish.centos7_cpu', "build_static_libmxnet ${mxnet_variant}", false) ci_utils.pack_lib("mxnet_${mxnet_variant}", libmxnet_pipeline.get_stash(mxnet_variant)) } } diff --git a/cd/python/docker/Jenkins_pipeline.groovy b/cd/python/docker/Jenkins_pipeline.groovy index 2911a6571288..693acc540874 100644 --- a/cd/python/docker/Jenkins_pipeline.groovy +++ b/cd/python/docker/Jenkins_pipeline.groovy @@ -32,10 +32,9 @@ def get_pipeline(mxnet_variant) { // The environment corresponds to the docker files in the 'docker' directory def get_environment(mxnet_variant) { if (mxnet_variant.startsWith("cu")) { - // Remove 'mkl' suffix from variant to properly format test environment - return "ubuntu_gpu_${mxnet_variant.replace('mkl', '')}" + return "publish.centos7_gpu_${mxnet_variant}" } - return "ubuntu_cpu" + return "publish.centos7_cpu" } diff --git a/cd/python/pypi/Jenkins_pipeline.groovy b/cd/python/pypi/Jenkins_pipeline.groovy index 125eb2c5c200..dfd864fa1a3b 100644 --- a/cd/python/pypi/Jenkins_pipeline.groovy +++ b/cd/python/pypi/Jenkins_pipeline.groovy @@ -35,11 +35,10 @@ def get_pipeline(mxnet_variant) { } def get_environment(mxnet_variant) { - def environment = "ubuntu_cpu" if (mxnet_variant.startsWith('cu')) { - environment = "ubuntu_gpu_${mxnet_variant}".replace("mkl", "") + return "publish.centos7_gpu_${mxnet_variant}" } - return environment + return "publish.centos7_cpu" } def build(mxnet_variant) { diff --git a/cd/python/pypi/pypi_package.sh b/cd/python/pypi/pypi_package.sh index fafd88e9742b..f9a0b1eb6906 100755 --- a/cd/python/pypi/pypi_package.sh +++ b/cd/python/pypi/pypi_package.sh @@ -18,7 +18,7 @@ set -ex -# variant = cpu, native, cu80, cu100, etc. +# variant = cpu, native, cu92, cu100, etc. export mxnet_variant=${1:?"Please specify the mxnet variant"} # Due to this PR: https://github.com/apache/incubator-mxnet/pull/14899 diff --git a/cd/utils/docker_tag.sh b/cd/utils/docker_tag.sh index e77cbe7856bf..b56e119f0130 100755 --- a/cd/utils/docker_tag.sh +++ b/cd/utils/docker_tag.sh @@ -24,7 +24,7 @@ is_release=${RELEASE_BUILD:-false} version=${VERSION:-nightly} # The docker tags will be in the form _(_mkl) -# Eg. nightly_cpu, 1.4.0_cpu_mkl, nightly_gpu_cu80_mkl, etc. +# Eg. nightly_cpu, 1.4.0_cpu_mkl, nightly_gpu_cu92_mkl, etc. if [[ ${mxnet_variant} == "cpu" ]]; then tag_suffix="cpu" diff --git a/cd/utils/mxnet_base_image.sh b/cd/utils/mxnet_base_image.sh index c87db661818c..1667d4c6f62a 100755 --- a/cd/utils/mxnet_base_image.sh +++ b/cd/utils/mxnet_base_image.sh @@ -21,9 +21,6 @@ mxnet_variant=${1:?"Please specify the mxnet variant as the first parameter"} case ${mxnet_variant} in - cu80*) - echo "nvidia/cuda:8.0-cudnn7-runtime-ubuntu16.04" - ;; cu90*) echo "nvidia/cuda:9.0-cudnn7-runtime-ubuntu16.04" ;; diff --git a/ci/README.md b/ci/README.md index 155a0104a125..7172bd955491 100644 --- a/ci/README.md +++ b/ci/README.md @@ -111,90 +111,37 @@ significantly. You can set this directory explicitly by setting CCACHE_DIR envir variable. All ccache instances are currently set to be 10 Gigabytes max in size. -## Testing with QEMU -To run the unit tests under qemu: -``` -./build.py -p armv7 && ./build.py -p test.arm_qemu ./runtime_functions.py run_ut_py3_qemu -``` - -To get a shell on the container and debug issues with the emulator itself, we build the container -and then execute it interactively. We can afterwards use port 2222 on the host to connect with SSH. 
- - -``` -ci/build.py -p test.arm_qemu -b && docker run -p2222:2222 -ti mxnetci/build.test.arm_qemu -``` +## Testing with ARM / Edge devices with QEMU -Then from another terminal: +We build on [QEMU](https://www.qemu.org/) and Linux [Kernel Support for +miscellaneous Binary +Formats](https://www.kernel.org/doc/html/v5.6/admin-guide/binfmt-misc.html) for +testing MXNet on edge devices. Test can be invoked with the same syntax as for +non-virtualized platforms: ``` -ssh -o StrictHostKeyChecking=no -p 2222 qemu@localhost +./build.py -p armv7 +./build.py -p test.armv7 /work/runtime_functions.sh unittest_ubuntu_python3_armv7 ``` -There are two pre-configured users: `root` and `qemu` both without passwords. - - -### Example of reproducing a test result with QEMU on ARM - - -You might want to enable a debug build first: - -``` -$ git diff -diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh -index 39631f9..666ceea 100755 ---- a/ci/docker/runtime_functions.sh -+++ b/ci/docker/runtime_functions.sh -@@ -172,6 +172,7 @@ build_armv7() { - -DUSE_LAPACK=OFF \ - -DBUILD_CPP_EXAMPLES=OFF \ - -Dmxnet_LINKER_LIBS=-lgfortran \ -+ -DCMAKE_BUILD_TYPE=Debug \ - -G Ninja /work/mxnet - - ninja -v +For the test step to succeed, you must run Linux kernel 4.8 or later and have qemu installed. +On Debian and Ubuntu systems, run the following command to install the dependencies: ``` +sudo apt install binfmt-support qemu-user-static -Then we build the project for armv7, the test container and start QEMU inside docker: - -``` -ci/build.py -p armv7 -ci/build.py -p test.arm_qemu -b && docker run -p2222:2222 -ti mxnetci/build.test.arm_qemu +# Use qemu-binfmt-conf.sh to register all binary types with the kernel +wget https://raw.githubusercontent.com/qemu/qemu/stable-4.1/scripts/qemu-binfmt-conf.sh +chmod +x qemu-binfmt-conf.sh +sudo ./qemu-binfmt-conf.sh --persistent yes --qemu-suffix "-static" --qemu-path "/usr/bin" --systemd ALL ``` - - -At this point we copy artifacts and sources to the VM, in another terminal (host) do the following: +If you run into segmentation faults at the beginning of the emulated tests, you +probably have a ancient version of Qemu on your system (or found a bug in +upstream Qemu). 
In that situation, you can rely on the +`multiarch/qemu-user-static` Docker project to register a set of up-to-date Qemu +binaries from their Docker image with your kernel: ``` -# Copy mxnet sources to the VM -rsync --delete -e 'ssh -p2222' --exclude='.git/' -zvaP ./ qemu@localhost:mxnet - - -# Ssh into the vm -ssh -p2222 qemu@localhost - -cd mxnet - -# Execute a single failing C++ test -build/tests/mxnet_unit_tests --gtest_filter="ACTIVATION_PERF.ExecuteBidirectional" - -# To install MXNet: -sudo pip3 install --upgrade --force-reinstall build/mxnet-1.3.1-py2.py3-none-any.whl - -# Execute a single python test: - -nosetests-3.4 -v -s tests/python/unittest/test_ndarray.py - - -# Debug with cgdb -sudo apt install -y libstdc++6-6-dbg -cgdb build/tests/mxnet_unit_tests - -(gdb) !pwd -/home/qemu/mxnet -(gdb) set substitute-path /work /home/qemu -(gdb) set substitute-path /build/gcc-6-6mK9AW/gcc-6-6.3.0/build/arm-linux-gnueabihf/libstdc++-v3/include/ /usr/include/c++/6/ -(gdb) r --gtest_filter="ACTIVATION_PERF.ExecuteBidirectional" +docker run --rm --privileged multiarch/qemu-user-static --reset -p yes ``` diff --git a/ci/build.py b/ci/build.py index a21ec44942a8..cbc41218f042 100755 --- a/ci/build.py +++ b/ci/build.py @@ -70,7 +70,8 @@ def get_docker_binary(use_nvidia_docker: bool) -> str: return "nvidia-docker" if use_nvidia_docker else "docker" -def build_docker(platform: str, docker_binary: str, registry: str, num_retries: int, no_cache: bool) -> str: +def build_docker(platform: str, docker_binary: str, registry: str, num_retries: int, no_cache: bool, + cache_intermediate: bool) -> str: """ Build a container for the given platform :param platform: Platform @@ -104,6 +105,8 @@ def build_docker(platform: str, docker_binary: str, registry: str, num_retries: "--build-arg", "GROUP_ID={}".format(os.getgid())] if no_cache: cmd.append("--no-cache") + if cache_intermediate: + cmd.append("--rm=false") elif registry: cmd.extend(["--cache-from", tag]) cmd.extend(["-t", tag, get_dockerfiles_path()]) @@ -330,6 +333,9 @@ def main() -> int: parser.add_argument("--no-cache", action="store_true", help="passes --no-cache to docker build") + parser.add_argument("--cache-intermediate", action="store_true", + help="passes --rm=false to docker build") + parser.add_argument("-e", "--environment", nargs="*", default=[], help="Environment variables for the docker container. " "Specify with a list containing either names or name=value") @@ -361,7 +367,8 @@ def main() -> int: load_docker_cache(tag=tag, docker_registry=args.docker_registry) if not args.run_only: build_docker(platform=platform, docker_binary=docker_binary, registry=args.docker_registry, - num_retries=args.docker_build_retries, no_cache=args.no_cache) + num_retries=args.docker_build_retries, no_cache=args.no_cache, + cache_intermediate=args.cache_intermediate) else: logging.info("Skipping docker build step.") diff --git a/ci/dev_menu.py b/ci/dev_menu.py index e9f031e1b171..962e4ecfe03f 100755 --- a/ci/dev_menu.py +++ b/ci/dev_menu.py @@ -167,7 +167,7 @@ def provision_virtualenv(venv_path=DEFAULT_PYENV): ('[Docker] Python3 ARMv7 unittests (QEMU)', [ "ci/build.py -p armv7", - "ci/build.py -p test.arm_qemu ./runtime_functions.py run_ut_py3_qemu" + "ci/build.py -p test.armv7 /work/runtime_functions.sh unittest_ubuntu_python3_armv7" ]), ('Clean (RESET HARD) repository (Warning! 
erases local changes / DATA LOSS)', Confirm("ci/docker/runtime_functions.sh clean_repo")) diff --git a/ci/docker/Dockerfile.build.android_armv7 b/ci/docker/Dockerfile.build.android_armv7 index 2c923a015b63..8d9fb6481e2e 100644 --- a/ci/docker/Dockerfile.build.android_armv7 +++ b/ci/docker/Dockerfile.build.android_armv7 @@ -18,62 +18,41 @@ # # Dockerfile to build MXNet for Android ARMv7 -FROM dockcross/base -MAINTAINER Pedro Larroy "pllarroy@amazon.com" - -# The cross-compiling emulator -RUN apt-get update && apt-get install -y \ - unzip - -ENV CROSS_TRIPLE=arm-linux-androideabi -ENV CROSS_ROOT=/usr/${CROSS_TRIPLE} -ENV AS=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-as \ - AR=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-ar \ - CC=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-gcc \ - CPP=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-cpp \ - CXX=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-g++ \ - LD=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-ld - -ENV ANDROID_NDK_REVISION 17b -ENV ANDROID_NDK_API 27 -ENV ANDROID_NDK_ARCH arm -WORKDIR /work/deps -COPY install/android_ndk.sh /work/deps -RUN /work/deps/android_ndk.sh - -ENV DEFAULT_DOCKCROSS_IMAGE dockcross/android-arm - -# Build-time metadata as defined at http://label-schema.org -ARG BUILD_DATE -ARG IMAGE -ARG VCS_REF -ARG VCS_URL -LABEL org.label-schema.build-date=$BUILD_DATE \ - org.label-schema.name=$IMAGE \ - org.label-schema.vcs-ref=$VCS_REF \ - org.label-schema.vcs-url=$VCS_URL \ - org.label-schema.schema-version="1.0" - - -ENV CC=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-clang -ENV CXX=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-clang++ - -WORKDIR /work/deps - -COPY install/deb_ubuntu_ccache.sh /work/ -RUN /work/deb_ubuntu_ccache.sh -WORKDIR /work -COPY install/ubuntu_arm.sh /work/ -RUN /work/ubuntu_arm.sh - -COPY install/arm_openblas.sh /work/ -COPY install/android_armv7_openblas.sh /work/deps -RUN /work/deps/android_armv7_openblas.sh - -ENV OpenBLAS_HOME=${CROSS_ROOT} -ENV OpenBLAS_DIR=${CROSS_ROOT} - -WORKDIR /work +FROM ubuntu:20.04 + +ENV ARCH=armv7l \ + HOSTCC=gcc \ + HOSTCXX=g++ \ + TARGET=ARMV7 + +WORKDIR /usr/local + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ + build-essential \ + ninja-build \ + cmake \ + ccache \ + git \ + curl \ + unzip \ + && rm -rf /var/lib/apt/lists/* + +RUN curl -o android-ndk-r19-linux-x86_64.zip -L https://dl.google.com/android/repository/android-ndk-r19-linux-x86_64.zip && \ + unzip android-ndk-r19-linux-x86_64.zip && \ + rm android-ndk-r19-linux-x86_64.zip +ENV CMAKE_TOOLCHAIN_FILE=/usr/local/android-ndk-r19/build/cmake/android.toolchain.cmake + +RUN git clone --recursive -b v0.3.9 https://github.com/xianyi/OpenBLAS.git && \ + mkdir /usr/local/openblas-android && \ + cd /usr/local/OpenBLAS && \ + export TOOLCHAIN=/usr/local/android-ndk-r19/toolchains/llvm/prebuilt/linux-x86_64 && \ + make NOFORTRAN=1 ARM_SOFTFP_ABI=1 NO_SHARED=1 \ + LDFLAGS="-L/usr/local/android-ndk-r19/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/lib/gcc/arm-linux-androideabi/4.9.x -lm" \ + CC=$TOOLCHAIN/bin/armv7a-linux-androideabi16-clang AR=$TOOLCHAIN/bin/arm-linux-androideabi-ar && \ + make PREFIX=/usr/local/openblas-android NO_SHARED=1 install && \ + cd /usr/local && \ + rm -rf OpenBLAS +ENV OpenBLAS_HOME=/usr/local/openblas-android ARG USER_ID=0 ARG GROUP_ID=0 @@ -81,5 +60,4 @@ COPY install/ubuntu_adduser.sh /work/ RUN /work/ubuntu_adduser.sh COPY runtime_functions.sh /work/ -WORKDIR /work/mxnet - +WORKDIR /work/build diff --git a/ci/docker/Dockerfile.build.android_armv8 b/ci/docker/Dockerfile.build.android_armv8 index ca62288129bb..a78113a33bae 100644 --- 
a/ci/docker/Dockerfile.build.android_armv8 +++ b/ci/docker/Dockerfile.build.android_armv8 @@ -18,62 +18,41 @@ # # Dockerfile to build MXNet for Android ARM64/ARMv8 -FROM dockcross/base -MAINTAINER Pedro Larroy "pllarroy@amazon.com" - -RUN apt-get update && apt-get install -y \ - unzip - -WORKDIR /work/deps - -# Build x86 dependencies. -COPY install/deb_ubuntu_ccache.sh /work/ -RUN /work/deb_ubuntu_ccache.sh - -# Setup Android cross-compilation environment. -ENV CROSS_TRIPLE=aarch64-linux-android -ENV CROSS_ROOT=/usr/${CROSS_TRIPLE} -ENV AS=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-as \ - AR=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-ar \ - CC=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-gcc \ - CPP=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-cpp \ - CXX=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-g++ \ - LD=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-ld - - -ENV DEFAULT_DOCKCROSS_IMAGE dockcross/android-arm - -# Build-time metadata as defined at http://label-schema.org -ARG BUILD_DATE -ARG IMAGE -ARG VCS_REF -ARG VCS_URL -LABEL org.label-schema.build-date=$BUILD_DATE \ - org.label-schema.name=$IMAGE \ - org.label-schema.vcs-ref=$VCS_REF \ - org.label-schema.vcs-url=$VCS_URL \ - org.label-schema.schema-version="1.0" - -ENV ARCH aarch64 -ENV ANDROID_NDK_REVISION 17b -ENV ANDROID_NDK_API 27 -ENV ANDROID_NDK_ARCH arm64 -WORKDIR /work/deps -COPY install/android_ndk.sh /work/deps -RUN /work/deps/android_ndk.sh - - -WORKDIR /work/deps -COPY install/android_ndk.sh /work/ -RUN /work/android_ndk.sh - -ENV CC=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-clang -ENV CXX=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-clang++ - -# Build ARM dependencies. -COPY install/android_arm64_openblas.sh /work/ -RUN /work/android_arm64_openblas.sh -ENV CPLUS_INCLUDE_PATH /work/deps/OpenBLAS +FROM ubuntu:20.04 + +ENV ARCH=aarch64 \ + HOSTCC=gcc \ + HOSTCXX=g++ \ + TARGET=ARMV8 + +WORKDIR /usr/local + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ + build-essential \ + ninja-build \ + cmake \ + ccache \ + git \ + curl \ + unzip \ + && rm -rf /var/lib/apt/lists/* + +RUN curl -o android-ndk-r19-linux-x86_64.zip -L https://dl.google.com/android/repository/android-ndk-r19-linux-x86_64.zip && \ + unzip android-ndk-r19-linux-x86_64.zip && \ + rm android-ndk-r19-linux-x86_64.zip +ENV CMAKE_TOOLCHAIN_FILE=/usr/local/android-ndk-r19/build/cmake/android.toolchain.cmake + +RUN git clone --recursive -b v0.3.9 https://github.com/xianyi/OpenBLAS.git && \ + mkdir /usr/local/openblas-android && \ + cd /usr/local/OpenBLAS && \ + export TOOLCHAIN=/usr/local/android-ndk-r19/toolchains/llvm/prebuilt/linux-x86_64 && \ + make NOFORTRAN=1 NO_SHARED=1 \ + LDFLAGS="-L/usr/local/android-ndk-r21/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/lib/gcc/aarch64-linux-android/4.9.x -lm" \ + CC=$TOOLCHAIN/bin/aarch64-linux-android21-clang AR=$TOOLCHAIN/bin/aarch64-linux-android-ar && \ + make PREFIX=/usr/local/openblas-android NO_SHARED=1 install && \ + cd /usr/local && \ + rm -rf OpenBLAS +ENV OpenBLAS_HOME=/usr/local/openblas-android ARG USER_ID=0 ARG GROUP_ID=0 @@ -81,5 +60,4 @@ COPY install/ubuntu_adduser.sh /work/ RUN /work/ubuntu_adduser.sh COPY runtime_functions.sh /work/ - WORKDIR /work/build diff --git a/ci/docker/Dockerfile.build.armv6 b/ci/docker/Dockerfile.build.armv6 index e6a7ffe758b9..83186369d829 100644 --- a/ci/docker/Dockerfile.build.armv6 +++ b/ci/docker/Dockerfile.build.armv6 @@ -18,25 +18,42 @@ # # Dockerfile to build MXNet for ARMv6 -FROM dockcross/linux-armv6 +FROM ubuntu:20.04 -ENV ARCH armv6l -ENV HOSTCC gcc -ENV TARGET ARMV6 +ENV ARCH=armv6l \ + HOSTCC=gcc \ + 
HOSTCXX=g++ \ + TARGET=ARMV6 -WORKDIR /work/deps +WORKDIR /usr/local -COPY install/ubuntu_arm.sh /work/ -RUN /work/ubuntu_arm.sh +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ + build-essential \ + ninja-build \ + cmake \ + ccache \ + git \ + curl \ + zip \ + python3 \ + python3-pip \ + && rm -rf /var/lib/apt/lists/* -COPY install/arm_openblas.sh /work/ -RUN /work/arm_openblas.sh +# We use a toolchain from toolchains.bootlin.com instead of the Debian / Ubuntu +# crossbuild-essential-armel toolchain, as the latter targets ARM architecture +# versions 4T, 5T, and 6, whereas we only wish to target ARMV6 and would like to +# use ARMV6-specific features. https://wiki.debian.org/ArmEabiPort +RUN curl -o armv6-eabihf--glibc--stable-2020.02-2.tar.bz2 -L https://toolchains.bootlin.com/downloads/releases/toolchains/armv6-eabihf/tarballs/armv6-eabihf--glibc--stable-2020.02-2.tar.bz2 && \ + tar xf armv6-eabihf--glibc--stable-2020.02-2.tar.bz2 && \ + rm armv6-eabihf--glibc--stable-2020.02-2.tar.bz2 +ENV CMAKE_TOOLCHAIN_FILE=/usr/local/armv6-eabihf--glibc--stable-2020.02-2/share/buildroot/toolchainfile.cmake -ENV OpenBLAS_HOME=${CROSS_ROOT} -ENV OpenBLAS_DIR=${CROSS_ROOT} - -COPY install/deb_ubuntu_ccache.sh /work/ -RUN /work/deb_ubuntu_ccache.sh +RUN git clone --recursive -b v0.3.9 https://github.com/xianyi/OpenBLAS.git && \ + cd /usr/local/OpenBLAS && \ + make NOFORTRAN=1 NO_SHARED=1 CC=/usr/local/armv6-eabihf--glibc--stable-2020.02-2/bin/arm-linux-gcc && \ + make PREFIX=/usr/local/armv6-eabihf--glibc--stable-2020.02-2/arm-buildroot-linux-gnueabihf/sysroot NO_SHARED=1 install && \ + cd /usr/local && \ + rm -rf OpenBLAS ARG USER_ID=0 ARG GROUP_ID=0 diff --git a/ci/docker/Dockerfile.build.armv7 b/ci/docker/Dockerfile.build.armv7 index bad9ab214050..d207d79485ae 100644 --- a/ci/docker/Dockerfile.build.armv7 +++ b/ci/docker/Dockerfile.build.armv7 @@ -16,27 +16,39 @@ # specific language governing permissions and limitations # under the License.
# -# Dockerfile to build MXNet for Android ARMv7 - -FROM dockcross/linux-armv7 - -ENV ARCH armv7l -ENV HOSTCC gcc -ENV TARGET ARMV7 - -WORKDIR /work/deps - -COPY install/ubuntu_arm.sh /work/ -RUN /work/ubuntu_arm.sh - -COPY install/arm_openblas.sh /work/ -RUN /work/arm_openblas.sh - -ENV OpenBLAS_HOME=${CROSS_ROOT} -ENV OpenBLAS_DIR=${CROSS_ROOT} - -COPY install/deb_ubuntu_ccache.sh /work/ -RUN /work/deb_ubuntu_ccache.sh +# Dockerfile to build MXNet for ARMv7 + +FROM ubuntu:20.04 + +ENV ARCH=armv7l \ + HOSTCC=gcc \ + HOSTCXX=g++ \ + TARGET=ARMV7 + +WORKDIR /usr/local + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ + build-essential \ + ninja-build \ + cmake \ + ccache \ + git \ + curl \ + zip \ + python3 \ + python3-pip \ + crossbuild-essential-armhf \ + && rm -rf /var/lib/apt/lists/* + +COPY toolchains/arm-linux-gnueabihf-toolchain.cmake /usr/local +ENV CMAKE_TOOLCHAIN_FILE=/usr/local/arm-linux-gnueabihf-toolchain.cmake + +RUN git clone --recursive -b v0.3.9 https://github.com/xianyi/OpenBLAS.git && \ + cd /usr/local/OpenBLAS && \ + make NOFORTRAN=1 NO_SHARED=1 CC=arm-linux-gnueabihf-gcc && \ + make PREFIX=/usr/local/arm-linux-gnueabihf NO_SHARED=1 install && \ + cd /usr/local && \ + rm -rf OpenBLAS ARG USER_ID=0 ARG GROUP_ID=0 diff --git a/ci/docker/Dockerfile.build.armv8 b/ci/docker/Dockerfile.build.armv8 index bd2373180f0b..d318cc2f02d4 100644 --- a/ci/docker/Dockerfile.build.armv8 +++ b/ci/docker/Dockerfile.build.armv8 @@ -18,29 +18,37 @@ # # Dockerfile to build MXNet for ARM64/ARMv8 -FROM dockcross/linux-arm64 - -ENV ARCH aarch64 -ENV HOSTCC gcc -ENV TARGET ARMV8 - -WORKDIR /work/deps - -# gh issue #11567 https://github.com/apache/incubator-mxnet/issues/11567 -#RUN sed -i '\#deb http://cdn-fastly.deb.debian.org/debian-security jessie/updates main#d' /etc/apt/sources.list -#RUN sed -i 's/cdn-fastly.//' /etc/apt/sources.list - -COPY install/ubuntu_arm.sh /work/ -RUN /work/ubuntu_arm.sh - -COPY install/arm_openblas.sh /work/ -RUN /work/arm_openblas.sh - -ENV OpenBLAS_HOME=${CROSS_ROOT} -ENV OpenBLAS_DIR=${CROSS_ROOT} - -COPY install/deb_ubuntu_ccache.sh /work/ -RUN /work/deb_ubuntu_ccache.sh +FROM ubuntu:20.04 + +ENV ARCH=aarch64 \ + HOSTCC=gcc \ + HOSTCXX=g++ \ + TARGET=ARMV8 + +WORKDIR /usr/local + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ + build-essential \ + ninja-build \ + cmake \ + ccache \ + git \ + curl \ + zip \ + python3 \ + python3-pip \ + crossbuild-essential-arm64 \ + && rm -rf /var/lib/apt/lists/* + +COPY toolchains/aarch64-linux-gnu-toolchain.cmake /usr +ENV CMAKE_TOOLCHAIN_FILE=/usr/aarch64-linux-gnu-toolchain.cmake + +RUN git clone --recursive -b v0.3.9 https://github.com/xianyi/OpenBLAS.git && \ + cd /usr/local/OpenBLAS && \ + make NOFORTRAN=1 NO_SHARED=1 CC=aarch64-linux-gnu-gcc && \ + make PREFIX=/usr/aarch64-linux-gnu NO_SHARED=1 install && \ + cd /usr/local && \ + rm -rf OpenBLAS ARG USER_ID=0 ARG GROUP_ID=0 @@ -48,4 +56,4 @@ COPY install/ubuntu_adduser.sh /work/ RUN /work/ubuntu_adduser.sh COPY runtime_functions.sh /work/ -WORKDIR /work/build +WORKDIR /work/mxnet diff --git a/ci/docker/Dockerfile.build.jetson b/ci/docker/Dockerfile.build.jetson index e31ee43a93d8..93fe5e0a5b0d 100644 --- a/ci/docker/Dockerfile.build.jetson +++ b/ci/docker/Dockerfile.build.jetson @@ -20,68 +20,58 @@ # This script assumes /work/mxnet exists and contains the mxnet code you wish to compile and # that /work/build exists and is the target for your output. 
-FROM nvidia/cuda:9.0-cudnn7-devel as cudabuilder +FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu18.04 -FROM dockcross/linux-arm64 +ENV ARCH=aarch64 \ + HOSTCC=gcc \ + TARGET=ARMV8 -ENV ARCH aarch64 -ENV HOSTCC gcc -ENV TARGET ARMV8 +WORKDIR /usr/local -# gh issue #11567 https://github.com/apache/incubator-mxnet/issues/11567 -#RUN sed -i '\#deb http://cdn-fastly.deb.debian.org/debian-security jessie/updates main#d' /etc/apt/sources.list -#RUN sed -i 's/cdn-fastly.//' /etc/apt/sources.list +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ + build-essential \ + ninja-build \ + git \ + curl \ + zip \ + unzip \ + python3 \ + python3-pip \ + awscli \ + crossbuild-essential-arm64 \ + && rm -rf /var/lib/apt/lists/* +# cmake on Ubuntu 18.04 is too old +RUN python3 -m pip install cmake -WORKDIR /work/deps - -COPY install/ubuntu_arm.sh /work/ -RUN /work/ubuntu_arm.sh - -COPY install/arm_openblas.sh /work/ -RUN /work/arm_openblas.sh - -ENV OpenBLAS_HOME=${CROSS_ROOT} -ENV OpenBLAS_DIR=${CROSS_ROOT} - +# ccache on Ubuntu 18.04 is too old to support Cuda correctly COPY install/deb_ubuntu_ccache.sh /work/ RUN /work/deb_ubuntu_ccache.sh -# Setup CUDA build env (including configuring and copying nvcc) -COPY --from=cudabuilder /usr/local/cuda /usr/local/cuda -ENV TARGET_ARCH aarch64 -ENV TARGET_OS linux +COPY toolchains/aarch64-linux-gnu-toolchain.cmake /usr +ENV CMAKE_TOOLCHAIN_FILE=/usr/aarch64-linux-gnu-toolchain.cmake + +RUN git clone --recursive -b v0.3.9 https://github.com/xianyi/OpenBLAS.git && \ + cd /usr/local/OpenBLAS && \ + make NOFORTRAN=1 CC=aarch64-linux-gnu-gcc && \ + make PREFIX=/usr/aarch64-linux-gnu install && \ + cd /usr/local && \ + rm -rf OpenBLAS -# Install ARM depedencies based on Jetpack 3.3 -RUN JETPACK_DOWNLOAD_PREFIX=https://developer.download.nvidia.com/devzone/devcenter/mobile/jetpack_l4t/3.3/lw.xd42/JetPackL4T_33_b39 && \ - CUDA_REPO_PREFIX=/var/cuda-repo-9-0-local && \ - ARM_CUDA_INSTALLER_PACKAGE=cuda-repo-l4t-9-0-local_9.0.252-1_arm64.deb && \ - ARM_CUDNN_INSTALLER_PACKAGE=libcudnn7_7.1.5.14-1+cuda9.0_arm64.deb && \ - ARM_CUDNN_DEV_INSTALLER_PACKAGE=libcudnn7-dev_7.1.5.14-1+cuda9.0_arm64.deb && \ - ARM_LICENSE_INSTALLER=cuda-license-9-0_9.0.252-1_arm64.deb && \ - ARM_CUBLAS_INSTALLER=cuda-cublas-9-0_9.0.252-1_arm64.deb && \ - ARM_NVINFER_INSTALLER_PACKAGE=libnvinfer4_4.1.3-1+cuda9.0_arm64.deb && \ - ARM_NVINFER_DEV_INSTALLER_PACKAGE=libnvinfer-dev_4.1.3-1+cuda9.0_arm64.deb && \ - dpkg --add-architecture arm64 && \ - wget -nv $JETPACK_DOWNLOAD_PREFIX/$ARM_CUDA_INSTALLER_PACKAGE && \ - wget -nv $JETPACK_DOWNLOAD_PREFIX/$ARM_CUDNN_INSTALLER_PACKAGE && \ - wget -nv $JETPACK_DOWNLOAD_PREFIX/$ARM_CUDNN_DEV_INSTALLER_PACKAGE && \ - wget -nv $JETPACK_DOWNLOAD_PREFIX/$ARM_NVINFER_INSTALLER_PACKAGE && \ - wget -nv $JETPACK_DOWNLOAD_PREFIX/$ARM_NVINFER_DEV_INSTALLER_PACKAGE && \ - dpkg -i --force-architecture $ARM_CUDA_INSTALLER_PACKAGE && \ - apt-key add $CUDA_REPO_PREFIX/7fa2af80.pub && \ - dpkg -i --force-architecture $ARM_CUDNN_INSTALLER_PACKAGE && \ - dpkg -i --force-architecture $ARM_CUDNN_DEV_INSTALLER_PACKAGE && \ - dpkg -i --force-architecture $CUDA_REPO_PREFIX/$ARM_LICENSE_INSTALLER && \ - dpkg -i --force-architecture $CUDA_REPO_PREFIX/$ARM_CUBLAS_INSTALLER && \ - dpkg -i --force-architecture $ARM_NVINFER_INSTALLER_PACKAGE && \ - dpkg -i --force-architecture $ARM_NVINFER_DEV_INSTALLER_PACKAGE && \ - apt update -y || true && apt install -y cuda-libraries-dev-9-0 libcudnn7-dev libnvinfer-dev -RUN ln -s /usr/include/aarch64-linux-gnu/cudnn_v7.h 
/usr/include/aarch64-linux-gnu/cudnn.h -ENV PATH $PATH:/usr/local/cuda/bin -ENV NVCCFLAGS "-m64" -ENV CUDA_ARCH "-gencode arch=compute_53,code=sm_53 -gencode arch=compute_62,code=sm_62" -ENV NVCC /usr/local/cuda/bin/nvcc +# Install aarch64 cross depedencies based on Jetpack 4.3 +# Manually downloaded using SDK Manager tool and placed in a private S3 bucket. +# We're not allowed to redistribute these files and there is no public version. +RUN aws s3 cp s3://mxnet-ci-prod-private-slave-data/nvidia/sdkm_downloads/cuda-repo-ubuntu1804-10-0-local-10.0.326-410.108_1.0-1_amd64.deb . && \ + dpkg -i cuda-repo-ubuntu1804-10-0-local-10.0.326-410.108_1.0-1_amd64.deb && \ + rm cuda-repo-ubuntu1804-10-0-local-10.0.326-410.108_1.0-1_amd64.deb && \ + apt-key add /var/cuda-repo-10-0-local-10.0.326-410.108/7fa2af80.pub && \ + aws s3 cp s3://mxnet-ci-prod-private-slave-data/nvidia/sdkm_downloads/cuda-repo-cross-aarch64-10-0-local-10.0.326_1.0-1_all.deb . && \ + dpkg -i cuda-repo-cross-aarch64-10-0-local-10.0.326_1.0-1_all.deb && \ + rm cuda-repo-cross-aarch64-10-0-local-10.0.326_1.0-1_all.deb && \ + apt-get update && \ + apt-get install -y -f && \ + apt-get install -y cuda-cross-aarch64 cuda-cross-aarch64-10-0 && \ + rm -rf /var/lib/apt/lists/* ARG USER_ID=0 ARG GROUP_ID=0 diff --git a/ci/docker/Dockerfile.build.test.arm_qemu b/ci/docker/Dockerfile.build.test.arm_qemu deleted file mode 100644 index 5dc610a524b0..000000000000 --- a/ci/docker/Dockerfile.build.test.arm_qemu +++ /dev/null @@ -1,47 +0,0 @@ -# -*- mode: dockerfile -*- -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -# Dockerfile to build and run MXNet on Ubuntu 16.04 for CPU - -FROM ubuntu:16.04 - -WORKDIR /work - -RUN apt-get update -COPY install/ubuntu_python.sh /work/ -COPY install/requirements /work/ -RUN /work/ubuntu_python.sh - -COPY install/ubuntu_arm_qemu.sh /work -RUN /work/ubuntu_arm_qemu.sh - -COPY install/ubuntu_arm_qemu_bin.sh /work -RUN /work/ubuntu_arm_qemu_bin.sh - -ARG USER_ID=0 -ARG GROUP_ID=0 -COPY install/ubuntu_adduser.sh /work/ -RUN /work/ubuntu_adduser.sh - -COPY runtime_functions.sh /work/ -COPY qemu/* /work/ - -# SSH to the Qemu VM -EXPOSE 2222/tcp - -CMD ["./runtime_functions.py","run_qemu_interactive"] diff --git a/ci/docker/Dockerfile.publish.ubuntu1404_gpu b/ci/docker/Dockerfile.build.test.armv7 similarity index 72% rename from ci/docker/Dockerfile.publish.ubuntu1404_gpu rename to ci/docker/Dockerfile.build.test.armv7 index 3a005cadecea..d49e7a5582c1 100644 --- a/ci/docker/Dockerfile.publish.ubuntu1404_gpu +++ b/ci/docker/Dockerfile.build.test.armv7 @@ -16,17 +16,21 @@ # specific language governing permissions and limitations # under the License. 
# -# Dockerfile to run MXNet on Ubuntu 14.04 for GPU +# Dockerfile to test MXNet on Ubuntu 20.04 ARMv7 CPU -FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu14.04 +FROM arm32v7/ubuntu:20.04 -WORKDIR /work/deps +WORKDIR /usr/local -COPY install/ubuntu_publish.sh /work/ -RUN /work/ubuntu_publish.sh - -COPY install/ubuntu_binutils.sh /work/ -RUN /work/ubuntu_binutils.sh +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ + python3 \ + python3-pip \ + python3-numpy \ + python3-scipy \ + python3-nose \ + python3-nose-timer \ + python3-requests \ + && rm -rf /var/lib/apt/lists/* ARG USER_ID=0 ARG GROUP_ID=0 @@ -34,6 +38,4 @@ COPY install/ubuntu_adduser.sh /work/ RUN /work/ubuntu_adduser.sh COPY runtime_functions.sh /work/ - -WORKDIR /work/mxnet -ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib +WORKDIR /work/mxnet \ No newline at end of file diff --git a/ci/docker/Dockerfile.publish.test.ubuntu1404_gpu b/ci/docker/Dockerfile.build.test.armv8 similarity index 72% rename from ci/docker/Dockerfile.publish.test.ubuntu1404_gpu rename to ci/docker/Dockerfile.build.test.armv8 index 854dd68a63c1..bee4d85c6a97 100644 --- a/ci/docker/Dockerfile.publish.test.ubuntu1404_gpu +++ b/ci/docker/Dockerfile.build.test.armv8 @@ -16,18 +16,21 @@ # specific language governing permissions and limitations # under the License. # -# Dockerfile to run MXNet on Ubuntu 14.04 for GPU +# Dockerfile to test MXNet on Ubuntu 20.04 ARMv8 CPU -# Use CPU with setup_gpu script -FROM ubuntu:14.04 +FROM arm64v8/ubuntu:20.04 -WORKDIR /work/deps +WORKDIR /usr/local -COPY install/ubuntu_base.sh /work/ -RUN /work/ubuntu_base.sh - -COPY install/ubuntu_scala.sh /work/ -RUN /work/ubuntu_scala.sh +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ + python3 \ + python3-pip \ + python3-numpy \ + python3-scipy \ + python3-nose \ + python3-nose-timer \ + python3-requests \ + && rm -rf /var/lib/apt/lists/* ARG USER_ID=0 ARG GROUP_ID=0 @@ -35,6 +38,4 @@ COPY install/ubuntu_adduser.sh /work/ RUN /work/ubuntu_adduser.sh COPY runtime_functions.sh /work/ - -WORKDIR /work/mxnet -ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib +WORKDIR /work/mxnet \ No newline at end of file diff --git a/ci/docker/Dockerfile.build.ubuntu_build_cuda b/ci/docker/Dockerfile.build.ubuntu_build_cuda index 07f67d178c2c..4f5d07a40236 100644 --- a/ci/docker/Dockerfile.build.ubuntu_build_cuda +++ b/ci/docker/Dockerfile.build.ubuntu_build_cuda @@ -35,15 +35,19 @@ RUN /work/ubuntu_python.sh COPY install/ubuntu_scala.sh /work/ COPY install/sbt.gpg /work/ RUN /work/ubuntu_scala.sh -COPY install/ubuntu_r.sh /work/ -COPY install/r.gpg /work/ -RUN /work/ubuntu_r.sh COPY install/ubuntu_perl.sh /work/ RUN /work/ubuntu_perl.sh COPY install/ubuntu_clang.sh /work/ RUN /work/ubuntu_clang.sh +COPY install/ubuntu_gcc8.sh /work/ +RUN /work/ubuntu_gcc8.sh COPY install/ubuntu_binutils.sh /work/ RUN /work/ubuntu_binutils.sh +COPY install/thrust.sh /work/ +RUN /work/thrust.sh +COPY install/ubuntu_r.sh /work/ +COPY install/r.gpg /work/ +RUN /work/ubuntu_r.sh ENV CUDNN_VERSION=7.6.5.32 COPY install/ubuntu_cudnn.sh /work/ @@ -51,6 +55,7 @@ RUN /work/ubuntu_cudnn.sh # Special case because the CPP-Package requires the CUDA runtime libs # and not only stubs (which are provided by the base image) +# This prevents usage of this image for actual GPU tests with Docker. 
COPY install/ubuntu_nvidia.sh /work/ RUN /work/ubuntu_nvidia.sh diff --git a/ci/docker/Dockerfile.build.ubuntu_cpu b/ci/docker/Dockerfile.build.ubuntu_cpu index b1eb89bb3f36..3c17b748e3ab 100644 --- a/ci/docker/Dockerfile.build.ubuntu_cpu +++ b/ci/docker/Dockerfile.build.ubuntu_cpu @@ -39,10 +39,6 @@ RUN /work/ubuntu_scala.sh COPY install/ubuntu_clojure.sh /work/ RUN /work/ubuntu_clojure.sh -COPY install/ubuntu_r.sh /work/ -COPY install/r.gpg /work/ -RUN /work/ubuntu_r.sh - COPY install/ubuntu_perl.sh /work/ RUN /work/ubuntu_perl.sh @@ -64,6 +60,10 @@ RUN /work/ubuntu_caffe.sh COPY install/ubuntu_onnx.sh /work/ RUN /work/ubuntu_onnx.sh +COPY install/ubuntu_r.sh /work/ +COPY install/r.gpg /work/ +RUN /work/ubuntu_r.sh + COPY install/ubuntu_docs.sh /work/ RUN /work/ubuntu_docs.sh diff --git a/ci/docker/Dockerfile.build.ubuntu_cpu_julia b/ci/docker/Dockerfile.build.ubuntu_cpu_julia index b1eb89bb3f36..3c17b748e3ab 100644 --- a/ci/docker/Dockerfile.build.ubuntu_cpu_julia +++ b/ci/docker/Dockerfile.build.ubuntu_cpu_julia @@ -39,10 +39,6 @@ RUN /work/ubuntu_scala.sh COPY install/ubuntu_clojure.sh /work/ RUN /work/ubuntu_clojure.sh -COPY install/ubuntu_r.sh /work/ -COPY install/r.gpg /work/ -RUN /work/ubuntu_r.sh - COPY install/ubuntu_perl.sh /work/ RUN /work/ubuntu_perl.sh @@ -64,6 +60,10 @@ RUN /work/ubuntu_caffe.sh COPY install/ubuntu_onnx.sh /work/ RUN /work/ubuntu_onnx.sh +COPY install/ubuntu_r.sh /work/ +COPY install/r.gpg /work/ +RUN /work/ubuntu_r.sh + COPY install/ubuntu_docs.sh /work/ RUN /work/ubuntu_docs.sh diff --git a/ci/docker/Dockerfile.build.ubuntu_cpu_r b/ci/docker/Dockerfile.build.ubuntu_cpu_r index 264d34cd6422..2354cb3b66d6 100644 --- a/ci/docker/Dockerfile.build.ubuntu_cpu_r +++ b/ci/docker/Dockerfile.build.ubuntu_cpu_r @@ -28,6 +28,9 @@ RUN /work/ubuntu_core.sh COPY install/deb_ubuntu_ccache.sh /work/ RUN /work/deb_ubuntu_ccache.sh +COPY install/ubuntu_gcc8.sh /work/ +RUN /work/ubuntu_gcc8.sh + COPY install/ubuntu_r.sh /work/ COPY install/r.gpg /work/ RUN /work/ubuntu_r.sh diff --git a/ci/docker/Dockerfile.build.ubuntu_cpu_scala b/ci/docker/Dockerfile.build.ubuntu_cpu_scala index 38874d290e1d..d0ce47784e27 100644 --- a/ci/docker/Dockerfile.build.ubuntu_cpu_scala +++ b/ci/docker/Dockerfile.build.ubuntu_cpu_scala @@ -28,6 +28,9 @@ RUN /work/ubuntu_core.sh COPY install/deb_ubuntu_ccache.sh /work/ RUN /work/deb_ubuntu_ccache.sh +COPY install/ubuntu_gcc8.sh /work/ +RUN /work/ubuntu_gcc8.sh + COPY install/ubuntu_python.sh /work/ COPY install/requirements /work/ RUN /work/ubuntu_python.sh diff --git a/ci/docker/Dockerfile.build.ubuntu_gpu_cu100 b/ci/docker/Dockerfile.build.ubuntu_gpu_cu100 deleted file mode 100644 index e35c64eeca5d..000000000000 --- a/ci/docker/Dockerfile.build.ubuntu_gpu_cu100 +++ /dev/null @@ -1,84 +0,0 @@ -# -*- mode: dockerfile -*- -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. -# -# Dockerfile to run MXNet on Ubuntu 16.04 for GPU - -FROM nvidia/cuda:10.0-devel-ubuntu16.04 - -WORKDIR /work/deps - -COPY install/ubuntu_core.sh /work/ -RUN /work/ubuntu_core.sh - -COPY install/deb_ubuntu_ccache.sh /work/ -RUN /work/deb_ubuntu_ccache.sh - -COPY install/ubuntu_python.sh /work/ -COPY install/requirements /work/ -RUN /work/ubuntu_python.sh - -COPY install/ubuntu_scala.sh /work/ -COPY install/sbt.gpg /work/ -RUN /work/ubuntu_scala.sh - -COPY install/ubuntu_r.sh /work/ -COPY install/r.gpg /work/ -RUN /work/ubuntu_r.sh - -COPY install/ubuntu_perl.sh /work/ -RUN /work/ubuntu_perl.sh - -COPY install/ubuntu_clang.sh /work/ -RUN /work/ubuntu_clang.sh - -COPY install/ubuntu_tvm.sh /work/ -RUN /work/ubuntu_tvm.sh - -COPY install/ubuntu_llvm.sh /work/ -RUN /work/ubuntu_llvm.sh - -COPY install/ubuntu_caffe.sh /work/ -RUN /work/ubuntu_caffe.sh - -COPY install/ubuntu_onnx.sh /work/ -RUN /work/ubuntu_onnx.sh - -COPY install/ubuntu_docs.sh /work/ -COPY install/requirements /work/ -RUN /work/ubuntu_docs.sh - -COPY install/ubuntu_tutorials.sh /work/ -RUN /work/ubuntu_tutorials.sh - -ENV CUDA_VERSION=10.0.130 -ENV CUDNN_VERSION=7.6.5.32 -COPY install/ubuntu_cudnn.sh /work/ -RUN /work/ubuntu_cudnn.sh - -COPY install/ubuntu_binutils.sh /work/ -RUN /work/ubuntu_binutils.sh - -# Always last -ARG USER_ID=0 -ARG GROUP_ID=0 -COPY install/ubuntu_adduser.sh /work/ -RUN /work/ubuntu_adduser.sh - -COPY runtime_functions.sh /work/ - -WORKDIR /work/mxnet diff --git a/ci/docker/Dockerfile.build.ubuntu_gpu_cu101 b/ci/docker/Dockerfile.build.ubuntu_gpu_cu101 index aa62fbc6307e..aa2fdba837e3 100644 --- a/ci/docker/Dockerfile.build.ubuntu_gpu_cu101 +++ b/ci/docker/Dockerfile.build.ubuntu_gpu_cu101 @@ -36,16 +36,15 @@ COPY install/ubuntu_scala.sh /work/ COPY install/sbt.gpg /work/ RUN /work/ubuntu_scala.sh -COPY install/ubuntu_r.sh /work/ -COPY install/r.gpg /work/ -RUN /work/ubuntu_r.sh - COPY install/ubuntu_perl.sh /work/ RUN /work/ubuntu_perl.sh COPY install/ubuntu_clang.sh /work/ RUN /work/ubuntu_clang.sh +COPY install/ubuntu_gcc8.sh /work/ +RUN /work/ubuntu_gcc8.sh + COPY install/ubuntu_tvm.sh /work/ RUN /work/ubuntu_tvm.sh @@ -70,9 +69,16 @@ ENV CUDNN_VERSION=7.6.5.32 COPY install/ubuntu_cudnn.sh /work/ RUN /work/ubuntu_cudnn.sh +COPY install/thrust.sh /work/ +RUN /work/thrust.sh + COPY install/ubuntu_binutils.sh /work/ RUN /work/ubuntu_binutils.sh +COPY install/ubuntu_r.sh /work/ +COPY install/r.gpg /work/ +RUN /work/ubuntu_r.sh + # Always last ARG USER_ID=0 ARG GROUP_ID=0 diff --git a/ci/docker/Dockerfile.build.ubuntu_gpu_cu102 b/ci/docker/Dockerfile.build.ubuntu_gpu_cu102 deleted file mode 100644 index 8badadbb1bdb..000000000000 --- a/ci/docker/Dockerfile.build.ubuntu_gpu_cu102 +++ /dev/null @@ -1,85 +0,0 @@ -# -*- mode: dockerfile -*- -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -# Dockerfile to run MXNet on Ubuntu 16.04 for GPU - -FROM nvidia/cuda:10.2-devel-ubuntu16.04 - -WORKDIR /work/deps - -COPY install/ubuntu_core.sh /work/ -RUN /work/ubuntu_core.sh - -COPY install/deb_ubuntu_ccache.sh /work/ -RUN /work/deb_ubuntu_ccache.sh - -COPY install/ubuntu_python.sh /work/ -COPY install/requirements /work/ -RUN /work/ubuntu_python.sh - -COPY install/ubuntu_scala.sh /work/ -COPY install/sbt.gpg /work/ -RUN /work/ubuntu_scala.sh - -COPY install/ubuntu_r.sh /work/ -COPY install/r.gpg /work/ -RUN /work/ubuntu_r.sh - -COPY install/ubuntu_perl.sh /work/ -RUN /work/ubuntu_perl.sh - -COPY install/ubuntu_clang.sh /work/ -RUN /work/ubuntu_clang.sh - -COPY install/ubuntu_tvm.sh /work/ -RUN /work/ubuntu_tvm.sh - -COPY install/ubuntu_llvm.sh /work/ -RUN /work/ubuntu_llvm.sh - -COPY install/ubuntu_caffe.sh /work/ -RUN /work/ubuntu_caffe.sh - -COPY install/ubuntu_onnx.sh /work/ -RUN /work/ubuntu_onnx.sh - -COPY install/ubuntu_docs.sh /work/ -COPY install/requirements /work/ -RUN /work/ubuntu_docs.sh - -COPY install/ubuntu_tutorials.sh /work/ -RUN /work/ubuntu_tutorials.sh - -ENV CUDA_VERSION=10.2.89 -ENV CUDNN_VERSION=7.6.5.32 -COPY install/ubuntu_cudnn.sh /work/ -RUN /work/ubuntu_cudnn.sh - -COPY install/ubuntu_binutils.sh /work/ -RUN /work/ubuntu_binutils.sh - -# Always last -ARG USER_ID=0 -ARG GROUP_ID=0 -COPY install/ubuntu_adduser.sh /work/ -RUN /work/ubuntu_adduser.sh - -COPY runtime_functions.sh /work/ - -WORKDIR /work/mxnet -ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/compat diff --git a/ci/docker/Dockerfile.build.ubuntu_gpu_cu80 b/ci/docker/Dockerfile.build.ubuntu_gpu_cu80 deleted file mode 100644 index 30971b0a5c6e..000000000000 --- a/ci/docker/Dockerfile.build.ubuntu_gpu_cu80 +++ /dev/null @@ -1,79 +0,0 @@ -# -*- mode: dockerfile -*- -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -# Dockerfile to run MXNet on Ubuntu 16.04 for GPU - -FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04 - -WORKDIR /work/deps - -COPY install/ubuntu_core.sh /work/ -RUN /work/ubuntu_core.sh - -COPY install/deb_ubuntu_ccache.sh /work/ -RUN /work/deb_ubuntu_ccache.sh - -COPY install/ubuntu_python.sh /work/ -COPY install/requirements /work/ -RUN /work/ubuntu_python.sh - -COPY install/ubuntu_scala.sh /work/ -COPY install/sbt.gpg /work/ -RUN /work/ubuntu_scala.sh - -COPY install/ubuntu_r.sh /work/ -COPY install/r.gpg /work/ -RUN /work/ubuntu_r.sh - -COPY install/ubuntu_perl.sh /work/ -RUN /work/ubuntu_perl.sh - -COPY install/ubuntu_clang.sh /work/ -RUN /work/ubuntu_clang.sh - -COPY install/ubuntu_tvm.sh /work/ -RUN /work/ubuntu_tvm.sh - -COPY install/ubuntu_llvm.sh /work/ -RUN /work/ubuntu_llvm.sh - -COPY install/ubuntu_caffe.sh /work/ -RUN /work/ubuntu_caffe.sh - -COPY install/ubuntu_onnx.sh /work/ -RUN /work/ubuntu_onnx.sh - -COPY install/ubuntu_docs.sh /work/ -COPY install/requirements /work/ -RUN /work/ubuntu_docs.sh - -COPY install/ubuntu_tutorials.sh /work/ -RUN /work/ubuntu_tutorials.sh - -COPY install/ubuntu_binutils.sh /work/ -RUN /work/ubuntu_binutils.sh - -ARG USER_ID=0 -ARG GROUP_ID=0 -COPY install/ubuntu_adduser.sh /work/ -RUN /work/ubuntu_adduser.sh - -COPY runtime_functions.sh /work/ - -WORKDIR /work/mxnet -ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib diff --git a/ci/docker/Dockerfile.build.ubuntu_gpu_cu90 b/ci/docker/Dockerfile.build.ubuntu_gpu_cu90 deleted file mode 100644 index cc50e7e55191..000000000000 --- a/ci/docker/Dockerfile.build.ubuntu_gpu_cu90 +++ /dev/null @@ -1,85 +0,0 @@ -# -*- mode: dockerfile -*- -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -# Dockerfile to run MXNet on Ubuntu 16.04 for GPU - -FROM nvidia/cuda:9.0-devel-ubuntu16.04 - -WORKDIR /work/deps - -COPY install/ubuntu_core.sh /work/ -RUN /work/ubuntu_core.sh - -COPY install/deb_ubuntu_ccache.sh /work/ -RUN /work/deb_ubuntu_ccache.sh - -COPY install/ubuntu_python.sh /work/ -COPY install/requirements /work/ -RUN /work/ubuntu_python.sh - -COPY install/ubuntu_scala.sh /work/ -COPY install/sbt.gpg /work/ -RUN /work/ubuntu_scala.sh - -COPY install/ubuntu_r.sh /work/ -COPY install/r.gpg /work/ -RUN /work/ubuntu_r.sh - -COPY install/ubuntu_perl.sh /work/ -RUN /work/ubuntu_perl.sh - -COPY install/ubuntu_clang.sh /work/ -RUN /work/ubuntu_clang.sh - -COPY install/ubuntu_tvm.sh /work/ -RUN /work/ubuntu_tvm.sh - -COPY install/ubuntu_llvm.sh /work/ -RUN /work/ubuntu_llvm.sh - -COPY install/ubuntu_caffe.sh /work/ -RUN /work/ubuntu_caffe.sh - -COPY install/ubuntu_onnx.sh /work/ -RUN /work/ubuntu_onnx.sh - -COPY install/ubuntu_docs.sh /work/ -COPY install/requirements /work/ -RUN /work/ubuntu_docs.sh - -COPY install/ubuntu_tutorials.sh /work/ -RUN /work/ubuntu_tutorials.sh - -ENV CUDA_VERSION=9.0.176 -ENV CUDNN_VERSION=7.6.5.32 -COPY install/ubuntu_cudnn.sh /work/ -RUN /work/ubuntu_cudnn.sh - -COPY install/ubuntu_binutils.sh /work/ -RUN /work/ubuntu_binutils.sh - -# Always last -ARG USER_ID=0 -ARG GROUP_ID=0 -COPY install/ubuntu_adduser.sh /work/ -RUN /work/ubuntu_adduser.sh - -COPY runtime_functions.sh /work/ - -WORKDIR /work/mxnet -ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib diff --git a/ci/docker/Dockerfile.build.ubuntu_gpu_cu92 b/ci/docker/Dockerfile.build.ubuntu_gpu_cu92 deleted file mode 100644 index 40a4f44abeb5..000000000000 --- a/ci/docker/Dockerfile.build.ubuntu_gpu_cu92 +++ /dev/null @@ -1,84 +0,0 @@ -# -*- mode: dockerfile -*- -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -# Dockerfile to run MXNet on Ubuntu 16.04 for GPU - -FROM nvidia/cuda:9.2-devel-ubuntu16.04 - -WORKDIR /work/deps - -COPY install/ubuntu_core.sh /work/ -RUN /work/ubuntu_core.sh - -COPY install/deb_ubuntu_ccache.sh /work/ -RUN /work/deb_ubuntu_ccache.sh - -COPY install/ubuntu_python.sh /work/ -COPY install/requirements /work/ -RUN /work/ubuntu_python.sh - -COPY install/ubuntu_scala.sh /work/ -COPY install/sbt.gpg /work/ -RUN /work/ubuntu_scala.sh - -COPY install/ubuntu_r.sh /work/ -COPY install/r.gpg /work/ -RUN /work/ubuntu_r.sh - -COPY install/ubuntu_perl.sh /work/ -RUN /work/ubuntu_perl.sh - -COPY install/ubuntu_clang.sh /work/ -RUN /work/ubuntu_clang.sh - -COPY install/ubuntu_tvm.sh /work/ -RUN /work/ubuntu_tvm.sh - -COPY install/ubuntu_llvm.sh /work/ -RUN /work/ubuntu_llvm.sh - -COPY install/ubuntu_caffe.sh /work/ -RUN /work/ubuntu_caffe.sh - -COPY install/ubuntu_onnx.sh /work/ -RUN /work/ubuntu_onnx.sh - -COPY install/ubuntu_docs.sh /work/ -COPY install/requirements /work/ -RUN /work/ubuntu_docs.sh - -COPY install/ubuntu_tutorials.sh /work/ -RUN /work/ubuntu_tutorials.sh - -ENV CUDA_VERSION=9.2.148 -ENV CUDNN_VERSION=7.6.5.32 -COPY install/ubuntu_cudnn.sh /work/ -RUN /work/ubuntu_cudnn.sh - -COPY install/ubuntu_binutils.sh /work/ -RUN /work/ubuntu_binutils.sh - -ARG USER_ID=0 -ARG GROUP_ID=0 -COPY install/ubuntu_adduser.sh /work/ -RUN /work/ubuntu_adduser.sh - -COPY runtime_functions.sh /work/ - -WORKDIR /work/mxnet -ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib diff --git a/ci/docker/Dockerfile.build.ubuntu_nightly_cpu b/ci/docker/Dockerfile.build.ubuntu_nightly_cpu index 5717df1b9130..49a665e57c33 100644 --- a/ci/docker/Dockerfile.build.ubuntu_nightly_cpu +++ b/ci/docker/Dockerfile.build.ubuntu_nightly_cpu @@ -36,10 +36,6 @@ COPY install/ubuntu_scala.sh /work/ COPY install/sbt.gpg /work/ RUN /work/ubuntu_scala.sh -COPY install/ubuntu_r.sh /work/ -COPY install/r.gpg /work/ -RUN /work/ubuntu_r.sh - COPY install/ubuntu_perl.sh /work/ RUN /work/ubuntu_perl.sh @@ -52,6 +48,10 @@ RUN /work/ubuntu_caffe.sh COPY install/ubuntu_onnx.sh /work/ RUN /work/ubuntu_onnx.sh +COPY install/ubuntu_r.sh /work/ +COPY install/r.gpg /work/ +RUN /work/ubuntu_r.sh + COPY install/ubuntu_docs.sh /work/ COPY install/requirements /work/ RUN /work/ubuntu_docs.sh diff --git a/ci/docker/Dockerfile.build.ubuntu_nightly_gpu b/ci/docker/Dockerfile.build.ubuntu_nightly_gpu index 5e812c433b43..82d049792c1b 100644 --- a/ci/docker/Dockerfile.build.ubuntu_nightly_gpu +++ b/ci/docker/Dockerfile.build.ubuntu_nightly_gpu @@ -36,10 +36,6 @@ COPY install/ubuntu_scala.sh /work/ COPY install/sbt.gpg /work/ RUN /work/ubuntu_scala.sh -COPY install/ubuntu_r.sh /work/ -COPY install/r.gpg /work/ -RUN /work/ubuntu_r.sh - COPY install/ubuntu_perl.sh /work/ RUN /work/ubuntu_perl.sh @@ -58,6 +54,10 @@ RUN /work/ubuntu_caffe.sh COPY install/ubuntu_onnx.sh /work/ RUN /work/ubuntu_onnx.sh +COPY install/ubuntu_r.sh /work/ +COPY install/r.gpg /work/ +RUN /work/ubuntu_r.sh + COPY install/ubuntu_docs.sh /work/ COPY install/requirements /work/ RUN /work/ubuntu_docs.sh diff --git a/ci/docker/Dockerfile.publish.test.ubuntu1404_cpu b/ci/docker/Dockerfile.publish.centos7_cpu similarity index 68% rename from ci/docker/Dockerfile.publish.test.ubuntu1404_cpu rename to ci/docker/Dockerfile.publish.centos7_cpu index 035837686554..2010238cb71d 100644 --- a/ci/docker/Dockerfile.publish.test.ubuntu1404_cpu +++ b/ci/docker/Dockerfile.publish.centos7_cpu @@ -16,24 +16,26 @@ # specific language governing permissions and limitations # under the 
License. # -# Dockerfile to build and run MXNet on Ubuntu 14.04 for CPU +# Dockerfile to build and run MXNet on CentOS 7 for CPU -FROM ubuntu:14.04 +FROM centos:7 WORKDIR /work/deps -COPY install/ubuntu_base.sh /work/ -RUN /work/ubuntu_base.sh - -COPY install/ubuntu_scala.sh /work/ -RUN /work/ubuntu_scala.sh +COPY install/centos7_base.sh /work/ +RUN /work/centos7_base.sh +COPY install/centos7_ccache.sh /work/ +RUN /work/centos7_ccache.sh +COPY install/centos7_python.sh /work/ +RUN /work/centos7_python.sh +COPY install/centos7_scala.sh /work/ +RUN /work/centos7_scala.sh ARG USER_ID=0 -ARG GROUP_ID=0 -COPY install/ubuntu_adduser.sh /work/ -RUN /work/ubuntu_adduser.sh - -COPY runtime_functions.sh /work/ +COPY install/centos7_adduser.sh /work/ +RUN /work/centos7_adduser.sh +ENV PYTHONPATH=./python/ WORKDIR /work/mxnet -ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib + +COPY runtime_functions.sh /work/ diff --git a/ci/docker/Dockerfile.publish.centos7_gpu_cu100 b/ci/docker/Dockerfile.publish.centos7_gpu_cu100 new file mode 100644 index 000000000000..f9469fcb186f --- /dev/null +++ b/ci/docker/Dockerfile.publish.centos7_gpu_cu100 @@ -0,0 +1,43 @@ +# -*- mode: dockerfile -*- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +FROM nvidia/cuda:10.0-cudnn7-devel-centos7 + +WORKDIR /work/deps + +COPY install/centos7_base.sh /work/ +RUN /work/centos7_base.sh +COPY install/centos7_ccache.sh /work/ +RUN /work/centos7_ccache.sh +COPY install/centos7_python.sh /work/ +RUN /work/centos7_python.sh +COPY install/centos7_scala.sh /work/ +RUN /work/centos7_scala.sh +ENV SHORT_CUDA_VERSION=10.0 +ENV SHORT_NCCL_VERSION=2.4.8 +COPY install/centos7_nccl.sh /work/ +RUN /work/centos7_nccl.sh + +ARG USER_ID=0 +COPY install/centos7_adduser.sh /work/ +RUN /work/centos7_adduser.sh + +ENV PYTHONPATH=./python/ +WORKDIR /work/mxnet + +COPY runtime_functions.sh /work/ diff --git a/ci/docker/Dockerfile.publish.centos7_gpu_cu101 b/ci/docker/Dockerfile.publish.centos7_gpu_cu101 new file mode 100644 index 000000000000..00be436c0412 --- /dev/null +++ b/ci/docker/Dockerfile.publish.centos7_gpu_cu101 @@ -0,0 +1,43 @@ +# -*- mode: dockerfile -*- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +FROM nvidia/cuda:10.1-cudnn7-devel-centos7 + +WORKDIR /work/deps + +COPY install/centos7_base.sh /work/ +RUN /work/centos7_base.sh +COPY install/centos7_ccache.sh /work/ +RUN /work/centos7_ccache.sh +COPY install/centos7_python.sh /work/ +RUN /work/centos7_python.sh +COPY install/centos7_scala.sh /work/ +RUN /work/centos7_scala.sh +ENV SHORT_CUDA_VERSION=10.1 +ENV SHORT_NCCL_VERSION=2.4.8 +COPY install/centos7_nccl.sh /work/ +RUN /work/centos7_nccl.sh + +ARG USER_ID=0 +COPY install/centos7_adduser.sh /work/ +RUN /work/centos7_adduser.sh + +ENV PYTHONPATH=./python/ +WORKDIR /work/mxnet + +COPY runtime_functions.sh /work/ diff --git a/ci/docker/Dockerfile.publish.centos7_gpu_cu102 b/ci/docker/Dockerfile.publish.centos7_gpu_cu102 new file mode 100644 index 000000000000..27a625e4641d --- /dev/null +++ b/ci/docker/Dockerfile.publish.centos7_gpu_cu102 @@ -0,0 +1,43 @@ +# -*- mode: dockerfile -*- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +FROM nvidia/cuda:10.2-cudnn7-devel-centos7 + +WORKDIR /work/deps + +COPY install/centos7_base.sh /work/ +RUN /work/centos7_base.sh +COPY install/centos7_ccache.sh /work/ +RUN /work/centos7_ccache.sh +COPY install/centos7_python.sh /work/ +RUN /work/centos7_python.sh +COPY install/centos7_scala.sh /work/ +RUN /work/centos7_scala.sh +ENV SHORT_CUDA_VERSION=10.2 +ENV SHORT_NCCL_VERSION=2.4.8 +COPY install/centos7_nccl.sh /work/ +RUN /work/centos7_nccl.sh + +ARG USER_ID=0 +COPY install/centos7_adduser.sh /work/ +RUN /work/centos7_adduser.sh + +ENV PYTHONPATH=./python/ +WORKDIR /work/mxnet + +COPY runtime_functions.sh /work/ diff --git a/ci/docker/Dockerfile.publish.ubuntu1404_cpu b/ci/docker/Dockerfile.publish.centos7_gpu_cu90 similarity index 63% rename from ci/docker/Dockerfile.publish.ubuntu1404_cpu rename to ci/docker/Dockerfile.publish.centos7_gpu_cu90 index 8ccc41b2143c..23217148f87c 100644 --- a/ci/docker/Dockerfile.publish.ubuntu1404_cpu +++ b/ci/docker/Dockerfile.publish.centos7_gpu_cu90 @@ -15,25 +15,29 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
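# Illustrative note (not part of the patch): the new publish.centos7_gpu_cu* images above all
# share the same layout -- centos7_base.sh, centos7_ccache.sh, centos7_python.sh, centos7_scala.sh,
# then centos7_nccl.sh driven by SHORT_CUDA_VERSION / SHORT_NCCL_VERSION. A local build of one of
# them might look like the sketch below; the image tag and the ci/docker build context are
# assumptions, not something this patch defines.
#
#   docker build -f ci/docker/Dockerfile.publish.centos7_gpu_cu102 \
#       --build-arg USER_ID="$(id -u)" \
#       -t mxnet-publish:centos7_gpu_cu102 ci/docker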
-# -# Dockerfile to build and run MXNet on Ubuntu 14.04 for CPU -FROM ubuntu:14.04 +FROM nvidia/cuda:9.0-cudnn7-devel-centos7 WORKDIR /work/deps -COPY install/ubuntu_publish.sh /work/ -RUN /work/ubuntu_publish.sh - -COPY install/ubuntu_binutils.sh /work/ -RUN /work/ubuntu_binutils.sh +COPY install/centos7_base.sh /work/ +RUN /work/centos7_base.sh +COPY install/centos7_ccache.sh /work/ +RUN /work/centos7_ccache.sh +COPY install/centos7_python.sh /work/ +RUN /work/centos7_python.sh +COPY install/centos7_scala.sh /work/ +RUN /work/centos7_scala.sh +ENV SHORT_CUDA_VERSION=9.0 +ENV SHORT_NCCL_VERSION=2.4.8 +COPY install/centos7_nccl.sh /work/ +RUN /work/centos7_nccl.sh ARG USER_ID=0 -ARG GROUP_ID=0 -COPY install/ubuntu_adduser.sh /work/ -RUN /work/ubuntu_adduser.sh - -COPY runtime_functions.sh /work/ +COPY install/centos7_adduser.sh /work/ +RUN /work/centos7_adduser.sh +ENV PYTHONPATH=./python/ WORKDIR /work/mxnet -ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib + +COPY runtime_functions.sh /work/ diff --git a/ci/docker/Dockerfile.publish.centos7_gpu_cu92 b/ci/docker/Dockerfile.publish.centos7_gpu_cu92 new file mode 100644 index 000000000000..75277f0f1fd2 --- /dev/null +++ b/ci/docker/Dockerfile.publish.centos7_gpu_cu92 @@ -0,0 +1,43 @@ +# -*- mode: dockerfile -*- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +FROM nvidia/cuda:9.2-cudnn7-devel-centos7 + +WORKDIR /work/deps + +COPY install/centos7_base.sh /work/ +RUN /work/centos7_base.sh +COPY install/centos7_ccache.sh /work/ +RUN /work/centos7_ccache.sh +COPY install/centos7_python.sh /work/ +RUN /work/centos7_python.sh +COPY install/centos7_scala.sh /work/ +RUN /work/centos7_scala.sh +ENV SHORT_CUDA_VERSION=9.2 +ENV SHORT_NCCL_VERSION=2.4.8 +COPY install/centos7_nccl.sh /work/ +RUN /work/centos7_nccl.sh + +ARG USER_ID=0 +COPY install/centos7_adduser.sh /work/ +RUN /work/centos7_adduser.sh + +ENV PYTHONPATH=./python/ +WORKDIR /work/mxnet + +COPY runtime_functions.sh /work/ diff --git a/ci/docker/install/android_armv7_openblas.sh b/ci/docker/install/android_armv7_openblas.sh deleted file mode 100755 index 55c098909654..000000000000 --- a/ci/docker/install/android_armv7_openblas.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# build and install are separated so changes to build don't invalidate -# the whole docker cache for the image - -set -ex -pushd . -git clone https://github.com/xianyi/OpenBLAS.git -cd OpenBLAS -make TARGET=ARMV7 HOSTCC=gcc NOFORTRAN=1 ARM_SOFTFP_ABI=1 -j$(nproc) libs -#make PREFIX=${CROSS_ROOT} TARGET=ARMV7 HOSTCC=gcc NOFORTRAN=1 ARM_SOFTFP_ABI=1 install -cp *.h ${CROSS_ROOT}/include -cp libopenblas*.a ${CROSS_ROOT}/lib -popd diff --git a/ci/docker/install/android_ndk.sh b/ci/docker/install/android_ndk.sh deleted file mode 100755 index cb83aa65639a..000000000000 --- a/ci/docker/install/android_ndk.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# build and install are separated so changes to build don't invalidate -# the whole docker cache for the image - -set -ex -pushd . -# This environment variable comes from the docker file -echo "Downloading android SDK rev ${ANDROID_NDK_REVISION}" -curl -O https://dl.google.com/android/repository/android-ndk-r${ANDROID_NDK_REVISION}-linux-x86_64.zip && \ -unzip ./android-ndk-r${ANDROID_NDK_REVISION}-linux-x86_64.zip && \ -cd android-ndk-r${ANDROID_NDK_REVISION} && \ -./build/tools/make_standalone_toolchain.py \ - --stl=libc++ \ - --arch ${ANDROID_NDK_ARCH}\ - --api ${ANDROID_NDK_API}\ - --install-dir=${CROSS_ROOT} && \ - -find ${CROSS_ROOT} -exec chmod a+r '{}' \; && \ -find ${CROSS_ROOT} -executable -exec chmod a+x '{}' \; -popd diff --git a/ci/docker/install/arm64_openblas.sh b/ci/docker/install/arm64_openblas.sh deleted file mode 100755 index 88f2e98cd65b..000000000000 --- a/ci/docker/install/arm64_openblas.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. - -# build and install are separated so changes to build don't invalidate -# the whole docker cache for the image - -set -ex -pushd . -wget -nv https://api.github.com/repos/xianyi/OpenBLAS/git/refs/heads/master -O openblas_version.json -echo "Using openblas:" -cat openblas_version.json -git clone https://github.com/xianyi/OpenBLAS.git -cd OpenBLAS -make -j$(nproc) TARGET=ARMV8 -make install -ln -s /opt/OpenBLAS/lib/libopenblas.so /usr/lib/libopenblas.so -ln -s /opt/OpenBLAS/lib/libopenblas.a /usr/lib/libopenblas.a -ln -s /opt/OpenBLAS/lib/libopenblas.a /usr/lib/liblapack.a -popd diff --git a/ci/docker/install/centos7_base.sh b/ci/docker/install/centos7_base.sh index c5f860e6e7a7..72896cbc42ad 100755 --- a/ci/docker/install/centos7_base.sh +++ b/ci/docker/install/centos7_base.sh @@ -30,9 +30,17 @@ yum -y install make yum -y install unzip yum -y install ninja-build yum -y install gcc-gfortran +yum -y install automake +yum -y install autoconf +yum -y install libtool yum -y install protobuf-compiler yum -y install protobuf-devel yum -y install zeromq-devel +yum -y install patchelf + +# gcc7 +yum -y install centos-release-scl +yum -y install devtoolset-7 # Centos 7 only provides ninja-build ln -s /usr/bin/ninja-build /usr/bin/ninja diff --git a/ci/docker/install/centos7_ccache.sh b/ci/docker/install/centos7_ccache.sh index 19f7cefec3ad..955287b228e8 100755 --- a/ci/docker/install/centos7_ccache.sh +++ b/ci/docker/install/centos7_ccache.sh @@ -23,19 +23,17 @@ set -ex pushd . -yum -y install autoconf libb2-devel libzstd-devel +yum -y install autoconf libb2-devel libzstd-devel gperf mkdir -p /work/deps cd /work/deps git clone --recursive https://github.com/ccache/ccache.git cd ccache -# Checkout a fixed & tested pre-release commit of ccache 4 -# ccache 4 contains fixes for caching nvcc output: https://github.com/ccache/ccache/pull/381 -git checkout 2e7154e67a5dd56852dae29d4c418d4ddc07c230 +git checkout v3.7.9 ./autogen.sh -CXXFLAGS="-Wno-missing-field-initializers" ./configure --disable-man +./configure --disable-man make -j$(nproc) make install diff --git a/ci/docker/install/centos7_core.sh b/ci/docker/install/centos7_core.sh index fbdb239cf0c2..7f1c3d70aebc 100755 --- a/ci/docker/install/centos7_core.sh +++ b/ci/docker/install/centos7_core.sh @@ -39,6 +39,14 @@ yum -y install make yum -y install wget yum -y install unzip yum -y install ninja-build +yum -y install automake +yum -y install autoconf +yum -y install libtool +yum -y install patchelf + +# gcc7 +yum -y install centos-release-scl +yum -y install devtoolset-7 # Centos 7 only provides ninja-build ln -s /usr/bin/ninja-build /usr/bin/ninja diff --git a/ci/docker/install/arm_openblas.sh b/ci/docker/install/centos7_nccl.sh similarity index 53% rename from ci/docker/install/arm_openblas.sh rename to ci/docker/install/centos7_nccl.sh index fa2e5cae9cba..7a14f104b328 100755 --- a/ci/docker/install/arm_openblas.sh +++ b/ci/docker/install/centos7_nccl.sh @@ -19,12 +19,17 @@ set -ex -git clone --recursive -b v0.2.20 https://github.com/xianyi/OpenBLAS.git -cd OpenBLAS -make -j$(nproc) -PREFIX=${CROSS_ROOT} make install +if [ -z ${SHORT_CUDA_VERSION} ]; then + echo "Error: SHORT_CUDA_VERSION environment variable undefined" + exit 1 +fi +if [ -z ${SHORT_NCCL_VERSION} ]; then + echo "Error: SHORT_NCCL_VERSION environment variable undefined" + exit 1 +fi -cd .. 
- -rm -rf OpenBLAS +curl -fsSL https://developer.download.nvidia.com/compute/machine-learning/repos/rhel7/x86_64/nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm -O +rpm -i nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm +yum check-update || true # exit code 100 in case of available updates +yum install -y libnccl-${SHORT_NCCL_VERSION}-1+cuda${SHORT_CUDA_VERSION} libnccl-devel-${SHORT_NCCL_VERSION}-1+cuda${SHORT_CUDA_VERSION} libnccl-static-${SHORT_NCCL_VERSION}-1+cuda${SHORT_CUDA_VERSION} diff --git a/ci/docker/install/deb_ubuntu_ccache.sh b/ci/docker/install/deb_ubuntu_ccache.sh index cdc9354e220f..ef913ba36e55 100755 --- a/ci/docker/install/deb_ubuntu_ccache.sh +++ b/ci/docker/install/deb_ubuntu_ccache.sh @@ -23,7 +23,7 @@ set -ex pushd . -apt update || true +apt update apt install -y \ autoconf \ gperf \ @@ -32,31 +32,9 @@ apt install -y \ mkdir -p /work/deps cd /work/deps -# Unset ARM toolchain cross-compilation configuration on dockcross -unset ARCH -unset DEFAULT_DOCKCROSS_IMAGE -unset CROSS_TRIPLE -unset CC -unset AS -unset AR -unset FC -unset CXX -unset CROSS_ROOT -unset CROSS_COMPILE -unset PKG_CONFIG_PATH -unset CMAKE_TOOLCHAIN_FILE -unset CPP -unset LD -export PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin - git clone --recursive https://github.com/ccache/ccache.git cd ccache -git checkout v3.7.8 -# Backport cuda related fixes: https://github.com/ccache/ccache/pull/381 -git config user.name "MXNet CI" -git config user.email "MXNetCI@example.com" -git cherry-pick --strategy-option=theirs c4fffda031034f930df2cf188878b8f9160027df -git cherry-pick 0dec5c2df3e3ebc1fbbf33f74c992bef6264f37a +git checkout v3.7.9 ./autogen.sh ./configure --disable-man diff --git a/ci/docker/install/ubuntu_arm.sh b/ci/docker/install/thrust.sh similarity index 75% rename from ci/docker/install/ubuntu_arm.sh rename to ci/docker/install/thrust.sh index 608d0362f138..b307604dcd85 100755 --- a/ci/docker/install/ubuntu_arm.sh +++ b/ci/docker/install/thrust.sh @@ -17,12 +17,13 @@ # specific language governing permissions and limitations # under the License. -set -ex +# Install Thrust 1.9.8 to be shipped with Cuda 11. +# Fixes https://github.com/thrust/thrust/issues/1072 for Clang 10 +# This file can be deleted when using Cuda 11 on CI -apt update || true -apt install -y \ - unzip \ - python3 \ - python3-pip +set -ex -pip3 install setuptools +cd /usr/local +git clone https://github.com/thrust/thrust.git +cd thrust +git checkout 1.9.8 diff --git a/ci/docker/install/ubuntu_arm_qemu_bin.sh b/ci/docker/install/ubuntu_arm_qemu_bin.sh deleted file mode 100755 index d4f81185c169..000000000000 --- a/ci/docker/install/ubuntu_arm_qemu_bin.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
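# Illustrative sketch (not part of the patch): centos7_nccl.sh above pins NCCL to the CUDA toolkit
# of each image through two environment variables that the publish Dockerfiles set per image.
# The values below are the ones used for the cu101 image; the echo just shows how they expand
# into the pinned yum package name.
SHORT_CUDA_VERSION=10.1
SHORT_NCCL_VERSION=2.4.8
echo "libnccl-${SHORT_NCCL_VERSION}-1+cuda${SHORT_CUDA_VERSION}"   # -> libnccl-2.4.8-1+cuda10.1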
- -# build and install are separated so changes to build don't invalidate -# the whole docker cache for the image - -set -exuo pipefail - -# -# This disk image and kernels for virtual testing with QEMU is generated with some manual OS -# installation steps with the scripts and documentation found in the ci/qemu/ folder. -# -# The image has a base Debian OS and MXNet runtime dependencies installed. -# The root password is empty and there's a "qemu" user without password. SSH access is enabled as -# well. -# -# See also: ci/qemu/README.md -# - -REMOTE="https://s3-us-west-2.amazonaws.com/mxnet-ci-prod-slave-data" -curl -f ${REMOTE}/vda_debian_stretch.qcow2.bz2 | bunzip2 > vda.qcow2 -curl -f ${REMOTE}/vmlinuz -o vmlinuz -curl -f ${REMOTE}/initrd.img -o initrd.img - diff --git a/ci/docker/install/ubuntu_gcc8.sh b/ci/docker/install/ubuntu_gcc8.sh index cd31f8213c1a..e0f2986e101f 100755 --- a/ci/docker/install/ubuntu_gcc8.sh +++ b/ci/docker/install/ubuntu_gcc8.sh @@ -20,4 +20,4 @@ sudo add-apt-repository ppa:jonathonf/gcc-8.0 sudo add-apt-repository ppa:jonathonf/gcc-7.3 sudo apt-get update || true -sudo apt-get install -y gcc-8 g++-8 +sudo apt-get install -y gcc-8 g++-8 gcc-7 g++-7 diff --git a/ci/docker/install/ubuntu_publish.sh b/ci/docker/install/ubuntu_publish.sh deleted file mode 100755 index 4690a2c3dfad..000000000000 --- a/ci/docker/install/ubuntu_publish.sh +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
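# Illustrative sketch (not part of the patch): the gcc-7/g++-7 packages that ubuntu_gcc8.sh now
# installs match the compiler selection applied throughout runtime_functions.sh later in this
# patch, either exported for the make-based builds or passed inline to cmake.
export CC=gcc-7    # picked up by the make-based build functions
export CXX=g++-7
# cmake-based functions pass the same compilers inline, e.g.:
#   CC=gcc-7 CXX=g++-7 cmake -G Ninja /work/mxnet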
- -# Build on Ubuntu 14.04 LTS for LINUX CPU/GPU -set -ex - -# replace https with http to force apt-get update to use http -# nvidia-docker no longer supports ubuntu 14.04 -# refer https://github.com/apache/incubator-mxnet/issues/18005 -sudo sed -i 's/https/http/g' /etc/apt/sources.list.d/*.list -apt-get update -apt-get install -y software-properties-common -add-apt-repository ppa:ubuntu-toolchain-r/test -y -add-apt-repository ppa:openjdk-r/ppa -y # Java lib -apt-get update -apt-get install -y git \ - cmake3 \ - ninja-build \ - libcurl4-openssl-dev \ - unzip \ - gcc-4.8 \ - g++-4.8 \ - gfortran \ - gfortran-4.8 \ - binutils \ - nasm \ - libtool \ - curl \ - wget \ - sudo \ - gnupg \ - gnupg2 \ - gnupg-agent \ - pandoc \ - python3-pip \ - automake \ - pkg-config \ - openjdk-8-jdk - -curl -o apache-maven-3.3.9-bin.tar.gz -L http://www.eu.apache.org/dist/maven/maven-3/3.3.9/binaries/apache-maven-3.3.9-bin.tar.gz \ - || curl -o apache-maven-3.3.9-bin.tar.gz -L https://search.maven.org/remotecontent?filepath=org/apache/maven/apache-maven/3.3.9/apache-maven-3.3.9-bin.tar.gz - -tar xzf apache-maven-3.3.9-bin.tar.gz -mkdir /usr/local/maven -mv apache-maven-3.3.9/ /usr/local/maven/ -update-alternatives --install /usr/bin/mvn mvn /usr/local/maven/apache-maven-3.3.9/bin/mvn 1 -update-ca-certificates -f - -# patchelf available starting Ubuntu 16.04; compile from source for Ubuntu 14.04 -mkdir /usr/local/patchelf -cd /usr/local/patchelf -curl -L -o patchelf-0.10.tar.gz https://github.com/NixOS/patchelf/archive/0.10.tar.gz -tar xzf patchelf-0.10.tar.gz -cd /usr/local/patchelf/patchelf-0.10 -./bootstrap.sh -./configure -make -sudo make install -cd / - -apt-get install -y python python-pip python3 python3-pip - -# the version of the pip shipped with ubuntu may be too lower, install a recent version here -# Restrict pip version to <19 due to use of Python 3.4 on Ubuntu 14.04 -python3 -m pip install --upgrade 'pip<19' - -# Restrict numpy version to <1.18 due to use of Python 3.4 on Ubuntu 14.04 -python3 -m pip install --upgrade --ignore-installed nose cpplint==1.3.0 pylint==2.3.1 'numpy>1.16.0,<1.18' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3 - -# CMake 3.13.2+ is required -mkdir /opt/cmake && cd /opt/cmake -wget -nv https://cmake.org/files/v3.13/cmake-3.13.5-Linux-x86_64.sh -sh cmake-3.13.5-Linux-x86_64.sh --prefix=/opt/cmake --skip-license -ln -s /opt/cmake/bin/cmake /usr/local/bin/cmake -rm cmake-3.13.5-Linux-x86_64.sh -cmake --version diff --git a/ci/docker/install/ubuntu_r.sh b/ci/docker/install/ubuntu_r.sh index b7ddea78f90a..44ebf7c0799e 100755 --- a/ci/docker/install/ubuntu_r.sh +++ b/ci/docker/install/ubuntu_r.sh @@ -44,4 +44,7 @@ apt-get install -y --allow-unauthenticated \ r-base-dev \ texinfo \ texlive \ - texlive-fonts-extra + texlive-fonts-extra + +# Delete cran repository as it requires --allow-unauthenticated +find /etc/apt -name "*.list" | xargs sed -i 's/.*cran\.rstudio.com.*//' diff --git a/ci/docker/install/ubuntu_scala.sh b/ci/docker/install/ubuntu_scala.sh index d223b8e173ae..355e978e075c 100755 --- a/ci/docker/install/ubuntu_scala.sh +++ b/ci/docker/install/ubuntu_scala.sh @@ -21,33 +21,11 @@ # the whole docker cache for the image set -ex -cd "$(dirname "$0")" -# install libraries for mxnet's scala package on ubuntu -echo 'Installing Scala...' 
-# Ubuntu 14.04 -if [[ $(lsb_release -r | grep 14.04) ]]; then - add-apt-repository -y ppa:openjdk-r/ppa -fi - -# All Ubuntu apt-get update || true apt-get install -y \ openjdk-8-jdk \ openjdk-8-jre \ software-properties-common \ - scala - -# Ubuntu 14.04 -if [[ $(lsb_release -r | grep 14.04) ]]; then - curl -o apache-maven-3.3.9-bin.tar.gz -L http://www.eu.apache.org/dist/maven/maven-3/3.3.9/binaries/apache-maven-3.3.9-bin.tar.gz \ - || curl -o apache-maven-3.3.9-bin.tar.gz -L https://search.maven.org/remotecontent?filepath=org/apache/maven/apache-maven/3.3.9/apache-maven-3.3.9-bin.tar.gz - - tar xzf apache-maven-3.3.9-bin.tar.gz - mkdir /usr/local/maven - mv apache-maven-3.3.9/ /usr/local/maven/ - update-alternatives --install /usr/bin/mvn mvn /usr/local/maven/apache-maven-3.3.9/bin/mvn 1 - update-ca-certificates -f -else - apt-get install -y maven -fi + scala \ + maven diff --git a/ci/docker/qemu/README.md b/ci/docker/qemu/README.md deleted file mode 100644 index c06b34562b57..000000000000 --- a/ci/docker/qemu/README.md +++ /dev/null @@ -1,18 +0,0 @@ - - - - - - - - - - - - - - - - - -These are files used in the docker container that runs QEMU diff --git a/ci/docker/qemu/runtime_functions.py b/ci/docker/qemu/runtime_functions.py deleted file mode 100755 index 5a57cb8dae6a..000000000000 --- a/ci/docker/qemu/runtime_functions.py +++ /dev/null @@ -1,134 +0,0 @@ -#!/usr/bin/env python3 - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -# -*- coding: utf-8 -*- -"""Runtime functions to use in docker / testing""" - -__author__ = 'Pedro Larroy' -__version__ = '0.1' - -import os -import sys -import subprocess -import argparse -import logging -from subprocess import call, check_call, Popen, DEVNULL, PIPE -import time -import sys -import types -import glob -import vmcontrol -from vmcontrol import qemu_ssh, qemu_provision, qemu_rsync_to_host, VM - -def activate_this(base): - import site - import os - import sys - if sys.platform == 'win32': - site_packages = os.path.join(base, 'Lib', 'site-packages') - else: - site_packages = os.path.join(base, 'lib', 'python%s' % sys.version[:3], 'site-packages') - prev_sys_path = list(sys.path) - sys.real_prefix = sys.prefix - sys.prefix = base - # Move the added items to the front of the path: - new_sys_path = [] - for item in list(sys.path): - if item not in prev_sys_path: - new_sys_path.append(item) - sys.path.remove(item) - sys.path[:0] = new_sys_path - - - - -def run_ut_py3_qemu(): - """Run unit tests in the emulator and copy the results back to the host through the mounted - volume in /mxnet""" - from vmcontrol import VM - with VM() as vm: - qemu_provision(vm.ssh_port) - logging.info("execute tests") - qemu_ssh(vm.ssh_port, "./runtime_functions.py", "run_ut_python3_qemu_internal") - qemu_rsync_to_host(vm.ssh_port, "*.xml", "mxnet") - logging.info("copied to host") - logging.info("tests finished, vm shutdown.") - vm.shutdown() - -def run_ut_python3_qemu_internal(): - """this runs inside the vm""" - pkg = glob.glob('mxnet_dist/*.whl')[0] - logging.info("=== NOW Running inside QEMU ===") - logging.info("PIP Installing %s", pkg) - check_call(['sudo', 'pip3', 'install', pkg]) - logging.info("PIP Installing mxnet/test_requirements.txt") - check_call(['sudo', 'pip3', 'install', '-r', 'mxnet/test_requirements.txt']) - logging.info("Running tests in mxnet/tests/python/unittest/") - check_call(['nosetests', '--with-timer', '--with-xunit', '--xunit-file', 'nosetests_unittest.xml', '--verbose', 'mxnet/tests/python/unittest/test_engine.py']) - # Example to run a single unit test: - # check_call(['nosetests', '--with-timer', '--with-xunit', '--xunit-file', 'nosetests_unittest.xml', '--verbose', 'mxnet/tests/python/unittest/test_ndarray.py:test_ndarray_fluent']) - - - -def run_qemu_interactive(): - vm = VM(interactive=True) - vm.detach() - vm.start() - vm.wait() - logging.info("QEMU finished") - -################################ - -def parsed_args(): - parser = argparse.ArgumentParser(description="""python runtime functions""", epilog="") - parser.add_argument('command',nargs='*', - help="Name of the function to run with arguments") - args = parser.parse_args() - return (args, parser) - -def script_name() -> str: - return os.path.split(sys.argv[0])[1] - -def chdir_to_script_directory(): - # We need to be in the same directory than the script so the commands in the dockerfiles work as - # expected. 
But the script can be invoked from a different path - base = os.path.split(os.path.realpath(__file__))[0] - os.chdir(base) - -def main(): - logging.getLogger().setLevel(logging.INFO) - logging.basicConfig(format='{}: %(asctime)-15s %(message)s'.format(script_name())) - chdir_to_script_directory() - - # Run function with name passed as argument - (args, parser) = parsed_args() - logging.info("%s", args.command) - if args.command: - fargs = args.command[1:] - globals()[args.command[0]](*fargs) - return 0 - else: - parser.print_help() - fnames = [x for x in globals() if type(globals()[x]) is types.FunctionType] - print('\nAvailable functions: {}'.format(' '.join(fnames))) - return 1 - -if __name__ == '__main__': - sys.exit(main()) - diff --git a/ci/docker/qemu/vmcontrol.py b/ci/docker/qemu/vmcontrol.py deleted file mode 100644 index 31ef4d2550c3..000000000000 --- a/ci/docker/qemu/vmcontrol.py +++ /dev/null @@ -1,360 +0,0 @@ -#!/usr/bin/env python3 - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# -*- coding: utf-8 -*- -"""Utilities to control a guest VM, used for virtual testing with QEMU""" - -__author__ = 'Pedro Larroy' -__version__ = '0.1' - -import os -import sys -import subprocess -import argparse -import logging -from subprocess import call, check_call, Popen, DEVNULL, PIPE -import time -import sys -import multiprocessing -import shlex - -################################################### -# -# Virtual testing with QEMU -# -# We start QEMU instances that have a local port in the host redirected to the ssh port. -# -# The VMs are provisioned after boot, tests are run and then they are stopped -# -QEMU_SSH_PORT=2222 -QEMU_RAM=4096 - -QEMU_RUN=""" -qemu-system-arm -M virt -m {ram} \ - -kernel vmlinuz \ - -initrd initrd.img \ - -append 'root=/dev/vda1' \ - -drive if=none,file=vda.qcow2,format=qcow2,id=hd \ - -device virtio-blk-device,drive=hd \ - -netdev user,id=mynet,hostfwd=tcp::{ssh_port}-:22 \ - -device virtio-net-device,netdev=mynet \ - -display none -nographic -""" - -QEMU_RUN_INTERACTIVE=""" -qemu-system-arm -M virt -m {ram} \ - -kernel vmlinuz \ - -initrd initrd.img \ - -append 'root=/dev/vda1' \ - -drive if=none,file=vda.qcow2,format=qcow2,id=hd \ - -device virtio-blk-device,drive=hd \ - -netdev user,id=mynet,hostfwd=tcp::{ssh_port}-:22 \ - -device virtio-net-device,netdev=mynet \ - -nographic -""" - -def retry(target_exception, tries=4, delay_s=1, backoff=2): - """Retry calling the decorated function using an exponential backoff. - - http://www.saltycrane.com/blog/2009/11/trying-out-retry-decorator-python/ - original from: http://wiki.python.org/moin/PythonDecoratorLibrary#Retry - - :param target_exception: the exception to check. 
may be a tuple of - exceptions to check - :type target_exception: Exception or tuple - :param tries: number of times to try (not retry) before giving up - :type tries: int - :param delay_s: initial delay between retries in seconds - :type delay_s: int - :param backoff: backoff multiplier e.g. value of 2 will double the delay - each retry - :type backoff: int - """ - import time - from functools import wraps - - def decorated_retry(f): - @wraps(f) - def f_retry(*args, **kwargs): - mtries, mdelay = tries, delay_s - while mtries > 1: - try: - return f(*args, **kwargs) - except target_exception as e: - logging.warning("Exception: %s, Retrying in %d seconds...", str(e), mdelay) - time.sleep(mdelay) - mtries -= 1 - mdelay *= backoff - return f(*args, **kwargs) - - return f_retry # true decorator - - return decorated_retry - - - - -class VMError(RuntimeError): - pass - -class VM: - """Control of the virtual machine""" - def __init__(self, ssh_port=QEMU_SSH_PORT, ram=QEMU_RAM, interactive=False): - self.log = logging.getLogger(VM.__name__) - self.ssh_port = ssh_port - self.timeout_s = 300 - self.qemu_process = None - self._detach = False - self._interactive = interactive - self.ram = ram - - def __enter__(self): - self.start() - return self - - def __exit__(self, exc_type, exc_value, traceback): - if not self._detach: - self.shutdown() - self.terminate() - - def start(self): - sys.stderr.flush() - call(['toilet', '-f', 'smbraille', 'Starting QEMU']) - sys.stdout.flush() - self.log.info("Starting VM, ssh port redirected to localhost:%s (inside docker, not exposed by default)", self.ssh_port) - if self.is_running(): - raise VMError("VM is running, shutdown first") - if self._interactive: - self.qemu_process = Popen(shlex.split(QEMU_RUN_INTERACTIVE.format(ssh_port=self.ssh_port, ram=self.ram))) - return - else: - self.log.info("Starting in non-interactive mode. 
Terminal output is disabled.") - self.qemu_process = Popen(shlex.split(QEMU_RUN.format(ssh_port=self.ssh_port, ram=self.ram)), stdout=DEVNULL, stdin=DEVNULL, stderr=PIPE) - def keep_waiting(): - return self.is_running() - - logging.info("waiting for ssh to be open in the VM (timeout {}s)".format(self.timeout_s)) - ssh_working = wait_ssh_open('127.0.0.1', self.ssh_port, keep_waiting, self.timeout_s) - - if not self.is_running(): - (_, stderr) = self.qemu_process.communicate() - raise VMError("VM failed to start, retcode: {}, stderr: {}".format( self.retcode(), stderr.decode())) - - if not ssh_working: - if self.is_running(): - self.log.error("VM running but SSH is not working") - self.terminate() - raise VMError("SSH is not working after {} seconds".format(self.timeout_s)) - self.log.info("VM is online and SSH is up") - - def is_running(self): - return self.qemu_process and self.qemu_process.poll() is None - - def retcode(self): - if self.qemu_process: - return self.qemu_process.poll() - else: - raise RuntimeError('qemu process was not started') - - def terminate(self): - if self.qemu_process: - logging.info("send term signal") - self.qemu_process.terminate() - time.sleep(3) - logging.info("send kill signal") - self.qemu_process.kill() - self.qemu_process.wait() - self.qemu_process = None - else: - logging.warn("VM.terminate: QEMU process not running") - - def detach(self): - self._detach = True - - def shutdown(self): - if self.qemu_process: - logging.info("Shutdown via ssh") - # ssh connection will be closed with an error - call(["ssh", "-o", "StrictHostKeyChecking=no", "-p", str(self.ssh_port), "qemu@localhost", - "sudo", "poweroff"]) - ret = self.qemu_process.wait(timeout=90) - self.log.info("VM on port %s has shutdown (exit code %d)", self.ssh_port, ret) - self.qemu_process = None - - def wait(self): - if self.qemu_process: - self.qemu_process.wait() - - def __del__(self): - if self.is_running and not self._detach: - logging.info("VM destructor hit") - self.terminate() - - -def qemu_ssh(ssh_port=QEMU_SSH_PORT, *args): - check_call(["ssh", "-o", "ServerAliveInterval=5", "-o", "StrictHostKeyChecking=no", "-p{}".format(ssh_port), "qemu@localhost", *args]) - - -def qemu_rsync(ssh_port, local_path, remote_path): - check_call(['rsync', '-e', 'ssh -o StrictHostKeyChecking=no -p{}'.format(ssh_port), '-a', local_path, 'qemu@localhost:{}'.format(remote_path)]) - -def qemu_rsync_to_host(ssh_port, remote_path, local_path): - check_call(['rsync', '-e', 'ssh -o StrictHostKeyChecking=no -p{}'.format(ssh_port), '-va', 'qemu@localhost:{}'.format(remote_path), local_path]) - - -@retry(subprocess.CalledProcessError) -def qemu_provision(ssh_port=QEMU_SSH_PORT): - import glob - logging.info("Provisioning the VM with artifacts and sources") - - artifact = glob.glob('/work/mxnet/build/*.whl') - for x in artifact: - qemu_rsync(ssh_port, x, 'mxnet_dist/') - qemu_rsync(ssh_port, '/work/runtime_functions.py','') - qemu_rsync(ssh_port, '/work/vmcontrol.py','') - qemu_rsync(ssh_port, 'mxnet/tests', 'mxnet') - qemu_rsync(ssh_port, 'mxnet/ci/qemu/test_requirements.txt', 'mxnet/test_requirements.txt') - logging.info("Provisioning completed successfully.") - - -def wait_ssh_open(server, port, keep_waiting=None, timeout=None): - """ Wait for network service to appear - @param server: host to connect to (str) - @param port: port (int) - @param timeout: in seconds, if None or 0 wait forever - @return: True of False, if timeout is None may return only True or - throw unhandled network exception - """ - import socket - 
import errno - import time - log = logging.getLogger('wait_ssh_open') - sleep_s = 1 - if timeout: - from time import time as now - # time module is needed to calc timeout shared between two exceptions - end = now() + timeout - - while True: - log.debug("Sleeping for %s second(s)", sleep_s) - time.sleep(sleep_s) - s = socket.socket() - try: - if keep_waiting and not keep_waiting(): - log.debug("keep_waiting() is set and evaluates to False") - return False - - if timeout: - next_timeout = end - now() - if next_timeout < 0: - log.debug("connect time out") - return False - else: - log.debug("connect timeout %d s", next_timeout) - s.settimeout(next_timeout) - - log.debug("connect %s:%d", server, port) - s.connect((server, port)) - ret = s.recv(1024).decode() - if ret and ret.startswith('SSH'): - s.close() - log.info("wait_ssh_open: port %s:%s is open and ssh is ready", server, port) - return True - else: - log.debug("Didn't get the SSH banner") - s.close() - - except ConnectionError as err: - log.debug("ConnectionError %s", err) - if sleep_s == 0: - sleep_s = 1 - else: - sleep_s *= 2 - - except socket.gaierror as err: - log.debug("gaierror %s",err) - return False - - except socket.timeout as err: - # this exception occurs only if timeout is set - if timeout: - return False - - except TimeoutError as err: - # catch timeout exception from underlying network library - # this one is different from socket.timeout - raise - - -def wait_port_open(server, port, timeout=None): - """ Wait for network service to appear - @param server: host to connect to (str) - @param port: port (int) - @param timeout: in seconds, if None or 0 wait forever - @return: True of False, if timeout is None may return only True or - throw unhandled network exception - """ - import socket - import errno - import time - sleep_s = 0 - if timeout: - from time import time as now - # time module is needed to calc timeout shared between two exceptions - end = now() + timeout - - while True: - logging.debug("Sleeping for %s second(s)", sleep_s) - time.sleep(sleep_s) - s = socket.socket() - try: - if timeout: - next_timeout = end - now() - if next_timeout < 0: - return False - else: - s.settimeout(next_timeout) - - logging.info("connect %s %d", server, port) - s.connect((server, port)) - - except ConnectionError as err: - logging.debug("ConnectionError %s", err) - if sleep_s == 0: - sleep_s = 1 - - except socket.gaierror as err: - logging.debug("gaierror %s",err) - return False - - except socket.timeout as err: - # this exception occurs only if timeout is set - if timeout: - return False - - except TimeoutError as err: - # catch timeout exception from underlying network library - # this one is different from socket.timeout - raise - - else: - s.close() - logging.info("wait_port_open: port %s:%s is open", server, port) - return True - diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index e171767d51f3..dc119bb10256 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -159,7 +159,7 @@ gather_licenses() { build_ubuntu_cpu_release() { set -ex cd /work/build - cmake \ + CC=gcc-7 CXX=g++-7 cmake \ -DUSE_MKL_IF_AVAILABLE=OFF \ -DUSE_MKLDNN=ON \ -DUSE_CUDA=OFF \ @@ -170,7 +170,7 @@ build_ubuntu_cpu_release() { build_ubuntu_cpu_native_release() { set -ex cd /work/build - cmake \ + CC=gcc-7 CXX=g++-7 cmake \ -DUSE_MKL_IF_AVAILABLE=OFF \ -DUSE_MKLDNN=OFF \ -DUSE_CUDA=OFF \ @@ -181,7 +181,7 @@ build_ubuntu_cpu_native_release() { build_ubuntu_gpu_release() { set -ex cd /work/build - cmake \ 
+ CC=gcc-7 CXX=g++-7 cmake \ -DUSE_MKL_IF_AVAILABLE=OFF \ -DUSE_MKLDNN=ON \ -DUSE_DIST_KVSTORE=ON \ @@ -216,13 +216,22 @@ build_dynamic_libmxnet() { build_jetson() { set -ex - pushd . - - cp make/crosscompile.jetson.mk ./config.mk - make -j$(nproc) - - build_wheel /work/mxnet/python /work/mxnet/lib - popd + cd /work/build + cmake \ + -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} \ + -DUSE_CUDA=ON \ + -DMXNET_CUDA_ARCH="5.2" \ + -DENABLE_CUDA_RTC=OFF \ + -DSUPPORT_F16C=OFF \ + -DUSE_OPENCV=OFF \ + -DUSE_OPENMP=ON \ + -DUSE_LAPACK=OFF \ + -DUSE_SIGNAL_HANDLER=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DUSE_MKL_IF_AVAILABLE=OFF \ + -G Ninja /work/mxnet + ninja + build_wheel } # @@ -250,7 +259,7 @@ build_armv6() { -DUSE_MKL_IF_AVAILABLE=OFF \ -DUSE_LAPACK=OFF \ -DBUILD_CPP_EXAMPLES=OFF \ - -Dmxnet_LINKER_LIBS=-lgfortran \ + -Dmxnet_LINKER_LIBS=-latomic \ -G Ninja /work/mxnet ninja @@ -277,7 +286,6 @@ build_armv7() { -DUSE_MKL_IF_AVAILABLE=OFF \ -DUSE_LAPACK=OFF \ -DBUILD_CPP_EXAMPLES=OFF \ - -Dmxnet_LINKER_LIBS=-lgfortran \ -G Ninja /work/mxnet ninja @@ -287,14 +295,15 @@ build_armv7() { build_armv8() { cd /work/build cmake \ - -DUSE_CUDA=OFF\ - -DSUPPORT_F16C=OFF\ - -DUSE_OPENCV=OFF\ + -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} \ + -DUSE_CUDA=OFF \ + -DSUPPORT_F16C=OFF \ + -DUSE_OPENCV=OFF \ -DUSE_OPENMP=ON \ - -DUSE_LAPACK=OFF\ - -DUSE_SIGNAL_HANDLER=ON\ - -DCMAKE_BUILD_TYPE=Release\ - -DUSE_MKL_IF_AVAILABLE=OFF\ + -DUSE_LAPACK=OFF \ + -DUSE_SIGNAL_HANDLER=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DUSE_MKL_IF_AVAILABLE=OFF \ -G Ninja /work/mxnet ninja build_wheel @@ -309,15 +318,18 @@ build_android_armv7() { set -ex cd /work/build cmake \ - -DANDROID=ON\ - -DUSE_CUDA=OFF\ - -DUSE_SSE=OFF\ - -DSUPPORT_F16C=OFF\ - -DUSE_LAPACK=OFF\ - -DUSE_OPENCV=OFF\ - -DUSE_OPENMP=OFF\ - -DUSE_SIGNAL_HANDLER=ON\ - -DUSE_MKL_IF_AVAILABLE=OFF\ + -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} \ + -DANDROID_ABI="armeabi-v7a" \ + -DANDROID_STL="c++_shared" \ + -DANDROID=ON \ + -DUSE_CUDA=OFF \ + -DUSE_SSE=OFF \ + -DSUPPORT_F16C=OFF \ + -DUSE_LAPACK=OFF \ + -DUSE_OPENCV=OFF \ + -DUSE_OPENMP=OFF \ + -DUSE_SIGNAL_HANDLER=ON \ + -DUSE_MKL_IF_AVAILABLE=OFF \ -G Ninja /work/mxnet ninja } @@ -325,15 +337,18 @@ build_android_armv7() { build_android_armv8() { set -ex cd /work/build - cmake\ + cmake \ + -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} \ + -DANDROID_ABI="arm64-v8a" \ + -DANDROID_STL="c++_shared" \ -DANDROID=ON \ - -DUSE_CUDA=OFF\ - -DUSE_SSE=OFF\ - -DUSE_LAPACK=OFF\ - -DUSE_OPENCV=OFF\ - -DUSE_OPENMP=OFF\ - -DUSE_SIGNAL_HANDLER=ON\ - -DUSE_MKL_IF_AVAILABLE=OFF\ + -DUSE_CUDA=OFF \ + -DUSE_SSE=OFF \ + -DUSE_LAPACK=OFF \ + -DUSE_OPENCV=OFF \ + -DUSE_OPENMP=OFF \ + -DUSE_SIGNAL_HANDLER=ON \ + -DUSE_MKL_IF_AVAILABLE=OFF \ -G Ninja /work/mxnet ninja } @@ -341,6 +356,7 @@ build_android_armv8() { build_centos7_cpu() { set -ex cd /work/build + source /opt/rh/devtoolset-7/enable cmake \ -DCMAKE_BUILD_TYPE="RelWithDebInfo" \ -DENABLE_TESTCOVERAGE=ON \ @@ -355,9 +371,7 @@ build_centos7_cpu() { build_centos7_cpu_make() { set -ex cd /work/mxnet - export CC="ccache gcc" - export CXX="ccache g++" - build_ccache_wrappers + source /opt/rh/devtoolset-7/enable make \ DEV=1 \ USE_LAPACK=1 \ @@ -372,6 +386,7 @@ build_centos7_cpu_make() { build_centos7_mkldnn() { set -ex cd /work/build + source /opt/rh/devtoolset-7/enable cmake \ -DUSE_MKL_IF_AVAILABLE=OFF \ -DUSE_MKLDNN=ON \ @@ -383,6 +398,7 @@ build_centos7_mkldnn() { build_centos7_gpu() { set -ex cd /work/build + source /opt/rh/devtoolset-7/enable cmake \ 
-DCMAKE_BUILD_TYPE="RelWithDebInfo" \ -DUSE_MKL_IF_AVAILABLE=OFF \ @@ -401,7 +417,7 @@ build_ubuntu_cpu() { build_ubuntu_cpu_openblas() { set -ex cd /work/build - CXXFLAGS="-Wno-error=strict-overflow" cmake \ + CXXFLAGS="-Wno-error=strict-overflow" CC=gcc-7 CXX=g++-7 cmake \ -DCMAKE_BUILD_TYPE="RelWithDebInfo" \ -DENABLE_TESTCOVERAGE=ON \ -DUSE_TVM_OP=ON \ @@ -417,8 +433,8 @@ build_ubuntu_cpu_openblas() { build_ubuntu_cpu_openblas_make() { set -ex - export CC="gcc" - export CXX="g++" + export CC=gcc-7 + export CXX=g++-7 build_ccache_wrappers make \ DEV=1 \ @@ -436,7 +452,7 @@ build_ubuntu_cpu_openblas_make() { build_ubuntu_cpu_mkl() { set -ex cd /work/build - cmake \ + CC=gcc-7 CXX=g++-7 cmake \ -DCMAKE_BUILD_TYPE="RelWithDebInfo" \ -DENABLE_TESTCOVERAGE=ON \ -DUSE_MKLDNN=OFF \ @@ -451,7 +467,7 @@ build_ubuntu_cpu_mkl() { build_ubuntu_cpu_cmake_debug() { set -ex cd /work/build - cmake \ + CC=gcc-7 CXX=g++-7 cmake \ -DCMAKE_BUILD_TYPE=Debug \ -DENABLE_TESTCOVERAGE=ON \ -DUSE_CUDA=OFF \ @@ -468,7 +484,7 @@ build_ubuntu_cpu_cmake_debug() { build_ubuntu_cpu_cmake_no_tvm_op() { set -ex cd /work/build - cmake \ + CC=gcc-7 CXX=g++-7 cmake \ -DUSE_CUDA=OFF \ -DUSE_TVM_OP=OFF \ -DUSE_MKL_IF_AVAILABLE=OFF \ @@ -538,6 +554,10 @@ build_ubuntu_gpu_clang10_werror() { # Disable cpp package as OpWrapperGenerator.py dlopens libmxnet.so, # requiring presence of cuda driver libraries that are missing on CI host export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda-10.1/targets/x86_64-linux/lib/stubs + # Workaround https://github.com/thrust/thrust/issues/1072 + # Can be deleted on Cuda 11 + export CXXFLAGS="-I/usr/local/thrust" + # Set CMAKE_AR and CMAKE_RANLIB due to Ubuntu 16.04 default binutils 4GB limitation CXX=clang++-10 CC=clang-10 cmake \ -DCMAKE_AR=/usr/local/bin/ar \ @@ -550,10 +570,10 @@ build_ubuntu_gpu_clang10_werror() { ninja } -build_ubuntu_cpu_clang39() { +build_ubuntu_cpu_clang6() { set -ex cd /work/build - CXX=clang++-3.9 CC=clang-3.9 cmake \ + CXX=clang++-6.0 CC=clang-6.0 cmake \ -DUSE_MKL_IF_AVAILABLE=OFF \ -DUSE_MKLDNN=OFF \ -DUSE_CUDA=OFF \ @@ -598,10 +618,10 @@ build_ubuntu_cpu_clang_tidy() { $CLANG_TIDY -p /work/build -j $(nproc) -clang-tidy-binary clang-tidy-6.0 /work/mxnet/src } -build_ubuntu_cpu_clang39_mkldnn() { +build_ubuntu_cpu_clang6_mkldnn() { set -ex cd /work/build - CXX=clang++-3.9 CC=clang-3.9 cmake \ + CXX=clang++-6.0 CC=clang-6.0 cmake \ -DUSE_MKL_IF_AVAILABLE=OFF \ -DUSE_MKLDNN=ON \ -DUSE_CUDA=OFF \ @@ -626,6 +646,8 @@ build_ubuntu_cpu_clang100_mkldnn() { build_ubuntu_cpu_mkldnn_make() { set -ex + export CC=gcc-7 + export CXX=g++-7 build_ccache_wrappers make \ @@ -640,7 +662,7 @@ build_ubuntu_cpu_mkldnn_make() { build_ubuntu_cpu_mkldnn() { set -ex cd /work/build - cmake \ + CC=gcc-7 CXX=g++-7 cmake \ -DCMAKE_BUILD_TYPE="RelWithDebInfo" \ -DENABLE_TESTCOVERAGE=ON \ -DUSE_MKL_IF_AVAILABLE=OFF \ @@ -655,7 +677,7 @@ build_ubuntu_cpu_mkldnn() { build_ubuntu_cpu_mkldnn_mkl() { set -ex cd /work/build - cmake \ + CC=gcc-7 CXX=g++-7 cmake \ -DCMAKE_BUILD_TYPE="RelWithDebInfo" \ -DENABLE_TESTCOVERAGE=ON \ -DUSE_MKLDNN=ON \ @@ -675,6 +697,9 @@ build_ubuntu_gpu_tensorrt() { set -ex + export CC=gcc-7 + export CXX=g++-7 + # Build ONNX pushd . echo "Installing ONNX." 
@@ -726,7 +751,7 @@ build_ubuntu_gpu_mkldnn() { set -ex cd /work/build # Set CMAKE_AR and CMAKE_RANLIB due to Ubuntu 16.04 default binutils 4GB limitation - cmake \ + CC=gcc-7 CXX=g++-7 cmake \ -DCMAKE_AR=/usr/local/bin/ar \ -DCMAKE_RANLIB=/usr/local/bin/ranlib \ -DCMAKE_BUILD_TYPE="RelWithDebInfo" \ @@ -743,7 +768,7 @@ build_ubuntu_gpu_mkldnn_nocudnn() { set -ex cd /work/build # Set CMAKE_AR and CMAKE_RANLIB due to Ubuntu 16.04 default binutils 4GB limitation - cmake \ + CC=gcc-7 CXX=g++-7 cmake \ -DCMAKE_AR=/usr/local/bin/ar \ -DCMAKE_RANLIB=/usr/local/bin/ranlib \ -DCMAKE_BUILD_TYPE="RelWithDebInfo" \ @@ -761,7 +786,7 @@ build_ubuntu_gpu_cuda101_cudnn7() { set -ex cd /work/build # Set CMAKE_AR and CMAKE_RANLIB due to Ubuntu 16.04 default binutils 4GB limitation - cmake \ + CC=gcc-7 CXX=g++-7 cmake \ -DCMAKE_AR=/usr/local/bin/ar \ -DCMAKE_RANLIB=/usr/local/bin/ranlib \ -DCMAKE_BUILD_TYPE="RelWithDebInfo" \ @@ -780,9 +805,10 @@ build_ubuntu_gpu_cuda101_cudnn7() { build_ubuntu_gpu_cuda101_cudnn7_make() { set -ex + export CC=gcc-7 + export CXX=g++-7 build_ccache_wrappers make \ - DEV=1 \ USE_BLAS=openblas \ USE_MKLDNN=0 \ USE_CUDA=1 \ @@ -799,9 +825,10 @@ build_ubuntu_gpu_cuda101_cudnn7_make() { build_ubuntu_gpu_cuda101_cudnn7_mkldnn_cpp_test() { set -ex + export CC=gcc-7 + export CXX=g++-7 build_ccache_wrappers make \ - DEV=1 \ USE_BLAS=openblas \ USE_MKLDNN=1 \ USE_CUDA=1 \ @@ -821,7 +848,7 @@ build_ubuntu_gpu_cuda101_cudnn7_no_tvm_op() { set -ex cd /work/build # Set CMAKE_AR and CMAKE_RANLIB due to Ubuntu 16.04 default binutils 4GB limitation - cmake \ + CC=gcc-7 CXX=g++-7 cmake \ -DCMAKE_AR=/usr/local/bin/ar \ -DCMAKE_RANLIB=/usr/local/bin/ranlib \ -DCMAKE_BUILD_TYPE="RelWithDebInfo" \ @@ -840,6 +867,8 @@ build_ubuntu_gpu_cuda101_cudnn7_no_tvm_op() { build_ubuntu_amalgamation() { set -ex # Amalgamation can not be run with -j nproc + export CC=gcc-7 + export CXX=g++-7 build_ccache_wrappers make -C amalgamation/ clean make -C amalgamation/ \ @@ -849,6 +878,8 @@ build_ubuntu_amalgamation() { build_ubuntu_amalgamation_min() { set -ex # Amalgamation can not be run with -j nproc + export CC=gcc-7 + export CXX=g++-7 build_ccache_wrappers make -C amalgamation/ clean make -C amalgamation/ \ @@ -859,7 +890,7 @@ build_ubuntu_amalgamation_min() { build_ubuntu_gpu_cmake() { set -ex cd /work/build - cmake \ + CC=gcc-7 CXX=g++-7 cmake \ -DUSE_SIGNAL_HANDLER=ON \ -DUSE_CUDA=ON \ -DUSE_CUDNN=ON \ @@ -880,7 +911,7 @@ build_ubuntu_gpu_cmake() { build_ubuntu_gpu_cmake_no_rtc() { set -ex cd /work/build - cmake \ + CC=gcc-7 CXX=g++-7 cmake \ -DUSE_SIGNAL_HANDLER=ON \ -DUSE_CUDA=ON \ -DUSE_CUDNN=ON \ @@ -902,7 +933,7 @@ build_ubuntu_gpu_cmake_no_rtc() { build_ubuntu_gpu_cmake_no_tvm_op() { set -ex cd /work/build - cmake \ + CC=gcc-7 CXX=g++-7 cmake \ -DUSE_SIGNAL_HANDLER=ON \ -DUSE_CUDA=ON \ -DUSE_CUDNN=ON \ @@ -923,7 +954,7 @@ build_ubuntu_gpu_cmake_no_tvm_op() { build_ubuntu_cpu_large_tensor() { set -ex cd /work/build - cmake \ + CC=gcc-7 CXX=g++-7 cmake \ -DUSE_SIGNAL_HANDLER=ON \ -DUSE_CUDA=OFF \ -DUSE_CUDNN=OFF \ @@ -938,7 +969,7 @@ build_ubuntu_cpu_large_tensor() { build_ubuntu_gpu_large_tensor() { set -ex cd /work/build - cmake \ + CC=gcc-7 CXX=g++-7 cmake \ -DUSE_SIGNAL_HANDLER=ON \ -DUSE_CUDA=ON \ -DUSE_CUDNN=ON \ @@ -1105,6 +1136,7 @@ unittest_ubuntu_python3_quantization_gpu() { unittest_centos7_cpu_scala() { set -ex + source /opt/rh/devtoolset-7/enable cd /work/mxnet scala_prepare cd scala-package @@ -1144,6 +1176,8 @@ unittest_ubuntu_cpu_R() { mkdir -p /tmp/r-site-library # build R packages in 
parallel mkdir -p ~/.R/ + export CC=gcc-7 + export CXX=g++-7 build_ccache_wrappers echo "MAKEFLAGS = -j"$(nproc) > ~/.R/Makevars # make -j not supported @@ -1159,8 +1193,10 @@ unittest_ubuntu_minimal_R() { mkdir -p /tmp/r-site-library # build R packages in parallel mkdir -p ~/.R/ - build_ccache_wrappers echo "MAKEFLAGS = -j"$(nproc) > ~/.R/Makevars + export CC=gcc-7 + export CXX=g++-7 + build_ccache_wrappers # make -j not supported make -f R-package/Makefile rpkg \ R_LIBS=/tmp/r-site-library @@ -1188,6 +1224,8 @@ unittest_ubuntu_gpu_R() { mkdir -p /tmp/r-site-library # build R packages in parallel mkdir -p ~/.R/ + export CC=gcc-7 + export CXX=g++-7 build_ccache_wrappers echo "MAKEFLAGS = -j"$(nproc) > ~/.R/Makevars # make -j not supported @@ -1232,6 +1270,7 @@ unittest_ubuntu_cpu_julia10() { unittest_centos7_cpu() { set -ex + source /opt/rh/devtoolset-7/enable cd /work/mxnet python3.6 -m "nose" $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_unittest.xml --verbose tests/python/unittest python3.6 -m "nose" $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_train.xml --verbose tests/python/train @@ -1239,6 +1278,7 @@ unittest_centos7_cpu() { unittest_centos7_gpu() { set -ex + source /opt/rh/devtoolset-7/enable cd /work/mxnet export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3} export DMLC_LOG_STACK_TRACE_DEPTH=10 @@ -1369,6 +1409,18 @@ test_ubuntu_cpu_python3() { popd } +# QEMU based ARM tests +unittest_ubuntu_python3_arm() { + set -ex + export PYTHONPATH=./python/ + export MXNET_MKLDNN_DEBUG=0 # Ignored if not present + export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 + export MXNET_SUBGRAPH_VERBOSE=0 + export MXNET_ENABLE_CYTHON=0 + export DMLC_LOG_STACK_TRACE_DEPTH=10 + python3 -m nose --verbose tests/python/unittest/test_engine.py +} + # Functions that run the nightly Tests: #Runs Apache RAT Check on MXNet Source for License Headers @@ -1469,6 +1521,8 @@ nightly_test_large_vector() { nightly_test_amalgamation() { set -ex export DMLC_LOG_STACK_TRACE_DEPTH=10 + export CC=gcc-7 + export CXX=g++-7 # Amalgamation can not be run with -j nproc make -C amalgamation/ clean make -C amalgamation/ ${1} ${2} @@ -1479,6 +1533,8 @@ nightly_test_javascript() { set -ex export LLVM=/work/deps/emscripten-fastcomp/build/bin export DMLC_LOG_STACK_TRACE_DEPTH=10 + export CC=gcc-7 + export CXX=g++-7 # This part is needed to run emcc correctly cd /work/deps/emscripten ./emcc @@ -1598,8 +1654,8 @@ build_docs_setup() { build_ubuntu_cpu_docs() { set -ex - export CC="gcc" - export CXX="g++" + export CC="gcc-7" + export CXX="g++-7" build_ccache_wrappers make \ DEV=1 \ @@ -1912,6 +1968,8 @@ checkout() { build_static_libmxnet() { set -ex pushd . + source /opt/rh/devtoolset-7/enable + export USE_SYSTEM_CUDA=1 local mxnet_variant=${1:?"This function requires a python command as the first argument"} source tools/staticbuild/build.sh ${mxnet_variant} popd @@ -1921,6 +1979,7 @@ build_static_libmxnet() { cd_package_pypi() { set -ex pushd . + source /opt/rh/devtoolset-7/enable local mxnet_variant=${1:?"This function requires a python command as the first argument"} ./cd/python/pypi/pypi_package.sh ${mxnet_variant} popd @@ -1975,6 +2034,7 @@ build_static_scala_cpu() { scala_prepare export MAVEN_PUBLISH_OS_TYPE=linux-x86_64-cpu export mxnet_variant=cpu + source /opt/rh/devtoolset-7/enable ./ci/publish/scala/build.sh popd } @@ -1983,6 +2043,7 @@ build_static_python_cpu() { set -ex pushd . 
export mxnet_variant=cpu + source /opt/rh/devtoolset-7/enable ./ci/publish/python/build.sh popd } @@ -1991,6 +2052,8 @@ build_static_python_cu101() { set -ex pushd . export mxnet_variant=cu101 + export USE_SYSTEM_CUDA=1 + source /opt/rh/devtoolset-7/enable ./ci/publish/python/build.sh popd } @@ -2000,6 +2063,7 @@ build_static_python_cpu_cmake() { pushd . export mxnet_variant=cpu export CMAKE_STATICBUILD=1 + source /opt/rh/devtoolset-7/enable ./ci/publish/python/build.sh popd } @@ -2009,6 +2073,8 @@ build_static_python_cu101_cmake() { pushd . export mxnet_variant=cu101 export CMAKE_STATICBUILD=1 + export USE_SYSTEM_CUDA=1 + source /opt/rh/devtoolset-7/enable ./ci/publish/python/build.sh popd } @@ -2017,6 +2083,7 @@ publish_scala_build() { set -ex pushd . scala_prepare + source /opt/rh/devtoolset-7/enable ./ci/publish/scala/build.sh popd } diff --git a/ci/docker/install/ubuntu_arm_qemu.sh b/ci/docker/toolchains/aarch64-linux-gnu-toolchain.cmake old mode 100755 new mode 100644 similarity index 64% rename from ci/docker/install/ubuntu_arm_qemu.sh rename to ci/docker/toolchains/aarch64-linux-gnu-toolchain.cmake index 79ab67bfdbe6..3780415c4b15 --- a/ci/docker/install/ubuntu_arm_qemu.sh +++ b/ci/docker/toolchains/aarch64-linux-gnu-toolchain.cmake @@ -1,5 +1,3 @@ -#!/usr/bin/env bash - # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -17,21 +15,14 @@ # specific language governing permissions and limitations # under the License. -# build and install are separated so changes to build don't invalidate -# the whole docker cache for the image - -set -exuo pipefail - -apt-get install -y \ - cmake \ - curl \ - wget \ - git \ - qemu \ - qemu-system-arm \ - unzip \ - bzip2 \ - vim-nox \ - toilet +set(CMAKE_SYSTEM_NAME Linux) +set(CMAKE_SYSTEM_PROCESSOR "aarch64") +set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc) +set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++) +set(CMAKE_CUDA_HOST_COMPILER aarch64-linux-gnu-gcc) +set(CMAKE_FIND_ROOT_PATH "/usr/aarch64-linux-gnu") -pip3 install ipython +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) diff --git a/ci/docker/install/android_arm64_openblas.sh b/ci/docker/toolchains/arm-linux-gnueabihf-toolchain.cmake old mode 100755 new mode 100644 similarity index 65% rename from ci/docker/install/android_arm64_openblas.sh rename to ci/docker/toolchains/arm-linux-gnueabihf-toolchain.cmake index 1c3014f6cca9..62038ecee16a --- a/ci/docker/install/android_arm64_openblas.sh +++ b/ci/docker/toolchains/arm-linux-gnueabihf-toolchain.cmake @@ -1,5 +1,3 @@ -#!/usr/bin/env bash - # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -17,16 +15,13 @@ # specific language governing permissions and limitations # under the License. -# build and install are separated so changes to build don't invalidate -# the whole docker cache for the image +set(CMAKE_SYSTEM_NAME Linux) +set(CMAKE_SYSTEM_PROCESSOR "armv7l") +set(CMAKE_C_COMPILER arm-linux-gnueabihf-gcc) +set(CMAKE_CXX_COMPILER arm-linux-gnueabihf-g++) +set(CMAKE_FIND_ROOT_PATH "/usr/arm-linux-gnueabihf" "/usr/local/arm-linux-gnueabihf") -set -ex -pushd . 
-git clone https://github.com/xianyi/OpenBLAS.git -cd OpenBLAS -make -j$(nproc) TARGET=ARMV8 ARM_SOFTFP_ABI=1 HOSTCC=gcc NOFORTRAN=1 libs -# Can't be run (utility not compiled for the target platform) -#make install -cp *.h /usr/include -cp libopenblas.a /usr/local/lib -popd +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy index e5ce8de24485..eb4c0099579f 100644 --- a/ci/jenkins/Jenkins_steps.groovy +++ b/ci/jenkins/Jenkins_steps.groovy @@ -27,9 +27,6 @@ mx_lib = 'build/libmxnet.so, build/3rdparty/tvm/libtvm_runtime.so, build/libtvmo mx_lib_cython = 'build/libmxnet.so, build/3rdparty/tvm/libtvm_runtime.so, build/libtvmop.so, build/tvmop.conf, build/libcustomop_lib.so, build/libcustomop_gpu_lib.so, build/libsubgraph_lib.so, python/mxnet/_cy3/*.so, build/3rdparty/openmp/runtime/src/libomp.so, python/mxnet/_ffi/_cy3/*.so' mx_lib_make = 'lib/libmxnet.so, lib/libmxnet.a, lib/libtvm_runtime.so, lib/libtvmop.so, lib/tvmop.conf, build/libcustomop_lib.so, build/libcustomop_gpu_lib.so, build/libsubgraph_lib.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a' -// Python wheels -mx_pip = 'build/*.whl' - // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default. mx_cmake_lib = 'build/libmxnet.so, build/3rdparty/tvm/libtvm_runtime.so, build/libtvmop.so, build/tvmop.conf, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so' mx_cmake_lib_no_tvm_op = 'build/libmxnet.so, build/libcustomop_lib.so, build/libcustomop_gpu_lib.so, build/libsubgraph_lib.so, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so' @@ -422,13 +419,13 @@ def compile_centos7_gpu() { }] } -def compile_unix_clang_3_9_cpu() { - return ['CPU: Clang 3.9': { +def compile_unix_clang_6_cpu() { + return ['CPU: Clang 6': { node(NODE_LINUX_CPU) { ws('workspace/build-cpu-clang39') { timeout(time: max_time, unit: 'MINUTES') { utils.init_git() - utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_clang39', false) + utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_clang6', false) } } } @@ -462,13 +459,13 @@ def compile_unix_clang_tidy_cpu() { }] } -def compile_unix_clang_3_9_mkldnn_cpu() { - return ['CPU: Clang 3.9 MKLDNN': { +def compile_unix_clang_6_mkldnn_cpu() { + return ['CPU: Clang 6 MKLDNN': { node(NODE_LINUX_CPU) { - ws('workspace/build-cpu-mkldnn-clang39') { + ws('workspace/build-cpu-mkldnn-clang6') { timeout(time: max_time, unit: 'MINUTES') { utils.init_git() - utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_clang39_mkldnn', false) + utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_clang6_mkldnn', false) } } } @@ -502,27 +499,28 @@ def compile_armv8_jetson_gpu() { }] } -def compile_armv7_cpu() { - return ['ARMv7':{ +def compile_armv6_cpu() { + return ['ARMv6':{ node(NODE_LINUX_CPU) { - ws('workspace/build-ARMv7') { + ws('workspace/build-ARMv6') { timeout(time: max_time, unit: 'MINUTES') { utils.init_git() - utils.docker_run('armv7', 'build_armv7', false) - utils.pack_lib('armv7', mx_pip) + utils.docker_run('armv6', 'build_armv6', false) + utils.pack_lib('armv6', mx_lib) } } } }] } -def compile_armv6_cpu() { - return ['ARMv6':{ +def compile_armv7_cpu() { + return ['ARMv7':{ node(NODE_LINUX_CPU) { - ws('workspace/build-ARMv6') { + ws('workspace/build-ARMv7') { timeout(time: max_time, unit: 'MINUTES') { utils.init_git() - 
utils.docker_run('armv6', 'build_armv6', false) + utils.docker_run('armv7', 'build_armv7', false) + utils.pack_lib('armv7', mx_lib) } } } @@ -536,6 +534,7 @@ def compile_armv8_cpu() { timeout(time: max_time, unit: 'MINUTES') { utils.init_git() utils.docker_run('armv8', 'build_armv8', false) + utils.pack_lib('armv8', mx_lib) } } } @@ -740,7 +739,7 @@ def test_static_scala_cpu() { ws('workspace/ut-publish-scala-cpu') { timeout(time: max_time, unit: 'MINUTES') { utils.init_git() - utils.docker_run("publish.ubuntu1404_cpu", 'build_static_scala_cpu', false) + utils.docker_run("publish.centos7_cpu", 'build_static_scala_cpu', false) } } } @@ -748,12 +747,12 @@ def test_static_scala_cpu() { } def test_static_python_cpu() { - return ['Static build CPU 14.04 Python' : { + return ['Static build CPU CentOS7 Python' : { node(NODE_LINUX_CPU) { ws('workspace/ut-publish-python-cpu') { timeout(time: max_time, unit: 'MINUTES') { utils.init_git() - utils.docker_run("publish.ubuntu1404_cpu", 'build_static_python_cpu', false) + utils.docker_run("publish.centos7_cpu", 'build_static_python_cpu', false) } } } @@ -761,25 +760,25 @@ def test_static_python_cpu() { } def test_static_python_cpu_cmake() { - return ['Static build CPU 14.04 Python with CMake' : { - node(NODE_LINUX_CPU) { - ws('workspace/ut-publish-python-cpu') { - timeout(time: max_time, unit: 'MINUTES') { - utils.init_git() - utils.docker_run("publish.ubuntu1404_cpu", 'build_static_python_cpu_cmake', false) - } - } + return ['Static build CPU CentOS7 Python with CMake' : { + node(NODE_LINUX_CPU) { + ws('workspace/ut-publish-python-cpu') { + timeout(time: max_time, unit: 'MINUTES') { + utils.init_git() + utils.docker_run("publish.centos7_cpu", 'build_static_python_cpu_cmake', false) + } } - }] + } + }] } def test_static_python_gpu() { - return ['Static build GPU 14.04 Python' : { + return ['Static build GPU CentOS7 Python' : { node(NODE_LINUX_GPU) { ws('workspace/ut-publish-python-gpu') { timeout(time: max_time, unit: 'MINUTES') { utils.init_git() - utils.docker_run("publish.ubuntu1404_gpu", 'build_static_python_cu101', true) + utils.docker_run("publish.centos7_gpu_cu101", 'build_static_python_cu101', true) } } } @@ -787,16 +786,16 @@ def test_static_python_gpu() { } def test_static_python_gpu_cmake() { - return ['Static build GPU 14.04 Python' : { - node(NODE_LINUX_GPU) { - ws('workspace/ut-publish-python-gpu') { - timeout(time: max_time, unit: 'MINUTES') { - utils.init_git() - utils.docker_run("publish.ubuntu1404_gpu", 'build_static_python_cu101_cmake', true) - } - } + return ['Static build GPU CentOS7 Python with CMake' : { + node(NODE_LINUX_GPU) { + ws('workspace/ut-publish-python-gpu') { + timeout(time: max_time, unit: 'MINUTES') { + utils.init_git() + utils.docker_run("publish.centos7_gpu_cu101", 'build_static_python_cu101_cmake', true) + } } - }] + } + }] } def test_unix_python3_cpu() { @@ -1431,39 +1430,27 @@ def test_qemu_armv7_cpu() { node(NODE_LINUX_CPU) { ws('workspace/ut-armv7-qemu') { timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('armv7', mx_pip) - sh "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY} -p test.arm_qemu ./runtime_functions.py run_ut_py3_qemu" + utils.unpack_and_init('armv7', mx_lib) + utils.docker_run('test.armv7', 'unittest_ubuntu_python3_arm', false) } } } }] } -// This is for running on PRs -def docs_website() { - return ['Docs': { +def test_qemu_armv8_cpu() { + return ['ARMv8 QEMU': { node(NODE_LINUX_CPU) { - ws('workspace/docs') { + ws('workspace/ut-armv8-qemu') { timeout(time: max_time, 
unit: 'MINUTES') { - - unstash 'jekyll-artifacts' - unstash 'python-artifacts' - utils.docker_run('ubuntu_cpu_jekyll', 'build_docs_small', false) - - master_url = utils.get_jenkins_master_url() - if ( master_url == 'jenkins.mxnet-ci.amazon-ml.com') { - // TODO: Make sure this scripts publish the website from the right folder - sh "ci/other/ci_deploy_doc.sh ${env.BRANCH_NAME} ${env.BUILD_NUMBER}" - } else { - print "Skipping staging documentation publishing since we are not running in prod. Host: {$master_url}" - } + utils.unpack_and_init('armv8', mx_lib) + utils.docker_run('test.armv8', 'unittest_ubuntu_python3_arm', false) } } } }] } - // This creates the MXNet binary needed for generating different docs sets def compile_unix_lite() { return ['MXNet lib': { diff --git a/ci/jenkins/Jenkinsfile_clang b/ci/jenkins/Jenkinsfile_clang index 28c40915acd7..1365b31b701d 100644 --- a/ci/jenkins/Jenkinsfile_clang +++ b/ci/jenkins/Jenkinsfile_clang @@ -34,10 +34,10 @@ utils.assign_node_labels(utility: 'utility', linux_cpu: 'mxnetlinux-cpu', linux_ utils.main_wrapper( core_logic: { utils.parallel_stage('Build', [ - custom_steps.compile_unix_clang_3_9_cpu(), + custom_steps.compile_unix_clang_6_cpu(), custom_steps.compile_unix_clang_10_cpu(), custom_steps.compile_unix_clang_tidy_cpu(), - custom_steps.compile_unix_clang_3_9_mkldnn_cpu(), + custom_steps.compile_unix_clang_6_mkldnn_cpu(), custom_steps.compile_unix_clang_10_mkldnn_cpu() ]) } diff --git a/ci/jenkins/Jenkinsfile_edge b/ci/jenkins/Jenkinsfile_edge index 9d8e01399d7c..9e2abf558dd2 100644 --- a/ci/jenkins/Jenkinsfile_edge +++ b/ci/jenkins/Jenkinsfile_edge @@ -40,11 +40,12 @@ core_logic: { custom_steps.compile_armv8_cpu(), custom_steps.compile_armv8_android_cpu(), custom_steps.compile_armv7_android_cpu() - ]) + ]) utils.parallel_stage('Tests', [ - custom_steps.test_qemu_armv7_cpu() - ]) + custom_steps.test_qemu_armv7_cpu(), + custom_steps.test_qemu_armv8_cpu() + ]) } , failure_handler: { diff --git a/ci/publish/Jenkinsfile b/ci/publish/Jenkinsfile index ed09b4c2ef0f..366758d85665 100644 --- a/ci/publish/Jenkinsfile +++ b/ci/publish/Jenkinsfile @@ -57,7 +57,7 @@ for (x in labels) { toBuild["Scala Build ${label}"] = wrapStep(nodeMap['cpu'], "build-scala-${label}") { withEnv(["MAVEN_PUBLISH_OS_TYPE=${scalaOSMap[label]}", "mxnet_variant=${scalaVariantMap[label]}"]) { utils.init_git() - utils.docker_run("publish.ubuntu1404_cpu", 'publish_scala_build', false, '500m', 'MAVEN_PUBLISH_OS_TYPE mxnet_variant') + utils.docker_run("publish.centos7_cpu", 'publish_scala_build', false, '500m', 'MAVEN_PUBLISH_OS_TYPE mxnet_variant') utils.pack_lib("scala_${label}", mx_scala_pub, false) } } diff --git a/ci/publish/README.md b/ci/publish/README.md index cdd70ce82258..3d315a9a57ec 100644 --- a/ci/publish/README.md +++ b/ci/publish/README.md @@ -30,11 +30,14 @@ Currently, we are supporting tests in the following systems: - Ubuntu 18.04 - Cent OS 7 -All packages are currently built in `Ubuntu 14.04`. All Dockerfile used for publishing are available in `ci/docker/` with prefix `Dockerfile.publish`. +All packages are currently built in `Cent OS 7` with Developer Toolset 7. +Developer Toolset 7 provides `GCC 7` with C++17 support on `Cent OS 7`, enabling +us to build binaries that support all major Linux distributions released after +2014 (cf. Python Enhancement Proposals 599). All Dockerfile used for publishing +are available in `ci/docker/` with prefix `Dockerfile.publish`. 
Apart from that, the script used to create the environment and publish are available under `ci/docker/install`: -- `ubuntu_publish.sh` installs all required dependencies for Ubuntu 14.04 for publishing - `ubuntu_base.sh` installs minimum dependencies required to run the published packages ## Scala publishing diff --git a/ci/qemu/README.md b/ci/qemu/README.md deleted file mode 100644 index 4beca4a03690..000000000000 --- a/ci/qemu/README.md +++ /dev/null @@ -1,92 +0,0 @@ - - - - - - - - - - - - - - - - - -# QEMU base image creation - -This folder contains scripts and configuration to create a QEMU virtual drive with a debian system. - -The order of execution is: -- `init.sh` to download the installation kernel and ramdisk -- `preseed.sh` to preseed the debian installer so it doesn't ask questions -- `copy.sh` to extract the kernel and ramdisk from the installed system -- `run.sh` to boot the system and fine tune the image - -# Description of the process: - -# Preparing the base image - -First, an installation is made using installer kernel and initrd by using the scripts above. - -# After installation, we extract initrd and kernel from the installation drive - -The commands look like this: - -`virt-copy-out -a hda.qcow2 /boot/initrd.img-4.15.0-30-generic-lpae .` - -In the same way for the kernel. - -Then we install packages and dependencies on the qemu image: - -apt install -y sudo python3-dev virtualenv wget libgfortran3 libopenblas-base rsync build-essential -libopenblas-dev libomp5 - -We enable sudo and passwordless logins: - -Add file `/etc/sudoers.d/01-qemu` -With content: -``` -qemu ALL=(ALL) NOPASSWD: ALL -``` - -Edit: `/etc/ssh/sshd_config` - -And set the following options: -``` -PermitEmptyPasswords yes -PasswordAuthentication yes -PermitRootLogin yes -``` - -Disable root and user passwords with `passwd -d` - -Edit ` /etc/pam.d/common-auth` - -Replace `auth [success=1 default=ignore] pam_unix.so nullok_secure` by -``` -auth [success=1 default=ignore] pam_unix.so nullok -``` - -As root to install system wide: - -``` -wget -nv https://bootstrap.pypa.io/get-pip.py -python3 get-pip.py -apt-get clean -``` - -Afterwards install mxnet python3 deps: - -``` -pip3 install -r mxnet_requirements.txt -``` - - -To access qemu control console from tmux: `ctrl-a a c` - -# CI and Testing - -Formally, [runtime_functions.py](https://github.com/apache/incubator-mxnet/blob/master/ci/docker/qemu/runtime_functions.py) would [run](https://github.com/apache/incubator-mxnet/blob/8beea18e3d9835f90b59d3f9de8f9945ac819423/ci/docker/qemu/runtime_functions.py#L81) *pip install -r [mxnet/tests/requirements.txt](https://github.com/apache/incubator-mxnet/blob/master/tests/requirements.txt)*. If the requirements change, there can be an unfortunate side-effect that there are no wheel files for Raspberry Pi for the new requirement. This would trigger a build from source on the emulator, which can take a long time and cause job timeouts. Therefore, we no longer install the `tests/requirements.txt` requirements, but rather rely on [test_requirements.txt](https://github.com/apache/incubator-mxnet/blob/master/ci/qemu/test_requirements.txt) to maintain the requirements for the qemu tests. Should any requirements changes lead to a job time out, it is incumbent on the submitter to update the image to include the requirement and unblock ci. 
diff --git a/ci/qemu/copy.sh b/ci/qemu/copy.sh deleted file mode 100755 index f39a9d083509..000000000000 --- a/ci/qemu/copy.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env bash -exuo pipefail - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Extract kernel from image - -set -ex -virt-copy-out -a vda.qcow2 /boot/vmlinuz-3.16.0-6-armmp-lpae /boot/initrd.img-3.16.0-6-armmp-lpae . diff --git a/ci/qemu/init.sh b/ci/qemu/init.sh deleted file mode 100755 index 1698cb10f272..000000000000 --- a/ci/qemu/init.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env bash -exuo pipefail - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Download the installer and ramdisk for intallation -set -ex -wget -O installer-vmlinuz http://http.us.debian.org/debian/dists/jessie/main/installer-armhf/current/images/netboot/vmlinuz -wget -O installer-initrd.gz http://http.us.debian.org/debian/dists/jessie/main/installer-armhf/current/images/netboot/initrd.gz diff --git a/ci/qemu/initrd_modif/inittab b/ci/qemu/initrd_modif/inittab deleted file mode 100644 index 064512595fbc..000000000000 --- a/ci/qemu/initrd_modif/inittab +++ /dev/null @@ -1,38 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -# /etc/inittab -# busybox init configuration for debian-installer - -# main rc script -::sysinit:/sbin/reopen-console /sbin/debian-installer-startup - -# main setup program -::respawn:/sbin/reopen-console /sbin/debian-installer - -# convenience shells -tty2::askfirst:-/bin/sh -tty3::askfirst:-/bin/sh - -# logging -#tty4::respawn:/usr/bin/tail -f /var/log/syslog - -# Stuff to do before rebooting -::ctrlaltdel:/sbin/shutdown > /dev/null 2>&1 - -# re-exec init on receipt of SIGHUP/SIGUSR1 -::restart:/sbin/init diff --git a/ci/qemu/install.sh b/ci/qemu/install.sh deleted file mode 100755 index 8531b033d074..000000000000 --- a/ci/qemu/install.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -ex -rm -f vda.qcow2 -sudo ./preseed.sh -qemu-img create -f qcow2 vda.qcow2 10G -qemu-system-arm -M virt -m 1024 \ - -kernel installer-vmlinuz \ - -append BOOT_DEBUG=2,DEBIAN_FRONTEND=noninteractive \ - -initrd installer-initrd_automated.gz \ - -drive if=none,file=vda.qcow2,format=qcow2,id=hd \ - -device virtio-blk-device,drive=hd \ - -netdev user,id=mynet \ - -device virtio-net-device,netdev=mynet \ - -nographic -no-reboot diff --git a/ci/qemu/mxnet_requirements.txt b/ci/qemu/mxnet_requirements.txt deleted file mode 100644 index 2ab0fd9612e5..000000000000 --- a/ci/qemu/mxnet_requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -urllib3<1.23,>=1.21.1 -requests<2.19.0,>=2.18.4 -graphviz<0.9.0,>=0.8.1 -numpy>1.16.0,<2.0.0 -mock -nose -nose-timer diff --git a/ci/qemu/preseed.cfg b/ci/qemu/preseed.cfg deleted file mode 100644 index 23a8fc3baebf..000000000000 --- a/ci/qemu/preseed.cfg +++ /dev/null @@ -1,68 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -d-i debian-installer/locale string en_US -d-i keyboard-configuration/xkb-keymap select us -d-i netcfg/get_hostname string debian-qemu -d-i netcfg/get_domain string lab -d-i passwd/root-login boolean true -d-i passwd/root-password password debian -d-i passwd/root-password-again password debian -d-i clock-setup/utc boolean true -d-i mirror/country string US -d-i mirror/https/proxy string -d-i mirror/http/proxy string -d-i mirror/ftp/proxy string -d-i mirror/http/countries select US -d-i mirror/http/hostname string ftp.us.debian.org -d-i mirror/http/mirror select ftp.us.debian.org -d-i localechooser/preferred-locale select en_US.UTF-8 -apt-mirror-setup apt-setup/use_mirror boolean false -apt-mirror-setup apt-setup/mirror/error select Retry -d-i passwd/username string qemu -d-i passwd/user-password password qemu -d-i passwd/user-password-again password qemu -user-setup-udeb passwd/username string qemu -user-setup-udeb passwd/user-fullname string qemu -d-i time/zone string GMT -d-i partman-auto/choose_recipe select atomic -#partman-auto partman-auto/select_disk select /var/lib/partman/devices/=dev=vda -#partman-auto partman-auto/automatically_partition select -#partman-target partman-target/no_root error -#partman-auto partman-auto/init_automatically_partition select 50some_device__________regular -#partman-auto partman-auto/disk string vda -#partman-auto partman-auto/expert_recipe string \ -# boot-root :: \ -# 100 10000 1000000000 ext4 \ -# $primary{ } \ -# lv_name{ root } \ -# method{ format } \ -# format{ } \ -# use_filesystem{ } \ -# filesystem{ ext4 } \ -# mountpoint{ / } . -# -#d-i partman-partitioning/confirm_write_new_label boolean true -#d-i partman/choose_partition select finish -#d-i partman/confirm boolean true -#d-i partman/confirm_nooverwrite boolean true -#partman-base partman/choose_partition select 90finish__________finish -#partman-basicfilesystems partman-basicfilesystems/swap_check_failed boolean -d-i popularity-contest/participate boolean false -d-i tasksel/first multiselect SSH server, standard system utilities -d-i debian-installer/main-menu select Finish the installation -d-i debian-installer/exit/poweroff boolean true diff --git a/ci/qemu/preseed.sh b/ci/qemu/preseed.sh deleted file mode 100755 index ad005548fbbe..000000000000 --- a/ci/qemu/preseed.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env bash -exuo pipefail - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -ex -rm -rf initrd -mkdir -p initrd -cd initrd -gunzip -c ../installer-initrd.gz | cpio -i -cp ../preseed.cfg . -cp ../initrd_modif/inittab etc/inittab -cp ../initrd_modif/S10syslog lib/debian-installer-startup.d/S10syslog -find . | cpio --create --format 'newc' | gzip -c > ../installer-initrd_automated.gz -echo "Done!" 
diff --git a/ci/qemu/run.sh b/ci/qemu/run.sh deleted file mode 100755 index eeff4e1fdccb..000000000000 --- a/ci/qemu/run.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env bash -exuo pipefail - - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -ex -disk=${1:-vda.qcow2} -qemu-system-arm -M virt -m 1024 \ - -kernel vmlinuz-3.16.0-6-armmp-lpae \ - -initrd initrd.img-3.16.0-6-armmp-lpae \ - -smp 4 \ - -append 'root=/dev/vda1' \ - -drive if=none,file=$disk,format=qcow2,id=hd \ - -device virtio-blk-device,drive=hd \ - -netdev user,id=mynet,hostfwd=tcp::2222-:22 \ - -device virtio-net-device,netdev=mynet \ - -nographic -# -display none diff --git a/ci/qemu/test_requirements.txt b/ci/qemu/test_requirements.txt deleted file mode 100644 index 77037d89c673..000000000000 --- a/ci/qemu/test_requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -mock -nose -nose-timer \ No newline at end of file diff --git a/cmake/Modules/FindNCCL.cmake b/cmake/Modules/FindNCCL.cmake index 1bd901cf374e..a94020baf8e3 100644 --- a/cmake/Modules/FindNCCL.cmake +++ b/cmake/Modules/FindNCCL.cmake @@ -45,8 +45,14 @@ find_path(NCCL_INCLUDE_DIRS $ENV{NCCL_DIR}/include ) +if(CMAKE_BUILD_TYPE STREQUAL "Distribution" AND UNIX) + set(NCCL_LIB_NAME "nccl_static") +else() + set(NCCL_LIB_NAME "nccl") +endif() + find_library(NCCL_LIBRARIES - NAMES nccl + NAMES ${NCCL_LIB_NAME} HINTS ${NCCL_LIB_DIR} ${NCCL_ROOT_DIR} @@ -68,7 +74,7 @@ if (UNIX) ) find_library(NCCL_LIBRARIES - NAMES nccl + NAMES ${NCCL_LIB_NAME} PATHS ${search_paths} PATH_SUFFIXES lib ) diff --git a/cmake/upstream/FindCUDAToolkit.cmake b/cmake/upstream/FindCUDAToolkit.cmake index d37c44d9c782..fee4f3f4f698 100644 --- a/cmake/upstream/FindCUDAToolkit.cmake +++ b/cmake/upstream/FindCUDAToolkit.cmake @@ -132,6 +132,7 @@ of the following libraries that are part of the CUDAToolkit: - :ref:`cuRAND` - :ref:`cuSOLVER` - :ref:`cuSPARSE` +- :ref:`cuPTI` - :ref:`NPP` - :ref:`nvBLAS` - :ref:`nvGRAPH` @@ -149,7 +150,6 @@ CUDA Runtime Library The CUDA Runtime library (cudart) are what most applications will typically need to link against to make any calls such as `cudaMalloc`, and `cudaFree`. -They are an explicit dependency of almost every library. Targets Created: @@ -230,6 +230,18 @@ Targets Created: - ``CUDA::cusparse`` - ``CUDA::cusparse_static`` +.. _`cuda_toolkit_cupti`: + +cupti +""""" + +The `NVIDIA CUDA Profiling Tools Interface `_. + +Targets Created: + +- ``CUDA::cupti`` +- ``CUDA::cupti_static`` + .. _`cuda_toolkit_NPP`: NPP @@ -361,8 +373,6 @@ Targets Created: - ``CUDA::nvml`` -.. _`cuda_toolkit_opencl`: - .. _`cuda_toolkit_nvToolsExt`: nvToolsExt @@ -375,6 +385,8 @@ Targets Created: - ``CUDA::nvToolsExt`` +.. 
_`cuda_toolkit_opencl`: + OpenCL """""" @@ -436,6 +448,11 @@ Result variables The path to the CUDA Toolkit library directory that contains the CUDA Runtime library ``cudart``. +``CUDAToolkit_TARGET_DIR`` + The path to the CUDA Toolkit directory including the target architecture + when cross-compiling. When not cross-compiling this will be equivalant to + ``CUDAToolkit_ROOT_DIR``. + ``CUDAToolkit_NVCC_EXECUTABLE`` The path to the NVIDIA CUDA compiler ``nvcc``. Note that this path may **not** be the same as @@ -487,6 +504,7 @@ if(CMAKE_CUDA_COMPILER_LOADED AND NOT CUDAToolkit_BIN_DIR) get_filename_component(cuda_dir "${CMAKE_CUDA_COMPILER}" DIRECTORY) # use the already detected cuda compiler set(CUDAToolkit_BIN_DIR "${cuda_dir}" CACHE PATH "") + mark_as_advanced(CUDAToolkit_BIN_DIR) unset(cuda_dir) endif() @@ -641,6 +659,7 @@ endif() if(NOT CUDAToolkit_BIN_DIR AND CUDAToolkit_NVCC_EXECUTABLE) get_filename_component(cuda_dir "${CUDAToolkit_NVCC_EXECUTABLE}" DIRECTORY) set(CUDAToolkit_BIN_DIR "${cuda_dir}" CACHE PATH "" FORCE) + mark_as_advanced(CUDAToolkit_BIN_DIR) unset(cuda_dir) endif() @@ -669,8 +688,47 @@ endif() get_filename_component(CUDAToolkit_ROOT_DIR ${CUDAToolkit_BIN_DIR} DIRECTORY ABSOLUTE) -# Now that we have the real ROOT_DIR, find components inside it. -list(APPEND CMAKE_PREFIX_PATH ${CUDAToolkit_ROOT_DIR}) +# Handle cross compilation +if(CMAKE_CROSSCOMPILING) + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7-a") + # Support for NVPACK + set (CUDAToolkit_TARGET_NAME "armv7-linux-androideabi") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm") + # Support for arm cross compilation + set(CUDAToolkit_TARGET_NAME "armv7-linux-gnueabihf") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") + # Support for aarch64 cross compilation + if (ANDROID_ARCH_NAME STREQUAL "arm64") + set(CUDAToolkit_TARGET_NAME "aarch64-linux-androideabi") + else() + set(CUDAToolkit_TARGET_NAME "aarch64-linux") + endif (ANDROID_ARCH_NAME STREQUAL "arm64") + elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + set(CUDAToolkit_TARGET_NAME "x86_64-linux") + endif() + + if (EXISTS "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}") + set(CUDAToolkit_TARGET_DIR "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}") + # add known CUDA target root path to the set of directories we search for programs, libraries and headers + list(PREPEND CMAKE_FIND_ROOT_PATH "${CUDAToolkit_TARGET_DIR}") + + # Mark that we need to pop the root search path changes after we have + # found all cuda libraries so that searches for our cross-compilation + # libraries work when another cuda sdk is in CMAKE_PREFIX_PATH or + # PATh + set(_CUDAToolkit_Pop_ROOT_PATH True) + endif() +else() + # Not cross compiling + set(CUDAToolkit_TARGET_DIR "${CUDAToolkit_ROOT_DIR}") + # Now that we have the real ROOT_DIR, find components inside it. + list(APPEND CMAKE_PREFIX_PATH ${CUDAToolkit_ROOT_DIR}) + + # Mark that we need to pop the prefix path changes after we have + # found the cudart library. 
+ set(_CUDAToolkit_Pop_Prefix True) +endif() + # Find the include/ directory find_path(CUDAToolkit_INCLUDE_DIR @@ -680,14 +738,17 @@ find_path(CUDAToolkit_INCLUDE_DIR # And find the CUDA Runtime Library libcudart find_library(CUDA_CUDART NAMES cudart - PATH_SUFFIXES lib64 lib/x64 + PATH_SUFFIXES lib64 lib64/stubs lib/x64 ) if (NOT CUDA_CUDART AND NOT CUDAToolkit_FIND_QUIETLY) message(STATUS "Unable to find cudart library.") endif() unset(CUDAToolkit_ROOT_DIR) -list(REMOVE_AT CMAKE_PREFIX_PATH -1) +if(_CUDAToolkit_Pop_Prefix) + list(REMOVE_AT CMAKE_PREFIX_PATH -1) + unset(_CUDAToolkit_Pop_Prefix) +endif() #----------------------------------------------------------------------------- # Perform version comparison and validate all required variables are set. @@ -702,6 +763,10 @@ find_package_handle_standard_args(CUDAToolkit VERSION_VAR CUDAToolkit_VERSION ) +mark_as_advanced(CUDA_CUDART + CUDAToolkit_INCLUDE_DIR + CUDAToolkit_NVCC_EXECUTABLE + ) #----------------------------------------------------------------------------- # Construct result variables @@ -714,78 +779,103 @@ endif() # Construct import targets if(CUDAToolkit_FOUND) - function(find_and_add_cuda_import_lib lib_name) + function(_CUDAToolkit_find_and_add_import_lib lib_name) + cmake_parse_arguments(arg "" "" "ALT;DEPS;EXTRA_PATH_SUFFIXES" ${ARGN}) - if(ARGC GREATER 1) - set(search_names ${ARGN}) - else() - set(search_names ${lib_name}) - endif() + set(search_names ${lib_name} ${arg_ALT}) find_library(CUDA_${lib_name}_LIBRARY NAMES ${search_names} - PATHS ${CUDAToolkit_LIBRARY_DIR} + HINTS ${CUDAToolkit_LIBRARY_DIR} ENV CUDA_PATH - PATH_SUFFIXES nvidia/current lib64 lib/x64 lib + PATH_SUFFIXES nvidia/current lib64 lib64/stubs lib/x64 lib lib/stubs stubs + ${arg_EXTRA_PATH_SUFFIXES} ) + mark_as_advanced(CUDA_${lib_name}_LIBRARY) - if (NOT CUDA::${lib_name} AND CUDA_${lib_name}_LIBRARY) + if (NOT TARGET CUDA::${lib_name} AND CUDA_${lib_name}_LIBRARY) add_library(CUDA::${lib_name} IMPORTED INTERFACE) target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") target_link_libraries(CUDA::${lib_name} INTERFACE "${CUDA_${lib_name}_LIBRARY}") + foreach(dep ${arg_DEPS}) + if(TARGET CUDA::${dep}) + target_link_libraries(CUDA::${lib_name} INTERFACE CUDA::${dep}) + endif() + endforeach() endif() endfunction() - function(add_cuda_link_dependency lib_name) - foreach(dependency IN LISTS ${ARGN}) - target_link_libraries(CUDA::${lib_name} INTERFACE CUDA::${dependency}) - endforeach() - endfunction() + if(NOT TARGET CUDA::toolkit) + add_library(CUDA::toolkit IMPORTED INTERFACE) + target_include_directories(CUDA::toolkit SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") + target_link_directories(CUDA::toolkit INTERFACE "${CUDAToolkit_LIBRARY_DIR}") + endif() - add_library(CUDA::toolkit IMPORTED INTERFACE) - target_include_directories(CUDA::toolkit SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") - target_link_directories(CUDA::toolkit INTERFACE "${CUDAToolkit_LIBRARY_DIR}") + _CUDAToolkit_find_and_add_import_lib(cuda_driver ALT cuda) + _CUDAToolkit_find_and_add_import_lib(cudart) + _CUDAToolkit_find_and_add_import_lib(cudart_static) - find_and_add_cuda_import_lib(cuda_driver cuda) + # setup dependencies that are required for cudart_static when building + # on linux. 
These are generally only required when using the CUDA toolkit + # when CUDA language is disabled + if(NOT TARGET CUDA::cudart_static_deps + AND TARGET CUDA::cudart_static) - find_and_add_cuda_import_lib(cudart) - find_and_add_cuda_import_lib(cudart_static) + add_library(CUDA::cudart_static_deps IMPORTED INTERFACE) + target_link_libraries(CUDA::cudart_static INTERFACE CUDA::cudart_static_deps) - foreach (cuda_lib cublas cufft cufftw curand cusolver cusparse nvgraph nvjpeg) - find_and_add_cuda_import_lib(${cuda_lib}) - add_cuda_link_dependency(${cuda_lib} cudart) + if(UNIX AND (CMAKE_C_COMPILER OR CMAKE_CXX_COMPILER)) + find_package(Threads REQUIRED) + target_link_libraries(CUDA::cudart_static_deps INTERFACE Threads::Threads ${CMAKE_DL_LIBS}) + endif() - find_and_add_cuda_import_lib(${cuda_lib}_static) - add_cuda_link_dependency(${cuda_lib}_static cudart_static) + if(UNIX AND NOT APPLE) + # On Linux, you must link against librt when using the static cuda runtime. + find_library(CUDAToolkit_rt_LIBRARY rt) + mark_as_advanced(CUDAToolkit_rt_LIBRARY) + if(NOT CUDAToolkit_rt_LIBRARY) + message(WARNING "Could not find librt library, needed by CUDA::cudart_static") + else() + target_link_libraries(CUDA::cudart_static_deps INTERFACE ${CUDAToolkit_rt_LIBRARY}) + endif() + endif() + endif() + + _CUDAToolkit_find_and_add_import_lib(culibos) # it's a static library + foreach (cuda_lib cublas cufft curand cusparse nppc nvjpeg) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib}) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS culibos) endforeach() + # cuFFTW depends on cuFFT + _CUDAToolkit_find_and_add_import_lib(cufftw DEPS cufft) + _CUDAToolkit_find_and_add_import_lib(cufftw DEPS cufft_static) + # cuSOLVER depends on cuBLAS, and cuSPARSE - add_cuda_link_dependency(cusolver cublas cusparse) - add_cuda_link_dependency(cusolver_static cublas_static cusparse) + _CUDAToolkit_find_and_add_import_lib(cusolver DEPS cublas cusparse) + _CUDAToolkit_find_and_add_import_lib(cusolver_static DEPS cublas_static cusparse_static culibos) # nvGRAPH depends on cuRAND, and cuSOLVER. - add_cuda_link_dependency(nvgraph curand cusolver) - add_cuda_link_dependency(nvgraph_static curand_static cusolver_static) - - find_and_add_cuda_import_lib(nppc) - find_and_add_cuda_import_lib(nppc_static) - - add_cuda_link_dependency(nppc cudart) - add_cuda_link_dependency(nppc_static cudart_static culibos) + _CUDAToolkit_find_and_add_import_lib(nvgraph DEPS curand cusolver) + _CUDAToolkit_find_and_add_import_lib(nvgraph_static DEPS curand_static cusolver_static) # Process the majority of the NPP libraries. 
foreach (cuda_lib nppial nppicc nppidei nppif nppig nppim nppist nppitc npps nppicom nppisu) - find_and_add_cuda_import_lib(${cuda_lib}) - find_and_add_cuda_import_lib(${cuda_lib}_static) - add_cuda_link_dependency(${cuda_lib} nppc) - add_cuda_link_dependency(${cuda_lib}_static nppc_static) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib} DEPS nppc) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS nppc_static) endforeach() - find_and_add_cuda_import_lib(nvrtc) - add_cuda_link_dependency(nvrtc cuda_driver) + _CUDAToolkit_find_and_add_import_lib(cupti + EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/ + ../extras/CUPTI/lib/) + _CUDAToolkit_find_and_add_import_lib(cupti_static + EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/ + ../extras/CUPTI/lib/) + + _CUDAToolkit_find_and_add_import_lib(nvrtc DEPS cuda_driver) - find_and_add_cuda_import_lib(nvml nvidia-ml nvml) + _CUDAToolkit_find_and_add_import_lib(nvml ALT nvidia-ml nvml) if(WIN32) # nvtools can be installed outside the CUDA toolkit directory @@ -798,17 +888,12 @@ if(CUDAToolkit_FOUND) PATH_SUFFIXES lib/x64 lib ) endif() - find_and_add_cuda_import_lib(nvToolsExt nvToolsExt nvToolsExt64) + _CUDAToolkit_find_and_add_import_lib(nvToolsExt ALT nvToolsExt64) - add_cuda_link_dependency(nvToolsExt cudart) - - find_and_add_cuda_import_lib(OpenCL) - - find_and_add_cuda_import_lib(culibos) - if(TARGET CUDA::culibos) - foreach (cuda_lib cublas cufft cusparse curand nvjpeg) - add_cuda_link_dependency(${cuda_lib}_static culibos) - endforeach() - endif() + _CUDAToolkit_find_and_add_import_lib(OpenCL) +endif() +if(_CUDAToolkit_Pop_ROOT_PATH) + list(REMOVE_AT CMAKE_FIND_ROOT_PATH 0) + unset(_CUDAToolkit_Pop_ROOT_PATH) endif() diff --git a/config/distribution/linux_cu100.cmake b/config/distribution/linux_cu100.cmake index bdbec7e63005..250f494d0963 100644 --- a/config/distribution/linux_cu100.cmake +++ b/config/distribution/linux_cu100.cmake @@ -20,7 +20,8 @@ set(CFLAGS "-mno-avx" CACHE STRING "CFLAGS") set(CXXFLAGS "-mno-avx" CACHE STRING "CXXFLAGS") set(USE_CUDA ON CACHE BOOL "Build with CUDA support") -set(USE_CUDNN ON CACHE BOOL "Build with CUDA support") +set(USE_CUDNN ON CACHE BOOL "Build with CUDNN support") +set(USE_NCCL ON CACHE BOOL "Build with NCCL support") set(USE_OPENCV ON CACHE BOOL "Build with OpenCV support") set(USE_OPENMP ON CACHE BOOL "Build with Openmp support") set(USE_MKL_IF_AVAILABLE OFF CACHE BOOL "Use Intel MKL if found") diff --git a/config/distribution/linux_cu101.cmake b/config/distribution/linux_cu101.cmake index fd773e88193b..ab11bcf69067 100644 --- a/config/distribution/linux_cu101.cmake +++ b/config/distribution/linux_cu101.cmake @@ -22,7 +22,8 @@ set(CFLAGS "-mno-avx" CACHE STRING "CFLAGS") set(CXXFLAGS "-mno-avx" CACHE STRING "CXXFLAGS") set(USE_CUDA ON CACHE BOOL "Build with CUDA support") -set(USE_CUDNN ON CACHE BOOL "Build with CUDA support") +set(USE_CUDNN ON CACHE BOOL "Build with CUDNN support") +set(USE_NCCL ON CACHE BOOL "Build with NCCL support") set(USE_OPENCV ON CACHE BOOL "Build with OpenCV support") set(USE_OPENMP ON CACHE BOOL "Build with Openmp support") set(USE_MKL_IF_AVAILABLE OFF CACHE BOOL "Use Intel MKL if found") diff --git a/config/distribution/linux_cu102.cmake b/config/distribution/linux_cu102.cmake index 9f740f543ecb..9e2848c7fed6 100644 --- a/config/distribution/linux_cu102.cmake +++ b/config/distribution/linux_cu102.cmake @@ -20,7 +20,8 @@ set(CFLAGS "-mno-avx" CACHE STRING "CFLAGS") set(CXXFLAGS "-mno-avx" CACHE STRING "CXXFLAGS") set(USE_CUDA ON CACHE BOOL "Build with CUDA support") 
-set(USE_CUDNN ON CACHE BOOL "Build with CUDA support") +set(USE_CUDNN ON CACHE BOOL "Build with CUDNN support") +set(USE_NCCL ON CACHE BOOL "Build with NCCL support") set(USE_OPENCV ON CACHE BOOL "Build with OpenCV support") set(USE_OPENMP ON CACHE BOOL "Build with Openmp support") set(USE_MKL_IF_AVAILABLE OFF CACHE BOOL "Use Intel MKL if found") diff --git a/config/distribution/linux_cu75.cmake b/config/distribution/linux_cu75.cmake deleted file mode 100644 index 91ef97150519..000000000000 --- a/config/distribution/linux_cu75.cmake +++ /dev/null @@ -1,35 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set(CMAKE_BUILD_TYPE "Distribution" CACHE STRING "Build type") -set(CFLAGS "-mno-avx" CACHE STRING "CFLAGS") -set(CXXFLAGS "-mno-avx" CACHE STRING "CXXFLAGS") - -set(USE_CUDA ON CACHE BOOL "Build with CUDA support") -set(USE_CUDNN ON CACHE BOOL "Build with CUDA support") -set(USE_OPENCV ON CACHE BOOL "Build with OpenCV support") -set(USE_OPENMP ON CACHE BOOL "Build with Openmp support") -set(USE_MKL_IF_AVAILABLE OFF CACHE BOOL "Use Intel MKL if found") -set(USE_MKLDNN ON CACHE BOOL "Build with MKL-DNN support") -set(USE_LAPACK ON CACHE BOOL "Build with lapack support") -set(USE_TVM_OP OFF CACHE BOOL "Enable use of TVM operator build system.") -set(USE_SSE ON CACHE BOOL "Build with x86 SSE instruction support") -set(USE_F16C OFF CACHE BOOL "Build with x86 F16C instruction support") -set(USE_LIBJPEG_TURBO ON CACHE BOOL "Build with libjpeg-turbo") - -set(CUDACXX "/usr/local/cuda-7.5/bin/nvcc" CACHE STRING "Cuda compiler") -set(MXNET_CUDA_ARCH "3.0;3.5;5.0;5.2" CACHE STRING "Cuda architectures") diff --git a/config/distribution/linux_cu80.cmake b/config/distribution/linux_cu80.cmake deleted file mode 100644 index 6b98538e6c89..000000000000 --- a/config/distribution/linux_cu80.cmake +++ /dev/null @@ -1,35 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -set(CMAKE_BUILD_TYPE "Distribution" CACHE STRING "Build type") -set(CFLAGS "-mno-avx" CACHE STRING "CFLAGS") -set(CXXFLAGS "-mno-avx" CACHE STRING "CXXFLAGS") - -set(USE_CUDA ON CACHE BOOL "Build with CUDA support") -set(USE_CUDNN ON CACHE BOOL "Build with CUDA support") -set(USE_OPENCV ON CACHE BOOL "Build with OpenCV support") -set(USE_OPENMP ON CACHE BOOL "Build with Openmp support") -set(USE_MKL_IF_AVAILABLE OFF CACHE BOOL "Use Intel MKL if found") -set(USE_MKLDNN ON CACHE BOOL "Build with MKL-DNN support") -set(USE_LAPACK ON CACHE BOOL "Build with lapack support") -set(USE_TVM_OP OFF CACHE BOOL "Enable use of TVM operator build system.") -set(USE_SSE ON CACHE BOOL "Build with x86 SSE instruction support") -set(USE_F16C OFF CACHE BOOL "Build with x86 F16C instruction support") -set(USE_LIBJPEG_TURBO ON CACHE BOOL "Build with libjpeg-turbo") - -set(CUDACXX "/usr/local/cuda-8.0/bin/nvcc" CACHE STRING "Cuda compiler") -set(MXNET_CUDA_ARCH "3.0;5.0;6.0;6.2" CACHE STRING "Cuda architectures") diff --git a/config/distribution/linux_cu90.cmake b/config/distribution/linux_cu90.cmake index 1932a320f615..e4249cd609c8 100644 --- a/config/distribution/linux_cu90.cmake +++ b/config/distribution/linux_cu90.cmake @@ -20,7 +20,8 @@ set(CFLAGS "-mno-avx" CACHE STRING "CFLAGS") set(CXXFLAGS "-mno-avx" CACHE STRING "CXXFLAGS") set(USE_CUDA ON CACHE BOOL "Build with CUDA support") -set(USE_CUDNN ON CACHE BOOL "Build with CUDA support") +set(USE_CUDNN ON CACHE BOOL "Build with CUDNN support") +set(USE_NCCL ON CACHE BOOL "Build with NCCL support") set(USE_OPENCV ON CACHE BOOL "Build with OpenCV support") set(USE_OPENMP ON CACHE BOOL "Build with Openmp support") set(USE_MKL_IF_AVAILABLE OFF CACHE BOOL "Use Intel MKL if found") diff --git a/config/distribution/linux_cu91.cmake b/config/distribution/linux_cu91.cmake index 36e10a624e40..a239ada43454 100644 --- a/config/distribution/linux_cu91.cmake +++ b/config/distribution/linux_cu91.cmake @@ -20,7 +20,8 @@ set(CFLAGS "-mno-avx" CACHE STRING "CFLAGS") set(CXXFLAGS "-mno-avx" CACHE STRING "CXXFLAGS") set(USE_CUDA ON CACHE BOOL "Build with CUDA support") -set(USE_CUDNN ON CACHE BOOL "Build with CUDA support") +set(USE_CUDNN ON CACHE BOOL "Build with CUDNN support") +set(USE_NCCL ON CACHE BOOL "Build with NCCL support") set(USE_OPENCV ON CACHE BOOL "Build with OpenCV support") set(USE_OPENMP ON CACHE BOOL "Build with Openmp support") set(USE_MKL_IF_AVAILABLE OFF CACHE BOOL "Use Intel MKL if found") diff --git a/config/distribution/linux_cu92.cmake b/config/distribution/linux_cu92.cmake index 285daccdabc0..74f31c8ae031 100644 --- a/config/distribution/linux_cu92.cmake +++ b/config/distribution/linux_cu92.cmake @@ -20,7 +20,8 @@ set(CFLAGS "-mno-avx" CACHE STRING "CFLAGS") set(CXXFLAGS "-mno-avx" CACHE STRING "CXXFLAGS") set(USE_CUDA ON CACHE BOOL "Build with CUDA support") -set(USE_CUDNN ON CACHE BOOL "Build with CUDA support") +set(USE_CUDNN ON CACHE BOOL "Build with CUDNN support") +set(USE_NCCL ON CACHE BOOL "Build with NCCL support") set(USE_OPENCV ON CACHE BOOL "Build with OpenCV support") set(USE_OPENMP ON CACHE BOOL "Build with Openmp support") set(USE_MKL_IF_AVAILABLE OFF CACHE BOOL "Use Intel MKL if found") diff --git a/cpp-package/example/Makefile b/cpp-package/example/Makefile index 237ab96a3e32..d42cf455386c 100644 --- a/cpp-package/example/Makefile +++ b/cpp-package/example/Makefile @@ -46,7 +46,7 @@ debug: CPPEX_CFLAGS += -DDEBUG -g debug: prebuild all $(CPPEX_EXE):% : %.cpp - $(CXX) -std=c++11 $(CFLAGS) $(CPPEX_CFLAGS) -o build/$@ $(filter %.cpp 
%.a, $^) $(CPPEX_EXTRA_LDFLAGS) + $(CXX) -std=c++17 $(CFLAGS) $(CPPEX_CFLAGS) -o build/$@ $(filter %.cpp %.a, $^) $(CPPEX_EXTRA_LDFLAGS) ifeq ($(UNAME_S), Darwin) install_name_tool -add_rpath @loader_path build/$@ install_name_tool -add_rpath $(MXNET_LIB_PATH) build/$@ diff --git a/cpp-package/example/example.mk b/cpp-package/example/example.mk index ef99d7426414..cf92e4076d18 100644 --- a/cpp-package/example/example.mk +++ b/cpp-package/example/example.mk @@ -30,8 +30,8 @@ cpp-package-example-all: cpp-package-all $(CPPEX_EXE) build/cpp-package/example/% : cpp-package/example/%.cpp lib/libmxnet.so $(CPP_PACKAGE_OP_H_FILE) @mkdir -p $(@D) - $(CXX) -std=c++11 $(CFLAGS) $(CPPEX_CFLAGS) -MM -MT cpp-package/example/$* $< >build/cpp-package/example//$*.d - $(CXX) -std=c++11 $(CFLAGS) $(CPPEX_CFLAGS) -o $@ $(filter %.cpp %.a, $^) $(LDFLAGS) $(CPPEX_EXTRA_LDFLAGS) + $(CXX) -std=c++17 $(CFLAGS) $(CPPEX_CFLAGS) -MM -MT cpp-package/example/$* $< >build/cpp-package/example//$*.d + $(CXX) -std=c++17 $(CFLAGS) $(CPPEX_CFLAGS) -o $@ $(filter %.cpp %.a, $^) $(LDFLAGS) $(CPPEX_EXTRA_LDFLAGS) cpp-package-example-clean: rm -rf build/cpp-package/example/* diff --git a/cpp-package/example/feature_extract/Makefile b/cpp-package/example/feature_extract/Makefile index 193eaa7e850b..084b60632729 100644 --- a/cpp-package/example/feature_extract/Makefile +++ b/cpp-package/example/feature_extract/Makefile @@ -27,12 +27,12 @@ LDFLAGS=$(COMMFLAGS) -L ../../../lib -lmxnet $(BLAS) $(CUDA) -lgomp -pthread all: feature_extract prepare_data_with_opencv feature_extract: ./feature_extract.cpp - $(CXX) -c -std=c++11 $(CFLAGS) $^ + $(CXX) -c -std=c++17 $(CFLAGS) $^ $(CXX) $(basename $@).o -o $@ $(LDFLAGS) -rm -f $(basename $@).o prepare_data_with_opencv: ./prepare_data_with_opencv.cpp - $(CXX) -c -std=c++11 $(OPENCV_CFLAGS) $^ + $(CXX) -c -std=c++17 $(OPENCV_CFLAGS) $^ $(CXX) $(basename $@).o -o $@ $(OPENCV_LDFLAGS) -rm -f $(basename $@).o diff --git a/cpp-package/example/inference/Makefile b/cpp-package/example/inference/Makefile index 5efe6cfb68e5..a0ec819e3749 100644 --- a/cpp-package/example/inference/Makefile +++ b/cpp-package/example/inference/Makefile @@ -34,7 +34,7 @@ debug: all $(CPPEX_EXE):% : %.cpp - $(CXX) -std=c++0x $(CFLAGS) $(CPPEX_CFLAGS) -o $@ $(filter %.cpp %.a, $^) $(CPPEX_EXTRA_LDFLAGS) + $(CXX) -std=c++17 $(CFLAGS) $(CPPEX_CFLAGS) -o $@ $(filter %.cpp %.a, $^) $(CPPEX_EXTRA_LDFLAGS) clean: rm -f $(CPPEX_EXE) diff --git a/cpp-package/example/inference/inference.mk b/cpp-package/example/inference/inference.mk index b03055395f21..7708db6e029a 100644 --- a/cpp-package/example/inference/inference.mk +++ b/cpp-package/example/inference/inference.mk @@ -30,8 +30,8 @@ cpp-package-inference-example-all: cpp-package-all $(CPPEX_EXE) build/cpp-package/example/% : cpp-package/example/inference/%.cpp lib/libmxnet.so $(CPP_PACKAGE_OP_H_FILE) @mkdir -p $(@D) - $(CXX) -std=c++11 $(CFLAGS) $(CPPEX_CFLAGS) -MM -MT cpp-package/example/inference/$* $< >build/cpp-package/example/$*.d - $(CXX) -std=c++11 $(CFLAGS) $(CPPEX_CFLAGS) -o $@ $(filter %.cpp %.a, $^) $(LDFLAGS) $(CPPEX_EXTRA_LDFLAGS) + $(CXX) -std=c++17 $(CFLAGS) $(CPPEX_CFLAGS) -MM -MT cpp-package/example/inference/$* $< >build/cpp-package/example/$*.d + $(CXX) -std=c++17 $(CFLAGS) $(CPPEX_CFLAGS) -o $@ $(filter %.cpp %.a, $^) $(LDFLAGS) $(CPPEX_EXTRA_LDFLAGS) cpp-package-inference-example-clean: rm -rf build/cpp-package/example/inference* diff --git a/example/image-classification/predict-cpp/Makefile b/example/image-classification/predict-cpp/Makefile index 
5c084119b966..05f1afc53821 100644 --- a/example/image-classification/predict-cpp/Makefile +++ b/example/image-classification/predict-cpp/Makefile @@ -1,5 +1,5 @@ # Special thanks to https://github.com/pertusa for the Makefile -CFLAGS=-std=c++11 -Wno-unknown-pragmas -Wall +CFLAGS=-std=c++17 -Wno-unknown-pragmas -Wall # Added for openblas # export OPENBLAS_ROOT=/usr/local/opt/openblas @@ -22,8 +22,8 @@ image-classification-predict: image-classification-predict.o image-classification-predict.o: image-classification-predict.cc g++ -O3 -c image-classification-predict.cc ${CFLAGS} - -clean: + +clean: rm image-classification-predict rm -f *.d *.o diff --git a/example/multi_threaded_inference/Makefile b/example/multi_threaded_inference/Makefile index 3189738fbfff..a58928b12759 100644 --- a/example/multi_threaded_inference/Makefile +++ b/example/multi_threaded_inference/Makefile @@ -16,7 +16,7 @@ # under the License. -CFLAGS=-std=c++11 -g -Wno-unknown-pragmas -Wall -DMXNET_USE_CUDA=1 -DMXNET_USE_CUDNN=1 -DMXNET_USE_MKLDNN=1 +CFLAGS=-std=c++17 -g -Wno-unknown-pragmas -Wall -DMXNET_USE_CUDA=1 -DMXNET_USE_CUDNN=1 -DMXNET_USE_MKLDNN=1 export MXNET_ROOT = `pwd`/../.. export CPP_PACKAGE = $(MXNET_ROOT)/cpp-package diff --git a/example/multi_threaded_inference/multi_threaded_inference.cc b/example/multi_threaded_inference/multi_threaded_inference.cc index e90d55307e53..8b1864feea93 100644 --- a/example/multi_threaded_inference/multi_threaded_inference.cc +++ b/example/multi_threaded_inference/multi_threaded_inference.cc @@ -34,6 +34,7 @@ #include #include #include "mxnet-cpp/MxNetCpp.h" +#include const float DEFAULT_MEAN = 117.0; @@ -248,7 +249,9 @@ void run_inference(const std::string& model_name, const std::vector distribution(0, 5); + int sleep_time = distribution(generator); std::this_thread::sleep_for(std::chrono::seconds(sleep_time)); } int num_output = 0; diff --git a/example/rnn/large_word_lm/setup.py b/example/rnn/large_word_lm/setup.py index 09c4fb0965a9..54404c183ed4 100644 --- a/example/rnn/large_word_lm/setup.py +++ b/example/rnn/large_word_lm/setup.py @@ -24,5 +24,5 @@ setup(ext_modules = cythonize(Extension(extension_name, sources=sources, language="c++", - extra_compile_args=["-std=c++11"], + extra_compile_args=["-std=c++17"], include_dirs=[numpy.get_include()]))) diff --git a/include/mxnet/base.h b/include/mxnet/base.h index 860028393e49..aa0021d543a0 100644 --- a/include/mxnet/base.h +++ b/include/mxnet/base.h @@ -38,18 +38,6 @@ #include "tuple.h" -/*! - * \brief define compatible keywords in g++ - * Used to support g++-4.6 and g++4.7 - */ -#if DMLC_USE_CXX11 && defined(__GNUC__) && !defined(__clang_version__) -#if __GNUC__ == 4 && __GNUC_MINOR__ < 8 -#error "Currently we need g++ 4.8 or higher to fully support c++11 features" -#define override -#define final -#endif -#endif - /*! * \brief define dllexport for Visual Studio */ diff --git a/make/crosscompile.jetson.mk b/make/crosscompile.jetson.mk deleted file mode 100644 index 880e2cf5b466..000000000000 --- a/make/crosscompile.jetson.mk +++ /dev/null @@ -1,216 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -#------------------------------------------------------------------------------- -# Template configuration for compiling mxnet -# -# If you want to change the configuration, please use the following -# steps. Assume you are on the root directory of mxnet. First copy the this -# file so that any local changes will be ignored by git -# -# $ cp make/config.mk . -# -# Next modify the according entries, and then compile by -# -# $ make -# -# or build in parallel with 8 threads -# -# $ make -j8 -#------------------------------------------------------------------------------- - -#--------------------- -# For cross compilation we only explictily set a compiler when one is not already present. -#-------------------- - -ifndef CC -export CC = gcc -endif -ifndef CXX -export CXX = g++ -endif -ifndef NVCC -export NVCC = nvcc -endif - -# whether compile with options for MXNet developer -DEV = 0 - -# whether compile with debug -DEBUG = 0 - -# whether to turn on segfault signal handler to log the stack trace -USE_SIGNAL_HANDLER = 1 - -# the additional link flags you want to add -ADD_LDFLAGS = -L${CROSS_ROOT}/lib -L/usr/lib/aarch64-linux-gnu/ - -# the additional compile flags you want to add -ADD_CFLAGS = -I${CROSS_ROOT}/include -I/usr/include/aarch64-linux-gnu/ - -#--------------------------------------------- -# matrix computation libraries for CPU/GPU -#--------------------------------------------- - -# whether use CUDA during compile -USE_CUDA = 1 - -# add the path to CUDA library to link and compile flag -# if you have already add them to environment variable, leave it as NONE -# USE_CUDA_PATH = /usr/local/cuda -USE_CUDA_PATH = /usr/local/cuda-9.0/targets/aarch64-linux - -# whether to enable CUDA runtime compilation -ENABLE_CUDA_RTC = 0 - -# whether use CuDNN R3 library -USE_CUDNN = 1 - -#whether to use NCCL library -USE_NCCL = 0 -#add the path to NCCL library -USE_NCCL_PATH = NONE - -# whether use opencv during compilation -# you can disable it, however, you will not able to use -# imbin iterator -USE_OPENCV = 0 -# Add OpenCV include path, in which the directory `opencv2` exists -USE_OPENCV_INC_PATH = NONE -# Add OpenCV shared library path, in which the shared library exists -USE_OPENCV_LIB_PATH = NONE - -#whether use libjpeg-turbo for image decode without OpenCV wrapper -USE_LIBJPEG_TURBO = 0 -#add the path to libjpeg-turbo library -USE_LIBJPEG_TURBO_PATH = NONE - -# use openmp for parallelization -USE_OPENMP = 1 - -# whether use MKL-DNN library -USE_MKLDNN = 0 - -# whether use NNPACK library -USE_NNPACK = 0 - -# choose the version of blas you want to use -# can be: mkl, blas, atlas, openblas -# in default use atlas for linux while apple for osx -UNAME_S := $(shell uname -s) -USE_BLAS = openblas - -# whether use lapack during compilation -# only effective when compiled with blas versions openblas/apple/atlas/mkl -USE_LAPACK = 1 - -# path to lapack library in case of a non-standard installation -USE_LAPACK_PATH = - -# add path to intel library, you may need it for MKL, if you did not add the path -# to environment variable -USE_INTEL_PATH = NONE - -# If use MKL only for BLAS, choose static 
link automatically to allow python wrapper -ifeq ($(USE_BLAS), mkl) -USE_STATIC_MKL = 1 -else -USE_STATIC_MKL = NONE -endif - -#---------------------------- -# Settings for power and arm arch -#---------------------------- -USE_SSE=0 - -# Turn off F16C instruction set support -USE_F16C=0 - -#---------------------------- -# distributed computing -#---------------------------- - -# whether or not to enable multi-machine supporting -USE_DIST_KVSTORE = 0 - -# whether or not allow to read and write HDFS directly. If yes, then hadoop is -# required -USE_HDFS = 0 - -# path to libjvm.so. required if USE_HDFS=1 -LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server - -# whether or not allow to read and write AWS S3 directly. If yes, then -# libcurl4-openssl-dev is required, it can be installed on Ubuntu by -# sudo apt-get install -y libcurl4-openssl-dev -USE_S3 = 0 - -#---------------------------- -# performance settings -#---------------------------- -# Use operator tuning -USE_OPERATOR_TUNING = 1 - -# Use gperftools if found -# Disable because of #8968 -USE_GPERFTOOLS = 0 - -# path to gperftools (tcmalloc) library in case of a non-standard installation -USE_GPERFTOOLS_PATH = - -# Use JEMalloc if found, and not using gperftools -USE_JEMALLOC = 1 - -# path to jemalloc library in case of a non-standard installation -USE_JEMALLOC_PATH = - -#---------------------------- -# additional operators -#---------------------------- - -# path to folders containing projects specific operators that you don't want to put in src/operators -EXTRA_OPERATORS = - -#---------------------------- -# other features -#---------------------------- - -# Create C++ interface package -USE_CPP_PACKAGE = 0 - -# Use int64_t type to represent the total number of elements in the tensor -# This will cause performance degradation reported in issue #14496 -# Set to 1 for large tensor with tensor size greater than INT32_MAX i.e. 2147483647 -# Note: the size of each dimension is still bounded by INT32_MAX -USE_INT64_TENSOR_SIZE = 0 - -#---------------------------- -# plugins -#---------------------------- - -# whether to use caffe integration. This requires installing caffe. -# You also need to add CAFFE_PATH/build/lib to your LD_LIBRARY_PATH -# CAFFE_PATH = $(HOME)/caffe -# MXNET_PLUGINS += plugin/caffe/caffe.mk - -# WARPCTC_PATH = $(HOME)/warp-ctc -# MXNET_PLUGINS += plugin/warpctc/warpctc.mk - -# whether to use sframe integration. 
This requires build sframe -# git@github.com:dato-code/SFrame.git -# SFRAME_PATH = $(HOME)/SFrame -# MXNET_PLUGINS += plugin/sframe/plugin.mk diff --git a/make/staticbuild/linux_cu100.mk b/make/staticbuild/linux_cu100.mk index 862c1f56f8ae..855485c5b6df 100644 --- a/make/staticbuild/linux_cu100.mk +++ b/make/staticbuild/linux_cu100.mk @@ -37,7 +37,11 @@ DEBUG = 0 USE_SIGNAL_HANDLER = 1 # the additional link flags you want to add +ifdef USE_SYSTEM_CUDA +ADD_LDFLAGS += -L$(DEPS_PATH)/lib -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections +else ADD_LDFLAGS += -L$(DEPS_PATH)/lib $(DEPS_PATH)/lib/libculibos.a -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections +endif # the additional compile flags you want to add ADD_CFLAGS += -I$(DEPS_PATH)/include -ffunction-sections -fdata-sections @@ -66,7 +70,11 @@ USE_CUDA = 1 # add the path to CUDA library to link and compile flag # if you have already add them to environment variable, leave it as NONE # USE_CUDA_PATH = /usr/local/cuda +ifdef USE_SYSTEM_CUDA +USE_CUDA_PATH = /usr/local/cuda-10.0 +else USE_CUDA_PATH = $(DEPS_PATH)/usr/local/cuda-10.0 +endif # whether to use CuDNN library USE_CUDNN = 1 diff --git a/make/staticbuild/linux_cu101.mk b/make/staticbuild/linux_cu101.mk index 6161431454ba..7bbde85bee11 100644 --- a/make/staticbuild/linux_cu101.mk +++ b/make/staticbuild/linux_cu101.mk @@ -37,7 +37,11 @@ DEBUG = 0 USE_SIGNAL_HANDLER = 1 # the additional link flags you want to add +ifdef USE_SYSTEM_CUDA +ADD_LDFLAGS += -L$(DEPS_PATH)/lib -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections +else ADD_LDFLAGS += -L$(DEPS_PATH)/lib $(DEPS_PATH)/lib/libculibos.a -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections +endif # the additional compile flags you want to add ADD_CFLAGS += -I$(DEPS_PATH)/include -ffunction-sections -fdata-sections @@ -66,7 +70,11 @@ USE_CUDA = 1 # add the path to CUDA library to link and compile flag # if you have already add them to environment variable, leave it as NONE # USE_CUDA_PATH = /usr/local/cuda +ifdef USE_SYSTEM_CUDA +USE_CUDA_PATH = /usr/local/cuda-10.1 +else USE_CUDA_PATH = $(DEPS_PATH)/usr/local/cuda-10.1 +endif # whether to use CuDNN library USE_CUDNN = 1 diff --git a/make/staticbuild/linux_cu102.mk b/make/staticbuild/linux_cu102.mk index 4bc649fb5423..963842a19cff 100644 --- a/make/staticbuild/linux_cu102.mk +++ b/make/staticbuild/linux_cu102.mk @@ -37,7 +37,11 @@ DEBUG = 0 USE_SIGNAL_HANDLER = 1 # the additional link flags you want to add +ifdef USE_SYSTEM_CUDA +ADD_LDFLAGS += -L$(DEPS_PATH)/lib -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections +else ADD_LDFLAGS += -L$(DEPS_PATH)/lib $(DEPS_PATH)/lib/libculibos.a -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections +endif # the additional compile flags you want to add ADD_CFLAGS += -I$(DEPS_PATH)/include -ffunction-sections -fdata-sections @@ -66,7 +70,11 @@ USE_CUDA = 1 # add the path to CUDA library to link and compile flag # if you have already add them to environment variable, leave it as NONE # USE_CUDA_PATH = /usr/local/cuda +ifdef USE_SYSTEM_CUDA +USE_CUDA_PATH = /usr/local/cuda-10.2 +else 
USE_CUDA_PATH = $(DEPS_PATH)/usr/local/cuda-10.2 +endif # whether to use CuDNN library USE_CUDNN = 1 diff --git a/make/staticbuild/linux_cu75.mk b/make/staticbuild/linux_cu75.mk deleted file mode 100644 index e263794600df..000000000000 --- a/make/staticbuild/linux_cu75.mk +++ /dev/null @@ -1,167 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------------------------- -# Template configuration for compiling mxnet for making python wheel -#------------------------------------------------------------------------------- - -#--------------------- -# choice of compiler -#-------------------- - -export CC = gcc -export CXX = g++ -export NVCC = nvcc - -# whether compile with options for MXNet developer -DEV = 0 - -# whether compile with debug -DEBUG = 0 - -# whether to turn on signal handler (e.g. segfault logger) -USE_SIGNAL_HANDLER = 1 - -# the additional link flags you want to add -ADD_LDFLAGS += -L$(DEPS_PATH)/lib $(DEPS_PATH)/lib/libculibos.a -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections - -# the additional compile flags you want to add -ADD_CFLAGS += -I$(DEPS_PATH)/include -ffunction-sections -fdata-sections - -#--------------------------------------------- -# matrix computation libraries for CPU/GPU -#--------------------------------------------- - -# choose the version of blas you want to use -# can be: mkl, blas, atlas, openblas -# in default use atlas for linux while apple for osx -USE_BLAS=openblas - -# whether use opencv during compilation -# you can disable it, however, you will not able to use -# imbin iterator -USE_OPENCV = 1 -# Add OpenCV include path, in which the directory `opencv2` exists -USE_OPENCV_INC_PATH = NONE -# Add OpenCV shared library path, in which the shared library exists -USE_OPENCV_LIB_PATH = NONE - -# whether use CUDA during compile -USE_CUDA = 1 - -# add the path to CUDA library to link and compile flag -# if you have already add them to environment variable, leave it as NONE -# USE_CUDA_PATH = /usr/local/cuda -USE_CUDA_PATH = $(DEPS_PATH)/usr/local/cuda-7.5 - -# whether use CuDNN R3 library -USE_CUDNN = 1 - -# CUDA architecture setting: going with all of them. -# For CUDA < 6.0, comment the *_50 lines for compatibility. -# CUDA_ARCH := - -# whether use cuda runtime compiling for writing kernels in native language (i.e. 
Python) -ENABLE_CUDA_RTC = 1 - -# use openmp for parallelization -USE_OPENMP = 1 -USE_OPERATOR_TUNING = 1 -USE_LIBJPEG_TURBO = 1 - -# whether use MKL-DNN library -USE_MKLDNN = 1 - -# whether use NNPACK library -USE_NNPACK = 0 - -# whether use lapack during compilation -# only effective when compiled with blas versions openblas/apple/atlas/mkl -USE_LAPACK = 1 - -# path to lapack library in case of a non-standard installation -USE_LAPACK_PATH = $(DEPS_PATH)/lib - -# add path to intel library, you may need it for MKL, if you did not add the path -# to environment variable -USE_INTEL_PATH = NONE - -# If use MKL, choose static link automatically to allow python wrapper -ifeq ($(USE_BLAS), mkl) -USE_STATIC_MKL = 1 -else -USE_STATIC_MKL = NONE -endif - -#---------------------------- -# Settings for power and arm arch -#---------------------------- -ARCH := $(shell uname -a) -ifneq (,$(filter $(ARCH), armv6l armv7l powerpc64le ppc64le aarch64)) - USE_SSE=0 -else - USE_SSE=1 -endif - -#---------------------------- -# distributed computing -#---------------------------- - -# whether or not to enable multi-machine supporting -USE_DIST_KVSTORE = 1 - -# whether or not allow to read and write HDFS directly. If yes, then hadoop is -# required -USE_HDFS = 0 - -# path to libjvm.so. required if USE_HDFS=1 -LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server - -# whether or not allow to read and write AWS S3 directly. If yes, then -# libcurl4-openssl-dev is required, it can be installed on Ubuntu by -# sudo apt-get install -y libcurl4-openssl-dev -USE_S3 = 1 - -#---------------------------- -# additional operators -#---------------------------- - -# path to folders containing projects specific operators that you don't want to put in src/operators -EXTRA_OPERATORS = - - -#---------------------------- -# plugins -#---------------------------- - -# whether to use caffe integration. This requires installing caffe. -# You also need to add CAFFE_PATH/build/lib to your LD_LIBRARY_PATH -# CAFFE_PATH = $(HOME)/caffe -# MXNET_PLUGINS += plugin/caffe/caffe.mk - -# whether to use torch integration. This requires installing torch. -# You also need to add TORCH_PATH/install/lib to your LD_LIBRARY_PATH -# TORCH_PATH = $(HOME)/torch -# MXNET_PLUGINS += plugin/torch/torch.mk - -# WARPCTC_PATH = $(HOME)/warp-ctc -# MXNET_PLUGINS += plugin/warpctc/warpctc.mk - -# whether to use sframe integration. This requires build sframe -# git@github.com:dato-code/SFrame.git -# SFRAME_PATH = $(HOME)/SFrame -# MXNET_PLUGINS += plugin/sframe/plugin.mk diff --git a/make/staticbuild/linux_cu80.mk b/make/staticbuild/linux_cu80.mk deleted file mode 100644 index a42220d3d467..000000000000 --- a/make/staticbuild/linux_cu80.mk +++ /dev/null @@ -1,170 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -#------------------------------------------------------------------------------- -# Template configuration for compiling mxnet for making python wheel -#------------------------------------------------------------------------------- - -#--------------------- -# choice of compiler -#-------------------- - -export CC = gcc -export CXX = g++ -export NVCC = nvcc - -# whether compile with options for MXNet developer -DEV = 0 - -# whether compile with debug -DEBUG = 0 - -# whether to turn on signal handler (e.g. segfault logger) -USE_SIGNAL_HANDLER = 1 - -# the additional link flags you want to add -ADD_LDFLAGS += -L$(DEPS_PATH)/lib $(DEPS_PATH)/lib/libculibos.a -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections - -# the additional compile flags you want to add -ADD_CFLAGS += -I$(DEPS_PATH)/include -ffunction-sections -fdata-sections - -#--------------------------------------------- -# matrix computation libraries for CPU/GPU -#--------------------------------------------- - -# choose the version of blas you want to use -# can be: mkl, blas, atlas, openblas -# in default use atlas for linux while apple for osx -USE_BLAS=openblas - -# whether use opencv during compilation -# you can disable it, however, you will not able to use -# imbin iterator -USE_OPENCV = 1 -# Add OpenCV include path, in which the directory `opencv2` exists -USE_OPENCV_INC_PATH = NONE -# Add OpenCV shared library path, in which the shared library exists -USE_OPENCV_LIB_PATH = NONE - -# whether use CUDA during compile -USE_CUDA = 1 - -# add the path to CUDA library to link and compile flag -# if you have already add them to environment variable, leave it as NONE -# USE_CUDA_PATH = /usr/local/cuda -USE_CUDA_PATH = $(DEPS_PATH)/usr/local/cuda-8.0 - -# whether to use CuDNN library -USE_CUDNN = 1 - -# whether to use NCCL library -USE_NCCL = 1 - -# CUDA architecture setting: going with all of them. -# For CUDA < 6.0, comment the *_50 lines for compatibility. -# CUDA_ARCH := - -# whether use cuda runtime compiling for writing kernels in native language (i.e. Python) -ENABLE_CUDA_RTC = 1 - -# use openmp for parallelization -USE_OPENMP = 1 -USE_OPERATOR_TUNING = 1 -USE_LIBJPEG_TURBO = 1 - -# whether use MKL-DNN library -USE_MKLDNN = 1 - -# whether use NNPACK library -USE_NNPACK = 0 - -# whether use lapack during compilation -# only effective when compiled with blas versions openblas/apple/atlas/mkl -USE_LAPACK = 1 - -# path to lapack library in case of a non-standard installation -USE_LAPACK_PATH = $(DEPS_PATH)/lib - -# add path to intel library, you may need it for MKL, if you did not add the path -# to environment variable -USE_INTEL_PATH = NONE - -# If use MKL, choose static link automatically to allow python wrapper -ifeq ($(USE_BLAS), mkl) -USE_STATIC_MKL = 1 -else -USE_STATIC_MKL = NONE -endif - -#---------------------------- -# Settings for power and arm arch -#---------------------------- -ARCH := $(shell uname -a) -ifneq (,$(filter $(ARCH), armv6l armv7l powerpc64le ppc64le aarch64)) - USE_SSE=0 -else - USE_SSE=1 -endif - -#---------------------------- -# distributed computing -#---------------------------- - -# whether or not to enable multi-machine supporting -USE_DIST_KVSTORE = 1 - -# whether or not allow to read and write HDFS directly. If yes, then hadoop is -# required -USE_HDFS = 0 - -# path to libjvm.so. required if USE_HDFS=1 -LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server - -# whether or not allow to read and write AWS S3 directly. 
If yes, then -# libcurl4-openssl-dev is required, it can be installed on Ubuntu by -# sudo apt-get install -y libcurl4-openssl-dev -USE_S3 = 1 - -#---------------------------- -# additional operators -#---------------------------- - -# path to folders containing projects specific operators that you don't want to put in src/operators -EXTRA_OPERATORS = - - -#---------------------------- -# plugins -#---------------------------- - -# whether to use caffe integration. This requires installing caffe. -# You also need to add CAFFE_PATH/build/lib to your LD_LIBRARY_PATH -# CAFFE_PATH = $(HOME)/caffe -# MXNET_PLUGINS += plugin/caffe/caffe.mk - -# whether to use torch integration. This requires installing torch. -# You also need to add TORCH_PATH/install/lib to your LD_LIBRARY_PATH -# TORCH_PATH = $(HOME)/torch -# MXNET_PLUGINS += plugin/torch/torch.mk - -# WARPCTC_PATH = $(HOME)/warp-ctc -# MXNET_PLUGINS += plugin/warpctc/warpctc.mk - -# whether to use sframe integration. This requires build sframe -# git@github.com:dato-code/SFrame.git -# SFRAME_PATH = $(HOME)/SFrame -# MXNET_PLUGINS += plugin/sframe/plugin.mk diff --git a/make/staticbuild/linux_cu90.mk b/make/staticbuild/linux_cu90.mk index c46c10f6358b..1d0669ef82b6 100644 --- a/make/staticbuild/linux_cu90.mk +++ b/make/staticbuild/linux_cu90.mk @@ -37,7 +37,11 @@ DEBUG = 0 USE_SIGNAL_HANDLER = 1 # the additional link flags you want to add +ifdef USE_SYSTEM_CUDA +ADD_LDFLAGS += -L$(DEPS_PATH)/lib -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections +else ADD_LDFLAGS += -L$(DEPS_PATH)/lib $(DEPS_PATH)/lib/libculibos.a -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections +endif # the additional compile flags you want to add ADD_CFLAGS += -I$(DEPS_PATH)/include -ffunction-sections -fdata-sections @@ -66,7 +70,11 @@ USE_CUDA = 1 # add the path to CUDA library to link and compile flag # if you have already add them to environment variable, leave it as NONE # USE_CUDA_PATH = /usr/local/cuda +ifdef USE_SYSTEM_CUDA +USE_CUDA_PATH = /usr/local/cuda-9.0 +else USE_CUDA_PATH = $(DEPS_PATH)/usr/local/cuda-9.0 +endif # whether to use CuDNN library USE_CUDNN = 1 diff --git a/make/staticbuild/linux_cu91.mk b/make/staticbuild/linux_cu91.mk index b2a33d7e36c8..89b35b10f6fa 100644 --- a/make/staticbuild/linux_cu91.mk +++ b/make/staticbuild/linux_cu91.mk @@ -37,7 +37,11 @@ DEBUG = 0 USE_SIGNAL_HANDLER = 1 # the additional link flags you want to add +ifdef USE_SYSTEM_CUDA +ADD_LDFLAGS += -L$(DEPS_PATH)/lib -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections +else ADD_LDFLAGS += -L$(DEPS_PATH)/lib $(DEPS_PATH)/lib/libculibos.a -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections +endif # the additional compile flags you want to add ADD_CFLAGS += -I$(DEPS_PATH)/include -ffunction-sections -fdata-sections @@ -66,7 +70,11 @@ USE_CUDA = 1 # add the path to CUDA library to link and compile flag # if you have already add them to environment variable, leave it as NONE # USE_CUDA_PATH = /usr/local/cuda +ifdef USE_SYSTEM_CUDA +USE_CUDA_PATH = /usr/local/cuda-9.1 +else USE_CUDA_PATH = $(DEPS_PATH)/usr/local/cuda-9.1 +endif # whether to use CuDNN library USE_CUDNN = 1 diff --git a/make/staticbuild/linux_cu92.mk b/make/staticbuild/linux_cu92.mk index 
bbaa4bfcd772..2cbbdd25eeaf 100644 --- a/make/staticbuild/linux_cu92.mk +++ b/make/staticbuild/linux_cu92.mk @@ -37,7 +37,11 @@ DEBUG = 0 USE_SIGNAL_HANDLER = 1 # the additional link flags you want to add +ifdef USE_SYSTEM_CUDA +ADD_LDFLAGS += -L$(DEPS_PATH)/lib -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections +else ADD_LDFLAGS += -L$(DEPS_PATH)/lib $(DEPS_PATH)/lib/libculibos.a -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections +endif # the additional compile flags you want to add ADD_CFLAGS += -I$(DEPS_PATH)/include -ffunction-sections -fdata-sections @@ -66,7 +70,11 @@ USE_CUDA = 1 # add the path to CUDA library to link and compile flag # if you have already add them to environment variable, leave it as NONE # USE_CUDA_PATH = /usr/local/cuda +ifdef USE_SYSTEM_CUDA +USE_CUDA_PATH = /usr/local/cuda-9.2 +else USE_CUDA_PATH = $(DEPS_PATH)/usr/local/cuda-9.2 +endif # whether to use CuDNN library USE_CUDNN = 1 diff --git a/perl-package/AI-MXNet/t/test_init.t b/perl-package/AI-MXNet/t/test_init.t index bf811f8584b7..c697e99bce0f 100644 --- a/perl-package/AI-MXNet/t/test_init.t +++ b/perl-package/AI-MXNet/t/test_init.t @@ -17,7 +17,8 @@ use strict; use warnings; -use Test::More tests => 7; +# use Test::More tests => 7; https://github.com/apache/incubator-mxnet/issues/17988 +use Test::More tests => 4; use AI::MXNet qw(mx); sub test_default_init @@ -71,7 +72,7 @@ sub test_rsp_const_init $check_rsp_const_init->(mx->initializer->One(), 1); } -test_rsp_const_init(); +# test_rsp_const_init(); https://github.com/apache/incubator-mxnet/issues/17988 test_default_init(); test_variable_init(); test_aux_init(); diff --git a/python/setup.py b/python/setup.py index dcd84cef1ea1..ccfccb3f3f74 100644 --- a/python/setup.py +++ b/python/setup.py @@ -94,7 +94,7 @@ def config_cython(): libraries=libraries, extra_link_args=extra_link_args, language="c++")) - + path = "mxnet/_ffi/_cython" for fn in os.listdir(path): if not fn.endswith(".pyx"): @@ -105,7 +105,7 @@ def config_cython(): include_dirs=["../include/", "../3rdparty/tvm/nnvm/include"], library_dirs=library_dirs, libraries=libraries, - extra_compile_args=["-std=c++11"], + extra_compile_args=["-std=c++17"], extra_link_args=extra_link_args, language="c++")) diff --git a/src/c_api/c_api_executor.cc b/src/c_api/c_api_executor.cc index afc64f73de7c..41193b5966ef 100644 --- a/src/c_api/c_api_executor.cc +++ b/src/c_api/c_api_executor.cc @@ -809,7 +809,7 @@ int _SimpleBindImpl(SymbolHandle symbol_handle, ret->ret_vec_charp.push_back(ret->ret_vec_str.back().c_str()); } *shared_buffer_len = shared_buffer_map.size(); - *updated_shared_buffer_handle_list = &(ret->ret_handles[nd_idx]); + *updated_shared_buffer_handle_list = &(ret->ret_handles.at(nd_idx)); *updated_shared_buffer_name_list = &(ret->ret_vec_charp[0]); } diff --git a/src/operator/contrib/dgl_graph.cc b/src/operator/contrib/dgl_graph.cc index 428899791a5d..89bee8abf655 100644 --- a/src/operator/contrib/dgl_graph.cc +++ b/src/operator/contrib/dgl_graph.cc @@ -24,6 +24,9 @@ #include #include #include +#include +#include + #include "../elemwise_op_common.h" #include "../../imperative/imperative_utils.h" #include "../subgraph_op_common.h" @@ -41,7 +44,9 @@ typedef int64_t dgl_id_t; */ class ArrayHeap { public: - explicit ArrayHeap(const std::vector& prob) { + explicit ArrayHeap(const std::vector& prob, unsigned int seed) { + generator_ = 
std::mt19937(seed); + distribution_ = std::uniform_real_distribution(0.0, 1.0); vec_size_ = prob.size(); bit_len_ = ceil(log2(vec_size_)); limit_ = 1 << bit_len_; @@ -86,8 +91,8 @@ class ArrayHeap { /* * Sample from arrayHeap */ - size_t Sample(unsigned int* seed) { - float xi = heap_[1] * (rand_r(seed)%100/101.0); + size_t Sample() { + float xi = heap_[1] * distribution_(generator_); int i = 1; while (i < limit_) { i = i << 1; @@ -102,10 +107,10 @@ class ArrayHeap { /* * Sample a vector by given the size n */ - void SampleWithoutReplacement(size_t n, std::vector* samples, unsigned int* seed) { + void SampleWithoutReplacement(size_t n, std::vector* samples) { // sample n elements for (size_t i = 0; i < n; ++i) { - samples->at(i) = this->Sample(seed); + samples->at(i) = this->Sample(); this->Delete(samples->at(i)); } } @@ -115,6 +120,8 @@ class ArrayHeap { int bit_len_; // bit size int limit_; std::vector heap_; + std::mt19937 generator_; + std::uniform_real_distribution distribution_; }; struct NeighborSampleParam : public dmlc::Parameter { @@ -402,10 +409,12 @@ static bool CSRNeighborNonUniformSampleType(const nnvm::NodeAttrs& attrs, static void RandomSample(size_t set_size, size_t num, std::vector* out, - unsigned int* seed) { + unsigned int seed) { + std::mt19937 generator(seed); std::unordered_set sampled_idxs; + std::uniform_int_distribution distribution(0, set_size - 1); while (sampled_idxs.size() < num) { - sampled_idxs.insert(rand_r(seed) % set_size); + sampled_idxs.insert(distribution(generator)); } out->clear(); for (auto it = sampled_idxs.begin(); it != sampled_idxs.end(); it++) { @@ -441,7 +450,7 @@ static void GetUniformSample(const dgl_id_t* val_list, const size_t max_num_neighbor, std::vector* out_ver, std::vector* out_edge, - unsigned int* seed) { + unsigned int seed) { // Copy ver_list to output if (ver_len <= max_num_neighbor) { for (size_t i = 0; i < ver_len; ++i) { @@ -485,7 +494,7 @@ static void GetNonUniformSample(const float* probability, const size_t max_num_neighbor, std::vector* out_ver, std::vector* out_edge, - unsigned int* seed) { + unsigned int seed) { // Copy ver_list to output if (ver_len <= max_num_neighbor) { for (size_t i = 0; i < ver_len; ++i) { @@ -500,8 +509,8 @@ static void GetNonUniformSample(const float* probability, for (size_t i = 0; i < ver_len; ++i) { sp_prob[i] = probability[col_list[i]]; } - ArrayHeap arrayHeap(sp_prob); - arrayHeap.SampleWithoutReplacement(max_num_neighbor, &sp_index, seed); + ArrayHeap arrayHeap(sp_prob, seed); + arrayHeap.SampleWithoutReplacement(max_num_neighbor, &sp_index); out_ver->resize(max_num_neighbor); out_edge->resize(max_num_neighbor); for (size_t i = 0; i < max_num_neighbor; ++i) { @@ -536,8 +545,8 @@ static void SampleSubgraph(const NDArray &csr, const float* probability, int num_hops, size_t num_neighbor, - size_t max_num_vertices) { - unsigned int time_seed = time(nullptr); + size_t max_num_vertices, + unsigned int random_seed) { size_t num_seeds = seed_arr.shape().Size(); CHECK_GE(max_num_vertices, num_seeds); @@ -594,7 +603,7 @@ static void SampleSubgraph(const NDArray &csr, num_neighbor, &tmp_sampled_src_list, &tmp_sampled_edge_list, - &time_seed); + random_seed); } else { // non-uniform-sample GetNonUniformSample(probability, val_list + *(indptr + dst_id), @@ -603,7 +612,7 @@ static void SampleSubgraph(const NDArray &csr, num_neighbor, &tmp_sampled_src_list, &tmp_sampled_edge_list, - &time_seed); + random_seed); } CHECK_EQ(tmp_sampled_src_list.size(), tmp_sampled_edge_list.size()); size_t pos = 
neighbor_list.size(); @@ -720,12 +729,15 @@ static void CSRNeighborUniformSampleComputeExCPU(const nnvm::NodeAttrs& attrs, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - const NeighborSampleParam& params = - nnvm::get(attrs.parsed); + const NeighborSampleParam& params = nnvm::get(attrs.parsed); int num_subgraphs = inputs.size() - 1; CHECK_EQ(outputs.size(), 3 * num_subgraphs); + mshadow::Stream *s = ctx.get_stream(); + mshadow::Random *prnd = ctx.requested[0].get_random(s); + unsigned int seed = prnd->GetRandInt(); + #pragma omp parallel for for (int i = 0; i < num_subgraphs; i++) { SampleSubgraph(inputs[0], // graph_csr @@ -737,7 +749,12 @@ static void CSRNeighborUniformSampleComputeExCPU(const nnvm::NodeAttrs& attrs, nullptr, // probability params.num_hops, params.num_neighbor, - params.max_num_vertices); + params.max_num_vertices, +#if defined(_OPENMP) + seed + omp_get_thread_num()); +#else + seed); +#endif } } @@ -798,6 +815,9 @@ of max_num_vertices, and the valid number of vertices is the same as the ones in .set_attr("FInferShape", CSRNeighborUniformSampleShape) .set_attr("FInferType", CSRNeighborUniformSampleType) .set_attr("FComputeEx", CSRNeighborUniformSampleComputeExCPU) +.set_attr("FResourceRequest", [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kRandom}; +}) .add_argument("csr_matrix", "NDArray-or-Symbol", "csr matrix") .add_argument("seed_arrays", "NDArray-or-Symbol[]", "seed vertices") .set_attr("key_var_num_args", "num_args") @@ -811,14 +831,17 @@ static void CSRNeighborNonUniformSampleComputeExCPU(const nnvm::NodeAttrs& attrs const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - const NeighborSampleParam& params = - nnvm::get(attrs.parsed); + const NeighborSampleParam& params = nnvm::get(attrs.parsed); int num_subgraphs = inputs.size() - 2; CHECK_EQ(outputs.size(), 4 * num_subgraphs); const float* probability = inputs[1].data().dptr(); + mshadow::Stream *s = ctx.get_stream(); + mshadow::Random *prnd = ctx.requested[0].get_random(s); + unsigned int seed = prnd->GetRandInt(); + #pragma omp parallel for for (int i = 0; i < num_subgraphs; i++) { float* sub_prob = outputs[i+2*num_subgraphs].data().dptr(); @@ -831,7 +854,12 @@ static void CSRNeighborNonUniformSampleComputeExCPU(const nnvm::NodeAttrs& attrs probability, params.num_hops, params.num_neighbor, - params.max_num_vertices); + params.max_num_vertices, +#if defined(_OPENMP) + seed + omp_get_thread_num()); +#else + seed); +#endif } } @@ -897,6 +925,9 @@ of max_num_vertices, and the valid number of vertices is the same as the ones in .set_attr("FInferShape", CSRNeighborNonUniformSampleShape) .set_attr("FInferType", CSRNeighborNonUniformSampleType) .set_attr("FComputeEx", CSRNeighborNonUniformSampleComputeExCPU) +.set_attr("FResourceRequest", [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kRandom}; +}) .add_argument("csr_matrix", "NDArray-or-Symbol", "csr matrix") .add_argument("probability", "NDArray-or-Symbol", "probability vector") .add_argument("seed_arrays", "NDArray-or-Symbol[]", "seed vertices") diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 7f86c056cf13..00887240aa56 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -601,7 +601,7 @@ CUfunction FusedOp::CompileCode(const std::string &code, std::string gpu_arch_arg = "--gpu-architecture=compute_" + std::to_string(sm_arch); const char *opts[] = {gpu_arch_arg.c_str(), - "--std=c++11"}; + 
"--std=c++14"}; const std::string kernel_name_demangled = "FusedKernel_" + kernel_name; NVRTC_CALL(nvrtcAddNameExpression(program, (kernel_name_demangled).c_str())); diff --git a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc index 1cf9e2269b60..6e8a1505e15e 100644 --- a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc +++ b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc @@ -299,7 +299,9 @@ void MKLDNNFCBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, MKLDNNStream::Get()->RegisterPrimArgs( mkldnn::inner_product_backward_weights(ipBwdWeights_pd), args); CommitOutput(in_grad[fullc::kWeight], in_grad_weight); - CommitOutput(in_grad[fullc::kBias], in_grad_bias); + if (!param.no_bias) { + CommitOutput(in_grad[fullc::kBias], in_grad_bias); + } } if (req[fullc::kData]) { mkldnn::inner_product_backward_data::primitive_desc ipBwdData_pd = GetFCBwdData( diff --git a/src/operator/nn/mkldnn/mkldnn_rnn.cc b/src/operator/nn/mkldnn/mkldnn_rnn.cc index c830080cee6d..c33ad484ddda 100644 --- a/src/operator/nn/mkldnn/mkldnn_rnn.cc +++ b/src/operator/nn/mkldnn/mkldnn_rnn.cc @@ -995,7 +995,7 @@ void MKLDNNRnnOp::Forward(const OpContext &ctx, const std::vector &inputs, const std::vector &req, const std::vector &outputs) { - TmpMemMgr::Get()->Init(ctx.requested[0]); + TmpMemMgr::Get()->Init(ctx.requested[1]); // In the `autograd.record()` context, RNNOp is required to run into // forward_training mode. const bool is_training = (ctx.is_train || ctx.need_grad); @@ -1132,7 +1132,7 @@ void MKLDNNRnnOp::Backward(const OpContext& ctx, const std::vector& req, const std::vector& outputs) { using tag = mkldnn::memory::format_tag; - TmpMemMgr::Get()->Init(ctx.requested[0]); + TmpMemMgr::Get()->Init(ctx.requested[1]); const RNNParam& default_param = full_param_.default_param; const int data_dtype = inputs[rnn_enum::kData].dtype(); const int w_dtype = inputs[rnn_enum::kParams].dtype(); diff --git a/src/operator/nn/softmax-inl.h b/src/operator/nn/softmax-inl.h index f8a3fe429c53..1b5e9921a62c 100644 --- a/src/operator/nn/softmax-inl.h +++ b/src/operator/nn/softmax-inl.h @@ -713,7 +713,7 @@ static inline bool SoftmaxGradOpType(const nnvm::NodeAttrs& attrs, } return (*out_attrs)[0] != -1 && (*in_attrs)[0] != -1 && - (*out_attrs)[1] != -1 && (*in_attrs)[1] != -1; + (!softmax_use_length(attrs) || ((*out_attrs)[1] != -1 && (*in_attrs)[1] != -1)); } else { CHECK_EQ(in_attrs->size(), 2U); int out_dtype = (*in_attrs)[1]; diff --git a/src/operator/numpy/np_boolean_mask_assign.cc b/src/operator/numpy/np_boolean_mask_assign.cc index e01ebb7c6c24..d5ab00835638 100644 --- a/src/operator/numpy/np_boolean_mask_assign.cc +++ b/src/operator/numpy/np_boolean_mask_assign.cc @@ -220,7 +220,7 @@ void NumpyBooleanAssignForwardCPU(const nnvm::NodeAttrs& attrs, // If there's no True in mask, return directly if (valid_num == 0) return; - const TShape& vshape = inputs[2].shape_; + const TShape& vshape = inputs.at(2).shape_; if (inputs.size() == 3U) { // tensor case diff --git a/src/operator/random/shuffle_op.cc b/src/operator/random/shuffle_op.cc index 3f94cca530c3..c81d90689d58 100644 --- a/src/operator/random/shuffle_op.cc +++ b/src/operator/random/shuffle_op.cc @@ -23,7 +23,7 @@ * \brief Operator to shuffle elements of an NDArray */ #if ((__GNUC__ > 4 && !defined(__clang__major__)) || (__clang_major__ > 4 && __linux__)) && \ - defined(_OPENMP) + defined(_OPENMP) && !defined(__ANDROID__) #define USE_GNU_PARALLEL_SHUFFLE #endif diff --git a/src/operator/rnn-inl.h 
b/src/operator/rnn-inl.h index 63f30b17d1b9..24c9985e7346 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -293,23 +294,24 @@ void RNNForwardTraining(DType* ws, DType* hy_ptr, DType* cy_ptr, const float dropout, - int mode) { + int mode, + std::mt19937 &rnd_engine) { // NOLINT(runtime/references) switch (mode) { case rnn_enum::kLstm: LstmForwardTraining(ws, rs, state_outputs, num_layers, direction, seq_length, batch_size, input_size, state_size, x_ptr, hx_ptr, cx_ptr, - w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, dropout); + w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr, dropout, rnd_engine); break; case rnn_enum::kGru: GruForwardTraining(ws, rs, state_outputs, num_layers, direction, seq_length, batch_size, input_size, state_size, x_ptr, hx_ptr, - w_ptr, y_ptr, hy_ptr, dropout); + w_ptr, y_ptr, hy_ptr, dropout, rnd_engine); break; case rnn_enum::kRnnTanh: case rnn_enum::kRnnRelu: VanillaRNNForwardTraining(ws, rs, state_outputs, num_layers, direction, seq_length, batch_size, input_size, state_size, x_ptr, hx_ptr, - w_ptr, y_ptr, hy_ptr, dropout, mode); + w_ptr, y_ptr, hy_ptr, dropout, mode, rnd_engine); break; default: LOG(FATAL) << "unknown RNN mode " << mode; @@ -842,7 +844,8 @@ class RNNOp { } #endif // MXNET_USE_CUDNN == 1 && defined(__CUDACC__) - if (ctx_.dev_type == kCPU) { +#if !defined(__CUDACC__) // cuda doesn't support C++17 + if constexpr (std::is_same::value) { int projection_size = 0; if (param_.projection_size.has_value()) { projection_size = param_.projection_size.value(); @@ -860,6 +863,9 @@ class RNNOp { DType* work_cpu_space = static_cast(temp_cpu_space_.data().dptr_); if (ctx.is_train || ctx.need_grad) { + mshadow::Random *prnd = ctx.requested[0].get_random(s); + std::mt19937 &rnd_engine = prnd->GetRndEngine(); + // allocate reserve space if (param_.projection_size.has_value()) { LOG(FATAL) << "No training support for LSTM with projection on CPU currently."; @@ -894,7 +900,8 @@ class RNNOp { hy_ptr, cy_ptr, param_.p, - param_.mode); + param_.mode, + rnd_engine); } else { RNNForwardInference(work_cpu_space, param_.state_outputs, @@ -916,6 +923,7 @@ class RNNOp { param_.mode); } } +#endif } void Backward(const OpContext &ctx, diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index ecd38a88736d..7ff8a2f993bc 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -184,6 +184,7 @@ static std::vector RNNResourceEx(const NodeAttrs& attrs, const } #endif } else { + request.emplace_back(ResourceRequest::kRandom); #if MXNET_USE_MKLDNN == 1 request.emplace_back(ResourceRequest::kTempSpace); #endif diff --git a/src/operator/rnn_impl.h b/src/operator/rnn_impl.h index 5acf4eb7b3bd..779ac8839d6c 100644 --- a/src/operator/rnn_impl.h +++ b/src/operator/rnn_impl.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -139,7 +140,8 @@ void LstmForwardTraining(DType* ws, DType* y_ptr, DType* hy_ptr, DType* cy_ptr, - const float dropout) { + const float dropout, + std::mt19937 &rnd_engine) { // NOLINT(runtime/references) DType* dropout_random = rs; DType* rs2 = dropout_random + (L - 1) * D * T * N * H; const int total_layers = D * L; @@ -149,7 +151,6 @@ void LstmForwardTraining(DType* ws, const index_t r_size = D * T * N * H * 6; const index_t y_offset = T * N * H * 5; const index_t cell_size = N * H; - unsigned int seed_ = 17 + rand() % 4096; // NOLINT(runtime/threadsafe_fn) int idx = 0; // state & cell state's idx; const int omp_threads = 
mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); for (int i = 0; i < L; ++i) { @@ -174,10 +175,9 @@ void LstmForwardTraining(DType* ws, w_ptr += w_size; b_ptr += b_size; if (dropout > 0.0f) { - #pragma omp parallel for num_threads(omp_threads) + std::uniform_real_distribution distribution(0, 1); for (index_t j = 0; j < T * N * H * D; j++) { - int rand_data = rand_r(&seed_); - if (static_cast(rand_data % 1000) < static_cast(1000 * dropout)) { + if (distribution(rnd_engine) < dropout) { dropout_random[i * T * N * H * D + j] = 0; y.dptr_[j] = 0; } else { @@ -1000,7 +1000,8 @@ void GruForwardTraining(DType* ws, DType* w_ptr, DType* y_ptr, DType* hy_ptr, - const float dropout) { + const float dropout, + std::mt19937 &rnd_engine) { // NOLINT(runtime/references) DType* wx = w_ptr; DType* wh = wx + I * H * 3; DType* bx = wh + H * H * 3 + (D - 1) * (H * H * 3 + I * H * 3) @@ -1021,18 +1022,15 @@ void GruForwardTraining(DType* ws, DType* bx_l = bx; DType* bh_l = bh; DType* y_tmp = x_ptr; - unsigned int seed_ = 17 + rand() % 4096; // NOLINT(runtime/threadsafe_fn) for (int l = 0; l < L; l++) { if (l != 0) { y_tmp = y_l; y_l = y_l + T * N * H * D; } if (dropout > 0.0f && l > 0) { - const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); - #pragma omp parallel for num_threads(omp_threads) + std::uniform_real_distribution distribution(0, 1); for (index_t i = 0; i < T * N * I; i++) { - int rand_data = rand_r(&seed_); - if (static_cast(rand_data % 1000) < static_cast(1000 * dropout)) { + if (distribution(rnd_engine) < dropout) { dropout_random[(l - 1) * T * N * I + i] = 0; y_tmp[i] = 0; } else { @@ -1889,7 +1887,8 @@ void VanillaRNNForwardTraining(DType* ws, DType* y_ptr, DType* hy_ptr, const float dropout, - int mode) { + int mode, + std::mt19937 &rnd_engine) { // NOLINT(runtime/references) DType* wx = w_ptr; DType* wh = wx + I * H; DType* bx = wh + H * H + (D - 1) * (H * H + I * H) @@ -1908,17 +1907,15 @@ void VanillaRNNForwardTraining(DType* ws, DType* bh_l = bh; DType* y_tmp = x_ptr; const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); - unsigned int seed_ = 17 + rand() % 4096; // NOLINT(runtime/threadsafe_fn) for (int l = 0; l < L; l++) { if (l != 0) { y_tmp = y_l; y_l = y_l + T * N * H * D; } if (dropout > 0.0f && l > 0) { - #pragma omp parallel for num_threads(omp_threads) + std::uniform_real_distribution distribution(0, 1); for (index_t i = 0; i < T * N * I; i++) { - int rand_data = rand_r(&seed_); - if (static_cast(rand_data % 1000) < static_cast(1000 * dropout)) { + if (distribution(rnd_engine) < dropout) { dropout_random[(l - 1) * T * N * I + i] = 0; y_tmp[i] = 0; } else { diff --git a/tests/cpp/engine/threaded_engine_test.cc b/tests/cpp/engine/threaded_engine_test.cc index cea92a01e799..e1e3a53e656c 100644 --- a/tests/cpp/engine/threaded_engine_test.cc +++ b/tests/cpp/engine/threaded_engine_test.cc @@ -35,6 +35,7 @@ #include #include #include +#include #include "../src/engine/engine_impl.h" #include "../include/test_util.h" @@ -62,15 +63,18 @@ void GenerateWorkload(int num_workloads, int num_var, std::vector* workloads) { workloads->clear(); workloads->resize(num_workloads); + static thread_local std::mt19937 generator; + std::uniform_int_distribution distribution_var(0, num_var - 1); + std::uniform_int_distribution distribution_time(min_time, max_time - 1); + std::uniform_int_distribution distribution_read(min_read, max_read - 1); for (int i = 0; i < num_workloads; ++i) { auto& wl = workloads->at(i); - wl.write = 
rand_r(&seed_) % num_var; - int r = rand_r(&seed_); - int num_read = min_read + (r % (max_read - min_read)); + wl.write = distribution_var(generator); + int num_read = distribution_read(generator); for (int j = 0; j < num_read; ++j) { - wl.reads.push_back(rand_r(&seed_) % num_var); + wl.reads.push_back(distribution_var(generator)); } - wl.time = min_time + rand_r(&seed_) % (max_time - min_time); + wl.time = distribution_time(generator); } } diff --git a/tests/cpp/thread_safety/thread_safety_test.cc b/tests/cpp/thread_safety/thread_safety_test.cc index 1f811d8c3fd7..9566adfd9d13 100644 --- a/tests/cpp/thread_safety/thread_safety_test.cc +++ b/tests/cpp/thread_safety/thread_safety_test.cc @@ -25,15 +25,17 @@ #if MXNET_USE_CPP_PACKAGE == 1 #include #include -#include #include -#include +#include #include #include +#include +#include #include "../src/engine/engine_impl.h" #include "../src/imperative/imperative_utils.h" #include "../include/test_util.h" #include "mxnet-cpp/MxNetCpp.h" + /* * Prepares input data for the ops/models used in this file */ @@ -298,8 +300,10 @@ void run_inference(const std::string& model, unsigned next = num; for (size_t i = 0; i < num_inf_per_thread; ++i) { if (random_sleep) { - int sleep_time = rand_r(&next) % 5; - std::this_thread::sleep_for(std::chrono::seconds(sleep_time)); + static thread_local std::mt19937 generator; + std::uniform_int_distribution distribution(0, 5); + int sleep_time = distribution(generator); + std::this_thread::sleep_for(std::chrono::seconds(sleep_time)); } int num_output = 0; const int *stypes; @@ -479,7 +483,9 @@ void run_inference_unsupported(const std::string& model, unsigned next = num; for (size_t i = 0; i < num_inf_per_thread; ++i) { if (random_sleep) { - int sleep_time = rand_r(&next) % 5; + static thread_local std::mt19937 generator; + std::uniform_int_distribution distribution(0, 5); + int sleep_time = distribution(generator); std::this_thread::sleep_for(std::chrono::seconds(sleep_time)); } int num_output = 0; diff --git a/tests/cpp/unittest.mk b/tests/cpp/unittest.mk index 2c674b9ec9d7..8534db91b52a 100644 --- a/tests/cpp/unittest.mk +++ b/tests/cpp/unittest.mk @@ -42,38 +42,38 @@ endif .PHONY: runtest testclean gtest-all.o : $(GTEST_SRCS_) - $(CXX) -std=c++11 $(CPPFLAGS) -I$(GTEST_INC) -I$(GTEST_DIR) $(CXXFLAGS) -c $(GTEST_DIR)/src/gtest-all.cc + $(CXX) -std=c++17 $(CPPFLAGS) -I$(GTEST_INC) -I$(GTEST_DIR) $(CXXFLAGS) -c $(GTEST_DIR)/src/gtest-all.cc gtest.a : gtest-all.o $(AR) $(ARFLAGS) $@ $^ build/tests/cpp/%.o : tests/cpp/%.cc | mkldnn @mkdir -p $(@D) - $(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/$* $< > build/tests/cpp/$*.d - $(CXX) -c -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/$*.o $(filter %.cc %.a, $^) + $(CXX) -std=c++17 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/$* $< > build/tests/cpp/$*.d + $(CXX) -c -std=c++17 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/$*.o $(filter %.cc %.a, $^) build/tests/cpp/operator/%.o : tests/cpp/operator/%.cc | mkldnn @mkdir -p $(@D) - $(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/operator/$* $< > build/tests/cpp/operator/$*.d - $(CXX) -c -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/operator/$*.o $(filter %.cc %.a, $^) + $(CXX) -std=c++17 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/operator/$* $< > build/tests/cpp/operator/$*.d + $(CXX) -c -std=c++17 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/operator/$*.o $(filter %.cc %.a, $^) build/tests/cpp/storage/%.o : tests/cpp/storage/%.cc | mkldnn @mkdir -p 
$(@D) - $(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/storage/$* $< > build/tests/cpp/storage/$*.d - $(CXX) -c -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/storage/$*.o $(filter %.cc %.a, $^) + $(CXX) -std=c++17 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/storage/$* $< > build/tests/cpp/storage/$*.d + $(CXX) -c -std=c++17 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/storage/$*.o $(filter %.cc %.a, $^) build/tests/cpp/engine/%.o : tests/cpp/engine/%.cc | mkldnn @mkdir -p $(@D) - $(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/engine/$* $< > build/tests/cpp/engine/$*.d - $(CXX) -c -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/engine/$*.o $(filter %.cc %.a, $^) + $(CXX) -std=c++17 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/engine/$* $< > build/tests/cpp/engine/$*.d + $(CXX) -c -std=c++17 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/engine/$*.o $(filter %.cc %.a, $^) build/tests/cpp/thread_safety/%.o : tests/cpp/thread_safety/%.cc | mkldnn @mkdir -p $(@D) - $(CXX) -std=c++11 $(TEST_CFLAGS) $(TEST_CPPFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/thread_safety/$* $< > build/tests/cpp/thread_safety/$*.d - $(CXX) -c -std=c++11 $(TEST_CFLAGS) $(TEST_CPPFLAGS) -I$(GTEST_INC) -o build/tests/cpp/thread_safety/$*.o $(filter %.cc %.a, $^) + $(CXX) -std=c++17 $(TEST_CFLAGS) $(TEST_CPPFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/thread_safety/$* $< > build/tests/cpp/thread_safety/$*.d + $(CXX) -c -std=c++17 $(TEST_CFLAGS) $(TEST_CPPFLAGS) -I$(GTEST_INC) -o build/tests/cpp/thread_safety/$*.o $(filter %.cc %.a, $^) $(TEST): $(TEST_OBJ) lib/libmxnet.so $(TEST_LIB_DEP) - $(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -o $@ $^ $(TEST_LDFLAGS) + $(CXX) -std=c++17 $(TEST_CFLAGS) -I$(GTEST_INC) -o $@ $^ $(TEST_LDFLAGS) runtest: $(TEST) LD_LIBRARY_PATH=$(shell pwd)/lib:$(LD_LIBRARY_PATH) $(TEST) diff --git a/tests/jenkins/run_test_pip_installations.sh b/tests/jenkins/run_test_pip_installations.sh index 44788bfaf772..f2b4b245be5c 100755 --- a/tests/jenkins/run_test_pip_installations.sh +++ b/tests/jenkins/run_test_pip_installations.sh @@ -29,8 +29,8 @@ fi WORKSPACE=$( echo "$1" | tr '[:upper:]' '[:lower:]' ) -PYTHON_VERSIONS=('2.7' '3.4' '3.6' '3.5') -DEVICES=('pip_cu75' 'pip_cu80' 'pip_cpu') +PYTHON_VERSIONS=('3.5' '3.6') +DEVICES=('pip_cu92' 'pip_cu101' 'pip_cpu') CI_BUILD_DIR=tests/ci_build/pip_tests # build Docker images and test pip installation for each device @@ -61,10 +61,10 @@ for DEV in "${DEVICES[@]}"; do DOCKER_CMD="virtualenv -p \"/usr/bin/${PYTHON}\" ${PYTHON}; source \"${PYTHON}/bin/activate\"; cd ${WORKSPACE};" if [[ "${DEV}" == *"cpu"* ]]; then DOCKER_CMD="${DOCKER_CMD} pip install mxnet --pre; python tests/python/train/test_conv.py" - elif [[ "${DEV}" == *"cu75"* ]]; then - DOCKER_CMD="${DOCKER_CMD} pip install mxnet-cu75 --pre; python tests/python/train/test_conv.py --gpu" - elif [[ "${DEV}" == *"cu80"* ]]; then - DOCKER_CMD="${DOCKER_CMD} pip install mxnet-cu80 --pre; python tests/python/train/test_conv.py --gpu" + elif [[ "${DEV}" == *"cu92"* ]]; then + DOCKER_CMD="${DOCKER_CMD} pip install mxnet-cu92 --pre -f pip install --pre mxnet -f https://dist.mxnet.io/python/cu92; python tests/python/train/test_conv.py --gpu" + elif [[ "${DEV}" == *"cu101"* ]]; then + DOCKER_CMD="${DOCKER_CMD} pip install mxnet-cu101 --pre -f https://dist.mxnet.io/python/cu101; python tests/python/train/test_conv.py --gpu" fi ${DOCKER_BINARY} run --rm -v ${WORKSPACE}:${WORKSPACE} -w ${WORKSPACE} ${DOCKER_TAG} bash -c "tests/jenkins/run_as_user.sh 
`id -u` `id -un` `id -g` `id -un` '${DOCKER_CMD}'" done diff --git a/tests/python/quantization/test_quantization.py b/tests/python/quantization/test_quantization.py index 8c6100d50765..4e42a5dfcf60 100644 --- a/tests/python/quantization/test_quantization.py +++ b/tests/python/quantization/test_quantization.py @@ -353,6 +353,10 @@ def check_quantized_elemwise_mul(data_shape, qtype): if is_test_for_native_cpu(): print('skipped testing quantized_elemwise_mul for native cpu since it is not supported yet') return + if is_test_for_mkldnn(): + print('skipped testing quantized_elemwise_mul for mkldnn due to ' + 'https://github.com/apache/incubator-mxnet/issues/18034') + return elif qtype != 'int8': print('skipped testing quantized_elemwise_mul for not supported data type') return diff --git a/tests/python/unittest/test_init.py b/tests/python/unittest/test_init.py index 6d8830c1d089..290f84b18781 100644 --- a/tests/python/unittest/test_init.py +++ b/tests/python/unittest/test_init.py @@ -15,9 +15,12 @@ # specific language governing permissions and limitations # under the License. +import json +import unittest + import mxnet as mx import numpy as np -import json + def test_default_init(): data = mx.sym.Variable('data') @@ -45,6 +48,7 @@ def test_aux_init(): assert (mod.get_params()[1]['bn_moving_var'].asnumpy() == 1).all() assert (mod.get_params()[1]['bn_moving_mean'].asnumpy() == 0).all() +@unittest.skip("rsp const init is broken: https://github.com/apache/incubator-mxnet/issues/17988") def test_rsp_const_init(): def check_rsp_const_init(init, val): shape = (10, 10) diff --git a/tests/python/unittest/test_numpy_ndarray.py b/tests/python/unittest/test_numpy_ndarray.py index 3ce53c6a6e80..0f57947b6c1f 100644 --- a/tests/python/unittest/test_numpy_ndarray.py +++ b/tests/python/unittest/test_numpy_ndarray.py @@ -1125,6 +1125,7 @@ def test_np_multinomial(): @unittest.skipUnless(is_op_runnable(), "Comparison ops can only run on either CPU instances, or GPU instances with" " compute capability >= 53 if MXNet is built with USE_TVM_OP=ON") @use_np +@unittest.skip("NumpyBooleanAssignForwardCPU broken: https://github.com/apache/incubator-mxnet/issues/17990") def test_np_ndarray_boolean_indexing(): def test_single_bool_index(): # adapted from numpy's test_indexing.py diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 111f0282283e..802bfb78bb01 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -1366,6 +1366,7 @@ def hybrid_forward(self, F, a): @with_seed() @use_np +@unittest.skip("NumpyBooleanAssignForwardCPU broken: https://github.com/apache/incubator-mxnet/issues/17990") def test_npx_batch_dot(): ctx = mx.context.current_context() dtypes = ['float32', 'float64'] @@ -1485,6 +1486,7 @@ def gt_grad_batch_dot_numpy(lhs, rhs, ograd, transpose_a, transpose_b, lhs_req, @with_seed() @use_np +@unittest.skip("NumpyBooleanAssignForwardCPU broken: https://github.com/apache/incubator-mxnet/issues/17990") def test_npi_boolean_assign(): class TestBooleanAssignScalar(HybridBlock): def __init__(self, val, start_axis): diff --git a/tools/dependencies/README.md b/tools/dependencies/README.md index ec1e80088895..c45f33328bbc 100644 --- a/tools/dependencies/README.md +++ b/tools/dependencies/README.md @@ -228,7 +228,6 @@ Please run performance test aginast the MXNet you build before raising the PR. 
- [ ] Python/setup.py - [ ] tools/pip/setup.py - [ ] ci/docker/install/requirements -- [ ] ci/docker/install/ubuntu_publish.sh - [ ] ci/docker/install/ubuntu_python.sh - [ ] ci/qemu/mxnet_requirements.txt - [ ] docs/install/requirements.txt diff --git a/tools/dependencies/make_shared_dependencies.sh b/tools/dependencies/make_shared_dependencies.sh index 9c86c11024d5..96d3561d446b 100755 --- a/tools/dependencies/make_shared_dependencies.sh +++ b/tools/dependencies/make_shared_dependencies.sh @@ -65,4 +65,5 @@ source $DIR/cityhash.sh source $DIR/zmq.sh source $DIR/lz4.sh -export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:$(dirname $(find $DEPS_PATH -type f -name 'libprotoc*' | grep protobuf | head -n 1)):$DEPS_PATH/lib +export LIBRARY_PATH=${LIBRARY_PATH}:$(dirname $(find $DEPS_PATH -type f -name 'libprotoc*' | grep protobuf | head -n 1)):$DEPS_PATH/lib:$DEPS_PATH/lib64 +export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:$(dirname $(find $DEPS_PATH -type f -name 'libprotoc*' | grep protobuf | head -n 1)):$DEPS_PATH/lib:$DEPS_PATH/lib64 diff --git a/tools/dependencies/zmq.sh b/tools/dependencies/zmq.sh index 11d7063200b5..33ea628d53bb 100755 --- a/tools/dependencies/zmq.sh +++ b/tools/dependencies/zmq.sh @@ -37,5 +37,11 @@ if [[ ! -f $DEPS_PATH/lib/libzmq.a ]]; then -D BUILD_SHARED_LIBS=OFF .. $MAKE $MAKE install + + if [[ ! -f $DEPS_PATH/lib/libzmq.a ]]; then + mkdir -p $DEPS_PATH/lib + cp $DEPS_PATH/lib64/*zmq* $DEPS_PATH/lib + fi + popd fi diff --git a/tools/pip/doc/CPU_ADDITIONAL.md b/tools/pip/doc/CPU_ADDITIONAL.md index 07a95d5e451b..34c21268ea2d 100644 --- a/tools/pip/doc/CPU_ADDITIONAL.md +++ b/tools/pip/doc/CPU_ADDITIONAL.md @@ -18,13 +18,12 @@ Prerequisites ------------- This package supports Linux, Mac OSX, and Windows platforms. You may also want to check: -- [mxnet-cu102](https://pypi.python.org/pypi/mxnet-cu101/) with CUDA-10.2 support. +- [mxnet-cu102](https://pypi.python.org/pypi/mxnet-cu102/) with CUDA-10.2 support. - [mxnet-cu101](https://pypi.python.org/pypi/mxnet-cu101/) with CUDA-10.1 support. - [mxnet-cu100](https://pypi.python.org/pypi/mxnet-cu100/) with CUDA-10.0 support. - [mxnet-cu92](https://pypi.python.org/pypi/mxnet-cu92/) with CUDA-9.2 support. - [mxnet-cu90](https://pypi.python.org/pypi/mxnet-cu90/) with CUDA-9.0 support. -- [mxnet-cu80](https://pypi.python.org/pypi/mxnet-cu80/) with CUDA-8.0 support. -- [mxnet-cu75](https://pypi.python.org/pypi/mxnet-cu75/) with CUDA-7.5 support. +- [mxnet](https://pypi.python.org/pypi/mxnet/). - [mxnet-native](https://pypi.python.org/pypi/mxnet-native/) CPU variant without MKLDNN. To install for other platforms (e.g. Windows, Raspberry Pi/ARM) or other versions, check [Installing MXNet](https://mxnet.apache.org/versions/master/install/index.html) for instructions on building from source. diff --git a/tools/pip/doc/CU100_ADDITIONAL.md b/tools/pip/doc/CU100_ADDITIONAL.md index 2e607d766ed8..1a33feb1607f 100644 --- a/tools/pip/doc/CU100_ADDITIONAL.md +++ b/tools/pip/doc/CU100_ADDITIONAL.md @@ -18,12 +18,13 @@ Prerequisites ------------- This package supports Linux and Windows platforms. You may also want to check: -- [mxnet-cu102](https://pypi.python.org/pypi/mxnet-cu101/) with CUDA-10.2 support. +- [mxnet-cu102](https://pypi.python.org/pypi/mxnet-cu102/) with CUDA-10.2 support. +- [mxnet-cu101](https://pypi.python.org/pypi/mxnet-cu101/) with CUDA-10.1 support. +- [mxnet-cu100](https://pypi.python.org/pypi/mxnet-cu100/) with CUDA-10.0 support. - [mxnet-cu92](https://pypi.python.org/pypi/mxnet-cu92/) with CUDA-9.2 support. 
- [mxnet-cu90](https://pypi.python.org/pypi/mxnet-cu90/) with CUDA-9.0 support. -- [mxnet-cu80](https://pypi.python.org/pypi/mxnet-cu80/) with CUDA-8.0 support. -- [mxnet-cu75](https://pypi.python.org/pypi/mxnet-cu75/) with CUDA-7.5 support. - [mxnet](https://pypi.python.org/pypi/mxnet/). +- [mxnet-native](https://pypi.python.org/pypi/mxnet-native/) CPU variant without MKLDNN. To download CUDA, check [CUDA download](https://developer.nvidia.com/cuda-downloads). For more instructions, check [CUDA Toolkit online documentation](http://docs.nvidia.com/cuda/index.html). diff --git a/tools/pip/doc/CU101_ADDITIONAL.md b/tools/pip/doc/CU101_ADDITIONAL.md index 278c39942141..75b35dbd3de6 100644 --- a/tools/pip/doc/CU101_ADDITIONAL.md +++ b/tools/pip/doc/CU101_ADDITIONAL.md @@ -18,12 +18,13 @@ Prerequisites ------------- This package supports Linux and Windows platforms. You may also want to check: -- [mxnet-cu102](https://pypi.python.org/pypi/mxnet-cu101/) with CUDA-10.2 support. +- [mxnet-cu102](https://pypi.python.org/pypi/mxnet-cu102/) with CUDA-10.2 support. +- [mxnet-cu101](https://pypi.python.org/pypi/mxnet-cu101/) with CUDA-10.1 support. +- [mxnet-cu100](https://pypi.python.org/pypi/mxnet-cu100/) with CUDA-10.0 support. - [mxnet-cu92](https://pypi.python.org/pypi/mxnet-cu92/) with CUDA-9.2 support. - [mxnet-cu90](https://pypi.python.org/pypi/mxnet-cu90/) with CUDA-9.0 support. -- [mxnet-cu80](https://pypi.python.org/pypi/mxnet-cu80/) with CUDA-8.0 support. -- [mxnet-cu75](https://pypi.python.org/pypi/mxnet-cu75/) with CUDA-7.5 support. - [mxnet](https://pypi.python.org/pypi/mxnet/). +- [mxnet-native](https://pypi.python.org/pypi/mxnet-native/) CPU variant without MKLDNN. To download CUDA, check [CUDA download](https://developer.nvidia.com/cuda-downloads). For more instructions, check [CUDA Toolkit online documentation](http://docs.nvidia.com/cuda/index.html). diff --git a/tools/pip/doc/CU102_ADDITIONAL.md b/tools/pip/doc/CU102_ADDITIONAL.md index 81829690da29..5a8c87a6f5d7 100644 --- a/tools/pip/doc/CU102_ADDITIONAL.md +++ b/tools/pip/doc/CU102_ADDITIONAL.md @@ -18,11 +18,13 @@ Prerequisites ------------- This package supports Linux and Windows platforms. You may also want to check: +- [mxnet-cu102](https://pypi.python.org/pypi/mxnet-cu102/) with CUDA-10.2 support. +- [mxnet-cu101](https://pypi.python.org/pypi/mxnet-cu101/) with CUDA-10.1 support. +- [mxnet-cu100](https://pypi.python.org/pypi/mxnet-cu100/) with CUDA-10.0 support. - [mxnet-cu92](https://pypi.python.org/pypi/mxnet-cu92/) with CUDA-9.2 support. - [mxnet-cu90](https://pypi.python.org/pypi/mxnet-cu90/) with CUDA-9.0 support. -- [mxnet-cu80](https://pypi.python.org/pypi/mxnet-cu80/) with CUDA-8.0 support. -- [mxnet-cu75](https://pypi.python.org/pypi/mxnet-cu75/) with CUDA-7.5 support. - [mxnet](https://pypi.python.org/pypi/mxnet/). +- [mxnet-native](https://pypi.python.org/pypi/mxnet-native/) CPU variant without MKLDNN. To download CUDA, check [CUDA download](https://developer.nvidia.com/cuda-downloads). For more instructions, check [CUDA Toolkit online documentation](http://docs.nvidia.com/cuda/index.html). diff --git a/tools/pip/doc/CU75_ADDITIONAL.md b/tools/pip/doc/CU75_ADDITIONAL.md deleted file mode 100644 index ae382f96ba35..000000000000 --- a/tools/pip/doc/CU75_ADDITIONAL.md +++ /dev/null @@ -1,38 +0,0 @@ - - - - - - - - - - - - - - - - - -**CUDA 7.5 package for MXNet is no longer maintained for new releases.** - -Prerequisites -------------- -This package supports Linux only, up to 1.2.1. 
You may also want to check: -- [mxnet-cu102](https://pypi.python.org/pypi/mxnet-cu101/) with CUDA-10.2 support. -- [mxnet-cu92](https://pypi.python.org/pypi/mxnet-cu92/) with CUDA-9.2 support. -- [mxnet-cu90](https://pypi.python.org/pypi/mxnet-cu90/) with CUDA-9.0 support. -- [mxnet-cu80](https://pypi.python.org/pypi/mxnet-cu80/) with CUDA-8.0 support. -- [mxnet](https://pypi.python.org/pypi/mxnet/). - -To download CUDA, check [CUDA download](https://developer.nvidia.com/cuda-downloads). For more instructions, check [CUDA Toolkit online documentation](http://docs.nvidia.com/cuda/index.html). - -To install for other platforms (e.g. Windows, Raspberry Pi/ARM) or other versions, check [Installing MXNet](https://mxnet.apache.org/versions/master/install/index.html) for instructions on building from source. - -Installation ------------- -To install: -```bash -pip install mxnet-cu75 -``` diff --git a/tools/pip/doc/CU80_ADDITIONAL.md b/tools/pip/doc/CU80_ADDITIONAL.md deleted file mode 100644 index 5ce06d764e42..000000000000 --- a/tools/pip/doc/CU80_ADDITIONAL.md +++ /dev/null @@ -1,38 +0,0 @@ - - - - - - - - - - - - - - - - - -**CUDA 8.0 package for MXNet is no longer maintained for new releases.** - -Prerequisites -------------- -This package supports Linux and Windows platforms. You may also want to check: -- [mxnet-cu102](https://pypi.python.org/pypi/mxnet-cu101/) with CUDA-10.2 support. -- [mxnet-cu92](https://pypi.python.org/pypi/mxnet-cu92/) with CUDA-9.2 support. -- [mxnet-cu90](https://pypi.python.org/pypi/mxnet-cu90/) with CUDA-9.0 support. -- [mxnet-cu75](https://pypi.python.org/pypi/mxnet-cu75/) with CUDA-7.5 support. -- [mxnet](https://pypi.python.org/pypi/mxnet/). - -To download CUDA, check [CUDA download](https://developer.nvidia.com/cuda-downloads). For more instructions, check [CUDA Toolkit online documentation](http://docs.nvidia.com/cuda/index.html). - -To install for other platforms (e.g. Raspberry Pi/ARM) or other versions of CUDA, check [Installing MXNet](https://mxnet.apache.org/versions/master/install/index.html) for instructions on building from source. - -Installation ------------- -To install: -```bash -pip install mxnet-cu80 -``` diff --git a/tools/pip/doc/CU90_ADDITIONAL.md b/tools/pip/doc/CU90_ADDITIONAL.md index 770914b5a1a0..3f51c50520f2 100644 --- a/tools/pip/doc/CU90_ADDITIONAL.md +++ b/tools/pip/doc/CU90_ADDITIONAL.md @@ -21,10 +21,12 @@ Prerequisites ------------- This package supports Linux and Windows platforms. You may also want to check: - [mxnet-cu102](https://pypi.python.org/pypi/mxnet-cu102/) with CUDA-10.2 support. +- [mxnet-cu101](https://pypi.python.org/pypi/mxnet-cu101/) with CUDA-10.1 support. +- [mxnet-cu100](https://pypi.python.org/pypi/mxnet-cu100/) with CUDA-10.0 support. - [mxnet-cu92](https://pypi.python.org/pypi/mxnet-cu92/) with CUDA-9.2 support. -- [mxnet-cu80](https://pypi.python.org/pypi/mxnet-cu80/) with CUDA-8.0 support. -- [mxnet-cu75](https://pypi.python.org/pypi/mxnet-cu75/) with CUDA-7.5 support. +- [mxnet-cu90](https://pypi.python.org/pypi/mxnet-cu90/) with CUDA-9.0 support. - [mxnet](https://pypi.python.org/pypi/mxnet/). +- [mxnet-native](https://pypi.python.org/pypi/mxnet-native/) CPU variant without MKLDNN. To download CUDA, check [CUDA download](https://developer.nvidia.com/cuda-downloads). For more instructions, check [CUDA Toolkit online documentation](http://docs.nvidia.com/cuda/index.html). 
diff --git a/tools/pip/doc/CU92_ADDITIONAL.md b/tools/pip/doc/CU92_ADDITIONAL.md index 7aec9a1aeb67..0b87c76974c3 100644 --- a/tools/pip/doc/CU92_ADDITIONAL.md +++ b/tools/pip/doc/CU92_ADDITIONAL.md @@ -19,10 +19,12 @@ Prerequisites ------------- This package supports Linux and Windows platforms. You may also want to check: - [mxnet-cu102](https://pypi.python.org/pypi/mxnet-cu102/) with CUDA-10.2 support. +- [mxnet-cu101](https://pypi.python.org/pypi/mxnet-cu101/) with CUDA-10.1 support. +- [mxnet-cu100](https://pypi.python.org/pypi/mxnet-cu100/) with CUDA-10.0 support. +- [mxnet-cu92](https://pypi.python.org/pypi/mxnet-cu92/) with CUDA-9.2 support. - [mxnet-cu90](https://pypi.python.org/pypi/mxnet-cu90/) with CUDA-9.0 support. -- [mxnet-cu80](https://pypi.python.org/pypi/mxnet-cu80/) with CUDA-8.0 support. -- [mxnet-cu75](https://pypi.python.org/pypi/mxnet-cu75/) with CUDA-7.5 support. - [mxnet](https://pypi.python.org/pypi/mxnet/). +- [mxnet-native](https://pypi.python.org/pypi/mxnet-native/) CPU variant without MKLDNN. To download CUDA, check [CUDA download](https://developer.nvidia.com/cuda-downloads). For more instructions, check [CUDA Toolkit online documentation](http://docs.nvidia.com/cuda/index.html). diff --git a/tools/pip/doc/NATIVE_ADDITIONAL.md b/tools/pip/doc/NATIVE_ADDITIONAL.md index 902464c7ab6e..f73a1f22ac89 100644 --- a/tools/pip/doc/NATIVE_ADDITIONAL.md +++ b/tools/pip/doc/NATIVE_ADDITIONAL.md @@ -18,9 +18,13 @@ Prerequisites ------------- This package supports Linux and Windows platforms. You may also want to check: -- [mxnet-cu102](https://pypi.python.org/pypi/mxnet-cu101/) with CUDA-10.2 support. +- [mxnet-cu102](https://pypi.python.org/pypi/mxnet-cu102/) with CUDA-10.2 support. +- [mxnet-cu101](https://pypi.python.org/pypi/mxnet-cu101/) with CUDA-10.1 support. +- [mxnet-cu100](https://pypi.python.org/pypi/mxnet-cu100/) with CUDA-10.0 support. - [mxnet-cu92](https://pypi.python.org/pypi/mxnet-cu92/) with CUDA-9.2 support. -- [mxnet](https://pypi.python.org/pypi/mxnet/) CPU build with MKLDNN. +- [mxnet-cu90](https://pypi.python.org/pypi/mxnet-cu90/) with CUDA-9.0 support. +- [mxnet](https://pypi.python.org/pypi/mxnet/). +- [mxnet-native](https://pypi.python.org/pypi/mxnet-native/) CPU variant without MKLDNN. To download CUDA, check [CUDA download](https://developer.nvidia.com/cuda-downloads). For more instructions, check [CUDA Toolkit online documentation](http://docs.nvidia.com/cuda/index.html). diff --git a/tools/pip/doc/PYPI_README.md b/tools/pip/doc/PYPI_README.md index d323a5545f22..c39d6d6fb5b3 100644 --- a/tools/pip/doc/PYPI_README.md +++ b/tools/pip/doc/PYPI_README.md @@ -17,7 +17,7 @@ Apache MXNet (Incubating) Python Package ======================================== -[Apache MXNet](http://beta.mxnet.io) is a deep learning framework designed for both *efficiency* and *flexibility*. +[Apache MXNet](https://mxnet.apache.org/) is a deep learning framework designed for both *efficiency* and *flexibility*. It allows you to mix the flavours of deep learning programs together to maximize the efficiency and your productivity. For feature requests on the PyPI package, suggestions, and issue reports, create an issue by clicking [here](https://github.com/apache/incubator-mxnet/issues/new). 
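The variant pages above differ only in which CUDA toolkit and MKLDNN build they bundle. A quick way to confirm what an installed wheel was actually compiled with is the runtime feature list (available in recent MXNet releases); a minimal sketch:

```python
import mxnet as mx
from mxnet.runtime import Features

feat = Features()
print(mx.__version__)
print(feat.is_enabled('CUDA'))    # True only for the mxnet-cuXXX wheels
print(feat.is_enabled('MKLDNN'))  # False for mxnet-native, True for the default mxnet wheel
```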
diff --git a/tools/pip/setup.py b/tools/pip/setup.py index d01051713f2f..ce49f3c1ce7a 100644 --- a/tools/pip/setup.py +++ b/tools/pip/setup.py @@ -145,12 +145,9 @@ def skip_markdown_comments(md): libraries.append('CUDA-9.1') elif variant.startswith('CU90'): libraries.append('CUDA-9.0') - elif variant.startswith('CU80'): - libraries.append('CUDA-8.0') - elif variant.startswith('CU75'): - libraries.append('CUDA-7.5') - if variant.endswith('MKL'): - libraries.append('MKLDNN') + +if variant != 'native': + libraries.append('MKLDNN') short_description += ' This version uses {0}.'.format(' and '.join(libraries)) diff --git a/tools/setup_gpu_build_tools.sh b/tools/setup_gpu_build_tools.sh index bba37108b98b..6c5f655f8df9 100755 --- a/tools/setup_gpu_build_tools.sh +++ b/tools/setup_gpu_build_tools.sh @@ -18,7 +18,7 @@ # under the License. # This script installs the tools and libraries for CUDA GPU on Ubuntu. -# Usage: VARIANT=cu92mkl; DEPS_PATH=$HOME; setup_gpu_build_tools.sh $VARIANT $DEPS_PATH; +# Usage: VARIANT=cu102mkl; DEPS_PATH=$HOME; setup_gpu_build_tools.sh $VARIANT $DEPS_PATH; # It installs the tools into DEPS_PATH as specified by the second argument, and will set # the following environment variables: # PATH, CPLUS_INCLUDE_PATH, C_INCLUDE_PATH, LIBRARY_PATH, LD_LIBRARY_PATH, NVCC @@ -63,18 +63,6 @@ elif [[ $VARIANT == cu90* ]]; then LIBCUDA_VERSION='384.145-0ubuntu1' LIBCUDNN_VERSION='7.6.5.32-1+cuda9.0' LIBNCCL_VERSION='2.5.6-1+cuda9.0' -elif [[ $VARIANT == cu80* ]]; then - CUDA_VERSION='8.0.61-1' - CUDA_PATCH_VERSION='8.0.61.2-1' - LIBCUDA_VERSION='375.88-0ubuntu1' - LIBCUDNN_VERSION='7.2.1.38-1+cuda8.0' - LIBNCCL_VERSION='2.3.4-1+cuda8.0' -elif [[ $VARIANT == cu75* ]]; then - CUDA_VERSION='7.5-18' - CUDA_PATCH_VERSION='7.5-18' - LIBCUDA_VERSION='375.88-0ubuntu1' - LIBCUDNN_VERSION='6.0.21-1+cuda7.5' - LIBNCCL_VERSION='' fi if [[ $VARIANT == cu* ]]; then CUDA_MAJOR_VERSION=$(echo $CUDA_VERSION | tr '-' '.' | cut -d. 
-f1,2) @@ -246,51 +234,6 @@ elif [[ $VARIANT == cu90* ]]; then "libcudnn${LIBCUDNN_MAJOR}-dev_${LIBCUDNN_VERSION}_amd64.deb" \ "libnccl-dev_${LIBNCCL_VERSION}_amd64.deb" \ ) -elif [[ $VARIANT == cu80* ]]; then - cuda_files=( \ - "cuda-core-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \ - "cuda-cublas-${CUDA_MAJOR_DASH}_${CUDA_PATCH_VERSION}_amd64.deb" \ - "cuda-cublas-dev-${CUDA_MAJOR_DASH}_${CUDA_PATCH_VERSION}_amd64.deb" \ - "cuda-cudart-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \ - "cuda-cudart-dev-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \ - "cuda-curand-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \ - "cuda-curand-dev-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \ - "cuda-cufft-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \ - "cuda-cufft-dev-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \ - "cuda-nvrtc-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \ - "cuda-nvrtc-dev-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \ - "cuda-cusolver-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \ - "cuda-cusolver-dev-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \ - "cuda-misc-headers-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \ - "libcuda1-${LIBCUDA_MAJOR}_${LIBCUDA_VERSION}_amd64.deb" \ - "nvidia-${LIBCUDA_MAJOR}_${LIBCUDA_VERSION}_amd64.deb" \ - ) - ml_files=( \ - "libcudnn${LIBCUDNN_MAJOR}-dev_${LIBCUDNN_VERSION}_amd64.deb" \ - "libnccl-dev_${LIBNCCL_VERSION}_amd64.deb" \ - ) -elif [[ $VARIANT == cu75* ]]; then - cuda_files=( \ - "cuda-core-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \ - "cuda-cublas-${CUDA_MAJOR_DASH}_${CUDA_PATCH_VERSION}_amd64.deb" \ - "cuda-cublas-dev-${CUDA_MAJOR_DASH}_${CUDA_PATCH_VERSION}_amd64.deb" \ - "cuda-cudart-${CUDA_MAJOR_DASH}_${CUDA_PATCH_VERSION}_amd64.deb" \ - "cuda-cudart-dev-${CUDA_MAJOR_DASH}_${CUDA_PATCH_VERSION}_amd64.deb" \ - "cuda-curand-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \ - "cuda-curand-dev-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \ - "cuda-cufft-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \ - "cuda-cufft-dev-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \ - "cuda-nvrtc-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \ - "cuda-nvrtc-dev-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \ - "cuda-cusolver-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \ - "cuda-cusolver-dev-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \ - "cuda-misc-headers-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \ - "libcuda1-${LIBCUDA_MAJOR}_${LIBCUDA_VERSION}_amd64.deb" \ - "nvidia-${LIBCUDA_MAJOR}_${LIBCUDA_VERSION}_amd64.deb" \ - ) - ml_files=( \ - "libcudnn${LIBCUDNN_MAJOR}-dev_${LIBCUDNN_VERSION}_amd64.deb" \ - ) fi diff --git a/tools/staticbuild/build.sh b/tools/staticbuild/build.sh index f33ce9d711bc..e5fd24368ed3 100755 --- a/tools/staticbuild/build.sh +++ b/tools/staticbuild/build.sh @@ -54,7 +54,7 @@ export FC="gfortran" export PKG_CONFIG_PATH=$DEPS_PATH/lib/pkgconfig:$DEPS_PATH/lib64/pkgconfig:$DEPS_PATH/lib/x86_64-linux-gnu/pkgconfig:$PKG_CONFIG_PATH export CPATH=$DEPS_PATH/include:$CPATH -if [[ $PLATFORM == 'linux' && $VARIANT == cu* ]]; then +if [[ -z "$USE_SYSTEM_CUDA" && $PLATFORM == 'linux' && $VARIANT == cu* ]]; then source tools/setup_gpu_build_tools.sh $VARIANT $DEPS_PATH fi diff --git a/tools/staticbuild/build_lib.sh b/tools/staticbuild/build_lib.sh index 6cceced6f27a..989070ac7078 100755 --- a/tools/staticbuild/build_lib.sh +++ b/tools/staticbuild/build_lib.sh @@ -40,14 +40,8 @@ $MAKE DEPS_PATH=$DEPS_PATH mkldnn $MAKE DEPS_PATH=$DEPS_PATH if [[ $PLATFORM == 'linux' ]]; then - if [[ -f 
/usr/lib/gcc/x86_64-linux-gnu/4.8/libgfortran.so ]]; then - cp -L /usr/lib/gcc/x86_64-linux-gnu/4.8/libgfortran.so lib/libgfortran.so.3 - elif [[ -f /usr/lib/x86_64-linux-gnu/libgfortran.so.3 ]]; then - cp -L /usr/lib/x86_64-linux-gnu/libgfortran.so.3 lib/libgfortran.so.3 - else - cp -L /usr/lib/x86_64-linux-gnu/libgfortran.so.4 lib/libgfortran.so.4 - fi - cp -L /usr/lib/x86_64-linux-gnu/libquadmath.so.0 lib/libquadmath.so.0 + cp -L $(ldd lib/libmxnet.so | grep libgfortran | awk '{print $3}') lib/ + cp -L $(ldd lib/libmxnet.so | grep libquadmath | awk '{print $3}') lib/ fi # Print the linked objects on libmxnet.so diff --git a/tools/staticbuild/build_lib_cmake.sh b/tools/staticbuild/build_lib_cmake.sh index 6a4bbec7afcf..5261b2a6942a 100755 --- a/tools/staticbuild/build_lib_cmake.sh +++ b/tools/staticbuild/build_lib_cmake.sh @@ -39,14 +39,8 @@ rm -rf lib; mkdir lib; if [[ $PLATFORM == 'linux' ]]; then cp -L build/libmxnet.so lib/libmxnet.so cp -L staticdeps/lib/libopenblas.so lib/libopenblas.so.0 - if [[ -f /usr/lib/gcc/x86_64-linux-gnu/4.8/libgfortran.so ]]; then - cp -L /usr/lib/gcc/x86_64-linux-gnu/4.8/libgfortran.so lib/libgfortran.so.3 - elif [[ -f /usr/lib/x86_64-linux-gnu/libgfortran.so.3 ]]; then - cp -L /usr/lib/x86_64-linux-gnu/libgfortran.so.3 lib/libgfortran.so.3 - else - cp -L /usr/lib/x86_64-linux-gnu/libgfortran.so.4 lib/libgfortran.so.4 - fi - cp -L /usr/lib/x86_64-linux-gnu/libquadmath.so.0 lib/libquadmath.so.0 + cp -L $(ldd lib/libmxnet.so | grep libgfortran | awk '{print $3}') lib/ + cp -L $(ldd lib/libmxnet.so | grep libquadmath | awk '{print $3}') lib/ elif [[ $PLATFORM == 'darwin' ]]; then cp -L build/libmxnet.dylib lib/libmxnet.dylib fi From e796ae9684df5eb94202d7ab012772b3d9442a4f Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Tue, 14 Apr 2020 13:41:40 -0700 Subject: [PATCH 05/14] Integrate Horovod training API as part of MXNet native distributed training API (#17531) * implement pushpull for horovod * add local_rank function * add tests * Remove in-place broadcast API * Add kvstore horovod example * Fix the list to singlton conversion * Add horood test to CI * Remove test horovod from unit test * Add docstring * Add horovod in test * sync with master * Fix horovod dependency in CI * Fix merge conflict with byteps * Update __init__.py * Resolve conflict * Remove openib warning message * Add log message in test * Remove tmp file * Fix lint Co-authored-by: Haibin Lin --- ci/docker/runtime_functions.sh | 5 +- .../cifar10_kvstore_hvd.py | 237 ++++++++++++++++++ python/mxnet/gluon/trainer.py | 1 + python/mxnet/kvstore/__init__.py | 1 + python/mxnet/kvstore/horovod.py | 161 ++++++++++++ python/mxnet/kvstore/kvstore.py | 3 + .../dist_device_sync_kvstore_horovod.py | 80 ++++++ .../nightly/test_distributed_training-gpu.sh | 11 +- tools/launch.py | 63 ++--- 9 files changed, 530 insertions(+), 32 deletions(-) create mode 100644 example/distributed_training/cifar10_kvstore_hvd.py create mode 100644 python/mxnet/kvstore/horovod.py create mode 100644 tests/nightly/dist_device_sync_kvstore_horovod.py diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index dc119bb10256..5ea21bcdd1dd 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -1386,7 +1386,10 @@ integrationtest_ubuntu_gpu_scala() { integrationtest_ubuntu_gpu_dist_kvstore() { set -ex pushd . - cd tests/nightly + cd /work/mxnet/python + pip3 install -e . 
+ pip3 install --no-cache-dir horovod + cd /work/mxnet/tests/nightly ./test_distributed_training-gpu.sh popd } diff --git a/example/distributed_training/cifar10_kvstore_hvd.py b/example/distributed_training/cifar10_kvstore_hvd.py new file mode 100644 index 000000000000..e6780e5db85e --- /dev/null +++ b/example/distributed_training/cifar10_kvstore_hvd.py @@ -0,0 +1,237 @@ +#!/usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""cifar10_dist_hvd.py contains code that runs distributed training of a +ResNet18 network using Horovod framework""" + +import argparse +import logging +import time +import random +import types +import warnings + +import numpy as np +import mxnet as mx +from mxnet import autograd, gluon, kv, nd +from mxnet.gluon.model_zoo import vision + +logging.basicConfig(level=logging.INFO) + +# Training settings +parser = argparse.ArgumentParser(description='MXNet CIFAR Example') + +parser.add_argument('--batch-size', type=int, default=64, + help='training batch size per worker (default: 64)') +parser.add_argument('--epochs', type=int, default=5, + help='number of training epochs (default: 5)') +parser.add_argument('--lr', type=float, default=0.01, + help='learning rate (default: 0.01)') +parser.add_argument('--no-cuda', action='store_true', default=False, + help='disable training on GPU (default: False)') +args = parser.parse_args() + +if not args.no_cuda: + # Disable CUDA if there are no GPUs. 
+ if mx.context.num_gpus() == 0: + args.no_cuda = True + + +# Transform input data +def transform(data, label): + return nd.transpose(data.astype(np.float32), (2, 0, 1))/255,\ + label.astype(np.float32) + + +# Train a batch using multiple GPUs +def train(batch_list, context, network, gluon_trainer, metric): + """ Training with multiple GPUs + + Parameters + ---------- + batch_list: List + list of dataset + context: List + a list of all GPUs to be used for training + network: + ResNet + gluon_trainer: + rain module of gluon + """ + + # Run one forward and backward pass + def forward_backward(network, data, labels, metric): + with autograd.record(): + # Compute outputs + outputs = [network(X) for X in data] + # Compute the loss + losses = [loss(yhat, y) for yhat, y in zip(outputs, labels)] + + # Run the backward pass (calculate gradients) + for l in losses: + l.backward() + + metric.update(preds=outputs, labels=labels) + + # Use cross entropy loss + loss = gluon.loss.SoftmaxCrossEntropyLoss() + + # Split and load data + data = batch_list[0] + data = gluon.utils.split_and_load(data, context) + + # Split and load label + label = batch_list[1] + label = gluon.utils.split_and_load(label, context) + + # Run the forward and backward pass + forward_backward(network, data, label, metric) + + # Update the parameters + this_batch_size = batch_list[0].shape[0] + gluon_trainer.step(this_batch_size) + + +# Evaluate accuracy of the given network using the given data +def evaluate(data_iterator, network, context): + """ Measure the accuracy of ResNet + + Parameters + ---------- + data_iterator: Iter + examples of dataset + network: + ResNet + + Returns + ---------- + tuple of array element + """ + acc = mx.metric.Accuracy() + + # Iterate through data and label + for i, (data, label) in enumerate(data_iterator): + + # Get the data and label into the GPU + data = data.as_in_context(context) + label = label.as_in_context(context) + + # Get network's output which is a probability distribution + # Apply argmax on the probability distribution to get network's + # classification. + output = network(data) + predictions = nd.argmax(output, axis=1) + + # Give network's prediction and the correct label to update the metric + acc.update(preds=predictions, labels=label) + + # Return the accuracy + return acc.get()[1] + + +class SplitSampler(gluon.data.sampler.Sampler): + """ Split the dataset into `num_parts` parts and sample from the part with + index `part_index` + + Parameters + ---------- + length: int + Number of examples in the dataset + num_parts: int + Partition the data into multiple parts + part_index: int + The index of the part to read from + """ + def __init__(self, length, num_parts=1, part_index=0): + # Compute the length of each partition + self.part_len = length // num_parts + # Compute the start index for this partition + self.start = self.part_len * part_index + # Compute the end index for this partition + self.end = self.start + self.part_len + + def __iter__(self): + # Extract examples between `start` and `end`, shuffle and return them. 
+ indices = list(range(self.start, self.end)) + random.shuffle(indices) + return iter(indices) + + def __len__(self): + return self.part_len + + +# Use Horovod as the KVStore +store = kv.create('horovod') + +# Get the number of workers +num_workers = store.num_workers + +# Create the context based on the local rank of the current process +ctx = mx.cpu(store.local_rank) if args.no_cuda else mx.gpu(store.local_rank) + +# Load the training data +train_data = gluon.data.DataLoader(gluon.data.vision.CIFAR10(train=True, + transform=transform), args.batch_size, + sampler=SplitSampler(50000, + num_workers, + store.rank)) + +# Load the test data +test_data = gluon.data.DataLoader(gluon.data.vision.CIFAR10(train=False, + transform=transform), + args.batch_size, shuffle=False) + +# Load ResNet18 model from GluonCV model zoo +net = vision.resnet18_v1() + +# Initialize the parameters with Xavier initializer +net.initialize(mx.init.Xavier(), ctx=ctx) + +# Use Adam optimizer. Ask trainer to use the distributor kv store. +trainer = gluon.Trainer(net.collect_params(), optimizer='adam', + optimizer_params={'learning_rate': args.lr}, + kvstore=store) + +train_metric = mx.metric.Accuracy() + +# Run as many epochs as required +for epoch in range(args.epochs): + tic = time.time() + train_metric.reset() + + # Iterate through batches and run training using multiple GPUs + batch_num = 1 + btic = time.time() + for batch in train_data: + # Train the batch using multiple GPUs + train(batch, [ctx], net, trainer, train_metric) + if store.rank == 0 and batch_num % 100 == 0: + speed = args.batch_size / (time.time() - btic) + logging.info('Epoch[{}] Rank [{}] Batch[{}]\tSpeed: {:.2f} samples/sec' + .format(epoch, store.rank, batch_num, speed)) + logging.info('{} = {:.2f}'.format(*train_metric.get())) + + btic = time.time() + batch_num += 1 + + elapsed = time.time() - tic + # Print test accuracy after every epoch + test_accuracy = evaluate(test_data, net, ctx) + if store.rank == 0: + logging.info("Epoch %d: Test_acc %f" % (epoch, test_accuracy)) \ No newline at end of file diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index 303167d8abf2..fd03393b6374 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -25,6 +25,7 @@ from .parameter import ParameterDict, Parameter from ..kvstore import KVStore + class Trainer(object): """Applies an `Optimizer` on a set of Parameters. Trainer should be used together with `autograd`. diff --git a/python/mxnet/kvstore/__init__.py b/python/mxnet/kvstore/__init__.py index ccb58a1c6229..0547ed40631d 100644 --- a/python/mxnet/kvstore/__init__.py +++ b/python/mxnet/kvstore/__init__.py @@ -22,3 +22,4 @@ from .kvstore import * from .base import * from .kvstore_server import * +from .horovod import * diff --git a/python/mxnet/kvstore/horovod.py b/python/mxnet/kvstore/horovod.py new file mode 100644 index 000000000000..20a0cd89edaa --- /dev/null +++ b/python/mxnet/kvstore/horovod.py @@ -0,0 +1,161 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
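Each worker must see a disjoint shard of the dataset so that the allreduced gradients cover all of CIFAR-10; that is what the SplitSampler in the example above provides. A quick sanity check of its arithmetic, assuming 50 000 training images and two Horovod workers:

```python
length, num_parts = 50000, 2          # dataset size, number of workers
for part_index in range(num_parts):   # part_index corresponds to store.rank
    part_len = length // num_parts
    start = part_len * part_index
    end = start + part_len
    print(part_index, start, end)     # 0 0 25000  /  1 25000 50000
```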
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 +""" Key value store interface of MXNet for Horovod """ +from __future__ import absolute_import +from .base import KVStoreBase + +__all__ = ['Horovod'] + + +@KVStoreBase.register +class Horovod(KVStoreBase): + """A communication backend using Horovod.""" + + def __init__(self): + import horovod.mxnet as hvd + hvd.init() + + @property + def type(self): + return 'horovod' + + def broadcast(self, key, value, out, priority=0): + """ Broadcast the `value` NDArray at rank 0 to all ranks + + Parameters + ---------- + key : str, or int + The key is used to name the tensor for allreduce. Its + usage is different from that of parameter servers. + + value : NDArray + The tensor that is to be broadcasted. + + out : NDArray, list of NDArray + Output tensor that receives value broadcasted from root process + + priority : int, optional + The priority of the operation. + Higher priority operations are likely to be executed before other actions. + + Examples + -------- + >>> a = mx.nd.ones(shape) + >>> b = mx.nd.zeros(shape) + >>> kv.broadcast('2', value=a, out=b) + >>> print(b.asnumpy) + [[ 1. 1. 1.] + [ 1. 1. 1.]] + """ + import horovod.mxnet as hvd + + out = out if isinstance(out, list) else [out] + + # TODO (lnyuan): need to copy data to each device memory + for o in out: + o[:] = hvd.broadcast(tensor=value, root_rank=0, name=str(key), + priority=priority) + + def pushpull(self, key, value, out=None, priority=0): + """ Performs allreduce on a single tensor or a list of tensor objects + + This function performs in-place summation of the input tensor over all the processes. + + The name `pushpull` is a generic term. In Horovod, its action is implemented via + ring allreduce. Each operation is identified by the 'key'; if `key` is not provided, an + incremented auto-generated name is used. The tensor type and shape must be + the same on all processes for a given name. The reduction will not start until all processes + are ready to send and receive the tensor. + + Parameters + ---------- + key : str, int, or sequence of str or int + Keys used to uniquely tag an operation. + + value : NDArray + Tensor value on one process to be summed. If `out` is not specified, the `value` will + be modified in-place + + out: NDArray + Output tensor after allreduce. If not specified, the input tensor `value` will be + modified in-place. + + priority : int, optional + The priority of the operation. + Higher priority operations are likely to be executed before other actions. + + Examples + -------- + >>> # perform in-place allreduce on tensor a + >>> shape = (2, 3) + >>> nworker = kv.num_workers # assume there are 8 processes + >>> a = mx.nd.ones(shape) + >>> kv.pushpull('1', a) + >>> print(a.asnumpy()) + [[ 8. 8. 8.] + [ 8. 8. 8.]] + + >>> # perform allreduce on tensor a and output to b + >>> a = mx.nd.ones(shape) + >>> kv.pushpull('2', a, out=b) + >>> print(b.asnumpy()) + [[ 8. 8. 8.] + [ 8. 8. 
8.]] + """ + import horovod.mxnet as hvd + + if out is None: + value = value if isinstance(value, list) else [value] + for v in value: + hvd.allreduce_(v, average=False, name=str(key), + priority=priority) + else: + out = out if isinstance(out, list) else [out] + value = value if isinstance(value, list) else [value] + for o, v in zip(out, value): + o[:] = hvd.allreduce(v, average=False, name=str(key), + priority=priority) + + def set_optimizer(self, optimizer): + pass + + @staticmethod + def is_capable(capability): + return False + + def save_optimizer_states(self, fname, dump_optimizer=False): + pass + + def load_optimizer_states(self, fname): + pass + + @property + def rank(self): + import horovod.mxnet as hvd + return hvd.rank() + + @property + def local_rank(self): + import horovod.mxnet as hvd + return hvd.local_rank() + + @property + def num_workers(self): + import horovod.mxnet as hvd + return hvd.size() diff --git a/python/mxnet/kvstore/kvstore.py b/python/mxnet/kvstore/kvstore.py index 11ec3f98178f..ad83ad4fac7c 100644 --- a/python/mxnet/kvstore/kvstore.py +++ b/python/mxnet/kvstore/kvstore.py @@ -209,6 +209,7 @@ def push(self, key, value, priority=0): Examples -------- >>> # push a single key-value pair + >>> shape = (2,3) >>> kv.push('3', mx.nd.ones(shape)*8) >>> kv.pull('3', out=a) # pull out the value >>> print a.asnumpy() @@ -295,6 +296,7 @@ def pull(self, key, out=None, priority=0, ignore_sparse=True): Examples -------- >>> # pull a single key-value pair + >>> shape = (2,3) >>> a = mx.nd.zeros(shape) >>> kv.pull('3', out=a) >>> print a.asnumpy() @@ -367,6 +369,7 @@ def pushpull(self, key, value, out=None, priority=0): Examples -------- >>> # pushpull a single key-value pair + >>> shape = (2,3) >>> kv.pushpull('3', mx.nd.ones(shape)*8, out=a) >>> print a.asnumpy() [[ 8. 8. 8.] diff --git a/tests/nightly/dist_device_sync_kvstore_horovod.py b/tests/nightly/dist_device_sync_kvstore_horovod.py new file mode 100644 index 000000000000..b5dfcafc8af1 --- /dev/null +++ b/tests/nightly/dist_device_sync_kvstore_horovod.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
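Condensed usage of the Horovod-backed KVStore defined above (the nightly test that follows exercises the same broadcast/pushpull calls); this sketch assumes horovod.mxnet is installed and the script is launched with mpirun or horovodrun, one process per device:

```python
import mxnet as mx

kv = mx.kv.create('horovod')                      # hvd.init() happens here
ctx = mx.gpu(kv.local_rank) if mx.context.num_gpus() > 0 else mx.cpu()

shape = (2, 3)
weight = mx.nd.zeros(shape, ctx=ctx)
kv.broadcast('w', mx.nd.ones(shape, ctx=ctx), out=weight)   # rank 0's value to all ranks

grad = mx.nd.ones(shape, ctx=ctx) * (kv.rank + 1)
kv.pushpull('w', grad)                            # in-place ring allreduce (sum)
# every rank now holds sum_{r=0}^{N-1} (r + 1) = N * (N + 1) / 2,
# with N = kv.num_workers
```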
+ +import sys +sys.path.insert(0, "../../python/") +import mxnet as mx +import numpy as np +import numpy.random as rnd +import time +import argparse + +# parser +parser = argparse.ArgumentParser(description='kvstore test') +args = parser.parse_args() + + +def check_diff_to_scalar(A, x, rank=None): + """ assert A == x""" + assert(np.sum(np.abs((A - x).asnumpy())) == 0), (rank, A.asnumpy(), x) + + +# setup +keys = ['3', '5', '7'] +init_test_keys = [str(i) for i in range(200,300)] +init_test_keys_big = [str(i) for i in range(300,400)] +init_test_keys_device = [str(i) for i in range(400,500)] +init_test_keys_device_big = [str(i) for i in range(500,600)] + +shape = (2, 3) +big_shape = (1200, 1200) # bigger than MXNET_KVSTORE_BIGARRAY_BOUND + +kv = mx.kv.create('horovod') +my_rank = kv.rank +my_num_workers = kv.num_workers + + +def test_pushpull(): + ctx = mx.gpu(kv.local_rank) if mx.context.num_gpus() > 0 else mx.cpu(kv.local_rank) + scale = kv.rank + 1 + tensor = mx.nd.ones(shape, ctx) * scale + kv.pushpull('3', tensor) + + expected = (kv.num_workers + 1) * kv.num_workers / 2 + check_diff_to_scalar(tensor, expected) + print('worker ' + str(kv.local_rank) + ' passed test_pushpull') + + +def test_broadcast(): + ctx = mx.gpu(kv.local_rank) if mx.context.num_gpus() > 0 else mx.cpu(kv.local_rank) + val = mx.nd.zeros(shape, ctx) + kv.broadcast('0', mx.nd.ones(shape), out=val) + expected = 1 + check_diff_to_scalar(val, expected, kv.rank) + print('worker ' + str(kv.local_rank) + ' passed test_broadcast') + + +def test_type(): + assert kv.type == 'horovod' + + +if __name__ == "__main__": + test_type() + test_broadcast() + test_pushpull() diff --git a/tests/nightly/test_distributed_training-gpu.sh b/tests/nightly/test_distributed_training-gpu.sh index 9ce9cccb09da..40b6e1464a0d 100755 --- a/tests/nightly/test_distributed_training-gpu.sh +++ b/tests/nightly/test_distributed_training-gpu.sh @@ -31,7 +31,6 @@ test_kvstore() { ) for arg in "${test_args[@]}"; do - echo $arg python3 ../../tools/launch.py $arg if [ $? -ne 0 ]; then return $? @@ -39,6 +38,16 @@ test_kvstore() { done } +test_horovod() { + echo "localhost slots=2" > hosts + mpirun -np 2 --hostfile hosts --bind-to none --map-by slot -mca pml ob1 \ + -mca btl ^openib python3 dist_device_sync_kvstore_horovod.py + if [ $? -ne 0 ]; then + return $? + fi +} + test_kvstore +test_horovod exit $errors \ No newline at end of file diff --git a/tools/launch.py b/tools/launch.py index 7000e061fd4b..117dab69b1b6 100755 --- a/tools/launch.py +++ b/tools/launch.py @@ -28,6 +28,7 @@ curr_path = os.path.abspath(os.path.dirname(__file__)) sys.path.append(os.path.join(curr_path, "../3rdparty/dmlc-core/tracker")) + def dmlc_opts(opts): """convert from mxnet's opts to dmlc's opts """ @@ -41,14 +42,14 @@ def dmlc_opts(opts): dopts = vars(opts) for key in ['env_server', 'env_worker', 'env']: for v in dopts[key]: - args.append('--' + key.replace("_","-")) + args.append('--' + key.replace("_", "-")) args.append(v) args += opts.command try: from dmlc_tracker import opts except ImportError: - print("Can't load dmlc_tracker package. Perhaps you need to run") - print(" git submodule update --init --recursive") + logging.info("Can't load dmlc_tracker package. 
Perhaps you need to run") + logging.info(" git submodule update --init --recursive") raise dmlc_opts = opts.get_opts(args) return dmlc_opts @@ -57,39 +58,39 @@ def dmlc_opts(opts): def main(): parser = argparse.ArgumentParser(description='Launch a distributed job') parser.add_argument('-n', '--num-workers', required=True, type=int, - help = 'number of worker nodes to be launched') + help='number of worker nodes to be launched') parser.add_argument('-s', '--num-servers', type=int, - help = 'number of server nodes to be launched, \ + help='number of server nodes to be launched, \ in default it is equal to NUM_WORKERS') parser.add_argument('-H', '--hostfile', type=str, help = 'the hostfile of slave machines which will run \ the job. Required for ssh and mpi launcher') parser.add_argument('--sync-dst-dir', type=str, - help = 'if specificed, it will sync the current \ + help='if specificed, it will sync the current \ directory into slave machines\'s SYNC_DST_DIR if ssh \ launcher is used') parser.add_argument('--launcher', type=str, default='ssh', choices = ['local', 'ssh', 'mpi', 'sge', 'yarn'], - help = 'the launcher to use') + help='the launcher to use') parser.add_argument('--env-server', action='append', default=[], - help = 'Given a pair of environment_variable:value, sets this value of \ + help='Given a pair of environment_variable:value, sets this value of \ environment variable for the server processes. This overrides values of \ those environment variable on the machine where this script is run from. \ Example OMP_NUM_THREADS:3') parser.add_argument('--env-worker', action='append', default=[], - help = 'Given a pair of environment_variable:value, sets this value of \ + help='Given a pair of environment_variable:value, sets this value of \ environment variable for the worker processes. This overrides values of \ those environment variable on the machine where this script is run from. \ Example OMP_NUM_THREADS:3') parser.add_argument('--env', action='append', default=[], - help = 'given a environment variable, passes their \ + help='given a environment variable, passes their \ values from current system to all workers and servers. 
\ Not necessary when launcher is local as in that case \ all environment variables which are set are copied.') parser.add_argument('--p3', action='store_true', default=False, help = 'Use P3 distributed training') parser.add_argument('command', nargs='+', - help = 'command for launching the program') + help='command for launching the program') args, unknown = parser.parse_known_args() args.command += unknown if args.num_servers is None: @@ -100,31 +101,33 @@ def main(): args = dmlc_opts(args) if args.host_file is None or args.host_file == 'None': - if args.cluster == 'yarn': - from dmlc_tracker import yarn - yarn.submit(args) - elif args.cluster == 'local': - from dmlc_tracker import local - local.submit(args) - elif args.cluster == 'sge': - from dmlc_tracker import sge - sge.submit(args) - else: - raise RuntimeError('Unknown submission cluster type %s' % args.cluster) + if args.cluster == 'yarn': + from dmlc_tracker import yarn + yarn.submit(args) + elif args.cluster == 'local': + from dmlc_tracker import local + local.submit(args) + elif args.cluster == 'sge': + from dmlc_tracker import sge + sge.submit(args) + else: + raise RuntimeError('Unknown submission cluster type %s' % args.cluster) else: - if args.cluster == 'ssh': - from dmlc_tracker import ssh - ssh.submit(args) - elif args.cluster == 'mpi': - from dmlc_tracker import mpi - mpi.submit(args) - else: - raise RuntimeError('Unknown submission cluster type %s' % args.cluster) + if args.cluster == 'ssh': + from dmlc_tracker import ssh + ssh.submit(args) + elif args.cluster == 'mpi': + from dmlc_tracker import mpi + mpi.submit(args) + else: + raise RuntimeError('Unknown submission cluster type %s' % args.cluster) + def signal_handler(signal, frame): logging.info('Stop launcher') sys.exit(0) + if __name__ == '__main__': fmt = '%(asctime)s %(levelname)s %(message)s' logging.basicConfig(format=fmt, level=logging.INFO) From c7d2b3c81ff43c806a58c924c64203facaa1cee3 Mon Sep 17 00:00:00 2001 From: vexilligera Date: Tue, 14 Apr 2020 23:33:41 +0000 Subject: [PATCH 06/14] [NumPy] Add NumPy support for triu (#17614) * triu * rebase * fix ci * merge * triu new ffi * cpplint * cpplint * ffi benchmark * fix style * merge * fix conflict Co-authored-by: Ubuntu Co-authored-by: Hao Jin --- benchmark/python/ffi/benchmark_ffi.py | 1 + python/mxnet/ndarray/numpy/_op.py | 27 +- python/mxnet/numpy/multiarray.py | 27 +- python/mxnet/symbol/numpy/_symbol.py | 28 +- src/api/operator/numpy/np_triu_op.cc | 50 ++++ src/operator/numpy/np_triu_op-inl.h | 241 ++++++++++++++++++ src/operator/numpy/np_triu_op.cc | 61 +++++ src/operator/numpy/np_triu_op.cu | 38 +++ .../unittest/test_numpy_interoperability.py | 17 ++ tests/python/unittest/test_numpy_op.py | 62 +++++ 10 files changed, 549 insertions(+), 3 deletions(-) create mode 100644 src/api/operator/numpy/np_triu_op.cc create mode 100644 src/operator/numpy/np_triu_op-inl.h create mode 100644 src/operator/numpy/np_triu_op.cc create mode 100644 src/operator/numpy/np_triu_op.cu diff --git a/benchmark/python/ffi/benchmark_ffi.py b/benchmark/python/ffi/benchmark_ffi.py index 328a74c98c83..8f8da670d64c 100644 --- a/benchmark/python/ffi/benchmark_ffi.py +++ b/benchmark/python/ffi/benchmark_ffi.py @@ -137,6 +137,7 @@ def prepare_workloads(): out=dnp.array([False, False], dtype=bool), keepdims=False) OpArgMngr.add_workload("roll", pool["2x2"], 1, axis=0) OpArgMngr.add_workload("rot90", pool["2x2"], 2) + OpArgMngr.add_workload("triu", pool['3x3']) OpArgMngr.add_workload("array_split", pool['2x2'], 2, axis=1) 
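For reference while reading the frontend and kernel changes below, the `k` convention matches NumPy's: `k = 0` keeps the main diagonal and everything above it, `k > 0` starts above the main diagonal, `k < 0` keeps diagonals below it, and a 1-D input is broadcast to a square matrix first. A small NumPy sketch of the behaviour the unit tests later in this patch compare against:

```python
import numpy as np

a = np.arange(1, 10).reshape(3, 3)
print(np.triu(a))         # zeros strictly below the main diagonal
print(np.triu(a, k=1))    # also zeros the main diagonal
print(np.triu(a, k=-1))   # keeps the first diagonal below it

print(np.triu(np.array([1., 2., 3.])))   # 1-D input -> (3, 3) output, each row masked

# Backward rule checked by the tests: for 2-D x, the gradient of
# sum(triu(x, k)) w.r.t. x is just the 0/1 mask triu(ones_like(x), k).
print(np.triu(np.ones((3, 3)), k=1))
```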
OpArgMngr.add_workload("vsplit", pool['2x2'], 2) OpArgMngr.add_workload("hsplit", pool['2x2'], 2) diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index e88796c8158d..d1b80cadf484 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -43,7 +43,7 @@ 'swapaxes', 'clip', 'argmax', 'argmin', 'std', 'var', 'indices', 'copysign', 'ravel', 'unravel_index', 'diag_indices_from', 'hanning', 'hamming', 'blackman', 'flip', 'flipud', 'fliplr', 'hypot', 'bitwise_and', 'bitwise_xor', 'bitwise_or', 'rad2deg', 'deg2rad', 'unique', 'lcm', - 'tril', 'identity', 'take', 'ldexp', 'vdot', 'inner', 'outer', 'kron', + 'tril', 'triu', 'identity', 'take', 'ldexp', 'vdot', 'inner', 'outer', 'kron', 'equal', 'not_equal', 'greater', 'less', 'greater_equal', 'less_equal', 'roll', 'rot90', 'einsum', 'true_divide', 'nonzero', 'quantile', 'percentile', 'shares_memory', 'may_share_memory', 'interp', 'diff', 'ediff1d', 'resize', 'polyval', 'nan_to_num', 'isnan', 'isinf', 'isposinf', 'isneginf', 'isfinite', @@ -2070,6 +2070,31 @@ def tril(m, k=0): return _api_internal.tril(m, k) +@set_module('mxnet.ndarray.numpy') +def triu(m, k=0): + r""" + Upper triangle of an array. + + Return a copy of a matrix with the elements below the `k`-th diagonal + zeroed. + + Please refer to the documentation for `tril` for further details. + + See Also + -------- + tril : lower triangle of an array + + Examples + -------- + >>> np.triu(np.array([[1,2,3],[4,5,6],[7,8,9],[10,11,12]]), -1) + array([[ 1, 2, 3], + [ 4, 5, 6], + [ 0, 8, 9], + [ 0, 0, 12]]) + """ + return _api_internal.triu(m, k) + + def _unary_func_helper(x, fn_array, fn_scalar, out=None, **kwargs): """Helper function for unary operators with kwargs. diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index 3b57f1a2a5fd..feec9ca89a9a 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -68,7 +68,7 @@ 'indices', 'copysign', 'ravel', 'unravel_index', 'diag_indices_from', 'hanning', 'hamming', 'blackman', 'flip', 'flipud', 'fliplr', 'around', 'round', 'round_', 'arctan2', 'hypot', 'bitwise_and', 'bitwise_xor', 'bitwise_or', 'rad2deg', 'deg2rad', - 'unique', 'lcm', 'tril', 'identity', 'take', 'ldexp', 'vdot', 'inner', 'outer', 'kron', + 'unique', 'lcm', 'tril', 'triu', 'identity', 'take', 'ldexp', 'vdot', 'inner', 'outer', 'kron', 'equal', 'not_equal', 'interp', 'greater', 'less', 'greater_equal', 'less_equal', 'roll', 'rot90', 'einsum', 'true_divide', 'nonzero', 'quantile', 'percentile', 'shares_memory', 'may_share_memory', 'diff', 'ediff1d', 'resize', 'matmul', @@ -5614,6 +5614,31 @@ def tril_indices(n, k=0, m=None): # pylint: disable=redefined-outer-name +@set_module('mxnet.numpy') +def triu(m, k=0): + r""" + Upper triangle of an array. + + Return a copy of a matrix with the elements below the `k`-th diagonal + zeroed. + + Please refer to the documentation for `tril` for further details. + + See Also + -------- + tril : lower triangle of an array + + Examples + -------- + >>> np.triu(np.array([[1,2,3],[4,5,6],[7,8,9],[10,11,12]]), -1) + array([[ 1, 2, 3], + [ 4, 5, 6], + [ 0, 8, 9], + [ 0, 0, 12]]) + """ + return _mx_nd_np.triu(m, k) + + @set_module('mxnet.numpy') def arange(start, stop=None, step=1, dtype=None, ctx=None): """Return evenly spaced values within a given interval. 
diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py index 9749721ed9e1..5061b772ae8f 100644 --- a/python/mxnet/symbol/numpy/_symbol.py +++ b/python/mxnet/symbol/numpy/_symbol.py @@ -49,7 +49,7 @@ 'swapaxes', 'clip', 'argmax', 'argmin', 'std', 'var', 'indices', 'copysign', 'ravel', 'unravel_index', 'diag_indices_from', 'hanning', 'hamming', 'blackman', 'flip', 'flipud', 'fliplr', 'hypot', 'bitwise_and', 'bitwise_xor', 'bitwise_or', 'rad2deg', 'deg2rad', 'unique', 'lcm', 'interp', - 'tril', 'identity', 'take', 'ldexp', 'vdot', 'inner', 'outer', 'kron', + 'tril', 'triu', 'identity', 'take', 'ldexp', 'vdot', 'inner', 'outer', 'kron', 'equal', 'not_equal', 'greater', 'less', 'greater_equal', 'less_equal', 'roll', 'rot90', 'einsum', 'true_divide', 'quantile', 'percentile', 'shares_memory', 'may_share_memory', 'diff', 'ediff1d', 'resize', 'polyval', 'nan_to_num', 'isnan', 'isinf', 'isposinf', 'isneginf', 'isfinite', @@ -2200,6 +2200,32 @@ def tril(m, k=0): @set_module('mxnet.symbol.numpy') +def triu(m, k=0): + r""" + Upper triangle of an array. + + Return a copy of an array with elements under the `k`-th diagonal zeroed. + + Parameters + ---------- + m : _Symbol, shape (M, N) + Input array. + k : int, optional + Diagonal under which to zero elements. `k = 0` (the default) is the + main diagonal, `k < 0` is below it and `k > 0` is under. + + Returns + ------- + triu : _Symbol, shape (M, N) + Upper triangle of `m`, of same shape and data-type as `m`. + + See Also + -------- + tril : same thing, only for the lower triangle + """ + return _npi.triu(m, k) + + def tril_indices(n, k=0, m=None): """ Return the indices for the lower-triangle of an (n, m) array. diff --git a/src/api/operator/numpy/np_triu_op.cc b/src/api/operator/numpy/np_triu_op.cc new file mode 100644 index 000000000000..e42169aca43b --- /dev/null +++ b/src/api/operator/numpy/np_triu_op.cc @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file np_cumsum.cc + * \brief Implementation of the API of functions in src/operator/numpy/np_triu_op.cc + */ +#include +#include +#include "../utils.h" +#include "../../../operator/numpy/np_triu_op-inl.h" + +namespace mxnet { + +MXNET_REGISTER_API("_npi.triu") +.set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { + using namespace runtime; + op::TriuParam param; + nnvm::NodeAttrs attrs; + const nnvm::Op* op = Op::Get("_npi_triu"); + // inputs + param.k = args[1].operator int(); + NDArray* inputs[] = {args[0].operator NDArray*()}; + + attrs.op = op; + attrs.parsed = param; + SetAttrDict(&attrs); + + int num_outputs = 0; + auto ndoutputs = Invoke(op, &attrs, 1, inputs, &num_outputs, nullptr); + *ret = reinterpret_cast(ndoutputs[0]); +}); + +} // namespace mxnet diff --git a/src/operator/numpy/np_triu_op-inl.h b/src/operator/numpy/np_triu_op-inl.h new file mode 100644 index 000000000000..17a484f26efb --- /dev/null +++ b/src/operator/numpy/np_triu_op-inl.h @@ -0,0 +1,241 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2020 by Contributors + * \file np_triu_op-inl.h + * \brief Function definition of triu (upper triangle of an array) op + */ + +#ifndef MXNET_OPERATOR_NUMPY_NP_TRIU_OP_INL_H_ +#define MXNET_OPERATOR_NUMPY_NP_TRIU_OP_INL_H_ + +#include +#include +#include +#include +#include +#include +#include "../mxnet_op.h" +#include "../operator_common.h" +#include "../elemwise_op_common.h" + +namespace mxnet { +namespace op { + +struct TriuParam : public dmlc::Parameter { + int k; + DMLC_DECLARE_PARAMETER(TriuParam) { + DMLC_DECLARE_FIELD(k) + .set_default(0) + .describe("Diagonal in question. The default is 0. " + "Use k>0 for diagonals above the main diagonal, " + "and k<0 for diagonals below the main diagonal. 
" + "If input has shape (S0 S1) k must be between -S0 and S1."); + } + void SetAttrDict(std::unordered_map* dict) { + std::ostringstream k_s; + k_s << k; + (*dict)["k"] = k_s.str(); + } +}; + +inline bool TriuOpShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector* in_attrs, + mxnet::ShapeVector* out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + + const mxnet::TShape& ishape = (*in_attrs)[0]; + mxnet::TShape oshape; + + if (!mxnet::ndim_is_known(ishape)) { + return false; + } + + if (ishape.ndim() == 1) { + auto s = ishape[0]; + oshape = mxnet::TShape({s, s}); + } else { + oshape = ishape; + } + + if (shape_is_none(oshape)) { + LOG(FATAL) << "Diagonal does not exist."; + } + SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape); + + return shape_is_known(out_attrs->at(0)); +} + +template +struct triu1Dforward { + template + MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType* data, + mshadow::Shape<2> oshape, int k) { + using namespace mxnet_op; + + const index_t row_id = i / oshape[1]; + const index_t col_id = i % oshape[1]; + if (col_id < (row_id + k)) { + KERNEL_ASSIGN(out[i], req, static_cast(0)); + } else { + KERNEL_ASSIGN(out[i], req, data[col_id]); + } + } +}; + +template +struct triu1Dbackward { + template + MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType* data, + mshadow::Shape<1> oshape, int k) { + using namespace mxnet_op; + auto m = oshape[0]; + auto start = i - k; + DType res = 0; + for (auto y = 0; y <= start && y < m; y++) { + res += data[y * m + i]; + } + KERNEL_ASSIGN(out[i], req, res); + } +}; + +template +struct triu2D { + template + MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType* data, + mshadow::Shape<2> oshape, int k) { + using namespace mxnet_op; + + const index_t row_id = i / oshape[1]; + const index_t col_id = i % oshape[1]; + if (col_id < (row_id + k)) { + KERNEL_ASSIGN(out[i], req, static_cast(0)); + } else { + KERNEL_ASSIGN(out[i], req, data[i]); + } + } +}; + +template +struct triu3D { + template + MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType* data, + mshadow::Shape<3> oshape, int k) { + using namespace mxnet_op; + + const index_t row_id = i % (oshape[1] * oshape[2]) / oshape[2]; + const index_t col_id = i % (oshape[1] * oshape[2]) % oshape[2]; + if (col_id < (row_id + k)) { + KERNEL_ASSIGN(out[i], req, static_cast(0)); + } else { + KERNEL_ASSIGN(out[i], req, data[i]); + } + } +}; + +template +void TriuOpProcess(const TBlob& in_data, + const TBlob& out_data, + index_t dsize, + const TriuParam& param, + mxnet_op::Stream *s, + const std::vector& req) { + using namespace mxnet_op; + using namespace mshadow; + + const mxnet::TShape& ishape = in_data.shape_; + const mxnet::TShape& oshape = out_data.shape_; + + if (ishape.ndim() == 2 && oshape.ndim() == 2) { + MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, { + MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { + Kernel, xpu>::Launch( + s, dsize, out_data.dptr(), in_data.dptr(), + Shape2(oshape[0], oshape[1]), param.k); + }); + }); + } else if (ishape.ndim() > 2) { + MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, { + MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { + Kernel, xpu>::Launch( + s, dsize, out_data.dptr(), in_data.dptr(), + oshape.FlatTo3D(oshape.ndim() - 2), param.k); + }); + }); + } else { + MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, { + MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { + if (back) { + Kernel, xpu>::Launch( + s, dsize, out_data.dptr(), in_data.dptr(), + Shape1(oshape[0]), param.k); + } else { + 
Kernel, xpu>::Launch( + s, dsize, out_data.dptr(), in_data.dptr(), + Shape2(oshape[0], oshape[1]), param.k); + } + }); + }); + } +} + +template +void TriuOpForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mxnet_op; + using namespace mshadow; + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + Stream *s = ctx.get_stream(); + const TBlob& in_data = inputs[0]; + const TBlob& out_data = outputs[0]; + const TriuParam& param = nnvm::get(attrs.parsed); + + TriuOpProcess(in_data, out_data, out_data.Size(), param, s, req); +} + +template +void TriuOpBackward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mxnet_op; + using namespace mshadow; + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + Stream *s = ctx.get_stream(); + + const TBlob& in_data = inputs[0]; + const TBlob& out_data = outputs[0]; + const TriuParam& param = nnvm::get(attrs.parsed); + + TriuOpProcess(in_data, out_data, out_data.Size(), param, s, req); +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_NUMPY_NP_TRIU_OP_INL_H_ diff --git a/src/operator/numpy/np_triu_op.cc b/src/operator/numpy/np_triu_op.cc new file mode 100644 index 000000000000..fdd526060001 --- /dev/null +++ b/src/operator/numpy/np_triu_op.cc @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+* Copyright (c) 2020 by Contributors +* \file np_triu_op.cc +* \brief CPU implementation of numpy triu operator +*/ + +#include "./np_triu_op-inl.h" + +namespace mxnet { +namespace op { + +DMLC_REGISTER_PARAMETER(TriuParam); + +NNVM_REGISTER_OP(_npi_triu) +.set_attr_parser(ParamParser) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"data"}; + }) +.set_attr("FInferShape", TriuOpShape) +.set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FCompute", TriuOpForward) +.set_attr("FInplaceOption", + [](const NodeAttrs& attrs) { + return std::vector >{{0, 0}}; + }) +.set_attr("FGradient", ElemwiseGradUseNone{"_backward_triu"}) +.add_argument("data", "NDArray-or-Symbol", "Input ndarray") +.add_arguments(TriuParam::__FIELDS__()); + + +NNVM_REGISTER_OP(_backward_triu) +.set_attr_parser(ParamParser) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr("TIsBackward", true) +.set_attr("FCompute", TriuOpBackward); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/numpy/np_triu_op.cu b/src/operator/numpy/np_triu_op.cu new file mode 100644 index 000000000000..a143859e1db6 --- /dev/null +++ b/src/operator/numpy/np_triu_op.cu @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2020 by Contributors + * \file np_triu_op.cu + * \brief GPU implementation of numpy triu operator + */ + +#include "./np_triu_op-inl.h" + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(_npi_triu) +.set_attr("FCompute", TriuOpForward); + +NNVM_REGISTER_OP(_backward_triu) +.set_attr("FCompute", TriuOpBackward); + +} // namespace op +} // namespace mxnet diff --git a/tests/python/unittest/test_numpy_interoperability.py b/tests/python/unittest/test_numpy_interoperability.py index 79a45f5e46f6..080fb03a7158 100644 --- a/tests/python/unittest/test_numpy_interoperability.py +++ b/tests/python/unittest/test_numpy_interoperability.py @@ -722,6 +722,23 @@ def _add_workload_tril(): OpArgMngr.add_workload('tril', np.zeros((3, 3), dtype=dt)) +def _add_workload_triu(): + OpArgMngr.add_workload('triu', np.random.uniform(size=(4, 1))) + for dt in ['float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8']: + OpArgMngr.add_workload('triu', np.ones((2, 2), dtype=dt)) + a = np.array([ + [[1, 1], [1, 1]], + [[1, 1], [1, 0]], + [[1, 1], [0, 0]], + ], dtype=dt) + OpArgMngr.add_workload('triu', a) + arr = np.array([[1, 1, np.inf], + [1, 1, 1], + [np.inf, 1, 1]]) + OpArgMngr.add_workload('triu', arr) + OpArgMngr.add_workload('triu', np.zeros((3, 3), dtype=dt)) + + def _add_workload_einsum(): chars = 'abcdefghij' sizes = [2, 3, 4, 5, 4, 3, 2, 6, 5, 4] diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 802bfb78bb01..284791763d20 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -2085,6 +2085,7 @@ def hybrid_forward(self, F, x): ret_mx = np.tril(data_mx, k*prefix) assert same(ret_mx.asnumpy(), ret_np) ret_mx.backward() + print(data_mx.grad) if len(shape) == 2: grad_np = _np.tri(*shape, k=k*prefix) assert same(data_mx.grad.asnumpy(), grad_np) @@ -2101,6 +2102,67 @@ def hybrid_forward(self, F, x): assert same(ret_mx.asnumpy(), ret_np) +@with_seed() +@use_np +def test_np_triu(): + # numpy triu does not support scalar array (zero-dim) + config = [ + ((4, 2), 3), + ((4, 2), 9), + ((4, 2), 0), + ((4, 2), -1), + ((4, 5, 6), 0), + ((4, 5, 6), 5), + ((4, 5, 6), 2), + ((4, 5, 6), -2), + ((4, 5, 6), -5), + ((4, 0), 0), + ((4, 0), 2), + ((4, 0), 4), + ((4, 0), -3), + ((4, 0, 5), 0), + ((4, 0, 5), 1), + ((4, 0, 5), 5), + ((4, 0, 5), -3), + ((3, ), 0), + ((3, ), 2), + ((3, ), 5) + ] + + class TestTriu(HybridBlock): + def __init__(self, k): + super(TestTriu, self).__init__() + self._k = k + + def hybrid_forward(self, F, x): + return F.np.triu(x, k=self._k) + + for prefix in [1, -1]: + for shape, k in config: + data_np = _np.random.uniform(size=shape) + data_mx = np.array(data_np, dtype=data_np.dtype) + data_mx.attach_grad() + ret_np = _np.triu(data_np, k*prefix) + with mx.autograd.record(): + ret_mx = np.triu(data_mx, k*prefix) + assert same(ret_mx.asnumpy(), ret_np) + ret_mx.backward() + if len(shape) == 2: + grad_np = _np.triu(_np.ones_like(data_np), k*prefix) + assert same(data_mx.grad.asnumpy(), grad_np) + if len(shape) == 1: + grad_np = _np.triu(_np.ones(shape), k*prefix) + grad_np = grad_np.sum(axis=0, keepdims=False) + assert same(data_mx.grad.asnumpy(), grad_np) + + net = TestTriu(k*prefix) + for hybrid in [False, True]: + if hybrid: + net.hybridize() + ret_mx = net(data_mx) + assert same(ret_mx.asnumpy(), ret_np) + + @with_seed() @use_np def test_np_unary_funcs(): From 2c4732b74c1ccbbeccc7add27d6bbcffe437421b Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Tue, 14 Apr 2020 22:31:41 -0700 
Subject: [PATCH 07/14] Fix CI (#18056) * Fix Julia tests always testing master branch instead of PR branches. * Fix ci/docker_cache.py * Fix Dockerfiles for nightly tests --- ci/build.py | 2 +- ci/docker/Dockerfile.build.ubuntu_nightly_cpu | 3 +++ ci/docker/Dockerfile.build.ubuntu_nightly_gpu | 3 +++ ci/docker/runtime_functions.sh | 1 + julia/deps/build.jl | 2 +- julia/src/base.jl | 1 + 6 files changed, 10 insertions(+), 2 deletions(-) diff --git a/ci/build.py b/ci/build.py index cbc41218f042..c6a96d5626f0 100755 --- a/ci/build.py +++ b/ci/build.py @@ -71,7 +71,7 @@ def get_docker_binary(use_nvidia_docker: bool) -> str: def build_docker(platform: str, docker_binary: str, registry: str, num_retries: int, no_cache: bool, - cache_intermediate: bool) -> str: + cache_intermediate: bool = False) -> str: """ Build a container for the given platform :param platform: Platform diff --git a/ci/docker/Dockerfile.build.ubuntu_nightly_cpu b/ci/docker/Dockerfile.build.ubuntu_nightly_cpu index 49a665e57c33..47754b6e2f69 100644 --- a/ci/docker/Dockerfile.build.ubuntu_nightly_cpu +++ b/ci/docker/Dockerfile.build.ubuntu_nightly_cpu @@ -42,6 +42,9 @@ RUN /work/ubuntu_perl.sh COPY install/ubuntu_clang.sh /work/ RUN /work/ubuntu_clang.sh +COPY install/ubuntu_gcc8.sh /work/ +RUN /work/ubuntu_gcc8.sh + COPY install/ubuntu_caffe.sh /work/ RUN /work/ubuntu_caffe.sh diff --git a/ci/docker/Dockerfile.build.ubuntu_nightly_gpu b/ci/docker/Dockerfile.build.ubuntu_nightly_gpu index 82d049792c1b..e4e7bd141622 100644 --- a/ci/docker/Dockerfile.build.ubuntu_nightly_gpu +++ b/ci/docker/Dockerfile.build.ubuntu_nightly_gpu @@ -42,6 +42,9 @@ RUN /work/ubuntu_perl.sh COPY install/ubuntu_clang.sh /work/ RUN /work/ubuntu_clang.sh +COPY install/ubuntu_gcc8.sh /work/ +RUN /work/ubuntu_gcc8.sh + COPY install/ubuntu_tvm.sh /work/ RUN /work/ubuntu_tvm.sh diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 5ea21bcdd1dd..88b6dfec8139 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -2087,6 +2087,7 @@ publish_scala_build() { pushd . scala_prepare source /opt/rh/devtoolset-7/enable + export USE_SYSTEM_CUDA=1 ./ci/publish/scala/build.sh popd } diff --git a/julia/deps/build.jl b/julia/deps/build.jl index a79d2a062c18..5194dc3c622a 100644 --- a/julia/deps/build.jl +++ b/julia/deps/build.jl @@ -33,7 +33,7 @@ if haskey(ENV, "MXNET_HOME") # In case of macOS, if user build libmxnet from source and set the MXNET_HOME, # the output is still named as `libmxnet.so`. 
lib = Libdl.find_library(["libmxnet.$(Libdl.dlext)", "libmxnet.so"], - [joinpath(MXNET_HOME, "lib"), MXNET_HOME]) + [joinpath(MXNET_HOME, "lib"), joinpath(MXNET_HOME, "build"), MXNET_HOME]) if !isempty(lib) @info("Existing libmxnet detected at $lib, skip building...") libmxnet_detected = true diff --git a/julia/src/base.jl b/julia/src/base.jl index 2b60dca4f783..e94b1bbbe37c 100644 --- a/julia/src/base.jl +++ b/julia/src/base.jl @@ -42,6 +42,7 @@ const grad_req_map = Dict{Symbol,GRAD_REQ}( ################################################################################ const MXNET_LIB = Libdl.find_library(["libmxnet.$(Libdl.dlext)", "libmxnet.so"], # see build.jl [joinpath(get(ENV, "MXNET_HOME", ""), "lib"), + joinpath(get(ENV, "MXNET_HOME", ""), "build"), get(ENV, "MXNET_HOME", ""), joinpath(@__DIR__, "..", "deps", "usr", "lib")]) From ce48a9d56c7a43fabedd253b674ce1a51555f1ea Mon Sep 17 00:00:00 2001 From: Anirudh Subramanian Date: Wed, 15 Apr 2020 18:16:02 -0700 Subject: [PATCH 08/14] Remove code owner (#17928) * Add dependency for cpp tests * Remove from CODEOWNERS --- CODEOWNERS | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/CODEOWNERS b/CODEOWNERS index 79836ff303d7..4b755291c17c 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -23,20 +23,18 @@ /julia/ @iblis17 # C++ base -/src/kvstore/ @rahul003 @anirudh2290 @eric-haibin-lin @apeforest -/include/ @anirudh2290 @pllarroy @eric-haibin-lin -/src/c_api/ @anirudh2290 @eric-haibin-lin @apeforest -/src/common/ @anirudh2290 -/src/engine/ @anirudh2290 @eric-haibin-lin @apeforest -/src/executor/ @anirudh2290 @eric-haibin-lin @apeforest -/src/imperative/ @anirudh2290 @eric-haibin-lin @apeforest -/src/io/ @anirudh2290 @eric-haibin-lin @apeforest -/src/ndarray/ @anirudh2290 @eric-haibin-lin @apeforest -/src/nnvm/ @anirudh2290 @eric-haibin-lin @apeforest -/src/operator/ @anirudh2290 @eric-haibin-lin @apeforest -/src/profiler/ @anirudh2290 @eric-haibin-lin -/src/storage/ @anirudh2290 @eric-haibin-lin -/tests/cpp/ @anirudh2290 +/src/kvstore/ @rahul003 @eric-haibin-lin @apeforest +/include/ @pllarroy @eric-haibin-lin +/src/c_api/ @eric-haibin-lin @apeforest +/src/engine/ @eric-haibin-lin @apeforest +/src/executor/ @eric-haibin-lin @apeforest +/src/imperative/ @eric-haibin-lin @apeforest +/src/io/ @eric-haibin-lin @apeforest +/src/ndarray/ @eric-haibin-lin @apeforest +/src/nnvm/ @eric-haibin-lin @apeforest +/src/operator/ @eric-haibin-lin @apeforest +/src/profiler/ @eric-haibin-lin +/src/storage/ @eric-haibin-lin /cpp-package/ @nswamy @pllarroy /src/ @pllarroy /plugin/ @pllarroy From 94f235d5385773ce8a4f21957d9946deed15b7f3 Mon Sep 17 00:00:00 2001 From: vexilligera Date: Thu, 16 Apr 2020 02:05:50 +0000 Subject: [PATCH 09/14] [numpy] add new ffi for column_stack and hstack (#17831) * column stack ffi * remove comment * benchmark * hstack ffi * resolve comments * cpplint Co-authored-by: Ubuntu --- benchmark/python/ffi/benchmark_ffi.py | 2 ++ python/mxnet/ndarray/numpy/_op.py | 4 +-- src/api/operator/numpy/np_matrix_op.cc | 41 ++++++++++++++++++++++++++ src/operator/numpy/np_matrix_op-inl.h | 5 ++++ 4 files changed, 50 insertions(+), 2 deletions(-) diff --git a/benchmark/python/ffi/benchmark_ffi.py b/benchmark/python/ffi/benchmark_ffi.py index 8f8da670d64c..c8255fe6e9a9 100644 --- a/benchmark/python/ffi/benchmark_ffi.py +++ b/benchmark/python/ffi/benchmark_ffi.py @@ -137,6 +137,8 @@ def prepare_workloads(): out=dnp.array([False, False], dtype=bool), keepdims=False) OpArgMngr.add_workload("roll", pool["2x2"], 1, 
axis=0) OpArgMngr.add_workload("rot90", pool["2x2"], 2) + OpArgMngr.add_workload("column_stack", (pool['3x3'], pool['3x3'], pool['3x3'])) + OpArgMngr.add_workload("hstack", (pool['3x3'], pool['3x3'], pool['3x3'])) OpArgMngr.add_workload("triu", pool['3x3']) OpArgMngr.add_workload("array_split", pool['2x2'], 2, axis=1) OpArgMngr.add_workload("vsplit", pool['2x2'], 2) diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index d1b80cadf484..f1341f6a7922 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -4341,7 +4341,7 @@ def column_stack(tup): [2., 3.], [3., 4.]]) """ - return _npi.column_stack(*tup) + return _api_internal.column_stack(*tup) @set_module('mxnet.ndarray.numpy') @@ -4380,7 +4380,7 @@ def hstack(arrays): [2., 3.], [3., 4.]]) """ - return _npi.hstack(*arrays) + return _api_internal.hstack(*arrays) @set_module('mxnet.ndarray.numpy') diff --git a/src/api/operator/numpy/np_matrix_op.cc b/src/api/operator/numpy/np_matrix_op.cc index c8870e38552c..998823d37db8 100644 --- a/src/api/operator/numpy/np_matrix_op.cc +++ b/src/api/operator/numpy/np_matrix_op.cc @@ -23,6 +23,7 @@ */ #include #include +#include #include "../utils.h" #include "../../../operator/nn/concat-inl.h" #include "../../../operator/tensor/matrix_op-inl.h" @@ -200,6 +201,46 @@ MXNET_REGISTER_API("_npi.rot90") *ret = ndoutputs[0]; }); +MXNET_REGISTER_API("_npi.column_stack") +.set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { + using namespace runtime; + const nnvm::Op* op = Op::Get("_npi_column_stack"); + nnvm::NodeAttrs attrs; + op::NumpyColumnStackParam param; + param.num_args = args.size(); + + attrs.parsed = param; + attrs.op = op; + SetAttrDict(&attrs); + int num_outputs = 0; + std::vector inputs; + for (int i = 0; i < param.num_args; ++i) { + inputs.push_back(args[i].operator mxnet::NDArray*()); + } + auto ndoutputs = Invoke(op, &attrs, param.num_args, &inputs[0], &num_outputs, nullptr); + *ret = ndoutputs[0]; +}); + +MXNET_REGISTER_API("_npi.hstack") +.set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { + using namespace runtime; + const nnvm::Op* op = Op::Get("_npi_hstack"); + nnvm::NodeAttrs attrs; + op::ConcatParam param; + param.num_args = args.size(); + + attrs.parsed = param; + attrs.op = op; + SetAttrDict(&attrs); + int num_outputs = 0; + std::vector inputs; + for (int i = 0; i < param.num_args; ++i) { + inputs.push_back(args[i].operator mxnet::NDArray*()); + } + auto ndoutputs = Invoke(op, &attrs, param.num_args, &inputs[0], &num_outputs, nullptr); + *ret = ndoutputs[0]; +}); + MXNET_REGISTER_API("_npi.array_split") .set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { using namespace runtime; diff --git a/src/operator/numpy/np_matrix_op-inl.h b/src/operator/numpy/np_matrix_op-inl.h index 57b1c331bad5..09eb10c2982f 100644 --- a/src/operator/numpy/np_matrix_op-inl.h +++ b/src/operator/numpy/np_matrix_op-inl.h @@ -63,6 +63,11 @@ struct NumpyColumnStackParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(num_args).set_lower_bound(1) .describe("Number of inputs to be column stacked"); } + void SetAttrDict(std::unordered_map* dict) { + std::ostringstream ss; + ss << num_args; + (*dict)["num_args"] = ss.str(); + } }; struct NumpyReshapeParam : public dmlc::Parameter { From 37dbbd4ce2bc2f3618ba352a7295747b516945a2 Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Wed, 15 Apr 2020 22:59:33 -0700 Subject: [PATCH 10/14] Fix CD (#18072) Fix NCCL versions in Cuda 10 docker containers. 
(Previously used version was not available for all cuda 10.x versions) Fix missing dependency specification in dynamic CD pipeline Fix use of cpu image for gpu builds in static CD pipeline (We use system cuda for static GPU build now, so the static pipeline should run on the correct Centos7 gpu container) Fix gcc7 not on PATH in dynamic CD pipeline build step --- cd/mxnet_lib/dynamic/Jenkins_pipeline.groovy | 4 +- cd/mxnet_lib/mxnet_lib_pipeline.groovy | 2 +- cd/mxnet_lib/static/Jenkins_pipeline.groovy | 3 +- .../Dockerfile.publish.centos7_gpu_cu100 | 2 +- .../Dockerfile.publish.centos7_gpu_cu101 | 2 +- .../Dockerfile.publish.centos7_gpu_cu102 | 2 +- ci/docker/install/centos7_base.sh | 1 + ci/docker/runtime_functions.sh | 70 ++++++------------- 8 files changed, 30 insertions(+), 56 deletions(-) diff --git a/cd/mxnet_lib/dynamic/Jenkins_pipeline.groovy b/cd/mxnet_lib/dynamic/Jenkins_pipeline.groovy index 52ad150fe6f9..4862e89d2db1 100644 --- a/cd/mxnet_lib/dynamic/Jenkins_pipeline.groovy +++ b/cd/mxnet_lib/dynamic/Jenkins_pipeline.groovy @@ -30,8 +30,8 @@ libmxnet = 'build/libmxnet.so' licenses = 'licenses/*' // libmxnet dependencies -mx_deps = '' -mx_native_deps = '' +mx_deps = 'build/3rdparty/openmp/runtime/src/libomp.so' +mx_native_deps = 'build/3rdparty/openmp/runtime/src/libomp.so' // library type // either static or dynamic - depending on how it links to its dependencies diff --git a/cd/mxnet_lib/mxnet_lib_pipeline.groovy b/cd/mxnet_lib/mxnet_lib_pipeline.groovy index 0c49bfa8e2c8..1eab29d21229 100644 --- a/cd/mxnet_lib/mxnet_lib_pipeline.groovy +++ b/cd/mxnet_lib/mxnet_lib_pipeline.groovy @@ -98,7 +98,7 @@ def unittest_py3(mxnet_variant) { def image = get_environment(mxnet_variant) def use_nvidia_docker = mxnet_variant.startsWith('cu') ci_utils.unpack_and_init("mxnet_${mxnet_variant}", get_stash(mxnet_variant), false) - ci_utils.docker_run(image, "cd_unittest_ubuntu ${mxnet_variant} python3", use_nvidia_docker) + ci_utils.docker_run(image, "cd_unittest_ubuntu ${mxnet_variant}", use_nvidia_docker) } } } diff --git a/cd/mxnet_lib/static/Jenkins_pipeline.groovy b/cd/mxnet_lib/static/Jenkins_pipeline.groovy index 61d18083e314..127b9a4f71de 100644 --- a/cd/mxnet_lib/static/Jenkins_pipeline.groovy +++ b/cd/mxnet_lib/static/Jenkins_pipeline.groovy @@ -45,8 +45,9 @@ libmxnet_pipeline = load('cd/mxnet_lib/mxnet_lib_pipeline.groovy') def build(mxnet_variant) { node(NODE_LINUX_CPU) { ws("workspace/mxnet_${libtype}/${mxnet_variant}/${env.BUILD_NUMBER}") { + def image = libmxnet_pipeline.get_environment(mxnet_variant) ci_utils.init_git() - ci_utils.docker_run('publish.centos7_cpu', "build_static_libmxnet ${mxnet_variant}", false) + ci_utils.docker_run(image, "build_static_libmxnet ${mxnet_variant}", false) ci_utils.pack_lib("mxnet_${mxnet_variant}", libmxnet_pipeline.get_stash(mxnet_variant)) } } diff --git a/ci/docker/Dockerfile.publish.centos7_gpu_cu100 b/ci/docker/Dockerfile.publish.centos7_gpu_cu100 index f9469fcb186f..41291f21ac72 100644 --- a/ci/docker/Dockerfile.publish.centos7_gpu_cu100 +++ b/ci/docker/Dockerfile.publish.centos7_gpu_cu100 @@ -29,7 +29,7 @@ RUN /work/centos7_python.sh COPY install/centos7_scala.sh /work/ RUN /work/centos7_scala.sh ENV SHORT_CUDA_VERSION=10.0 -ENV SHORT_NCCL_VERSION=2.4.8 +ENV SHORT_NCCL_VERSION=2.6.4 COPY install/centos7_nccl.sh /work/ RUN /work/centos7_nccl.sh diff --git a/ci/docker/Dockerfile.publish.centos7_gpu_cu101 b/ci/docker/Dockerfile.publish.centos7_gpu_cu101 index 00be436c0412..a1e90b9eafc3 100644 --- 
a/ci/docker/Dockerfile.publish.centos7_gpu_cu101 +++ b/ci/docker/Dockerfile.publish.centos7_gpu_cu101 @@ -29,7 +29,7 @@ RUN /work/centos7_python.sh COPY install/centos7_scala.sh /work/ RUN /work/centos7_scala.sh ENV SHORT_CUDA_VERSION=10.1 -ENV SHORT_NCCL_VERSION=2.4.8 +ENV SHORT_NCCL_VERSION=2.6.4 COPY install/centos7_nccl.sh /work/ RUN /work/centos7_nccl.sh diff --git a/ci/docker/Dockerfile.publish.centos7_gpu_cu102 b/ci/docker/Dockerfile.publish.centos7_gpu_cu102 index 27a625e4641d..851d63029a84 100644 --- a/ci/docker/Dockerfile.publish.centos7_gpu_cu102 +++ b/ci/docker/Dockerfile.publish.centos7_gpu_cu102 @@ -29,7 +29,7 @@ RUN /work/centos7_python.sh COPY install/centos7_scala.sh /work/ RUN /work/centos7_scala.sh ENV SHORT_CUDA_VERSION=10.2 -ENV SHORT_NCCL_VERSION=2.4.8 +ENV SHORT_NCCL_VERSION=2.6.4 COPY install/centos7_nccl.sh /work/ RUN /work/centos7_nccl.sh diff --git a/ci/docker/install/centos7_base.sh b/ci/docker/install/centos7_base.sh index 72896cbc42ad..93e2801481c5 100755 --- a/ci/docker/install/centos7_base.sh +++ b/ci/docker/install/centos7_base.sh @@ -37,6 +37,7 @@ yum -y install protobuf-compiler yum -y install protobuf-devel yum -y install zeromq-devel yum -y install patchelf +yum -y install pandoc # gcc7 yum -y install centos-release-scl diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 88b6dfec8139..b1e69be97163 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -144,9 +144,6 @@ build_wheel() { popd } -# Build commands: Every platform in docker/Dockerfile.build. should have a corresponding -# function here with the same suffix: - gather_licenses() { mkdir -p licenses @@ -156,41 +153,6 @@ gather_licenses() { cp DISCLAIMER-WIP licenses/ } -build_ubuntu_cpu_release() { - set -ex - cd /work/build - CC=gcc-7 CXX=g++-7 cmake \ - -DUSE_MKL_IF_AVAILABLE=OFF \ - -DUSE_MKLDNN=ON \ - -DUSE_CUDA=OFF \ - -G Ninja /work/mxnet - ninja -} - -build_ubuntu_cpu_native_release() { - set -ex - cd /work/build - CC=gcc-7 CXX=g++-7 cmake \ - -DUSE_MKL_IF_AVAILABLE=OFF \ - -DUSE_MKLDNN=OFF \ - -DUSE_CUDA=OFF \ - -G Ninja /work/mxnet - ninja -} - -build_ubuntu_gpu_release() { - set -ex - cd /work/build - CC=gcc-7 CXX=g++-7 cmake \ - -DUSE_MKL_IF_AVAILABLE=OFF \ - -DUSE_MKLDNN=ON \ - -DUSE_DIST_KVSTORE=ON \ - -DUSE_CUDA=ON \ - -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \ - -G Ninja /work/mxnet - ninja -} - # Compiles the dynamic mxnet library # Parameters: # $1 -> mxnet_variant: the mxnet variant to build, e.g. cpu, native, cu100, cu92, etc. 
@@ -202,16 +164,31 @@ build_dynamic_libmxnet() { # relevant licenses will be placed in the licenses directory gather_licenses + source /opt/rh/devtoolset-7/enable + cd /work/build + export CC=gcc-7 + export CXX=g++-7 if [[ ${mxnet_variant} = "cpu" ]]; then - build_ubuntu_cpu_release + cmake -DUSE_MKL_IF_AVAILABLE=OFF \ + -DUSE_MKLDNN=ON \ + -DUSE_CUDA=OFF \ + -G Ninja /work/mxnet elif [[ ${mxnet_variant} = "native" ]]; then - build_ubuntu_cpu_native_release + cmake -DUSE_MKL_IF_AVAILABLE=OFF \ + -DUSE_MKLDNN=OFF \ + -DUSE_CUDA=OFF \ + -G Ninja /work/mxnet elif [[ ${mxnet_variant} =~ cu[0-9]+$ ]]; then - build_ubuntu_gpu_release + cmake -DUSE_MKL_IF_AVAILABLE=OFF \ + -DUSE_MKLDNN=ON \ + -DUSE_DIST_KVSTORE=ON \ + -DUSE_CUDA=ON \ + -G Ninja /work/mxnet else echo "Error: Unrecognized mxnet variant '${mxnet_variant}'" exit 1 fi + ninja } build_jetson() { @@ -1017,7 +994,6 @@ cd_unittest_ubuntu() { export DMLC_LOG_STACK_TRACE_DEPTH=10 local mxnet_variant=${1:?"This function requires a mxnet variant as the first argument"} - local python_cmd=${2:?"This function requires a python command as the first argument"} local nose_cmd="nosetests-3.4" @@ -1033,16 +1009,12 @@ cd_unittest_ubuntu() { $nose_cmd $NOSE_TIMER_ARGUMENTS --verbose tests/python/gpu # Adding these here as CI doesn't test all CUDA environments - $python_cmd example/image-classification/test_score.py + python3 example/image-classification/test_score.py integrationtest_ubuntu_gpu_dist_kvstore fi - if [[ ${mxnet_variant} = *mkl ]]; then - # skipping python 2 testing - # https://github.com/apache/incubator-mxnet/issues/14675 - if [[ ${python_cmd} = "python3" ]]; then - $nose_cmd $NOSE_TIMER_ARGUMENTS --verbose tests/python/mkl - fi + if [[ ${mxnet_variant} = cpu ]]; then + $nose_cmd $NOSE_TIMER_ARGUMENTS --verbose tests/python/mkl fi } From 5c768f06f0a1ffdc3687d1e3890eaee8b17e15d8 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Wed, 15 Apr 2020 23:20:35 -0700 Subject: [PATCH 11/14] [Numpy Extension] Add stop_gradient to npx (#18076) * add stop gradient to npx * Update test_numpy_op.py --- .../tensor/elemwise_unary_op_basic.cc | 1 + tests/python/unittest/test_numpy_op.py | 28 +++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/src/operator/tensor/elemwise_unary_op_basic.cc b/src/operator/tensor/elemwise_unary_op_basic.cc index 227596d1d1c9..f216cb00896f 100644 --- a/src/operator/tensor/elemwise_unary_op_basic.cc +++ b/src/operator/tensor/elemwise_unary_op_basic.cc @@ -296,6 +296,7 @@ NNVM_REGISTER_OP(_backward_reshape) MXNET_OPERATOR_REGISTER_UNARY(BlockGrad) MXNET_ADD_SPARSE_OP_ALIAS(stop_gradient) +.add_alias("_npx_stop_gradient") .add_alias("stop_gradient") .describe(R"code(Stops gradient computation. 
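For context, an illustrative usage sketch (outside the patch itself) of the operator exposed by the new `_npx_stop_gradient` alias; it assumes the alias surfaces as `npx.stop_gradient`, which is what the unit test added below exercises:

    from mxnet import np, npx, autograd
    npx.set_np()

    # illustrative only: assumes the alias registered above is exposed as npx.stop_gradient
    x = np.ones((3,))
    x.attach_grad()
    with autograd.record():
        # the stop_gradient term contributes to the value of y,
        # but is treated as a constant during backpropagation
        y = npx.stop_gradient(2 * x) + x
    y.backward()
    print(y)       # [3. 3. 3.]
    print(x.grad)  # [1. 1. 1.] -- only the `+ x` term is differentiated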
diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 284791763d20..6047ac83dc1f 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -8977,6 +8977,34 @@ def hybrid_forward(self, F, a, *args, **kwargs): assert same(mx_out.asnumpy(), np_out) +@use_np +def test_npx_stop_gradient(): + class TestStopGradient(HybridBlock): + def hybrid_forward(self, F, a): + return F.npx.stop_gradient(a) + dtypes = ['float16', 'float32', 'float64'] + for hybridize in [False, True]: + for dtype in dtypes: + for grad_req in ['write', 'add']: + dat = np.ones((10,), dtype=dtype) + dat.attach_grad(grad_req) + dat.grad[:] = 2 + old_grad = dat.grad.asnumpy() + net = TestStopGradient() + if hybridize: + net.hybridize() + with mx.autograd.record(): + out = net(dat) + out = out + dat + out.backward() + new_grad = dat.grad.asnumpy() + assert same(out.asnumpy(), dat.asnumpy() * 2) + if grad_req == 'write': + assert_almost_equal(new_grad, _np.ones_like(dat, dtype=dtype)) + elif grad_req == 'add': + assert_almost_equal(new_grad, old_grad + 1) + + if __name__ == '__main__': import nose nose.runmodule() From afae030beb168f09cf08be101714e059157a9507 Mon Sep 17 00:00:00 2001 From: Brenton Chu Date: Thu, 16 Apr 2020 08:21:57 -0700 Subject: [PATCH 12/14] No tensor cores for fp32 interleaved attention, remove div by 8 restriction (#17994) --- src/operator/contrib/transformer.cu | 53 ++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/src/operator/contrib/transformer.cu b/src/operator/contrib/transformer.cu index 44c8ebdbb959..bcbc18525c09 100644 --- a/src/operator/contrib/transformer.cu +++ b/src/operator/contrib/transformer.cu @@ -43,7 +43,7 @@ void CublasStridedBatchedGemm(mshadow::Stream* s, bool transA, bool transB, float alpha, const DType* a, int32_t lda, int32_t strideA, const DType *b, int32_t ldb, int32_t strideB, float beta, DType *c, int32_t ldc, int32_t strideC, int32_t batchCount, - cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP) { + cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT) { #if CUDA_VERSION >= 9010 using namespace mxnet::common::cuda; CHECK_EQ(s->blas_handle_ownership_, mshadow::Stream::OwnHandle) @@ -142,9 +142,9 @@ void gemm_switch_fp32accum(mshadow::Stream* s, bool transA, bool transB, float alpha, const DType *a, int32_t lda, int32_t strideA, const DType *b, int32_t ldb, int32_t strideB, float beta, DType *c, int32_t ldc, - int32_t strideC, int32_t batchCount) { + int32_t strideC, int32_t batchCount, bool using_fp16) { cudaStream_t stream = mshadow::Stream::GetStream(s); - if (!(lda & 0x7) && !(ldb & 0x7) && !(ldc & 0x7)) { + if (using_fp16) { CublasStridedBatchedGemm(s, transA, transB, m, n, k, alpha, a, lda, strideA, b, ldb, strideB, beta, c, ldc, strideC, batchCount, CUBLAS_GEMM_DEFAULT_TENSOR_OP); } else { @@ -175,6 +175,7 @@ void InterleavedMatMulSelfAttQKGPU(const nnvm::NodeAttrs& attrs, const int32_t batch_stride = 3 * head_dim; const float beta = req[0] == kAddTo ? 
1.f : 0.f; const float scale = 1.0 / sqrt(static_cast(head_dim)); + const bool using_fp16 = inputs[0].type_flag_ == mshadow::kFloat16; if (req[0] == kNullOp) return; @@ -196,7 +197,8 @@ void InterleavedMatMulSelfAttQKGPU(const nnvm::NodeAttrs& attrs, output, qkv_seq_len, qkv_seq_len * qkv_seq_len, - attn_batches); + attn_batches, + using_fp16); }) } @@ -220,7 +222,8 @@ void BackwardInterleavedMatMulSelfAttQKGPU(const nnvm::NodeAttrs& attrs, const int32_t lead_dim = attn_batches * 3 * head_dim; const int32_t batch_stride = 3 * head_dim; const float scale = 1.0 / sqrt(static_cast(head_dim)); - const float beta = req[0] == kAddTo ? 1.f : 0.f; + const float beta = req[0] == kAddTo ? 1.f : 0.f; + const bool using_fp16 = inputs[0].type_flag_ == mshadow::kFloat16; if (req[0] == kNullOp) return; @@ -247,7 +250,8 @@ void BackwardInterleavedMatMulSelfAttQKGPU(const nnvm::NodeAttrs& attrs, queries_keys_values_grads, lead_dim, batch_stride, - attn_batches); + attn_batches, + using_fp16); gemm_switch_fp32accum(s, false, true, @@ -265,7 +269,8 @@ void BackwardInterleavedMatMulSelfAttQKGPU(const nnvm::NodeAttrs& attrs, queries_keys_values_grads + head_dim, lead_dim, batch_stride, - attn_batches); + attn_batches, + using_fp16); }) } @@ -290,6 +295,7 @@ void InterleavedMatMulSelfAttValAttGPU(const nnvm::NodeAttrs& attrs, const int32_t batch_stride = 3 * head_dim; const float alpha = 1.f; const float beta = req[0] == kAddTo ? 1.f : 0.f; + const bool using_fp16 = inputs[0].type_flag_ == mshadow::kFloat16; if (req[0] == kNullOp) return; @@ -311,7 +317,8 @@ void InterleavedMatMulSelfAttValAttGPU(const nnvm::NodeAttrs& attrs, output, head_dim * attn_batches, head_dim, - attn_batches); + attn_batches, + using_fp16); }) } @@ -337,6 +344,8 @@ void BackwardInterleavedMatMulSelfAttValAttGPU(const nnvm::NodeAttrs& attrs, const int32_t lead_dim = attn_batches * 3 * head_dim; const int32_t batch_stride = 3 * head_dim; const float alpha = 1.f; + const bool using_fp16 = inputs[0].type_flag_ == mshadow::kFloat16; + if (req[0] != kNullOp) { if (req[0] == kWriteTo) { cudaMemsetAsync(queries_keys_values_grads, 0, outputs[0].shape_.Size() * sizeof(DType), @@ -360,7 +369,8 @@ void BackwardInterleavedMatMulSelfAttValAttGPU(const nnvm::NodeAttrs& attrs, queries_keys_values_grads + 2 * head_dim, lead_dim, batch_stride, - attn_batches); + attn_batches, + using_fp16); } if (req[1] != kNullOp) { const float beta = req[1] == kAddTo ? 1.f : 0.f; @@ -381,7 +391,8 @@ void BackwardInterleavedMatMulSelfAttValAttGPU(const nnvm::NodeAttrs& attrs, attention_maps_grads, qkv_seq_len, qkv_seq_len * qkv_seq_len, - attn_batches); + attn_batches, + using_fp16); } }) } @@ -412,6 +423,7 @@ void InterleavedMatMulEncDecQKGPU(const nnvm::NodeAttrs& attrs, const int32_t batch_stride_kv = head_dim * 2; const float beta = req[0] == kAddTo ? 1.f : 0.f; const float scale = 1.f / sqrt(static_cast(head_dim)); + const bool using_fp16 = inputs[0].type_flag_ == mshadow::kFloat16; if (req[0] == kNullOp) return; @@ -433,7 +445,8 @@ void InterleavedMatMulEncDecQKGPU(const nnvm::NodeAttrs& attrs, output, kv_seq_len, kv_seq_len * q_seq_len, - attn_batches); + attn_batches, + using_fp16); }) } @@ -463,6 +476,7 @@ void BackwardInterleavedMatMulEncDecQKGPU(const nnvm::NodeAttrs& attrs, const int32_t batch_stride_q = head_dim; const int32_t batch_stride_kv = head_dim * 2; const float scale = 1.f / sqrt(static_cast(head_dim)); + const bool using_fp16 = inputs[0].type_flag_ == mshadow::kFloat16; if (req[0] != kNullOp) { const float beta = req[0] == kAddTo ? 
1.f : 0.f; @@ -483,7 +497,8 @@ void BackwardInterleavedMatMulEncDecQKGPU(const nnvm::NodeAttrs& attrs, queries_grads, lead_dim_q, batch_stride_q, - attn_batches); + attn_batches, + using_fp16); } if (req[1] != kNullOp) { if (req[1] == kWriteTo) { @@ -508,7 +523,8 @@ void BackwardInterleavedMatMulEncDecQKGPU(const nnvm::NodeAttrs& attrs, keys_values_grads, lead_dim_kv, batch_stride_kv, - attn_batches); + attn_batches, + using_fp16); } }) } @@ -535,6 +551,7 @@ void InterleavedMatMulEncDecValAttGPU(const nnvm::NodeAttrs& attrs, const int32_t batch_stride_kv = 2 * head_dim; const float alpha = 1.f; const float beta = req[0] == kAddTo ? 1.f : 0.f; + const bool using_fp16 = inputs[0].type_flag_ == mshadow::kFloat16; if (req[0] == kNullOp) return; @@ -556,7 +573,8 @@ void InterleavedMatMulEncDecValAttGPU(const nnvm::NodeAttrs& attrs, output, head_dim * attn_batches, head_dim, - attn_batches); + attn_batches, + using_fp16); }) } @@ -583,6 +601,7 @@ void BackwardInterleavedMatMulEncDecValAttGPU(const nnvm::NodeAttrs& attrs, const int32_t lead_dim_kv = attn_batches * head_dim * 2; const int32_t batch_stride_kv = 2 * head_dim; const float alpha = 1.f; + const bool using_fp16 = inputs[0].type_flag_ == mshadow::kFloat16; if (req[0] != kNullOp) { if (req[0] == kWriteTo) { @@ -607,7 +626,8 @@ void BackwardInterleavedMatMulEncDecValAttGPU(const nnvm::NodeAttrs& attrs, keys_values_grads + head_dim, lead_dim_kv, batch_stride_kv, - attn_batches); + attn_batches, + using_fp16); } if (req[1] != kNullOp) { const float beta = req[1] == kAddTo ? 1.f : 0.f; @@ -628,7 +648,8 @@ void BackwardInterleavedMatMulEncDecValAttGPU(const nnvm::NodeAttrs& attrs, attention_maps_grads, kv_seq_len, kv_seq_len * q_seq_len, - attn_batches); + attn_batches, + using_fp16); } }) } From 9337137c5dbf914404a023c22310380de24756ad Mon Sep 17 00:00:00 2001 From: Joe Evans Date: Thu, 16 Apr 2020 09:44:57 -0700 Subject: [PATCH 13/14] For mxnet-validation pipeline, require sanity build to complete successfully before running other build pipelines. (#17999) * Refactor staggered builds - create new full build pipeline that runs sanity check first, then starts all other builds. * Move list of build jobs to top of file for clarity. Preserve whole job path in case we use nested folders in the future. Co-authored-by: Joe Evans --- ci/jenkins/Jenkinsfile_full | 49 +++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 ci/jenkins/Jenkinsfile_full diff --git a/ci/jenkins/Jenkinsfile_full b/ci/jenkins/Jenkinsfile_full new file mode 100644 index 000000000000..57b637533c05 --- /dev/null +++ b/ci/jenkins/Jenkinsfile_full @@ -0,0 +1,49 @@ +// -*- mode: groovy -*- + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+// +// Jenkins pipeline +// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/ + +// timeout in minutes +max_time = 180 + +def buildJobs = [ + 'centos-cpu', + 'centos-gpu', + 'clang', + 'edge', + 'miscellaneous', + 'unix-cpu', + 'unix-gpu', + 'website', + 'windows-cpu', + 'windows-gpu' +] + + +stage("full-build") { + // get the base path by removing build and branch portions + def jobPath = JOB_NAME.split('/') + def pipelineName = jobPath[0..jobPath.size()-3].join('/') + build job: pipelineName + "/sanity/" + BRANCH_NAME, wait: true + buildJobs.each { subJob -> + build job: pipelineName + "/" + subJob + "/" + BRANCH_NAME, wait: false + } +} + From 7bef85ecbf9cb064d45479efa5cc46c6f8a4e947 Mon Sep 17 00:00:00 2001 From: Hao Jin Date: Thu, 16 Apr 2020 12:32:44 -0700 Subject: [PATCH 14/14] [Numpy] Add ffi for np.sum, np.std, np.var, np.average and np.histogram (#17866) * add ffi for sum, var and std * add ffi wrapper for np.average * add ffi wrapper for np.histogram --- benchmark/python/ffi/benchmark_ffi.py | 5 + include/mxnet/runtime/ffi_helper.h | 18 ++ include/mxnet/runtime/object.h | 1 + python/mxnet/_ffi/_cython/convert.pxi | 6 + python/mxnet/_ffi/node_generic.py | 2 + python/mxnet/_numpy_op_doc.py | 92 -------- python/mxnet/ndarray/numpy/_op.py | 114 +++++++++- python/mxnet/numpy/multiarray.py | 104 ++++++++- python/mxnet/symbol/numpy/_symbol.py | 51 ++++- python/mxnet/symbol/numpy/linalg.py | 8 +- src/api/_api_internal/_api_internal.cc | 10 + src/api/operator/numpy/np_bincount_op.cc | 4 +- .../numpy/np_broadcast_reduce_op_value.cc | 67 +++++- src/api/operator/numpy/np_cumsum.cc | 4 +- src/api/operator/numpy/np_histogram_op.cc | 81 +++++++ src/api/operator/numpy/np_moments_op.cc | 209 ++++++++++++++++++ src/api/operator/numpy/np_tensordot_op.cc | 4 +- src/api/operator/utils.h | 10 + src/operator/numpy/np_broadcast_reduce_op.h | 32 ++- .../numpy/np_broadcast_reduce_op_value.cc | 22 +- .../numpy/np_broadcast_reduce_op_value.cu | 4 +- src/operator/tensor/histogram-inl.h | 42 ++-- 22 files changed, 739 insertions(+), 151 deletions(-) create mode 100644 src/api/operator/numpy/np_histogram_op.cc create mode 100644 src/api/operator/numpy/np_moments_op.cc diff --git a/benchmark/python/ffi/benchmark_ffi.py b/benchmark/python/ffi/benchmark_ffi.py index c8255fe6e9a9..7b5fa3e174c4 100644 --- a/benchmark/python/ffi/benchmark_ffi.py +++ b/benchmark/python/ffi/benchmark_ffi.py @@ -60,6 +60,11 @@ def prepare_workloads(): OpArgMngr.add_workload("tensordot", pool['2x2'], pool['2x2'], ((1, 0), (0, 1))) OpArgMngr.add_workload("kron", pool['2x2'], pool['2x2']) OpArgMngr.add_workload("cumsum", pool['3x2'], axis=0, out=pool['3x2']) + OpArgMngr.add_workload("sum", pool['2x2'], axis=0, keepdims=True, out=pool['1x2']) + OpArgMngr.add_workload("std", pool['2x2'], axis=0, ddof=0, keepdims=True, out=pool['1x2']) + OpArgMngr.add_workload("var", pool['2x2'], axis=0, ddof=1, keepdims=True, out=pool['1x2']) + OpArgMngr.add_workload("average", pool['2x2'], weights=pool['2'], axis=1, returned=True) + OpArgMngr.add_workload("histogram", pool['2x2'], bins=10, range=(0.0, 10.0)) OpArgMngr.add_workload("add", pool['2x2'], pool['2x2']) OpArgMngr.add_workload("linalg.eig", pool['3x3']) OpArgMngr.add_workload("linalg.eigh", pool['3x3']) diff --git a/include/mxnet/runtime/ffi_helper.h b/include/mxnet/runtime/ffi_helper.h index 49134ca122a7..cfc79a6c4f47 100644 --- a/include/mxnet/runtime/ffi_helper.h +++ b/include/mxnet/runtime/ffi_helper.h @@ -99,6 +99,24 @@ class Integer: public ObjectRef { 
MXNET_DEFINE_OBJECT_REF_METHODS(Integer, ObjectRef, IntegerObj) }; +class FloatObj: public Object { + public: + double value; + static constexpr const uint32_t _type_index = TypeIndex::kFloat; + static constexpr const char* _type_key = "MXNet.Float"; + MXNET_DECLARE_FINAL_OBJECT_INFO(FloatObj, Object) +}; + +class Float: public ObjectRef { + public: + explicit Float(double value, + ObjectPtr&& data = make_object()) { + data->value = value; + data_ = std::move(data); + } + MXNET_DEFINE_OBJECT_REF_METHODS(Float, ObjectRef, FloatObj) +}; + // Helper functions for fast FFI implementations /*! * \brief A builder class that helps to incrementally build ADT. diff --git a/include/mxnet/runtime/object.h b/include/mxnet/runtime/object.h index a031a56d88ed..48c9badb3ba7 100644 --- a/include/mxnet/runtime/object.h +++ b/include/mxnet/runtime/object.h @@ -58,6 +58,7 @@ enum TypeIndex { kEllipsis = 5, kSlice = 6, kInteger = 7, + kFloat = 8, kStaticIndexEnd, /*! \brief Type index is allocated during runtime. */ kDynamic = kStaticIndexEnd diff --git a/python/mxnet/_ffi/_cython/convert.pxi b/python/mxnet/_ffi/_cython/convert.pxi index 2cbdc48b49a8..d7b1ea5659dc 100644 --- a/python/mxnet/_ffi/_cython/convert.pxi +++ b/python/mxnet/_ffi/_cython/convert.pxi @@ -43,6 +43,10 @@ cdef extern from "mxnet/runtime/ffi_helper.h" namespace "mxnet::runtime": Integer() Integer(int64_t) + cdef cppclass Float(ObjectRef): + Float() + Float(double) + cdef inline ADT convert_tuple(tuple src_tuple) except *: cdef uint32_t size = len(src_tuple) @@ -71,5 +75,7 @@ cdef inline ObjectRef convert_object(object src_obj) except *: return convert_list(src_obj) elif isinstance(src_obj, Integral): return Integer(src_obj) + elif isinstance(src_obj, float): + return Float(src_obj) else: raise TypeError("Don't know how to convert type %s" % type(src_obj)) diff --git a/python/mxnet/_ffi/node_generic.py b/python/mxnet/_ffi/node_generic.py index c7f332390ce7..07b4825654d1 100644 --- a/python/mxnet/_ffi/node_generic.py +++ b/python/mxnet/_ffi/node_generic.py @@ -52,6 +52,8 @@ def convert_to_node(value): """ if isinstance(value, Integral): return _api_internal._Integer(value) + elif isinstance(value, float): + return _api_internal._Float(value) elif isinstance(value, (list, tuple)): value = [convert_to_node(x) for x in value] return _api_internal._ADT(*value) diff --git a/python/mxnet/_numpy_op_doc.py b/python/mxnet/_numpy_op_doc.py index 8341d43608ce..857b87a7586f 100644 --- a/python/mxnet/_numpy_op_doc.py +++ b/python/mxnet/_numpy_op_doc.py @@ -231,98 +231,6 @@ def _np_dot(a, b, out=None): pass -def _np_sum(a, axis=None, dtype=None, keepdims=False, initial=None, out=None): - r""" - Sum of array elements over a given axis. - - Parameters - ---------- - a : ndarray - Input data. - axis : None or int, optional - Axis or axes along which a sum is performed. The default, - axis=None, will sum all of the elements of the input array. If - axis is negative it counts from the last to the first axis. - dtype : dtype, optional - The type of the returned array and of the accumulator in which the - elements are summed. The default type is float32. - keepdims : bool, optional - If this is set to True, the axes which are reduced are left - in the result as dimensions with size one. With this option, - the result will broadcast correctly against the input array. - - If the default value is passed, then `keepdims` will not be - passed through to the `sum` method of sub-classes of - `ndarray`, however any non-default value will be. 
If the - sub-classes `sum` method does not implement `keepdims` any - exceptions will be raised. - initial: Currently only supports None as input, optional - Starting value for the sum. - Currently not implemented. Please use ``None`` as input or skip this argument. - out : ndarray or None, optional - Alternative output array in which to place the result. It must have - the same shape and dtype as the expected output. - - Returns - ------- - sum_along_axis : ndarray - An ndarray with the same shape as `a`, with the specified - axis removed. If an output array is specified, a reference to - `out` is returned. - - Notes - ----- - - Input type does not support Python native iterables. - - "out" param: cannot perform auto type change. out ndarray's dtype must be the same as the expected output. - - "initial" param is not supported yet. Please use None as input. - - Arithmetic is modular when using integer types, and no error is raised on overflow. - - The sum of an empty array is the neutral element 0: - - >>> a = np.empty(1) - >>> np.sum(a) - array(0.) - - This function differs from the original `numpy.sum - `_ in - the following aspects: - - - Input type does not support Python native iterables(list, tuple, ...). - - "out" param: cannot perform auto type cast. out ndarray's dtype must be the same as the expected output. - - "initial" param is not supported yet. Please use ``None`` as input or skip it. - - Examples - -------- - >>> a = np.array([0.5, 1.5]) - >>> np.sum(a) - array(2.) - >>> a = np.array([0.5, 0.7, 0.2, 1.5]) - >>> np.sum(a, dtype=np.int32) - array(2, dtype=int32) - >>> a = np.array([[0, 1], [0, 5]]) - >>> np.sum(a) - array(6.) - >>> np.sum(a, axis=0) - array([0., 6.]) - >>> np.sum(a, axis=1) - array([1., 5.]) - - With output ndarray: - - >>> a = np.array([[0, 1], [0, 5]]) - >>> b = np.ones((2,), dtype=np.float32) - >>> np.sum(a, axis = 0, out=b) - array([0., 6.]) - >>> b - array([0., 6.]) - - If the accumulator is too small, overflow occurs: - - >>> np.ones(128, dtype=np.int8).sum(dtype=np.int8) - array(-128, dtype=int8) - """ - pass - - def _np_copy(a, out=None): """ Return an array copy of the given object. 
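For reference, an illustrative sketch (outside the patch itself) of the Python-level calls served by the FFI rewiring below; argument names follow the existing `mxnet.numpy` signatures, and the histogram call assumes the integer-`bins` path, which still requires an explicit `range`:

    from mxnet import np, npx
    npx.set_np()

    a = np.array([[0., 1.], [0., 5.]])

    # axis/keepdims/ddof keyword paths, matching the new FFI benchmark workloads
    s = np.sum(a, axis=0, keepdims=True)        # array([[0., 6.]])
    v = np.var(a, axis=0, ddof=1)
    d = np.std(a, axis=0)

    # weighted mean plus the sum of weights (returned=True yields a pair)
    avg, wsum = np.average(a, axis=1, weights=np.array([0.25, 0.75]), returned=True)

    # histogram returns a (counts, bin_edges) pair once wrapped by tuple(...)
    hist, edges = np.histogram(a, bins=10, range=(0.0, 10.0))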
diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index f1341f6a7922..1a66abb1a9be 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -48,7 +48,7 @@ 'true_divide', 'nonzero', 'quantile', 'percentile', 'shares_memory', 'may_share_memory', 'interp', 'diff', 'ediff1d', 'resize', 'polyval', 'nan_to_num', 'isnan', 'isinf', 'isposinf', 'isneginf', 'isfinite', 'atleast_1d', 'atleast_2d', 'atleast_3d', - 'where', 'bincount', 'rollaxis', 'pad', 'cumsum', 'diag', 'diagonal'] + 'where', 'bincount', 'rollaxis', 'pad', 'cumsum', 'sum', 'diag', 'diagonal'] @set_module('mxnet.ndarray.numpy') @@ -1739,13 +1739,13 @@ def histogram(a, bins=10, range=None, normed=None, weights=None, density=None): if isinstance(bins, numeric_types): if range is None: raise NotImplementedError("automatic range is not supported yet...") - return _npi.histogram(a, bin_cnt=bins, range=range) + return tuple(_api_internal.histogram(a, None, bins, range)) if isinstance(bins, (list, tuple)): raise NotImplementedError("array_like bins is not supported yet...") if isinstance(bins, str): raise NotImplementedError("string bins is not supported yet...") if isinstance(bins, NDArray): - return _npi.histogram(a, bins=bins) + return tuple(_api_internal.histogram(a, bins, None, None)) raise ValueError("np.histogram fails with", locals()) @@ -4883,10 +4883,7 @@ def average(a, axis=None, weights=None, returned=False, out=None): >>> np.average(data, axis=1, weights=weights) array([0.75, 2.75, 4.75]) """ - if weights is None: - return _npi.average(a, axis=axis, weights=None, returned=returned, weighted=False, out=out) - else: - return _npi.average(a, axis=axis, weights=weights, returned=returned, out=out) + return _api_internal.average(a, weights, axis, returned, weights is not None, out) @set_module('mxnet.ndarray.numpy') @@ -5011,7 +5008,7 @@ def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False): # pylint: >>> np.std(a, dtype=np.float64) array(0.45, dtype=float64) """ - return _npi.std(a, axis=axis, dtype=dtype, ddof=ddof, keepdims=keepdims, out=out) + return _api_internal.std(a, axis, dtype, ddof, keepdims, out) @set_module('mxnet.ndarray.numpy') @@ -5081,7 +5078,7 @@ def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False): # pylint: >>> ((1-0.55)**2 + (0.1-0.55)**2)/2 0.2025 """ - return _npi.var(a, axis=axis, dtype=dtype, ddof=ddof, keepdims=keepdims, out=out) + return _api_internal.var(a, axis, dtype, ddof, keepdims, out) # pylint: disable=redefined-outer-name @@ -6294,7 +6291,7 @@ def outer(a, b): [-2., -1., 0., 1., 2.], [-2., -1., 0., 1., 2.]]) """ - return tensordot(a.flatten(), b.flatten(), 0) + return tensordot(a.reshape_view((-1, )), b.reshape_view((-1, )), 0) @set_module('mxnet.ndarray.numpy') @@ -8464,3 +8461,100 @@ def diagonal(a, offset=0, axis1=0, axis2=1): [1, 7]]) """ return _api_internal.diagonal(a, offset, axis1, axis2) + + +# pylint:disable=redefined-outer-name, too-many-arguments +@set_module('mxnet.ndarray.numpy') +def sum(a, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=None): + r""" + Sum of array elements over a given axis. + + Parameters + ---------- + a : ndarray + Input data. + axis : None or int, optional + Axis or axes along which a sum is performed. The default, + axis=None, will sum all of the elements of the input array. If + axis is negative it counts from the last to the first axis. 
+ dtype : dtype, optional + The type of the returned array and of the accumulator in which the + elements are summed. The default type is float32. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the input array. + + If the default value is passed, then `keepdims` will not be + passed through to the `sum` method of sub-classes of + `ndarray`, however any non-default value will be. If the + sub-classes `sum` method does not implement `keepdims` any + exceptions will be raised. + initial: Currently only supports None as input, optional + Starting value for the sum. + Currently not implemented. Please use ``None`` as input or skip this argument. + out : ndarray or None, optional + Alternative output array in which to place the result. It must have + the same shape and dtype as the expected output. + + Returns + ------- + sum_along_axis : ndarray + An ndarray with the same shape as `a`, with the specified + axis removed. If an output array is specified, a reference to + `out` is returned. + + Notes + ----- + - Input type does not support Python native iterables. + - "out" param: cannot perform auto type change. out ndarray's dtype must be the same as the expected output. + - "initial" param is not supported yet. Please use None as input. + - Arithmetic is modular when using integer types, and no error is raised on overflow. + - The sum of an empty array is the neutral element 0: + + >>> a = np.empty(1) + >>> np.sum(a) + array(0.) + + This function differs from the original `numpy.sum + `_ in + the following aspects: + + - Input type does not support Python native iterables(list, tuple, ...). + - "out" param: cannot perform auto type cast. out ndarray's dtype must be the same as the expected output. + - "initial" param is not supported yet. Please use ``None`` as input or skip it. + + Examples + -------- + >>> a = np.array([0.5, 1.5]) + >>> np.sum(a) + array(2.) + >>> a = np.array([0.5, 0.7, 0.2, 1.5]) + >>> np.sum(a, dtype=np.int32) + array(2, dtype=int32) + >>> a = np.array([[0, 1], [0, 5]]) + >>> np.sum(a) + array(6.) 
+ >>> np.sum(a, axis=0) + array([0., 6.]) + >>> np.sum(a, axis=1) + array([1., 5.]) + + With output ndarray: + + >>> a = np.array([[0, 1], [0, 5]]) + >>> b = np.ones((2,), dtype=np.float32) + >>> np.sum(a, axis=0, out=b) + array([0., 6.]) + >>> b + array([0., 6.]) + + If the accumulator is too small, overflow occurs: + + >>> np.ones(128, dtype=np.int8).sum(dtype=np.int8) + array(-128, dtype=int8) + """ + if where is not None and where is not True: + raise ValueError("only where=None or where=True cases are supported for now") + return _api_internal.sum(a, axis, dtype, keepdims, initial, out) +# pylint:enable=redefined-outer-name, too-many-arguments diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index feec9ca89a9a..2d6a7888a35f 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -74,7 +74,7 @@ 'quantile', 'percentile', 'shares_memory', 'may_share_memory', 'diff', 'ediff1d', 'resize', 'matmul', 'nan_to_num', 'isnan', 'isinf', 'isposinf', 'isneginf', 'isfinite', 'polyval', 'where', 'bincount', 'atleast_1d', 'atleast_2d', 'atleast_3d', - 'pad', 'cumsum', 'rollaxis', 'diag', 'diagonal'] + 'pad', 'cumsum', 'sum', 'rollaxis', 'diag', 'diagonal'] __all__ += fallback.__all__ @@ -6867,7 +6867,7 @@ def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False): # pylint: >>> np.std(a, dtype=np.float64) array(0.45, dtype=float64) """ - return _npi.std(a, axis=axis, dtype=dtype, ddof=ddof, keepdims=keepdims, out=out) + return _mx_nd_np.std(a, axis=axis, dtype=dtype, ddof=ddof, keepdims=keepdims, out=out) # pylint: enable=redefined-outer-name @@ -6988,7 +6988,7 @@ def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False): # pylint: >>> ((1-0.55)**2 + (0.1-0.55)**2)/2 0.2025 """ - return _npi.var(a, axis=axis, dtype=dtype, ddof=ddof, keepdims=keepdims, out=out) + return _mx_nd_np.var(a, axis=axis, dtype=dtype, ddof=ddof, keepdims=keepdims, out=out) # pylint: disable=redefined-outer-name @@ -7151,6 +7151,7 @@ def ravel(x, order='C'): return _mx_nd_np.ravel(x, order) +@set_module('mxnet.numpy') def unravel_index(indices, shape, order='C'): # pylint: disable=redefined-outer-name """ Converts a flat index or array of flat indices into a tuple of coordinate arrays. @@ -7181,6 +7182,7 @@ def unravel_index(indices, shape, order='C'): # pylint: disable=redefined-outer- return _mx_nd_np.unravel_index(indices, shape, order=order) +@set_module('mxnet.numpy') def flatnonzero(a): r""" Return indices that are non-zero in the flattened version of a. @@ -7220,6 +7222,7 @@ def flatnonzero(a): return _mx_nd_np.flatnonzero(a) +@set_module('mxnet.numpy') def diag_indices_from(arr): """ This returns a tuple of indices that can be used to access the main diagonal of an array @@ -10572,3 +10575,98 @@ def diagonal(a, offset=0, axis1=0, axis2=1): [1, 7]]) """ return _mx_nd_np.diagonal(a, offset=offset, axis1=axis1, axis2=axis2) + + +# pylint: disable=redefined-outer-name, too-many-arguments +@set_module('mxnet.numpy') +def sum(a, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=None): + r""" + Sum of array elements over a given axis. + + Parameters + ---------- + a : ndarray + Input data. + axis : None or int, optional + Axis or axes along which a sum is performed. The default, + axis=None, will sum all of the elements of the input array. If + axis is negative it counts from the last to the first axis. + dtype : dtype, optional + The type of the returned array and of the accumulator in which the + elements are summed. 
The default type is float32. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the input array. + + If the default value is passed, then `keepdims` will not be + passed through to the `sum` method of sub-classes of + `ndarray`, however any non-default value will be. If the + sub-classes `sum` method does not implement `keepdims` any + exceptions will be raised. + initial: Currently only supports None as input, optional + Starting value for the sum. + Currently not implemented. Please use ``None`` as input or skip this argument. + out : ndarray or None, optional + Alternative output array in which to place the result. It must have + the same shape and dtype as the expected output. + + Returns + ------- + sum_along_axis : ndarray + An ndarray with the same shape as `a`, with the specified + axis removed. If an output array is specified, a reference to + `out` is returned. + + Notes + ----- + - Input type does not support Python native iterables. + - "out" param: cannot perform auto type change. out ndarray's dtype must be the same as the expected output. + - "initial" param is not supported yet. Please use None as input. + - Arithmetic is modular when using integer types, and no error is raised on overflow. + - The sum of an empty array is the neutral element 0: + + >>> a = np.empty(1) + >>> np.sum(a) + array(0.) + + This function differs from the original `numpy.sum + `_ in + the following aspects: + + - Input type does not support Python native iterables(list, tuple, ...). + - "out" param: cannot perform auto type cast. out ndarray's dtype must be the same as the expected output. + - "initial" param is not supported yet. Please use ``None`` as input or skip it. + + Examples + -------- + >>> a = np.array([0.5, 1.5]) + >>> np.sum(a) + array(2.) + >>> a = np.array([0.5, 0.7, 0.2, 1.5]) + >>> np.sum(a, dtype=np.int32) + array(2, dtype=int32) + >>> a = np.array([[0, 1], [0, 5]]) + >>> np.sum(a) + array(6.) 
+ >>> np.sum(a, axis=0) + array([0., 6.]) + >>> np.sum(a, axis=1) + array([1., 5.]) + + With output ndarray: + + >>> a = np.array([[0, 1], [0, 5]]) + >>> b = np.ones((2,), dtype=np.float32) + >>> np.sum(a, axis = 0, out=b) + array([0., 6.]) + >>> b + array([0., 6.]) + + If the accumulator is too small, overflow occurs: + + >>> np.ones(128, dtype=np.int8).sum(dtype=np.int8) + array(-128, dtype=int8) + """ + return _mx_nd_np.sum(a, axis=axis, dtype=dtype, out=out, keepdims=keepdims, initial=initial, where=where) +# pylint: enable=redefined-outer-name, too-many-arguments diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py index 5061b772ae8f..8b31c8e38ae4 100644 --- a/python/mxnet/symbol/numpy/_symbol.py +++ b/python/mxnet/symbol/numpy/_symbol.py @@ -54,7 +54,7 @@ 'true_divide', 'quantile', 'percentile', 'shares_memory', 'may_share_memory', 'diff', 'ediff1d', 'resize', 'polyval', 'nan_to_num', 'isnan', 'isinf', 'isposinf', 'isneginf', 'isfinite', 'atleast_1d', 'atleast_2d', 'atleast_3d', - 'where', 'bincount', 'rollaxis', 'pad', 'cumsum', 'diag', 'diagonal'] + 'where', 'bincount', 'rollaxis', 'pad', 'cumsum', 'sum', 'diag', 'diagonal'] @set_module('mxnet.symbol.numpy') @@ -689,7 +689,7 @@ def diag(self, k=0, **kwargs): def sum(self, axis=None, dtype=None, out=None, keepdims=False): # pylint: disable=arguments-differ """Return the sum of the array elements over the given axis.""" - return _mx_np_op.sum(self, axis=axis, dtype=dtype, out=out, keepdims=keepdims) + return _npi.sum(self, axis=axis, dtype=dtype, out=out, keepdims=keepdims) def nansum(self, *args, **kwargs): """Convenience fluent method for :py:func:`nansum`. @@ -7334,4 +7334,51 @@ def diagonal(a, offset=0, axis1=0, axis2=1): return _npi.diagonal(a, offset=offset, axis1=axis1, axis2=axis2) +# pylint:disable=redefined-outer-name, too-many-arguments +@set_module('mxnet.symbol.numpy') +def sum(a, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=None): + r""" + Sum of array elements over a given axis. + + Parameters + ---------- + a : _Symbol + Input data. + axis : None or int, optional + Axis or axes along which a sum is performed. The default, + axis=None, will sum all of the elements of the input array. If + axis is negative it counts from the last to the first axis. + dtype : dtype, optional + The type of the returned array and of the accumulator in which the + elements are summed. The default type is float32. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the input array. + + If the default value is passed, then `keepdims` will not be + passed through to the `sum` method of sub-classes of + `ndarray`, however any non-default value will be. If the + sub-classes `sum` method does not implement `keepdims` any + exceptions will be raised. + initial: Currently only supports None as input, optional + Starting value for the sum. + Currently not implemented. Please use ``None`` as input or skip this argument. + out : ndarray or None, optional + Alternative output array in which to place the result. It must have + the same shape and dtype as the expected output. + + Returns + ------- + sum_along_axis : _Symbol + An ndarray with the same shape as `a`, with the specified + axis removed. If an output array is specified, a reference to + `out` is returned. 
+ """ + if where is not None and where is not True: + raise ValueError("only where=None or where=True cases are supported for now") + return _npi.sum(a, axis=axis, dtype=dtype, keepdims=keepdims, initial=initial, out=out) +# pylint:enable=redefined-outer-name, too-many-arguments + + _set_np_symbol_class(_Symbol) diff --git a/python/mxnet/symbol/numpy/linalg.py b/python/mxnet/symbol/numpy/linalg.py index 3cea6ddae157..1fbac50b630a 100644 --- a/python/mxnet/symbol/numpy/linalg.py +++ b/python/mxnet/symbol/numpy/linalg.py @@ -324,18 +324,18 @@ def norm(x, ord=None, axis=None, keepdims=False): if row_axis > col_axis: row_axis -= 1 if ord == 'inf': - return _mx_sym_np.sum(_symbol.abs(x), axis=col_axis, keepdims=keepdims).max(axis=row_axis, keepdims=keepdims) # pylint: disable=line-too-long + return _npi.sum(_symbol.abs(x), axis=col_axis, keepdims=keepdims).max(axis=row_axis, keepdims=keepdims) # pylint: disable=line-too-long else: - return _mx_sym_np.sum(_symbol.abs(x), axis=col_axis, keepdims=keepdims).min(axis=row_axis, keepdims=keepdims) # pylint: disable=line-too-long + return _npi.sum(_symbol.abs(x), axis=col_axis, keepdims=keepdims).min(axis=row_axis, keepdims=keepdims) # pylint: disable=line-too-long if ord in [1, -1]: row_axis, col_axis = axis if not keepdims: if row_axis < col_axis: col_axis -= 1 if ord == 1: - return _mx_sym_np.sum(_symbol.abs(x), axis=row_axis, keepdims=keepdims).max(axis=col_axis, keepdims=keepdims) # pylint: disable=line-too-long + return _npi.sum(_symbol.abs(x), axis=row_axis, keepdims=keepdims).max(axis=col_axis, keepdims=keepdims) # pylint: disable=line-too-long elif ord == -1: - return _mx_sym_np.sum(_symbol.abs(x), axis=row_axis, keepdims=keepdims).min(axis=col_axis, keepdims=keepdims) # pylint: disable=line-too-long + return _npi.sum(_symbol.abs(x), axis=row_axis, keepdims=keepdims).min(axis=col_axis, keepdims=keepdims) # pylint: disable=line-too-long if ord in [2, -2]: return _npi.norm(x, ord=ord, axis=axis, keepdims=keepdims, flag=0) if ord is None: diff --git a/src/api/_api_internal/_api_internal.cc b/src/api/_api_internal/_api_internal.cc index 586dce82f383..7e1ce045f353 100644 --- a/src/api/_api_internal/_api_internal.cc +++ b/src/api/_api_internal/_api_internal.cc @@ -43,6 +43,16 @@ MXNET_REGISTER_GLOBAL("_Integer") } }); +MXNET_REGISTER_GLOBAL("_Float") +.set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { + using namespace runtime; + if (args[0].type_code() == kDLFloat) { + *ret = Float(args[0].operator double()); + } else { + LOG(FATAL) << "only accept float"; + } +}); + MXNET_REGISTER_GLOBAL("_ADT") .set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { using namespace runtime; diff --git a/src/api/operator/numpy/np_bincount_op.cc b/src/api/operator/numpy/np_bincount_op.cc index afa3278c24e4..7be884aefb1a 100644 --- a/src/api/operator/numpy/np_bincount_op.cc +++ b/src/api/operator/numpy/np_bincount_op.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY diff --git a/src/api/operator/numpy/np_broadcast_reduce_op_value.cc b/src/api/operator/numpy/np_broadcast_reduce_op_value.cc index c2d87a285cde..4cd2e485d987 100644 --- a/src/api/operator/numpy/np_broadcast_reduce_op_value.cc +++ b/src/api/operator/numpy/np_broadcast_reduce_op_value.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -18,13 +18,14 @@ */ /*! - * \file broadcast_reduce_op_value.cc + * \file np_broadcast_reduce_op_value.cc * \brief Implementation of the API of functions in * src/operator/tensor/np_broadcast_reduce_op_value.cc */ #include #include #include "../utils.h" +#include "../../../operator/tensor/broadcast_reduce_op.h" #include "../../../operator/numpy/np_broadcast_reduce_op.h" namespace mxnet { @@ -51,6 +52,65 @@ MXNET_REGISTER_API("_npi.broadcast_to") *ret = ndoutputs[0]; }); +MXNET_REGISTER_API("_npi.sum") +.set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { + using namespace runtime; + const nnvm::Op* op = Op::Get("_npi_sum"); + op::NumpyReduceAxesParam param; + nnvm::NodeAttrs attrs; + attrs.op = op; + + // parse axis + if (args[1].type_code() == kNull) { + param.axis = dmlc::nullopt; + } else { + if (args[1].type_code() == kDLInt) { + param.axis = Tuple(1, args[1].operator int64_t()); + } else { + param.axis = Tuple(args[1].operator ObjectRef()); + } + } + + // parse dtype + if (args[2].type_code() == kNull) { + param.dtype = dmlc::nullopt; + } else { + param.dtype = String2MXNetTypeWithBool(args[2].operator std::string()); + } + + // parse keepdims + if (args[3].type_code() == kNull) { + param.keepdims = false; + } else { + param.keepdims = args[3].operator bool(); + } + + // parse initial + if (args[4].type_code() == kNull) { + param.initial = dmlc::nullopt; + } else { + param.initial = args[4].operator double(); + } + + attrs.parsed = std::move(param); + + SetAttrDict(&attrs); + + NDArray* inputs[] = {args[0].operator NDArray*()}; + int num_inputs = 1; + + NDArray* outputs[] = {args[5].operator NDArray*()}; + NDArray** out = (outputs[0] == nullptr) ? 
nullptr : outputs; + int num_outputs = (outputs[0] != nullptr); + auto ndoutputs = Invoke(op, &attrs, num_inputs, inputs, &num_outputs, out); + + if (out) { + *ret = PythonArg(5); + } else { + *ret = reinterpret_cast(ndoutputs[0]); + } +}); + MXNET_REGISTER_API("_npi.mean") .set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { using namespace runtime; @@ -67,6 +127,7 @@ MXNET_REGISTER_API("_npi.mean") } else { param.dtype = String2MXNetTypeWithBool(args[2].operator std::string()); } + if (args[3].type_code() == kNull) { param.keepdims = false; } else { diff --git a/src/api/operator/numpy/np_cumsum.cc b/src/api/operator/numpy/np_cumsum.cc index 0ef3b3fdf7bf..d0b200c66fd4 100644 --- a/src/api/operator/numpy/np_cumsum.cc +++ b/src/api/operator/numpy/np_cumsum.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY diff --git a/src/api/operator/numpy/np_histogram_op.cc b/src/api/operator/numpy/np_histogram_op.cc new file mode 100644 index 000000000000..b517cce80803 --- /dev/null +++ b/src/api/operator/numpy/np_histogram_op.cc @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file np_histogram_op.cc + * \brief Implementation of the API of functions in src/operator/tensor/histogram.cc + */ + +#include +#include +#include "../utils.h" +#include "../../../operator/tensor/histogram-inl.h" + +namespace mxnet { + +MXNET_REGISTER_API("_npi.histogram") +.set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { + using namespace runtime; + nnvm::NodeAttrs attrs; + const nnvm::Op* op = Op::Get("_npi_histogram"); + op::HistogramParam param; + // parse bin_cnt + if (args[2].type_code() == kNull) { + param.bin_cnt = dmlc::nullopt; + } else { + param.bin_cnt = args[2].operator int(); + } + + // parse range + if (args[3].type_code() == kNull) { + param.range = dmlc::nullopt; + } else { + param.range = Obj2Tuple(args[3].operator ObjectRef()); + } + + attrs.parsed = std::move(param); + attrs.op = op; + SetAttrDict(&attrs); + + std::vector inputs_vec; + int num_inputs = 0; + + if (args[2].type_code() != kNull) { + CHECK_EQ(args[1].type_code(), kNull) + << "bins should be None when bin_cnt is provided"; + inputs_vec.push_back((args[0].operator NDArray*())); + num_inputs = 1; + } else { + CHECK_NE(args[1].type_code(), kNull) + << "bins should not be None when bin_cnt is not provided"; + // inputs + inputs_vec.push_back((args[0].operator NDArray*())); + inputs_vec.push_back((args[1].operator NDArray*())); + num_inputs = 2; + } + + // outputs + NDArray** out = nullptr; + int num_outputs = 0; + auto ndoutputs = Invoke(op, &attrs, num_inputs, inputs_vec.data(), &num_outputs, out); + *ret = ADT(0, {NDArrayHandle(ndoutputs[0]), + NDArrayHandle(ndoutputs[1])}); +}); + +} // namespace mxnet diff --git a/src/api/operator/numpy/np_moments_op.cc b/src/api/operator/numpy/np_moments_op.cc new file mode 100644 index 000000000000..e4e9238bb6c1 --- /dev/null +++ b/src/api/operator/numpy/np_moments_op.cc @@ -0,0 +1,209 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file np_moments_op.cc + * \brief Implementation of the API of functions in src/operator/numpy/np_moments_op.cc + */ + +#include +#include +#include "../utils.h" +#include "../../../operator/numpy/np_broadcast_reduce_op.h" + +namespace mxnet { + +MXNET_REGISTER_API("_npi.std") +.set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { + using namespace runtime; + const nnvm::Op* op = Op::Get("_npi_std"); + op::NumpyMomentsParam param; + nnvm::NodeAttrs attrs; + attrs.op = op; + + // parse axis + if (args[1].type_code() == kNull) { + param.axis = dmlc::nullopt; + } else { + if (args[1].type_code() == kDLInt) { + param.axis = Tuple(1, args[1].operator int64_t()); + } else { + param.axis = Tuple(args[1].operator ObjectRef()); + } + } + + // parse dtype + if (args[2].type_code() == kNull) { + param.dtype = dmlc::nullopt; + } else { + param.dtype = String2MXNetTypeWithBool(args[2].operator std::string()); + } + + // parse ddof + param.ddof = args[3].operator int(); + + // parse keepdims + if (args[4].type_code() == kNull) { + param.keepdims = false; + } else { + param.keepdims = args[4].operator bool(); + } + + attrs.parsed = std::move(param); + + SetAttrDict(&attrs); + + NDArray* inputs[] = {args[0].operator NDArray*()}; + int num_inputs = 1; + + NDArray* outputs[] = {args[5].operator NDArray*()}; + NDArray** out = (outputs[0] == nullptr) ? nullptr : outputs; + int num_outputs = (outputs[0] != nullptr); + auto ndoutputs = Invoke(op, &attrs, num_inputs, inputs, &num_outputs, out); + + if (out) { + *ret = PythonArg(5); + } else { + *ret = reinterpret_cast(ndoutputs[0]); + } +}); + +MXNET_REGISTER_API("_npi.var") +.set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { + using namespace runtime; + const nnvm::Op* op = Op::Get("_npi_var"); + op::NumpyMomentsParam param; + nnvm::NodeAttrs attrs; + attrs.op = op; + + // parse axis + if (args[1].type_code() == kNull) { + param.axis = dmlc::nullopt; + } else { + if (args[1].type_code() == kDLInt) { + param.axis = Tuple(1, args[1].operator int64_t()); + } else { + param.axis = Tuple(args[1].operator ObjectRef()); + } + } + + // parse dtype + if (args[2].type_code() == kNull) { + param.dtype = dmlc::nullopt; + } else { + param.dtype = String2MXNetTypeWithBool(args[2].operator std::string()); + } + + // parse ddof + param.ddof = args[3].operator int(); + + // parse keepdims + if (args[4].type_code() == kNull) { + param.keepdims = false; + } else { + param.keepdims = args[4].operator bool(); + } + + attrs.parsed = std::move(param); + + SetAttrDict(&attrs); + + NDArray* inputs[] = {args[0].operator NDArray*()}; + int num_inputs = 1; + + NDArray* outputs[] = {args[5].operator NDArray*()}; + NDArray** out = (outputs[0] == nullptr) ? 
nullptr : outputs; + int num_outputs = (outputs[0] != nullptr); + auto ndoutputs = Invoke(op, &attrs, num_inputs, inputs, &num_outputs, out); + + if (out) { + *ret = PythonArg(5); + } else { + *ret = reinterpret_cast(ndoutputs[0]); + } +}); + +MXNET_REGISTER_API("_npi.average") +.set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { + using namespace runtime; + const nnvm::Op* op = Op::Get("_npi_average"); + op::NumpyWeightedAverageParam param; + nnvm::NodeAttrs attrs; + attrs.op = op; + + // parse axis + if (args[2].type_code() == kNull) { + param.axis = dmlc::nullopt; + } else { + if (args[2].type_code() == kDLInt) { + param.axis = Tuple(1, args[2].operator int64_t()); + } else { + param.axis = Tuple(args[2].operator ObjectRef()); + } + } + + // parse returned + CHECK_NE(args[3].type_code(), kNull) + << "returned cannot be None"; + param.returned = args[3].operator bool(); + + // parse weighted + CHECK_NE(args[4].type_code(), kNull) + << "weighted cannot be None"; + param.weighted = args[4].operator bool(); + + attrs.parsed = std::move(param); + + SetAttrDict(&attrs); + + int num_inputs = param.weighted ? 2 : 1; + NDArray* outputs[] = {args[5].operator NDArray*()}; + NDArray** out = (outputs[0] == nullptr) ? nullptr : outputs; + int num_outputs = (outputs[0] != nullptr); + + if (param.weighted) { + NDArray* inputs[] = {args[0].operator NDArray*(), args[1].operator NDArray*()}; + auto ndoutputs = Invoke(op, &attrs, num_inputs, inputs, &num_outputs, out); + if (out) { + *ret = PythonArg(5); + } else { + if (param.returned) { + *ret = ADT(0, {NDArrayHandle(ndoutputs[0]), + NDArrayHandle(ndoutputs[1])}); + } else { + *ret = reinterpret_cast(ndoutputs[0]); + } + } + } else { + NDArray* inputs[] = {args[0].operator NDArray*()}; + auto ndoutputs = Invoke(op, &attrs, num_inputs, inputs, &num_outputs, out); + if (out) { + *ret = PythonArg(5); + } else { + if (param.returned) { + *ret = ADT(0, {NDArrayHandle(ndoutputs[0]), + NDArrayHandle(ndoutputs[1])}); + } else { + *ret = reinterpret_cast(ndoutputs[0]); + } + } + } +}); + +}; // namespace mxnet diff --git a/src/api/operator/numpy/np_tensordot_op.cc b/src/api/operator/numpy/np_tensordot_op.cc index eef58b5b3389..55c131468b12 100644 --- a/src/api/operator/numpy/np_tensordot_op.cc +++ b/src/api/operator/numpy/np_tensordot_op.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY diff --git a/src/api/operator/utils.h b/src/api/operator/utils.h index 53e62ee7635b..8943e8058a19 100644 --- a/src/api/operator/utils.h +++ b/src/api/operator/utils.h @@ -56,6 +56,16 @@ void SetAttrDict(nnvm::NodeAttrs* attrs) { } } +template +Tuple Obj2Tuple(const runtime::ObjectRef& src) { + runtime::ADT adt = Downcast(src); + Tuple ret(adt.size(), 0); + for (size_t i = 0; i < adt.size(); ++i) { + ret[i] = Downcast(adt[i])->value; + } + return ret; +} + } // namespace mxnet #endif // MXNET_API_OPERATOR_UTILS_H_ diff --git a/src/operator/numpy/np_broadcast_reduce_op.h b/src/operator/numpy/np_broadcast_reduce_op.h index cbdf29be7826..53b7de1744e3 100644 --- a/src/operator/numpy/np_broadcast_reduce_op.h +++ b/src/operator/numpy/np_broadcast_reduce_op.h @@ -67,6 +67,7 @@ struct NumpyReduceAxesParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(initial).set_default(dmlc::optional()) .describe("Starting value for the sum."); } + void SetAttrDict(std::unordered_map* dict) { std::ostringstream axis_s, dtype_s, keepdims_s, initial_s; axis_s << axis; @@ -449,6 +450,7 @@ inline void NumpyReduceAxesBackwardUseNone(const nnvm::NodeAttrs& attrs, } BroadcastComputeImpl(attrs, ctx, inputs, req, outputs, small); + if (normalize) { Stream *s = ctx.get_stream(); MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, IType, { @@ -500,11 +502,27 @@ struct NumpyMomentsParam : public dmlc::Parameter { "precision than the default platform integer. In that case, if a is signed then " "the platform integer is used while if a is unsigned then an unsigned integer of " "the same precision as the platform integer is used."); - DMLC_DECLARE_FIELD(ddof).set_default(0) - .describe("Starting value for the sum."); DMLC_DECLARE_FIELD(keepdims).set_default(false) .describe("If this is set to `True`, the reduced axes are left " "in the result as dimension with size one."); + DMLC_DECLARE_FIELD(ddof).set_default(0) + .describe("Starting value for the sum."); + } + + void SetAttrDict(std::unordered_map* dict) { + std::ostringstream axis_s, dtype_s, keepdims_s, ddof_s; + axis_s << axis; + keepdims_s << keepdims; + ddof_s << ddof; + (*dict)["axis"] = axis_s.str(); + dtype_s << dtype; + if (dtype.has_value()) { + (*dict)["dtype"] = MXNetTypeWithBool2String(dtype.value()); + } else { + (*dict)["dtype"] = dtype_s.str(); + } + (*dict)["keepdims"] = keepdims_s.str(); + (*dict)["ddof"] = ddof_s.str(); } }; @@ -560,6 +578,16 @@ struct NumpyWeightedAverageParam : public dmlc::Parameter* dict) { + std::ostringstream axis_s, returned_s, weighted_s; + axis_s << axis; + returned_s << returned; + weighted_s << weighted; + (*dict)["axis"] = axis_s.str(); + (*dict)["returned"] = returned_s.str(); + (*dict)["weighted"] = weighted_s.str(); + } }; inline bool NumpyWeightedAverageShape(const nnvm::NodeAttrs& attrs, diff --git a/src/operator/numpy/np_broadcast_reduce_op_value.cc b/src/operator/numpy/np_broadcast_reduce_op_value.cc index 026e60e8bb25..33418667dfb7 100644 --- a/src/operator/numpy/np_broadcast_reduce_op_value.cc +++ b/src/operator/numpy/np_broadcast_reduce_op_value.cc @@ -51,12 +51,12 @@ inline bool NumpySumType(const nnvm::NodeAttrs& attrs, if (param.dtype.has_value()) { if (in_attrs->at(0) == mshadow::kBool) { - CHECK(param.dtype.value() == mshadow::kInt32 - || 
param.dtype.value() == mshadow::kInt64 - || param.dtype.value() == mshadow::kFloat32 - || param.dtype.value() == mshadow::kFloat64) << "Only support the following output " - "dtypes when input dtype is bool: " - "int32, int64, float32, float64."; + CHECK(param.dtype.value() == mshadow::kInt32 || + param.dtype.value() == mshadow::kInt64 || + param.dtype.value() == mshadow::kFloat32 || + param.dtype.value() == mshadow::kFloat64) + << "Only support the following output dtypes when input dtype is bool: " + "int32, int64, float32, float64."; } TYPE_ASSIGN_CHECK(*out_attrs, 0, param.dtype.value()); } else if (in_attrs->at(0) == mshadow::kBool) { @@ -126,7 +126,7 @@ void TVMOpReduce(const OpContext& ctx, #endif // MXNET_USE_TVM_OP } -NNVM_REGISTER_OP(_np_sum) +NNVM_REGISTER_OP(_npi_sum) .describe(R"code()code" ADD_FILELINE) .set_num_inputs(1) .set_num_outputs(1) @@ -145,9 +145,9 @@ NNVM_REGISTER_OP(_np_sum) return std::vector{ResourceRequest::kTempSpace}; }) .set_attr("THasDeterministicOutput", true) -.set_attr("FGradient", ElemwiseGradUseNone{"_backward_np_sum"}); +.set_attr("FGradient", ElemwiseGradUseNone{"_backward_npi_sum"}); -NNVM_REGISTER_OP(_backward_np_sum) +NNVM_REGISTER_OP(_backward_npi_sum) .set_num_outputs(1) .set_attr_parser(ParamParser) .set_attr("TIsBackward", true) @@ -155,8 +155,8 @@ NNVM_REGISTER_OP(_backward_np_sum) .set_attr("FCompute", NumpyReduceAxesBackwardUseNone); inline bool NumpyReduceAxesNoDTypeType(const nnvm::NodeAttrs& attrs, - std::vector *in_attrs, - std::vector *out_attrs) { + std::vector *in_attrs, + std::vector *out_attrs) { CHECK_EQ(in_attrs->size(), 1U); CHECK_EQ(out_attrs->size(), 1U); TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); diff --git a/src/operator/numpy/np_broadcast_reduce_op_value.cu b/src/operator/numpy/np_broadcast_reduce_op_value.cu index 684348fcaa37..c5111c2954cd 100644 --- a/src/operator/numpy/np_broadcast_reduce_op_value.cu +++ b/src/operator/numpy/np_broadcast_reduce_op_value.cu @@ -26,10 +26,10 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(_np_sum) +NNVM_REGISTER_OP(_npi_sum) .set_attr("FCompute", NumpyReduceAxesCompute); -NNVM_REGISTER_OP(_backward_np_sum) +NNVM_REGISTER_OP(_backward_npi_sum) .set_attr("FCompute", NumpyReduceAxesBackwardUseNone); NNVM_REGISTER_OP(_np_max) diff --git a/src/operator/tensor/histogram-inl.h b/src/operator/tensor/histogram-inl.h index 7194445d7b52..29b27c6d659d 100644 --- a/src/operator/tensor/histogram-inl.h +++ b/src/operator/tensor/histogram-inl.h @@ -34,6 +34,8 @@ #include #include #include +#include +#include #include #include "./util/tensor_util-inl.h" #include "../elemwise_op_common.h" @@ -45,22 +47,30 @@ namespace mxnet { namespace op { struct HistogramParam : public dmlc::Parameter { - dmlc::optional bin_cnt; - dmlc::optional> range; - DMLC_DECLARE_PARAMETER(HistogramParam) { - DMLC_DECLARE_FIELD(bin_cnt) - .set_default(dmlc::optional()) - .describe("Number of bins for uniform case"); - DMLC_DECLARE_FIELD(range) - .set_default(dmlc::optional>()) - .describe("The lower and upper range of the bins. if not provided, " - "range is simply (a.min(), a.max()). values outside the " - "range are ignored. the first element of the range must be " - "less than or equal to the second. range affects the automatic " - "bin computation as well. 
while bin width is computed to be "
+                "optimal based on the actual data within range, the bin count "
+                "will fill the entire range including portions containing no data.");
+  }
+
+  void SetAttrDict(std::unordered_map<std::string, std::string>* dict) {
+    std::ostringstream bin_cnt_s, range_s;
+    bin_cnt_s << bin_cnt;
+    range_s << range;
+    (*dict)["bin_cnt"] = bin_cnt_s.str();
+    (*dict)["range"] = range_s.str();
+  }
 };
 
 struct FillBinBoundsKernel {